- creating to_raw_data and to_data_frame methods for Dataset
- bug fix for empty value mapping with multi value columns
This commit is contained in:
Родитель
167ce16107
Коммит
d017bffa2b
|
@ -26,5 +26,6 @@
|
|||
},
|
||||
"[python]": {
|
||||
"editor.defaultFormatter": "ms-python.python"
|
||||
}
|
||||
},
|
||||
"cSpell.enabled": true
|
||||
}
|
||||
|
|
|
@ -375,6 +375,7 @@ fn main() {
|
|||
gd.write_synthetic_data(
|
||||
&synthetic_path,
|
||||
synthetic_delimiter.chars().next().unwrap(),
|
||||
"",
|
||||
join_multi_value_columns,
|
||||
long_form,
|
||||
)
|
||||
|
|
|
@ -4,14 +4,17 @@ use super::{
|
|||
DataBlockHeaders, DataBlockRecords,
|
||||
},
|
||||
value::DataBlockValue,
|
||||
MultiValueColumnMetadataMap,
|
||||
MultiValueColumnMetadataMap, RawData, RawDataMultiValueColumnJoiner,
|
||||
};
|
||||
use fnv::FnvHashMap;
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::{processing::aggregator::RecordsSet, utils::math::uround_down};
|
||||
use crate::{
|
||||
processing::{aggregator::RecordsSet, generator::SynthesizerCacheKey},
|
||||
utils::math::uround_down,
|
||||
};
|
||||
|
||||
#[cfg(feature = "pyo3")]
|
||||
use pyo3::prelude::*;
|
||||
|
@ -194,4 +197,73 @@ impl DataBlock {
|
|||
usize::min(reporting_length, self.headers.len())
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Creates a `RawData` vector for the stored data,
|
||||
/// where the first entry is the headers
|
||||
/// # Arguments
|
||||
/// * `empty_value` - Empty values will be replaced by this
|
||||
pub fn to_raw_data(&self, empty_value: &Arc<String>) -> RawData {
|
||||
let mut raw_data: RawData = vec![self.headers.iter().map(|h| (*h).clone()).collect()];
|
||||
let n_headers = self.headers.len();
|
||||
|
||||
raw_data.append(
|
||||
&mut self
|
||||
.records
|
||||
.iter()
|
||||
.map(|r| SynthesizerCacheKey::new(n_headers, &r.values).format_record(empty_value))
|
||||
.collect(),
|
||||
);
|
||||
raw_data
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Clones the data block data to a `Vec<Vec<String>>`,
|
||||
/// where the first entry is the headers
|
||||
/// # Arguments
|
||||
/// * `empty_value` - Empty values will be replaced by this
|
||||
/// * `join_multi_value_columns` - Whether multi value columns should be joined back together or not
|
||||
pub fn to_raw_data_vec(
|
||||
&self,
|
||||
empty_value: &Arc<String>,
|
||||
join_multi_value_columns: bool,
|
||||
) -> Vec<Vec<String>> {
|
||||
Self::raw_data_to_vec(
|
||||
&self.to_raw_data(empty_value),
|
||||
empty_value,
|
||||
&self.multi_value_column_metadata_map,
|
||||
join_multi_value_columns,
|
||||
)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Clones the `raw_data` data to a `Vec<Vec<String>>`,
|
||||
/// where the first entry is the headers
|
||||
/// # Arguments
|
||||
/// * `raw_data` - Raw data to be cloned
|
||||
/// * `empty_value` - Empty values will be replaced by this
|
||||
/// * `multi_value_column_metadata_map` - Maps a normalized multi-value header name (such as A_a1)
|
||||
/// to its corresponding metadata
|
||||
/// * `join_multi_value_columns` - Whether multi value columns should be joined back together or not
|
||||
pub fn raw_data_to_vec(
|
||||
raw_data: &RawData,
|
||||
empty_value: &Arc<String>,
|
||||
multi_value_column_metadata_map: &MultiValueColumnMetadataMap,
|
||||
join_multi_value_columns: bool,
|
||||
) -> Vec<Vec<String>> {
|
||||
let mut raw_data_vec = if join_multi_value_columns {
|
||||
RawDataMultiValueColumnJoiner::new(
|
||||
raw_data,
|
||||
multi_value_column_metadata_map,
|
||||
empty_value,
|
||||
)
|
||||
.join()
|
||||
} else {
|
||||
raw_data.clone()
|
||||
};
|
||||
raw_data_vec
|
||||
.drain(..)
|
||||
.map(|mut record| record.drain(..).map(|value| (*value).clone()).collect())
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,13 +17,15 @@ enum JoinSpec {
|
|||
}
|
||||
|
||||
/// Helper to join columns in the raw data that were spread using multiple values
|
||||
pub struct RawDataMultiValueColumnJoiner<'raw_data, 'multi_value_column_metadata_map> {
|
||||
pub struct RawDataMultiValueColumnJoiner<'raw_data, 'multi_value_column_metadata_map, 'empty_value>
|
||||
{
|
||||
raw_data: &'raw_data [CsvRecordRef],
|
||||
multi_value_column_metadata_map: &'multi_value_column_metadata_map MultiValueColumnMetadataMap,
|
||||
empty_value: &'empty_value Arc<String>,
|
||||
}
|
||||
|
||||
impl<'raw_data, 'multi_value_column_metadata_map>
|
||||
RawDataMultiValueColumnJoiner<'raw_data, 'multi_value_column_metadata_map>
|
||||
impl<'raw_data, 'multi_value_column_metadata_map, 'empty_value>
|
||||
RawDataMultiValueColumnJoiner<'raw_data, 'multi_value_column_metadata_map, 'empty_value>
|
||||
{
|
||||
/// Creates a new joiner
|
||||
/// # Arguments
|
||||
|
@ -33,10 +35,12 @@ impl<'raw_data, 'multi_value_column_metadata_map>
|
|||
pub fn new(
|
||||
raw_data: &'raw_data [CsvRecordRef],
|
||||
multi_value_column_metadata_map: &'multi_value_column_metadata_map MultiValueColumnMetadataMap,
|
||||
empty_value: &'empty_value Arc<String>,
|
||||
) -> Self {
|
||||
RawDataMultiValueColumnJoiner {
|
||||
raw_data,
|
||||
multi_value_column_metadata_map,
|
||||
empty_value,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -113,21 +117,25 @@ impl<'raw_data, 'multi_value_column_metadata_map>
|
|||
new_record.push(record[*value_index].clone());
|
||||
}
|
||||
JoinSpec::MultiValue(entry) => {
|
||||
new_record.push(Arc::new(
|
||||
record
|
||||
.iter()
|
||||
.take(entry.end_index + 1)
|
||||
.skip(entry.start_index)
|
||||
.enumerate()
|
||||
.filter_map(|(attr_index, value)| {
|
||||
if **value == "1" {
|
||||
Some(entry.attributes[attr_index].clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.join(&entry.delimiter),
|
||||
));
|
||||
let new_value = record
|
||||
.iter()
|
||||
.take(entry.end_index + 1)
|
||||
.skip(entry.start_index)
|
||||
.enumerate()
|
||||
.filter_map(|(attr_index, value)| {
|
||||
if **value == "1" {
|
||||
Some(entry.attributes[attr_index].clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.join(&entry.delimiter);
|
||||
|
||||
new_record.push(if !new_value.is_empty() {
|
||||
Arc::new(new_value)
|
||||
} else {
|
||||
self.empty_value.clone()
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
use csv::Writer;
|
||||
use csv::WriterBuilder;
|
||||
use log::info;
|
||||
use std::io::Write;
|
||||
use std::{io::Write, sync::Arc};
|
||||
|
||||
#[cfg(feature = "pyo3")]
|
||||
use pyo3::prelude::*;
|
||||
|
@ -10,7 +10,9 @@ use pyo3::prelude::*;
|
|||
use crate::data_block::CsvRecord;
|
||||
|
||||
use crate::{
|
||||
data_block::{CsvIOError, MultiValueColumnMetadataMap, RawData, RawDataMultiValueColumnJoiner},
|
||||
data_block::{
|
||||
CsvIOError, DataBlock, MultiValueColumnMetadataMap, RawData, RawDataMultiValueColumnJoiner,
|
||||
},
|
||||
utils::time::ElapsedDurationLogger,
|
||||
};
|
||||
|
||||
|
@ -60,6 +62,7 @@ impl GeneratedData {
|
|||
&self,
|
||||
writer: &mut T,
|
||||
delimiter: char,
|
||||
empty_value: &str,
|
||||
join_multi_value_columns: bool,
|
||||
long_form: bool,
|
||||
) -> Result<(), CsvIOError> {
|
||||
|
@ -72,6 +75,7 @@ impl GeneratedData {
|
|||
joined_synthetic_data = RawDataMultiValueColumnJoiner::new(
|
||||
&self.synthetic_data,
|
||||
&self.multi_value_column_metadata_map,
|
||||
&Arc::new(empty_value.to_owned()),
|
||||
)
|
||||
.join();
|
||||
&joined_synthetic_data
|
||||
|
@ -161,12 +165,14 @@ impl GeneratedData {
|
|||
/// # Arguments
|
||||
/// * `path` - File path to be written
|
||||
/// * `delimiter` - Delimiter to use when writing to `path`
|
||||
/// * `empty_value` - Empty values will be replaced by this
|
||||
/// * `join_multi_value_columns` - Whether multi value columns should be joined back together or not
|
||||
/// * `long_form` - Pivots column headers and value pairs to key-value row entries.
|
||||
pub fn write_synthetic_data(
|
||||
&self,
|
||||
path: &str,
|
||||
delimiter: char,
|
||||
empty_value: &str,
|
||||
join_multi_value_columns: bool,
|
||||
long_form: bool,
|
||||
) -> Result<(), CsvIOError> {
|
||||
|
@ -178,17 +184,25 @@ impl GeneratedData {
|
|||
|
||||
info!("writing file {}", path);
|
||||
|
||||
self._write_synthetic_data(&mut file, delimiter, join_multi_value_columns, long_form)
|
||||
self._write_synthetic_data(
|
||||
&mut file,
|
||||
delimiter,
|
||||
empty_value,
|
||||
join_multi_value_columns,
|
||||
long_form,
|
||||
)
|
||||
}
|
||||
|
||||
/// Generates a CSV string from the synthetic data
|
||||
/// # Arguments
|
||||
/// * `delimiter` - CSV delimiter to use
|
||||
/// * `empty_value` - Empty values will be replaced by this
|
||||
/// * `join_multi_value_columns` - Whether multi value columns should be joined back together or not
|
||||
/// * `long_form` - Pivots column headers and value pairs to key-value row entries.
|
||||
pub fn synthetic_data_to_string(
|
||||
&self,
|
||||
delimiter: char,
|
||||
empty_value: &str,
|
||||
join_multi_value_columns: bool,
|
||||
long_form: bool,
|
||||
) -> Result<String, CsvIOError> {
|
||||
|
@ -197,6 +211,7 @@ impl GeneratedData {
|
|||
self._write_synthetic_data(
|
||||
&mut csv_data,
|
||||
delimiter,
|
||||
empty_value,
|
||||
join_multi_value_columns,
|
||||
long_form,
|
||||
)?;
|
||||
|
@ -205,22 +220,20 @@ impl GeneratedData {
|
|||
}
|
||||
|
||||
/// Clones the raw synthetic data to a `Vec<Vec<String>>`,
|
||||
/// where the first entry are the headers
|
||||
/// where the first entry is the headers
|
||||
/// # Arguments
|
||||
/// * `empty_value` - Empty values will be replaced by this
|
||||
/// * `join_multi_value_columns` - Whether multi value columns should be joined back together or not
|
||||
pub fn synthetic_data_to_vec(&self, join_multi_value_columns: bool) -> Vec<Vec<String>> {
|
||||
let mut synthetic_data = if join_multi_value_columns {
|
||||
RawDataMultiValueColumnJoiner::new(
|
||||
&self.synthetic_data,
|
||||
&self.multi_value_column_metadata_map,
|
||||
)
|
||||
.join()
|
||||
} else {
|
||||
self.synthetic_data.clone()
|
||||
};
|
||||
synthetic_data
|
||||
.drain(..)
|
||||
.map(|mut record| record.drain(..).map(|value| (*value).clone()).collect())
|
||||
.collect()
|
||||
pub fn synthetic_data_to_vec(
|
||||
&self,
|
||||
empty_value: &str,
|
||||
join_multi_value_columns: bool,
|
||||
) -> Vec<Vec<String>> {
|
||||
DataBlock::raw_data_to_vec(
|
||||
&self.synthetic_data,
|
||||
&Arc::new(empty_value.to_owned()),
|
||||
&self.multi_value_column_metadata_map,
|
||||
join_multi_value_columns,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -256,3 +256,97 @@ fn validate_normalize_reporting_length() {
|
|||
assert_eq!(db.normalize_reporting_length(10), 4);
|
||||
assert_eq!(db.normalize_reporting_length(2), 2);
|
||||
}
|
||||
|
||||
#[test]
fn validate_to_raw_data_vec() {
    // no multi value columns configured, empty values rendered as ""
    let db = read_test_data_block(
        TEST_FILE_PATH,
        DELIMITER,
        None,
        &[],
        &HashMap::default(),
        &[],
        0,
    );
    let raw_data = db.to_raw_data_vec(&Arc::new(String::new()), false);

    // header row first, then one row per record
    let expected: Vec<Vec<String>> = [
        ["A", "B", "C", "D"],
        ["a1", "b1", "c1", "d1"],
        ["a2", "b2", "", "d2"],
        ["a1", "b2", "", "d3"],
    ]
    .iter()
    .map(|row| row.iter().map(|cell| (*cell).to_owned()).collect())
    .collect();

    assert_eq!(raw_data, expected);
}
|
||||
|
||||
#[test]
fn validate_to_raw_data_vec_with_multi_value_columns() {
    // columns C and D are declared as multi value columns
    let db = read_test_data_block(
        TEST_FILE_PATH,
        DELIMITER,
        None,
        &[],
        &[
            ("C".to_owned(), ";".to_owned()),
            ("D".to_owned(), "|".to_owned()),
        ]
        .iter()
        .cloned()
        .collect(),
        &[],
        0,
    );
    let raw_data = db.to_raw_data_vec(&Arc::new("empty".to_owned()), true);

    // after joining, empty cells must show the requested "empty" marker
    let expected: Vec<Vec<String>> = [
        ["A", "B", "C", "D"],
        ["a1", "b1", "c1", "d1"],
        ["a2", "b2", "empty", "d2"],
        ["a1", "b2", "empty", "d3"],
    ]
    .iter()
    .map(|row| row.iter().map(|cell| (*cell).to_owned()).collect())
    .collect();

    assert_eq!(raw_data, expected);
}
|
||||
|
|
|
@ -99,8 +99,12 @@ fn valid_duplicated_id() {
|
|||
}));
|
||||
|
||||
assert!(
|
||||
RawDataMultiValueColumnJoiner::new(&raw_data, &data_block.multi_value_column_metadata_map)
|
||||
.join()
|
||||
RawDataMultiValueColumnJoiner::new(
|
||||
&raw_data,
|
||||
&data_block.multi_value_column_metadata_map,
|
||||
&empty_value,
|
||||
)
|
||||
.join()
|
||||
== expected
|
||||
);
|
||||
}
|
||||
|
|
|
@ -55,8 +55,75 @@ fn validate_multi_value_column_joiner() {
|
|||
}));
|
||||
|
||||
assert!(
|
||||
RawDataMultiValueColumnJoiner::new(&raw_data, &data_block.multi_value_column_metadata_map)
|
||||
.join()
|
||||
RawDataMultiValueColumnJoiner::new(
|
||||
&raw_data,
|
||||
&data_block.multi_value_column_metadata_map,
|
||||
&empty_value,
|
||||
)
|
||||
.join()
|
||||
== expected
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn validate_multi_value_column_joiner_with_different_empty_values() {
    let data_block = read_test_data_block(
        TEST_FILE_PATH,
        DELIMITER,
        None,
        &[],
        &[
            ("B".to_owned(), ";".to_owned()),
            ("C".to_owned(), ";".to_owned()),
            ("D".to_owned(), "|".to_owned()),
        ]
        .iter()
        .cloned()
        .collect(),
        &["B".to_owned()],
        0,
    );

    // "empty" is used when formatting the records, while "empty_multi" is the
    // value handed to the joiner, so the two can be told apart in the output
    let empty_value = Arc::new("empty".to_owned());
    let joiner_empty_value = Arc::new("empty_multi".to_owned());

    let expected = [
        ["ID", "A", "B", "C", "F", "G", "D"],
        ["1", "a1", "0;b1;b3;b4", "c1;c2;c3", "empty", "empty", "d1|d3"],
        ["2", "a1", "b1", "c1", "empty", "empty", "d1"],
        ["3", "a1", "b1", "c1", "empty", "empty", "d1"],
        ["4", "a1", "b1;b2;b3", "c2", "1", "empty", "d2"],
        ["5", "a2", "b1", "c2", "empty", "empty", "d2"],
        ["6", "a2", "b2;b3", "c1", "empty", "empty", "d1|d4"],
        ["7", "a2", "b2", "c2", "1", "empty", "d2"],
        ["8", "a2", "b2", "c2", "empty", "empty", "d3"],
        ["9", "a2", "b2", "c1;c3;c4", "empty", "empty", "d2"],
        ["10", "a2", "empty_multi", "c2", "empty", "empty", "d2"],
        ["11", "a3", "b2", "c2", "empty", "empty", "d2"],
    ]
    .iter()
    .map(|row| row.iter().map(|s| Arc::new((*s).to_owned())).collect_vec())
    .collect_vec();

    // headers first, then every record formatted with the record-level empty value
    let n_headers = data_block.headers.len();
    let mut raw_data = RawData::default();
    raw_data.push(data_block.headers.to_vec());
    raw_data.extend(
        data_block
            .records
            .iter()
            .map(|r| SynthesizerCacheKey::new(n_headers, &r.values).format_record(&empty_value)),
    );

    let joined = RawDataMultiValueColumnJoiner::new(
        &raw_data,
        &data_block.multi_value_column_metadata_map,
        &joiner_empty_value,
    )
    .join();

    assert!(joined == expected);
}
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -9,7 +9,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"execution_count": 92,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -32,7 +32,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"execution_count": 93,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -50,7 +50,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 73,
|
||||
"execution_count": 94,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -66,7 +66,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 74,
|
||||
"execution_count": 95,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -98,7 +98,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 75,
|
||||
"execution_count": 96,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -118,7 +118,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 76,
|
||||
"execution_count": 97,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -170,29 +170,29 @@
|
|||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td>1.008167</td>\n",
|
||||
" <td>2.631000</td>\n",
|
||||
" <td>4.590833</td>\n",
|
||||
" <td>0.511667</td>\n",
|
||||
" <td>0.510500</td>\n",
|
||||
" <td>0.500000</td>\n",
|
||||
" <td>0.496167</td>\n",
|
||||
" <td>0.496500</td>\n",
|
||||
" <td>0.498500</td>\n",
|
||||
" <td>0.496333</td>\n",
|
||||
" <td>1.012500</td>\n",
|
||||
" <td>2.599167</td>\n",
|
||||
" <td>4.537500</td>\n",
|
||||
" <td>0.503000</td>\n",
|
||||
" <td>0.500333</td>\n",
|
||||
" <td>0.506500</td>\n",
|
||||
" <td>0.504667</td>\n",
|
||||
" <td>0.514333</td>\n",
|
||||
" <td>0.502667</td>\n",
|
||||
" <td>0.499833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>0.816626</td>\n",
|
||||
" <td>2.112167</td>\n",
|
||||
" <td>3.340800</td>\n",
|
||||
" <td>0.499906</td>\n",
|
||||
" <td>0.499931</td>\n",
|
||||
" <td>0.817999</td>\n",
|
||||
" <td>2.117998</td>\n",
|
||||
" <td>3.335733</td>\n",
|
||||
" <td>0.500033</td>\n",
|
||||
" <td>0.500042</td>\n",
|
||||
" <td>0.499999</td>\n",
|
||||
" <td>0.500020</td>\n",
|
||||
" <td>0.499836</td>\n",
|
||||
" <td>0.500035</td>\n",
|
||||
" <td>0.500042</td>\n",
|
||||
" <td>0.500027</td>\n",
|
||||
" <td>0.500029</td>\n",
|
||||
" <td>0.500039</td>\n",
|
||||
" <td>0.500028</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
|
@ -210,8 +210,8 @@
|
|||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>2.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
|
@ -223,21 +223,21 @@
|
|||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>3.000000</td>\n",
|
||||
" <td>2.000000</td>\n",
|
||||
" <td>4.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>0.500000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td>2.000000</td>\n",
|
||||
" <td>5.000000</td>\n",
|
||||
" <td>8.000000</td>\n",
|
||||
" <td>4.000000</td>\n",
|
||||
" <td>7.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
|
@ -266,26 +266,26 @@
|
|||
"text/plain": [
|
||||
" H1 H2 H3 H4 H5 \\\n",
|
||||
"count 6000.000000 6000.000000 6000.000000 6000.000000 6000.000000 \n",
|
||||
"mean 1.008167 2.631000 4.590833 0.511667 0.510500 \n",
|
||||
"std 0.816626 2.112167 3.340800 0.499906 0.499931 \n",
|
||||
"mean 1.012500 2.599167 4.537500 0.503000 0.500333 \n",
|
||||
"std 0.817999 2.117998 3.335733 0.500033 0.500042 \n",
|
||||
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"25% 0.000000 1.000000 2.000000 0.000000 0.000000 \n",
|
||||
"50% 1.000000 3.000000 4.000000 1.000000 1.000000 \n",
|
||||
"75% 2.000000 5.000000 8.000000 1.000000 1.000000 \n",
|
||||
"25% 0.000000 0.000000 1.000000 0.000000 0.000000 \n",
|
||||
"50% 1.000000 2.000000 4.000000 1.000000 1.000000 \n",
|
||||
"75% 2.000000 4.000000 7.000000 1.000000 1.000000 \n",
|
||||
"max 2.000000 6.000000 10.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" H6 H7 H8 H9 H10 \n",
|
||||
"count 6000.000000 6000.000000 6000.000000 6000.000000 6000.000000 \n",
|
||||
"mean 0.500000 0.496167 0.496500 0.498500 0.496333 \n",
|
||||
"std 0.500042 0.500027 0.500029 0.500039 0.500028 \n",
|
||||
"mean 0.506500 0.504667 0.514333 0.502667 0.499833 \n",
|
||||
"std 0.499999 0.500020 0.499836 0.500035 0.500042 \n",
|
||||
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"50% 0.500000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"50% 1.000000 1.000000 1.000000 1.000000 0.000000 \n",
|
||||
"75% 1.000000 1.000000 1.000000 1.000000 1.000000 \n",
|
||||
"max 1.000000 1.000000 1.000000 1.000000 1.000000 "
|
||||
]
|
||||
},
|
||||
"execution_count": 76,
|
||||
"execution_count": 97,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -296,7 +296,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 77,
|
||||
"execution_count": 98,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -335,42 +335,42 @@
|
|||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>6024.000000</td>\n",
|
||||
" <td>6024.000000</td>\n",
|
||||
" <td>6024.000000</td>\n",
|
||||
" <td>6024.000000</td>\n",
|
||||
" <td>6024.000000</td>\n",
|
||||
" <td>6024.000000</td>\n",
|
||||
" <td>6024.000000</td>\n",
|
||||
" <td>6024.000000</td>\n",
|
||||
" <td>6024.000000</td>\n",
|
||||
" <td>6024.000000</td>\n",
|
||||
" <td>6003.000000</td>\n",
|
||||
" <td>6003.000000</td>\n",
|
||||
" <td>6003.000000</td>\n",
|
||||
" <td>6003.000000</td>\n",
|
||||
" <td>6003.000000</td>\n",
|
||||
" <td>6003.000000</td>\n",
|
||||
" <td>6003.000000</td>\n",
|
||||
" <td>6003.000000</td>\n",
|
||||
" <td>6003.000000</td>\n",
|
||||
" <td>6003.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td>0.921647</td>\n",
|
||||
" <td>2.335159</td>\n",
|
||||
" <td>4.047643</td>\n",
|
||||
" <td>0.495684</td>\n",
|
||||
" <td>0.481740</td>\n",
|
||||
" <td>0.475100</td>\n",
|
||||
" <td>0.482404</td>\n",
|
||||
" <td>0.469788</td>\n",
|
||||
" <td>0.482902</td>\n",
|
||||
" <td>0.483898</td>\n",
|
||||
" <td>0.913043</td>\n",
|
||||
" <td>2.156755</td>\n",
|
||||
" <td>3.909545</td>\n",
|
||||
" <td>0.489255</td>\n",
|
||||
" <td>0.488089</td>\n",
|
||||
" <td>0.492754</td>\n",
|
||||
" <td>0.478094</td>\n",
|
||||
" <td>0.481759</td>\n",
|
||||
" <td>0.489755</td>\n",
|
||||
" <td>0.479427</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>0.827974</td>\n",
|
||||
" <td>2.155591</td>\n",
|
||||
" <td>3.490394</td>\n",
|
||||
" <td>0.500023</td>\n",
|
||||
" <td>0.499708</td>\n",
|
||||
" <td>0.499421</td>\n",
|
||||
" <td>0.499732</td>\n",
|
||||
" <td>0.499128</td>\n",
|
||||
" <td>0.499749</td>\n",
|
||||
" <td>0.499782</td>\n",
|
||||
" <td>0.832789</td>\n",
|
||||
" <td>2.159173</td>\n",
|
||||
" <td>3.476390</td>\n",
|
||||
" <td>0.499926</td>\n",
|
||||
" <td>0.499900</td>\n",
|
||||
" <td>0.499989</td>\n",
|
||||
" <td>0.499562</td>\n",
|
||||
" <td>0.499709</td>\n",
|
||||
" <td>0.499937</td>\n",
|
||||
" <td>0.499618</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
|
@ -402,7 +402,7 @@
|
|||
" <th>50%</th>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>2.000000</td>\n",
|
||||
" <td>4.000000</td>\n",
|
||||
" <td>3.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
|
@ -443,19 +443,19 @@
|
|||
],
|
||||
"text/plain": [
|
||||
" H1 H2 H3 H4 H5 \\\n",
|
||||
"count 6024.000000 6024.000000 6024.000000 6024.000000 6024.000000 \n",
|
||||
"mean 0.921647 2.335159 4.047643 0.495684 0.481740 \n",
|
||||
"std 0.827974 2.155591 3.490394 0.500023 0.499708 \n",
|
||||
"count 6003.000000 6003.000000 6003.000000 6003.000000 6003.000000 \n",
|
||||
"mean 0.913043 2.156755 3.909545 0.489255 0.488089 \n",
|
||||
"std 0.832789 2.159173 3.476390 0.499926 0.499900 \n",
|
||||
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"50% 1.000000 2.000000 4.000000 0.000000 0.000000 \n",
|
||||
"50% 1.000000 2.000000 3.000000 0.000000 0.000000 \n",
|
||||
"75% 2.000000 4.000000 7.000000 1.000000 1.000000 \n",
|
||||
"max 2.000000 6.000000 10.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" H6 H7 H8 H9 H10 \n",
|
||||
"count 6024.000000 6024.000000 6024.000000 6024.000000 6024.000000 \n",
|
||||
"mean 0.475100 0.482404 0.469788 0.482902 0.483898 \n",
|
||||
"std 0.499421 0.499732 0.499128 0.499749 0.499782 \n",
|
||||
"count 6003.000000 6003.000000 6003.000000 6003.000000 6003.000000 \n",
|
||||
"mean 0.492754 0.478094 0.481759 0.489755 0.479427 \n",
|
||||
"std 0.499989 0.499562 0.499709 0.499937 0.499618 \n",
|
||||
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"50% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
|
@ -463,7 +463,7 @@
|
|||
"max 1.000000 1.000000 1.000000 1.000000 1.000000 "
|
||||
]
|
||||
},
|
||||
"execution_count": 77,
|
||||
"execution_count": 98,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
|
@ -79,7 +79,10 @@ impl DpAggregatedSeededSynthesizer {
|
|||
target_number_of_records,
|
||||
&mut create_progress_reporter(),
|
||||
)?;
|
||||
Ok(generated_data.synthetic_data_to_vec(join_multi_value_columns.unwrap_or(false)))
|
||||
Ok(generated_data.synthetic_data_to_vec(
|
||||
&self._parameters.empty_value,
|
||||
join_multi_value_columns.unwrap_or(false),
|
||||
))
|
||||
} else {
|
||||
Err(PyRuntimeError::new_err(
|
||||
"make sure 'fit' method has been successfully called first",
|
||||
|
|
|
@ -101,6 +101,27 @@ impl Dataset {
|
|||
})
|
||||
}
|
||||
|
||||
pub fn to_raw_data(
|
||||
&self,
|
||||
empty_value: Option<String>,
|
||||
join_multi_value_columns: Option<bool>,
|
||||
) -> DatasetRawData {
|
||||
self.data_block.to_raw_data_vec(
|
||||
&empty_value
|
||||
.map(Arc::new)
|
||||
.unwrap_or_else(|| Arc::new("".to_owned())),
|
||||
join_multi_value_columns.unwrap_or(false),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn to_data_frame(
|
||||
&self,
|
||||
empty_value: Option<String>,
|
||||
join_multi_value_columns: Option<bool>,
|
||||
) -> PyResult<PyObject> {
|
||||
Self::raw_data_to_data_frame(self.to_raw_data(empty_value, join_multi_value_columns))
|
||||
}
|
||||
|
||||
pub fn get_aggregates(
|
||||
&self,
|
||||
reporting_length: usize,
|
||||
|
|
|
@ -51,7 +51,7 @@ impl WasmGenerateResult {
|
|||
long_form: bool,
|
||||
) -> JsResult<String> {
|
||||
self.generated_data
|
||||
.synthetic_data_to_string(delimiter, join_multi_value_columns, long_form)
|
||||
.synthetic_data_to_string(delimiter, "", join_multi_value_columns, long_form)
|
||||
.map_err(|err| JsValue::from(err.to_string()))
|
||||
}
|
||||
|
||||
|
|
|
@ -93,7 +93,8 @@ def generate(config):
|
|||
else:
|
||||
raise ValueError(f'invalid synthesis mode: {synthesis_mode}')
|
||||
|
||||
generated_data.write_synthetic_data(synthetic_microdata_path, '\t', False, False)
|
||||
generated_data.write_synthetic_data(
|
||||
synthetic_microdata_path, '\t', '', False, False)
|
||||
syn_ratio = generated_data.expansion_ratio
|
||||
|
||||
config['expansion_ratio'] = syn_ratio
|
||||
|
|
Загрузка…
Ссылка в новой задаче