Merge pull request #15 from urishapira/gt-parquet-row-groups-data
Get parquet row groups data
This commit is contained in:
Коммит
c3541e81aa
|
@ -2,14 +2,14 @@
|
|||
<package >
|
||||
<metadata>
|
||||
<id>pq2json</id>
|
||||
<version>0.1.10</version>
|
||||
<version>0.1.11</version>
|
||||
<authors>Evgeney Ryzhyk</authors>
|
||||
<owners>Evgeney Ryzhyk</owners>
|
||||
<license type="expression">MIT</license>
|
||||
<projectUrl>https://github.com/Azure/azure-kusto-parquet-conv</projectUrl>
|
||||
<requireLicenseAcceptance>false</requireLicenseAcceptance>
|
||||
<description>Parquet to JSON (line delimited) converter tool.</description>
|
||||
<releaseNotes>Added support for producing empty values for columns missing from the file</releaseNotes>
|
||||
<releaseNotes>Added optional argument to return row groups metadata</releaseNotes>
|
||||
<copyright>Copyright 2020</copyright>
|
||||
<tags></tags>
|
||||
<dependencies></dependencies>
|
||||
|
|
|
@ -102,6 +102,13 @@ fn main() {
|
|||
.takes_value(false)
|
||||
.required(false),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("rowgroups")
|
||||
.long("rowgroups")
|
||||
.help("Print Row Groups Metadata")
|
||||
.takes_value(false)
|
||||
.required(false),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("INPUT")
|
||||
.help("Input file to use")
|
||||
|
@ -142,6 +149,8 @@ fn main() {
|
|||
schema::print_schema(input)
|
||||
} else if matches.is_present("cslschema") {
|
||||
schema::print_csl_schema(input)
|
||||
} else if matches.is_present("rowgroups") {
|
||||
schema::print_row_groups_metadata(input)
|
||||
} else {
|
||||
converter::convert(&settings, input, output)
|
||||
};
|
||||
|
|
|
@ -107,3 +107,37 @@ fn field_csl_schema(field_type: &Type) -> (&str, &str) {
|
|||
Type::GroupType { ref basic_info, .. } => (basic_info.name(), "dynamic"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Prints limited row groups metadata of a specified Parquet file as JSON,
|
||||
/// for each row group its size in bytes and the number of rows.
|
||||
///
|
||||
/// Arguments:
|
||||
///
|
||||
/// * `input_file` - Parquet file path
|
||||
///
|
||||
pub fn print_row_groups_metadata(input_file: &str) -> Result<(), Box<dyn Error>> {
|
||||
let file = File::open(&Path::new(input_file))?;
|
||||
let reader = SerializedFileReader::new(file)?;
|
||||
let row_groups = Value::Array(
|
||||
reader
|
||||
.metadata()
|
||||
.row_groups()
|
||||
.iter()
|
||||
.map(|row_group_metadata| {
|
||||
let mut map = serde_json::Map::with_capacity(2);
|
||||
map.insert(
|
||||
String::from("numberOfRows"),
|
||||
Value::String(row_group_metadata.num_rows().to_string()),
|
||||
);
|
||||
map.insert(
|
||||
String::from("totalByteSize"),
|
||||
Value::String(row_group_metadata.total_byte_size().to_string()),
|
||||
);
|
||||
Value::Object(map)
|
||||
})
|
||||
.collect_vec(),
|
||||
);
|
||||
|
||||
println!("{}", serde_json::to_string(&row_groups)?);
|
||||
Ok(())
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче