Merge pull request #15 from urishapira/gt-parquet-row-groups-data

Get parquet row groups data
This commit is contained in:
Michael Spector 2022-02-15 11:06:48 +02:00 коммит произвёл GitHub
Родитель 9f00a1f95b 7bdca360a3
Коммит c3541e81aa
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 45 добавлений и 2 удалений

Просмотреть файл

@ -2,14 +2,14 @@
<package >
<metadata>
<id>pq2json</id>
<version>0.1.10</version>
<version>0.1.11</version>
<authors>Evgeney Ryzhyk</authors>
<owners>Evgeney Ryzhyk</owners>
<license type="expression">MIT</license>
<projectUrl>https://github.com/Azure/azure-kusto-parquet-conv</projectUrl>
<requireLicenseAcceptance>false</requireLicenseAcceptance>
<description>Parquet to JSON (line delimited) converter tool.</description>
<releaseNotes>Added support for producing empty values for columns missing from the file</releaseNotes>
<releaseNotes>Added optional argument to return row groups metadata</releaseNotes>
<copyright>Copyright 2020</copyright>
<tags></tags>
<dependencies></dependencies>

Просмотреть файл

@ -102,6 +102,13 @@ fn main() {
.takes_value(false)
.required(false),
)
.arg(
Arg::with_name("rowgroups")
.long("rowgroups")
.help("Print Row Groups Metadata")
.takes_value(false)
.required(false),
)
.arg(
Arg::with_name("INPUT")
.help("Input file to use")
@ -142,6 +149,8 @@ fn main() {
schema::print_schema(input)
} else if matches.is_present("cslschema") {
schema::print_csl_schema(input)
} else if matches.is_present("rowgroups") {
schema::print_row_groups_metadata(input)
} else {
converter::convert(&settings, input, output)
};

Просмотреть файл

@ -107,3 +107,37 @@ fn field_csl_schema(field_type: &Type) -> (&str, &str) {
Type::GroupType { ref basic_info, .. } => (basic_info.name(), "dynamic"),
}
}
/// Writes a JSON summary of the row groups in a Parquet file to standard output.
///
/// The output is a single line holding a JSON array with one object per row
/// group. Each object carries two keys, `numberOfRows` and `totalByteSize`,
/// and both values are emitted as JSON strings rather than JSON numbers.
///
/// Arguments:
///
/// * `input_file` - Parquet file path
///
/// Errors raised while opening the file, constructing the Parquet reader or
/// serializing the JSON are returned to the caller.
///
pub fn print_row_groups_metadata(input_file: &str) -> Result<(), Box<dyn Error>> {
    let parquet_file = File::open(Path::new(input_file))?;
    let file_reader = SerializedFileReader::new(parquet_file)?;

    // Build one JSON object per row group, in the order the file reports them.
    let mut entries = Vec::new();
    for group in file_reader.metadata().row_groups().iter() {
        let row_count = group.num_rows().to_string();
        let byte_size = group.total_byte_size().to_string();

        let mut entry = serde_json::Map::with_capacity(2);
        entry.insert("numberOfRows".to_string(), Value::String(row_count));
        entry.insert("totalByteSize".to_string(), Value::String(byte_size));
        entries.push(Value::Object(entry));
    }

    let rendered = serde_json::to_string(&Value::Array(entries))?;
    println!("{}", rendered);
    Ok(())
}