This commit is contained in:
urishapira 2022-02-14 16:35:44 +02:00
Родитель 9f00a1f95b
Коммит 28b50bf4d9
3 изменённых файлов: 56 добавлений и 1 удалений

Просмотреть файл

@ -8,7 +8,7 @@ edition = "2018"
clap = "2"
parquet = { git = "https://github.com/rzheka/arrow.git", branch = "dev" }
itertools = "0.8"
serde = "1"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
num-bigint = "0.2"
chrono = "0.4"

Просмотреть файл

@ -101,6 +101,13 @@ fn main() {
.help("Print Kusto schema")
.takes_value(false)
.required(false),
)
.arg(
Arg::with_name("rowgroups")
.long("rowgroups")
.help("Print Row Groups Metadata")
.takes_value(false)
.required(false),
)
.arg(
Arg::with_name("INPUT")
@ -142,6 +149,8 @@ fn main() {
schema::print_schema(input)
} else if matches.is_present("cslschema") {
schema::print_csl_schema(input)
} else if matches.is_present("rowgroups") {
schema::print_row_groups_metadata(input)
} else {
converter::convert(&settings, input, output)
};

Просмотреть файл

@ -8,6 +8,7 @@ use parquet::file::reader::{FileReader, SerializedFileReader};
use parquet::schema::printer::{print_file_metadata, print_parquet_metadata};
use parquet::schema::types::Type;
use serde_json::Value;
use serde::Serialize;
/// Prints Parquet file schema information
///
@ -107,3 +108,48 @@ fn field_csl_schema(field_type: &Type) -> (&str, &str) {
Type::GroupType { ref basic_info, .. } => (basic_info.name(), "dynamic"),
}
}
/// Writes a compact JSON summary of a Parquet file's row groups to stdout.
///
/// The printed object carries the number of row groups under `count` and,
/// under `row_groups`, one entry per row group holding its `numberOfRows`
/// and `totalByteSize`, both rendered as JSON strings.
///
/// Arguments:
///
/// * `input_file` - Parquet file path
///
/// Failures while opening the file, creating the reader, or serializing the
/// summary are propagated to the caller.
pub fn print_row_groups_metadata(input_file: &str) -> Result<(), Box<dyn Error>> {
    let reader = SerializedFileReader::new(File::open(&Path::new(input_file))?)?;
    let metadata = reader.metadata();
    let group_count = metadata.num_row_groups();

    // One JSON object per row group, in the order the metadata lists them.
    let mut entries = Vec::with_capacity(group_count);
    for group in metadata.row_groups() {
        let mut fields = serde_json::Map::with_capacity(2);
        fields.insert(
            "numberOfRows".to_string(),
            Value::String(group.num_rows().to_string()),
        );
        fields.insert(
            "totalByteSize".to_string(),
            Value::String(group.total_byte_size().to_string()),
        );
        entries.push(Value::Object(fields));
    }

    let summary = RowGroups {
        count: group_count,
        row_groups: Value::Array(entries),
    };
    println!("{}", serde_json::to_string(&summary)?);
    Ok(())
}
/// JSON-serializable row groups summary, as assembled and printed by
/// `print_row_groups_metadata`.
///
/// Field names are emitted verbatim as JSON keys by the derived `Serialize`.
#[derive(Serialize)]
pub struct RowGroups {
/// Number of row groups; `print_row_groups_metadata` fills this from the
/// file metadata's `num_row_groups()`.
pub count: usize,
/// Arbitrary JSON value; `print_row_groups_metadata` stores an array with
/// one object per row group (`numberOfRows`, `totalByteSize`).
pub row_groups: Value,
}