Merge pull request #34 from Azure/timestamp_micros

Support microsecond precision in timestamps
This commit is contained in:
Michael Spector 2024-06-25 14:47:54 +03:00 коммит произвёл GitHub
Родитель 15679f976d 3ce37c100b
Коммит 316f4eddfe
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
618 изменённых файлов: 277440 добавлений и 699 удалений

638
Cargo.lock сгенерированный
Просмотреть файл

@ -1,638 +0,0 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "alloc-no-stdlib"
version = "2.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35ef4730490ad1c4eae5c4325b2a95f521d023e5c885853ff7aca0a6a1631db3"
[[package]]
name = "alloc-stdlib"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "697ed7edc0f1711de49ce108c541623a0af97c6c60b2f6e2b65229847ac843c2"
dependencies = [
"alloc-no-stdlib",
]
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]]
name = "ansi_term"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
"winapi",
]
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"libc",
"winapi",
]
[[package]]
name = "autocfg"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "brotli"
version = "3.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1a0b1dbcc8ae29329621f8d4f0d835787c1c38bb1401979b49d13b0b305ff68"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
"brotli-decompressor",
]
[[package]]
name = "brotli-decompressor"
version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59ad2d4653bf5ca36ae797b1f4bb4dbddb60ce49ca4aed8a2ce4829f60425b80"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
]
[[package]]
name = "bstr"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
dependencies = [
"lazy_static",
"memchr",
"regex-automata",
"serde",
]
[[package]]
name = "bumpalo"
version = "3.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d"
[[package]]
name = "byteorder"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "cc"
version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfd4d1b31faaa3a89d7934dbded3111da0d2ef28e3ebccdb4f0179f5929d1ef1"
dependencies = [
"iana-time-zone",
"js-sys",
"num-integer",
"num-traits",
"time",
"wasm-bindgen",
"winapi",
]
[[package]]
name = "clap"
version = "2.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
dependencies = [
"ansi_term",
"atty",
"bitflags",
"strsim",
"textwrap",
"unicode-width",
"vec_map",
]
[[package]]
name = "core-foundation-sys"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
[[package]]
name = "crc32fast"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
dependencies = [
"cfg-if",
]
[[package]]
name = "csv"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
dependencies = [
"bstr",
"csv-core",
"itoa 0.4.8",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]
[[package]]
name = "either"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
[[package]]
name = "flate2"
version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "hermit-abi"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
dependencies = [
"libc",
]
[[package]]
name = "iana-time-zone"
version = "0.1.47"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c495f162af0bf17656d0014a0eded5f3cd2f365fdd204548c2869db89359dc7"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"js-sys",
"once_cell",
"wasm-bindgen",
"winapi",
]
[[package]]
name = "integer-encoding"
version = "1.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48dc51180a9b377fd75814d0cc02199c20f8e99433d6762f650d39cdbbd3b56f"
[[package]]
name = "itertools"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
[[package]]
name = "itoa"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
[[package]]
name = "js-sys"
version = "0.3.59"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "258451ab10b34f8af53416d1fdab72c22e805f0c92a1136d59470ec0b11138b2"
dependencies = [
"wasm-bindgen",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.132"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5"
[[package]]
name = "log"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [
"cfg-if",
]
[[package]]
name = "lz4"
version = "1.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e9e2dd86df36ce760a60f6ff6ad526f7ba1f14ba0356f8254fb6905e6494df1"
dependencies = [
"libc",
"lz4-sys",
]
[[package]]
name = "lz4-sys"
version = "1.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "miniz_oxide"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc"
dependencies = [
"adler",
]
[[package]]
name = "num-bigint"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-bigint"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f6f7833f2cbf2360a6cfd58cd41a53aa7a90bd4c202f5b1c7dd2ed73c57b2c3"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
dependencies = [
"autocfg",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
dependencies = [
"autocfg",
]
[[package]]
name = "num_cpus"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "once_cell"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "074864da206b4973b84eb91683020dbefd6a8c3f0f38e054d93954e891935e4e"
[[package]]
name = "ordered-float"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3305af35278dd29f46fcdd139e0b1fbfae2153f0e5928b39b035542dd31e37b7"
dependencies = [
"num-traits",
]
[[package]]
name = "parquet"
version = "4.0.0-SNAPSHOT"
source = "git+https://github.com/rzheka/arrow.git?branch=dev#0b9b97461178c4ea35569e3ac6d2ce929a62e42f"
dependencies = [
"brotli",
"byteorder",
"chrono",
"flate2",
"lz4",
"num-bigint 0.3.3",
"parquet-format",
"snap",
"thrift",
]
[[package]]
name = "parquet-format"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5bc6b23543b5dedc8f6cce50758a35e5582e148e0cfa26bd0cacd569cda5b71"
dependencies = [
"thrift",
]
[[package]]
name = "pq2json"
version = "0.1.0"
dependencies = [
"chrono",
"clap",
"csv",
"itertools",
"num-bigint 0.2.6",
"parquet",
"ryu",
"serde",
"serde_json",
]
[[package]]
name = "proc-macro2"
version = "1.0.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex-automata"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
[[package]]
name = "ryu"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09"
[[package]]
name = "serde"
version = "1.0.144"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f747710de3dcd43b88c9168773254e809d8ddbdf9653b84e2554ab219f17860"
[[package]]
name = "serde_json"
version = "1.0.85"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44"
dependencies = [
"itoa 1.0.3",
"ryu",
"serde",
]
[[package]]
name = "snap"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451"
[[package]]
name = "strsim"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "syn"
version = "1.0.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "textwrap"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
dependencies = [
"unicode-width",
]
[[package]]
name = "threadpool"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa"
dependencies = [
"num_cpus",
]
[[package]]
name = "thrift"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c6d965454947cc7266d22716ebfd07b18d84ebaf35eec558586bbb2a8cb6b5b"
dependencies = [
"byteorder",
"integer-encoding",
"log",
"ordered-float",
"threadpool",
]
[[package]]
name = "time"
version = "0.1.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255"
dependencies = [
"libc",
"wasi",
"winapi",
]
[[package]]
name = "unicode-ident"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf"
[[package]]
name = "unicode-width"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973"
[[package]]
name = "vec_map"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "wasi"
version = "0.10.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
[[package]]
name = "wasm-bindgen"
version = "0.2.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc7652e3f6c4706c8d9cd54832c4a4ccb9b5336e2c3bd154d5cccfbf1c1f5f7d"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "662cd44805586bd52971b9586b1df85cdbbd9112e4ef4d8f41559c334dc6ac3f"
dependencies = [
"bumpalo",
"log",
"once_cell",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b260f13d3012071dfb1512849c033b1925038373aea48ced3012c09df952c602"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5be8e654bdd9b79216c2929ab90721aa82faf65c48cdf08bdc4e7f51357b80da"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6598dd0bd3c7d51095ff6531a5b23e02acdc81804e30d8f07afb77b7215a140a"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

Просмотреть файл

@ -1,13 +0,0 @@
[workspace]
members = [
"pq2json"
]
[profile.dev]
[profile.release]
debug = true
lto = true
codegen-units = 1
opt-level = 3
debug-assertions = false

299
KUSTO.patch Normal file
Просмотреть файл

@ -0,0 +1,299 @@
Only in arrow-rs: Cargo.lock
diff -ru arrow-rs.orig/parquet/src/data_type.rs arrow-rs/parquet/src/data_type.rs
--- arrow-rs.orig/parquet/src/data_type.rs 2024-06-25 13:10:49.853750300 +0300
+++ arrow-rs/parquet/src/data_type.rs 2024-06-25 13:46:47.955666200 +0300
@@ -62,6 +62,12 @@
seconds * 1_000 + nanoseconds / 1_000_000
}
+ /// Converts this INT96 into an i64 representing the number of MICROSECONDS since Epoch
+ pub fn to_micros(&self) -> i64 {
+ let (seconds, nanoseconds) = self.to_seconds_and_nanos();
+ seconds * 1_000_000 + nanoseconds / 1_000
+ }
+
/// Converts this INT96 into an i64 representing the number of NANOSECONDS since EPOCH
///
/// Will wrap around on overflow
diff -ru arrow-rs.orig/parquet/src/record/api.rs arrow-rs/parquet/src/record/api.rs
--- arrow-rs.orig/parquet/src/record/api.rs 2024-06-25 13:10:49.870755100 +0300
+++ arrow-rs/parquet/src/record/api.rs 2024-06-25 14:03:23.583254500 +0300
@@ -132,8 +132,58 @@
}
}
+/// Type of the field within the `Row`.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
+pub enum FieldType {
+ /// Null
+ Null,
+ /// Boolean value (`true`, `false`).
+ Bool,
+ /// Signed integer INT_8.
+ Byte,
+ /// Signed integer INT_16.
+ Short,
+ /// Signed integer INT_32.
+ Int,
+ /// Signed integer INT_64.
+ Long,
+ /// Unsigned integer UINT_8.
+ UByte,
+ /// Unsigned integer UINT_16.
+ UShort,
+ /// Unsigned integer UINT_32.
+ UInt,
+ /// Unsigned integer UINT_64.
+ ULong,
+ /// IEEE 32-bit floating point value.
+ Float,
+ /// IEEE 64-bit floating point value.
+ Double,
+ /// Decimal value.
+ Decimal,
+ /// UTF-8 encoded character string.
+ Str,
+ /// General binary value.
+ Bytes,
+ /// Date without a time of day, stores the number of days from the
+ /// Unix epoch, 1 January 1970.
+ Date,
+ /// Milliseconds from the Unix epoch, 1 January 1970.
+ TimestampMillis,
+ /// Microseconds from the Unix epoch, 1 January 1970.
+ TimestampMicros,
+ /// Struct, child elements are tuples of field-value pairs.
+ Group,
+ /// List of elements.
+ List,
+ /// List of key-value pairs.
+ Map,
+}
+
/// Trait for type-safe convenient access to fields within a Row.
pub trait RowAccessor {
+ fn get_field_type(&self, i: usize) -> FieldType;
+ fn get_field_name(&self, i: usize) -> &str;
fn get_bool(&self, i: usize) -> Result<bool>;
fn get_byte(&self, i: usize) -> Result<i8>;
fn get_short(&self, i: usize) -> Result<i16>;
@@ -148,6 +198,7 @@
fn get_double(&self, i: usize) -> Result<f64>;
fn get_timestamp_millis(&self, i: usize) -> Result<i64>;
fn get_timestamp_micros(&self, i: usize) -> Result<i64>;
+ fn get_date(&self, i: usize) -> Result<i32>;
fn get_decimal(&self, i: usize) -> Result<&Decimal>;
fn get_string(&self, i: usize) -> Result<&String>;
fn get_bytes(&self, i: usize) -> Result<&ByteArray>;
@@ -220,6 +271,14 @@
}
impl RowAccessor for Row {
+ fn get_field_type(&self, i: usize) -> FieldType {
+ self.fields[i].1.get_field_type()
+ }
+
+ fn get_field_name(&self, i: usize) -> &str {
+ &self.fields[i].0
+ }
+
row_primitive_accessor!(get_bool, Bool, bool);
row_primitive_accessor!(get_byte, Byte, i8);
@@ -248,6 +307,8 @@
row_primitive_accessor!(get_timestamp_micros, TimestampMicros, i64);
+ row_primitive_accessor!(get_date, Date, i32);
+
row_complex_accessor!(get_decimal, Decimal, Decimal);
row_complex_accessor!(get_string, Str, String);
@@ -309,6 +370,7 @@
/// Trait for type-safe access of an index for a `List`.
/// Note that the get_XXX methods do not do bound checking.
pub trait ListAccessor {
+ fn get_element_type(&self, i: usize) -> FieldType;
fn get_bool(&self, i: usize) -> Result<bool>;
fn get_byte(&self, i: usize) -> Result<i8>;
fn get_short(&self, i: usize) -> Result<i16>;
@@ -323,6 +385,7 @@
fn get_double(&self, i: usize) -> Result<f64>;
fn get_timestamp_millis(&self, i: usize) -> Result<i64>;
fn get_timestamp_micros(&self, i: usize) -> Result<i64>;
+ fn get_date(&self, i: usize) -> Result<i32>;
fn get_decimal(&self, i: usize) -> Result<&Decimal>;
fn get_string(&self, i: usize) -> Result<&String>;
fn get_bytes(&self, i: usize) -> Result<&ByteArray>;
@@ -366,6 +429,10 @@
}
impl ListAccessor for List {
+ fn get_element_type(&self, i: usize) -> FieldType {
+ self.elements[i].get_field_type()
+ }
+
list_primitive_accessor!(get_bool, Bool, bool);
list_primitive_accessor!(get_byte, Byte, i8);
@@ -394,6 +461,8 @@
list_primitive_accessor!(get_timestamp_micros, TimestampMicros, i64);
+ list_primitive_accessor!(get_date, Date, i32);
+
list_complex_accessor!(get_decimal, Decimal, Decimal);
list_complex_accessor!(get_string, Str, String);
@@ -433,6 +502,8 @@
/// Trait for type-safe access of an index for a `Map`
pub trait MapAccessor {
+ fn key_type(&self, i: usize) -> FieldType;
+ fn value_type(&self, i: usize) -> FieldType;
fn get_keys<'a>(&'a self) -> Box<dyn ListAccessor + 'a>;
fn get_values<'a>(&'a self) -> Box<dyn ListAccessor + 'a>;
}
@@ -459,6 +530,10 @@
}
impl<'a> ListAccessor for MapList<'a> {
+ fn get_element_type(&self, i: usize) -> FieldType {
+ self.elements[i].get_field_type()
+ }
+
map_list_primitive_accessor!(get_bool, Bool, bool);
map_list_primitive_accessor!(get_byte, Byte, i8);
@@ -487,6 +562,8 @@
map_list_primitive_accessor!(get_timestamp_micros, TimestampMicros, i64);
+ map_list_primitive_accessor!(get_date, Date, i32);
+
list_complex_accessor!(get_decimal, Decimal, Decimal);
list_complex_accessor!(get_string, Str, String);
@@ -501,6 +578,14 @@
}
impl MapAccessor for Map {
+ fn key_type(&self, i: usize) -> FieldType {
+ self.entries[i].0.get_field_type()
+ }
+
+ fn value_type(&self, i: usize) -> FieldType {
+ self.entries[i].1.get_field_type()
+ }
+
fn get_keys<'a>(&'a self) -> Box<dyn ListAccessor + 'a> {
let map_list = MapList {
elements: self.entries.iter().map(|v| &v.0).collect(),
@@ -599,6 +684,33 @@
}
}
+ pub fn get_field_type(&self) -> FieldType {
+ match self {
+ Field::Null => FieldType::Null,
+ Field::Bool(_) => FieldType::Bool,
+ Field::Byte(_) => FieldType::Byte,
+ Field::Short(_) => FieldType::Short,
+ Field::Int(_) => FieldType::Int,
+ Field::Long(_) => FieldType::Long,
+ Field::UByte(_) => FieldType::UByte,
+ Field::UShort(_) => FieldType::UShort,
+ Field::UInt(_) => FieldType::UInt,
+ Field::ULong(_) => FieldType::ULong,
+ Field::Float(_) => FieldType::Float,
+ Field::Float16(_) => FieldType::Float,
+ Field::Double(_) => FieldType::Double,
+ Field::Decimal(_) => FieldType::Decimal,
+ Field::Str(_) => FieldType::Str,
+ Field::Bytes(_) => FieldType::Bytes,
+ Field::TimestampMillis(_) => FieldType::TimestampMillis,
+ Field::TimestampMicros(_) => FieldType::TimestampMicros,
+ Field::Date(_) => FieldType::Date,
+ Field::Group(_) => FieldType::Group,
+ Field::ListInternal(_) => FieldType::List,
+ Field::MapInternal(_) => FieldType::Map,
+ }
+ }
+
/// Determines if this Row represents a primitive value.
pub fn is_primitive(&self) -> bool {
!matches!(
@@ -654,7 +766,13 @@
/// `Timestamp` value.
#[inline]
pub fn convert_int96(_descr: &ColumnDescPtr, value: Int96) -> Self {
- Field::TimestampMillis(value.to_i64())
+ let micros = value.to_micros();
+ if micros < 0 {
+ // XXX: Temporary workaround for negative timestamps -- return 1970-01-01T00:00:00Z
+ Field::TimestampMicros(0)
+ } else {
+ Field::TimestampMicros(micros as i64)
+ }
}
/// Converts Parquet FLOAT type with logical type into `f32` value.
@@ -1050,8 +1168,23 @@
let value = Int96::from(vec![4165425152, 13, 2454923]);
let row = Field::convert_int96(&descr, value);
- assert_eq!(row, Field::TimestampMillis(1238544060000));
+ assert_eq!(row, Field::TimestampMicros(1238544060000000));
+
+ // Negative int96
+ let value = Int96::from(vec![0, 0, 0]);
+ let row = Field::convert_int96(&descr, value);
+ assert_eq!(row, Field::TimestampMicros(0));
}
+ //#[test]
+ //#[should_panic(expected = "Expected non-negative milliseconds when converting Int96")]
+ //fn test_row_convert_int96_invalid() {
+ // INT96 value does not depend on logical type
+ // let descr = make_column_descr![PhysicalType::INT96, LogicalType::NONE];
+
+ // let value = Int96::from(vec![0, 0, 0]);
+ // Field::convert_int96(&descr, value);
+ //}
+
#[test]
fn test_row_convert_float() {
// FLOAT value does not depend on logical type
diff -ru arrow-rs.orig/parquet/src/record/mod.rs arrow-rs/parquet/src/record/mod.rs
--- arrow-rs.orig/parquet/src/record/mod.rs 2024-06-25 13:10:49.870755100 +0300
+++ arrow-rs/parquet/src/record/mod.rs 2024-06-25 12:49:09.422781500 +0300
@@ -25,7 +25,8 @@
pub use self::{
api::{
- Field, List, ListAccessor, Map, MapAccessor, Row, RowAccessor, RowColumnIter, RowFormatter,
+ Field, FieldType, List, ListAccessor, Map, MapAccessor, Row, RowAccessor, RowColumnIter,
+ RowFormatter,
},
record_reader::RecordReader,
record_writer::RecordWriter,
diff -ru arrow-rs.orig/parquet/src/record/reader.rs arrow-rs/parquet/src/record/reader.rs
--- arrow-rs.orig/parquet/src/record/reader.rs 2024-06-25 13:10:49.871756000 +0300
+++ arrow-rs/parquet/src/record/reader.rs 2024-06-25 12:50:25.274021200 +0300
@@ -400,9 +400,15 @@
fn read_field(&mut self) -> Result<Field> {
let field = match *self {
Reader::PrimitiveReader(_, ref mut column) => {
- let value = column.current_value()?;
- column.read_next()?;
- value
+ // It's not obvious why some bytearray (string) fields are read through this PrimitiveReader,
+ // the following condition adds support for nullable fields:
+ if column.is_null() {
+ Field::Null
+ } else {
+ let value = column.current_value()?;
+ column.read_next()?;
+ value
+ }
}
Reader::OptionReader(def_level, ref mut reader) => {
if reader.current_def_level() > def_level {

Просмотреть файл

@ -2,19 +2,19 @@
<package >
<metadata>
<id>pq2json_linux</id>
<version>0.1.21</version>
<version>0.1.22</version>
<authors>Evgeney Ryzhyk</authors>
<owners>Evgeney Ryzhyk</owners>
<license type="expression">MIT</license>
<projectUrl>https://github.com/Azure/azure-kusto-parquet-conv</projectUrl>
<requireLicenseAcceptance>false</requireLicenseAcceptance>
<description>Parquet to JSON (line delimited) converter tool.</description>
<releaseNotes>Accept --columns as a JSON array.</releaseNotes>
<releaseNotes>Support timestamps up to microsecond precision.</releaseNotes>
<copyright>Copyright 2020</copyright>
<tags></tags>
<dependencies></dependencies>
</metadata>
<files>
<file src="target\release\pq2json.exe" target="tools\linux" />
<file src="pq2json\target\release\pq2json.exe" target="tools\linux" />
</files>
</package>

Просмотреть файл

@ -2,19 +2,19 @@
<package >
<metadata>
<id>pq2json</id>
<version>0.1.21</version>
<version>0.1.22</version>
<authors>Evgeney Ryzhyk</authors>
<owners>Evgeney Ryzhyk</owners>
<license type="expression">MIT</license>
<projectUrl>https://github.com/Azure/azure-kusto-parquet-conv</projectUrl>
<requireLicenseAcceptance>false</requireLicenseAcceptance>
<description>Parquet to JSON (line delimited) converter tool.</description>
<releaseNotes>Accept --columns as a JSON array.</releaseNotes>
<releaseNotes>Support timestamps up to microsecond precision.</releaseNotes>
<copyright>Copyright 2020</copyright>
<tags></tags>
<dependencies></dependencies>
</metadata>
<files>
<file src="target\release\pq2json.exe" target="tools\x64" />
<file src="pq2json\target\release\pq2json.exe" target="tools\x64" />
</files>
</package>

Просмотреть файл

@ -1,5 +1,11 @@
Parquet to JSON (line delimited) converter tool.
# Dependencies
The project is based on [arrow-rs](https://github.com/apache/arrow-rs) library.
A clone of that repository is included in this repository under the `arrow-rs` directory.
The file `KUSTO.patch` contains all custom patches applied on the original `arrow-rs` code.
# Building
`cargo build --release`

47
arrow-rs/.asf.yaml Normal file
Просмотреть файл

@ -0,0 +1,47 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Documentation can be found here:
# https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=127405038
notifications:
commits: commits@arrow.apache.org
issues: github@arrow.apache.org
pullrequests: github@arrow.apache.org
jira_options: link label worklog
github:
description: "Official Rust implementation of Apache Arrow"
homepage: https://arrow.apache.org/
enabled_merge_buttons:
squash: true
merge: false
rebase: false
features:
issues: true
protected_branches:
master:
required_status_checks:
# require branches to be up-to-date before merging
strict: true
# don't require any jobs to pass
contexts: []
# publishes the content of the `asf-site` branch to
# https://arrow.apache.org/rust/
publish:
whoami: asf-site
subdir: rust

3
arrow-rs/.gitattributes поставляемый Normal file
Просмотреть файл

@ -0,0 +1,3 @@
parquet/src/format.rs linguist-generated
arrow-flight/src/arrow.flight.protocol.rs linguist-generated
arrow-flight/src/sql/arrow.flight.protocol.sql.rs linguist-generated

28
arrow-rs/.github/ISSUE_TEMPLATE/bug_report.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,28 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: bug
assignees: ''
---
**Describe the bug**
<!--
A clear and concise description of what the bug is.
-->
**To Reproduce**
<!--
Steps to reproduce the behavior:
-->
**Expected behavior**
<!--
A clear and concise description of what you expected to happen.
-->
**Additional context**
<!--
Add any other context about the problem here.
-->

29
arrow-rs/.github/ISSUE_TEMPLATE/feature_request.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,29 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: enhancement
assignees: ''
---
**Is your feature request related to a problem or challenge? Please describe what you are trying to do.**
<!--
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
(This section helps Arrow developers understand the context and *why* for this feature, in addition to the *what*)
-->
**Describe the solution you'd like**
<!--
A clear and concise description of what you want to happen.
-->
**Describe alternatives you've considered**
<!--
A clear and concise description of any alternative solutions or features you've considered.
-->
**Additional context**
<!--
Add any other context or screenshots about the feature request here.
-->

23
arrow-rs/.github/ISSUE_TEMPLATE/question.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,23 @@
---
name: Question
about: Ask question about this project
title: ''
labels: question
assignees: ''
---
**Which part is this question about**
<!--
Is it code base, library api, documentation or some other part?
-->
**Describe your question**
<!--
A clear and concise description of what the question is.
-->
**Additional context**
<!--
Add any other context about the problem here.
-->

71
arrow-rs/.github/actions/setup-builder/action.yaml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,71 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: Prepare Rust Builder
description: 'Prepare Rust Build Environment'
inputs:
rust-version:
description: 'version of rust to install (e.g. stable)'
required: false
default: 'stable'
target:
description: 'target architecture(s)'
required: false
default: 'x86_64-unknown-linux-gnu'
runs:
using: "composite"
steps:
- name: Cache Cargo
uses: actions/cache@v3
with:
# these represent dependencies downloaded by cargo
# and thus do not depend on the OS, arch nor rust version.
#
# source https://github.com/actions/cache/blob/main/examples.md#rust---cargo
path: |
/usr/local/cargo/bin/
/usr/local/cargo/registry/index/
/usr/local/cargo/registry/cache/
/usr/local/cargo/git/db/
key: cargo-cache3-${{ hashFiles('**/Cargo.toml') }}
restore-keys: cargo-cache3-
- name: Generate lockfile
shell: bash
run: cargo fetch
- name: Install Build Dependencies
shell: bash
run: |
apt-get update
apt-get install -y protobuf-compiler
- name: Setup Rust toolchain
shell: bash
run: |
echo "Installing ${{ inputs.rust-version }}"
rustup toolchain install ${{ inputs.rust-version }} --target ${{ inputs.target }}
rustup default ${{ inputs.rust-version }}
- name: Disable debuginfo generation
# Disable full debug symbol generation to speed up CI build and keep memory down
# "1" means line tables only, which is useful for panic tracebacks.
shell: bash
run: echo "RUSTFLAGS=-C debuginfo=1" >> $GITHUB_ENV
- name: Enable backtraces
shell: bash
run: echo "RUST_BACKTRACE=1" >> $GITHUB_ENV
- name: Fixup git permissions
# https://github.com/actions/checkout/issues/766
shell: bash
run: git config --global --add safe.directory "$GITHUB_WORKSPACE"

22
arrow-rs/.github/dependabot.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,22 @@
version: 2
updates:
- package-ecosystem: cargo
directory: "/"
schedule:
interval: daily
open-pull-requests-limit: 10
target-branch: master
labels: [ auto-dependencies, arrow ]
- package-ecosystem: cargo
directory: "/object_store"
schedule:
interval: daily
open-pull-requests-limit: 10
target-branch: master
labels: [ auto-dependencies, object_store ]
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"
open-pull-requests-limit: 10
labels: [ auto-dependencies ]

31
arrow-rs/.github/pull_request_template.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,31 @@
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123.
-->
Closes #.
# Rationale for this change
<!--
Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes.
-->
# What changes are included in this PR?
<!--
There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR.
-->
# Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be updated before approving the PR.
-->
<!---
If there are any breaking changes to public APIs, please add the `breaking change` label.
-->

25
arrow-rs/.github/workflows/README.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,25 @@
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
The CI is structured so most tests are run in specific workflows:
`arrow.yml` for `arrow`, `parquet.yml` for `parquet` and so on.
The basic idea is to run all tests on pushes to master (to ensure we
keep master green) but run only the individual workflows on PRs that
change files that could affect them.

199
arrow-rs/.github/workflows/arrow.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,199 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# tests for arrow crate
name: arrow
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
on:
# always trigger
push:
branches:
- master
pull_request:
paths:
- .github/**
- arrow-arith/**
- arrow-array/**
- arrow-buffer/**
- arrow-cast/**
- arrow-csv/**
- arrow-data/**
- arrow-integration-test/**
- arrow-ipc/**
- arrow-json/**
- arrow-avro/**
- arrow-ord/**
- arrow-row/**
- arrow-schema/**
- arrow-select/**
- arrow-string/**
- arrow/**
jobs:
# test the crate
linux-test:
name: Test
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Test arrow-buffer with all features
run: cargo test -p arrow-buffer --all-features
- name: Test arrow-data with all features
run: cargo test -p arrow-data --all-features
- name: Test arrow-schema with all features
run: cargo test -p arrow-schema --all-features
- name: Test arrow-array with all features
run: cargo test -p arrow-array --all-features
- name: Test arrow-select with all features
run: cargo test -p arrow-select --all-features
- name: Test arrow-cast with all features
run: cargo test -p arrow-cast --all-features
- name: Test arrow-ipc with all features
run: cargo test -p arrow-ipc --all-features
- name: Test arrow-csv with all features
run: cargo test -p arrow-csv --all-features
- name: Test arrow-json with all features
run: cargo test -p arrow-json --all-features
- name: Test arrow-avro with all features
run: cargo test -p arrow-avro --all-features
- name: Test arrow-string with all features
run: cargo test -p arrow-string --all-features
- name: Test arrow-ord with all features
run: cargo test -p arrow-ord --all-features
- name: Test arrow-arith with all features
run: cargo test -p arrow-arith --all-features
- name: Test arrow-row with all features
run: cargo test -p arrow-row --all-features
- name: Test arrow-integration-test with all features
run: cargo test -p arrow-integration-test --all-features
- name: Test arrow with default features
run: cargo test -p arrow
- name: Test arrow with all features except pyarrow
run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,chrono-tz
- name: Run examples
run: |
# Test arrow examples
cargo run --example builders
cargo run --example dynamic_types
cargo run --example read_csv
cargo run --example read_csv_infer_schema
- name: Run non-archery based integration-tests
run: cargo test -p arrow-integration-testing
# test compilation features
linux-features:
name: Check Compilation
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Check compilation
run: cargo check -p arrow
- name: Check compilation --no-default-features
run: cargo check -p arrow --no-default-features
- name: Check compilation --all-targets
run: cargo check -p arrow --all-targets
- name: Check compilation --no-default-features --all-targets
run: cargo check -p arrow --no-default-features --all-targets
- name: Check compilation --no-default-features --all-targets --features test_utils
run: cargo check -p arrow --no-default-features --all-targets --features test_utils
- name: Check compilation --no-default-features --all-targets --features ffi
run: cargo check -p arrow --no-default-features --all-targets --features ffi
- name: Check compilation --no-default-features --all-targets --features chrono-tz
run: cargo check -p arrow --no-default-features --all-targets --features chrono-tz
# test the arrow crate builds against wasm32 in nightly rust
wasm32-build:
name: Build wasm32
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
target: wasm32-unknown-unknown,wasm32-wasi
- name: Build wasm32-unknown-unknown
run: cargo build -p arrow --no-default-features --features=json,csv,ipc,ffi --target wasm32-unknown-unknown
- name: Build wasm32-wasi
run: cargo build -p arrow --no-default-features --features=json,csv,ipc,ffi --target wasm32-wasi
clippy:
name: Clippy
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Setup Clippy
run: rustup component add clippy
- name: Clippy arrow-buffer with all features
run: cargo clippy -p arrow-buffer --all-targets --all-features -- -D warnings
- name: Clippy arrow-data with all features
run: cargo clippy -p arrow-data --all-targets --all-features -- -D warnings
- name: Clippy arrow-schema with all features
run: cargo clippy -p arrow-schema --all-targets --all-features -- -D warnings
- name: Clippy arrow-array with all features
run: cargo clippy -p arrow-array --all-targets --all-features -- -D warnings
- name: Clippy arrow-select with all features
run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings
- name: Clippy arrow-cast with all features
run: cargo clippy -p arrow-cast --all-targets --all-features -- -D warnings
- name: Clippy arrow-ipc with all features
run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings
- name: Clippy arrow-csv with all features
run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings
- name: Clippy arrow-json with all features
run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings
- name: Clippy arrow-avro with all features
run: cargo clippy -p arrow-avro --all-targets --all-features -- -D warnings
- name: Clippy arrow-string with all features
run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings
- name: Clippy arrow-ord with all features
run: cargo clippy -p arrow-ord --all-targets --all-features -- -D warnings
- name: Clippy arrow-arith with all features
run: cargo clippy -p arrow-arith --all-targets --all-features -- -D warnings
- name: Clippy arrow-row with all features
run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings
- name: Clippy arrow with all features
run: cargo clippy -p arrow --all-features --all-targets -- -D warnings
- name: Clippy arrow-integration-test with all features
run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings
- name: Clippy arrow-integration-testing with all features
run: cargo clippy -p arrow-integration-testing --all-targets --all-features -- -D warnings

91
arrow-rs/.github/workflows/arrow_flight.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,91 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
---
# tests for arrow_flight crate
name: arrow_flight
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# trigger for all PRs that touch certain files and changes to master
on:
push:
branches:
- master
pull_request:
paths:
- arrow-array/**
- arrow-buffer/**
- arrow-cast/**
- arrow-data/**
- arrow-flight/**
- arrow-ipc/**
- arrow-schema/**
- arrow-select/**
- .github/**
jobs:
linux-test:
name: Test
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Test
run: |
cargo test -p arrow-flight
- name: Test --all-features
run: |
cargo test -p arrow-flight --all-features
- name: Test --examples
run: |
cargo test -p arrow-flight --features=flight-sql-experimental,tls --examples
vendor:
name: Verify Vendored Code
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Run gen
run: ./arrow-flight/regen.sh
- name: Verify workspace clean (if this fails, run ./arrow-flight/regen.sh and check in results)
run: git diff --exit-code
clippy:
name: Clippy
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Setup Clippy
run: rustup component add clippy
- name: Run clippy
run: cargo clippy -p arrow-flight --all-targets --all-features -- -D warnings

43
arrow-rs/.github/workflows/audit.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,43 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: audit
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# trigger for all PRs that touch certain files and changes to master
on:
push:
branches:
- master
pull_request:
paths:
- '**/Cargo.toml'
- '**/Cargo.lock'
jobs:
cargo-audit:
name: Audit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install cargo-audit
run: cargo install cargo-audit
- name: Run audit check
run: cargo audit

67
arrow-rs/.github/workflows/coverage.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,67 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: coverage
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# Trigger only on pushes to master, not pull requests
on:
push:
branches:
- master
jobs:
coverage:
name: Coverage
runs-on: ubuntu-latest
# Note runs outside of a container
# otherwise we get this error:
# Failed to run tests: ASLR disable failed: EPERM: Operation not permitted
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
run: |
rustup toolchain install stable
rustup default stable
- name: Install protobuf compiler in /protoc
run: |
sudo mkdir /protoc
sudo chmod a+rwx /protoc
cd /protoc
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protoc-21.4-linux-x86_64.zip
unzip protoc-21.4-linux-x86_64.zip
- name: Cache Cargo
uses: actions/cache@v4
with:
path: /home/runner/.cargo
key: cargo-coverage-cache3-
- name: Run coverage
run: |
export PATH=$PATH:/protoc/bin
rustup toolchain install stable
rustup default stable
cargo install --version 0.18.2 cargo-tarpaulin
cargo tarpaulin --all --out Xml
- name: Report coverage
continue-on-error: true
run: bash <(curl -s https://codecov.io/bash)

61
arrow-rs/.github/workflows/dev.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,61 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: dev
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# trigger for all PRs and changes to master
on:
push:
branches:
- master
pull_request:
env:
ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
jobs:
rat:
name: Release Audit Tool (RAT)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: 3.8
- name: Audit licenses
run: ./dev/release/run-rat.sh .
prettier:
name: Markdown format
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: "14"
- name: Prettier check
run: |
# if you encounter error, run the command below and commit the changes
npx prettier@2.3.2 --write {arrow,arrow-flight,dev,arrow-integration-testing,parquet}/**/*.md README.md CODE_OF_CONDUCT.md CONTRIBUTING.md
git diff --exit-code

51
arrow-rs/.github/workflows/dev_pr.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,51 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: dev_pr
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# Trigger whenever a PR is changed (title as well as new / changed commits)
on:
pull_request_target:
types:
- opened
- edited
- synchronize
jobs:
process:
name: Process
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
steps:
- uses: actions/checkout@v4
- name: Assign GitHub labels
if: |
github.event_name == 'pull_request_target' &&
(github.event.action == 'opened' ||
github.event.action == 'synchronize')
uses: actions/labeler@v5.0.0
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
configuration-path: .github/workflows/dev_pr/labeler.yml
sync-labels: true

55
arrow-rs/.github/workflows/dev_pr/labeler.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,55 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
arrow:
- changed-files:
- any-glob-to-any-file:
- 'arrow-arith/**/*'
- 'arrow-array/**/*'
- 'arrow-buffer/**/*'
- 'arrow-cast/**/*'
- 'arrow-csv/**/*'
- 'arrow-data/**/*'
- 'arrow-flight/**/*'
- 'arrow-integration-test/**/*'
- 'arrow-integration-testing/**/*'
- 'arrow-ipc/**/*'
- 'arrow-json/**/*'
- 'arrow-avro/**/*'
- 'arrow-ord/**/*'
- 'arrow-row/**/*'
- 'arrow-schema/**/*'
- 'arrow-select/**/*'
- 'arrow-string/**/*'
- 'arrow/**/*'
arrow-flight:
- changed-files:
- any-glob-to-any-file:
- 'arrow-flight/**/*'
parquet:
- changed-files:
- any-glob-to-any-file: [ 'parquet/**/*' ]
parquet-derive:
- changed-files:
- any-glob-to-any-file: [ 'parquet_derive/**/*' ]
object-store:
- changed-files:
- any-glob-to-any-file: [ 'object_store/**/*' ]

97
arrow-rs/.github/workflows/docs.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,97 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: docs
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# trigger for all PRs and changes to master
on:
push:
branches:
- master
pull_request:
jobs:
# test doc links still work
docs:
name: Rustdocs are clean
runs-on: ubuntu-latest
strategy:
matrix:
arch: [ amd64 ]
rust: [ nightly ]
container:
image: ${{ matrix.arch }}/rust
env:
RUSTDOCFLAGS: "-Dwarnings --enable-index-page -Zunstable-options"
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Install python dev
run: |
apt update
apt install -y libpython3.11-dev
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: ${{ matrix.rust }}
- name: Run cargo doc
run: cargo doc --document-private-items --no-deps --workspace --all-features
- name: Fix file permissions
shell: sh
run: |
chmod -c -R +rX "target/doc" |
while read line; do
echo "::warning title=Invalid file permissions automatically fixed::$line"
done
- name: Upload artifacts
uses: actions/upload-pages-artifact@v3
with:
name: crate-docs
path: target/doc
deploy:
# Only deploy if a push to master
if: github.ref_name == 'master' && github.event_name == 'push'
needs: docs
permissions:
contents: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Download crate docs
uses: actions/download-artifact@v4
with:
name: crate-docs
path: website/build
- name: Prepare website
run: |
tar -xf website/build/artifact.tar -C website/build
rm website/build/artifact.tar
cp .asf.yaml ./website/build/.asf.yaml
- name: Deploy to gh-pages
uses: peaceiris/actions-gh-pages@v4.0.0
if: github.event_name == 'push' && github.ref_name == 'master'
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: website/build
publish_branch: asf-site

163
arrow-rs/.github/workflows/integration.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,163 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: integration
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# trigger for all PRs that touch certain files and changes to master
on:
push:
branches:
- master
pull_request:
paths:
- .github/**
- arrow-array/**
- arrow-buffer/**
- arrow-cast/**
- arrow-csv/**
- arrow-data/**
- arrow-integration-test/**
- arrow-integration-testing/**
- arrow-ipc/**
- arrow-json/**
- arrow-avro/**
- arrow-ord/**
- arrow-pyarrow-integration-testing/**
- arrow-schema/**
- arrow-select/**
- arrow-sort/**
- arrow-string/**
- arrow/**
jobs:
integration:
name: Archery test With other arrows
runs-on: ubuntu-latest
container:
image: apache/arrow-dev:amd64-conda-integration
env:
ARROW_USE_CCACHE: OFF
ARROW_CPP_EXE_PATH: /build/cpp/debug
ARROW_NANOARROW_PATH: /build/nanoarrow
ARROW_RUST_EXE_PATH: /build/rust/debug
BUILD_DOCS_CPP: OFF
ARROW_INTEGRATION_CPP: ON
ARROW_INTEGRATION_CSHARP: ON
ARROW_INTEGRATION_GO: ON
ARROW_INTEGRATION_JAVA: ON
ARROW_INTEGRATION_JS: ON
ARCHERY_INTEGRATION_WITH_NANOARROW: "1"
# https://github.com/apache/arrow/pull/38403/files#r1371281630
ARCHERY_INTEGRATION_WITH_RUST: "1"
# These are necessary because the github runner overrides $HOME
# https://github.com/actions/runner/issues/863
RUSTUP_HOME: /root/.rustup
CARGO_HOME: /root/.cargo
defaults:
run:
shell: bash
steps:
# This is necessary so that actions/checkout can find git
- name: Export conda path
run: echo "/opt/conda/envs/arrow/bin" >> $GITHUB_PATH
# This is necessary so that Rust can find cargo
- name: Export cargo path
run: echo "/root/.cargo/bin" >> $GITHUB_PATH
- name: Check rustup
run: which rustup
- name: Check cmake
run: which cmake
- name: Checkout Arrow
uses: actions/checkout@v4
with:
repository: apache/arrow
submodules: true
fetch-depth: 0
- name: Checkout Arrow Rust
uses: actions/checkout@v4
with:
path: rust
fetch-depth: 0
- name: Checkout Arrow nanoarrow
uses: actions/checkout@v4
with:
repository: apache/arrow-nanoarrow
path: nanoarrow
fetch-depth: 0
# Workaround https://github.com/rust-lang/rust/issues/125067
- name: Downgrade rust
working-directory: rust
run: rustup override set 1.77
- name: Build
run: conda run --no-capture-output ci/scripts/integration_arrow_build.sh $PWD /build
- name: Run
run: conda run --no-capture-output ci/scripts/integration_arrow.sh $PWD /build
# test FFI against the C-Data interface exposed by pyarrow
pyarrow-integration-test:
name: Pyarrow C Data Interface
runs-on: ubuntu-latest
strategy:
matrix:
rust: [ stable ]
# PyArrow 13 was the last version prior to introduction to Arrow PyCapsules
pyarrow: [ "13", "14" ]
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
run: |
rustup toolchain install ${{ matrix.rust }}
rustup default ${{ matrix.rust }}
rustup component add rustfmt clippy
- name: Cache Cargo
uses: actions/cache@v4
with:
path: /home/runner/.cargo
key: cargo-maturin-cache-
- name: Cache Rust dependencies
uses: actions/cache@v4
with:
path: /home/runner/target
# this key is not equal because maturin uses different compilation flags.
key: ${{ runner.os }}-${{ matrix.arch }}-target-maturin-cache-${{ matrix.rust }}-
- uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: Upgrade pip and setuptools
run: pip install --upgrade pip setuptools wheel virtualenv
- name: Create virtualenv and install dependencies
run: |
virtualenv venv
source venv/bin/activate
pip install maturin toml pytest pytz pyarrow==${{ matrix.pyarrow }}
- name: Run Rust tests
run: |
source venv/bin/activate
cargo test -p arrow --test pyarrow --features pyarrow
- name: Run tests
run: |
source venv/bin/activate
cd arrow-pyarrow-integration-testing
maturin develop
pytest -v .

20
arrow-rs/.github/workflows/miri.sh поставляемый Normal file
Просмотреть файл

@ -0,0 +1,20 @@
#!/bin/bash
#
# Script
#
# Must be run with nightly rust for example
# rustup default nightly
set -e
export MIRIFLAGS="-Zmiri-disable-isolation"
cargo miri setup
cargo clean
echo "Starting Arrow MIRI run..."
cargo miri test -p arrow-buffer
cargo miri test -p arrow-data --features ffi
cargo miri test -p arrow-schema --features ffi
cargo miri test -p arrow-ord
cargo miri test -p arrow-array
cargo miri test -p arrow-arith

62
arrow-rs/.github/workflows/miri.yaml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,62 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: miri
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# trigger for all PRs that touch certain files and changes to master
on:
push:
branches:
- master
pull_request:
paths:
- .github/**
- arrow-array/**
- arrow-buffer/**
- arrow-cast/**
- arrow-csv/**
- arrow-data/**
- arrow-ipc/**
- arrow-json/**
- arrow-avro/**
- arrow-schema/**
- arrow-select/**
- arrow-string/**
- arrow/**
jobs:
miri-checks:
name: MIRI
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
run: |
rustup toolchain install nightly --component miri
rustup override set nightly
cargo miri setup
- name: Run Miri Checks
env:
RUST_BACKTRACE: full
RUST_LOG: "trace"
run: bash .github/workflows/miri.sh

200
arrow-rs/.github/workflows/object_store.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,200 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
---
# tests for `object_store` crate
name: object_store
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# trigger for all PRs that touch certain files and changes to master
on:
push:
branches:
- master
pull_request:
paths:
- object_store/**
- .github/**
jobs:
clippy:
name: Clippy
runs-on: ubuntu-latest
container:
image: amd64/rust
defaults:
run:
working-directory: object_store
steps:
- uses: actions/checkout@v4
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Setup Clippy
run: rustup component add clippy
# Run different tests for the library on its own as well as
# all targets to ensure that it still works in the absence of
# features that might be enabled by dev-dependencies of other
# targets.
- name: Run clippy with default features
run: cargo clippy -- -D warnings
- name: Run clippy with aws feature
run: cargo clippy --features aws -- -D warnings
- name: Run clippy with gcp feature
run: cargo clippy --features gcp -- -D warnings
- name: Run clippy with azure feature
run: cargo clippy --features azure -- -D warnings
- name: Run clippy with http feature
run: cargo clippy --features http -- -D warnings
- name: Run clippy with all features
run: cargo clippy --all-features -- -D warnings
- name: Run clippy with all features and all targets
run: cargo clippy --all-features --all-targets -- -D warnings
# test doc links still work
#
# Note that since object_store is not part of the main workspace,
# this needs a separate docs job as it is not covered by
# `cargo doc --workspace`
docs:
name: Rustdocs
runs-on: ubuntu-latest
defaults:
run:
working-directory: object_store
env:
RUSTDOCFLAGS: "-Dwarnings"
steps:
- uses: actions/checkout@v4
- name: Run cargo doc
run: cargo doc --document-private-items --no-deps --workspace --all-features
# test the crate
# This runs outside a container to workaround lack of support for passing arguments
# to service containers - https://github.com/orgs/community/discussions/26688
linux-test:
name: Emulator Tests
runs-on: ubuntu-latest
defaults:
run:
working-directory: object_store
env:
# Disable full debug symbol generation to speed up CI build and keep memory down
# "1" means line tables only, which is useful for panic tracebacks.
RUSTFLAGS: "-C debuginfo=1"
RUST_BACKTRACE: "1"
# Run integration tests
TEST_INTEGRATION: 1
EC2_METADATA_ENDPOINT: http://localhost:1338
AZURE_CONTAINER_NAME: test-bucket
AZURE_STORAGE_USE_EMULATOR: "1"
AZURITE_BLOB_STORAGE_URL: "http://localhost:10000"
AZURITE_QUEUE_STORAGE_URL: "http://localhost:10001"
AWS_BUCKET: test-bucket
AWS_DEFAULT_REGION: "us-east-1"
AWS_ACCESS_KEY_ID: test
AWS_SECRET_ACCESS_KEY: test
AWS_ENDPOINT: http://localhost:4566
AWS_ALLOW_HTTP: true
AWS_COPY_IF_NOT_EXISTS: dynamo:test-table:2000
AWS_CONDITIONAL_PUT: dynamo:test-table:2000
AWS_SERVER_SIDE_ENCRYPTION: aws:kms
HTTP_URL: "http://localhost:8080"
GOOGLE_BUCKET: test-bucket
GOOGLE_SERVICE_ACCOUNT: "/tmp/gcs.json"
steps:
- uses: actions/checkout@v4
# We are forced to use docker commands instead of service containers as we need to override the entrypoints
# which is currently not supported - https://github.com/actions/runner/discussions/1872
- name: Configure Fake GCS Server (GCP emulation)
# Custom image - see fsouza/fake-gcs-server#1164
run: |
echo "GCS_CONTAINER=$(docker run -d -p 4443:4443 tustvold/fake-gcs-server -scheme http -backend memory -public-host localhost:4443)" >> $GITHUB_ENV
# Give the container a moment to start up prior to configuring it
sleep 1
curl -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b"
echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": "", "private_key_id": ""}' > "$GOOGLE_SERVICE_ACCOUNT"
- name: Setup WebDav
run: docker run -d -p 8080:80 rclone/rclone serve webdav /data --addr :80
- name: Setup LocalStack (AWS emulation)
run: |
echo "LOCALSTACK_CONTAINER=$(docker run -d -p 4566:4566 localstack/localstack:3.3.0)" >> $GITHUB_ENV
echo "EC2_METADATA_CONTAINER=$(docker run -d -p 1338:1338 amazon/amazon-ec2-metadata-mock:v1.9.2 --imdsv2)" >> $GITHUB_ENV
aws --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket
aws --endpoint-url=http://localhost:4566 dynamodb create-table --table-name test-table --key-schema AttributeName=path,KeyType=HASH AttributeName=etag,KeyType=RANGE --attribute-definitions AttributeName=path,AttributeType=S AttributeName=etag,AttributeType=S --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5
KMS_KEY=$(aws --endpoint-url=http://localhost:4566 kms create-key --description "test key")
echo "AWS_SSE_KMS_KEY_ID=$(echo $KMS_KEY | jq -r .KeyMetadata.KeyId)" >> $GITHUB_ENV
- name: Configure Azurite (Azure emulation)
# the magical connection string is from
# https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio#http-connection-strings
run: |
echo "AZURITE_CONTAINER=$(docker run -d -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azure-storage/azurite)" >> $GITHUB_ENV
az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://localhost:10000/devstoreaccount1;QueueEndpoint=http://localhost:10001/devstoreaccount1;'
- name: Setup Rust toolchain
run: |
rustup toolchain install stable
rustup default stable
- name: Run object_store tests
run: cargo test --features=aws,azure,gcp,http
- name: GCS Output
if: ${{ !cancelled() }}
run: docker logs $GCS_CONTAINER
- name: LocalStack Output
if: ${{ !cancelled() }}
run: docker logs $LOCALSTACK_CONTAINER
- name: EC2 Metadata Output
if: ${{ !cancelled() }}
run: docker logs $EC2_METADATA_CONTAINER
- name: Azurite Output
if: ${{ !cancelled() }}
run: docker logs $AZURITE_CONTAINER
# test the object_store crate builds against wasm32 in stable rust
wasm32-build:
name: Build wasm32
runs-on: ubuntu-latest
container:
image: amd64/rust
defaults:
run:
working-directory: object_store
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
target: wasm32-unknown-unknown,wasm32-wasi
- name: Build wasm32-unknown-unknown
run: cargo build --target wasm32-unknown-unknown
- name: Build wasm32-wasi
run: cargo build --target wasm32-wasi

180
arrow-rs/.github/workflows/parquet.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,180 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
---
# tests for parquet crate
name: "parquet"
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# trigger for all PRs that touch certain files and changes to master
on:
push:
branches:
- master
pull_request:
paths:
- arrow/**
- arrow-array/**
- arrow-buffer/**
- arrow-cast/**
- arrow-data/**
- arrow-schema/**
- arrow-select/**
- arrow-ipc/**
- arrow-csv/**
- arrow-json/**
- arrow-avro/**
- parquet/**
- .github/**
jobs:
# test the crate
linux-test:
name: Test
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Test
run: cargo test -p parquet
- name: Test --all-features
run: cargo test -p parquet --all-features
- name: Run examples
run: |
# Test parquet examples
cargo run -p parquet --example read_parquet
cargo run -p parquet --example async_read_parquet --features="async"
cargo run -p parquet --example read_with_rowgroup --features="async"
# test compilation
linux-features:
name: Check Compilation
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
# Run different tests for the library on its own as well as
# all targets to ensure that it still works in the absence of
# features that might be enabled by dev-dependencies of other
# targets.
#
# This for each of (library and all-targets), check
# 1. compiles with default features
# 1. compiles with no default features
# 3. compiles with just arrow feature
# 3. compiles with all features
- name: Check compilation
run: cargo check -p parquet
- name: Check compilation --no-default-features
run: cargo check -p parquet --no-default-features
- name: Check compilation --no-default-features --features arrow
run: cargo check -p parquet --no-default-features --features arrow
- name: Check compilation --no-default-features --all-features
run: cargo check -p parquet --all-features
- name: Check compilation --all-targets
run: cargo check -p parquet --all-targets
- name: Check compilation --all-targets --no-default-features
run: cargo check -p parquet --all-targets --no-default-features
- name: Check compilation --all-targets --no-default-features --features arrow
run: cargo check -p parquet --all-targets --no-default-features --features arrow
- name: Check compilation --all-targets --all-features
run: cargo check -p parquet --all-targets --all-features
- name: Check compilation --all-targets --no-default-features --features json
run: cargo check -p parquet --all-targets --no-default-features --features json
# test the parquet crate builds against wasm32 in stable rust
wasm32-build:
name: Build wasm32
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
target: wasm32-unknown-unknown,wasm32-wasi
- name: Install clang # Needed for zlib compilation
run: apt-get update && apt-get install -y clang gcc-multilib
- name: Build wasm32-unknown-unknown
run: cargo build -p parquet --target wasm32-unknown-unknown
- name: Build wasm32-wasi
run: cargo build -p parquet --target wasm32-wasi
pyspark-integration-test:
name: PySpark Integration Test
runs-on: ubuntu-latest
strategy:
matrix:
rust: [ stable ]
steps:
- uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: "pip"
- name: Install Python dependencies
run: |
cd parquet/pytest
pip install -r requirements.txt
- name: Black check the test files
run: |
cd parquet/pytest
black --check *.py --verbose
- name: Setup Rust toolchain
run: |
rustup toolchain install ${{ matrix.rust }}
rustup default ${{ matrix.rust }}
- name: Install binary for checking
run: |
cargo install --path parquet --bin parquet-show-bloom-filter --features=cli
cargo install --path parquet --bin parquet-fromcsv --features=arrow,cli
- name: Run pytest
run: |
cd parquet/pytest
pytest -v
clippy:
name: Clippy
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Setup Clippy
run: rustup component add clippy
- name: Run clippy
run: cargo clippy -p parquet --all-targets --all-features -- -D warnings

66
arrow-rs/.github/workflows/parquet_derive.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,66 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
---
# tests for parquet_derive crate
name: parquet_derive
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# trigger for all PRs that touch certain files and changes to master
on:
push:
branches:
- master
pull_request:
paths:
- parquet/**
- parquet_derive/**
- parquet_derive_test/**
- .github/**
jobs:
# test the crate
linux-test:
name: Test
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Test
run: cargo test -p parquet_derive
clippy:
name: Clippy
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Setup Clippy
run: rustup component add clippy
- name: Run clippy
run: cargo clippy -p parquet_derive --all-features -- -D warnings

138
arrow-rs/.github/workflows/rust.yml поставляемый Normal file
Просмотреть файл

@ -0,0 +1,138 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# workspace wide tests
name: rust
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
# trigger for all PRs and changes to master
on:
push:
branches:
- master
pull_request:
jobs:
# Check workspace wide compile and test with default features for
# mac
macos:
name: Test on Mac
runs-on: macos-latest
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Install protoc with brew
run: brew install protobuf
- name: Setup Rust toolchain
run: |
rustup toolchain install stable --no-self-update
rustup default stable
- name: Run tests
shell: bash
run: |
# do not produce debug symbols to keep memory usage down
export RUSTFLAGS="-C debuginfo=0"
cargo test
# Check workspace wide compile and test with default features for
# windows
windows:
name: Test on Windows
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Install protobuf compiler in /d/protoc
shell: bash
run: |
mkdir /d/protoc
cd /d/protoc
curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protoc-21.4-win64.zip
unzip protoc-21.4-win64.zip
export PATH=$PATH:/d/protoc/bin
protoc --version
- name: Setup Rust toolchain
run: |
rustup toolchain install stable --no-self-update
rustup default stable
- name: Run tests
shell: bash
run: |
# do not produce debug symbols to keep memory usage down
export RUSTFLAGS="-C debuginfo=0"
export PATH=$PATH:/d/protoc/bin
cargo test
# Run cargo fmt for all crates
lint:
name: Lint (cargo fmt)
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Setup rustfmt
run: rustup component add rustfmt
- name: Format arrow
run: cargo fmt --all -- --check
- name: Format object_store
working-directory: object_store
run: cargo fmt --all -- --check
msrv:
name: Verify MSRV
runs-on: ubuntu-latest
container:
image: amd64/rust
steps:
- uses: actions/checkout@v4
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Install cargo-msrv
run: cargo install cargo-msrv
- name: Downgrade arrow dependencies
run: cargo update -p ahash --precise 0.8.7
- name: Check arrow
working-directory: arrow
run: cargo msrv --log-target stdout verify
- name: Check parquet
working-directory: parquet
run: cargo msrv --log-target stdout verify
- name: Check arrow-flight
working-directory: arrow-flight
run: cargo msrv --log-target stdout verify
- name: Downgrade object_store dependencies
working-directory: object_store
# Necessary because tokio 1.30.0 updates MSRV to 1.63
# and url 2.5.1, updates to 1.67
run: |
cargo update -p tokio --precise 1.29.1
cargo update -p url --precise 2.5.0
- name: Check object_store
working-directory: object_store
run: cargo msrv --log-target stdout verify

Просмотреть файл

@ -0,0 +1,28 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Add special sections for documentation, security and performance
add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]},"security":{"prefix":"**Security updates:**","labels":["security"]},"performance":{"prefix":"**Performance improvements:**","labels":["performance"]}}
# uncomment to not show PRs. TBD if we shown them or not.
#pull-requests=false
# so that the component is shown associated with the issue
issue-line-labels=arrow,parquet,arrow-flight
exclude-labels=development-process,invalid,object-store,question
breaking_labels=api-change

99
arrow-rs/.gitignore поставляемый Normal file
Просмотреть файл

@ -0,0 +1,99 @@
Cargo.lock
target
rusty-tags.vi
.history
.flatbuffers/
.idea/
.vscode
.devcontainer
venv/*
# created by doctests
parquet/data.parquet
# release notes cache
.githubchangeloggenerator.cache
.githubchangeloggenerator.cache.log
justfile
.prettierignore
.env
.editorconfig
# local azurite file
__azurite*
__blobstorage__
# .bak files
*.bak
*.bak2
# OS-specific .gitignores
# Mac .gitignore
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# Linux .gitignore
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
# Windows .gitignore
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk
# Python virtual env in parquet crate
parquet/pytest/venv/
__pycache__/

6
arrow-rs/.gitmodules поставляемый Normal file
Просмотреть файл

@ -0,0 +1,6 @@
[submodule "testing"]
path = testing
url = https://github.com/apache/arrow-testing
[submodule "parquet-testing"]
path = parquet-testing
url = https://github.com/apache/parquet-testing.git

Просмотреть файл

@ -0,0 +1,69 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# To use this, install the python package `pre-commit` and
# run once `pre-commit install`. This will setup a git pre-commit-hook
# that is executed on each commit and will report the linting problems.
# To run all hooks on all files use `pre-commit run -a`
repos:
- repo: local
hooks:
- id: rat
name: Release Audit Tool
language: system
entry: bash -c "git archive HEAD --prefix=apache-arrow/ --output=arrow-src.tar && ./dev/release/run-rat.sh arrow-src.tar"
always_run: true
pass_filenames: false
- id: rustfmt
name: Rust Format
language: system
entry: bash -c "cargo +stable fmt --all -- --check"
files: ^.*\.rs$
types:
- file
- rust
- id: cmake-format
name: CMake Format
language: python
entry: python run-cmake-format.py
types: [cmake]
additional_dependencies:
- cmake_format==0.5.2
- id: hadolint
name: Docker Format
language: docker_image
types:
- dockerfile
entry: --entrypoint /bin/hadolint hadolint/hadolint:latest -
exclude: ^dev/.*$
- repo: git://github.com/pre-commit/pre-commit-hooks
sha: v1.2.3
hooks:
- id: flake8
name: Python Format
files: ^(python|dev|integration)/
types:
- file
- python
- id: flake8
name: Cython Format
files: ^python/
types:
- file
- cython
args: [--config=python/.flake8.cython]

4374
arrow-rs/CHANGELOG-old.md Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

207
arrow-rs/CHANGELOG.md Normal file
Просмотреть файл

@ -0,0 +1,207 @@
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Changelog
## [52.0.0](https://github.com/apache/arrow-rs/tree/52.0.0) (2024-06-03)
[Full Changelog](https://github.com/apache/arrow-rs/compare/51.0.0...52.0.0)
**Breaking changes:**
- chore: Make binary\_mut kernel accept different type for second arg [\#5833](https://github.com/apache/arrow-rs/pull/5833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
- fix\(flightsql\): remove Any encoding of `DoPutPreparedStatementResult` [\#5817](https://github.com/apache/arrow-rs/pull/5817) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([erratic-pattern](https://github.com/erratic-pattern))
- Encode UUID as FixedLenByteArray in parquet\_derive [\#5773](https://github.com/apache/arrow-rs/pull/5773) ([conradludgate](https://github.com/conradludgate))
- Structured interval types for `IntervalMonthDayNano` or `IntervalDayTime` \(\#3125\) \(\#5654\) [\#5769](https://github.com/apache/arrow-rs/pull/5769) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
- Fallible stream for arrow-flight do\_exchange call \(\#3462\) [\#5698](https://github.com/apache/arrow-rs/pull/5698) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([opensourcegeek](https://github.com/opensourcegeek))
- Update object\_store dependency in arrow to `0.10.0` [\#5675](https://github.com/apache/arrow-rs/pull/5675) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
- Remove deprecated JSON writer [\#5651](https://github.com/apache/arrow-rs/pull/5651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
- Change `UnionArray` constructors [\#5623](https://github.com/apache/arrow-rs/pull/5623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([mbrobbel](https://github.com/mbrobbel))
- Update py03 from 0.20 to 0.21 [\#5566](https://github.com/apache/arrow-rs/pull/5566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey))
- Optionally require alignment when reading IPC, respect alignment when writing [\#5554](https://github.com/apache/arrow-rs/pull/5554) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([hzuo](https://github.com/hzuo))
**Implemented enhancements:**
- Serialize `Binary` and `LargeBinary` as HEX with JSON Writer [\#5783](https://github.com/apache/arrow-rs/issues/5783) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Some optimizations in arrow\_buffer::util::bit\_util do more harm than good [\#5771](https://github.com/apache/arrow-rs/issues/5771) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Support skipping comments in CSV files [\#5758](https://github.com/apache/arrow-rs/issues/5758) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- `parquet-derive` should be included in repository README. [\#5751](https://github.com/apache/arrow-rs/issues/5751)
- proposal: Make AsyncArrowWriter accepts AsyncFileWriter trait instead [\#5738](https://github.com/apache/arrow-rs/issues/5738) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
- Nested nullable fields do not get treated as nullable in data\_gen [\#5712](https://github.com/apache/arrow-rs/issues/5712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Optionally support flexible column lengths [\#5678](https://github.com/apache/arrow-rs/issues/5678) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Arrow Flight SQL example server: do\_handshake should include auth header [\#5665](https://github.com/apache/arrow-rs/issues/5665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
- Add support for the "r+" datatype in the C Data interface / `RunArray` [\#5631](https://github.com/apache/arrow-rs/issues/5631) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Serialize `FixedSizeBinary` as HEX with JSON Writer [\#5620](https://github.com/apache/arrow-rs/issues/5620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Cleanup UnionArray Constructors [\#5613](https://github.com/apache/arrow-rs/issues/5613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
- Zero Copy Support [\#5593](https://github.com/apache/arrow-rs/issues/5593)
- ObjectStore bulk delete [\#5591](https://github.com/apache/arrow-rs/issues/5591)
- Retry on Broken Connection [\#5589](https://github.com/apache/arrow-rs/issues/5589)
- `StreamReader` is not zero-copy [\#5584](https://github.com/apache/arrow-rs/issues/5584) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Create `ArrowReaderMetadata` from externalized metadata [\#5582](https://github.com/apache/arrow-rs/issues/5582) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
- Make `filter` in `filter_leaves` API propagate error [\#5574](https://github.com/apache/arrow-rs/issues/5574) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Support `List` in `compare_op` [\#5572](https://github.com/apache/arrow-rs/issues/5572)
- Make FixedSizedList Json serializable [\#5568](https://github.com/apache/arrow-rs/issues/5568) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- arrow-ord: Support sortting StructArray [\#5559](https://github.com/apache/arrow-rs/issues/5559) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Add scientific notation decimal parsing in `parse_decimal` [\#5549](https://github.com/apache/arrow-rs/issues/5549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- `take` kernel support for `StringViewArray` and `BinaryViewArray` [\#5511](https://github.com/apache/arrow-rs/issues/5511) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- `filter` kernel support for `StringViewArray` and `BinaryViewArray` [\#5510](https://github.com/apache/arrow-rs/issues/5510) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Display support for `StringViewArray` and `BinaryViewArray` [\#5509](https://github.com/apache/arrow-rs/issues/5509) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Arrow Flight format support for `StringViewArray` and `BinaryViewArray` [\#5507](https://github.com/apache/arrow-rs/issues/5507) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
- IPC format support for `StringViewArray` and `BinaryViewArray` [\#5506](https://github.com/apache/arrow-rs/issues/5506) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
**Fixed bugs:**
- List Row Encoding Sorts Incorrectly [\#5807](https://github.com/apache/arrow-rs/issues/5807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Schema Root Message Name Ignored by parquet-fromcsv [\#5804](https://github.com/apache/arrow-rs/issues/5804) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
- Compute data buffer length by using start and end values in offset buffer [\#5756](https://github.com/apache/arrow-rs/issues/5756) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- parquet: ByteArrayEncoder allocates large unused FallbackEncoder for Parquet 2 [\#5755](https://github.com/apache/arrow-rs/issues/5755) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
- The CI pipeline `Archery test With other arrow` is broken [\#5742](https://github.com/apache/arrow-rs/issues/5742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Unable to parse scientific notation string to decimal when scale is 0 [\#5739](https://github.com/apache/arrow-rs/issues/5739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Stateless prepared statements wrap `DoPutPreparedStatementResult` with `Any` which is differs from Go implementation [\#5731](https://github.com/apache/arrow-rs/issues/5731) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
- "Rustdocs are clean \(amd64, nightly\)" CI check is failing [\#5725](https://github.com/apache/arrow-rs/issues/5725) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- "Archery test With other arrows" integration tests are failing [\#5719](https://github.com/apache/arrow-rs/issues/5719) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- parquet\_derive: invalid examples/documentation [\#5687](https://github.com/apache/arrow-rs/issues/5687)
- Arrow FLight SQL: invalid location in get\_flight\_info\_prepared\_statement [\#5669](https://github.com/apache/arrow-rs/issues/5669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
- Rust Interval definition incorrect [\#5654](https://github.com/apache/arrow-rs/issues/5654) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- DECIMAL regex in csv reader does not accept positive exponent specifier [\#5648](https://github.com/apache/arrow-rs/issues/5648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- panic when casting `ListArray` to `FixedSizeList` [\#5642](https://github.com/apache/arrow-rs/issues/5642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- FixedSizeListArray::try\_new Errors on Entirely Null Array With Size 0 [\#5614](https://github.com/apache/arrow-rs/issues/5614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- `parquet / Build wasm32 (pull_request)` CI check failing on main [\#5565](https://github.com/apache/arrow-rs/issues/5565) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Documentation fix: example in parquet/src/column/mod.rs is incorrect [\#5560](https://github.com/apache/arrow-rs/issues/5560) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
- IPC code writes data with insufficient alignment [\#5553](https://github.com/apache/arrow-rs/issues/5553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
- Cannot access example Flight SQL Server from dbeaver [\#5540](https://github.com/apache/arrow-rs/issues/5540) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
- parquet: "not yet implemented" error when codec is actually implemented but disabled [\#5520](https://github.com/apache/arrow-rs/issues/5520) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
**Documentation updates:**
- Minor: Improve arrow\_cast documentation [\#5825](https://github.com/apache/arrow-rs/pull/5825) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
- Minor: Improve `ArrowReaderBuilder::with_row_selection` docs [\#5824](https://github.com/apache/arrow-rs/pull/5824) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
- Minor: Add examples for ColumnPath::from [\#5813](https://github.com/apache/arrow-rs/pull/5813) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
- Minor: Clarify docs on `EnabledStatistics` [\#5812](https://github.com/apache/arrow-rs/pull/5812) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
- Add parquet-derive to repository README [\#5795](https://github.com/apache/arrow-rs/pull/5795) ([konjac](https://github.com/konjac))
- Refine ParquetRecordBatchReaderBuilder docs [\#5774](https://github.com/apache/arrow-rs/pull/5774) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
- docs: add sizing explanation to bloom filter docs in parquet [\#5705](https://github.com/apache/arrow-rs/pull/5705) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hiltontj](https://github.com/hiltontj))
**Closed issues:**
- `binary_mut` kernel requires both args to be the same type \(which is inconsistent with `binary`\) [\#5818](https://github.com/apache/arrow-rs/issues/5818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
- Panic when displaying debug the results via log::info in the browser. [\#5599](https://github.com/apache/arrow-rs/issues/5599) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
**Merged pull requests:**
- feat: impl \*Assign ops for types in arrow-buffer [\#5832](https://github.com/apache/arrow-rs/pull/5832) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waynexia](https://github.com/waynexia))
- Relax zstd-sys Version Pin [\#5829](https://github.com/apache/arrow-rs/pull/5829) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([waynexia](https://github.com/waynexia))
- Minor: Document timestamp with/without cast behavior [\#5826](https://github.com/apache/arrow-rs/pull/5826) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
- fix: invalid examples/documentation in parquet\_derive doc [\#5823](https://github.com/apache/arrow-rs/pull/5823) ([Weijun-H](https://github.com/Weijun-H))
- Check length of `FIXED_LEN_BYTE_ARRAY` for `uuid` logical parquet type [\#5821](https://github.com/apache/arrow-rs/pull/5821) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([mbrobbel](https://github.com/mbrobbel))
- Allow overriding the inferred parquet schema root [\#5814](https://github.com/apache/arrow-rs/pull/5814) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
- Revisit List Row Encoding \(\#5807\) [\#5811](https://github.com/apache/arrow-rs/pull/5811) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
- Update proc-macro2 requirement from =1.0.83 to =1.0.84 [\#5805](https://github.com/apache/arrow-rs/pull/5805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Fix typo continuation maker -\> marker [\#5802](https://github.com/apache/arrow-rs/pull/5802) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([djanderson](https://github.com/djanderson))
- fix: serialization of decimal [\#5801](https://github.com/apache/arrow-rs/pull/5801) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen))
- Allow constructing ByteViewArray from existing blocks [\#5796](https://github.com/apache/arrow-rs/pull/5796) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
- Push SortOptions into DynComparator Allowing Nested Comparisons \(\#5426\) [\#5792](https://github.com/apache/arrow-rs/pull/5792) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
- Fix incorrect URL to Parquet CPP types.h [\#5790](https://github.com/apache/arrow-rs/pull/5790) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya))
- Update proc-macro2 requirement from =1.0.82 to =1.0.83 [\#5789](https://github.com/apache/arrow-rs/pull/5789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Update prost-build requirement from =0.12.4 to =0.12.6 [\#5788](https://github.com/apache/arrow-rs/pull/5788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Refine parquet documentation on types and metadata [\#5786](https://github.com/apache/arrow-rs/pull/5786) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
- feat\(arrow-json\): encode `Binary` and `LargeBinary` types as hex when writing JSON [\#5785](https://github.com/apache/arrow-rs/pull/5785) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([hiltontj](https://github.com/hiltontj))
- fix broken link to ballista crate in README.md [\#5784](https://github.com/apache/arrow-rs/pull/5784) ([navicore](https://github.com/navicore))
- feat\(arrow-csv\): support encoding of binary in CSV writer [\#5782](https://github.com/apache/arrow-rs/pull/5782) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([hiltontj](https://github.com/hiltontj))
- Fix documentation for parquet `parse_metadata`, `decode_metadata` and `decode_footer` [\#5781](https://github.com/apache/arrow-rs/pull/5781) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
- Support casting a `FixedSizedList<T>[1]` to `T` [\#5779](https://github.com/apache/arrow-rs/pull/5779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sadboy](https://github.com/sadboy))
- \[parquet\] Set the default size of BitWriter in DeltaBitPackEncoder to 1MB [\#5776](https://github.com/apache/arrow-rs/pull/5776) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AdamGS](https://github.com/AdamGS))
- Remove harmful table lookup optimization for bitmap operations [\#5772](https://github.com/apache/arrow-rs/pull/5772) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HadrienG2](https://github.com/HadrienG2))
- Remove deprecated comparison kernels \(\#4733\) [\#5768](https://github.com/apache/arrow-rs/pull/5768) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
- Add environment variable definitions to run the nanoarrow integration tests [\#5764](https://github.com/apache/arrow-rs/pull/5764) ([paleolimbot](https://github.com/paleolimbot))
- Downgrade to Rust 1.77 in integration pipeline to fix CI \(\#5719\) [\#5761](https://github.com/apache/arrow-rs/pull/5761) ([tustvold](https://github.com/tustvold))
- Expose boolean builder contents [\#5760](https://github.com/apache/arrow-rs/pull/5760) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HadrienG2](https://github.com/HadrienG2))
- Allow specifying comment character for CSV reader [\#5759](https://github.com/apache/arrow-rs/pull/5759) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([bbannier](https://github.com/bbannier))
- Expose the null buffer of every builder that has one [\#5754](https://github.com/apache/arrow-rs/pull/5754) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HadrienG2](https://github.com/HadrienG2))
- feat: Make AsyncArrowWriter accepts AsyncFileWriter [\#5753](https://github.com/apache/arrow-rs/pull/5753) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Xuanwo](https://github.com/Xuanwo))
- Improve repository readme [\#5752](https://github.com/apache/arrow-rs/pull/5752) ([alamb](https://github.com/alamb))
- Document object store release cadence [\#5750](https://github.com/apache/arrow-rs/pull/5750) ([alamb](https://github.com/alamb))
- Compute data buffer length by using start and end values in offset buffer [\#5741](https://github.com/apache/arrow-rs/pull/5741) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
- fix: parse string of scientific notation to decimal when the scale is 0 [\#5740](https://github.com/apache/arrow-rs/pull/5740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen))
- Minor: avoid \(likely unreachable\) panic in FlightClient [\#5734](https://github.com/apache/arrow-rs/pull/5734) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb))
- Update proc-macro2 requirement from =1.0.81 to =1.0.82 [\#5732](https://github.com/apache/arrow-rs/pull/5732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Improve error message for timestamp queries outside supported range [\#5730](https://github.com/apache/arrow-rs/pull/5730) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Abdi-29](https://github.com/Abdi-29))
- Refactor to share code between do\_put and do\_exchange calls [\#5728](https://github.com/apache/arrow-rs/pull/5728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([opensourcegeek](https://github.com/opensourcegeek))
- Update brotli requirement from 5.0 to 6.0 [\#5726](https://github.com/apache/arrow-rs/pull/5726) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Fix `GenericListBuilder` test typo [\#5724](https://github.com/apache/arrow-rs/pull/5724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Kikkon](https://github.com/Kikkon))
- Deprecate NullBuilder capacity, as it behaves in a surprising way [\#5721](https://github.com/apache/arrow-rs/pull/5721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HadrienG2](https://github.com/HadrienG2))
- Fix nested nullability when randomly generating arrays [\#5713](https://github.com/apache/arrow-rs/pull/5713) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel))
- Fix up clippy for Rust 1.78 [\#5710](https://github.com/apache/arrow-rs/pull/5710) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
- Support casting `StringView`/`BinaryView` --\> `StringArray`/`BinaryArray`. [\#5704](https://github.com/apache/arrow-rs/pull/5704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([RinChanNOWWW](https://github.com/RinChanNOWWW))
- Fix documentation around handling of nulls in cmp kernels [\#5697](https://github.com/apache/arrow-rs/pull/5697) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey))
- Support casting `StringArray`/`BinaryArray` --\> `StringView` / `BinaryView` [\#5686](https://github.com/apache/arrow-rs/pull/5686) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([RinChanNOWWW](https://github.com/RinChanNOWWW))
- Add support for flexible column lengths [\#5679](https://github.com/apache/arrow-rs/pull/5679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Posnet](https://github.com/Posnet))
- Move ffi stream and utils from arrow to arrow-array [\#5670](https://github.com/apache/arrow-rs/pull/5670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc))
- Arrow Flight SQL example JDBC driver incompatibility [\#5666](https://github.com/apache/arrow-rs/pull/5666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([istvan-fodor](https://github.com/istvan-fodor))
- Add `ListView` & `LargeListView` basic construction and validation [\#5664](https://github.com/apache/arrow-rs/pull/5664) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Kikkon](https://github.com/Kikkon))
- Update proc-macro2 requirement from =1.0.80 to =1.0.81 [\#5659](https://github.com/apache/arrow-rs/pull/5659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Modify decimal regex to accept positive exponent specifier [\#5649](https://github.com/apache/arrow-rs/pull/5649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jdcasale](https://github.com/jdcasale))
- feat: JSON encoding of `FixedSizeList` [\#5646](https://github.com/apache/arrow-rs/pull/5646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([hiltontj](https://github.com/hiltontj))
- Update proc-macro2 requirement from =1.0.79 to =1.0.80 [\#5644](https://github.com/apache/arrow-rs/pull/5644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
- fix: panic when casting `ListArray` to `FixedSizeList` [\#5643](https://github.com/apache/arrow-rs/pull/5643) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonahgao](https://github.com/jonahgao))
- Add more invalid utf8 parquet reader tests [\#5639](https://github.com/apache/arrow-rs/pull/5639) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb))
- Update brotli requirement from 4.0 to 5.0 [\#5637](https://github.com/apache/arrow-rs/pull/5637) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Update flatbuffers requirement from 23.1.21 to 24.3.25 [\#5636](https://github.com/apache/arrow-rs/pull/5636) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Increase `BinaryViewArray` test coverage [\#5635](https://github.com/apache/arrow-rs/pull/5635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
- PrettyPrint support for `StringViewArray` and `BinaryViewArray` [\#5634](https://github.com/apache/arrow-rs/pull/5634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
- feat\(ffi\): add run end encoded arrays [\#5632](https://github.com/apache/arrow-rs/pull/5632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([notfilippo](https://github.com/notfilippo))
- Accept parquet schemas without explicitly required Map keys [\#5630](https://github.com/apache/arrow-rs/pull/5630) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jupiter](https://github.com/jupiter))
- Implement `filter` kernel for byte view arrays. [\#5624](https://github.com/apache/arrow-rs/pull/5624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([RinChanNOWWW](https://github.com/RinChanNOWWW))
- feat: encode FixedSizeBinary in JSON as hex string [\#5622](https://github.com/apache/arrow-rs/pull/5622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([hiltontj](https://github.com/hiltontj))
- Update Flight crate README version [\#5621](https://github.com/apache/arrow-rs/pull/5621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([phillipleblanc](https://github.com/phillipleblanc))
- feat: support reading and writing`StringView` and `BinaryView` in parquet \(part 1\) [\#5618](https://github.com/apache/arrow-rs/pull/5618) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
- Use FixedSizeListArray::new in FixedSizeListBuilder [\#5612](https://github.com/apache/arrow-rs/pull/5612) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
- String to decimal conversion written using E/scientific notation [\#5611](https://github.com/apache/arrow-rs/pull/5611) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Nekit2217](https://github.com/Nekit2217))
- Account for Timezone when Casting Timestamp to Date32 [\#5605](https://github.com/apache/arrow-rs/pull/5605) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Lordworms](https://github.com/Lordworms))
- Update prost-build requirement from =0.12.3 to =0.12.4 [\#5604](https://github.com/apache/arrow-rs/pull/5604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Fix panic when displaying dates on 32-bit platforms [\#5603](https://github.com/apache/arrow-rs/pull/5603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ivanceras](https://github.com/ivanceras))
- Implement `take` kernel for byte view array. [\#5602](https://github.com/apache/arrow-rs/pull/5602) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([RinChanNOWWW](https://github.com/RinChanNOWWW))
- Add tests for Arrow Flight support for `StringViewArray` and `BinaryViewArray` [\#5601](https://github.com/apache/arrow-rs/pull/5601) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([XiangpengHao](https://github.com/XiangpengHao))
- test: Add a test for RowFilter with nested type [\#5600](https://github.com/apache/arrow-rs/pull/5600) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya))
- Minor: Add docs for GenericBinaryBuilder, links to `GenericStringBuilder` [\#5597](https://github.com/apache/arrow-rs/pull/5597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
- Bump chrono-tz from 0.8 to 0.9 [\#5596](https://github.com/apache/arrow-rs/pull/5596) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey))
- Update brotli requirement from 3.3 to 4.0 [\#5586](https://github.com/apache/arrow-rs/pull/5586) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Add `UnionArray::into_parts` [\#5585](https://github.com/apache/arrow-rs/pull/5585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel))
- Expose ArrowReaderMetadata::try\_new [\#5583](https://github.com/apache/arrow-rs/pull/5583) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kylebarron](https://github.com/kylebarron))
- Add `try_filter_leaves` to propagate error from filter closure [\#5575](https://github.com/apache/arrow-rs/pull/5575) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
- filter for run end array [\#5573](https://github.com/apache/arrow-rs/pull/5573) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fabianmurariu](https://github.com/fabianmurariu))
- Pin zstd-sys to `v2.0.9` in parquet [\#5567](https://github.com/apache/arrow-rs/pull/5567) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey))
- Split arrow\_cast::cast::string into it's own submodule [\#5563](https://github.com/apache/arrow-rs/pull/5563) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([monkwire](https://github.com/monkwire))
- Correct example code for column \(\#5560\) [\#5561](https://github.com/apache/arrow-rs/pull/5561) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zgershkoff](https://github.com/zgershkoff))
- Split arrow\_cast::cast::dictionary into it's own submodule [\#5555](https://github.com/apache/arrow-rs/pull/5555) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([monkwire](https://github.com/monkwire))
- Split arrow\_cast::cast::decimal into it's own submodule [\#5552](https://github.com/apache/arrow-rs/pull/5552) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([monkwire](https://github.com/monkwire))
- Fix new clippy lints for Rust 1.77 [\#5544](https://github.com/apache/arrow-rs/pull/5544) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
- fix: correctly encode ticket [\#5543](https://github.com/apache/arrow-rs/pull/5543) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([freddieptf](https://github.com/freddieptf))
- feat: implemented with\_field\(\) for FixedSizeListBuilder [\#5541](https://github.com/apache/arrow-rs/pull/5541) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([istvan-fodor](https://github.com/istvan-fodor))
- Split arrow\_cast::cast::list into it's own submodule [\#5537](https://github.com/apache/arrow-rs/pull/5537) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([monkwire](https://github.com/monkwire))
- Bump black from 22.10.0 to 24.3.0 in /parquet/pytest [\#5535](https://github.com/apache/arrow-rs/pull/5535) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot))
- Add OffsetBufferBuilder [\#5532](https://github.com/apache/arrow-rs/pull/5532) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
- Add IPC StreamDecoder [\#5531](https://github.com/apache/arrow-rs/pull/5531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
- IPC format support for StringViewArray and BinaryViewArray [\#5525](https://github.com/apache/arrow-rs/pull/5525) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XiangpengHao](https://github.com/XiangpengHao))
- parquet: Use specific error variant when codec is disabled [\#5521](https://github.com/apache/arrow-rs/pull/5521) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([progval](https://github.com/progval))
- impl `From<ScalarBuffer<T>>` for `Vec<T>` [\#5518](https://github.com/apache/arrow-rs/pull/5518) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel))
\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)*

Просмотреть файл

@ -0,0 +1,24 @@
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Code of Conduct
- [Code of Conduct for The Apache Software Foundation][1]
[1]: https://www.apache.org/foundation/policies/conduct.html

208
arrow-rs/CONTRIBUTING.md Normal file
Просмотреть файл

@ -0,0 +1,208 @@
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
## Introduction
We welcome and encourage contributions of all kinds, such as:
1. Tickets with issue reports or feature requests
2. Documentation improvements
3. Code (PR or PR Review)
In addition to submitting new PRs, we have a healthy tradition of community members helping review each other's PRs. Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases.
## Developer's guide to Arrow Rust
### Setting Up Your Build Environment
Install the Rust tool chain:
https://www.rust-lang.org/tools/install
Also, make sure your Rust tool chain is up-to-date, because we always use the latest stable version of Rust to test this project.
```bash
rustup update stable
```
### How to compile
This is a standard cargo project with workspaces. To build it, you need to have `rust` and `cargo`:
```bash
cargo build
```
You can also use rust's official docker image:
```bash
docker run --rm -v $(pwd):/arrow-rs -it rust /bin/bash -c "cd /arrow-rs && rustup component add rustfmt && cargo build"
```
The command above assumes that you are in the root directory of the project, not in the same
directory as this README.md.
You can also compile specific workspaces:
```bash
cd arrow && cargo build
```
### Git Submodules
Before running tests and examples, it is necessary to set up the local development environment.
The tests rely on test data that is contained in git submodules.
To pull down this data run the following:
```bash
git submodule update --init
```
This populates data in two git submodules:
- `../parquet-testing/data` (sourced from https://github.com/apache/parquet-testing.git)
- `../testing` (sourced from https://github.com/apache/arrow-testing)
By default, `cargo test` will look for these directories at their
standard location. The following environment variables can be used to override the location:
```bash
# Optionally specify a different location for test data
export PARQUET_TEST_DATA=$(cd ../parquet-testing/data; pwd)
export ARROW_TEST_DATA=$(cd ../testing/data; pwd)
```
From here on, this is a pure Rust project and `cargo` can be used to run tests, benchmarks, docs and examples as usual.
## Running the tests
Run tests using the Rust standard `cargo test` command:
```bash
# run all unit and integration tests
cargo test
# run tests for the arrow crate
cargo test -p arrow
```
For some changes, you may want to run additional tests. You can find up-to-date information on the current CI tests in [.github/workflows](https://github.com/apache/arrow-rs/tree/master/.github/workflows). Here are some examples of additional tests you may want to run:
```bash
# run tests for the parquet crate
cargo test -p parquet
# run arrow tests with all features enabled
cargo test -p arrow --all-features
# run the doc tests
cargo test --doc
```
## Code Formatting
Our CI uses `rustfmt` to check code formatting. Before submitting a
PR be sure to run the following and check for lint issues:
```bash
cargo +stable fmt --all -- --check
```
## Clippy Lints
We recommend using `clippy` for checking lints during development. While we do not yet enforce `clippy` checks, we recommend not introducing new `clippy` errors or warnings.
Run the following to check for `clippy` lints:
```bash
# run clippy with default settings
cargo clippy
```
More comprehensive `clippy` checks can be run by adding flags:
```bash
# run clippy on the arrow crate with all features enabled, targeting all tests, examples, and benchmarks
cargo clippy -p arrow --all-features --all-targets
```
If you use Visual Studio Code with the `rust-analyzer` plugin, you can enable `clippy` to run each time you save a file. See https://users.rust-lang.org/t/how-to-use-clippy-in-vs-code-with-rust-analyzer/41881.
One of the concerns with `clippy` is that it often produces a lot of false positives, or that some recommendations may hurt readability. We do not have a policy of which lints are ignored, but if you disagree with a `clippy` lint, you may disable the lint and briefly justify it.
Search for `allow(clippy::` in the codebase to identify lints that are ignored/allowed. We currently prefer ignoring lints on the lowest unit possible.
- If you are introducing a line that returns a lint warning or error, you may disable the lint on that line.
- If you have several lints on a function or module, you may disable the lint on the function or module.
- If a lint is pervasive across multiple modules, you may disable it at the crate level.
## Running Benchmarks
Running benchmarks is a good way to test the performance of a change. As benchmarks usually take a long time to run, we recommend running targeted tests instead of the full suite.
```bash
# run all benchmarks
cargo bench
# run arrow benchmarks
cargo bench -p arrow
# run benchmark for the parse_time function within the arrow-cast crate
cargo bench -p arrow-cast --bench parse_time
```
To set the baseline for your benchmarks, use the --save-baseline flag:
```bash
git checkout master
cargo bench --bench parse_time -- --save-baseline master
git checkout feature
cargo bench --bench parse_time -- --baseline master
```
## Git Pre-Commit Hook
We can use [git pre-commit hook](https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks) to automate various kinds of git pre-commit checking/formatting.
Suppose you are in the root directory of the project.
First check if the file already exists:
```bash
ls -l .git/hooks/pre-commit
```
If the file already exists, check the link source or file content to avoid mistakenly **overriding** it.
If it does not exist, safely soft-link [pre-commit.sh](pre-commit.sh) as file `.git/hooks/pre-commit`:
```bash
ln -s ../../pre-commit.sh .git/hooks/pre-commit
```
If sometimes you want to commit without checking, just run `git commit` with `--no-verify`:
```bash
git commit --no-verify -m "... commit message ..."
```

96
arrow-rs/Cargo.toml Normal file
Просмотреть файл

@ -0,0 +1,96 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
[workspace]
members = [
"arrow",
"arrow-arith",
"arrow-array",
"arrow-avro",
"arrow-buffer",
"arrow-cast",
"arrow-csv",
"arrow-data",
"arrow-flight",
"arrow-flight/gen",
"arrow-integration-test",
"arrow-integration-testing",
"arrow-ipc",
"arrow-json",
"arrow-ord",
"arrow-row",
"arrow-schema",
"arrow-select",
"arrow-string",
"parquet",
"parquet_derive",
"parquet_derive_test",
]
# Enable the version 2 feature resolver, which avoids unifying features for targets that are not being built
#
# Critically this prevents dev-dependencies from enabling features even when not building a target that
# uses dev-dependencies, e.g. the library crate. This in turn ensures that we can catch invalid feature
# flag combinations that would otherwise only surface in dependent crates
#
# Reference - https://doc.rust-lang.org/nightly/cargo/reference/features.html#feature-resolver-version-2
#
resolver = "2"
exclude = [
    # arrow-pyarrow-integration-testing is excluded because it requires different compilation flags, thereby
    # significantly changing how it is compiled within the workspace, causing the whole workspace to be compiled from
    # scratch. Because of this, it is a stand-alone package that compiles independently of the others.
"arrow-pyarrow-integration-testing",
# object_store is excluded because it follows a separate release cycle from the other arrow crates
"object_store"
]
[workspace.package]
version = "52.0.0"
homepage = "https://github.com/apache/arrow-rs"
repository = "https://github.com/apache/arrow-rs"
authors = ["Apache Arrow <dev@arrow.apache.org>"]
license = "Apache-2.0"
keywords = ["arrow"]
include = [
"benches/*.rs",
"src/**/*.rs",
"Cargo.toml",
]
edition = "2021"
rust-version = "1.62"
[workspace.dependencies]
arrow = { version = "52.0.0", path = "./arrow", default-features = false }
arrow-arith = { version = "52.0.0", path = "./arrow-arith" }
arrow-array = { version = "52.0.0", path = "./arrow-array" }
arrow-buffer = { version = "52.0.0", path = "./arrow-buffer" }
arrow-cast = { version = "52.0.0", path = "./arrow-cast" }
arrow-csv = { version = "52.0.0", path = "./arrow-csv" }
arrow-data = { version = "52.0.0", path = "./arrow-data" }
arrow-ipc = { version = "52.0.0", path = "./arrow-ipc" }
arrow-json = { version = "52.0.0", path = "./arrow-json" }
arrow-ord = { version = "52.0.0", path = "./arrow-ord" }
arrow-row = { version = "52.0.0", path = "./arrow-row" }
arrow-schema = { version = "52.0.0", path = "./arrow-schema" }
arrow-select = { version = "52.0.0", path = "./arrow-select" }
arrow-string = { version = "52.0.0", path = "./arrow-string" }
parquet = { version = "52.0.0", path = "./parquet", default-features = false }
chrono = { version = "0.4.34", default-features = false, features = ["clock"] }

212
arrow-rs/LICENSE.txt Normal file
Просмотреть файл

@ -0,0 +1,212 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This project includes code from Apache Aurora.
* dev/release/{release,changelog,release-candidate} are based on the scripts from
Apache Aurora
Copyright: 2016 The Apache Software Foundation.
Home page: https://aurora.apache.org/
License: http://www.apache.org/licenses/LICENSE-2.0

84
arrow-rs/NOTICE.txt Normal file
Просмотреть файл

@ -0,0 +1,84 @@
Apache Arrow
Copyright 2016-2019 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
This product includes software from the SFrame project (BSD, 3-clause).
* Copyright (C) 2015 Dato, Inc.
* Copyright (c) 2009 Carnegie Mellon University.
This product includes software from the Feather project (Apache 2.0)
https://github.com/wesm/feather
This product includes software from the DyND project (BSD 2-clause)
https://github.com/libdynd
This product includes software from the LLVM project
* distributed under the University of Illinois Open Source
This product includes software from the google-lint project
* Copyright (c) 2009 Google Inc. All rights reserved.
This product includes software from the mman-win32 project
* Copyright https://code.google.com/p/mman-win32/
* Licensed under the MIT License;
This product includes software from the LevelDB project
* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* Moved from Kudu http://github.com/cloudera/kudu
This product includes software from the CMake project
* Copyright 2001-2009 Kitware, Inc.
* Copyright 2012-2014 Continuum Analytics, Inc.
* All rights reserved.
This product includes software from https://github.com/matthew-brett/multibuild (BSD 2-clause)
* Copyright (c) 2013-2016, Matt Terry and Matthew Brett; all rights reserved.
This product includes software from the Ibis project (Apache 2.0)
* Copyright (c) 2015 Cloudera, Inc.
* https://github.com/cloudera/ibis
This product includes software from Dremio (Apache 2.0)
* Copyright (C) 2017-2018 Dremio Corporation
* https://github.com/dremio/dremio-oss
This product includes software from Google Guava (Apache 2.0)
* Copyright (C) 2007 The Guava Authors
* https://github.com/google/guava
This product includes software from CMake (BSD 3-Clause)
* CMake - Cross Platform Makefile Generator
* Copyright 2000-2019 Kitware, Inc. and Contributors
The web site includes files generated by Jekyll.
--------------------------------------------------------------------------------
This product includes code from Apache Kudu, which includes the following in
its NOTICE file:
Apache Kudu
Copyright 2016 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
Portions of this software were developed at
Cloudera, Inc (http://www.cloudera.com/).
--------------------------------------------------------------------------------
This product includes code from Apache ORC, which includes the following in
its NOTICE file:
Apache ORC
Copyright 2013-2019 The Apache Software Foundation
This product includes software developed by The Apache Software
Foundation (http://www.apache.org/).
This product includes software developed by Hewlett-Packard:
(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P

137
arrow-rs/README.md Normal file
Просмотреть файл

@ -0,0 +1,137 @@
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Native Rust implementation of Apache Arrow and Apache Parquet
[![Coverage Status](https://codecov.io/gh/apache/arrow-rs/rust/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/arrow-rs?branch=master)
Welcome to the [Rust][rust] implementation of [Apache Arrow], the popular in-memory columnar format.
This repo contains the following main components:
| Crate | Description | Latest API Docs | README |
| ------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------ | --------------------------------- |
| [`arrow`] | Core functionality (memory layout, arrays, low level computations) | [docs.rs](https://docs.rs/arrow/latest) | [(README)][arrow-readme] |
| [`arrow-flight`] | Support for Arrow-Flight IPC protocol | [docs.rs](https://docs.rs/arrow-flight/latest) | [(README)][flight-readme] |
| [`object-store`] | Support for object store interactions (aws, azure, gcp, local, in-memory) | [docs.rs](https://docs.rs/object_store/latest) | [(README)][objectstore-readme] |
| [`parquet`] | Support for Parquet columnar file format | [docs.rs](https://docs.rs/parquet/latest) | [(README)][parquet-readme] |
| [`parquet_derive`] | A crate for deriving RecordWriter/RecordReader for arbitrary, simple structs | [docs.rs](https://docs.rs/parquet-derive/latest) | [(README)][parquet-derive-readme] |
The current development version of the API documentation in this repo can be found [here](https://arrow.apache.org/rust).
[apache arrow]: https://arrow.apache.org/
[`arrow`]: https://crates.io/crates/arrow
[`parquet`]: https://crates.io/crates/parquet
[`parquet_derive`]: https://crates.io/crates/parquet-derive
[`arrow-flight`]: https://crates.io/crates/arrow-flight
[`object-store`]: https://crates.io/crates/object-store
## Release Versioning and Schedule
### `arrow` and `parquet` crates
The Arrow Rust project releases approximately monthly and follows [Semantic
Versioning].
Due to available maintainer and testing bandwidth, [`arrow`] crates ([`arrow`],
[`arrow-flight`], etc.) are released on the same schedule with the same versions
as the [`parquet`] and [`parquet-derive`] crates.
Starting June 2024, we plan to release new major versions with potentially
breaking API changes at most once a quarter, and release incremental minor versions in
the intervening months. See [this ticket] for more details.
For example:
| Approximate Date | Version | Notes |
| ---------------- | -------- | --------------------------------------- |
| Jun 2024 | `52.0.0` | Major, potentially breaking API changes |
| Jul 2024 | `52.1.0` | Minor, NO breaking API changes |
| Aug 2024 | `52.2.0` | Minor, NO breaking API changes |
| Sep 2024 | `53.0.0` | Major, potentially breaking API changes |
[this ticket]: https://github.com/apache/arrow-rs/issues/5368
[semantic versioning]: https://semver.org/
### `object_store` crate
The [`object_store`] crate is released independently of the `arrow` and
`parquet` crates and follows [Semantic Versioning]. We aim to release new
versions approximately every 2 months.
[`object_store`]: https://crates.io/crates/object_store
## Related Projects
There are several related crates in different repositories
| Crate | Description | Documentation |
| ------------------------ | ------------------------------------------- | --------------------------------------- |
| [`datafusion`] | In-memory query engine with SQL support | [(README)][datafusion-readme] |
| [`ballista`] | Distributed query execution | [(README)][ballista-readme] |
| [`object_store_opendal`] | Use [`opendal`] as [`object_store`] backend | [(README)][object_store_opendal-readme] |
[`datafusion`]: https://crates.io/crates/datafusion
[`ballista`]: https://crates.io/crates/ballista
[`object_store_opendal`]: https://crates.io/crates/object_store_opendal
[`opendal`]: https://crates.io/crates/opendal
[object_store_opendal-readme]: https://github.com/apache/opendal/blob/main/integrations/object_store/README.md
Collectively, these crates support a wider array of functionality for analytic computations in Rust.
For example, you can write SQL queries or a `DataFrame` (using the
[`datafusion`] crate) to read a parquet file (using the [`parquet`] crate),
evaluate it in-memory using Arrow's columnar format (using the [`arrow`] crate),
and send to another process (using the [`arrow-flight`] crate).
Generally speaking, the [`arrow`] crate offers functionality for using Arrow
arrays, and [`datafusion`] offers most operations typically found in SQL,
including `join`s and window functions.
You can find more details about each crate in their respective READMEs.
## Arrow Rust Community
The `dev@arrow.apache.org` mailing list serves as the core communication channel for the Arrow community. Instructions for signing up and links to the archives can be found on the [Arrow Community](https://arrow.apache.org/community/) page. All major announcements and communications happen there.
The Rust Arrow community also uses the official [ASF Slack](https://s.apache.org/slack-invite) for informal discussions and coordination. This is
a great place to meet other contributors and get guidance on where to contribute. Join us in the `#arrow-rust` channel and feel free to ask for an invite via:
1. the `dev@arrow.apache.org` mailing list
2. the [GitHub Discussions][discussions]
3. the [Discord channel](https://discord.gg/YAb2TdazKQ)
The Rust implementation uses [GitHub issues][issues] as the system of record for new features and bug fixes and
this plays a critical role in the release process.
For design discussions we generally collaborate on Google documents and file a GitHub issue linking to the document.
There is more information in the [contributing] guide.
[rust]: https://www.rust-lang.org/
[arrow-readme]: arrow/README.md
[contributing]: CONTRIBUTING.md
[parquet-readme]: parquet/README.md
[flight-readme]: arrow-flight/README.md
[datafusion-readme]: https://github.com/apache/datafusion/blob/main/README.md
[ballista-readme]: https://github.com/apache/datafusion-ballista/blob/main/README.md
[objectstore-readme]: object_store/README.md
[parquet-derive-readme]: parquet_derive/README.md
[issues]: https://github.com/apache/arrow-rs/issues
[discussions]: https://github.com/apache/arrow-rs/discussions

Просмотреть файл

@ -0,0 +1,45 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
[package]
name = "arrow-arith"
version = { workspace = true }
description = "Arrow arithmetic kernels"
homepage = { workspace = true }
repository = { workspace = true }
authors = { workspace = true }
license = { workspace = true }
keywords = { workspace = true }
include = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
[lib]
name = "arrow_arith"
path = "src/lib.rs"
bench = false
[dependencies]
arrow-array = { workspace = true }
arrow-buffer = { workspace = true }
arrow-data = { workspace = true }
arrow-schema = { workspace = true }
chrono = { workspace = true }
half = { version = "2.1", default-features = false }
num = { version = "0.4", default-features = false, features = ["std"] }
[dev-dependencies]

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,341 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Defines basic arithmetic kernels for `PrimitiveArrays`.
//!
//! These kernels can leverage SIMD if available on your system. Currently no runtime
//! detection is provided, you should enable the specific SIMD intrinsics using
//! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation
//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information.
use crate::arity::*;
use arrow_array::types::*;
use arrow_array::*;
use arrow_buffer::i256;
use arrow_buffer::ArrowNativeType;
use arrow_schema::*;
use std::cmp::min;
use std::sync::Arc;
/// Computes the output precision and scale for a decimal multiplication,
/// together with the fixed-point divisor needed to reduce the full-scale
/// product down to `required_scale`.
///
/// `left` / `right` are `(precision, scale)` pairs of the input arrays.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] if `required_scale` exceeds the
/// natural product scale (`left.1 + right.1`), since that would require
/// inventing precision.
fn get_fixed_point_info(
    left: (u8, i8),
    right: (u8, i8),
    required_scale: i8,
) -> Result<(u8, i8, i256), ArrowError> {
    let (left_precision, left_scale) = left;
    let (right_precision, right_scale) = right;

    // Multiplying two decimals adds their scales.
    let product_scale = left_scale + right_scale;
    if required_scale > product_scale {
        return Err(ArrowError::ComputeError(format!(
            "Required scale {} is greater than product scale {}",
            required_scale, product_scale
        )));
    }

    // Result precision is capped at what Decimal128 can represent.
    let precision = min(left_precision + right_precision + 1, DECIMAL128_MAX_PRECISION);

    // Dividing by 10^(product_scale - required_scale) drops the excess scale.
    let divisor = i256::from_i128(10).pow_wrapping((product_scale - required_scale) as u32);

    Ok((precision, product_scale, divisor))
}
/// Perform `left * right` on two dynamically-typed decimal arrays. A null in
/// either input produces a null in the output.
///
/// This is the `dyn Array` entry point for [`multiply_fixed_point`]: it allows
/// precision loss by rounding the product to `required_scale`. If
/// `required_scale` is greater than the product scale, an error is returned.
///
/// Overflow is NOT detected — overflowing results wrap around.
///
/// Provided for compatibility with the precision-loss `multiply` of other data
/// processing engines. For multiplication with precision loss detection, use
/// `multiply_dyn` or `multiply_dyn_checked` instead.
///
/// # Errors
///
/// Returns [`ArrowError::CastError`] unless both inputs are `Decimal128`.
pub fn multiply_fixed_point_dyn(
    left: &dyn Array,
    right: &dyn Array,
    required_scale: i8,
) -> Result<ArrayRef, ArrowError> {
    if let (DataType::Decimal128(_, _), DataType::Decimal128(_, _)) =
        (left.data_type(), right.data_type())
    {
        let left_decimal = left.as_any().downcast_ref::<Decimal128Array>().unwrap();
        let right_decimal = right.as_any().downcast_ref::<Decimal128Array>().unwrap();
        let product = multiply_fixed_point(left_decimal, right_decimal, required_scale)?;
        Ok(Arc::new(product) as ArrayRef)
    } else {
        Err(ArrowError::CastError(format!(
            "Unsupported data type {}, {}",
            left.data_type(),
            right.data_type()
        )))
    }
}
/// Perform `left * right` on two decimal arrays with overflow detection. A
/// null in either input produces a null in the output.
///
/// Precision loss is allowed: when an exact representation at
/// `required_scale` is not possible, the result is rounded to that scale.
///
/// # Errors
///
/// * [`ArrowError::ComputeError`] if `required_scale` is greater than the
///   product scale, or if the rounded product overflows `i128`.
///
/// Provided for compatibility with the precision-loss `multiply` of other
/// data processing engines. For multiplication with precision loss detection,
/// use `multiply` or `multiply_checked` instead.
pub fn multiply_fixed_point_checked(
    left: &PrimitiveArray<Decimal128Type>,
    right: &PrimitiveArray<Decimal128Type>,
    required_scale: i8,
) -> Result<PrimitiveArray<Decimal128Type>, ArrowError> {
    let (precision, product_scale, divisor) = get_fixed_point_info(
        (left.precision(), left.scale()),
        (right.precision(), right.scale()),
        required_scale,
    )?;

    // Fast path: no rescaling needed, checked multiplication suffices.
    if product_scale == required_scale {
        return try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| a.mul_checked(b))?
            .with_precision_and_scale(precision, required_scale);
    }

    // Widen to i256 so the full-scale product cannot overflow, then round the
    // product back down to the required scale and narrow to i128.
    let rescaled = try_binary::<_, _, _, Decimal128Type>(left, right, |lhs, rhs| {
        let a = i256::from_i128(lhs);
        let b = i256::from_i128(rhs);
        let reduced = divide_and_round::<Decimal256Type>(a.wrapping_mul(b), divisor);
        reduced.to_i128().ok_or_else(|| {
            ArrowError::ComputeError(format!("Overflow happened on: {:?} * {:?}", a, b))
        })
    })?;
    rescaled.with_precision_and_scale(precision, required_scale)
}
/// Perform `left * right` on two decimal arrays without overflow detection.
/// A null in either input produces a null in the output.
///
/// Precision loss is allowed: when an exact representation at
/// `required_scale` is not possible, the result is rounded to that scale.
///
/// If the rounded product does not fit in `i128` the result wraps around; for
/// an overflow-checking variant use [`multiply_fixed_point_checked`] instead.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] if `required_scale` is greater than
/// the product scale.
///
/// Provided for compatibility with the precision-loss `multiply` of other
/// data processing engines. For multiplication with precision loss detection,
/// use `multiply` or `multiply_checked` instead.
pub fn multiply_fixed_point(
    left: &PrimitiveArray<Decimal128Type>,
    right: &PrimitiveArray<Decimal128Type>,
    required_scale: i8,
) -> Result<PrimitiveArray<Decimal128Type>, ArrowError> {
    let (precision, product_scale, divisor) = get_fixed_point_info(
        (left.precision(), left.scale()),
        (right.precision(), right.scale()),
        required_scale,
    )?;

    // Fast path: no rescaling needed, multiply (wrapping) directly.
    if product_scale == required_scale {
        return binary(left, right, |a, b| a.mul_wrapping(b))?
            .with_precision_and_scale(precision, required_scale);
    }

    // Widen to i256, multiply, round to the required scale, then truncate
    // back to i128 (wrapping on overflow by design).
    let reduced = binary::<_, _, _, Decimal128Type>(left, right, |lhs, rhs| {
        let product = i256::from_i128(lhs).wrapping_mul(i256::from_i128(rhs));
        divide_and_round::<Decimal256Type>(product, divisor).as_i128()
    })?;
    reduced.with_precision_and_scale(precision, required_scale)
}
/// Divide a decimal native value by `div`, rounding the truncated quotient
/// half-away-from-zero: the quotient is adjusted by one when the remainder's
/// magnitude reaches half of the divisor.
fn divide_and_round<I>(input: I::Native, div: I::Native) -> I::Native
where
    I: DecimalType,
    I::Native: ArrowNativeTypeOp,
{
    let quotient = input.div_wrapping(div);
    let remainder = input.mod_wrapping(div);
    let half = div.div_wrapping(I::Native::from_usize(2).unwrap());

    if input >= I::Native::ZERO && remainder >= half {
        // Positive input with remainder >= div/2: round up.
        quotient.add_wrapping(I::Native::ONE)
    } else if input < I::Native::ZERO && remainder <= half.neg_wrapping() {
        // Negative input with remainder <= -div/2: round down (away from zero).
        quotient.sub_wrapping(I::Native::ONE)
    } else {
        quotient
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::numeric::mul;

    // Exercises the precision-loss path of `multiply_fixed_point_checked`:
    // overflow avoidance via rescaling, half-away-from-zero rounding, the
    // same-scale fast path, and the required-scale validation error.
    #[test]
    fn test_decimal_multiply_allow_precision_loss() {
        // Overflow happening as i128 cannot hold multiplying result.
        // [123456789]
        let a = Decimal128Array::from(vec![123456789000000000000000000])
            .with_precision_and_scale(38, 18)
            .unwrap();
        // [10]
        let b = Decimal128Array::from(vec![10000000000000000000])
            .with_precision_and_scale(38, 18)
            .unwrap();
        let err = mul(&a, &b).unwrap_err();
        assert!(err
            .to_string()
            .contains("Overflow happened on: 123456789000000000000000000 * 10000000000000000000"));
        // Allow precision loss.
        let result = multiply_fixed_point_checked(&a, &b, 28).unwrap();
        // [1234567890]
        let expected = Decimal128Array::from(vec![12345678900000000000000000000000000000])
            .with_precision_and_scale(38, 28)
            .unwrap();
        assert_eq!(&expected, &result);
        assert_eq!(
            result.value_as_string(0),
            "1234567890.0000000000000000000000000000"
        );
        // Rounding case
        // [0.000000000000000001, 123456789.555555555555555555, 1.555555555555555555]
        let a = Decimal128Array::from(vec![1, 123456789555555555555555555, 1555555555555555555])
            .with_precision_and_scale(38, 18)
            .unwrap();
        // [1.555555555555555555, 11.222222222222222222, 0.000000000000000001]
        let b = Decimal128Array::from(vec![1555555555555555555, 11222222222222222222, 1])
            .with_precision_and_scale(38, 18)
            .unwrap();
        let result = multiply_fixed_point_checked(&a, &b, 28).unwrap();
        // [
        //    0.0000000000000000015555555556,
        //    1385459527.2345679012071330528765432099,
        //    0.0000000000000000015555555556
        // ]
        let expected = Decimal128Array::from(vec![
            15555555556,
            13854595272345679012071330528765432099,
            15555555556,
        ])
        .with_precision_and_scale(38, 28)
        .unwrap();
        assert_eq!(&expected, &result);
        // Rounded the value "1385459527.234567901207133052876543209876543210".
        assert_eq!(
            result.value_as_string(1),
            "1385459527.2345679012071330528765432099"
        );
        assert_eq!(result.value_as_string(0), "0.0000000000000000015555555556");
        assert_eq!(result.value_as_string(2), "0.0000000000000000015555555556");
        let a = Decimal128Array::from(vec![1230])
            .with_precision_and_scale(4, 2)
            .unwrap();
        let b = Decimal128Array::from(vec![1000])
            .with_precision_and_scale(4, 2)
            .unwrap();
        // Required scale is same as the product of the input scales. Behavior is same as multiply.
        let result = multiply_fixed_point_checked(&a, &b, 4).unwrap();
        assert_eq!(result.precision(), 9);
        assert_eq!(result.scale(), 4);
        let expected = mul(&a, &b).unwrap();
        assert_eq!(expected.as_ref(), &result);
        // Required scale cannot be larger than the product of the input scales.
        let result = multiply_fixed_point_checked(&a, &b, 5).unwrap_err();
        assert!(result
            .to_string()
            .contains("Required scale 5 is greater than product scale 4"));
    }

    // Checked vs wrapping variants: the checked form reports overflow of the
    // rescaled product, while the unchecked form wraps silently.
    #[test]
    fn test_decimal_multiply_allow_precision_loss_overflow() {
        // [99999999999123456789]
        let a = Decimal128Array::from(vec![99999999999123456789000000000000000000])
            .with_precision_and_scale(38, 18)
            .unwrap();
        // [9999999999910]
        let b = Decimal128Array::from(vec![9999999999910000000000000000000])
            .with_precision_and_scale(38, 18)
            .unwrap();
        let err = multiply_fixed_point_checked(&a, &b, 28).unwrap_err();
        assert!(err.to_string().contains(
            "Overflow happened on: 99999999999123456789000000000000000000 * 9999999999910000000000000000000"
        ));
        let result = multiply_fixed_point(&a, &b, 28).unwrap();
        let expected = Decimal128Array::from(vec![62946009661555981610246871926660136960])
            .with_precision_and_scale(38, 28)
            .unwrap();
        assert_eq!(&expected, &result);
    }

    // Reducing the scale lets `multiply_fixed_point` succeed where the plain
    // `mul` kernel overflows.
    #[test]
    fn test_decimal_multiply_fixed_point() {
        // [123456789]
        let a = Decimal128Array::from(vec![123456789000000000000000000])
            .with_precision_and_scale(38, 18)
            .unwrap();
        // [10]
        let b = Decimal128Array::from(vec![10000000000000000000])
            .with_precision_and_scale(38, 18)
            .unwrap();
        // `multiply` overflows on this case.
        let err = mul(&a, &b).unwrap_err();
        assert_eq!(err.to_string(), "Compute error: Overflow happened on: 123456789000000000000000000 * 10000000000000000000");
        // Avoid overflow by reducing the scale.
        let result = multiply_fixed_point(&a, &b, 28).unwrap();
        // [1234567890]
        let expected = Decimal128Array::from(vec![12345678900000000000000000000000000000])
            .with_precision_and_scale(38, 28)
            .unwrap();
        assert_eq!(&expected, &result);
        assert_eq!(
            result.value_as_string(0),
            "1234567890.0000000000000000000000000000"
        );
    }
}

Просмотреть файл

@ -0,0 +1,605 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Kernels for operating on [`PrimitiveArray`]s
use arrow_array::builder::BufferBuilder;
use arrow_array::types::ArrowDictionaryKeyType;
use arrow_array::*;
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::ArrowNativeType;
use arrow_buffer::{Buffer, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::ArrowError;
use std::sync::Arc;
/// See [`PrimitiveArray::unary`]
///
/// Free-function wrapper that applies the infallible `op` to every value of
/// `array`, producing a new [`PrimitiveArray`] of type `O`.
pub fn unary<I, F, O>(array: &PrimitiveArray<I>, op: F) -> PrimitiveArray<O>
where
    I: ArrowPrimitiveType,
    O: ArrowPrimitiveType,
    F: Fn(I::Native) -> O::Native,
{
    array.unary(op)
}
/// See [`PrimitiveArray::unary_mut`]
///
/// In-place variant of [`unary`]: consumes `array` and applies `op` to its
/// buffer. The `Err` variant carries the original array back to the caller
/// (e.g. when the buffer could not be mutated in place).
pub fn unary_mut<I, F>(
    array: PrimitiveArray<I>,
    op: F,
) -> Result<PrimitiveArray<I>, PrimitiveArray<I>>
where
    I: ArrowPrimitiveType,
    F: Fn(I::Native) -> I::Native,
{
    array.unary_mut(op)
}
/// See [`PrimitiveArray::try_unary`]
///
/// Fallible variant of [`unary`]: errors produced by `op` are propagated as
/// the `Err` of the returned `Result`.
pub fn try_unary<I, F, O>(array: &PrimitiveArray<I>, op: F) -> Result<PrimitiveArray<O>, ArrowError>
where
    I: ArrowPrimitiveType,
    O: ArrowPrimitiveType,
    F: Fn(I::Native) -> Result<O::Native, ArrowError>,
{
    array.try_unary(op)
}
/// See [`PrimitiveArray::try_unary_mut`]
///
/// Fallible, in-place variant of [`unary`]. The nested result separates the
/// two failure modes: the outer `Err` carries back the array itself, while
/// the inner `Err` carries an [`ArrowError`] produced by `op`.
pub fn try_unary_mut<I, F>(
    array: PrimitiveArray<I>,
    op: F,
) -> Result<Result<PrimitiveArray<I>, ArrowError>, PrimitiveArray<I>>
where
    I: ArrowPrimitiveType,
    F: Fn(I::Native) -> Result<I::Native, ArrowError>,
{
    array.try_unary_mut(op)
}
/// A helper function that applies an infallible unary function to a dictionary array with primitive value type.
fn unary_dict<K, F, T>(array: &DictionaryArray<K>, op: F) -> Result<ArrayRef, ArrowError>
where
K: ArrowDictionaryKeyType + ArrowNumericType,
T: ArrowPrimitiveType,
F: Fn(T::Native) -> T::Native,
{
let dict_values = array.values().as_any().downcast_ref().unwrap();
let values = unary::<T, F, T>(dict_values, op);
Ok(Arc::new(array.with_values(Arc::new(values))))
}
/// A helper function that applies a fallible unary function to a dictionary
/// array with primitive value type.
///
/// The dictionary's values are mapped through `op`; the keys are reused
/// unchanged. Returns [`ArrowError::CastError`] when the dictionary's value
/// type is incompatible with `T`, and propagates any error raised by `op`.
fn try_unary_dict<K, F, T>(array: &DictionaryArray<K>, op: F) -> Result<ArrayRef, ArrowError>
where
    K: ArrowDictionaryKeyType + ArrowNumericType,
    T: ArrowPrimitiveType,
    F: Fn(T::Native) -> Result<T::Native, ArrowError>,
{
    match PrimitiveArray::<T>::is_compatible(&array.value_type()) {
        false => Err(ArrowError::CastError(format!(
            "Cannot perform the unary operation of type {} on dictionary array of value type {}",
            T::DATA_TYPE,
            array.value_type()
        ))),
        true => {
            let typed_values = array.values().as_any().downcast_ref().unwrap();
            let mapped = try_unary::<T, F, T>(typed_values, op)?;
            Ok(Arc::new(array.with_values(Arc::new(mapped))))
        }
    }
}
/// Applies an infallible unary function to an array with primitive values.
///
/// Dictionary arrays are handled by mapping only their values (see
/// [`unary_dict`]); any other compatible primitive array is mapped directly.
#[deprecated(note = "Use arrow_array::AnyDictionaryArray")]
pub fn unary_dyn<F, T>(array: &dyn Array, op: F) -> Result<ArrayRef, ArrowError>
where
    T: ArrowPrimitiveType,
    F: Fn(T::Native) -> T::Native,
{
    downcast_dictionary_array! {
        array => unary_dict::<_, F, T>(array, op),
        t => {
            if PrimitiveArray::<T>::is_compatible(t) {
                Ok(Arc::new(unary::<T, F, T>(
                    array.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap(),
                    op,
                )))
            } else {
                Err(ArrowError::NotYetImplemented(format!(
                    "Cannot perform unary operation of type {} on array of type {}",
                    T::DATA_TYPE,
                    t
                )))
            }
        }
    }
}
/// Applies a fallible unary function to an array with primitive values.
///
/// Dictionary arrays are handled by mapping only their values (see
/// [`try_unary_dict`]); any other compatible primitive array is mapped
/// directly. Errors raised by `op` are propagated.
#[deprecated(note = "Use arrow_array::AnyDictionaryArray")]
pub fn try_unary_dyn<F, T>(array: &dyn Array, op: F) -> Result<ArrayRef, ArrowError>
where
    T: ArrowPrimitiveType,
    F: Fn(T::Native) -> Result<T::Native, ArrowError>,
{
    downcast_dictionary_array! {
        array => if array.values().data_type() == &T::DATA_TYPE {
            try_unary_dict::<_, F, T>(array, op)
        } else {
            Err(ArrowError::NotYetImplemented(format!(
                "Cannot perform unary operation on dictionary array of type {}",
                array.data_type()
            )))
        },
        t => {
            if PrimitiveArray::<T>::is_compatible(t) {
                Ok(Arc::new(try_unary::<T, F, T>(
                    array.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap(),
                    op,
                )?))
            } else {
                Err(ArrowError::NotYetImplemented(format!(
                    "Cannot perform unary operation of type {} on array of type {}",
                    T::DATA_TYPE,
                    t
                )))
            }
        }
    }
}
/// Applies a binary infallible function to two [`PrimitiveArray`]s,
/// producing a new [`PrimitiveArray`]
///
/// # Details
///
/// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in `0..len`, collecting
/// the results in a [`PrimitiveArray`].
///
/// If any index is null in either `a` or `b`, the
/// corresponding index in the result will also be null
///
/// Like [`unary`], the `op` is evaluated for every element in the two arrays,
/// including those elements which are NULL. This is beneficial as the cost of
/// the operation is low compared to the cost of branching, and especially when
/// the operation can be vectorised, however, requires `op` to be infallible for
/// all possible values of its inputs
///
/// # Errors
///
/// * if the arrays have different lengths.
///
/// # Example
/// ```
/// # use arrow_arith::arity::binary;
/// # use arrow_array::{Float32Array, Int32Array};
/// # use arrow_array::types::Int32Type;
/// let a = Float32Array::from(vec![Some(5.1f32), None, Some(6.8), Some(7.2)]);
/// let b = Int32Array::from(vec![1, 2, 4, 9]);
/// // compute int(a) + b for each element
/// let c = binary(&a, &b, |a, b| a as i32 + b).unwrap();
/// assert_eq!(c, Int32Array::from(vec![Some(6), None, Some(10), Some(16)]));
/// ```
pub fn binary<A, B, F, O>(
    a: &PrimitiveArray<A>,
    b: &PrimitiveArray<B>,
    op: F,
) -> Result<PrimitiveArray<O>, ArrowError>
where
    A: ArrowPrimitiveType,
    B: ArrowPrimitiveType,
    O: ArrowPrimitiveType,
    F: Fn(A::Native, B::Native) -> O::Native,
{
    if a.len() != b.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        ));
    }
    if a.is_empty() {
        return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)));
    }
    // The output is null wherever either input is null.
    let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());
    let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r));
    // JUSTIFICATION
    //  Benefit
    //      ~60% speedup
    //  Soundness
    //      `values` is an iterator with a known size from a PrimitiveArray
    let buffer = unsafe { Buffer::from_trusted_len_iter(values) };
    Ok(PrimitiveArray::new(buffer.into(), nulls))
}
/// Applies a binary and infallible function to values in two arrays, replacing
/// the values in the first array in place.
///
/// # Details
///
/// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in
/// `0..len`, modifying the [`PrimitiveArray`] `a` in place, if possible.
///
/// If any index is null in either `a` or `b`, the corresponding index in the
/// result will also be null.
///
/// # Buffer Reuse
///
/// If the underlying buffers in `a` are not shared with other arrays, mutates
/// the underlying buffer in place, without allocating.
///
/// If the underlying buffers in `a` are shared, returns Err(self)
///
/// Like [`unary`] the provided function is evaluated for every index, ignoring validity. This
/// is beneficial when the cost of the operation is low compared to the cost of branching, and
/// especially when the operation can be vectorised, however, requires `op` to be infallible
/// for all possible values of its inputs
///
/// # Errors
///
/// * If the arrays have different lengths
/// * If the array is not mutable (see "Buffer Reuse")
///
/// # See Also
///
/// * Documentation on [`PrimitiveArray::unary_mut`] for operating on [`ArrayRef`].
///
/// # Example
/// ```
/// # use arrow_arith::arity::binary_mut;
/// # use arrow_array::{Float32Array, Int32Array};
/// # use arrow_array::types::Int32Type;
/// // compute a + b for each element
/// let a = Float32Array::from(vec![Some(5.1f32), None, Some(6.8)]);
/// let b = Int32Array::from(vec![Some(1), None, Some(2)]);
/// // compute a + b, updating the value in a in place if possible
/// let a = binary_mut(a, &b, |a, b| a + b as f32).unwrap().unwrap();
/// // a is updated in place
/// assert_eq!(a, Float32Array::from(vec![Some(6.1), None, Some(8.8)]));
/// ```
///
/// # Example with shared buffers
/// ```
/// # use arrow_arith::arity::binary_mut;
/// # use arrow_array::Float32Array;
/// # use arrow_array::types::Int32Type;
/// let a = Float32Array::from(vec![Some(5.1f32), None, Some(6.8)]);
/// let b = Float32Array::from(vec![Some(1.0f32), None, Some(2.0)]);
/// // a_clone shares the buffer with a
/// let a_cloned = a.clone();
/// // try to update a in place, but it is shared. Returns Err(a)
/// let a = binary_mut(a, &b, |a, b| a + b).unwrap_err();
/// assert_eq!(a_cloned, a);
/// // drop shared reference
/// drop(a_cloned);
/// // now a is not shared, so we can update it in place
/// let a = binary_mut(a, &b, |a, b| a + b).unwrap().unwrap();
/// assert_eq!(a, Float32Array::from(vec![Some(6.1), None, Some(8.8)]));
/// ```
pub fn binary_mut<T, U, F>(
    a: PrimitiveArray<T>,
    b: &PrimitiveArray<U>,
    op: F,
) -> Result<Result<PrimitiveArray<T>, ArrowError>, PrimitiveArray<T>>
where
    T: ArrowPrimitiveType,
    U: ArrowPrimitiveType,
    F: Fn(T::Native, U::Native) -> T::Native,
{
    if a.len() != b.len() {
        return Ok(Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        )));
    }
    if a.is_empty() {
        return Ok(Ok(PrimitiveArray::from(ArrayData::new_empty(
            &T::DATA_TYPE,
        ))));
    }
    // Compute the combined null mask before `a` is consumed by `into_builder`.
    let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());
    // `into_builder` returns Err(a) when the buffer is shared; `?` forwards
    // that as the outer Err of the nested Result.
    let mut builder = a.into_builder()?;
    builder
        .values_slice_mut()
        .iter_mut()
        .zip(b.values())
        .for_each(|(l, r)| *l = op(*l, *r));
    let array_builder = builder.finish().into_data().into_builder().nulls(nulls);
    // NOTE(review): only the null buffer of otherwise-valid ArrayData is
    // replaced here, with a mask derived from the equal-length inputs.
    let array_data = unsafe { array_builder.build_unchecked() };
    Ok(Ok(PrimitiveArray::<T>::from(array_data)))
}
/// Applies the provided fallible binary operation across `a` and `b`, returning any error,
/// and collecting the results into a [`PrimitiveArray`]. If any index is null in either `a`
/// or `b`, the corresponding index in the result will also be null
///
/// Like [`try_unary`] the function is only evaluated for non-null indices
///
/// # Error
///
/// Return an error if the arrays have different lengths or
/// the operation errors
pub fn try_binary<A: ArrayAccessor, B: ArrayAccessor, F, O>(
    a: A,
    b: B,
    op: F,
) -> Result<PrimitiveArray<O>, ArrowError>
where
    O: ArrowPrimitiveType,
    F: Fn(A::Item, B::Item) -> Result<O::Native, ArrowError>,
{
    if a.len() != b.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform a binary operation on arrays of different length".to_string(),
        ));
    }
    if a.is_empty() {
        return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)));
    }
    let len = a.len();
    if a.null_count() == 0 && b.null_count() == 0 {
        // Fast path: no nulls anywhere, evaluate `op` for every index.
        try_binary_no_nulls(len, a, b, op)
    } else {
        // Combined mask is Some because at least one side has nulls.
        let nulls =
            NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap();
        let mut buffer = BufferBuilder::<O::Native>::new(len);
        buffer.append_n_zeroed(len);
        let slice = buffer.as_slice_mut();
        // Only evaluate `op` at valid indices; null slots keep their zeroed value.
        nulls.try_for_each_valid_idx(|idx| {
            unsafe {
                *slice.get_unchecked_mut(idx) = op(a.value_unchecked(idx), b.value_unchecked(idx))?
            };
            Ok::<_, ArrowError>(())
        })?;
        let values = buffer.finish().into();
        Ok(PrimitiveArray::new(values, Some(nulls)))
    }
}
/// Applies the provided fallible binary operation across `a` and `b` by mutating the mutable
/// [`PrimitiveArray`] `a` with the results, returning any error. If any index is null in
/// either `a` or `b`, the corresponding index in the result will also be null
///
/// Like [`try_unary`] the function is only evaluated for non-null indices
///
/// See [`binary_mut`] for errors and buffer reuse information
pub fn try_binary_mut<T, F>(
    a: PrimitiveArray<T>,
    b: &PrimitiveArray<T>,
    op: F,
) -> Result<Result<PrimitiveArray<T>, ArrowError>, PrimitiveArray<T>>
where
    T: ArrowPrimitiveType,
    F: Fn(T::Native, T::Native) -> Result<T::Native, ArrowError>,
{
    if a.len() != b.len() {
        return Ok(Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        )));
    }
    let len = a.len();
    if a.is_empty() {
        return Ok(Ok(PrimitiveArray::from(ArrayData::new_empty(
            &T::DATA_TYPE,
        ))));
    }
    if a.null_count() == 0 && b.null_count() == 0 {
        // Fast path: no nulls anywhere, evaluate `op` for every index.
        try_binary_no_nulls_mut(len, a, b, op)
    } else {
        // Compute the combined mask before `a` is consumed by `into_builder`.
        let nulls =
            NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap();
        let mut builder = a.into_builder()?;
        let slice = builder.values_slice_mut();
        // Only evaluate `op` at valid indices; invalid slots keep their old value.
        let r = nulls.try_for_each_valid_idx(|idx| {
            unsafe {
                *slice.get_unchecked_mut(idx) =
                    op(*slice.get_unchecked(idx), b.value_unchecked(idx))?
            };
            Ok::<_, ArrowError>(())
        });
        // An error from `op` belongs to the inner Result, not the outer one.
        if let Err(err) = r {
            return Ok(Err(err));
        }
        let array_builder = builder.finish().into_data().into_builder();
        let array_data = unsafe { array_builder.nulls(Some(nulls)).build_unchecked() };
        Ok(Ok(PrimitiveArray::<T>::from(array_data)))
    }
}
/// Null-free fast path for [`try_binary`]: evaluates `op` for every index of
/// the equal-length inputs and collects the results, short-circuiting on the
/// first error.
///
/// This intentional inline(never) attribute helps LLVM optimize the loop.
#[inline(never)]
fn try_binary_no_nulls<A: ArrayAccessor, B: ArrayAccessor, F, O>(
    len: usize,
    a: A,
    b: B,
    op: F,
) -> Result<PrimitiveArray<O>, ArrowError>
where
    O: ArrowPrimitiveType,
    F: Fn(A::Item, B::Item) -> Result<O::Native, ArrowError>,
{
    let mut buffer = MutableBuffer::new(len * O::Native::get_byte_width());
    for idx in 0..len {
        // Caller guarantees idx < len for both inputs.
        unsafe {
            buffer.push_unchecked(op(a.value_unchecked(idx), b.value_unchecked(idx))?);
        };
    }
    Ok(PrimitiveArray::new(buffer.into(), None))
}
/// Null-free fast path for [`try_binary_mut`]: mutates `a`'s buffer in place,
/// returning the first error from `op` via the inner Result (the outer Err is
/// reserved for a shared, non-mutable buffer).
///
/// This intentional inline(never) attribute helps LLVM optimize the loop.
#[inline(never)]
fn try_binary_no_nulls_mut<T, F>(
    len: usize,
    a: PrimitiveArray<T>,
    b: &PrimitiveArray<T>,
    op: F,
) -> Result<Result<PrimitiveArray<T>, ArrowError>, PrimitiveArray<T>>
where
    T: ArrowPrimitiveType,
    F: Fn(T::Native, T::Native) -> Result<T::Native, ArrowError>,
{
    let mut builder = a.into_builder()?;
    let slice = builder.values_slice_mut();
    for idx in 0..len {
        // Caller guarantees idx < len for both inputs.
        unsafe {
            match op(*slice.get_unchecked(idx), b.value_unchecked(idx)) {
                Ok(value) => *slice.get_unchecked_mut(idx) = value,
                Err(err) => return Ok(Err(err)),
            };
        };
    }
    Ok(Ok(builder.finish()))
}
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_array::builder::*;
    use arrow_array::types::*;

    // `unary`/`unary_dyn` honor the offset of a sliced input array.
    #[test]
    #[allow(deprecated)]
    fn test_unary_f64_slice() {
        let input = Float64Array::from(vec![Some(5.1f64), None, Some(6.8), None, Some(7.2)]);
        let input_slice = input.slice(1, 4);
        let result = unary(&input_slice, |n| n.round());
        assert_eq!(
            result,
            Float64Array::from(vec![None, Some(7.0), None, Some(7.0)])
        );
        let result = unary_dyn::<_, Float64Type>(&input_slice, |n| n + 1.0).unwrap();
        assert_eq!(
            result.as_any().downcast_ref::<Float64Array>().unwrap(),
            &Float64Array::from(vec![None, Some(7.8), None, Some(8.2)])
        );
    }

    // Dictionary inputs: only the values are mapped, keys/nulls are preserved.
    #[test]
    #[allow(deprecated)]
    fn test_unary_dict_and_unary_dyn() {
        let mut builder = PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::new();
        builder.append(5).unwrap();
        builder.append(6).unwrap();
        builder.append(7).unwrap();
        builder.append(8).unwrap();
        builder.append_null();
        builder.append(9).unwrap();
        let dictionary_array = builder.finish();
        let mut builder = PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::new();
        builder.append(6).unwrap();
        builder.append(7).unwrap();
        builder.append(8).unwrap();
        builder.append(9).unwrap();
        builder.append_null();
        builder.append(10).unwrap();
        let expected = builder.finish();
        let result = unary_dict::<_, _, Int32Type>(&dictionary_array, |n| n + 1).unwrap();
        assert_eq!(
            result
                .as_any()
                .downcast_ref::<DictionaryArray<Int8Type>>()
                .unwrap(),
            &expected
        );
        let result = unary_dyn::<_, Int32Type>(&dictionary_array, |n| n + 1).unwrap();
        assert_eq!(
            result
                .as_any()
                .downcast_ref::<DictionaryArray<Int8Type>>()
                .unwrap(),
            &expected
        );
    }

    // Null propagation: the result is null wherever either input is null.
    #[test]
    fn test_binary_mut() {
        let a = Int32Array::from(vec![15, 14, 9, 8, 1]);
        let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]);
        let c = binary_mut(a, &b, |l, r| l + r).unwrap().unwrap();
        let expected = Int32Array::from(vec![Some(16), None, Some(12), None, Some(6)]);
        assert_eq!(c, expected);
    }

    // Covers the nullable path, the no-null fast path, and error propagation
    // through the inner Result.
    #[test]
    fn test_try_binary_mut() {
        let a = Int32Array::from(vec![15, 14, 9, 8, 1]);
        let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]);
        let c = try_binary_mut(a, &b, |l, r| Ok(l + r)).unwrap().unwrap();
        let expected = Int32Array::from(vec![Some(16), None, Some(12), None, Some(6)]);
        assert_eq!(c, expected);
        let a = Int32Array::from(vec![15, 14, 9, 8, 1]);
        let b = Int32Array::from(vec![1, 2, 3, 4, 5]);
        let c = try_binary_mut(a, &b, |l, r| Ok(l + r)).unwrap().unwrap();
        let expected = Int32Array::from(vec![16, 16, 12, 12, 6]);
        assert_eq!(c, expected);
        let a = Int32Array::from(vec![15, 14, 9, 8, 1]);
        let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]);
        let _ = try_binary_mut(a, &b, |l, r| {
            if l == 1 {
                Err(ArrowError::InvalidArgumentError(
                    "got error".parse().unwrap(),
                ))
            } else {
                Ok(l + r)
            }
        })
        .unwrap()
        .expect_err("should got error");
    }

    // In-place unary on a dictionary mutates values while keeping null slots.
    #[test]
    fn test_unary_dict_mut() {
        let values = Int32Array::from(vec![Some(10), Some(20), None]);
        let keys = Int8Array::from_iter_values([0, 0, 1, 2]);
        let dictionary = DictionaryArray::new(keys, Arc::new(values));
        let updated = dictionary.unary_mut::<_, Int32Type>(|x| x + 1).unwrap();
        let typed = updated.downcast_dict::<Int32Array>().unwrap();
        assert_eq!(typed.value(0), 11);
        assert_eq!(typed.value(1), 11);
        assert_eq!(typed.value(2), 21);
        let values = updated.values();
        assert!(values.is_null(2));
    }
}

Просмотреть файл

@ -0,0 +1,390 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::arity::{binary, unary};
use arrow_array::*;
use arrow_buffer::ArrowNativeType;
use arrow_schema::ArrowError;
use num::traits::{WrappingShl, WrappingShr};
use std::ops::{BitAnd, BitOr, BitXor, Not};
/// Shared driver for the array/array bitwise kernels.
///
/// Applies `op` element-wise to `left` and `right` via [`binary`]; a result
/// slot is null whenever either input slot is null.
fn bitwise_op<T, F>(
    left: &PrimitiveArray<T>,
    right: &PrimitiveArray<T>,
    op: F,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    F: Fn(T::Native, T::Native) -> T::Native,
{
    // `binary` handles length validation and null propagation for us.
    binary(left, right, op)
}
/// Element-wise `left & right` over two primitive arrays.
///
/// A result slot is null whenever either input slot is null.
pub fn bitwise_and<T>(
    left: &PrimitiveArray<T>,
    right: &PrimitiveArray<T>,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: BitAnd<Output = T::Native>,
{
    bitwise_op(left, right, |lhs, rhs| lhs.bitand(rhs))
}
/// Element-wise `left | right` over two primitive arrays.
///
/// A result slot is null whenever either input slot is null.
pub fn bitwise_or<T>(
    left: &PrimitiveArray<T>,
    right: &PrimitiveArray<T>,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: BitOr<Output = T::Native>,
{
    bitwise_op(left, right, |lhs, rhs| lhs.bitor(rhs))
}
/// Element-wise `left ^ right` over two primitive arrays.
///
/// A result slot is null whenever either input slot is null.
pub fn bitwise_xor<T>(
    left: &PrimitiveArray<T>,
    right: &PrimitiveArray<T>,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: BitXor<Output = T::Native>,
{
    bitwise_op(left, right, |lhs, rhs| lhs.bitxor(rhs))
}
/// Element-wise `left << right` over two primitive arrays.
///
/// Shift amounts wrap modulo the bit width of `T::Native` (the semantics of
/// `wrapping_shl`). A result slot is null whenever either input slot is null.
pub fn bitwise_shift_left<T>(
    left: &PrimitiveArray<T>,
    right: &PrimitiveArray<T>,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: WrappingShl<Output = T::Native>,
{
    bitwise_op(left, right, |value, shift| {
        // Narrowing cast is fine: wrapping_shl reduces the amount mod bit width.
        value.wrapping_shl(shift.as_usize() as u32)
    })
}
/// Element-wise `left >> right` over two primitive arrays.
///
/// Shift amounts wrap modulo the bit width of `T::Native` (the semantics of
/// `wrapping_shr`). A result slot is null whenever either input slot is null.
pub fn bitwise_shift_right<T>(
    left: &PrimitiveArray<T>,
    right: &PrimitiveArray<T>,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: WrappingShr<Output = T::Native>,
{
    bitwise_op(left, right, |value, shift| {
        // Narrowing cast is fine: wrapping_shr reduces the amount mod bit width.
        value.wrapping_shr(shift.as_usize() as u32)
    })
}
/// Element-wise bitwise complement `!array`.
///
/// Null slots stay null. Returns `Result` only for signature consistency with
/// the other kernels in this module; it never fails.
pub fn bitwise_not<T>(array: &PrimitiveArray<T>) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: Not<Output = T::Native>,
{
    Ok(unary(array, |value| value.not()))
}
/// Element-wise `left & !right` over two primitive arrays.
///
/// Equivalent to `bitwise_and(left, &bitwise_not(right)?)` in a single pass.
/// A result slot is null whenever either input slot is null.
pub fn bitwise_and_not<T>(
    left: &PrimitiveArray<T>,
    right: &PrimitiveArray<T>,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: BitAnd<Output = T::Native>,
    T::Native: Not<Output = T::Native>,
{
    bitwise_op(left, right, |lhs, rhs| lhs.bitand(rhs.not()))
}
/// Element-wise `array & scalar`.
///
/// Null slots stay null; the scalar is applied to every valid slot.
pub fn bitwise_and_scalar<T>(
    array: &PrimitiveArray<T>,
    scalar: T::Native,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: BitAnd<Output = T::Native>,
{
    Ok(unary(array, |value| value.bitand(scalar)))
}
/// Element-wise `array | scalar`.
///
/// Null slots stay null; the scalar is applied to every valid slot.
pub fn bitwise_or_scalar<T>(
    array: &PrimitiveArray<T>,
    scalar: T::Native,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: BitOr<Output = T::Native>,
{
    Ok(unary(array, |value| value.bitor(scalar)))
}
/// Element-wise `array ^ scalar`.
///
/// Null slots stay null; the scalar is applied to every valid slot.
pub fn bitwise_xor_scalar<T>(
    array: &PrimitiveArray<T>,
    scalar: T::Native,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: BitXor<Output = T::Native>,
{
    Ok(unary(array, |value| value.bitxor(scalar)))
}
/// Element-wise `array << scalar`.
///
/// The shift amount wraps modulo the bit width of `T::Native` (the semantics
/// of `wrapping_shl`). Null slots stay null.
pub fn bitwise_shift_left_scalar<T>(
    array: &PrimitiveArray<T>,
    scalar: T::Native,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: WrappingShl<Output = T::Native>,
{
    // The shift amount does not depend on the element, so convert it once
    // instead of once per value.
    let shift = scalar.as_usize() as u32;
    Ok(unary(array, |value| value.wrapping_shl(shift)))
}
/// Element-wise `array >> scalar`.
///
/// The shift amount wraps modulo the bit width of `T::Native` (the semantics
/// of `wrapping_shr`). Null slots stay null.
pub fn bitwise_shift_right_scalar<T>(
    array: &PrimitiveArray<T>,
    scalar: T::Native,
) -> Result<PrimitiveArray<T>, ArrowError>
where
    T: ArrowNumericType,
    T::Native: WrappingShr<Output = T::Native>,
{
    // The shift amount does not depend on the element, so convert it once
    // instead of once per value.
    let shift = scalar.as_usize() as u32;
    Ok(unary(array, |value| value.wrapping_shr(shift)))
}
#[cfg(test)]
mod tests {
    use super::*;

    // Each kernel is checked against at least one unsigned and one signed
    // array, with nulls interleaved to verify null propagation.

    #[test]
    fn test_bitwise_and_array() -> Result<(), ArrowError> {
        // unsigned value
        let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let right = UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12)]);
        let expected = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let result = bitwise_and(&left, &right)?;
        assert_eq!(expected, result);

        // signed value
        let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let right = Int32Array::from(vec![Some(5), Some(-10), Some(8), Some(12)]);
        let expected = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let result = bitwise_and(&left, &right)?;
        assert_eq!(expected, result);
        Ok(())
    }

    #[test]
    fn test_bitwise_shift_left() {
        // u64::MAX as a shift amount exercises the wrapping (mod-64) path:
        // 8 << 63 wraps to 0.
        let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(8)]);
        let right = UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(u64::MAX)]);
        let expected = UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(0)]);
        let result = bitwise_shift_left(&left, &right).unwrap();
        assert_eq!(expected, result);
    }

    #[test]
    fn test_bitwise_shift_left_scalar() {
        let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(8)]);
        let scalar = 2;
        let expected = UInt64Array::from(vec![Some(4), Some(8), None, Some(16), Some(32)]);
        let result = bitwise_shift_left_scalar(&left, scalar).unwrap();
        assert_eq!(expected, result);
    }

    #[test]
    fn test_bitwise_shift_right() {
        // A shift amount of 65 wraps to 1 (mod 64): 3 >> 1 == 1.
        let left = UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]);
        let right = UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(65)]);
        let expected = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(1)]);
        let result = bitwise_shift_right(&left, &right).unwrap();
        assert_eq!(expected, result);
    }

    #[test]
    fn test_bitwise_shift_right_scalar() {
        let left = UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]);
        let scalar = 2;
        let expected = UInt64Array::from(vec![Some(8), Some(512), None, Some(4096), Some(0)]);
        let result = bitwise_shift_right_scalar(&left, scalar).unwrap();
        assert_eq!(expected, result);
    }

    #[test]
    fn test_bitwise_and_array_scalar() {
        // unsigned value
        let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]);
        let scalar = 7;
        let expected = UInt64Array::from(vec![Some(7), Some(2), None, Some(4)]);
        let result = bitwise_and_scalar(&left, scalar).unwrap();
        assert_eq!(expected, result);

        // signed value (negative scalar exercises two's-complement masking)
        let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let scalar = -20;
        let expected = Int32Array::from(vec![Some(0), Some(0), None, Some(4)]);
        let result = bitwise_and_scalar(&left, scalar).unwrap();
        assert_eq!(expected, result);
    }

    #[test]
    fn test_bitwise_or_array() {
        // unsigned value
        let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let right = UInt64Array::from(vec![Some(7), Some(5), Some(8), Some(13)]);
        let expected = UInt64Array::from(vec![Some(7), Some(7), None, Some(13)]);
        let result = bitwise_or(&left, &right).unwrap();
        assert_eq!(expected, result);

        // signed value
        let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let right = Int32Array::from(vec![Some(-7), Some(-5), Some(8), Some(13)]);
        let expected = Int32Array::from(vec![Some(-7), Some(-5), None, Some(13)]);
        let result = bitwise_or(&left, &right).unwrap();
        assert_eq!(expected, result);
    }

    #[test]
    fn test_bitwise_not_array() {
        // unsigned value: !x == u64::MAX - x
        let array = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let expected = UInt64Array::from(vec![
            Some(18446744073709551614),
            Some(18446744073709551613),
            None,
            Some(18446744073709551611),
        ]);
        let result = bitwise_not(&array).unwrap();
        assert_eq!(expected, result);

        // signed value: !x == -x - 1 in two's complement
        let array = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let expected = Int32Array::from(vec![Some(-2), Some(-3), None, Some(-5)]);
        let result = bitwise_not(&array).unwrap();
        assert_eq!(expected, result);
    }

    #[test]
    fn test_bitwise_and_not_array() {
        // unsigned value
        let left = UInt64Array::from(vec![Some(8), Some(2), None, Some(4)]);
        let right = UInt64Array::from(vec![Some(7), Some(5), Some(8), Some(13)]);
        let expected = UInt64Array::from(vec![Some(8), Some(2), None, Some(0)]);
        let result = bitwise_and_not(&left, &right).unwrap();
        assert_eq!(expected, result);
        // fused kernel must agree with the two-step and(left, not(right))
        assert_eq!(
            bitwise_and(&left, &bitwise_not(&right).unwrap()).unwrap(),
            result
        );

        // signed value
        let left = Int32Array::from(vec![Some(2), Some(1), None, Some(3)]);
        let right = Int32Array::from(vec![Some(-7), Some(-5), Some(8), Some(13)]);
        let expected = Int32Array::from(vec![Some(2), Some(0), None, Some(2)]);
        let result = bitwise_and_not(&left, &right).unwrap();
        assert_eq!(expected, result);
        assert_eq!(
            bitwise_and(&left, &bitwise_not(&right).unwrap()).unwrap(),
            result
        );
    }

    #[test]
    fn test_bitwise_or_array_scalar() {
        // unsigned value
        let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]);
        let scalar = 7;
        let expected = UInt64Array::from(vec![Some(15), Some(7), None, Some(7)]);
        let result = bitwise_or_scalar(&left, scalar).unwrap();
        assert_eq!(expected, result);

        // signed value
        let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let scalar = 20;
        let expected = Int32Array::from(vec![Some(21), Some(22), None, Some(20)]);
        let result = bitwise_or_scalar(&left, scalar).unwrap();
        assert_eq!(expected, result);
    }

    #[test]
    fn test_bitwise_xor_array() {
        // unsigned value
        let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let right = UInt64Array::from(vec![Some(7), Some(5), Some(8), Some(13)]);
        let expected = UInt64Array::from(vec![Some(6), Some(7), None, Some(9)]);
        let result = bitwise_xor(&left, &right).unwrap();
        assert_eq!(expected, result);

        // signed value
        let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let right = Int32Array::from(vec![Some(-7), Some(5), Some(8), Some(-13)]);
        let expected = Int32Array::from(vec![Some(-8), Some(7), None, Some(-9)]);
        let result = bitwise_xor(&left, &right).unwrap();
        assert_eq!(expected, result);
    }

    #[test]
    fn test_bitwise_xor_array_scalar() {
        // unsigned value
        let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]);
        let scalar = 7;
        let expected = UInt64Array::from(vec![Some(8), Some(5), None, Some(3)]);
        let result = bitwise_xor_scalar(&left, scalar).unwrap();
        assert_eq!(expected, result);

        // signed value
        let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]);
        let scalar = -20;
        let expected = Int32Array::from(vec![Some(-19), Some(-18), None, Some(-24)]);
        let result = bitwise_xor_scalar(&left, scalar).unwrap();
        assert_eq!(expected, result);
    }
}

Просмотреть файл

@ -0,0 +1,914 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Defines boolean kernels on Arrow `BooleanArray`'s, e.g. `AND`, `OR` and `NOT`.
//!
//! These kernels can leverage SIMD if available on your system. Currently no runtime
//! detection is provided, you should enable the specific SIMD intrinsics using
//! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation
//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information.
use arrow_array::*;
use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper};
use arrow_buffer::{buffer_bin_and_not, BooleanBuffer, NullBuffer};
use arrow_schema::ArrowError;
/// Logical 'and' boolean values with Kleene logic
///
/// # Behavior
///
/// This function behaves as follows with nulls:
///
/// * `true` and `null` = `null`
/// * `null` and `true` = `null`
/// * `false` and `null` = `false`
/// * `null` and `false` = `false`
/// * `null` and `null` = `null`
///
/// In other words, in this context a null value really means "unknown",
/// and an unknown value 'and' false is always false.
/// For a different null behavior, see function "and".
///
/// # Example
///
/// ```rust
/// # use arrow_array::BooleanArray;
/// # use arrow_arith::boolean::and_kleene;
/// let a = BooleanArray::from(vec![Some(true), Some(false), None]);
/// let b = BooleanArray::from(vec![None, None, None]);
/// let and_ab = and_kleene(&a, &b).unwrap();
/// assert_eq!(and_ab, BooleanArray::from(vec![None, Some(false), None]));
/// ```
///
/// # Fails
///
/// If the operands have different lengths
pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray, ArrowError> {
    if left.len() != right.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform bitwise operation on arrays of different length".to_string(),
        ));
    }
    let left_values = left.values();
    let right_values = right.values();
    // Compute the validity (null) buffer. In the null buffers below, a set bit
    // means "valid"; in the value buffers, a set bit means "true".
    let buffer = match (left.nulls(), right.nulls()) {
        // Neither side has nulls, so the result has none either.
        (None, None) => None,
        (Some(left_null_buffer), None) => {
            // The right side has no null values.
            // The final null bit is set only if:
            // 1. left null bit is set, or
            // 2. right data bit is false (because null AND false = false).
            Some(bitwise_bin_op_helper(
                left_null_buffer.buffer(),
                left_null_buffer.offset(),
                right_values.inner(),
                right_values.offset(),
                left.len(),
                |a, b| a | !b,
            ))
        }
        (None, Some(right_null_buffer)) => {
            // Same as above, with the roles of left and right swapped.
            Some(bitwise_bin_op_helper(
                right_null_buffer.buffer(),
                right_null_buffer.offset(),
                left_values.inner(),
                left_values.offset(),
                left.len(),
                |a, b| a | !b,
            ))
        }
        (Some(left_null_buffer), Some(right_null_buffer)) => {
            // Follow the same logic above. Both sides have null values.
            // Assume a is left null bits, b is left data bits, c is right null bits,
            // d is right data bits.
            // The final null bits are:
            // (a | (c & !d)) & (c | (a & !b))
            Some(bitwise_quaternary_op_helper(
                [
                    left_null_buffer.buffer(),
                    left_values.inner(),
                    right_null_buffer.buffer(),
                    right_values.inner(),
                ],
                [
                    left_null_buffer.offset(),
                    left_values.offset(),
                    right_null_buffer.offset(),
                    right_values.offset(),
                ],
                left.len(),
                |a, b, c, d| (a | (c & !d)) & (c | (a & !b)),
            ))
        }
    };
    let nulls = buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, left.len())));
    // The value bits are a plain AND; Kleene semantics live entirely in `nulls`.
    // (A null slot's value bit is meaningless, so this is safe.)
    Ok(BooleanArray::new(left_values & right_values, nulls))
}
/// Logical 'or' boolean values with Kleene logic
///
/// # Behavior
///
/// This function behaves as follows with nulls:
///
/// * `true` or `null` = `true`
/// * `null` or `true` = `true`
/// * `false` or `null` = `null`
/// * `null` or `false` = `null`
/// * `null` or `null` = `null`
///
/// In other words, in this context a null value really means "unknown",
/// and an unknown value 'or' true is always true.
/// For a different null behavior, see function "or".
///
/// # Example
///
/// ```rust
/// # use arrow_array::BooleanArray;
/// # use arrow_arith::boolean::or_kleene;
/// let a = BooleanArray::from(vec![Some(true), Some(false), None]);
/// let b = BooleanArray::from(vec![None, None, None]);
/// let or_ab = or_kleene(&a, &b).unwrap();
/// assert_eq!(or_ab, BooleanArray::from(vec![Some(true), None, None]));
/// ```
///
/// # Fails
///
/// If the operands have different lengths
pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray, ArrowError> {
    if left.len() != right.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform bitwise operation on arrays of different length".to_string(),
        ));
    }
    let left_values = left.values();
    let right_values = right.values();
    // Compute the validity (null) buffer. In the null buffers below, a set bit
    // means "valid"; in the value buffers, a set bit means "true".
    let buffer = match (left.nulls(), right.nulls()) {
        // Neither side has nulls, so the result has none either.
        (None, None) => None,
        (Some(left_nulls), None) => {
            // The right side has no null values.
            // The final null bit is set only if:
            // 1. left null bit is set, or
            // 2. right data bit is true (because null OR true = true).
            Some(bitwise_bin_op_helper(
                left_nulls.buffer(),
                left_nulls.offset(),
                right_values.inner(),
                right_values.offset(),
                left.len(),
                |a, b| a | b,
            ))
        }
        (None, Some(right_nulls)) => {
            // Same as above, with the roles of left and right swapped.
            Some(bitwise_bin_op_helper(
                right_nulls.buffer(),
                right_nulls.offset(),
                left_values.inner(),
                left_values.offset(),
                left.len(),
                |a, b| a | b,
            ))
        }
        (Some(left_nulls), Some(right_nulls)) => {
            // Follow the same logic above. Both sides have null values.
            // Assume a is left null bits, b is left data bits, c is right null bits,
            // d is right data bits.
            // The final null bits are:
            // (a | (c & d)) & (c | (a & b))
            Some(bitwise_quaternary_op_helper(
                [
                    left_nulls.buffer(),
                    left_values.inner(),
                    right_nulls.buffer(),
                    right_values.inner(),
                ],
                [
                    left_nulls.offset(),
                    left_values.offset(),
                    right_nulls.offset(),
                    right_values.offset(),
                ],
                left.len(),
                |a, b, c, d| (a | (c & d)) & (c | (a & b)),
            ))
        }
    };
    let nulls = buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, left.len())));
    // The value bits are a plain OR; Kleene semantics live entirely in `nulls`.
    // (A null slot's value bit is meaningless, so this is safe.)
    Ok(BooleanArray::new(left_values | right_values, nulls))
}
/// Shared implementation for the binary boolean kernels.
///
/// Validates that both inputs have the same length, merges their null buffers
/// (a result slot is valid only when both inputs are valid), and delegates the
/// value computation to `op`.
pub(crate) fn binary_boolean_kernel<F>(
    left: &BooleanArray,
    right: &BooleanArray,
    op: F,
) -> Result<BooleanArray, ArrowError>
where
    F: Fn(&BooleanBuffer, &BooleanBuffer) -> BooleanBuffer,
{
    if left.len() != right.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform bitwise operation on arrays of different length".to_string(),
        ));
    }
    let combined_nulls = NullBuffer::union(left.nulls(), right.nulls());
    let computed = op(left.values(), right.values());
    Ok(BooleanArray::new(computed, combined_nulls))
}
/// Performs `AND` on two [`BooleanArray`]s; any null input yields a null result.
///
/// # Error
/// Errors when the arrays have different lengths.
///
/// # Example
/// ```rust
/// # use arrow_array::BooleanArray;
/// # use arrow_arith::boolean::and;
/// let a = BooleanArray::from(vec![Some(false), Some(true), None]);
/// let b = BooleanArray::from(vec![Some(true), Some(true), Some(false)]);
/// let and_ab = and(&a, &b).unwrap();
/// assert_eq!(and_ab, BooleanArray::from(vec![Some(false), Some(true), None]));
/// ```
pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray, ArrowError> {
    binary_boolean_kernel(left, right, |l, r| l & r)
}
/// Performs `OR` on two [`BooleanArray`]s; any null input yields a null result.
///
/// # Error
/// Errors when the arrays have different lengths.
///
/// # Example
/// ```rust
/// # use arrow_array::BooleanArray;
/// # use arrow_arith::boolean::or;
/// let a = BooleanArray::from(vec![Some(false), Some(true), None]);
/// let b = BooleanArray::from(vec![Some(true), Some(true), Some(false)]);
/// let or_ab = or(&a, &b).unwrap();
/// assert_eq!(or_ab, BooleanArray::from(vec![Some(true), Some(true), None]));
/// ```
pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray, ArrowError> {
    binary_boolean_kernel(left, right, |l, r| l | r)
}
/// Performs `AND_NOT` operation on two arrays. If either left or right value is null then the
/// result is also null.
/// # Error
/// This function errors when the arrays have different lengths.
/// # Example
/// ```rust
/// # use arrow_array::BooleanArray;
/// # use arrow_arith::boolean::{and, not, and_not};
/// let a = BooleanArray::from(vec![Some(false), Some(true), None]);
/// let b = BooleanArray::from(vec![Some(true), Some(true), Some(false)]);
/// let andn_ab = and_not(&a, &b).unwrap();
/// assert_eq!(andn_ab, BooleanArray::from(vec![Some(false), Some(false), None]));
/// // It's equal to and(left, not(right))
/// assert_eq!(andn_ab, and(&a, &not(&b).unwrap()).unwrap());
/// ```
pub fn and_not(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray, ArrowError> {
    binary_boolean_kernel(left, right, |a, b| {
        // Each buffer must be paired with its own bit offset: `a`/`a.offset()`
        // is the left operand, `b`/`b.offset()` the right one. (The previous
        // code swapped the two offsets, reading the wrong bits whenever the
        // inputs were sliced with different offsets.)
        let buffer = buffer_bin_and_not(a.inner(), a.offset(), b.inner(), b.offset(), a.len());
        // `buffer_bin_and_not` produces a freshly packed buffer whose result
        // bits start at bit 0, so the view must start at offset 0 rather than
        // at the input array's offset.
        BooleanBuffer::new(buffer, 0, a.len())
    })
}
/// Performs unary `NOT` on a [`BooleanArray`]; null slots stay null.
///
/// # Error
/// Never fails; returns `Result` only for signature consistency with the
/// other kernels in this module.
///
/// # Example
/// ```rust
/// # use arrow_array::BooleanArray;
/// # use arrow_arith::boolean::not;
/// let a = BooleanArray::from(vec![Some(false), Some(true), None]);
/// let not_a = not(&a).unwrap();
/// assert_eq!(not_a, BooleanArray::from(vec![Some(true), Some(false), None]));
/// ```
pub fn not(left: &BooleanArray) -> Result<BooleanArray, ArrowError> {
    // Flip the value bits; the null buffer is carried over unchanged.
    let inverted = !left.values();
    Ok(BooleanArray::new(inverted, left.nulls().cloned()))
}
/// Returns a non-null [BooleanArray] indicating which slots of `input` are null.
///
/// # Error
/// Never fails; returns `Result` only for signature consistency.
///
/// # Example
/// ```rust
/// # use arrow_array::BooleanArray;
/// # use arrow_arith::boolean::is_null;
/// let a = BooleanArray::from(vec![Some(false), Some(true), None]);
/// let a_is_null = is_null(&a).unwrap();
/// assert_eq!(a_is_null, BooleanArray::from(vec![false, false, true]));
/// ```
pub fn is_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
    // No logical null buffer means nothing is null; otherwise invert the
    // validity bits (valid = 1) to get the "is null" bits.
    let values = input
        .logical_nulls()
        .map_or_else(|| BooleanBuffer::new_unset(input.len()), |n| !n.inner());
    Ok(BooleanArray::new(values, None))
}
/// Returns a non-null [BooleanArray] indicating which slots of `input` are not null.
///
/// # Error
/// Never fails; returns `Result` only for signature consistency.
///
/// # Example
/// ```rust
/// # use arrow_array::BooleanArray;
/// # use arrow_arith::boolean::is_not_null;
/// let a = BooleanArray::from(vec![Some(false), Some(true), None]);
/// let a_is_not_null = is_not_null(&a).unwrap();
/// assert_eq!(a_is_not_null, BooleanArray::from(vec![true, true, false]));
/// ```
pub fn is_not_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
    // The validity bits (valid = 1) are exactly the "is not null" answer;
    // an absent null buffer means everything is valid.
    let values = match input.logical_nulls() {
        Some(nulls) => nulls.inner().clone(),
        None => BooleanBuffer::new_set(input.len()),
    };
    Ok(BooleanArray::new(values, None))
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Arc;
#[test]
fn test_bool_array_and() {
let a = BooleanArray::from(vec![false, false, true, true]);
let b = BooleanArray::from(vec![false, true, false, true]);
let c = and(&a, &b).unwrap();
let expected = BooleanArray::from(vec![false, false, false, true]);
assert_eq!(c, expected);
}
#[test]
fn test_bool_array_or() {
let a = BooleanArray::from(vec![false, false, true, true]);
let b = BooleanArray::from(vec![false, true, false, true]);
let c = or(&a, &b).unwrap();
let expected = BooleanArray::from(vec![false, true, true, true]);
assert_eq!(c, expected);
}
#[test]
fn test_bool_array_and_not() {
let a = BooleanArray::from(vec![false, false, true, true]);
let b = BooleanArray::from(vec![false, true, false, true]);
let c = and_not(&a, &b).unwrap();
let expected = BooleanArray::from(vec![false, false, true, false]);
assert_eq!(c, expected);
assert_eq!(c, and(&a, &not(&b).unwrap()).unwrap());
}
#[test]
fn test_bool_array_or_nulls() {
let a = BooleanArray::from(vec![
None,
None,
None,
Some(false),
Some(false),
Some(false),
Some(true),
Some(true),
Some(true),
]);
let b = BooleanArray::from(vec![
None,
Some(false),
Some(true),
None,
Some(false),
Some(true),
None,
Some(false),
Some(true),
]);
let c = or(&a, &b).unwrap();
let expected = BooleanArray::from(vec![
None,
None,
None,
None,
Some(false),
Some(true),
None,
Some(true),
Some(true),
]);
assert_eq!(c, expected);
}
#[test]
fn test_boolean_array_kleene_no_remainder() {
let n = 1024;
let a = BooleanArray::from(vec![true; n]);
let b = BooleanArray::from(vec![None; n]);
let result = or_kleene(&a, &b).unwrap();
assert_eq!(result, a);
}
#[test]
fn test_bool_array_and_kleene_nulls() {
let a = BooleanArray::from(vec![
None,
None,
None,
Some(false),
Some(false),
Some(false),
Some(true),
Some(true),
Some(true),
]);
let b = BooleanArray::from(vec![
None,
Some(false),
Some(true),
None,
Some(false),
Some(true),
None,
Some(false),
Some(true),
]);
let c = and_kleene(&a, &b).unwrap();
let expected = BooleanArray::from(vec![
None,
Some(false),
None,
Some(false),
Some(false),
Some(false),
None,
Some(false),
Some(true),
]);
assert_eq!(c, expected);
}
#[test]
fn test_bool_array_or_kleene_nulls() {
let a = BooleanArray::from(vec![
None,
None,
None,
Some(false),
Some(false),
Some(false),
Some(true),
Some(true),
Some(true),
]);
let b = BooleanArray::from(vec![
None,
Some(false),
Some(true),
None,
Some(false),
Some(true),
None,
Some(false),
Some(true),
]);
let c = or_kleene(&a, &b).unwrap();
let expected = BooleanArray::from(vec![
None,
None,
Some(true),
None,
Some(false),
Some(true),
Some(true),
Some(true),
Some(true),
]);
assert_eq!(c, expected);
}
#[test]
fn test_bool_array_or_kleene_right_sided_nulls() {
let a = BooleanArray::from(vec![false, false, false, true, true, true]);
// ensure null bitmap of a is absent
assert!(a.nulls().is_none());
let b = BooleanArray::from(vec![
Some(true),
Some(false),
None,
Some(true),
Some(false),
None,
]);
// ensure null bitmap of b is present
assert!(b.nulls().is_some());
let c = or_kleene(&a, &b).unwrap();
let expected = BooleanArray::from(vec![
Some(true),
Some(false),
None,
Some(true),
Some(true),
Some(true),
]);
assert_eq!(c, expected);
}
#[test]
fn test_bool_array_or_kleene_left_sided_nulls() {
let a = BooleanArray::from(vec![
Some(true),
Some(false),
None,
Some(true),
Some(false),
None,
]);
// ensure null bitmap of b is absent
assert!(a.nulls().is_some());
let b = BooleanArray::from(vec![false, false, false, true, true, true]);
// ensure null bitmap of a is present
assert!(b.nulls().is_none());
let c = or_kleene(&a, &b).unwrap();
let expected = BooleanArray::from(vec![
Some(true),
Some(false),
None,
Some(true),
Some(true),
Some(true),
]);
assert_eq!(c, expected);
}
#[test]
fn test_bool_array_not() {
let a = BooleanArray::from(vec![false, true]);
let c = not(&a).unwrap();
let expected = BooleanArray::from(vec![true, false]);
assert_eq!(c, expected);
}
#[test]
fn test_bool_array_not_sliced() {
let a = BooleanArray::from(vec![None, Some(true), Some(false), None, Some(true)]);
let a = a.slice(1, 4);
let a = a.as_any().downcast_ref::<BooleanArray>().unwrap();
let c = not(a).unwrap();
let expected = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]);
assert_eq!(c, expected);
}
#[test]
fn test_bool_array_and_nulls() {
let a = BooleanArray::from(vec![
None,
None,
None,
Some(false),
Some(false),
Some(false),
Some(true),
Some(true),
Some(true),
]);
let b = BooleanArray::from(vec![
None,
Some(false),
Some(true),
None,
Some(false),
Some(true),
None,
Some(false),
Some(true),
]);
let c = and(&a, &b).unwrap();
let expected = BooleanArray::from(vec![
None,
None,
None,
None,
Some(false),
Some(false),
None,
Some(false),
Some(true),
]);
assert_eq!(c, expected);
}
#[test]
fn test_bool_array_and_sliced_same_offset() {
let a = BooleanArray::from(vec![
false, false, false, false, false, false, false, false, false, false, true, true,
]);
let b = BooleanArray::from(vec![
false, false, false, false, false, false, false, false, false, true, false, true,
]);
let a = a.slice(8, 4);
let a = a.as_any().downcast_ref::<BooleanArray>().unwrap();
let b = b.slice(8, 4);
let b = b.as_any().downcast_ref::<BooleanArray>().unwrap();
let c = and(a, b).unwrap();
let expected = BooleanArray::from(vec![false, false, false, true]);
assert_eq!(expected, c);
}
#[test]
fn test_bool_array_and_sliced_same_offset_mod8() {
let a = BooleanArray::from(vec![
false, false, true, true, false, false, false, false, false, false, false, false,
]);
let b = BooleanArray::from(vec![
false, false, false, false, false, false, false, false, false, true, false, true,
]);
let a = a.slice(0, 4);
let a = a.as_any().downcast_ref::<BooleanArray>().unwrap();
let b = b.slice(8, 4);
let b = b.as_any().downcast_ref::<BooleanArray>().unwrap();
let c = and(a, b).unwrap();
let expected = BooleanArray::from(vec![false, false, false, true]);
assert_eq!(expected, c);
}
#[test]
fn test_bool_array_and_sliced_offset1() {
let a = BooleanArray::from(vec![
false, false, false, false, false, false, false, false, false, false, true, true,
]);
let b = BooleanArray::from(vec![false, true, false, true]);
let a = a.slice(8, 4);
let a = a.as_any().downcast_ref::<BooleanArray>().unwrap();
let c = and(a, &b).unwrap();
let expected = BooleanArray::from(vec![false, false, false, true]);
assert_eq!(expected, c);
}
#[test]
fn test_bool_array_and_sliced_offset2() {
let a = BooleanArray::from(vec![false, false, true, true]);
let b = BooleanArray::from(vec![
false, false, false, false, false, false, false, false, false, true, false, true,
]);
let b = b.slice(8, 4);
let b = b.as_any().downcast_ref::<BooleanArray>().unwrap();
let c = and(&a, b).unwrap();
let expected = BooleanArray::from(vec![false, false, false, true]);
assert_eq!(expected, c);
}
#[test]
fn test_bool_array_and_nulls_offset() {
let a = BooleanArray::from(vec![None, Some(false), Some(true), None, Some(true)]);
let a = a.slice(1, 4);
let a = a.as_any().downcast_ref::<BooleanArray>().unwrap();
let b = BooleanArray::from(vec![
None,
None,
Some(true),
Some(false),
Some(true),
Some(true),
]);
let b = b.slice(2, 4);
let b = b.as_any().downcast_ref::<BooleanArray>().unwrap();
let c = and(a, b).unwrap();
let expected = BooleanArray::from(vec![Some(false), Some(false), None, Some(true)]);
assert_eq!(expected, c);
}
#[test]
fn test_nonnull_array_is_null() {
let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
let res = is_null(a.as_ref()).unwrap();
let expected = BooleanArray::from(vec![false, false, false, false]);
assert_eq!(expected, res);
assert!(res.nulls().is_none());
}
#[test]
fn test_nonnull_array_with_offset_is_null() {
let a = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1]);
let a = a.slice(8, 4);
let res = is_null(&a).unwrap();
let expected = BooleanArray::from(vec![false, false, false, false]);
assert_eq!(expected, res);
assert!(res.nulls().is_none());
}
#[test]
fn test_nonnull_array_is_not_null() {
let a = Int32Array::from(vec![1, 2, 3, 4]);
let res = is_not_null(&a).unwrap();
let expected = BooleanArray::from(vec![true, true, true, true]);
assert_eq!(expected, res);
assert!(res.nulls().is_none());
}
#[test]
fn test_nonnull_array_with_offset_is_not_null() {
let a = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1]);
let a = a.slice(8, 4);
let res = is_not_null(&a).unwrap();
let expected = BooleanArray::from(vec![true, true, true, true]);
assert_eq!(expected, res);
assert!(res.nulls().is_none());
}
#[test]
fn test_nullable_array_is_null() {
let a = Int32Array::from(vec![Some(1), None, Some(3), None]);
let res = is_null(&a).unwrap();
let expected = BooleanArray::from(vec![false, true, false, true]);
assert_eq!(expected, res);
assert!(res.nulls().is_none());
}
#[test]
fn test_nullable_array_with_offset_is_null() {
    // Eight leading nulls that the slice below skips over.
    let mut values: Vec<Option<i32>> = vec![None; 8];
    values.extend([Some(1), None, Some(2), None, Some(3), Some(4), None, None]);
    let array = Int32Array::from(values);
    let sliced = array.slice(8, 4);
    let result = is_null(&sliced).unwrap();
    assert_eq!(BooleanArray::from(vec![false, true, false, true]), result);
    assert!(result.nulls().is_none());
}
#[test]
fn test_nullable_array_is_not_null() {
    let array = Int32Array::from(vec![Some(1), None, Some(3), None]);
    let result = is_not_null(&array).unwrap();
    // The result itself carries no null buffer — only true/false values.
    assert_eq!(BooleanArray::from(vec![true, false, true, false]), result);
    assert!(result.nulls().is_none());
}
#[test]
fn test_nullable_array_with_offset_is_not_null() {
    // Eight leading nulls that the slice below skips over.
    let mut values: Vec<Option<i32>> = vec![None; 8];
    values.extend([Some(1), None, Some(2), None, Some(3), Some(4), None, None]);
    let array = Int32Array::from(values);
    let sliced = array.slice(8, 4);
    let result = is_not_null(&sliced).unwrap();
    assert_eq!(BooleanArray::from(vec![true, false, true, false]), result);
    assert!(result.nulls().is_none());
}
#[test]
fn test_null_array_is_null() {
    // A `NullArray` is logically all-null even though it stores no null buffer.
    let array = NullArray::new(3);
    let result = is_null(&array).unwrap();
    assert_eq!(BooleanArray::from(vec![true; 3]), result);
    assert!(result.nulls().is_none());
}
#[test]
fn test_null_array_is_not_null() {
    // A `NullArray` is logically all-null even though it stores no null buffer.
    let array = NullArray::new(3);
    let result = is_not_null(&array).unwrap();
    assert_eq!(BooleanArray::from(vec![false; 3]), result);
    assert!(result.nulls().is_none());
}
}

Просмотреть файл

@ -0,0 +1,27 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Arrow arithmetic and aggregation kernels
// Aggregation kernels.
pub mod aggregate;
#[doc(hidden)] // Kernels to be removed in a future release
pub mod arithmetic;
// Kernels for applying functions by arity (unary/binary application helpers).
pub mod arity;
// Bitwise kernels.
pub mod bitwise;
// Boolean kernels (and/or/not, null checks).
pub mod boolean;
// Numeric kernels.
pub mod numeric;
// Temporal (date/time) kernels.
pub mod temporal;

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,72 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Crate metadata; most fields are inherited from the workspace manifest.
[package]
name = "arrow-array"
version = { workspace = true }
description = "Array abstractions for Apache Arrow"
homepage = { workspace = true }
repository = { workspace = true }
authors = { workspace = true }
license = { workspace = true }
keywords = { workspace = true }
include = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
# Library target; benches are opted out of the default harness below.
[lib]
name = "arrow_array"
path = "src/lib.rs"
bench = false
# ahash: wasm32 has no runtime entropy source, so it needs a compile-time RNG;
# all other targets seed the hasher at runtime.
[target.'cfg(target_arch = "wasm32")'.dependencies]
ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] }
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
[dependencies]
arrow-buffer = { workspace = true }
arrow-schema = { workspace = true }
arrow-data = { workspace = true }
chrono = { workspace = true }
# Optional: timezone-aware timestamp support.
chrono-tz = { version = "0.9", optional = true }
num = { version = "0.4.1", default-features = false, features = ["std"] }
half = { version = "2.1", default-features = false, features = ["num-traits"] }
hashbrown = { version = "0.14", default-features = false }
[features]
# C Data Interface (FFI) support, forwarded to the schema/data crates.
ffi = ["arrow-schema/ffi", "arrow-data/ffi"]
[dev-dependencies]
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
criterion = { version = "0.5", default-features = false }
[build-dependencies]
# Criterion benches: harness = false so criterion supplies its own main().
[[bench]]
name = "occupancy"
harness = false
[[bench]]
name = "gc_view_types"
harness = false
[[bench]]
name = "fixed_size_list_array"
harness = false

Просмотреть файл

@ -0,0 +1,51 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use arrow_array::{Array, FixedSizeListArray, Int32Array};
use arrow_schema::Field;
use criterion::*;
use rand::{thread_rng, Rng};
use std::sync::Arc;
/// Build a `FixedSizeListArray` over `len` random `i32` values, grouped into
/// lists of `value_len` elements each (`len` must be divisible by `value_len`).
fn gen_fsl(len: usize, value_len: usize) -> FixedSizeListArray {
    let mut rng = thread_rng();
    let data: Vec<i32> = (0..len).map(|_| rng.gen()).collect();
    let values = Arc::new(Int32Array::from(data));
    let field = Arc::new(Field::new("item", values.data_type().clone(), true));
    FixedSizeListArray::new(field, value_len as i32, values, None)
}
/// Benchmark `FixedSizeListArray::value` across several list widths.
fn criterion_benchmark(c: &mut Criterion) {
    let len = 4096;
    for value_len in [1, 32, 1024] {
        let fsl = gen_fsl(len, value_len);
        let id = format!("fixed_size_list_array(len: {len}, value_len: {value_len})");
        c.bench_function(&id, |b| {
            b.iter(|| {
                // Touch every list once per iteration.
                (0..len / value_len).for_each(|i| {
                    black_box(fsl.value(i));
                });
            });
        });
    }
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

Просмотреть файл

@ -0,0 +1,48 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use arrow_array::StringViewArray;
use criterion::*;
/// Build a `StringViewArray` of `size` elements cycling through a short
/// string, a long (>12 byte) string, and a null.
fn gen_view_array(size: usize) -> StringViewArray {
    let items = (0..size).map(|i| match i % 3 {
        0 => Some("small"),
        1 => Some("larger than 12 bytes array"),
        _ => None,
    });
    StringViewArray::from_iter(items)
}
/// Benchmark garbage collection of view arrays, whole and sliced.
fn criterion_benchmark(c: &mut Criterion) {
    let array = gen_view_array(100_000);

    c.bench_function("gc view types all", |b| {
        b.iter(|| black_box(array.gc()));
    });

    // GC of a slice should only copy the retained half of the buffers.
    let sliced = array.slice(0, 100_000 / 2);
    c.bench_function("gc view types slice half", |b| {
        b.iter(|| black_box(sliced.gc()));
    });
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

Просмотреть файл

@ -0,0 +1,57 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use arrow_array::types::Int32Type;
use arrow_array::{DictionaryArray, Int32Array};
use arrow_buffer::NullBuffer;
use criterion::*;
use rand::{thread_rng, Rng};
use std::sync::Arc;
/// Build a dictionary array with `len` keys over `values_len` dictionary
/// values, where keys only reference the first `occupancy * values_len`
/// values and roughly `null_percent` of the keys are null.
fn gen_dict(
    len: usize,
    values_len: usize,
    occupancy: f64,
    null_percent: f64,
) -> DictionaryArray<Int32Type> {
    let mut rng = thread_rng();
    let values = Int32Array::from(vec![0; values_len]);
    let key_bound = (values_len as f64 * occupancy) as i32;
    let key_values = (0..len).map(|_| rng.gen_range(0..key_bound)).collect();
    let validity = (0..len).map(|_| !rng.gen_bool(null_percent)).collect();
    let keys = Int32Array::new(key_values, Some(NullBuffer::new(validity)));
    DictionaryArray::new(keys, Arc::new(values))
}
/// Benchmark `DictionaryArray::occupancy` across dictionary size,
/// key occupancy, and null density.
fn criterion_benchmark(c: &mut Criterion) {
    for values in [10, 100, 512] {
        for occupancy in [1., 0.5, 0.1] {
            for null_percent in [0.0, 0.1, 0.5, 0.9] {
                let dict = gen_dict(1024, values, occupancy, null_percent);
                let id = format!("occupancy(values: {values}, occupancy: {occupancy}, null_percent: {null_percent})");
                c.bench_function(&id, |b| {
                    b.iter(|| black_box(&dict).occupancy());
                });
            }
        }
    }
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

Просмотреть файл

@ -0,0 +1,864 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use arrow_buffer::{i256, ArrowNativeType, IntervalDayTime, IntervalMonthDayNano};
use arrow_schema::ArrowError;
use half::f16;
use num::complex::ComplexFloat;
use std::cmp::Ordering;
/// Trait for [`ArrowNativeType`] that adds checked and unchecked arithmetic operations,
/// and totally ordered comparison operations
///
/// The APIs with `_wrapping` suffix do not perform overflow-checking. For integer
/// types they will wrap around the boundary of the type. For floating point types they
/// will overflow to INF or -INF preserving the expected sign value
///
/// Note `div_wrapping` and `mod_wrapping` will panic for integer types if `rhs` is zero
/// although this may be subject to change <https://github.com/apache/arrow-rs/issues/2647>
///
/// The APIs with `_checked` suffix perform overflow-checking. For integer types
/// these will return `Err` instead of wrapping. For floating point types they will
/// overflow to INF or -INF preserving the expected sign value
///
/// Comparison of integer types is as per normal integer comparison rules, floating
/// point values are compared as per IEEE 754's totalOrder predicate see [`f32::total_cmp`]
///
pub trait ArrowNativeTypeOp: ArrowNativeType {
    /// The additive identity
    const ZERO: Self;
    /// The multiplicative identity
    const ONE: Self;
    /// The minimum value and identity for the `max` aggregation.
    /// Note that the aggregation uses the total order predicate for floating point values,
    /// which means that this value is a negative NaN.
    const MIN_TOTAL_ORDER: Self;
    /// The maximum value and identity for the `min` aggregation.
    /// Note that the aggregation uses the total order predicate for floating point values,
    /// which means that this value is a positive NaN.
    const MAX_TOTAL_ORDER: Self;
    /// Checked addition operation
    fn add_checked(self, rhs: Self) -> Result<Self, ArrowError>;
    /// Wrapping addition operation
    fn add_wrapping(self, rhs: Self) -> Self;
    /// Checked subtraction operation
    fn sub_checked(self, rhs: Self) -> Result<Self, ArrowError>;
    /// Wrapping subtraction operation
    fn sub_wrapping(self, rhs: Self) -> Self;
    /// Checked multiplication operation
    fn mul_checked(self, rhs: Self) -> Result<Self, ArrowError>;
    /// Wrapping multiplication operation
    fn mul_wrapping(self, rhs: Self) -> Self;
    /// Checked division operation, returning `Err` when `rhs` is zero
    fn div_checked(self, rhs: Self) -> Result<Self, ArrowError>;
    /// Wrapping division operation
    fn div_wrapping(self, rhs: Self) -> Self;
    /// Checked remainder operation, returning `Err` when `rhs` is zero
    fn mod_checked(self, rhs: Self) -> Result<Self, ArrowError>;
    /// Wrapping remainder operation
    fn mod_wrapping(self, rhs: Self) -> Self;
    /// Checked negation operation
    fn neg_checked(self) -> Result<Self, ArrowError>;
    /// Wrapping negation operation
    fn neg_wrapping(self) -> Self;
    /// Checked exponentiation operation
    fn pow_checked(self, exp: u32) -> Result<Self, ArrowError>;
    /// Wrapping exponentiation operation
    fn pow_wrapping(self, exp: u32) -> Self;
    /// Returns true if zero else false
    fn is_zero(self) -> bool;
    /// Compare operation (a total order; IEEE 754 totalOrder for floats)
    fn compare(self, rhs: Self) -> Ordering;
    /// Equality operation
    fn is_eq(self, rhs: Self) -> bool;
    /// Not equal operation
    #[inline]
    fn is_ne(self, rhs: Self) -> bool {
        !self.is_eq(rhs)
    }
    /// Less than operation
    #[inline]
    fn is_lt(self, rhs: Self) -> bool {
        self.compare(rhs).is_lt()
    }
    /// Less than equals operation
    #[inline]
    fn is_le(self, rhs: Self) -> bool {
        self.compare(rhs).is_le()
    }
    /// Greater than operation
    #[inline]
    fn is_gt(self, rhs: Self) -> bool {
        self.compare(rhs).is_gt()
    }
    /// Greater than equals operation
    #[inline]
    fn is_ge(self, rhs: Self) -> bool {
        self.compare(rhs).is_ge()
    }
}
// Implements `ArrowNativeTypeOp` for integer-like types in terms of the
// standard `checked_*` / `wrapping_*` methods.
macro_rules! native_type_op {
    // Primitive integers: literal 0/1 identities, `$t::MIN`/`$t::MAX` bounds.
    ($t:tt) => {
        native_type_op!($t, 0, 1);
    };
    // Explicit identities; bounds default to the type's MIN/MAX constants.
    ($t:tt, $zero:expr, $one: expr) => {
        native_type_op!($t, $zero, $one, $t::MIN, $t::MAX);
    };
    // Fully explicit form: identities and total-order bounds.
    ($t:tt, $zero:expr, $one: expr, $min: expr, $max: expr) => {
        impl ArrowNativeTypeOp for $t {
            const ZERO: Self = $zero;
            const ONE: Self = $one;
            const MIN_TOTAL_ORDER: Self = $min;
            const MAX_TOTAL_ORDER: Self = $max;
            #[inline]
            fn add_checked(self, rhs: Self) -> Result<Self, ArrowError> {
                self.checked_add(rhs).ok_or_else(|| {
                    ArrowError::ComputeError(format!(
                        "Overflow happened on: {:?} + {:?}",
                        self, rhs
                    ))
                })
            }
            #[inline]
            fn add_wrapping(self, rhs: Self) -> Self {
                self.wrapping_add(rhs)
            }
            #[inline]
            fn sub_checked(self, rhs: Self) -> Result<Self, ArrowError> {
                self.checked_sub(rhs).ok_or_else(|| {
                    ArrowError::ComputeError(format!(
                        "Overflow happened on: {:?} - {:?}",
                        self, rhs
                    ))
                })
            }
            #[inline]
            fn sub_wrapping(self, rhs: Self) -> Self {
                self.wrapping_sub(rhs)
            }
            #[inline]
            fn mul_checked(self, rhs: Self) -> Result<Self, ArrowError> {
                self.checked_mul(rhs).ok_or_else(|| {
                    ArrowError::ComputeError(format!(
                        "Overflow happened on: {:?} * {:?}",
                        self, rhs
                    ))
                })
            }
            #[inline]
            fn mul_wrapping(self, rhs: Self) -> Self {
                self.wrapping_mul(rhs)
            }
            // Division/remainder map a zero divisor to `DivideByZero` rather
            // than panicking; other failures (e.g. MIN / -1) report overflow.
            #[inline]
            fn div_checked(self, rhs: Self) -> Result<Self, ArrowError> {
                if rhs.is_zero() {
                    Err(ArrowError::DivideByZero)
                } else {
                    self.checked_div(rhs).ok_or_else(|| {
                        ArrowError::ComputeError(format!(
                            "Overflow happened on: {:?} / {:?}",
                            self, rhs
                        ))
                    })
                }
            }
            #[inline]
            fn div_wrapping(self, rhs: Self) -> Self {
                self.wrapping_div(rhs)
            }
            #[inline]
            fn mod_checked(self, rhs: Self) -> Result<Self, ArrowError> {
                if rhs.is_zero() {
                    Err(ArrowError::DivideByZero)
                } else {
                    self.checked_rem(rhs).ok_or_else(|| {
                        ArrowError::ComputeError(format!(
                            "Overflow happened on: {:?} % {:?}",
                            self, rhs
                        ))
                    })
                }
            }
            #[inline]
            fn mod_wrapping(self, rhs: Self) -> Self {
                self.wrapping_rem(rhs)
            }
            #[inline]
            fn neg_checked(self) -> Result<Self, ArrowError> {
                self.checked_neg().ok_or_else(|| {
                    ArrowError::ComputeError(format!("Overflow happened on: {:?}", self))
                })
            }
            #[inline]
            fn pow_checked(self, exp: u32) -> Result<Self, ArrowError> {
                self.checked_pow(exp).ok_or_else(|| {
                    ArrowError::ComputeError(format!("Overflow happened on: {:?} ^ {exp:?}", self))
                })
            }
            #[inline]
            fn pow_wrapping(self, exp: u32) -> Self {
                self.wrapping_pow(exp)
            }
            #[inline]
            fn neg_wrapping(self) -> Self {
                self.wrapping_neg()
            }
            #[inline]
            fn is_zero(self) -> bool {
                self == Self::ZERO
            }
            #[inline]
            fn compare(self, rhs: Self) -> Ordering {
                self.cmp(&rhs)
            }
            #[inline]
            fn is_eq(self, rhs: Self) -> bool {
                self == rhs
            }
        }
    };
}
// Primitive integers use the default 0/1 identities and MIN/MAX bounds.
native_type_op!(i8);
native_type_op!(i16);
native_type_op!(i32);
native_type_op!(i64);
native_type_op!(i128);
native_type_op!(u8);
native_type_op!(u16);
native_type_op!(u32);
native_type_op!(u64);
// Non-primitive types supply their identity constants explicitly
// (no literal 0/1 exists for them).
native_type_op!(i256, i256::ZERO, i256::ONE, i256::MIN, i256::MAX);
native_type_op!(IntervalDayTime, IntervalDayTime::ZERO, IntervalDayTime::ONE);
native_type_op!(
    IntervalMonthDayNano,
    IntervalMonthDayNano::ZERO,
    IntervalMonthDayNano::ONE
);
// Implements `ArrowNativeTypeOp` for floating point types. The `_checked`
// variants never report overflow (per the trait docs, floats overflow to
// ±INF); only division/remainder by zero produce an error.
macro_rules! native_type_float_op {
    ($t:tt, $zero:expr, $one:expr, $min:expr, $max:expr) => {
        impl ArrowNativeTypeOp for $t {
            const ZERO: Self = $zero;
            const ONE: Self = $one;
            const MIN_TOTAL_ORDER: Self = $min;
            const MAX_TOTAL_ORDER: Self = $max;
            #[inline]
            fn add_checked(self, rhs: Self) -> Result<Self, ArrowError> {
                Ok(self + rhs)
            }
            #[inline]
            fn add_wrapping(self, rhs: Self) -> Self {
                self + rhs
            }
            #[inline]
            fn sub_checked(self, rhs: Self) -> Result<Self, ArrowError> {
                Ok(self - rhs)
            }
            #[inline]
            fn sub_wrapping(self, rhs: Self) -> Self {
                self - rhs
            }
            #[inline]
            fn mul_checked(self, rhs: Self) -> Result<Self, ArrowError> {
                Ok(self * rhs)
            }
            #[inline]
            fn mul_wrapping(self, rhs: Self) -> Self {
                self * rhs
            }
            #[inline]
            fn div_checked(self, rhs: Self) -> Result<Self, ArrowError> {
                if rhs.is_zero() {
                    Err(ArrowError::DivideByZero)
                } else {
                    Ok(self / rhs)
                }
            }
            #[inline]
            fn div_wrapping(self, rhs: Self) -> Self {
                self / rhs
            }
            #[inline]
            fn mod_checked(self, rhs: Self) -> Result<Self, ArrowError> {
                if rhs.is_zero() {
                    Err(ArrowError::DivideByZero)
                } else {
                    Ok(self % rhs)
                }
            }
            #[inline]
            fn mod_wrapping(self, rhs: Self) -> Self {
                self % rhs
            }
            #[inline]
            fn neg_checked(self) -> Result<Self, ArrowError> {
                Ok(-self)
            }
            #[inline]
            fn neg_wrapping(self) -> Self {
                -self
            }
            #[inline]
            fn pow_checked(self, exp: u32) -> Result<Self, ArrowError> {
                Ok(self.powi(exp as i32))
            }
            #[inline]
            fn pow_wrapping(self, exp: u32) -> Self {
                self.powi(exp as i32)
            }
            // Note: -0.0 == 0.0 here, so negative zero also counts as zero.
            #[inline]
            fn is_zero(self) -> bool {
                self == $zero
            }
            // Total order: -NaN < -INF < … < -0.0 < 0.0 < … < INF < NaN.
            #[inline]
            fn compare(self, rhs: Self) -> Ordering {
                <$t>::total_cmp(&self, &rhs)
            }
            #[inline]
            fn is_eq(self, rhs: Self) -> bool {
                // Equivalent to `self.total_cmp(&rhs).is_eq()`
                // but LLVM isn't able to realise this is bitwise equality
                // https://rust.godbolt.org/z/347nWGxoW
                self.to_bits() == rhs.to_bits()
            }
        }
    };
}
// the smallest/largest bit patterns for floating point numbers are NaN, but differ from the canonical NAN constants.
// See test_float_total_order_min_max for details.
native_type_float_op!(
    f16,
    f16::ZERO,
    f16::ONE,
    // All-ones bit pattern: a negative NaN, the smallest value in total order.
    f16::from_bits(-1 as _),
    // 0x7FFF: a positive NaN, the largest value in total order.
    f16::from_bits(i16::MAX as _)
);
// from_bits is not yet stable as const fn, see https://github.com/rust-lang/rust/issues/72447
native_type_float_op!(
    f32,
    0.,
    1.,
    // SAFETY: i32 and f32 have the same size and every bit pattern is a valid
    // f32. -1_i32 is all ones, i.e. a negative NaN (smallest in total order).
    unsafe { std::mem::transmute(-1_i32) },
    // SAFETY: as above; i32::MAX (0x7FFF_FFFF) is a positive NaN (largest in total order).
    unsafe { std::mem::transmute(i32::MAX) }
);
native_type_float_op!(
    f64,
    0.,
    1.,
    // SAFETY: i64 and f64 have the same size and every bit pattern is a valid f64.
    unsafe { std::mem::transmute(-1_i64) },
    // SAFETY: as above; i64::MAX is a positive NaN (largest in total order).
    unsafe { std::mem::transmute(i64::MAX) }
);
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_native_type_is_zero() {
    // Every native type's ZERO-equivalent literal reports `is_zero`.
    macro_rules! check_zero {
        ($($value:expr),+ $(,)?) => {
            $(assert!($value.is_zero());)+
        };
    }
    check_zero!(
        0_i8, 0_i16, 0_i32, 0_i64, 0_i128, i256::ZERO, 0_u8, 0_u16, 0_u32,
        0_u64, f16::ZERO, 0.0_f32, 0.0_f64,
    );
}
#[test]
fn test_native_type_comparison() {
    // For each type: eight equals itself, differs from one, is less than
    // ten, and is greater than one.
    macro_rules! check_cmp {
        ($one:expr, $eight:expr, $ten:expr) => {
            assert!($eight.is_eq($eight));
            assert!($eight.is_ne($one));
            assert!($eight.is_lt($ten));
            assert!($eight.is_gt($one));
        };
    }
    check_cmp!(1_i8, 8_i8, 10_i8);
    check_cmp!(1_i16, 8_i16, 10_i16);
    check_cmp!(1_i32, 8_i32, 10_i32);
    check_cmp!(1_i64, 8_i64, 10_i64);
    check_cmp!(1_i128, 8_i128, 10_i128);
    check_cmp!(
        i256::from_parts(1, 0),
        i256::from_parts(8, 0),
        i256::from_parts(10, 0)
    );
    check_cmp!(1_u8, 8_u8, 10_u8);
    check_cmp!(1_u16, 8_u16, 10_u16);
    check_cmp!(1_u32, 8_u32, 10_u32);
    check_cmp!(1_u64, 8_u64, 10_u64);
    check_cmp!(f16::from_f32(1.0), f16::from_f32(8.0), f16::from_f32(10.0));
    check_cmp!(1.0_f32, 8.0_f32, 10.0_f32);
    check_cmp!(1.0_f64, 8.0_f64, 10.0_f64);
}
#[test]
fn test_native_type_add() {
    // `add_wrapping` and `add_checked` agree on non-overflowing inputs.
    macro_rules! check_add {
        ($lhs:expr, $rhs:expr, $expected:expr) => {
            assert_eq!($lhs.add_wrapping($rhs), $expected);
            assert_eq!($lhs.add_checked($rhs).unwrap(), $expected);
        };
    }
    check_add!(8_i8, 2_i8, 10_i8);
    check_add!(8_i16, 2_i16, 10_i16);
    check_add!(8_i32, 2_i32, 10_i32);
    check_add!(8_i64, 2_i64, 10_i64);
    check_add!(8_i128, 2_i128, 10_i128);
    check_add!(
        i256::from_parts(8, 0),
        i256::from_parts(2, 0),
        i256::from_parts(10, 0)
    );
    check_add!(8_u8, 2_u8, 10_u8);
    check_add!(8_u16, 2_u16, 10_u16);
    check_add!(8_u32, 2_u32, 10_u32);
    check_add!(8_u64, 2_u64, 10_u64);
    check_add!(f16::from_f32(8.0), f16::from_f32(2.0), f16::from_f32(10.0));
    check_add!(8.0_f32, 2.0_f32, 10_f32);
    check_add!(8.0_f64, 2.0_f64, 10_f64);
}
#[test]
fn test_native_type_sub() {
    // `sub_wrapping` and `sub_checked` agree on non-overflowing inputs.
    macro_rules! check_sub {
        ($lhs:expr, $rhs:expr, $expected:expr) => {
            assert_eq!($lhs.sub_wrapping($rhs), $expected);
            assert_eq!($lhs.sub_checked($rhs).unwrap(), $expected);
        };
    }
    check_sub!(8_i8, 2_i8, 6_i8);
    check_sub!(8_i16, 2_i16, 6_i16);
    check_sub!(8_i32, 2_i32, 6_i32);
    check_sub!(8_i64, 2_i64, 6_i64);
    check_sub!(8_i128, 2_i128, 6_i128);
    check_sub!(
        i256::from_parts(8, 0),
        i256::from_parts(2, 0),
        i256::from_parts(6, 0)
    );
    check_sub!(8_u8, 2_u8, 6_u8);
    check_sub!(8_u16, 2_u16, 6_u16);
    check_sub!(8_u32, 2_u32, 6_u32);
    check_sub!(8_u64, 2_u64, 6_u64);
    check_sub!(f16::from_f32(8.0), f16::from_f32(2.0), f16::from_f32(6.0));
    check_sub!(8.0_f32, 2.0_f32, 6_f32);
    check_sub!(8.0_f64, 2.0_f64, 6_f64);
}
#[test]
fn test_native_type_mul() {
    // `mul_wrapping` and `mul_checked` agree on non-overflowing inputs.
    macro_rules! check_mul {
        ($lhs:expr, $rhs:expr, $expected:expr) => {
            assert_eq!($lhs.mul_wrapping($rhs), $expected);
            assert_eq!($lhs.mul_checked($rhs).unwrap(), $expected);
        };
    }
    check_mul!(8_i8, 2_i8, 16_i8);
    check_mul!(8_i16, 2_i16, 16_i16);
    check_mul!(8_i32, 2_i32, 16_i32);
    check_mul!(8_i64, 2_i64, 16_i64);
    check_mul!(8_i128, 2_i128, 16_i128);
    check_mul!(
        i256::from_parts(8, 0),
        i256::from_parts(2, 0),
        i256::from_parts(16, 0)
    );
    check_mul!(8_u8, 2_u8, 16_u8);
    check_mul!(8_u16, 2_u16, 16_u16);
    check_mul!(8_u32, 2_u32, 16_u32);
    check_mul!(8_u64, 2_u64, 16_u64);
    check_mul!(f16::from_f32(8.0), f16::from_f32(2.0), f16::from_f32(16.0));
    check_mul!(8.0_f32, 2.0_f32, 16_f32);
    check_mul!(8.0_f64, 2.0_f64, 16_f64);
}
#[test]
fn test_native_type_div() {
    // `div_wrapping` and `div_checked` agree for a non-zero divisor.
    macro_rules! check_div {
        ($lhs:expr, $rhs:expr, $expected:expr) => {
            assert_eq!($lhs.div_wrapping($rhs), $expected);
            assert_eq!($lhs.div_checked($rhs).unwrap(), $expected);
        };
    }
    check_div!(8_i8, 2_i8, 4_i8);
    check_div!(8_i16, 2_i16, 4_i16);
    check_div!(8_i32, 2_i32, 4_i32);
    check_div!(8_i64, 2_i64, 4_i64);
    check_div!(8_i128, 2_i128, 4_i128);
    check_div!(
        i256::from_parts(8, 0),
        i256::from_parts(2, 0),
        i256::from_parts(4, 0)
    );
    check_div!(8_u8, 2_u8, 4_u8);
    check_div!(8_u16, 2_u16, 4_u16);
    check_div!(8_u32, 2_u32, 4_u32);
    check_div!(8_u64, 2_u64, 4_u64);
    check_div!(f16::from_f32(8.0), f16::from_f32(2.0), f16::from_f32(4.0));
    check_div!(8.0_f32, 2.0_f32, 4_f32);
    check_div!(8.0_f64, 2.0_f64, 4_f64);
}
#[test]
fn test_native_type_mod() {
    // `mod_wrapping` and `mod_checked` agree for a non-zero divisor.
    macro_rules! check_mod {
        ($lhs:expr, $rhs:expr, $expected:expr) => {
            assert_eq!($lhs.mod_wrapping($rhs), $expected);
            assert_eq!($lhs.mod_checked($rhs).unwrap(), $expected);
        };
    }
    check_mod!(9_i8, 2_i8, 1_i8);
    check_mod!(9_i16, 2_i16, 1_i16);
    check_mod!(9_i32, 2_i32, 1_i32);
    check_mod!(9_i64, 2_i64, 1_i64);
    check_mod!(9_i128, 2_i128, 1_i128);
    check_mod!(
        i256::from_parts(9, 0),
        i256::from_parts(2, 0),
        i256::from_parts(1, 0)
    );
    check_mod!(9_u8, 2_u8, 1_u8);
    check_mod!(9_u16, 2_u16, 1_u16);
    check_mod!(9_u32, 2_u32, 1_u32);
    check_mod!(9_u64, 2_u64, 1_u64);
    check_mod!(f16::from_f32(9.0), f16::from_f32(2.0), f16::from_f32(1.0));
    check_mod!(9.0_f32, 2.0_f32, 1_f32);
    check_mod!(9.0_f64, 2.0_f64, 1_f64);
}
#[test]
fn test_native_type_neg() {
    // Signed and float types: wrapping and checked negation both succeed.
    macro_rules! check_neg {
        ($val:expr, $expected:expr) => {
            assert_eq!($val.neg_wrapping(), $expected);
            assert_eq!($val.neg_checked().unwrap(), $expected);
        };
    }
    check_neg!(8_i8, -8_i8);
    check_neg!(8_i16, -8_i16);
    check_neg!(8_i32, -8_i32);
    check_neg!(8_i64, -8_i64);
    check_neg!(8_i128, -8_i128);
    check_neg!(i256::from_parts(8, 0), i256::from_i128(-8));
    check_neg!(f16::from_f32(8.0), f16::from_f32(-8.0));
    check_neg!(8.0_f32, -8_f32);
    check_neg!(8.0_f64, -8_f64);
    // Unsigned types: wrapping negation wraps around; checked negation errors.
    macro_rules! check_neg_unsigned {
        ($val:expr, $wrapped:expr) => {
            assert_eq!($val.neg_wrapping(), $wrapped);
            assert!($val.neg_checked().is_err());
        };
    }
    check_neg_unsigned!(8_u8, u8::MAX - 7_u8);
    check_neg_unsigned!(8_u16, u16::MAX - 7_u16);
    check_neg_unsigned!(8_u32, u32::MAX - 7_u32);
    check_neg_unsigned!(8_u64, u64::MAX - 7_u64);
}
#[test]
fn test_native_type_pow() {
    // `pow_wrapping` and `pow_checked` agree on non-overflowing inputs.
    macro_rules! check_pow {
        ($base:expr, $exp:expr, $expected:expr) => {
            assert_eq!($base.pow_wrapping($exp), $expected);
            assert_eq!($base.pow_checked($exp).unwrap(), $expected);
        };
    }
    check_pow!(8_i8, 2_u32, 64_i8);
    check_pow!(8_i16, 2_u32, 64_i16);
    check_pow!(8_i32, 2_u32, 64_i32);
    check_pow!(8_i64, 2_u32, 64_i64);
    check_pow!(8_i128, 2_u32, 64_i128);
    check_pow!(i256::from_parts(8, 0), 2_u32, i256::from_parts(64, 0));
    check_pow!(8_u8, 2_u32, 64_u8);
    check_pow!(8_u16, 2_u32, 64_u16);
    check_pow!(8_u32, 2_u32, 64_u32);
    check_pow!(8_u64, 2_u32, 64_u64);
    check_pow!(f16::from_f32(8.0), 2_u32, f16::from_f32(64.0));
    check_pow!(8.0_f32, 2_u32, 64_f32);
    check_pow!(8.0_f64, 2_u32, 64_f64);
}
#[test]
fn test_float_total_order_min_max() {
assert!(<f64 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_lt(f64::NEG_INFINITY));
assert!(<f64 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_gt(f64::INFINITY));
assert!(<f64 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_nan());
assert!(<f64 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_sign_negative());
assert!(<f64 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_lt(-f64::NAN));
assert!(<f64 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_nan());
assert!(<f64 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_sign_positive());
assert!(<f64 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_gt(f64::NAN));
assert!(<f32 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_lt(f32::NEG_INFINITY));
assert!(<f32 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_gt(f32::INFINITY));
assert!(<f32 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_nan());
assert!(<f32 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_sign_negative());
assert!(<f32 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_lt(-f32::NAN));
assert!(<f32 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_nan());
assert!(<f32 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_sign_positive());
assert!(<f32 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_gt(f32::NAN));
assert!(<f16 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_lt(f16::NEG_INFINITY));
assert!(<f16 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_gt(f16::INFINITY));
assert!(<f16 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_nan());
assert!(<f16 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_sign_negative());
assert!(<f16 as ArrowNativeTypeOp>::MIN_TOTAL_ORDER.is_lt(-f16::NAN));
assert!(<f16 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_nan());
assert!(<f16 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_sign_positive());
assert!(<f16 as ArrowNativeTypeOp>::MAX_TOTAL_ORDER.is_gt(f16::NAN));
}
}

Просмотреть файл

@ -0,0 +1,643 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::types::{ByteArrayType, GenericBinaryType};
use crate::{Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait};
use arrow_data::ArrayData;
use arrow_schema::DataType;
/// A [`GenericBinaryArray`] for storing `[u8]`
///
/// The `OffsetSize` parameter selects the offset width: `i32` for
/// [`BinaryArray`] and `i64` for [`LargeBinaryArray`].
pub type GenericBinaryArray<OffsetSize> = GenericByteArray<GenericBinaryType<OffsetSize>>;
impl<OffsetSize: OffsetSizeTrait> GenericBinaryArray<OffsetSize> {
    /// Get the data type of the array.
    #[deprecated(note = "please use `Self::DATA_TYPE` instead")]
    pub const fn get_data_type() -> DataType {
        Self::DATA_TYPE
    }
    /// Creates a [GenericBinaryArray] from a vector of byte slices
    ///
    /// See also [`Self::from_iter_values`]
    pub fn from_vec(v: Vec<&[u8]>) -> Self {
        Self::from_iter_values(v)
    }
    /// Creates a [GenericBinaryArray] from a vector of Optional (null) byte slices
    pub fn from_opt_vec(v: Vec<Option<&[u8]>>) -> Self {
        v.into_iter().collect()
    }
    /// Converts a `List<u8>` array into a binary array by reusing its buffers
    /// (zero-copy). Panics unless the list is a flat, non-null `List<u8>`.
    fn from_list(v: GenericListArray<OffsetSize>) -> Self {
        let v = v.into_data();
        assert_eq!(
            v.child_data().len(),
            1,
            "BinaryArray can only be created from list array of u8 values \
             (i.e. List<PrimitiveArray<u8>>)."
        );
        let child_data = &v.child_data()[0];
        // child must itself be flat (no nested children)
        assert_eq!(
            child_data.child_data().len(),
            0,
            "BinaryArray can only be created from list array of u8 values \
             (i.e. List<PrimitiveArray<u8>>)."
        );
        assert_eq!(
            child_data.data_type(),
            &DataType::UInt8,
            "BinaryArray can only be created from List<u8> arrays, mismatched data types."
        );
        // binary layout has no per-byte validity, so null child bytes cannot
        // be represented
        assert_eq!(
            child_data.null_count(),
            0,
            "The child array cannot contain null values."
        );
        // Reuse the list's offsets buffer and the child's values buffer.
        // Since the child is u8, its element offset equals its byte offset,
        // so slicing the buffer by `child_data.offset()` realigns it.
        let builder = ArrayData::builder(Self::DATA_TYPE)
            .len(v.len())
            .offset(v.offset())
            .add_buffer(v.buffers()[0].clone())
            .add_buffer(child_data.buffers()[0].slice(child_data.offset()))
            .nulls(v.nulls().cloned());
        // SAFETY(as asserted above): a valid non-null List<u8> layout maps
        // directly onto the binary layout built here
        let data = unsafe { builder.build_unchecked() };
        Self::from(data)
    }
    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
    pub fn take_iter<'a>(
        &'a self,
        indexes: impl Iterator<Item = Option<usize>> + 'a,
    ) -> impl Iterator<Item = Option<&[u8]>> + 'a {
        indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
    }
    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
    /// # Safety
    ///
    /// caller must ensure that the indexes in the iterator are less than the `array.len()`
    pub unsafe fn take_iter_unchecked<'a>(
        &'a self,
        indexes: impl Iterator<Item = Option<usize>> + 'a,
    ) -> impl Iterator<Item = Option<&[u8]>> + 'a {
        indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
    }
}
/// Builds a [`GenericBinaryArray`] from optional byte slices; `None` entries
/// become nulls.
impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<&[u8]>>> for GenericBinaryArray<OffsetSize> {
    fn from(values: Vec<Option<&[u8]>>) -> Self {
        // Same path as `Self::from_opt_vec`: collect via FromIterator
        values.into_iter().collect()
    }
}
/// Builds a [`GenericBinaryArray`] of non-null values from byte slices.
impl<OffsetSize: OffsetSizeTrait> From<Vec<&[u8]>> for GenericBinaryArray<OffsetSize> {
    fn from(values: Vec<&[u8]>) -> Self {
        // `from_vec` delegates to `from_iter_values`, so this is equivalent
        Self::from_vec(values)
    }
}
/// Converts a `List<u8>` array into the equivalent binary array
/// (zero-copy; panics if the list is not a flat, non-null `List<u8>`)
impl<T: OffsetSizeTrait> From<GenericListArray<T>> for GenericBinaryArray<T> {
    fn from(v: GenericListArray<T>) -> Self {
        Self::from_list(v)
    }
}
/// Reinterprets a [`GenericStringArray`] as binary without copying the data
impl<OffsetSize: OffsetSizeTrait> From<GenericStringArray<OffsetSize>>
    for GenericBinaryArray<OffsetSize>
{
    fn from(value: GenericStringArray<OffsetSize>) -> Self {
        // Retag the underlying ArrayData with the binary data type
        let data = value.into_data();
        let builder = data
            .into_builder()
            .data_type(GenericBinaryType::<OffsetSize>::DATA_TYPE);
        // SAFETY: a StringArray holds valid UTF-8, which is always valid
        // binary data, so the retagged ArrayData is well-formed
        unsafe { Self::from(builder.build_unchecked()) }
    }
}
/// A [`GenericBinaryArray`] of `[u8]` using `i32` offsets
///
/// The byte length of each element is represented by an i32.
///
/// # Examples
///
/// Create a BinaryArray from a vector of byte slices.
///
/// ```
/// use arrow_array::{Array, BinaryArray};
/// let values: Vec<&[u8]> =
///     vec![b"one", b"two", b"", b"three"];
/// let array = BinaryArray::from_vec(values);
/// assert_eq!(4, array.len());
/// assert_eq!(b"one", array.value(0));
/// assert_eq!(b"two", array.value(1));
/// assert_eq!(b"", array.value(2));
/// assert_eq!(b"three", array.value(3));
/// ```
///
/// Create a BinaryArray from a vector of Optional (null) byte slices.
///
/// ```
/// use arrow_array::{Array, BinaryArray};
/// let values: Vec<Option<&[u8]>> =
///     vec![Some(b"one"), Some(b"two"), None, Some(b""), Some(b"three")];
/// let array = BinaryArray::from_opt_vec(values);
/// assert_eq!(5, array.len());
/// assert_eq!(b"one", array.value(0));
/// assert_eq!(b"two", array.value(1));
/// assert_eq!(b"", array.value(3));
/// assert_eq!(b"three", array.value(4));
/// assert!(!array.is_null(0));
/// assert!(!array.is_null(1));
/// assert!(array.is_null(2));
/// assert!(!array.is_null(3));
/// assert!(!array.is_null(4));
/// ```
///
/// See [`GenericByteArray`] for more information and examples
pub type BinaryArray = GenericBinaryArray<i32>;
/// A [`GenericBinaryArray`] of `[u8]` using `i64` offsets
///
/// The byte length of each element is represented by an i64.
///
/// # Examples
///
/// Create a LargeBinaryArray from a vector of byte slices.
///
/// ```
/// use arrow_array::{Array, LargeBinaryArray};
/// let values: Vec<&[u8]> =
///     vec![b"one", b"two", b"", b"three"];
/// let array = LargeBinaryArray::from_vec(values);
/// assert_eq!(4, array.len());
/// assert_eq!(b"one", array.value(0));
/// assert_eq!(b"two", array.value(1));
/// assert_eq!(b"", array.value(2));
/// assert_eq!(b"three", array.value(3));
/// ```
///
/// Create a LargeBinaryArray from a vector of Optional (null) byte slices.
///
/// ```
/// use arrow_array::{Array, LargeBinaryArray};
/// let values: Vec<Option<&[u8]>> =
///     vec![Some(b"one"), Some(b"two"), None, Some(b""), Some(b"three")];
/// let array = LargeBinaryArray::from_opt_vec(values);
/// assert_eq!(5, array.len());
/// assert_eq!(b"one", array.value(0));
/// assert_eq!(b"two", array.value(1));
/// assert_eq!(b"", array.value(3));
/// assert_eq!(b"three", array.value(4));
/// assert!(!array.is_null(0));
/// assert!(!array.is_null(1));
/// assert!(array.is_null(2));
/// assert!(!array.is_null(3));
/// assert!(!array.is_null(4));
/// ```
///
/// See [`GenericByteArray`] for more information and examples
pub type LargeBinaryArray = GenericBinaryArray<i64>;
#[cfg(test)]
mod tests {
use super::*;
use crate::{ListArray, StringArray};
use arrow_buffer::Buffer;
use arrow_schema::Field;
use std::sync::Arc;
#[test]
fn test_binary_array() {
let values: [u8; 12] = [
b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
];
let offsets: [i32; 4] = [0, 5, 5, 12];
// Array data: ["hello", "", "parquet"]
let array_data = ArrayData::builder(DataType::Binary)
.len(3)
.add_buffer(Buffer::from_slice_ref(offsets))
.add_buffer(Buffer::from_slice_ref(values))
.build()
.unwrap();
let binary_array = BinaryArray::from(array_data);
assert_eq!(3, binary_array.len());
assert_eq!(0, binary_array.null_count());
assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0));
assert_eq!([b'h', b'e', b'l', b'l', b'o'], unsafe {
binary_array.value_unchecked(0)
});
assert_eq!([] as [u8; 0], binary_array.value(1));
assert_eq!([] as [u8; 0], unsafe { binary_array.value_unchecked(1) });
assert_eq!(
[b'p', b'a', b'r', b'q', b'u', b'e', b't'],
binary_array.value(2)
);
assert_eq!([b'p', b'a', b'r', b'q', b'u', b'e', b't'], unsafe {
binary_array.value_unchecked(2)
});
assert_eq!(5, binary_array.value_offsets()[2]);
assert_eq!(7, binary_array.value_length(2));
for i in 0..3 {
assert!(binary_array.is_valid(i));
assert!(!binary_array.is_null(i));
}
}
#[test]
fn test_binary_array_with_offsets() {
let values: [u8; 12] = [
b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
];
let offsets: [i32; 4] = [0, 5, 5, 12];
// Test binary array with offset
let array_data = ArrayData::builder(DataType::Binary)
.len(2)
.offset(1)
.add_buffer(Buffer::from_slice_ref(offsets))
.add_buffer(Buffer::from_slice_ref(values))
.build()
.unwrap();
let binary_array = BinaryArray::from(array_data);
assert_eq!(
[b'p', b'a', b'r', b'q', b'u', b'e', b't'],
binary_array.value(1)
);
assert_eq!(5, binary_array.value_offsets()[0]);
assert_eq!(0, binary_array.value_length(0));
assert_eq!(5, binary_array.value_offsets()[1]);
assert_eq!(7, binary_array.value_length(1));
}
#[test]
fn test_large_binary_array() {
let values: [u8; 12] = [
b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
];
let offsets: [i64; 4] = [0, 5, 5, 12];
// Array data: ["hello", "", "parquet"]
let array_data = ArrayData::builder(DataType::LargeBinary)
.len(3)
.add_buffer(Buffer::from_slice_ref(offsets))
.add_buffer(Buffer::from_slice_ref(values))
.build()
.unwrap();
let binary_array = LargeBinaryArray::from(array_data);
assert_eq!(3, binary_array.len());
assert_eq!(0, binary_array.null_count());
assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0));
assert_eq!([b'h', b'e', b'l', b'l', b'o'], unsafe {
binary_array.value_unchecked(0)
});
assert_eq!([] as [u8; 0], binary_array.value(1));
assert_eq!([] as [u8; 0], unsafe { binary_array.value_unchecked(1) });
assert_eq!(
[b'p', b'a', b'r', b'q', b'u', b'e', b't'],
binary_array.value(2)
);
assert_eq!([b'p', b'a', b'r', b'q', b'u', b'e', b't'], unsafe {
binary_array.value_unchecked(2)
});
assert_eq!(5, binary_array.value_offsets()[2]);
assert_eq!(7, binary_array.value_length(2));
for i in 0..3 {
assert!(binary_array.is_valid(i));
assert!(!binary_array.is_null(i));
}
}
#[test]
fn test_large_binary_array_with_offsets() {
let values: [u8; 12] = [
b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
];
let offsets: [i64; 4] = [0, 5, 5, 12];
// Test binary array with offset
let array_data = ArrayData::builder(DataType::LargeBinary)
.len(2)
.offset(1)
.add_buffer(Buffer::from_slice_ref(offsets))
.add_buffer(Buffer::from_slice_ref(values))
.build()
.unwrap();
let binary_array = LargeBinaryArray::from(array_data);
assert_eq!(
[b'p', b'a', b'r', b'q', b'u', b'e', b't'],
binary_array.value(1)
);
assert_eq!([b'p', b'a', b'r', b'q', b'u', b'e', b't'], unsafe {
binary_array.value_unchecked(1)
});
assert_eq!(5, binary_array.value_offsets()[0]);
assert_eq!(0, binary_array.value_length(0));
assert_eq!(5, binary_array.value_offsets()[1]);
assert_eq!(7, binary_array.value_length(1));
}
    // Checks that converting a List<u8> to a binary array is lossless:
    // the converted array matches one built directly from offsets + values
    fn _test_generic_binary_array_from_list_array<O: OffsetSizeTrait>() {
        let values = b"helloparquet";
        let child_data = ArrayData::builder(DataType::UInt8)
            .len(12)
            .add_buffer(Buffer::from(&values[..]))
            .build()
            .unwrap();
        let offsets = [0, 5, 5, 12].map(|n| O::from_usize(n).unwrap());
        // Array data: ["hello", "", "parquet"]
        let array_data1 = ArrayData::builder(GenericBinaryArray::<O>::DATA_TYPE)
            .len(3)
            .add_buffer(Buffer::from_slice_ref(offsets))
            .add_buffer(Buffer::from_slice_ref(values))
            .build()
            .unwrap();
        let binary_array1 = GenericBinaryArray::<O>::from(array_data1);
        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
            "item",
            DataType::UInt8,
            false,
        )));
        let array_data2 = ArrayData::builder(data_type)
            .len(3)
            .add_buffer(Buffer::from_slice_ref(offsets))
            .add_child_data(child_data)
            .build()
            .unwrap();
        let list_array = GenericListArray::<O>::from(array_data2);
        let binary_array2 = GenericBinaryArray::<O>::from(list_array);
        assert_eq!(binary_array1.len(), binary_array2.len());
        assert_eq!(binary_array1.null_count(), binary_array2.null_count());
        assert_eq!(binary_array1.value_offsets(), binary_array2.value_offsets());
        for i in 0..binary_array1.len() {
            assert_eq!(binary_array1.value(i), binary_array2.value(i));
            assert_eq!(binary_array1.value(i), unsafe {
                binary_array2.value_unchecked(i)
            });
            assert_eq!(binary_array1.value_length(i), binary_array2.value_length(i));
        }
    }
    #[test]
    fn test_binary_array_from_list_array() {
        _test_generic_binary_array_from_list_array::<i32>();
    }
    #[test]
    fn test_large_binary_array_from_list_array() {
        _test_generic_binary_array_from_list_array::<i64>();
    }
    // Checks that the List<u8> conversion honours both a child-data offset
    // and a list-level offset plus null bitmap
    fn _test_generic_binary_array_from_list_array_with_offset<O: OffsetSizeTrait>() {
        let values = b"HelloArrowAndParquet";
        // b"ArrowAndParquet"
        let child_data = ArrayData::builder(DataType::UInt8)
            .len(15)
            .offset(5)
            .add_buffer(Buffer::from(&values[..]))
            .build()
            .unwrap();
        let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
        let null_buffer = Buffer::from_slice_ref([0b101]);
        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
            "item",
            DataType::UInt8,
            false,
        )));
        // [None, Some(b"Parquet")]
        let array_data = ArrayData::builder(data_type)
            .len(2)
            .offset(1)
            .add_buffer(Buffer::from_slice_ref(offsets))
            .null_bit_buffer(Some(null_buffer))
            .add_child_data(child_data)
            .build()
            .unwrap();
        let list_array = GenericListArray::<O>::from(array_data);
        let binary_array = GenericBinaryArray::<O>::from(list_array);
        assert_eq!(2, binary_array.len());
        assert_eq!(1, binary_array.null_count());
        assert!(binary_array.is_null(0));
        assert!(binary_array.is_valid(1));
        assert_eq!(b"Parquet", binary_array.value(1));
    }
    #[test]
    fn test_binary_array_from_list_array_with_offset() {
        _test_generic_binary_array_from_list_array_with_offset::<i32>();
    }
    #[test]
    fn test_large_binary_array_from_list_array_with_offset() {
        _test_generic_binary_array_from_list_array_with_offset::<i64>();
    }
    // Checks that converting a List<u8> whose child contains nulls panics,
    // since the binary layout cannot represent per-byte validity
    fn _test_generic_binary_array_from_list_array_with_child_nulls_failed<O: OffsetSizeTrait>() {
        let values = b"HelloArrow";
        let child_data = ArrayData::builder(DataType::UInt8)
            .len(10)
            .add_buffer(Buffer::from(&values[..]))
            .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010])))
            .build()
            .unwrap();
        let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
            "item",
            DataType::UInt8,
            true,
        )));
        // [None, Some(b"Parquet")]
        let array_data = ArrayData::builder(data_type)
            .len(2)
            .add_buffer(Buffer::from_slice_ref(offsets))
            .add_child_data(child_data)
            .build()
            .unwrap();
        let list_array = GenericListArray::<O>::from(array_data);
        drop(GenericBinaryArray::<O>::from(list_array));
    }
    #[test]
    #[should_panic(expected = "The child array cannot contain null values.")]
    fn test_binary_array_from_list_array_with_child_nulls_failed() {
        _test_generic_binary_array_from_list_array_with_child_nulls_failed::<i32>();
    }
    #[test]
    #[should_panic(expected = "The child array cannot contain null values.")]
    fn test_large_binary_array_from_list_array_with_child_nulls_failed() {
        _test_generic_binary_array_from_list_array_with_child_nulls_failed::<i64>();
    }
    // Verifies from_opt_vec round-trips values and null positions
    fn test_generic_binary_array_from_opt_vec<T: OffsetSizeTrait>() {
        let values: Vec<Option<&[u8]>> =
            vec![Some(b"one"), Some(b"two"), None, Some(b""), Some(b"three")];
        let array = GenericBinaryArray::<T>::from_opt_vec(values);
        assert_eq!(array.len(), 5);
        assert_eq!(array.value(0), b"one");
        assert_eq!(array.value(1), b"two");
        assert_eq!(array.value(3), b"");
        assert_eq!(array.value(4), b"three");
        assert!(!array.is_null(0));
        assert!(!array.is_null(1));
        assert!(array.is_null(2));
        assert!(!array.is_null(3));
        assert!(!array.is_null(4));
    }
    #[test]
    fn test_large_binary_array_from_opt_vec() {
        test_generic_binary_array_from_opt_vec::<i64>()
    }
    #[test]
    fn test_binary_array_from_opt_vec() {
        test_generic_binary_array_from_opt_vec::<i32>()
    }
    #[test]
    fn test_binary_array_from_unbound_iter() {
        // collect() must size the array by the items actually yielded, not
        // by the iterator's (larger) upper size-hint
        // iterator that doesn't declare (upper) size bound
        let value_iter = (0..)
            .scan(0usize, |pos, i| {
                if *pos < 10 {
                    *pos += 1;
                    Some(Some(format!("value {i}")))
                } else {
                    // actually returns up to 10 values
                    None
                }
            })
            // limited using take()
            .take(100);
        let (_, upper_size_bound) = value_iter.size_hint();
        // the upper bound, defined by take above, is 100
        assert_eq!(upper_size_bound, Some(100));
        let binary_array: BinaryArray = value_iter.collect();
        // but the actual number of items in the array should be 10
        assert_eq!(binary_array.len(), 10);
    }
    #[test]
    #[should_panic(
        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
    )]
    fn test_binary_array_from_incorrect_list_array() {
        // A List<u32> child must be rejected by the from_list type assert
        let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
        let values_data = ArrayData::builder(DataType::UInt32)
            .len(12)
            .add_buffer(Buffer::from_slice_ref(values))
            .build()
            .unwrap();
        let offsets: [i32; 4] = [0, 5, 5, 12];
        let data_type = DataType::List(Arc::new(Field::new("item", DataType::UInt32, false)));
        let array_data = ArrayData::builder(data_type)
            .len(3)
            .add_buffer(Buffer::from_slice_ref(offsets))
            .add_child_data(values_data)
            .build()
            .unwrap();
        let list_array = ListArray::from(array_data);
        drop(BinaryArray::from(list_array));
    }
#[test]
#[should_panic(
expected = "Trying to access an element at index 4 from a BinaryArray of length 3"
)]
fn test_binary_array_get_value_index_out_of_bound() {
let values: [u8; 12] = [104, 101, 108, 108, 111, 112, 97, 114, 113, 117, 101, 116];
let offsets: [i32; 4] = [0, 5, 5, 12];
let array_data = ArrayData::builder(DataType::Binary)
.len(3)
.add_buffer(Buffer::from_slice_ref(offsets))
.add_buffer(Buffer::from_slice_ref(values))
.build()
.unwrap();
let binary_array = BinaryArray::from(array_data);
binary_array.value(4);
}
#[test]
#[should_panic(expected = "LargeBinaryArray expects DataType::LargeBinary")]
fn test_binary_array_validation() {
let array = BinaryArray::from_iter_values([&[1, 2]]);
let _ = LargeBinaryArray::from(array.into_data());
}
#[test]
fn test_binary_array_all_null() {
let data = vec![None];
let array = BinaryArray::from(data);
array
.into_data()
.validate_full()
.expect("All null array has valid array data");
}
#[test]
fn test_large_binary_array_all_null() {
let data = vec![None];
let array = LargeBinaryArray::from(data);
array
.into_data()
.validate_full()
.expect("All null array has valid array data");
}
#[test]
fn test_empty_offsets() {
let string = BinaryArray::from(
ArrayData::builder(DataType::Binary)
.buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
.build()
.unwrap(),
);
assert_eq!(string.value_offsets(), &[0]);
let string = LargeBinaryArray::from(
ArrayData::builder(DataType::LargeBinary)
.buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
.build()
.unwrap(),
);
assert_eq!(string.len(), 0);
assert_eq!(string.value_offsets(), &[0]);
}
#[test]
fn test_to_from_string() {
let s = StringArray::from_iter_values(["a", "b", "c", "d"]);
let b = BinaryArray::from(s.clone());
let sa = StringArray::from(b); // Performs UTF-8 validation again
assert_eq!(s, sa);
}
}

Просмотреть файл

@ -0,0 +1,643 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::array::print_long_array;
use crate::builder::BooleanBuilder;
use crate::iterator::BooleanIter;
use crate::{Array, ArrayAccessor, ArrayRef, Scalar};
use arrow_buffer::{bit_util, BooleanBuffer, MutableBuffer, NullBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
use std::any::Any;
use std::sync::Arc;
/// An array of [boolean values](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout)
///
/// # Example: From a Vec
///
/// ```
/// # use arrow_array::{Array, BooleanArray};
/// let arr: BooleanArray = vec![true, true, false].into();
/// ```
///
/// # Example: From an optional Vec
///
/// ```
/// # use arrow_array::{Array, BooleanArray};
/// let arr: BooleanArray = vec![Some(true), None, Some(false)].into();
/// ```
///
/// # Example: From an iterator
///
/// ```
/// # use arrow_array::{Array, BooleanArray};
/// let arr: BooleanArray = (0..5).map(|x| (x % 2 == 0).then(|| x % 3 == 0)).collect();
/// let values: Vec<_> = arr.iter().collect();
/// assert_eq!(&values, &[Some(true), None, Some(false), None, Some(false)])
/// ```
///
/// # Example: Using Builder
///
/// ```
/// # use arrow_array::Array;
/// # use arrow_array::builder::BooleanBuilder;
/// let mut builder = BooleanBuilder::new();
/// builder.append_value(true);
/// builder.append_null();
/// builder.append_value(false);
/// let array = builder.finish();
/// let values: Vec<_> = array.iter().collect();
/// assert_eq!(&values, &[Some(true), None, Some(false)])
/// ```
///
#[derive(Clone)]
pub struct BooleanArray {
    /// Bit-packed values; its length defines the array's length
    values: BooleanBuffer,
    /// Optional validity bitmap; `None` means every value is valid
    nulls: Option<NullBuffer>,
}
impl std::fmt::Debug for BooleanArray {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "BooleanArray\n[\n")?;
        // per-element formatting is delegated to the shared array Debug helper
        print_long_array(self, f, |array, index, f| {
            std::fmt::Debug::fmt(&array.value(index), f)
        })?;
        write!(f, "]")
    }
}
impl BooleanArray {
    /// Create a new [`BooleanArray`] from the provided values and nulls
    ///
    /// # Panics
    ///
    /// Panics if `values.len() != nulls.len()`
    pub fn new(values: BooleanBuffer, nulls: Option<NullBuffer>) -> Self {
        if let Some(n) = nulls.as_ref() {
            assert_eq!(values.len(), n.len());
        }
        Self { values, nulls }
    }
    /// Create a new [`BooleanArray`] with length `len` consisting only of nulls
    pub fn new_null(len: usize) -> Self {
        Self {
            values: BooleanBuffer::new_unset(len),
            nulls: Some(NullBuffer::new_null(len)),
        }
    }
    /// Create a new [`Scalar`] from `value`
    pub fn new_scalar(value: bool) -> Scalar<Self> {
        // a scalar is a non-null, single-element array
        let values = match value {
            true => BooleanBuffer::new_set(1),
            false => BooleanBuffer::new_unset(1),
        };
        Scalar::new(Self::new(values, None))
    }
    /// Returns the length of this array.
    pub fn len(&self) -> usize {
        self.values.len()
    }
    /// Returns whether this array is empty.
    pub fn is_empty(&self) -> bool {
        self.values.is_empty()
    }
    /// Returns a zero-copy slice of this array with the indicated offset and length.
    pub fn slice(&self, offset: usize, length: usize) -> Self {
        Self {
            values: self.values.slice(offset, length),
            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
        }
    }
    /// Returns a new boolean array builder
    pub fn builder(capacity: usize) -> BooleanBuilder {
        BooleanBuilder::with_capacity(capacity)
    }
    /// Returns the underlying [`BooleanBuffer`] holding all the values of this array
    pub fn values(&self) -> &BooleanBuffer {
        &self.values
    }
    /// Returns the number of non null, true values within this array
    pub fn true_count(&self) -> usize {
        match self.nulls() {
            Some(nulls) => {
                // AND each chunk of the validity bitmap with the values so
                // that only valid true bits are counted
                let null_chunks = nulls.inner().bit_chunks().iter_padded();
                let value_chunks = self.values().bit_chunks().iter_padded();
                null_chunks
                    .zip(value_chunks)
                    .map(|(a, b)| (a & b).count_ones() as usize)
                    .sum()
            }
            // no nulls: every set bit is a valid true
            None => self.values().count_set_bits(),
        }
    }
    /// Returns the number of non null, false values within this array
    pub fn false_count(&self) -> usize {
        // everything that is neither null nor true must be false
        self.len() - self.null_count() - self.true_count()
    }
    /// Returns the boolean value at index `i`.
    ///
    /// # Safety
    /// This doesn't check bounds, the caller must ensure that index < self.len()
    pub unsafe fn value_unchecked(&self, i: usize) -> bool {
        self.values.value_unchecked(i)
    }
    /// Returns the boolean value at index `i`.
    /// # Panics
    /// Panics if index `i` is out of bounds
    pub fn value(&self, i: usize) -> bool {
        assert!(
            i < self.len(),
            "Trying to access an element at index {} from a BooleanArray of length {}",
            i,
            self.len()
        );
        // Safety:
        // `i < self.len()
        unsafe { self.value_unchecked(i) }
    }
    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
    pub fn take_iter<'a>(
        &'a self,
        indexes: impl Iterator<Item = Option<usize>> + 'a,
    ) -> impl Iterator<Item = Option<bool>> + 'a {
        indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
    }
    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
    /// # Safety
    ///
    /// caller must ensure that the offsets in the iterator are less than the array len()
    pub unsafe fn take_iter_unchecked<'a>(
        &'a self,
        indexes: impl Iterator<Item = Option<usize>> + 'a,
    ) -> impl Iterator<Item = Option<bool>> + 'a {
        indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
    }
    /// Create a [`BooleanArray`] by evaluating the operation for
    /// each element of the provided array
    ///
    /// ```
    /// # use arrow_array::{BooleanArray, Int32Array};
    ///
    /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
    /// let r = BooleanArray::from_unary(&array, |x| x > 2);
    /// assert_eq!(&r, &BooleanArray::from(vec![false, false, true, true, true]));
    /// ```
    pub fn from_unary<T: ArrayAccessor, F>(left: T, mut op: F) -> Self
    where
        F: FnMut(T::Item) -> bool,
    {
        // the result inherits the input's logical nulls; `op` is still
        // evaluated at null slots but those results are masked out
        let nulls = left.logical_nulls();
        let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe {
            // SAFETY: i in range 0..len
            op(left.value_unchecked(i))
        });
        Self::new(values, nulls)
    }
    /// Create a [`BooleanArray`] by evaluating the binary operation for
    /// each element of the provided arrays
    ///
    /// ```
    /// # use arrow_array::{BooleanArray, Int32Array};
    ///
    /// let a = Int32Array::from(vec![1, 2, 3, 4, 5]);
    /// let b = Int32Array::from(vec![1, 2, 0, 2, 5]);
    /// let r = BooleanArray::from_binary(&a, &b, |a, b| a == b);
    /// assert_eq!(&r, &BooleanArray::from(vec![true, true, false, false, true]));
    /// ```
    ///
    /// # Panics
    ///
    /// This function panics if left and right are not the same length
    ///
    pub fn from_binary<T: ArrayAccessor, S: ArrayAccessor, F>(left: T, right: S, mut op: F) -> Self
    where
        F: FnMut(T::Item, S::Item) -> bool,
    {
        assert_eq!(left.len(), right.len());
        // a slot is null in the result if it is null in either input
        let nulls = NullBuffer::union(
            left.logical_nulls().as_ref(),
            right.logical_nulls().as_ref(),
        );
        let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe {
            // SAFETY: i in range 0..len
            op(left.value_unchecked(i), right.value_unchecked(i))
        });
        Self::new(values, nulls)
    }
    /// Deconstruct this array into its constituent parts
    pub fn into_parts(self) -> (BooleanBuffer, Option<NullBuffer>) {
        (self.values, self.nulls)
    }
}
impl Array for BooleanArray {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn to_data(&self) -> ArrayData {
        self.clone().into()
    }
    fn into_data(self) -> ArrayData {
        self.into()
    }
    fn data_type(&self) -> &DataType {
        &DataType::Boolean
    }
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
        Arc::new(self.slice(offset, length))
    }
    fn len(&self) -> usize {
        self.values.len()
    }
    fn is_empty(&self) -> bool {
        self.values.is_empty()
    }
    fn offset(&self) -> usize {
        self.values.offset()
    }
    fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }
    fn get_buffer_memory_size(&self) -> usize {
        // sums buffer *capacities* (allocated bytes), not logical lengths
        let mut sum = self.values.inner().capacity();
        if let Some(x) = &self.nulls {
            sum += x.buffer().capacity()
        }
        sum
    }
    fn get_array_memory_size(&self) -> usize {
        // buffers plus the struct itself
        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
    }
}
impl<'a> ArrayAccessor for &'a BooleanArray {
    type Item = bool;
    // both methods forward to the inherent accessors on BooleanArray
    fn value(&self, index: usize) -> Self::Item {
        BooleanArray::value(self, index)
    }
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
        BooleanArray::value_unchecked(self, index)
    }
}
impl From<Vec<bool>> for BooleanArray {
    fn from(data: Vec<bool>) -> Self {
        // start from a zeroed bit buffer and set only the true bits
        let mut mut_buf = MutableBuffer::new_null(data.len());
        {
            let mut_slice = mut_buf.as_slice_mut();
            for (i, b) in data.iter().enumerate() {
                if *b {
                    bit_util::set_bit(mut_slice, i);
                }
            }
        }
        // no null buffer is attached: a Vec<bool> has no null entries
        let array_data = ArrayData::builder(DataType::Boolean)
            .len(data.len())
            .add_buffer(mut_buf.into());
        // SAFETY: the values buffer was sized and populated for exactly
        // `data.len()` bits above
        let array_data = unsafe { array_data.build_unchecked() };
        BooleanArray::from(array_data)
    }
}
/// Builds a [`BooleanArray`] from optional values; `None` entries become nulls
impl From<Vec<Option<bool>>> for BooleanArray {
    fn from(data: Vec<Option<bool>>) -> Self {
        // Consume the vec directly; the FromIterator impl handles nulls
        data.into_iter().collect()
    }
}
impl From<ArrayData> for BooleanArray {
    fn from(data: ArrayData) -> Self {
        // only Boolean-typed data with exactly one (values) buffer is valid
        assert_eq!(
            data.data_type(),
            &DataType::Boolean,
            "BooleanArray expected ArrayData with type {} got {}",
            DataType::Boolean,
            data.data_type()
        );
        assert_eq!(
            data.buffers().len(),
            1,
            "BooleanArray data should contain a single buffer only (values buffer)"
        );
        // zero-copy: wrap the existing buffer, carrying over offset and length
        let values = BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len());
        Self {
            values,
            nulls: data.nulls().cloned(),
        }
    }
}
impl From<BooleanArray> for ArrayData {
    /// Converts a [`BooleanArray`] back into generic [`ArrayData`].
    fn from(array: BooleanArray) -> Self {
        // Capture length/offset before the values buffer is moved out.
        let len = array.values.len();
        let offset = array.values.offset();
        // SAFETY: the parts came from a valid BooleanArray
        unsafe {
            ArrayDataBuilder::new(DataType::Boolean)
                .len(len)
                .offset(offset)
                .nulls(array.nulls)
                .buffers(vec![array.values.into_inner()])
                .build_unchecked()
        }
    }
}
impl<'a> IntoIterator for &'a BooleanArray {
    type Item = Option<bool>;
    type IntoIter = BooleanIter<'a>;
    /// Returns an iterator of `Option<bool>`, yielding `None` for null slots.
    fn into_iter(self) -> Self::IntoIter {
        BooleanIter::<'a>::new(self)
    }
}
impl<'a> BooleanArray {
    /// Returns an iterator over the elements of this array as `Option<bool>`,
    /// yielding `None` for null slots.
    pub fn iter(&'a self) -> BooleanIter<'a> {
        BooleanIter::new(self)
    }
}
impl<Ptr: std::borrow::Borrow<Option<bool>>> FromIterator<Ptr> for BooleanArray {
    /// Collects an iterator of optional booleans into a nullable [`BooleanArray`].
    ///
    /// # Panics
    /// Panics if the iterator does not report an upper bound in `size_hint`,
    /// as the bitmaps are sized up-front from that bound.
    fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
        let iter = iter.into_iter();
        let (_, data_len) = iter.size_hint();
        let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
        // One bit per element, rounded up to whole bytes.
        let num_bytes = bit_util::ceil(data_len, 8);
        let mut null_builder = MutableBuffer::from_len_zeroed(num_bytes);
        let mut val_builder = MutableBuffer::from_len_zeroed(num_bytes);
        let data = val_builder.as_slice_mut();
        let null_slice = null_builder.as_slice_mut();
        // Both bitmaps start all-zero: only valid slots and true values need setting.
        iter.enumerate().for_each(|(i, item)| {
            if let Some(a) = item.borrow() {
                bit_util::set_bit(null_slice, i);
                if *a {
                    bit_util::set_bit(data, i);
                }
            }
        });
        // SAFETY: both buffers contain `data_len` bits, built consistently above.
        // NOTE(review): assumes the iterator yields no more than `data_len`
        // items, i.e. its upper bound is trusted — bits past it are ignored.
        let data = unsafe {
            ArrayData::new_unchecked(
                DataType::Boolean,
                data_len,
                None,
                Some(null_builder.into()),
                0,
                vec![val_builder.into()],
                vec![],
            )
        };
        BooleanArray::from(data)
    }
}
impl From<BooleanBuffer> for BooleanArray {
    /// Wraps a [`BooleanBuffer`] as a [`BooleanArray`] with no nulls.
    fn from(values: BooleanBuffer) -> Self {
        Self { nulls: None, values }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_buffer::Buffer;
    use rand::{thread_rng, Rng};
    #[test]
    fn test_boolean_fmt_debug() {
        let arr = BooleanArray::from(vec![true, false, false]);
        assert_eq!(
            "BooleanArray\n[\n  true,\n  false,\n  false,\n]",
            format!("{arr:?}")
        );
    }
    #[test]
    fn test_boolean_with_null_fmt_debug() {
        let mut builder = BooleanArray::builder(3);
        builder.append_value(true);
        builder.append_null();
        builder.append_value(false);
        let arr = builder.finish();
        assert_eq!(
            "BooleanArray\n[\n  true,\n  null,\n  false,\n]",
            format!("{arr:?}")
        );
    }
    #[test]
    fn test_boolean_array_from_vec() {
        let buf = Buffer::from([10_u8]);
        let arr = BooleanArray::from(vec![false, true, false, true]);
        assert_eq!(&buf, arr.values().inner());
        assert_eq!(4, arr.len());
        assert_eq!(0, arr.offset());
        assert_eq!(0, arr.null_count());
        for i in 0..4 {
            assert!(!arr.is_null(i));
            assert!(arr.is_valid(i));
            assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {i}")
        }
    }
    #[test]
    fn test_boolean_array_from_vec_option() {
        let buf = Buffer::from([10_u8]);
        let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]);
        assert_eq!(&buf, arr.values().inner());
        assert_eq!(4, arr.len());
        assert_eq!(0, arr.offset());
        assert_eq!(1, arr.null_count());
        for i in 0..4 {
            if i == 2 {
                assert!(arr.is_null(i));
                assert!(!arr.is_valid(i));
            } else {
                assert!(!arr.is_null(i));
                assert!(arr.is_valid(i));
                assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {i}")
            }
        }
    }
    #[test]
    fn test_boolean_array_from_iter() {
        let v = vec![Some(false), Some(true), Some(false), Some(true)];
        let arr = v.into_iter().collect::<BooleanArray>();
        assert_eq!(4, arr.len());
        assert_eq!(0, arr.offset());
        assert_eq!(0, arr.null_count());
        assert!(arr.nulls().is_none());
        // Check every element (was `0..3`, which skipped the last element).
        for i in 0..4 {
            assert!(!arr.is_null(i));
            assert!(arr.is_valid(i));
            assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {i}")
        }
    }
    #[test]
    fn test_boolean_array_from_nullable_iter() {
        let v = vec![Some(true), None, Some(false), None];
        let arr = v.into_iter().collect::<BooleanArray>();
        assert_eq!(4, arr.len());
        assert_eq!(0, arr.offset());
        assert_eq!(2, arr.null_count());
        assert!(arr.nulls().is_some());
        assert!(arr.is_valid(0));
        assert!(arr.is_null(1));
        assert!(arr.is_valid(2));
        assert!(arr.is_null(3));
        assert!(arr.value(0));
        assert!(!arr.value(2));
    }
    #[test]
    fn test_boolean_array_builder() {
        // Test building a boolean array with ArrayData builder and offset
        // 000011011
        let buf = Buffer::from([27_u8]);
        let buf2 = buf.clone();
        let data = ArrayData::builder(DataType::Boolean)
            .len(5)
            .offset(2)
            .add_buffer(buf)
            .build()
            .unwrap();
        let arr = BooleanArray::from(data);
        assert_eq!(&buf2, arr.values().inner());
        assert_eq!(5, arr.len());
        assert_eq!(2, arr.offset());
        assert_eq!(0, arr.null_count());
        // Only the first three logical values follow the `i != 0` pattern
        // (bits 2..=4 of 0b0001_1011); the remaining two are false, so the
        // loop bound of 3 is deliberate here.
        for i in 0..3 {
            assert_eq!(i != 0, arr.value(i), "failed at {i}");
        }
    }
    #[test]
    #[should_panic(
        expected = "Trying to access an element at index 4 from a BooleanArray of length 3"
    )]
    fn test_fixed_size_binary_array_get_value_index_out_of_bound() {
        let v = vec![Some(true), None, Some(false)];
        let array = v.into_iter().collect::<BooleanArray>();
        array.value(4);
    }
    #[test]
    #[should_panic(expected = "BooleanArray data should contain a single buffer only \
                               (values buffer)")]
    // Different error messages, so skip for now
    // https://github.com/apache/arrow-rs/issues/1545
    #[cfg(not(feature = "force_validate"))]
    fn test_boolean_array_invalid_buffer_len() {
        let data = unsafe {
            ArrayData::builder(DataType::Boolean)
                .len(5)
                .build_unchecked()
        };
        drop(BooleanArray::from(data));
    }
    #[test]
    #[should_panic(expected = "BooleanArray expected ArrayData with type Boolean got Int32")]
    fn test_from_array_data_validation() {
        let _ = BooleanArray::from(ArrayData::new_empty(&DataType::Int32));
    }
    #[test]
    #[cfg_attr(miri, ignore)] // Takes too long
    fn test_true_false_count() {
        let mut rng = thread_rng();
        for _ in 0..10 {
            // No nulls
            let d: Vec<_> = (0..2000).map(|_| rng.gen_bool(0.5)).collect();
            let b = BooleanArray::from(d.clone());
            let expected_true = d.iter().filter(|x| **x).count();
            assert_eq!(b.true_count(), expected_true);
            assert_eq!(b.false_count(), d.len() - expected_true);
            // With nulls
            let d: Vec<_> = (0..2000)
                .map(|_| rng.gen_bool(0.5).then(|| rng.gen_bool(0.5)))
                .collect();
            let b = BooleanArray::from(d.clone());
            let expected_true = d.iter().filter(|x| matches!(x, Some(true))).count();
            assert_eq!(b.true_count(), expected_true);
            let expected_false = d.iter().filter(|x| matches!(x, Some(false))).count();
            assert_eq!(b.false_count(), expected_false);
        }
    }
    #[test]
    fn test_into_parts() {
        let boolean_array = [Some(true), None, Some(false)]
            .into_iter()
            .collect::<BooleanArray>();
        let (values, nulls) = boolean_array.into_parts();
        assert_eq!(values.values(), &[0b0000_0001]);
        assert!(nulls.is_some());
        assert_eq!(nulls.unwrap().buffer().as_slice(), &[0b0000_0101]);
        let boolean_array =
            BooleanArray::from(vec![false, false, false, false, false, false, false, true]);
        let (values, nulls) = boolean_array.into_parts();
        assert_eq!(values.values(), &[0b1000_0000]);
        assert!(nulls.is_none());
    }
}

Просмотреть файл

@ -0,0 +1,617 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::array::{get_offsets, print_long_array};
use crate::builder::GenericByteBuilder;
use crate::iterator::ArrayIter;
use crate::types::bytes::ByteArrayNativeType;
use crate::types::ByteArrayType;
use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait, Scalar};
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
use arrow_buffer::{NullBuffer, OffsetBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;
/// An array of [variable length byte arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout)
///
/// See [`StringArray`] and [`LargeStringArray`] for storing utf8 encoded string data
///
/// See [`BinaryArray`] and [`LargeBinaryArray`] for storing arbitrary bytes
///
/// # Example: From a Vec
///
/// ```
/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type};
/// let arr: GenericByteArray<Utf8Type> = vec!["hello", "world", ""].into();
/// assert_eq!(arr.value_data(), b"helloworld");
/// assert_eq!(arr.value_offsets(), &[0, 5, 10, 10]);
/// let values: Vec<_> = arr.iter().collect();
/// assert_eq!(values, &[Some("hello"), Some("world"), Some("")]);
/// ```
///
/// # Example: From an optional Vec
///
/// ```
/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type};
/// let arr: GenericByteArray<Utf8Type> = vec![Some("hello"), Some("world"), Some(""), None].into();
/// assert_eq!(arr.value_data(), b"helloworld");
/// assert_eq!(arr.value_offsets(), &[0, 5, 10, 10, 10]);
/// let values: Vec<_> = arr.iter().collect();
/// assert_eq!(values, &[Some("hello"), Some("world"), Some(""), None]);
/// ```
///
/// # Example: From an iterator of option
///
/// ```
/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type};
/// let arr: GenericByteArray<Utf8Type> = (0..5).map(|x| (x % 2 == 0).then(|| x.to_string())).collect();
/// let values: Vec<_> = arr.iter().collect();
/// assert_eq!(values, &[Some("0"), None, Some("2"), None, Some("4")]);
/// ```
///
/// # Example: Using Builder
///
/// ```
/// # use arrow_array::Array;
/// # use arrow_array::builder::GenericByteBuilder;
/// # use arrow_array::types::Utf8Type;
/// let mut builder = GenericByteBuilder::<Utf8Type>::new();
/// builder.append_value("hello");
/// builder.append_null();
/// builder.append_value("world");
/// let array = builder.finish();
/// let values: Vec<_> = array.iter().collect();
/// assert_eq!(values, &[Some("hello"), None, Some("world")]);
/// ```
///
/// [`StringArray`]: crate::StringArray
/// [`LargeStringArray`]: crate::LargeStringArray
/// [`BinaryArray`]: crate::BinaryArray
/// [`LargeBinaryArray`]: crate::LargeBinaryArray
pub struct GenericByteArray<T: ByteArrayType> {
    /// The logical [`DataType`]; always `T::DATA_TYPE` in this crate's constructors
    data_type: DataType,
    /// `len + 1` offsets delimiting each value's byte range within `value_data`
    value_offsets: OffsetBuffer<T::Offset>,
    /// The concatenated bytes of all values
    value_data: Buffer,
    /// Optional validity (null) bitmap
    nulls: Option<NullBuffer>,
}
impl<T: ByteArrayType> Clone for GenericByteArray<T> {
    fn clone(&self) -> Self {
        Self {
            // `data_type` is reconstructed from the type parameter rather than
            // cloned; constructors in this file always set it to `T::DATA_TYPE`.
            data_type: T::DATA_TYPE,
            value_offsets: self.value_offsets.clone(),
            value_data: self.value_data.clone(),
            nulls: self.nulls.clone(),
        }
    }
}
impl<T: ByteArrayType> GenericByteArray<T> {
    /// Data type of the array.
    pub const DATA_TYPE: DataType = T::DATA_TYPE;
    /// Create a new [`GenericByteArray`] from the provided parts, panicking on failure
    ///
    /// # Panics
    ///
    /// Panics if [`GenericByteArray::try_new`] returns an error
    pub fn new(
        offsets: OffsetBuffer<T::Offset>,
        values: Buffer,
        nulls: Option<NullBuffer>,
    ) -> Self {
        Self::try_new(offsets, values, nulls).unwrap()
    }
    /// Create a new [`GenericByteArray`] from the provided parts, returning an error on failure
    ///
    /// # Errors
    ///
    /// * `offsets.len() - 1 != nulls.len()`
    /// * Any consecutive pair of `offsets` does not denote a valid slice of `values`
    pub fn try_new(
        offsets: OffsetBuffer<T::Offset>,
        values: Buffer,
        nulls: Option<NullBuffer>,
    ) -> Result<Self, ArrowError> {
        // An offsets buffer of N+1 entries describes N values.
        let len = offsets.len() - 1;
        // Verify that each pair of offsets is a valid slices of values
        T::validate(&offsets, &values)?;
        if let Some(n) = nulls.as_ref() {
            if n.len() != len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Incorrect length of null buffer for {}{}Array, expected {len} got {}",
                    T::Offset::PREFIX,
                    T::PREFIX,
                    n.len(),
                )));
            }
        }
        Ok(Self {
            data_type: T::DATA_TYPE,
            value_offsets: offsets,
            value_data: values,
            nulls,
        })
    }
    /// Create a new [`GenericByteArray`] from the provided parts, without validation
    ///
    /// # Safety
    ///
    /// Safe if [`Self::try_new`] would not error
    pub unsafe fn new_unchecked(
        offsets: OffsetBuffer<T::Offset>,
        values: Buffer,
        nulls: Option<NullBuffer>,
    ) -> Self {
        Self {
            data_type: T::DATA_TYPE,
            value_offsets: offsets,
            value_data: values,
            nulls,
        }
    }
    /// Create a new [`GenericByteArray`] of length `len` where all values are null
    pub fn new_null(len: usize) -> Self {
        Self {
            data_type: T::DATA_TYPE,
            // All-zero offsets: every value is an empty slice of the empty data buffer.
            value_offsets: OffsetBuffer::new_zeroed(len),
            value_data: MutableBuffer::new(0).into(),
            nulls: Some(NullBuffer::new_null(len)),
        }
    }
    /// Create a new [`Scalar`] from `v`
    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
        Scalar::new(Self::from_iter_values(std::iter::once(value)))
    }
    /// Creates a [`GenericByteArray`] based on an iterator of values without nulls
    ///
    /// # Panics
    /// Panics if the iterator reports no upper bound in `size_hint`, or if the
    /// total byte length overflows the offset type `T::Offset`.
    pub fn from_iter_values<Ptr, I>(iter: I) -> Self
    where
        Ptr: AsRef<T::Native>,
        I: IntoIterator<Item = Ptr>,
    {
        let iter = iter.into_iter();
        let (_, data_len) = iter.size_hint();
        let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
        // Reserve space for `data_len + 1` offsets up-front.
        let mut offsets = MutableBuffer::new((data_len + 1) * std::mem::size_of::<T::Offset>());
        offsets.push(T::Offset::usize_as(0));
        let mut values = MutableBuffer::new(0);
        for s in iter {
            let s: &[u8] = s.as_ref().as_ref();
            values.extend_from_slice(s);
            offsets.push(T::Offset::usize_as(values.len()));
        }
        // Ensure the final cumulative length fits in the offset type.
        T::Offset::from_usize(values.len()).expect("offset overflow");
        let offsets = Buffer::from(offsets);
        // Safety: valid by construction
        let value_offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
        Self {
            data_type: T::DATA_TYPE,
            value_data: values.into(),
            value_offsets,
            nulls: None,
        }
    }
    /// Deconstruct this array into its constituent parts
    pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
        (self.value_offsets, self.value_data, self.nulls)
    }
    /// Returns the length for value at index `i`.
    /// # Panics
    /// Panics if index `i` is out of bounds.
    #[inline]
    pub fn value_length(&self, i: usize) -> T::Offset {
        let offsets = self.value_offsets();
        offsets[i + 1] - offsets[i]
    }
    /// Returns a reference to the offsets of this array
    ///
    /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`]
    /// allowing for zero-copy cloning
    #[inline]
    pub fn offsets(&self) -> &OffsetBuffer<T::Offset> {
        &self.value_offsets
    }
    /// Returns the values of this array
    ///
    /// Unlike [`Self::value_data`] this returns the [`Buffer`]
    /// allowing for zero-copy cloning
    #[inline]
    pub fn values(&self) -> &Buffer {
        &self.value_data
    }
    /// Returns the raw value data
    pub fn value_data(&self) -> &[u8] {
        self.value_data.as_slice()
    }
    /// Returns true if all data within this array is ASCII
    pub fn is_ascii(&self) -> bool {
        let offsets = self.value_offsets();
        // Only inspect the byte range actually referenced by this array's
        // offsets (relevant for sliced arrays).
        let start = offsets.first().unwrap();
        let end = offsets.last().unwrap();
        self.value_data()[start.as_usize()..end.as_usize()].is_ascii()
    }
    /// Returns the offset values in the offsets buffer
    #[inline]
    pub fn value_offsets(&self) -> &[T::Offset] {
        &self.value_offsets
    }
    /// Returns the element at index `i`
    /// # Safety
    /// Caller is responsible for ensuring that the index is within the bounds of the array
    pub unsafe fn value_unchecked(&self, i: usize) -> &T::Native {
        let end = *self.value_offsets().get_unchecked(i + 1);
        let start = *self.value_offsets().get_unchecked(i);
        // Soundness
        // pointer alignment & location is ensured by RawPtrBox
        // buffer bounds/offset is ensured by the value_offset invariants
        // Safety of `to_isize().unwrap()`
        // `start` and `end` are &OffsetSize, which is a generic type that implements the
        // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait,
        // both of which should cleanly cast to isize on an architecture that supports
        // 32/64-bit offsets
        let b = std::slice::from_raw_parts(
            self.value_data.as_ptr().offset(start.to_isize().unwrap()),
            (end - start).to_usize().unwrap(),
        );
        // SAFETY:
        // ArrayData is valid
        T::Native::from_bytes_unchecked(b)
    }
    /// Returns the element at index `i`
    /// # Panics
    /// Panics if index `i` is out of bounds.
    pub fn value(&self, i: usize) -> &T::Native {
        assert!(
            i < self.len(),
            "Trying to access an element at index {} from a {}{}Array of length {}",
            i,
            T::Offset::PREFIX,
            T::PREFIX,
            self.len()
        );
        // SAFETY:
        // Verified length above
        unsafe { self.value_unchecked(i) }
    }
    /// constructs a new iterator
    pub fn iter(&self) -> ArrayIter<&Self> {
        ArrayIter::new(self)
    }
    /// Returns a zero-copy slice of this array with the indicated offset and length.
    pub fn slice(&self, offset: usize, length: usize) -> Self {
        Self {
            data_type: T::DATA_TYPE,
            value_offsets: self.value_offsets.slice(offset, length),
            // The data buffer is shared; the sliced offsets select the sub-range.
            value_data: self.value_data.clone(),
            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
        }
    }
    /// Returns `GenericByteBuilder` of this byte array for mutating its values if the underlying
    /// offset and data buffers are not shared by others.
    ///
    /// On failure (any buffer is shared), returns `Err(self)` reassembled from
    /// the original parts so the caller keeps the array.
    pub fn into_builder(self) -> Result<GenericByteBuilder<T>, Self> {
        let len = self.len();
        // Total number of value bytes referenced by this (possibly sliced) array.
        let value_len = T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]);
        let data = self.into_data();
        let null_bit_buffer = data.nulls().map(|b| b.inner().sliced());
        let element_len = std::mem::size_of::<T::Offset>();
        let offset_buffer = data.buffers()[0]
            .slice_with_length(data.offset() * element_len, (len + 1) * element_len);
        let element_len = std::mem::size_of::<u8>();
        let value_buffer = data.buffers()[1]
            .slice_with_length(data.offset() * element_len, value_len * element_len);
        // Drop `data` so it no longer holds references to the buffers; otherwise
        // the `into_mutable` calls below could never succeed.
        drop(data);
        let try_mutable_null_buffer = match null_bit_buffer {
            None => Ok(None),
            Some(null_buffer) => {
                // Null buffer exists, tries to make it mutable
                null_buffer.into_mutable().map(Some)
            }
        };
        let try_mutable_buffers = match try_mutable_null_buffer {
            Ok(mutable_null_buffer) => {
                // Got mutable null buffer, tries to get mutable value buffer
                let try_mutable_offset_buffer = offset_buffer.into_mutable();
                let try_mutable_value_buffer = value_buffer.into_mutable();
                // try_mutable_offset_buffer.map(...).map_err(...) doesn't work as the compiler complains
                // mutable_null_buffer is moved into map closure.
                match (try_mutable_offset_buffer, try_mutable_value_buffer) {
                    (Ok(mutable_offset_buffer), Ok(mutable_value_buffer)) => unsafe {
                        Ok(GenericByteBuilder::<T>::new_from_buffer(
                            mutable_offset_buffer,
                            mutable_value_buffer,
                            mutable_null_buffer,
                        ))
                    },
                    // In each failure case, convert whatever became mutable back
                    // into immutable buffers so the array can be rebuilt below.
                    (Ok(mutable_offset_buffer), Err(value_buffer)) => Err((
                        mutable_offset_buffer.into(),
                        value_buffer,
                        mutable_null_buffer.map(|b| b.into()),
                    )),
                    (Err(offset_buffer), Ok(mutable_value_buffer)) => Err((
                        offset_buffer,
                        mutable_value_buffer.into(),
                        mutable_null_buffer.map(|b| b.into()),
                    )),
                    (Err(offset_buffer), Err(value_buffer)) => Err((
                        offset_buffer,
                        value_buffer,
                        mutable_null_buffer.map(|b| b.into()),
                    )),
                }
            }
            Err(mutable_null_buffer) => {
                // Unable to get mutable null buffer
                Err((offset_buffer, value_buffer, Some(mutable_null_buffer)))
            }
        };
        match try_mutable_buffers {
            Ok(builder) => Ok(builder),
            Err((offset_buffer, value_buffer, null_bit_buffer)) => {
                // Reassemble the original array from its (immutable) parts.
                let builder = ArrayData::builder(T::DATA_TYPE)
                    .len(len)
                    .add_buffer(offset_buffer)
                    .add_buffer(value_buffer)
                    .null_bit_buffer(null_bit_buffer);
                // SAFETY: the parts came from a valid array of the same type
                let array_data = unsafe { builder.build_unchecked() };
                let array = GenericByteArray::<T>::from(array_data);
                Err(array)
            }
        }
    }
}
impl<T: ByteArrayType> std::fmt::Debug for GenericByteArray<T> {
    /// Formats as e.g. `StringArray\n[\n  ...,\n]`, eliding long arrays.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{}{}Array\n[\n", T::Offset::PREFIX, T::PREFIX)?;
        print_long_array(self, f, |array, index, f| {
            std::fmt::Debug::fmt(&array.value(index), f)
        })?;
        write!(f, "]")
    }
}
impl<T: ByteArrayType> Array for GenericByteArray<T> {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn data_type(&self) -> &DataType {
        &self.data_type
    }
    fn len(&self) -> usize {
        // N values are described by N + 1 offsets.
        self.value_offsets.len() - 1
    }
    fn is_empty(&self) -> bool {
        self.value_offsets.len() <= 1
    }
    fn offset(&self) -> usize {
        // Slicing is expressed through the offsets buffer, not a stored offset.
        0
    }
    fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }
    fn to_data(&self) -> ArrayData {
        self.clone().into()
    }
    fn into_data(self) -> ArrayData {
        self.into()
    }
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
        Arc::new(self.slice(offset, length))
    }
    fn get_buffer_memory_size(&self) -> usize {
        // Offsets + value bytes + (optional) null bitmap capacities.
        let null_bytes = self.nulls.as_ref().map_or(0, |n| n.buffer().capacity());
        self.value_offsets.inner().inner().capacity() + self.value_data.capacity() + null_bytes
    }
    fn get_array_memory_size(&self) -> usize {
        std::mem::size_of::<Self>() + self.get_buffer_memory_size()
    }
}
impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> {
    type Item = &'a T::Native;
    /// Returns the value at `index`, panicking if out of bounds.
    fn value(&self, index: usize) -> Self::Item {
        GenericByteArray::value(self, index)
    }
    /// Returns the value at `index` without bounds checking.
    ///
    /// # Safety
    /// Caller must ensure `index` is within the bounds of the array.
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
        GenericByteArray::value_unchecked(self, index)
    }
}
impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
    /// Builds a [`GenericByteArray`] from generic [`ArrayData`].
    ///
    /// # Panics
    /// Panics if `data` is not of type `T::DATA_TYPE` or does not contain
    /// exactly two buffers (offsets and values).
    fn from(data: ArrayData) -> Self {
        assert_eq!(
            data.data_type(),
            &Self::DATA_TYPE,
            "{}{}Array expects DataType::{}",
            T::Offset::PREFIX,
            T::PREFIX,
            Self::DATA_TYPE
        );
        assert_eq!(
            data.buffers().len(),
            2,
            "{}{}Array data should contain 2 buffers only (offsets and values)",
            T::Offset::PREFIX,
            T::PREFIX,
        );
        // SAFETY:
        // ArrayData is valid, and verified type above
        let value_offsets = unsafe { get_offsets(&data) };
        let value_data = data.buffers()[1].clone();
        Self {
            value_offsets,
            value_data,
            data_type: T::DATA_TYPE,
            nulls: data.nulls().cloned(),
        }
    }
}
impl<T: ByteArrayType> From<GenericByteArray<T>> for ArrayData {
    /// Converts a [`GenericByteArray`] back into generic [`ArrayData`].
    fn from(array: GenericByteArray<T>) -> Self {
        // Compute the length before the offsets buffer is moved out.
        let len = array.len();
        let offsets = array.value_offsets.into_inner().into_inner();
        // SAFETY: the parts came from a valid GenericByteArray of this type
        unsafe {
            ArrayDataBuilder::new(array.data_type)
                .len(len)
                .buffers(vec![offsets, array.value_data])
                .nulls(array.nulls)
                .build_unchecked()
        }
    }
}
impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
    type Item = Option<&'a T::Native>;
    type IntoIter = ArrayIter<Self>;
    /// Returns an iterator over the elements, yielding `None` for null slots.
    fn into_iter(self) -> Self::IntoIter {
        ArrayIter::new(self)
    }
}
impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option<Ptr>> for GenericByteArray<T>
where
    Ptr: AsRef<T::Native> + 'a,
{
    /// Collects borrowed optional values; delegates to the owned-`Option` impl.
    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
        let borrowed = iter.into_iter().map(|opt| opt.as_ref().map(|v| v.as_ref()));
        borrowed.collect()
    }
}
impl<Ptr, T: ByteArrayType> FromIterator<Option<Ptr>> for GenericByteArray<T>
where
    Ptr: AsRef<T::Native>,
{
    /// Collects optional values into a nullable array; `None` entries become nulls.
    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
        let values = iter.into_iter();
        // Size the item capacity from the iterator's lower bound; 1024 bytes
        // is the initial data-buffer capacity.
        let (lower, _) = values.size_hint();
        let mut builder = GenericByteBuilder::with_capacity(lower, 1024);
        builder.extend(values);
        builder.finish()
    }
}
#[cfg(test)]
mod tests {
    use crate::{BinaryArray, StringArray};
    use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};
    // Exercises the validation paths of `try_new`: null-length mismatch,
    // non-UTF-8 data (rejected for strings, accepted for binary), offsets
    // past the end of the data, and offsets splitting a UTF-8 codepoint.
    #[test]
    fn try_new() {
        let data = Buffer::from_slice_ref("helloworld");
        let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
        StringArray::new(offsets.clone(), data.clone(), None);
        // Null buffer of length 3 for a 2-element array must be rejected.
        let nulls = NullBuffer::new_null(3);
        let err =
            StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3");
        let err = BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3");
        // Invalid UTF-8 is an error for StringArray but fine for BinaryArray.
        let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld");
        let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2");
        BinaryArray::new(offsets, non_utf8_data, None);
        // An offset of 11 exceeds the 10-byte data buffer.
        let offsets = OffsetBuffer::new(vec![0, 5, 11].into());
        let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Offset of 11 exceeds length of values 10"
        );
        let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Maximum offset of 11 is larger than values of length 10"
        );
        // "ì" is a 2-byte codepoint, so this data is 11 bytes long.
        let non_ascii_data = Buffer::from_slice_ref("heìloworld");
        StringArray::new(offsets.clone(), non_ascii_data.clone(), None);
        BinaryArray::new(offsets, non_ascii_data.clone(), None);
        // Offset 3 lands in the middle of the 2-byte "ì" — invalid for strings.
        let offsets = OffsetBuffer::new(vec![0, 3, 10].into());
        let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Split UTF-8 codepoint at offset 3"
        );
        BinaryArray::new(offsets, non_ascii_data, None);
    }
}

Просмотреть файл

@ -0,0 +1,810 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::array::print_long_array;
use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
use crate::iterator::ArrayIter;
use crate::types::bytes::ByteArrayNativeType;
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
use arrow_schema::{ArrowError, DataType};
use num::ToPrimitive;
use std::any::Any;
use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::Arc;
use super::ByteArrayType;
/// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
///
/// Different than [`crate::GenericByteArray`] as it stores both an offset and length
/// meaning that take / filter operations can be implemented without copying the underlying data.
///
/// See [`StringViewArray`] for storing utf8 encoded string data and
/// [`BinaryViewArray`] for storing bytes.
///
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
///
/// A `GenericByteViewArray` stores variable length byte strings. An array of
/// `N` elements is stored as `N` fixed length "views" and a variable number
/// of variable length "buffers".
///
/// Each view is a `u128` value layout is different depending on the
/// length of the string stored at that location:
///
/// ```text
/// ┌──────┬────────────────────────┐
/// │length│ string value │
/// Strings (len <= 12) │ │ (padded with 0) │
/// └──────┴────────────────────────┘
/// 0 31 127
///
/// ┌───────┬───────┬───────┬───────┐
/// │length │prefix │ buf │offset │
/// Strings (len > 12) │ │ │ index │ │
/// └───────┴───────┴───────┴───────┘
/// 0 31 63 95 127
/// ```
///
/// * Strings with length <= 12 are stored directly in the view.
///
/// * Strings with length > 12: The first four bytes are stored inline in the
/// view and the entire string is stored in one of the buffers.
///
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
/// than they must point into a valid buffer. However, they can be out of order,
/// non continuous and overlapping.
///
/// For example, in the following diagram, the strings "FishWasInTownToday" and
/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
/// separate buffer while the string "LavaMonster" is stored inlined in the
/// view. In this case, the same bytes for "Fish" are used to store both strings.
///
/// ```text
/// ┌───┐
/// ┌──────┬──────┬──────┬──────┐ offset │...│
/// "FishWasInTownTodayYay" │ 21 │ Fish │ 0 │ 115 │─ ─ 103 │Mr.│
/// └──────┴──────┴──────┴──────┘ │ ┌ ─ ─ ─ ─ ▶ │Cru│
/// ┌──────┬──────┬──────┬──────┐ │mpl│
/// "CrumpleFacedFish" │ 16 │ Crum │ 0 │ 103 │─ ─│─ ─ ─ ┘ │eFa│
/// └──────┴──────┴──────┴──────┘ │ced│
/// ┌──────┬────────────────────┐ └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
/// "LavaMonster" │ 11 │ LavaMonster\0 │ │hWa│
/// └──────┴────────────────────┘ offset │sIn│
/// 115 │Tow│
/// │nTo│
/// │day│
/// u128 "views" │Yay│
/// buffer 0 │...│
/// └───┘
/// ```
/// [`GenericByteArray`]: crate::array::GenericByteArray
pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
    /// The logical [`DataType`]; always `T::DATA_TYPE` in this crate's constructors
    data_type: DataType,
    /// One `u128` "view" per element; layout depends on value length (see type docs)
    views: ScalarBuffer<u128>,
    /// Buffers holding the bytes of values too long (> 12 bytes) to inline in a view
    buffers: Vec<Buffer>,
    /// Ties the array to its `ByteViewType` without storing a `T`
    phantom: PhantomData<T>,
    /// Optional validity (null) bitmap
    nulls: Option<NullBuffer>,
}
impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
    fn clone(&self) -> Self {
        Self {
            // `data_type` is reconstructed from the type parameter rather than
            // cloned; constructors in this file always set it to `T::DATA_TYPE`.
            data_type: T::DATA_TYPE,
            views: self.views.clone(),
            buffers: self.buffers.clone(),
            nulls: self.nulls.clone(),
            phantom: Default::default(),
        }
    }
}
impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
/// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
///
/// # Panics
///
/// Panics if [`GenericByteViewArray::try_new`] returns an error
pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
Self::try_new(views, buffers, nulls).unwrap()
}
/// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
///
/// # Errors
///
/// * `views.len() != nulls.len()`
/// * [ByteViewType::validate] fails
pub fn try_new(
views: ScalarBuffer<u128>,
buffers: Vec<Buffer>,
nulls: Option<NullBuffer>,
) -> Result<Self, ArrowError> {
T::validate(&views, &buffers)?;
if let Some(n) = nulls.as_ref() {
if n.len() != views.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Incorrect length of null buffer for {}ViewArray, expected {} got {}",
T::PREFIX,
views.len(),
n.len(),
)));
}
}
Ok(Self {
data_type: T::DATA_TYPE,
views,
buffers,
nulls,
phantom: Default::default(),
})
}
/// Create a new [`GenericByteViewArray`] from the provided parts, without validation
///
/// # Safety
///
/// Safe if [`Self::try_new`] would not error
pub unsafe fn new_unchecked(
views: ScalarBuffer<u128>,
buffers: Vec<Buffer>,
nulls: Option<NullBuffer>,
) -> Self {
Self {
data_type: T::DATA_TYPE,
phantom: Default::default(),
views,
buffers,
nulls,
}
}
/// Create a new [`GenericByteViewArray`] of length `len` where all values are null
pub fn new_null(len: usize) -> Self {
Self {
data_type: T::DATA_TYPE,
views: vec![0; len].into(),
buffers: vec![],
nulls: Some(NullBuffer::new_null(len)),
phantom: Default::default(),
}
}
/// Create a new [`Scalar`] from `value`
pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
Scalar::new(Self::from_iter_values(std::iter::once(value)))
}
/// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
pub fn from_iter_values<Ptr, I>(iter: I) -> Self
where
Ptr: AsRef<T::Native>,
I: IntoIterator<Item = Ptr>,
{
let iter = iter.into_iter();
let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
for v in iter {
builder.append_value(v);
}
builder.finish()
}
/// Deconstruct this array into its constituent parts
pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
(self.views, self.buffers, self.nulls)
}
/// Returns the views buffer
#[inline]
pub fn views(&self) -> &ScalarBuffer<u128> {
&self.views
}
/// Returns the buffers storing string data
#[inline]
pub fn data_buffers(&self) -> &[Buffer] {
&self.buffers
}
/// Returns the element at index `i`
/// # Panics
/// Panics if index `i` is out of bounds.
pub fn value(&self, i: usize) -> &T::Native {
assert!(
i < self.len(),
"Trying to access an element at index {} from a {}ViewArray of length {}",
i,
T::PREFIX,
self.len()
);
unsafe { self.value_unchecked(i) }
}
/// Returns the element at index `i`
/// # Safety
/// Caller is responsible for ensuring that the index is within the bounds of the array
pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
    let v = self.views.get_unchecked(idx);
    // The low 32 bits of a view hold the element length in bytes
    let len = *v as u32;
    let b = if len <= 12 {
        // Short values are stored inline in the view itself
        Self::inline_value(v, len as usize)
    } else {
        // Longer values live in a data buffer at (buffer_index, offset)
        let view = ByteView::from(*v);
        let data = self.buffers.get_unchecked(view.buffer_index as usize);
        let offset = view.offset as usize;
        data.get_unchecked(offset..offset + len as usize)
    };
    T::Native::from_bytes_unchecked(b)
}
/// Returns the inline value of the view.
///
/// # Safety
/// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
/// - The `len` must be the length of the inlined value. It should never be larger than 12.
#[inline(always)]
pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
    debug_assert!(len <= 12);
    // Inline payload occupies the bytes immediately after the 4-byte length
    // field of the view, hence the offset of 4.
    std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
}
/// Constructs a new iterator over the optional values of this array
pub fn iter(&self) -> ArrayIter<&Self> {
    ArrayIter::new(self)
}
/// Returns a zero-copy slice of this array with the indicated offset and length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
    // Only the views and null bitmap need slicing; the data buffers are
    // shared unchanged, which is what makes this zero-copy.
    let views = self.views.slice(offset, length);
    let nulls = self.nulls.as_ref().map(|n| n.slice(offset, length));
    Self {
        data_type: T::DATA_TYPE,
        views,
        buffers: self.buffers.clone(),
        nulls,
        phantom: Default::default(),
    }
}
/// Returns a "compacted" version of this array
///
/// The original array will *not* be modified
///
/// # Garbage Collection
///
/// Before GC:
/// ```text
///                                        ┌──────┐
///                                        │......│
///                                        │......│
/// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer
/// │       View 1       │─ ─ ─ ─          │......│  with data that
/// ├────────────────────┤                 │......│ is not referred
/// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
/// └────────────────────┘                 │......│      View 2
///                                        │......│
///    2 views, refer to                   │......│
///   small portions of a                  └──────┘
///      large buffer
/// ```
///
/// After GC:
///
/// ```text
/// ┌────────────────────┐                 ┌─────┐    After gc, only
/// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is
/// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by
/// │       View 2       │─ ─ ─ ─          └─────┘     the views is
/// └────────────────────┘                                 left
///
///
///         2 views
/// ```
/// This method will compact the data buffers by recreating the view array and only include the data
/// that is pointed to by the views.
///
/// Note that it will copy the array regardless of whether the original array is compact.
/// Use with caution as this can be an expensive operation, only use it when you are sure that the view
/// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
pub fn gc(&self) -> Self {
    let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
    // Re-appending every value copies only the referenced bytes into the
    // builder's fresh buffers, dropping any unreferenced data.
    for v in self.iter() {
        builder.append_option(v);
    }
    builder.finish()
}
}
impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // Header line, then a (possibly truncated) element listing, then footer
        writeln!(f, "{}ViewArray", T::PREFIX)?;
        write!(f, "[\n")?;
        print_long_array(self, f, |array, index, f| {
            Debug::fmt(&array.value(index), f)
        })?;
        write!(f, "]")
    }
}
impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
fn as_any(&self) -> &dyn Any {
self
}
fn to_data(&self) -> ArrayData {
self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
fn data_type(&self) -> &DataType {
&self.data_type
}
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
fn len(&self) -> usize {
self.views.len()
}
fn is_empty(&self) -> bool {
self.views.is_empty()
}
fn offset(&self) -> usize {
0
}
fn nulls(&self) -> Option<&NullBuffer> {
self.nulls.as_ref()
}
fn get_buffer_memory_size(&self) -> usize {
let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
sum += self.views.inner().capacity();
if let Some(x) = &self.nulls {
sum += x.buffer().capacity()
}
sum
}
fn get_array_memory_size(&self) -> usize {
std::mem::size_of::<Self>() + self.get_buffer_memory_size()
}
}
impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
    type Item = &'a T::Native;
    // Delegate to the inherent (panicking) accessor
    fn value(&self, index: usize) -> Self::Item {
        GenericByteViewArray::value(self, index)
    }
    // Delegate to the inherent unchecked accessor; caller upholds bounds
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
        GenericByteViewArray::value_unchecked(self, index)
    }
}
// Allows `for v in &array { ... }`, yielding `Option<&Native>` per element
impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
    type Item = Option<&'a T::Native>;
    type IntoIter = ArrayIter<Self>;
    fn into_iter(self) -> Self::IntoIter {
        ArrayIter::new(self)
    }
}
impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
    fn from(value: ArrayData) -> Self {
        // Buffer 0 holds the u128 views; any remaining buffers hold the
        // out-of-line variadic data
        let views = value.buffers()[0].clone();
        // Applying offset/len here absorbs the ArrayData offset into the views
        let views = ScalarBuffer::new(views, value.offset(), value.len());
        let buffers = value.buffers()[1..].to_vec();
        Self {
            data_type: T::DATA_TYPE,
            views,
            buffers,
            nulls: value.nulls().cloned(),
            phantom: Default::default(),
        }
    }
}
/// Convert a [`GenericByteArray`] to a [`GenericByteViewArray`] but in a smart way:
/// If the offsets are all less than u32::MAX, then we directly build the view array on top of existing buffer.
impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
where
    FROM: ByteArrayType,
    FROM::Offset: OffsetSizeTrait + ToPrimitive,
    V: ByteViewType<Native = FROM::Native>,
{
    fn from(byte_array: &GenericByteArray<FROM>) -> Self {
        let offsets = byte_array.offsets();
        // Views address buffer data with u32 offsets, so the existing values
        // buffer can only be reused if the final offset fits in a u32
        let can_reuse_buffer = match offsets.last() {
            Some(offset) => offset.as_usize() < u32::MAX as usize,
            None => true,
        };
        if can_reuse_buffer {
            let len = byte_array.len();
            let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
            // Zero-copy: register the source values buffer as a view block
            let str_values_buf = byte_array.values().clone();
            let block = views_builder.append_block(str_values_buf);
            // Each consecutive offset pair delimits one element
            for (i, w) in offsets.windows(2).enumerate() {
                let offset = w[0].as_usize();
                let end = w[1].as_usize();
                let length = end - offset;
                if byte_array.is_null(i) {
                    views_builder.append_null();
                } else {
                    // Safety: the input was a valid array so it valid UTF8 (if string). And
                    // all offsets were valid
                    unsafe {
                        views_builder.append_view_unchecked(block, offset as u32, length as u32)
                    }
                }
            }
            assert_eq!(views_builder.len(), len);
            views_builder.finish()
        } else {
            // TODO: the first u32::MAX can still be reused
            // Fallback: copy element-by-element into fresh buffers
            GenericByteViewArray::<V>::from_iter(byte_array.iter())
        }
    }
}
impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
    fn from(mut array: GenericByteViewArray<T>) -> Self {
        let len = array.len();
        // ArrayData layout expects the views as buffer 0, followed by the
        // variadic data buffers
        array.buffers.insert(0, array.views.into_inner());
        let builder = ArrayDataBuilder::new(T::DATA_TYPE)
            .len(len)
            .buffers(array.buffers)
            .nulls(array.nulls);
        // SAFETY: the source array upheld the byte-view invariants
        unsafe { builder.build_unchecked() }
    }
}
impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
where
    Ptr: AsRef<T::Native> + 'a,
    T: ByteViewType + ?Sized,
{
    fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
        // Convert each `&Option<Ptr>` into an `Option<&Native>` and delegate
        // to the owned-option `FromIterator` implementation
        let borrowed = iter.into_iter().map(|opt| match opt {
            Some(p) => Some(p.as_ref()),
            None => None,
        });
        borrowed.collect()
    }
}
impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
where
    Ptr: AsRef<T::Native>,
{
    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
        let items = iter.into_iter();
        // Size the builder from the iterator's lower bound hint
        let (lower, _) = items.size_hint();
        let mut builder = GenericByteViewBuilder::<T>::with_capacity(lower);
        builder.extend(items);
        builder.finish()
    }
}
/// A [`GenericByteViewArray`] of `[u8]`
///
/// # Example
/// ```
/// use arrow_array::BinaryViewArray;
/// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
/// assert_eq!(array.value(0), b"hello");
/// assert_eq!(array.value(3), b"large payload over 12 bytes");
/// ```
pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
impl BinaryViewArray {
    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
    /// If items not utf8 data, validate will fail and error returned.
    pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
        // Check every view references well-formed UTF-8 before the
        // zero-copy conversion below
        StringViewType::validate(self.views(), self.data_buffers())?;
        // SAFETY: validation above succeeded
        Ok(unsafe { self.to_string_view_unchecked() })
    }
    /// Convert the [`BinaryViewArray`] to [`StringViewArray`]
    /// # Safety
    /// Caller is responsible for ensuring that items in array are utf8 data.
    pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
        StringViewArray::new_unchecked(self.views, self.buffers, self.nulls)
    }
}
impl From<Vec<&[u8]>> for BinaryViewArray {
    fn from(v: Vec<&[u8]>) -> Self {
        Self::from_iter_values(v)
    }
}
impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
    fn from(v: Vec<Option<&[u8]>>) -> Self {
        // Delegates to `FromIterator<Option<Ptr>>`
        v.into_iter().collect()
    }
}
/// A [`GenericByteViewArray`] that stores utf8 data
///
/// # Example
/// ```
/// use arrow_array::StringViewArray;
/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
/// assert_eq!(array.value(0), "hello");
/// assert_eq!(array.value(3), "large payload over 12 bytes");
/// ```
pub type StringViewArray = GenericByteViewArray<StringViewType>;
impl StringViewArray {
    /// Convert the [`StringViewArray`] to [`BinaryViewArray`]
    pub fn to_binary_view(self) -> BinaryViewArray {
        // SAFETY: valid UTF-8 is always valid binary data
        unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) }
    }
    /// Returns true if all data within this array is ASCII
    pub fn is_ascii(&self) -> bool {
        // Alternative (but incorrect): directly check the underlying buffers
        // (1) Our string view might be sparse, i.e., a subset of the buffers,
        // so even if the buffer is not ascii, we can still be ascii.
        // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
        // This means that this operation is quite expensive, shall we cache the result?
        // i.e. track `is_ascii` in the builder.
        self.iter().all(|v| v.map_or(true, |s| s.is_ascii()))
    }
}
// Conversions from common owned/borrowed string collections; the non-null
// variants delegate to `from_iter_values`, the optional variants to the
// `FromIterator<Option<Ptr>>` implementation.
impl From<Vec<&str>> for StringViewArray {
    fn from(v: Vec<&str>) -> Self {
        Self::from_iter_values(v.into_iter())
    }
}
impl From<Vec<Option<&str>>> for StringViewArray {
    fn from(v: Vec<Option<&str>>) -> Self {
        v.into_iter().collect()
    }
}
impl From<Vec<String>> for StringViewArray {
    fn from(v: Vec<String>) -> Self {
        Self::from_iter_values(v.into_iter())
    }
}
impl From<Vec<Option<String>>> for StringViewArray {
    fn from(v: Vec<Option<String>>) -> Self {
        v.into_iter().collect()
    }
}
#[cfg(test)]
mod tests {
    use crate::builder::{BinaryViewBuilder, StringViewBuilder};
    use crate::{Array, BinaryViewArray, StringViewArray};
    use arrow_buffer::{Buffer, ScalarBuffer};
    use arrow_data::ByteView;
    #[test]
    fn try_new_string() {
        // Mix of inline (<= 12 bytes) and out-of-line values
        let array = StringViewArray::from_iter_values(vec![
            "hello",
            "world",
            "lulu",
            "large payload over 12 bytes",
        ]);
        assert_eq!(array.value(0), "hello");
        assert_eq!(array.value(3), "large payload over 12 bytes");
    }
    #[test]
    fn try_new_binary() {
        let array = BinaryViewArray::from_iter_values(vec![
            b"hello".as_slice(),
            b"world".as_slice(),
            b"lulu".as_slice(),
            b"large payload over 12 bytes".as_slice(),
        ]);
        assert_eq!(array.value(0), b"hello");
        assert_eq!(array.value(3), b"large payload over 12 bytes");
    }
    #[test]
    fn try_new_empty_string() {
        // test empty array
        let array = {
            let mut builder = StringViewBuilder::new();
            builder.finish()
        };
        assert!(array.is_empty());
    }
    #[test]
    fn try_new_empty_binary() {
        // test empty array
        let array = {
            let mut builder = BinaryViewBuilder::new();
            builder.finish()
        };
        assert!(array.is_empty());
    }
    #[test]
    fn test_append_string() {
        // test builder append
        let array = {
            let mut builder = StringViewBuilder::new();
            builder.append_value("hello");
            builder.append_null();
            builder.append_option(Some("large payload over 12 bytes"));
            builder.finish()
        };
        assert_eq!(array.value(0), "hello");
        assert!(array.is_null(1));
        assert_eq!(array.value(2), "large payload over 12 bytes");
    }
    #[test]
    fn test_append_binary() {
        // test builder append
        let array = {
            let mut builder = BinaryViewBuilder::new();
            builder.append_value(b"hello");
            builder.append_null();
            builder.append_option(Some(b"large payload over 12 bytes"));
            builder.finish()
        };
        assert_eq!(array.value(0), b"hello");
        assert!(array.is_null(1));
        assert_eq!(array.value(2), b"large payload over 12 bytes");
    }
    #[test]
    fn test_in_progress_recreation() {
        let array = {
            // make a builder with small block size.
            let mut builder = StringViewBuilder::new().with_block_size(14);
            builder.append_value("large payload over 12 bytes");
            builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
            builder.finish()
        };
        assert_eq!(array.value(0), "large payload over 12 bytes");
        assert_eq!(array.value(1), "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created");
        // Two values larger than the block size => two data buffers
        assert_eq!(2, array.buffers.len());
    }
    #[test]
    #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
    fn new_with_invalid_view_data() {
        let v = "large payload over 12 bytes";
        // buffer_index 3 is out of range: only one buffer is provided below
        let view = ByteView {
            length: 13,
            prefix: u32::from_le_bytes(v.as_bytes()[0..4].try_into().unwrap()),
            buffer_index: 3,
            offset: 1,
        };
        let views = ScalarBuffer::from(vec![view.into()]);
        let buffers = vec![Buffer::from_slice_ref(v)];
        StringViewArray::new(views, buffers, None);
    }
    #[test]
    #[should_panic(
        expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
    )]
    fn new_with_invalid_utf8_data() {
        // 0xf0 starts a 4-byte UTF-8 sequence that is never completed
        let v: Vec<u8> = vec![0xf0, 0x80, 0x80, 0x80];
        let view = ByteView {
            length: v.len() as u32,
            prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
            buffer_index: 0,
            offset: 0,
        };
        let views = ScalarBuffer::from(vec![view.into()]);
        let buffers = vec![Buffer::from_slice_ref(v)];
        StringViewArray::new(views, buffers, None);
    }
    #[test]
    #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
    fn new_with_invalid_zero_padding() {
        let mut data = [0; 12];
        data[0] = b'H';
        data[11] = 1; // no zero padding
        let mut view_buffer = [0; 16];
        view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
        view_buffer[4..].copy_from_slice(&data);
        let view = ByteView::from(u128::from_le_bytes(view_buffer));
        let views = ScalarBuffer::from(vec![view.into()]);
        let buffers = vec![];
        StringViewArray::new(views, buffers, None);
    }
    #[test]
    #[should_panic(expected = "Mismatch between embedded prefix and data")]
    fn test_mismatch_between_embedded_prefix_and_data() {
        // The view's 4-byte prefix comes from input_str_1 but the buffer
        // holds input_str_2, which differs within the first 4 bytes
        let input_str_1 = "Hello, Rustaceans!";
        let input_str_2 = "Hallo, Rustaceans!";
        let length = input_str_1.len() as u32;
        assert!(input_str_1.len() > 12);
        let mut view_buffer = [0; 16];
        view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
        view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
        view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
        view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
        let view = ByteView::from(u128::from_le_bytes(view_buffer));
        let views = ScalarBuffer::from(vec![view.into()]);
        let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
        StringViewArray::new(views, buffers, None);
    }
    #[test]
    fn test_gc() {
        let test_data = [
            Some("longer than 12 bytes"),
            Some("short"),
            Some("t"),
            Some("longer than 12 bytes"),
            None,
            Some("short"),
        ];
        let array = {
            let mut builder = StringViewBuilder::new().with_block_size(8); // create multiple buffers
            test_data.into_iter().for_each(|v| builder.append_option(v));
            builder.finish()
        };
        assert!(array.buffers.len() > 1);
        // gc must change the buffer layout while preserving values and length
        fn check_gc(to_test: &StringViewArray) {
            let gc = to_test.gc();
            assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
            to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
                assert_eq!(a, b);
            });
            assert_eq!(to_test.len(), gc.len());
        }
        check_gc(&array);
        check_gc(&array.slice(1, 3));
        check_gc(&array.slice(2, 1));
        check_gc(&array.slice(2, 2));
        check_gc(&array.slice(3, 1));
    }
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,977 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::array::print_long_array;
use crate::iterator::FixedSizeBinaryIter;
use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray, Scalar};
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::{bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;
/// An array of [fixed size binary arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout)
///
/// # Examples
///
/// Create an array from an iterable argument of byte slices.
///
/// ```
/// use arrow_array::{Array, FixedSizeBinaryArray};
/// let input_arg = vec![ vec![1, 2], vec![3, 4], vec![5, 6] ];
/// let arr = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap();
///
/// assert_eq!(3, arr.len());
///
/// ```
/// Create an array from an iterable argument of sparse byte slices.
/// Sparsity means that the input argument can contain `None` items.
/// ```
/// use arrow_array::{Array, FixedSizeBinaryArray};
/// let input_arg = vec![ None, Some(vec![7, 8]), Some(vec![9, 10]), None, Some(vec![13, 14]) ];
/// let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 2).unwrap();
/// assert_eq!(5, arr.len())
///
/// ```
///
#[derive(Clone)]
pub struct FixedSizeBinaryArray {
    data_type: DataType, // Must be DataType::FixedSizeBinary(value_length)
    value_data: Buffer, // Contiguous values, `value_length` bytes per element
    nulls: Option<NullBuffer>, // Validity bitmap; None when all values are valid
    len: usize, // Number of elements
    value_length: i32, // Fixed byte width of each element
}
impl FixedSizeBinaryArray {
/// Create a new [`FixedSizeBinaryArray`] with `size` element size, panicking on failure
///
/// # Panics
///
/// Panics if [`Self::try_new`] returns an error
pub fn new(size: i32, values: Buffer, nulls: Option<NullBuffer>) -> Self {
    Self::try_new(size, values, nulls).unwrap()
}
/// Create a new [`Scalar`] from `value`
pub fn new_scalar(value: impl AsRef<[u8]>) -> Scalar<Self> {
    let bytes = value.as_ref();
    // A scalar is a one-element array whose width is the value's length
    let array = Self::new(bytes.len() as _, Buffer::from(bytes), None);
    Scalar::new(array)
}
/// Create a new [`FixedSizeBinaryArray`] from the provided parts, returning an error on failure
///
/// # Errors
///
/// * `size < 0`
/// * `values.len() / size != nulls.len()`
pub fn try_new(
    size: i32,
    values: Buffer,
    nulls: Option<NullBuffer>,
) -> Result<Self, ArrowError> {
    let data_type = DataType::FixedSizeBinary(size);
    let s = size.to_usize().ok_or_else(|| {
        ArrowError::InvalidArgumentError(format!("Size cannot be negative, got {}", size))
    })?;
    // A width of zero would otherwise divide by zero below; the length cannot
    // be derived from the values in that case, so take it from the null
    // buffer (or zero when absent) instead of panicking.
    let len = if s == 0 {
        nulls.as_ref().map_or(0, |n| n.len())
    } else {
        values.len() / s
    };
    if let Some(n) = nulls.as_ref() {
        if n.len() != len {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Incorrect length of null buffer for FixedSizeBinaryArray, expected {} got {}",
                len,
                n.len(),
            )));
        }
    }
    Ok(Self {
        data_type,
        value_data: values,
        value_length: size,
        nulls,
        len,
    })
}
/// Create a new [`FixedSizeBinaryArray`] of length `len` where all values are null
///
/// # Panics
///
/// Panics if
///
/// * `size < 0`
/// * `size * len` would overflow `usize`
pub fn new_null(size: i32, len: usize) -> Self {
let capacity = size.to_usize().unwrap().checked_mul(len).unwrap();
Self {
data_type: DataType::FixedSizeBinary(size),
value_data: MutableBuffer::new(capacity).into(),
nulls: Some(NullBuffer::new_null(len)),
value_length: size,
len,
}
}
/// Deconstruct this array into its constituent parts
pub fn into_parts(self) -> (i32, Buffer, Option<NullBuffer>) {
(self.value_length, self.value_data, self.nulls)
}
/// Returns the element at index `i` as a byte slice.
/// # Panics
/// Panics if index `i` is out of bounds.
pub fn value(&self, i: usize) -> &[u8] {
    assert!(
        i < self.len(),
        "Trying to access an element at index {} from a FixedSizeBinaryArray of length {}",
        i,
        self.len()
    );
    let offset = i + self.offset();
    // SAFETY: bounds asserted above; each element spans exactly
    // `value_length` bytes starting at `value_offset_at(offset)`
    unsafe {
        let pos = self.value_offset_at(offset);
        std::slice::from_raw_parts(
            self.value_data.as_ptr().offset(pos as isize),
            (self.value_offset_at(offset + 1) - pos) as usize,
        )
    }
}
/// Returns the element at index `i` as a byte slice.
/// # Safety
/// Caller is responsible for ensuring that the index is within the bounds of the array
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
    let offset = i + self.offset();
    // The slice length is the distance between consecutive fixed offsets,
    // i.e. exactly `value_length` bytes
    let pos = self.value_offset_at(offset);
    std::slice::from_raw_parts(
        self.value_data.as_ptr().offset(pos as isize),
        (self.value_offset_at(offset + 1) - pos) as usize,
    )
}
/// Returns the offset for the element at index `i`.
///
/// Note this doesn't do any bound checking, for performance reason.
#[inline]
pub fn value_offset(&self, i: usize) -> i32 {
    self.value_offset_at(self.offset() + i)
}
/// Returns the length for an element.
///
/// All elements have the same length as the array is a fixed size.
#[inline]
pub fn value_length(&self) -> i32 {
    self.value_length
}
/// Returns the values of this array.
///
/// Unlike [`Self::value_data`] this returns the [`Buffer`]
/// allowing for zero-copy cloning.
#[inline]
pub fn values(&self) -> &Buffer {
    &self.value_data
}
/// Returns the raw value data as a byte slice.
pub fn value_data(&self) -> &[u8] {
    self.value_data.as_slice()
}
/// Returns a zero-copy slice of this array with the indicated offset and length.
pub fn slice(&self, offset: usize, len: usize) -> Self {
    assert!(
        offset.saturating_add(len) <= self.len,
        "the length + offset of the sliced FixedSizeBinaryArray cannot exceed the existing length"
    );
    // Element offsets scale by the fixed element width
    let size = self.value_length as usize;
    let value_data = self.value_data.slice_with_length(offset * size, len * size);
    let nulls = self.nulls.as_ref().map(|n| n.slice(offset, len));
    Self {
        data_type: self.data_type.clone(),
        nulls,
        value_length: self.value_length,
        value_data,
        len,
    }
}
/// Create an array from an iterable argument of sparse byte slices.
/// Sparsity means that items returned by the iterator are optional, i.e input argument can
/// contain `None` items.
///
/// # Examples
///
/// ```
/// use arrow_array::FixedSizeBinaryArray;
/// let input_arg = vec![
///     None,
///     Some(vec![7, 8]),
///     Some(vec![9, 10]),
///     None,
///     Some(vec![13, 14]),
///     None,
/// ];
/// let array = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap();
/// ```
///
/// # Errors
///
/// Returns error if argument has length zero, or sizes of nested slices don't match.
#[deprecated(
    note = "This function will fail if the iterator produces only None values; prefer `try_from_sparse_iter_with_size`"
)]
pub fn try_from_sparse_iter<T, U>(mut iter: T) -> Result<Self, ArrowError>
where
    T: Iterator<Item = Option<U>>,
    U: AsRef<[u8]>,
{
    let mut len = 0;
    // Element width is unknown until the first Some(..) item is seen
    let mut size = None;
    let mut byte = 0;
    let iter_size_hint = iter.size_hint().0;
    let mut null_buf = MutableBuffer::new(bit_util::ceil(iter_size_hint, 8));
    let mut buffer = MutableBuffer::new(0);
    // Count of leading None items seen before the width was known; their
    // zero-filled slots are appended retroactively
    let mut prepend = 0;
    iter.try_for_each(|item| -> Result<(), ArrowError> {
        // extend null bitmask by one byte per each 8 items
        if byte == 0 {
            null_buf.push(0u8);
            byte = 8;
        }
        byte -= 1;
        if let Some(slice) = item {
            let slice = slice.as_ref();
            if let Some(size) = size {
                if size != slice.len() {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "Nested array size mismatch: one is {}, and the other is {}",
                        size,
                        slice.len()
                    )));
                }
            } else {
                let len = slice.len();
                size = Some(len);
                // Now that we know how large each element is we can reserve
                // sufficient capacity in the underlying mutable buffer for
                // the data.
                buffer.reserve(iter_size_hint * len);
                // Backfill zero slots for the leading None items
                buffer.extend_zeros(slice.len() * prepend);
            }
            bit_util::set_bit(null_buf.as_slice_mut(), len);
            buffer.extend_from_slice(slice);
        } else if let Some(size) = size {
            buffer.extend_zeros(size);
        } else {
            prepend += 1;
        }
        len += 1;
        Ok(())
    })?;
    if len == 0 {
        return Err(ArrowError::InvalidArgumentError(
            "Input iterable argument has no data".to_owned(),
        ));
    }
    let null_buf = BooleanBuffer::new(null_buf.into(), 0, len);
    // Drop the bitmap entirely when there are no nulls
    let nulls = Some(NullBuffer::new(null_buf)).filter(|n| n.null_count() > 0);
    // NOTE(review): if every item was None, size falls back to 0 here while
    // `len > 0` — this is the deprecation reason stated above
    let size = size.unwrap_or(0) as i32;
    Ok(Self {
        data_type: DataType::FixedSizeBinary(size),
        value_data: buffer.into(),
        nulls,
        value_length: size,
        len,
    })
}
/// Create an array from an iterable argument of sparse byte slices.
/// Sparsity means that items returned by the iterator are optional, i.e input argument can
/// contain `None` items. In cases where the iterator returns only `None` values, this
/// also takes a size parameter to ensure that the a valid FixedSizeBinaryArray is still
/// created.
///
/// # Examples
///
/// ```
/// use arrow_array::FixedSizeBinaryArray;
/// let input_arg = vec![
///     None,
///     Some(vec![7, 8]),
///     Some(vec![9, 10]),
///     None,
///     Some(vec![13, 14]),
///     None,
/// ];
/// let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 2).unwrap();
/// ```
///
/// # Errors
///
/// Returns error if argument has length zero, or sizes of nested slices don't match.
pub fn try_from_sparse_iter_with_size<T, U>(mut iter: T, size: i32) -> Result<Self, ArrowError>
where
    T: Iterator<Item = Option<U>>,
    U: AsRef<[u8]>,
{
    let mut len = 0;
    let mut byte = 0;
    let iter_size_hint = iter.size_hint().0;
    let mut null_buf = MutableBuffer::new(bit_util::ceil(iter_size_hint, 8));
    // Width is known up front, so the data buffer can be pre-sized
    let mut buffer = MutableBuffer::new(iter_size_hint * (size as usize));
    iter.try_for_each(|item| -> Result<(), ArrowError> {
        // extend null bitmask by one byte per each 8 items
        if byte == 0 {
            null_buf.push(0u8);
            byte = 8;
        }
        byte -= 1;
        if let Some(slice) = item {
            let slice = slice.as_ref();
            if size as usize != slice.len() {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Nested array size mismatch: one is {}, and the other is {}",
                    size,
                    slice.len()
                )));
            }
            bit_util::set_bit(null_buf.as_slice_mut(), len);
            buffer.extend_from_slice(slice);
        } else {
            // Null slots are zero-filled to keep elements aligned
            buffer.extend_zeros(size as usize);
        }
        len += 1;
        Ok(())
    })?;
    let null_buf = BooleanBuffer::new(null_buf.into(), 0, len);
    // Drop the bitmap entirely when there are no nulls
    let nulls = Some(NullBuffer::new(null_buf)).filter(|n| n.null_count() > 0);
    Ok(Self {
        data_type: DataType::FixedSizeBinary(size),
        value_data: buffer.into(),
        nulls,
        len,
        value_length: size,
    })
}
/// Create an array from an iterable argument of byte slices.
///
/// # Examples
///
/// ```
/// use arrow_array::FixedSizeBinaryArray;
/// let input_arg = vec![
///     vec![1, 2],
///     vec![3, 4],
///     vec![5, 6],
/// ];
/// let array = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap();
/// ```
///
/// # Errors
///
/// Returns error if argument has length zero, or sizes of nested slices don't match.
pub fn try_from_iter<T, U>(mut iter: T) -> Result<Self, ArrowError>
where
    T: Iterator<Item = U>,
    U: AsRef<[u8]>,
{
    let mut len = 0;
    // Element width is inferred from the first item
    let mut size = None;
    let iter_size_hint = iter.size_hint().0;
    let mut buffer = MutableBuffer::new(0);
    iter.try_for_each(|item| -> Result<(), ArrowError> {
        let slice = item.as_ref();
        if let Some(size) = size {
            // All subsequent items must match the first item's length
            if size != slice.len() {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Nested array size mismatch: one is {}, and the other is {}",
                    size,
                    slice.len()
                )));
            }
        } else {
            let len = slice.len();
            size = Some(len);
            // Reserve once, now that the per-element width is known
            buffer.reserve(iter_size_hint * len);
        }
        buffer.extend_from_slice(slice);
        len += 1;
        Ok(())
    })?;
    if len == 0 {
        return Err(ArrowError::InvalidArgumentError(
            "Input iterable argument has no data".to_owned(),
        ));
    }
    let size = size.unwrap_or(0).try_into().unwrap();
    Ok(Self {
        data_type: DataType::FixedSizeBinary(size),
        value_data: buffer.into(),
        nulls: None,
        value_length: size,
        len,
    })
}
// Byte offset of element `i` within `value_data`.
// NOTE(review): `i as i32` and the multiplication can overflow for arrays
// holding more than i32::MAX bytes of values — confirm callers stay in range
#[inline]
fn value_offset_at(&self, i: usize) -> i32 {
    self.value_length * i as i32
}
/// Constructs a new iterator over the optional byte-slice values of this array
pub fn iter(&self) -> FixedSizeBinaryIter<'_> {
    FixedSizeBinaryIter::new(self)
}
}
impl From<ArrayData> for FixedSizeBinaryArray {
    fn from(data: ArrayData) -> Self {
        assert_eq!(
            data.buffers().len(),
            1,
            "FixedSizeBinaryArray data should contain 1 buffer only (values)"
        );
        let value_length = match data.data_type() {
            DataType::FixedSizeBinary(len) => *len,
            _ => panic!("Expected data type to be FixedSizeBinary"),
        };
        let size = value_length as usize;
        // Absorb the ArrayData offset into the values buffer so this struct
        // never needs to carry a separate offset field
        let value_data =
            data.buffers()[0].slice_with_length(data.offset() * size, data.len() * size);
        Self {
            data_type: data.data_type().clone(),
            nulls: data.nulls().cloned(),
            len: data.len(),
            value_data,
            value_length,
        }
    }
}
impl From<FixedSizeBinaryArray> for ArrayData {
    fn from(array: FixedSizeBinaryArray) -> Self {
        // Single values buffer plus the optional validity bitmap
        let builder = ArrayDataBuilder::new(array.data_type)
            .len(array.len)
            .nulls(array.nulls)
            .buffers(vec![array.value_data]);
        // SAFETY: invariants were validated when `array` was constructed
        unsafe { builder.build_unchecked() }
    }
}
/// Creates a `FixedSizeBinaryArray` from `FixedSizeList<u8>` array
impl From<FixedSizeListArray> for FixedSizeBinaryArray {
    fn from(v: FixedSizeListArray) -> Self {
        let value_len = v.value_length();
        let v = v.into_data();
        assert_eq!(
            v.child_data().len(),
            1,
            "FixedSizeBinaryArray can only be created from list array of u8 values \
             (i.e. FixedSizeList<PrimitiveArray<u8>>)."
        );
        let child_data = &v.child_data()[0];
        assert_eq!(
            child_data.child_data().len(),
            0,
            "FixedSizeBinaryArray can only be created from list array of u8 values \
             (i.e. FixedSizeList<PrimitiveArray<u8>>)."
        );
        assert_eq!(
            child_data.data_type(),
            &DataType::UInt8,
            "FixedSizeBinaryArray can only be created from FixedSizeList<u8> arrays, mismatched data types."
        );
        assert_eq!(
            child_data.null_count(),
            0,
            "The child array cannot contain null values."
        );
        // Reuse the child's u8 buffer directly (sliced to drop its offset);
        // validity comes from the list array itself
        let builder = ArrayData::builder(DataType::FixedSizeBinary(value_len))
            .len(v.len())
            .offset(v.offset())
            .add_buffer(child_data.buffers()[0].slice(child_data.offset()))
            .nulls(v.nulls().cloned());
        // SAFETY: the assertions above establish a valid FixedSizeBinary layout
        let data = unsafe { builder.build_unchecked() };
        Self::from(data)
    }
}
impl From<Vec<Option<&[u8]>>> for FixedSizeBinaryArray {
    fn from(v: Vec<Option<&[u8]>>) -> Self {
        // Kept on the deprecated constructor for backwards compatibility;
        // panics if slice lengths are inconsistent or the input is empty
        #[allow(deprecated)]
        Self::try_from_sparse_iter(v.into_iter()).unwrap()
    }
}
impl From<Vec<&[u8]>> for FixedSizeBinaryArray {
    fn from(v: Vec<&[u8]>) -> Self {
        Self::try_from_iter(v.into_iter()).unwrap()
    }
}
impl<const N: usize> From<Vec<&[u8; N]>> for FixedSizeBinaryArray {
    fn from(v: Vec<&[u8; N]>) -> Self {
        Self::try_from_iter(v.into_iter()).unwrap()
    }
}
impl std::fmt::Debug for FixedSizeBinaryArray {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // Header with the fixed width, then a (possibly truncated) listing
        writeln!(f, "FixedSizeBinaryArray<{}>", self.value_length())?;
        write!(f, "[\n")?;
        print_long_array(self, f, |array, index, f| {
            std::fmt::Debug::fmt(&array.value(index), f)
        })?;
        write!(f, "]")
    }
}
impl Array for FixedSizeBinaryArray {
fn as_any(&self) -> &dyn Any {
self
}
fn to_data(&self) -> ArrayData {
self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
fn data_type(&self) -> &DataType {
&self.data_type
}
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
fn len(&self) -> usize {
self.len
}
fn is_empty(&self) -> bool {
self.len == 0
}
fn offset(&self) -> usize {
0
}
fn nulls(&self) -> Option<&NullBuffer> {
self.nulls.as_ref()
}
fn get_buffer_memory_size(&self) -> usize {
let mut sum = self.value_data.capacity();
if let Some(n) = &self.nulls {
sum += n.buffer().capacity();
}
sum
}
fn get_array_memory_size(&self) -> usize {
std::mem::size_of::<Self>() + self.get_buffer_memory_size()
}
}
impl<'a> ArrayAccessor for &'a FixedSizeBinaryArray {
    type Item = &'a [u8];
    // Delegate to the inherent (panicking) accessor
    fn value(&self, index: usize) -> Self::Item {
        FixedSizeBinaryArray::value(self, index)
    }
    // Delegate to the inherent unchecked accessor; caller upholds bounds
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
        FixedSizeBinaryArray::value_unchecked(self, index)
    }
}
// Allows `for v in &array { ... }`, yielding `Option<&[u8]>` per element
impl<'a> IntoIterator for &'a FixedSizeBinaryArray {
    type Item = Option<&'a [u8]>;
    type IntoIter = FixedSizeBinaryIter<'a>;
    fn into_iter(self) -> Self::IntoIter {
        FixedSizeBinaryIter::<'a>::new(self)
    }
}
// Unit tests for FixedSizeBinaryArray: construction, slicing/offsets,
// conversions from FixedSizeList and Vec, sparse iterators, Debug output,
// and constructor validation errors.
#[cfg(test)]
mod tests {
    use crate::RecordBatch;
    use arrow_schema::{Field, Schema};
    use super::*;
    #[test]
    fn test_fixed_size_binary_array() {
        let values: [u8; 15] = *b"hellotherearrow";
        let array_data = ArrayData::builder(DataType::FixedSizeBinary(5))
            .len(3)
            .add_buffer(Buffer::from(&values[..]))
            .build()
            .unwrap();
        let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data);
        assert_eq!(3, fixed_size_binary_array.len());
        assert_eq!(0, fixed_size_binary_array.null_count());
        assert_eq!(
            [b'h', b'e', b'l', b'l', b'o'],
            fixed_size_binary_array.value(0)
        );
        assert_eq!(
            [b't', b'h', b'e', b'r', b'e'],
            fixed_size_binary_array.value(1)
        );
        assert_eq!(
            [b'a', b'r', b'r', b'o', b'w'],
            fixed_size_binary_array.value(2)
        );
        assert_eq!(5, fixed_size_binary_array.value_length());
        assert_eq!(10, fixed_size_binary_array.value_offset(2));
        for i in 0..3 {
            assert!(fixed_size_binary_array.is_valid(i));
            assert!(!fixed_size_binary_array.is_null(i));
        }
        // Test binary array with offset
        let array_data = ArrayData::builder(DataType::FixedSizeBinary(5))
            .len(2)
            .offset(1)
            .add_buffer(Buffer::from(&values[..]))
            .build()
            .unwrap();
        let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data);
        assert_eq!(
            [b't', b'h', b'e', b'r', b'e'],
            fixed_size_binary_array.value(0)
        );
        assert_eq!(
            [b'a', b'r', b'r', b'o', b'w'],
            fixed_size_binary_array.value(1)
        );
        assert_eq!(2, fixed_size_binary_array.len());
        // Note: offset is folded into the buffer, so value_offset(0) is 0.
        assert_eq!(0, fixed_size_binary_array.value_offset(0));
        assert_eq!(5, fixed_size_binary_array.value_length());
        assert_eq!(5, fixed_size_binary_array.value_offset(1));
    }
    #[test]
    fn test_fixed_size_binary_array_from_fixed_size_list_array() {
        let values = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
        let values_data = ArrayData::builder(DataType::UInt8)
            .len(12)
            .offset(2)
            .add_buffer(Buffer::from_slice_ref(values))
            .build()
            .unwrap();
        // [null, [10, 11, 12, 13]]
        let array_data = unsafe {
            ArrayData::builder(DataType::FixedSizeList(
                Arc::new(Field::new("item", DataType::UInt8, false)),
                4,
            ))
            .len(2)
            .offset(1)
            .add_child_data(values_data)
            .null_bit_buffer(Some(Buffer::from_slice_ref([0b101])))
            .build_unchecked()
        };
        let list_array = FixedSizeListArray::from(array_data);
        let binary_array = FixedSizeBinaryArray::from(list_array);
        assert_eq!(2, binary_array.len());
        assert_eq!(1, binary_array.null_count());
        assert!(binary_array.is_null(0));
        assert!(binary_array.is_valid(1));
        assert_eq!(&[10, 11, 12, 13], binary_array.value(1));
    }
    #[test]
    #[should_panic(
        expected = "FixedSizeBinaryArray can only be created from FixedSizeList<u8> arrays"
    )]
    // Different error messages, so skip for now
    // https://github.com/apache/arrow-rs/issues/1545
    #[cfg(not(feature = "force_validate"))]
    fn test_fixed_size_binary_array_from_incorrect_fixed_size_list_array() {
        let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
        let values_data = ArrayData::builder(DataType::UInt32)
            .len(12)
            .add_buffer(Buffer::from_slice_ref(values))
            .build()
            .unwrap();
        let array_data = unsafe {
            ArrayData::builder(DataType::FixedSizeList(
                Arc::new(Field::new("item", DataType::Binary, false)),
                4,
            ))
            .len(3)
            .add_child_data(values_data)
            .build_unchecked()
        };
        let list_array = FixedSizeListArray::from(array_data);
        drop(FixedSizeBinaryArray::from(list_array));
    }
    #[test]
    #[should_panic(expected = "The child array cannot contain null values.")]
    fn test_fixed_size_binary_array_from_fixed_size_list_array_with_child_nulls_failed() {
        let values = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
        let values_data = ArrayData::builder(DataType::UInt8)
            .len(12)
            .add_buffer(Buffer::from_slice_ref(values))
            .null_bit_buffer(Some(Buffer::from_slice_ref([0b101010101010])))
            .build()
            .unwrap();
        let array_data = unsafe {
            ArrayData::builder(DataType::FixedSizeList(
                Arc::new(Field::new("item", DataType::UInt8, false)),
                4,
            ))
            .len(3)
            .add_child_data(values_data)
            .build_unchecked()
        };
        let list_array = FixedSizeListArray::from(array_data);
        drop(FixedSizeBinaryArray::from(list_array));
    }
    #[test]
    fn test_fixed_size_binary_array_fmt_debug() {
        let values: [u8; 15] = *b"hellotherearrow";
        let array_data = ArrayData::builder(DataType::FixedSizeBinary(5))
            .len(3)
            .add_buffer(Buffer::from(&values[..]))
            .build()
            .unwrap();
        let arr = FixedSizeBinaryArray::from(array_data);
        // Pins the exact Debug representation, including newlines.
        assert_eq!(
            "FixedSizeBinaryArray<5>\n[\n  [104, 101, 108, 108, 111],\n  [116, 104, 101, 114, 101],\n  [97, 114, 114, 111, 119],\n]",
            format!("{arr:?}")
        );
    }
    #[test]
    fn test_fixed_size_binary_array_from_iter() {
        let input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]];
        let arr = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap();
        assert_eq!(2, arr.value_length());
        assert_eq!(3, arr.len())
    }
    #[test]
    fn test_all_none_fixed_size_binary_array_from_sparse_iter() {
        let none_option: Option<[u8; 32]> = None;
        let input_arg = vec![none_option, none_option, none_option];
        #[allow(deprecated)]
        let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap();
        // With no non-null values the inferred width defaults to 0.
        assert_eq!(0, arr.value_length());
        assert_eq!(3, arr.len())
    }
    #[test]
    fn test_fixed_size_binary_array_from_sparse_iter() {
        let input_arg = vec![
            None,
            Some(vec![7, 8]),
            Some(vec![9, 10]),
            None,
            Some(vec![13, 14]),
        ];
        #[allow(deprecated)]
        let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.iter().cloned()).unwrap();
        assert_eq!(2, arr.value_length());
        assert_eq!(5, arr.len());
        let arr =
            FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 2).unwrap();
        assert_eq!(2, arr.value_length());
        assert_eq!(5, arr.len());
    }
    #[test]
    fn test_fixed_size_binary_array_from_sparse_iter_with_size_all_none() {
        let input_arg = vec![None, None, None, None, None] as Vec<Option<Vec<u8>>>;
        let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 16)
            .unwrap();
        // Unlike try_from_sparse_iter, the explicit size is honored even
        // when every element is null.
        assert_eq!(16, arr.value_length());
        assert_eq!(5, arr.len())
    }
    #[test]
    fn test_fixed_size_binary_array_from_vec() {
        let values = vec!["one".as_bytes(), b"two", b"six", b"ten"];
        let array = FixedSizeBinaryArray::from(values);
        assert_eq!(array.len(), 4);
        assert_eq!(array.null_count(), 0);
        assert_eq!(array.value(0), b"one");
        assert_eq!(array.value(1), b"two");
        assert_eq!(array.value(2), b"six");
        assert_eq!(array.value(3), b"ten");
        assert!(!array.is_null(0));
        assert!(!array.is_null(1));
        assert!(!array.is_null(2));
        assert!(!array.is_null(3));
    }
    #[test]
    #[should_panic(expected = "Nested array size mismatch: one is 3, and the other is 5")]
    fn test_fixed_size_binary_array_from_vec_incorrect_length() {
        let values = vec!["one".as_bytes(), b"two", b"three", b"four"];
        let _ = FixedSizeBinaryArray::from(values);
    }
    #[test]
    fn test_fixed_size_binary_array_from_opt_vec() {
        let values = vec![
            Some("one".as_bytes()),
            Some(b"two"),
            None,
            Some(b"six"),
            Some(b"ten"),
        ];
        let array = FixedSizeBinaryArray::from(values);
        assert_eq!(array.len(), 5);
        assert_eq!(array.value(0), b"one");
        assert_eq!(array.value(1), b"two");
        assert_eq!(array.value(3), b"six");
        assert_eq!(array.value(4), b"ten");
        assert!(!array.is_null(0));
        assert!(!array.is_null(1));
        assert!(array.is_null(2));
        assert!(!array.is_null(3));
        assert!(!array.is_null(4));
    }
    #[test]
    #[should_panic(expected = "Nested array size mismatch: one is 3, and the other is 5")]
    fn test_fixed_size_binary_array_from_opt_vec_incorrect_length() {
        let values = vec![
            Some("one".as_bytes()),
            Some(b"two"),
            None,
            Some(b"three"),
            Some(b"four"),
        ];
        let _ = FixedSizeBinaryArray::from(values);
    }
    #[test]
    fn fixed_size_binary_array_all_null() {
        let data = vec![None] as Vec<Option<String>>;
        let array =
            FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0).unwrap();
        array
            .into_data()
            .validate_full()
            .expect("All null array has valid array data");
    }
    #[test]
    // Test for https://github.com/apache/arrow-rs/issues/1390
    fn fixed_size_binary_array_all_null_in_batch_with_schema() {
        let schema = Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]);
        let none_option: Option<[u8; 2]> = None;
        let item = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
            vec![none_option, none_option, none_option].into_iter(),
            2,
        )
        .unwrap();
        // Should not panic
        RecordBatch::try_new(Arc::new(schema), vec![Arc::new(item)]).unwrap();
    }
    #[test]
    #[should_panic(
        expected = "Trying to access an element at index 4 from a FixedSizeBinaryArray of length 3"
    )]
    fn test_fixed_size_binary_array_get_value_index_out_of_bound() {
        let values = vec![Some("one".as_bytes()), Some(b"two"), None];
        let array = FixedSizeBinaryArray::from(values);
        array.value(4);
    }
    #[test]
    fn test_constructors() {
        let buffer = Buffer::from_vec(vec![0_u8; 10]);
        let a = FixedSizeBinaryArray::new(2, buffer.clone(), None);
        assert_eq!(a.len(), 5);
        let nulls = NullBuffer::new_null(5);
        FixedSizeBinaryArray::new(2, buffer.clone(), Some(nulls));
        // 10 bytes / width 3 truncates to 3 elements.
        let a = FixedSizeBinaryArray::new(3, buffer.clone(), None);
        assert_eq!(a.len(), 3);
        let nulls = NullBuffer::new_null(3);
        FixedSizeBinaryArray::new(3, buffer.clone(), Some(nulls));
        let err = FixedSizeBinaryArray::try_new(-1, buffer.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Size cannot be negative, got -1"
        );
        let nulls = NullBuffer::new_null(3);
        let err = FixedSizeBinaryArray::try_new(2, buffer, Some(nulls)).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeBinaryArray, expected 5 got 3");
    }
}

Просмотреть файл

@ -0,0 +1,693 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::array::print_long_array;
use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder};
use crate::iterator::FixedSizeListIter;
use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType};
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::ArrowNativeType;
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, FieldRef};
use std::any::Any;
use std::sync::Arc;
/// An array of [fixed length lists], similar to JSON arrays
/// (e.g. `["A", "B"]`).
///
/// Lists are represented using a `values` child
/// array where each list has a fixed size of `value_length`.
///
/// Use [`FixedSizeListBuilder`] to construct a [`FixedSizeListArray`].
///
/// # Representation
///
/// A [`FixedSizeListArray`] can represent a list of values of any other
/// supported Arrow type. Each element of the `FixedSizeListArray` itself is
/// a list which may contain NULL and non-null values,
/// or may itself be NULL.
///
/// For example, this `FixedSizeListArray` stores lists of strings:
///
/// ```text
/// ┌─────────────┐
/// │ [A,B] │
/// ├─────────────┤
/// │ NULL │
/// ├─────────────┤
/// │ [C,NULL] │
/// └─────────────┘
/// ```
///
/// The `values` of this `FixedSizeListArray` are stored in a child
/// [`StringArray`] where logical null values take up `value_length` slots in the array
/// as shown in the following diagram. The logical values
/// are shown on the left, and the actual `FixedSizeListArray` encoding on the right
///
/// ```text
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─┐
/// ┌─────────────┐ │ ┌───┐ ┌───┐ ┌──────┐ │
/// │ [A,B] │ │ 1 │ │ │ 1 │ │ A │ │ 0
/// ├─────────────┤ │ ├───┤ ├───┤ ├──────┤ │
/// │ NULL │ │ 0 │ │ │ 1 │ │ B │ │ 1
/// ├─────────────┤ │ ├───┤ ├───┤ ├──────┤ │
/// │ [C,NULL] │ │ 1 │ │ │ 0 │ │ ???? │ │ 2
/// └─────────────┘ │ └───┘ ├───┤ ├──────┤ │
/// | │ 0 │ │ ???? │ │ 3
/// Logical Values │ Validity ├───┤ ├──────┤ │
/// (nulls) │ │ 1 │ │ C │ │ 4
/// │ ├───┤ ├──────┤ │
/// │ │ 0 │ │ ???? │ │ 5
/// │ └───┘ └──────┘ │
/// │ Values │
/// │ FixedSizeListArray (Array) │
/// └ ─ ─ ─ ─ ─ ─ ─ ─┘
/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
/// ```
///
/// # Example
///
/// ```
/// # use std::sync::Arc;
/// # use arrow_array::{Array, FixedSizeListArray, Int32Array};
/// # use arrow_data::ArrayData;
/// # use arrow_schema::{DataType, Field};
/// # use arrow_buffer::Buffer;
/// // Construct a value array
/// let value_data = ArrayData::builder(DataType::Int32)
///     .len(9)
///     .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8]))
///     .build()
///     .unwrap();
/// let list_data_type = DataType::FixedSizeList(
///     Arc::new(Field::new("item", DataType::Int32, false)),
///     3,
/// );
/// let list_data = ArrayData::builder(list_data_type.clone())
///     .len(3)
///     .add_child_data(value_data.clone())
///     .build()
///     .unwrap();
/// let list_array = FixedSizeListArray::from(list_data);
/// let list0 = list_array.value(0);
/// let list1 = list_array.value(1);
/// let list2 = list_array.value(2);
///
/// assert_eq!( &[0, 1, 2], list0.as_any().downcast_ref::<Int32Array>().unwrap().values());
/// assert_eq!( &[3, 4, 5], list1.as_any().downcast_ref::<Int32Array>().unwrap().values());
/// assert_eq!( &[6, 7, 8], list2.as_any().downcast_ref::<Int32Array>().unwrap().values());
/// ```
///
/// [`StringArray`]: crate::array::StringArray
/// [fixed length lists]: https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout
#[derive(Clone)]
pub struct FixedSizeListArray {
    data_type: DataType, // Must be DataType::FixedSizeList(value_length)
    /// Child array holding the flattened list values
    values: ArrayRef,
    /// Validity mask for the list elements themselves, if any
    nulls: Option<NullBuffer>,
    /// Fixed number of child values in each list element
    value_length: i32,
    /// Number of list elements in this array
    len: usize,
}
impl FixedSizeListArray {
    /// Create a new [`FixedSizeListArray`] with `size` element size, panicking on failure
    ///
    /// # Panics
    ///
    /// Panics if [`Self::try_new`] returns an error
    pub fn new(field: FieldRef, size: i32, values: ArrayRef, nulls: Option<NullBuffer>) -> Self {
        Self::try_new(field, size, values, nulls).unwrap()
    }
    /// Create a new [`FixedSizeListArray`] from the provided parts, returning an error on failure
    ///
    /// # Errors
    ///
    /// * `size < 0`
    /// * `values.len() / size != nulls.len()`
    /// * `values.data_type() != field.data_type()`
    /// * `!field.is_nullable() && !nulls.expand(size).contains(values.logical_nulls())`
    pub fn try_new(
        field: FieldRef,
        size: i32,
        values: ArrayRef,
        nulls: Option<NullBuffer>,
    ) -> Result<Self, ArrowError> {
        // A negative `size` fails the usize conversion, producing the error.
        let s = size.to_usize().ok_or_else(|| {
            ArrowError::InvalidArgumentError(format!("Size cannot be negative, got {}", size))
        })?;
        let len = match s {
            // Zero-sized lists: the length cannot be derived from `values`,
            // so it comes from the null buffer (or defaults to 0).
            0 => nulls.as_ref().map(|x| x.len()).unwrap_or_default(),
            _ => {
                // `s >= 1` in this arm, so `.max(1)` is a defensive no-op.
                let len = values.len() / s.max(1);
                if let Some(n) = nulls.as_ref() {
                    if n.len() != len {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Incorrect length of null buffer for FixedSizeListArray, expected {} got {}",
                            len,
                            n.len(),
                        )));
                    }
                }
                len
            }
        };
        if field.data_type() != values.data_type() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "FixedSizeListArray expected data type {} got {} for {:?}",
                field.data_type(),
                values.data_type(),
                field.name()
            )));
        }
        // A non-nullable child field must not expose unmasked nulls: every
        // child null must fall inside a list that is itself null.
        if let Some(a) = values.logical_nulls() {
            let nulls_valid = field.is_nullable()
                || nulls
                    .as_ref()
                    .map(|n| n.expand(size as _).contains(&a))
                    .unwrap_or_default()
                || (nulls.is_none() && a.null_count() == 0);
            if !nulls_valid {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Found unmasked nulls for non-nullable FixedSizeListArray field {:?}",
                    field.name()
                )));
            }
        }
        let data_type = DataType::FixedSizeList(field, size);
        Ok(Self {
            data_type,
            values,
            value_length: size,
            nulls,
            len,
        })
    }
    /// Create a new [`FixedSizeListArray`] of length `len` where all values are null
    ///
    /// # Panics
    ///
    /// Panics if
    ///
    /// * `size < 0`
    /// * `size * len` would overflow `usize`
    pub fn new_null(field: FieldRef, size: i32, len: usize) -> Self {
        let capacity = size.to_usize().unwrap().checked_mul(len).unwrap();
        Self {
            values: make_array(ArrayData::new_null(field.data_type(), capacity)),
            data_type: DataType::FixedSizeList(field, size),
            nulls: Some(NullBuffer::new_null(len)),
            value_length: size,
            len,
        }
    }
    /// Deconstruct this array into its constituent parts
    pub fn into_parts(self) -> (FieldRef, i32, ArrayRef, Option<NullBuffer>) {
        // data_type is always FixedSizeList by construction; see try_new.
        let f = match self.data_type {
            DataType::FixedSizeList(f, _) => f,
            _ => unreachable!(),
        };
        (f, self.value_length, self.values, self.nulls)
    }
    /// Returns a reference to the values of this list.
    pub fn values(&self) -> &ArrayRef {
        &self.values
    }
    /// Returns a clone of the value type of this list.
    pub fn value_type(&self) -> DataType {
        self.values.data_type().clone()
    }
    /// Returns ith value of this list array.
    pub fn value(&self, i: usize) -> ArrayRef {
        self.values
            .slice(self.value_offset_at(i), self.value_length() as usize)
    }
    /// Returns the offset for value at index `i`.
    ///
    /// Note this doesn't do any bound checking, for performance reason.
    #[inline]
    pub fn value_offset(&self, i: usize) -> i32 {
        self.value_offset_at(i) as i32
    }
    /// Returns the length for an element.
    ///
    /// All elements have the same length as the array is a fixed size.
    #[inline]
    pub const fn value_length(&self) -> i32 {
        self.value_length
    }
    // Offset of element `i` into the child values array (no bounds check).
    #[inline]
    const fn value_offset_at(&self, i: usize) -> usize {
        i * self.value_length as usize
    }
    /// Returns a zero-copy slice of this array with the indicated offset and length.
    pub fn slice(&self, offset: usize, len: usize) -> Self {
        assert!(
            offset.saturating_add(len) <= self.len,
            "the length + offset of the sliced FixedSizeListArray cannot exceed the existing length"
        );
        // The child values are sliced in units of `value_length`, so the
        // resulting array needs no separate offset.
        let size = self.value_length as usize;
        Self {
            data_type: self.data_type.clone(),
            values: self.values.slice(offset * size, len * size),
            nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)),
            value_length: self.value_length,
            len,
        }
    }
    /// Creates a [`FixedSizeListArray`] from an iterator of primitive values
    /// # Example
    /// ```
    /// # use arrow_array::FixedSizeListArray;
    /// # use arrow_array::types::Int32Type;
    ///
    /// let data = vec![
    ///    Some(vec![Some(0), Some(1), Some(2)]),
    ///    None,
    ///    Some(vec![Some(3), None, Some(5)]),
    ///    Some(vec![Some(6), Some(7), Some(45)]),
    /// ];
    /// let list_array = FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(data, 3);
    /// println!("{:?}", list_array);
    /// ```
    pub fn from_iter_primitive<T, P, I>(iter: I, length: i32) -> Self
    where
        T: ArrowPrimitiveType,
        P: IntoIterator<Item = Option<<T as ArrowPrimitiveType>::Native>>,
        I: IntoIterator<Item = Option<P>>,
    {
        let l = length as usize;
        let iter = iter.into_iter();
        // Pre-size the builder from the iterator's lower size hint.
        let size_hint = iter.size_hint().0;
        let mut builder = FixedSizeListBuilder::with_capacity(
            PrimitiveBuilder::<T>::with_capacity(size_hint * l),
            length,
            size_hint,
        );
        for i in iter {
            match i {
                Some(p) => {
                    for t in p {
                        builder.values().append_option(t);
                    }
                    builder.append(true);
                }
                None => {
                    // A null list still occupies `l` child slots.
                    builder.values().append_nulls(l);
                    builder.append(false)
                }
            }
        }
        builder.finish()
    }
    /// constructs a new iterator
    pub fn iter(&self) -> FixedSizeListIter<'_> {
        FixedSizeListIter::new(self)
    }
}
impl From<ArrayData> for FixedSizeListArray {
    /// Builds a [`FixedSizeListArray`] from generic [`ArrayData`].
    ///
    /// # Panics
    ///
    /// Panics if `data` is not of type [`DataType::FixedSizeList`].
    fn from(data: ArrayData) -> Self {
        let value_length = match data.data_type() {
            DataType::FixedSizeList(_, len) => *len,
            _ => {
                panic!("FixedSizeListArray data should contain a FixedSizeList data type")
            }
        };
        // Fold the ArrayData offset into the child values, so the resulting
        // array's logical offset is always 0 (see Array::offset).
        let size = value_length as usize;
        let values =
            make_array(data.child_data()[0].slice(data.offset() * size, data.len() * size));
        Self {
            data_type: data.data_type().clone(),
            values,
            nulls: data.nulls().cloned(),
            value_length,
            len: data.len(),
        }
    }
}
impl From<FixedSizeListArray> for ArrayData {
    fn from(array: FixedSizeListArray) -> Self {
        let builder = ArrayDataBuilder::new(array.data_type)
            .len(array.len)
            .nulls(array.nulls)
            .child_data(vec![array.values.to_data()]);
        // SAFETY: the fields come from a valid FixedSizeListArray, whose
        // constructors (try_new / From<ArrayData>) uphold the invariants
        // ArrayData requires, so validation can be skipped.
        unsafe { builder.build_unchecked() }
    }
}
impl Array for FixedSizeListArray {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn to_data(&self) -> ArrayData {
        self.clone().into()
    }
    fn into_data(self) -> ArrayData {
        self.into()
    }
    fn data_type(&self) -> &DataType {
        &self.data_type
    }
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
        // Delegates to the inherent `slice` and boxes the result.
        Arc::new(self.slice(offset, length))
    }
    fn len(&self) -> usize {
        self.len
    }
    fn is_empty(&self) -> bool {
        self.len == 0
    }
    fn offset(&self) -> usize {
        // Always 0: slicing folds the offset into `values` and `nulls`
        // directly (see the inherent `slice` above).
        0
    }
    fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }
    fn get_buffer_memory_size(&self) -> usize {
        // Child buffer memory plus the null buffer, if present.
        let mut size = self.values.get_buffer_memory_size();
        if let Some(n) = self.nulls.as_ref() {
            size += n.buffer().capacity();
        }
        size
    }
    fn get_array_memory_size(&self) -> usize {
        let mut size = std::mem::size_of::<Self>() + self.values.get_array_memory_size();
        if let Some(n) = self.nulls.as_ref() {
            size += n.buffer().capacity();
        }
        size
    }
}
impl ArrayAccessor for FixedSizeListArray {
    type Item = ArrayRef;
    fn value(&self, index: usize) -> Self::Item {
        FixedSizeListArray::value(self, index)
    }
    // NOTE(review): delegates to the checked `value`, which is sound (if
    // conservative) for an `_unchecked` method — no UB even on bad input.
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
        FixedSizeListArray::value(self, index)
    }
}
impl std::fmt::Debug for FixedSizeListArray {
    /// Renders the array as `FixedSizeListArray<size>` followed by a
    /// bracketed, possibly truncated, listing of its elements.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        writeln!(f, "FixedSizeListArray<{}>\n[", self.value_length())?;
        print_long_array(self, f, |array, index, f| std::fmt::Debug::fmt(&array.value(index), f))?;
        f.write_str("]")
    }
}
impl<'a> ArrayAccessor for &'a FixedSizeListArray {
    type Item = ArrayRef;
    fn value(&self, index: usize) -> Self::Item {
        FixedSizeListArray::value(self, index)
    }
    // NOTE(review): same as the by-value impl — delegates to the checked
    // `value`, so no safety obligation actually falls on the caller here.
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
        FixedSizeListArray::value(self, index)
    }
}
// Unit tests for FixedSizeListArray: construction from ArrayData (with and
// without offsets), slicing with nulls, out-of-bounds access, constructor
// validation errors, and the zero-size edge case.
#[cfg(test)]
mod tests {
    use arrow_buffer::{bit_util, BooleanBuffer, Buffer};
    use arrow_schema::Field;
    use crate::cast::AsArray;
    use crate::types::Int32Type;
    use crate::{new_empty_array, Int32Array};
    use super::*;
    #[test]
    fn test_fixed_size_list_array() {
        // Construct a value array
        let value_data = ArrayData::builder(DataType::Int32)
            .len(9)
            .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8]))
            .build()
            .unwrap();
        // Construct a list array from the above two
        let list_data_type =
            DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3);
        let list_data = ArrayData::builder(list_data_type.clone())
            .len(3)
            .add_child_data(value_data.clone())
            .build()
            .unwrap();
        let list_array = FixedSizeListArray::from(list_data);
        assert_eq!(value_data, list_array.values().to_data());
        assert_eq!(DataType::Int32, list_array.value_type());
        assert_eq!(3, list_array.len());
        assert_eq!(0, list_array.null_count());
        assert_eq!(6, list_array.value_offset(2));
        assert_eq!(3, list_array.value_length());
        assert_eq!(0, list_array.value(0).as_primitive::<Int32Type>().value(0));
        for i in 0..3 {
            assert!(list_array.is_valid(i));
            assert!(!list_array.is_null(i));
        }
        // Now test with a non-zero offset
        // (the offset is folded into the child values on construction)
        let list_data = ArrayData::builder(list_data_type)
            .len(2)
            .offset(1)
            .add_child_data(value_data.clone())
            .build()
            .unwrap();
        let list_array = FixedSizeListArray::from(list_data);
        assert_eq!(value_data.slice(3, 6), list_array.values().to_data());
        assert_eq!(DataType::Int32, list_array.value_type());
        assert_eq!(2, list_array.len());
        assert_eq!(0, list_array.null_count());
        assert_eq!(3, list_array.value(0).as_primitive::<Int32Type>().value(0));
        assert_eq!(3, list_array.value_offset(1));
        assert_eq!(3, list_array.value_length());
    }
    #[test]
    #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")]
    // Different error messages, so skip for now
    // https://github.com/apache/arrow-rs/issues/1545
    #[cfg(not(feature = "force_validate"))]
    fn test_fixed_size_list_array_unequal_children() {
        // Construct a value array
        let value_data = ArrayData::builder(DataType::Int32)
            .len(8)
            .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7]))
            .build()
            .unwrap();
        // Construct a list array from the above two
        let list_data_type =
            DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3);
        let list_data = unsafe {
            ArrayData::builder(list_data_type)
                .len(3)
                .add_child_data(value_data)
                .build_unchecked()
        };
        drop(FixedSizeListArray::from(list_data));
    }
    #[test]
    fn test_fixed_size_list_array_slice() {
        // Construct a value array
        let value_data = ArrayData::builder(DataType::Int32)
            .len(10)
            .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
            .build()
            .unwrap();
        // Set null buts for the nested array:
        //  [[0, 1], null, null, [6, 7], [8, 9]]
        // 01011001 00000001
        let mut null_bits: [u8; 1] = [0; 1];
        bit_util::set_bit(&mut null_bits, 0);
        bit_util::set_bit(&mut null_bits, 3);
        bit_util::set_bit(&mut null_bits, 4);
        // Construct a fixed size list array from the above two
        let list_data_type =
            DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 2);
        let list_data = ArrayData::builder(list_data_type)
            .len(5)
            .add_child_data(value_data.clone())
            .null_bit_buffer(Some(Buffer::from(null_bits)))
            .build()
            .unwrap();
        let list_array = FixedSizeListArray::from(list_data);
        assert_eq!(value_data, list_array.values().to_data());
        assert_eq!(DataType::Int32, list_array.value_type());
        assert_eq!(5, list_array.len());
        assert_eq!(2, list_array.null_count());
        assert_eq!(6, list_array.value_offset(3));
        assert_eq!(2, list_array.value_length());
        let sliced_array = list_array.slice(1, 4);
        assert_eq!(4, sliced_array.len());
        assert_eq!(2, sliced_array.null_count());
        // Validity of the slice matches the original bits shifted by 1.
        for i in 0..sliced_array.len() {
            if bit_util::get_bit(&null_bits, 1 + i) {
                assert!(sliced_array.is_valid(i));
            } else {
                assert!(sliced_array.is_null(i));
            }
        }
        // Check offset and length for each non-null value.
        let sliced_list_array = sliced_array
            .as_any()
            .downcast_ref::<FixedSizeListArray>()
            .unwrap();
        assert_eq!(2, sliced_list_array.value_length());
        assert_eq!(4, sliced_list_array.value_offset(2));
        assert_eq!(6, sliced_list_array.value_offset(3));
    }
    #[test]
    #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")]
    fn test_fixed_size_list_array_index_out_of_bound() {
        // Construct a value array
        let value_data = ArrayData::builder(DataType::Int32)
            .len(10)
            .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
            .build()
            .unwrap();
        // Set null buts for the nested array:
        //  [[0, 1], null, null, [6, 7], [8, 9]]
        // 01011001 00000001
        let mut null_bits: [u8; 1] = [0; 1];
        bit_util::set_bit(&mut null_bits, 0);
        bit_util::set_bit(&mut null_bits, 3);
        bit_util::set_bit(&mut null_bits, 4);
        // Construct a fixed size list array from the above two
        let list_data_type =
            DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 2);
        let list_data = ArrayData::builder(list_data_type)
            .len(5)
            .add_child_data(value_data)
            .null_bit_buffer(Some(Buffer::from(null_bits)))
            .build()
            .unwrap();
        let list_array = FixedSizeListArray::from(list_data);
        list_array.value(10);
    }
    #[test]
    fn test_fixed_size_list_constructors() {
        let values = Arc::new(Int32Array::from_iter([
            Some(1),
            Some(2),
            None,
            None,
            Some(3),
            Some(4),
        ]));
        let field = Arc::new(Field::new("item", DataType::Int32, true));
        let list = FixedSizeListArray::new(field.clone(), 2, values.clone(), None);
        assert_eq!(list.len(), 3);
        let nulls = NullBuffer::new_null(3);
        let list = FixedSizeListArray::new(field.clone(), 2, values.clone(), Some(nulls));
        assert_eq!(list.len(), 3);
        // 6 values / size 4 truncates to 1 element.
        let list = FixedSizeListArray::new(field.clone(), 4, values.clone(), None);
        assert_eq!(list.len(), 1);
        let err = FixedSizeListArray::try_new(field.clone(), -1, values.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Size cannot be negative, got -1"
        );
        let list = FixedSizeListArray::new(field.clone(), 0, values.clone(), None);
        assert_eq!(list.len(), 0);
        let nulls = NullBuffer::new_null(2);
        let err = FixedSizeListArray::try_new(field, 2, values.clone(), Some(nulls)).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeListArray, expected 3 got 2");
        let field = Arc::new(Field::new("item", DataType::Int32, false));
        let err = FixedSizeListArray::try_new(field.clone(), 2, values.clone(), None).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Found unmasked nulls for non-nullable FixedSizeListArray field \"item\"");
        // Valid as nulls in child masked by parent
        let nulls = NullBuffer::new(BooleanBuffer::new(vec![0b0000101].into(), 0, 3));
        FixedSizeListArray::new(field, 2, values.clone(), Some(nulls));
        let field = Arc::new(Field::new("item", DataType::Int64, true));
        let err = FixedSizeListArray::try_new(field, 2, values, None).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: FixedSizeListArray expected data type Int64 got Int32 for \"item\"");
    }
    #[test]
    fn empty_fixed_size_list() {
        // Zero-size lists take their length from the null buffer.
        let field = Arc::new(Field::new("item", DataType::Int32, true));
        let nulls = NullBuffer::new_null(2);
        let values = new_empty_array(&DataType::Int32);
        let list = FixedSizeListArray::new(field.clone(), 0, values, Some(nulls));
        assert_eq!(list.len(), 2);
    }
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,801 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::array::{get_offsets, print_long_array};
use crate::iterator::MapArrayIter;
use crate::{make_array, Array, ArrayAccessor, ArrayRef, ListArray, StringArray, StructArray};
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ToByteSlice};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, Field, FieldRef};
use std::any::Any;
use std::sync::Arc;
/// An array of key-value maps
///
/// Keys should always be non-null, but values can be null.
///
/// [`MapArray`] is physically a [`ListArray`] of key-value pairs stored as an `entries`
/// [`StructArray`] with 2 child fields.
///
/// See [`MapBuilder`](crate::builder::MapBuilder) for how to construct a [`MapArray`]
#[derive(Clone)]
pub struct MapArray {
    /// The data type; always `DataType::Map(field, ordered)` (see `try_new`)
    data_type: DataType,
    /// The validity mask of the map elements, if any
    nulls: Option<NullBuffer>,
    /// The [`StructArray`] that is the direct child of this array
    entries: StructArray,
    /// The start and end offsets of each entry
    value_offsets: OffsetBuffer<i32>,
}
impl MapArray {
    /// Create a new [`MapArray`] from the provided parts
    ///
    /// See [`MapBuilder`](crate::builder::MapBuilder) for a higher-level interface
    /// to construct a [`MapArray`]
    ///
    /// # Errors
    ///
    /// Errors if
    ///
    /// * `offsets.len() - 1 != nulls.len()`
    /// * `offsets.last() > entries.len()`
    /// * `field.is_nullable()`
    /// * `entries.null_count() != 0`
    /// * `entries.columns().len() != 2`
    /// * `field.data_type() != entries.data_type()`
    pub fn try_new(
        field: FieldRef,
        offsets: OffsetBuffer<i32>,
        entries: StructArray,
        nulls: Option<NullBuffer>,
        ordered: bool,
    ) -> Result<Self, ArrowError> {
        let len = offsets.len() - 1; // Offsets guaranteed to not be empty
        let end_offset = offsets.last().unwrap().as_usize();
        // don't need to check other values of `offsets` because they are checked
        // during construction of `OffsetBuffer`
        if end_offset > entries.len() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Max offset of {end_offset} exceeds length of entries {}",
                entries.len()
            )));
        }
        // The null buffer covers the maps, so it must have one bit per map
        if let Some(n) = nulls.as_ref() {
            if n.len() != len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Incorrect length of null buffer for MapArray, expected {len} got {}",
                    n.len(),
                )));
            }
        }
        // The entry structs themselves (key-value pairs) may never be null
        if field.is_nullable() || entries.null_count() != 0 {
            return Err(ArrowError::InvalidArgumentError(
                "MapArray entries cannot contain nulls".to_string(),
            ));
        }
        // The declared entries field must agree with the actual child array
        if field.data_type() != entries.data_type() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "MapArray expected data type {} got {} for {:?}",
                field.data_type(),
                entries.data_type(),
                field.name()
            )));
        }
        // The entries struct holds exactly a keys column and a values column
        if entries.columns().len() != 2 {
            return Err(ArrowError::InvalidArgumentError(format!(
                "MapArray entries must contain two children, got {}",
                entries.columns().len()
            )));
        }
        Ok(Self {
            data_type: DataType::Map(field, ordered),
            nulls,
            entries,
            value_offsets: offsets,
        })
    }
    /// Create a new [`MapArray`] from the provided parts
    ///
    /// See [`MapBuilder`](crate::builder::MapBuilder) for a higher-level interface
    /// to construct a [`MapArray`]
    ///
    /// # Panics
    ///
    /// Panics if [`Self::try_new`] returns an error
    pub fn new(
        field: FieldRef,
        offsets: OffsetBuffer<i32>,
        entries: StructArray,
        nulls: Option<NullBuffer>,
        ordered: bool,
    ) -> Self {
        Self::try_new(field, offsets, entries, nulls, ordered).unwrap()
    }
    /// Deconstruct this array into its constituent parts
    ///
    /// Returns `(entries field, offsets, entries, nulls, ordered)` — the same
    /// values accepted by [`Self::try_new`]
    pub fn into_parts(
        self,
    ) -> (
        FieldRef,
        OffsetBuffer<i32>,
        StructArray,
        Option<NullBuffer>,
        bool,
    ) {
        let (f, ordered) = match self.data_type {
            DataType::Map(f, ordered) => (f, ordered),
            // `data_type` is always DataType::Map by construction
            _ => unreachable!(),
        };
        (f, self.value_offsets, self.entries, self.nulls, ordered)
    }
    /// Returns a reference to the offsets of this map
    ///
    /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`]
    /// allowing for zero-copy cloning
    #[inline]
    pub fn offsets(&self) -> &OffsetBuffer<i32> {
        &self.value_offsets
    }
    /// Returns a reference to the keys of this map
    pub fn keys(&self) -> &ArrayRef {
        self.entries.column(0)
    }
    /// Returns a reference to the values of this map
    pub fn values(&self) -> &ArrayRef {
        self.entries.column(1)
    }
    /// Returns a reference to the [`StructArray`] entries of this map
    pub fn entries(&self) -> &StructArray {
        &self.entries
    }
    /// Returns the data type of the map's keys.
    pub fn key_type(&self) -> &DataType {
        self.keys().data_type()
    }
    /// Returns the data type of the map's values.
    pub fn value_type(&self) -> &DataType {
        self.values().data_type()
    }
    /// Returns ith value of this map array.
    ///
    /// # Safety
    /// Caller must ensure that the index is within the array bounds
    pub unsafe fn value_unchecked(&self, i: usize) -> StructArray {
        // SAFETY: `i + 1 < offsets.len()` is guaranteed by the caller
        let end = *self.value_offsets().get_unchecked(i + 1);
        let start = *self.value_offsets().get_unchecked(i);
        self.entries
            .slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap())
    }
    /// Returns ith value of this map array.
    ///
    /// This is a [`StructArray`] containing two fields
    pub fn value(&self, i: usize) -> StructArray {
        let end = self.value_offsets()[i + 1] as usize;
        let start = self.value_offsets()[i] as usize;
        self.entries.slice(start, end - start)
    }
    /// Returns the offset values in the offsets buffer
    #[inline]
    pub fn value_offsets(&self) -> &[i32] {
        &self.value_offsets
    }
    /// Returns the length for value at index `i`.
    #[inline]
    pub fn value_length(&self, i: usize) -> i32 {
        let offsets = self.value_offsets();
        offsets[i + 1] - offsets[i]
    }
    /// Returns a zero-copy slice of this array with the indicated offset and length.
    pub fn slice(&self, offset: usize, length: usize) -> Self {
        Self {
            data_type: self.data_type.clone(),
            nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
            // entries are shared, not sliced: the sliced offsets window into them
            entries: self.entries.clone(),
            value_offsets: self.value_offsets.slice(offset, length),
        }
    }
    /// constructs a new iterator
    pub fn iter(&self) -> MapArrayIter<'_> {
        MapArrayIter::new(self)
    }
}
impl From<ArrayData> for MapArray {
    /// Converts [`ArrayData`] into a [`MapArray`], panicking if the data does
    /// not describe a valid map layout
    fn from(data: ArrayData) -> Self {
        let parsed = Self::try_new_from_array_data(data);
        parsed.expect("Expected infallible creation of MapArray from ArrayData failed")
    }
}
impl From<MapArray> for ArrayData {
fn from(array: MapArray) -> Self {
let len = array.len();
let builder = ArrayDataBuilder::new(array.data_type)
.len(len)
.nulls(array.nulls)
.buffers(vec![array.value_offsets.into_inner().into_inner()])
.child_data(vec![array.entries.to_data()]);
unsafe { builder.build_unchecked() }
}
}
impl MapArray {
    /// Fallibly create a [`MapArray`] from [`ArrayData`]
    ///
    /// Validates that `data` is a [`DataType::Map`] with exactly one buffer
    /// (the value offsets) and a single two-field [`StructArray`] child.
    ///
    /// # Errors
    ///
    /// Errors if the data type, buffer count or child layout is not that of a map
    fn try_new_from_array_data(data: ArrayData) -> Result<Self, ArrowError> {
        if !matches!(data.data_type(), DataType::Map(_, _)) {
            return Err(ArrowError::InvalidArgumentError(format!(
                "MapArray expected ArrayData with DataType::Map got {}",
                data.data_type()
            )));
        }
        if data.buffers().len() != 1 {
            // Fixed: previously this reported `data.len()` (the number of maps),
            // not the offending buffer count the message describes
            return Err(ArrowError::InvalidArgumentError(format!(
                "MapArray data should contain a single buffer only (value offsets), had {}",
                data.buffers().len()
            )));
        }
        if data.child_data().len() != 1 {
            return Err(ArrowError::InvalidArgumentError(format!(
                "MapArray should contain a single child array (values array), had {}",
                data.child_data().len()
            )));
        }
        let entries = data.child_data()[0].clone();
        if let DataType::Struct(fields) = entries.data_type() {
            if fields.len() != 2 {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "MapArray should contain a struct array with 2 fields, have {} fields",
                    fields.len()
                )));
            }
        } else {
            return Err(ArrowError::InvalidArgumentError(format!(
                "MapArray should contain a struct array child, found {:?}",
                entries.data_type()
            )));
        }
        let entries = entries.into();
        // SAFETY:
        // ArrayData is valid, and verified type above
        let value_offsets = unsafe { get_offsets(&data) };
        Ok(Self {
            data_type: data.data_type().clone(),
            nulls: data.nulls().cloned(),
            entries,
            value_offsets,
        })
    }
    /// Creates map array from provided keys, values and entry_offsets.
    ///
    /// Keys are stored as non-nullable `Utf8`; the values field is marked
    /// nullable only when `values` actually contains nulls.
    ///
    /// # Errors
    ///
    /// Errors if the resulting [`ArrayData`] fails validation
    /// (e.g. offsets out of range for the provided entries)
    pub fn new_from_strings<'a>(
        keys: impl Iterator<Item = &'a str>,
        values: &dyn Array,
        entry_offsets: &[u32],
    ) -> Result<Self, ArrowError> {
        let entry_offsets_buffer = Buffer::from(entry_offsets.to_byte_slice());
        let keys_data = StringArray::from_iter_values(keys);
        let keys_field = Arc::new(Field::new("keys", DataType::Utf8, false));
        let values_field = Arc::new(Field::new(
            "values",
            values.data_type().clone(),
            values.null_count() > 0,
        ));
        let entry_struct = StructArray::from(vec![
            (keys_field, Arc::new(keys_data) as ArrayRef),
            (values_field, make_array(values.to_data())),
        ]);
        let map_data_type = DataType::Map(
            Arc::new(Field::new(
                "entries",
                entry_struct.data_type().clone(),
                false,
            )),
            false, // keys are not declared as sorted
        );
        let map_data = ArrayData::builder(map_data_type)
            .len(entry_offsets.len() - 1)
            .add_buffer(entry_offsets_buffer)
            .add_child_data(entry_struct.into_data())
            .build()?;
        Ok(MapArray::from(map_data))
    }
}
impl Array for MapArray {
fn as_any(&self) -> &dyn Any {
self
}
fn to_data(&self) -> ArrayData {
self.clone().into_data()
}
fn into_data(self) -> ArrayData {
self.into()
}
fn data_type(&self) -> &DataType {
&self.data_type
}
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
fn len(&self) -> usize {
self.value_offsets.len() - 1
}
fn is_empty(&self) -> bool {
self.value_offsets.len() <= 1
}
fn offset(&self) -> usize {
0
}
fn nulls(&self) -> Option<&NullBuffer> {
self.nulls.as_ref()
}
fn get_buffer_memory_size(&self) -> usize {
let mut size = self.entries.get_buffer_memory_size();
size += self.value_offsets.inner().inner().capacity();
if let Some(n) = self.nulls.as_ref() {
size += n.buffer().capacity();
}
size
}
fn get_array_memory_size(&self) -> usize {
let mut size = std::mem::size_of::<Self>() + self.entries.get_array_memory_size();
size += self.value_offsets.inner().inner().capacity();
if let Some(n) = self.nulls.as_ref() {
size += n.buffer().capacity();
}
size
}
}
impl<'a> ArrayAccessor for &'a MapArray {
    type Item = StructArray;
    fn value(&self, index: usize) -> Self::Item {
        MapArray::value(self, index)
    }
    unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
        // Previously this delegated to the bounds-checked `MapArray::value`,
        // negating the purpose of the unchecked accessor.
        // SAFETY: the trait contract requires the caller to ensure
        // `index < self.len()`, which is exactly what
        // `MapArray::value_unchecked` requires
        unsafe { MapArray::value_unchecked(self, index) }
    }
}
impl std::fmt::Debug for MapArray {
    /// Formats as `MapArray\n[...]`, eliding the middle of long arrays
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        writeln!(f, "MapArray\n[")?;
        // Forward the formatter so flags (e.g. `{:#?}`) propagate to each entry
        print_long_array(self, f, |map, idx, f| {
            std::fmt::Debug::fmt(&map.value(idx), f)
        })?;
        write!(f, "]")
    }
}
impl From<MapArray> for ListArray {
    /// Reinterprets a [`MapArray`] as a [`ListArray`] of its entry structs
    fn from(value: MapArray) -> Self {
        let entries_field = match value.data_type() {
            DataType::Map(f, _) => f.clone(),
            _ => unreachable!("This should be a map type."),
        };
        let list_type = DataType::List(entries_field);
        // SAFETY: a map's physical layout is identical to a list of its entries
        let array_data = unsafe {
            value
                .into_data()
                .into_builder()
                .data_type(list_type)
                .build_unchecked()
        };
        ListArray::from(array_data)
    }
}
#[cfg(test)]
mod tests {
    use crate::cast::AsArray;
    use crate::types::UInt32Type;
    use crate::{Int32Array, UInt32Array};
    use arrow_schema::Fields;
    use super::*;
    // Fixture: three maps [{0:0, 1:10, 2:20}, {3:30, 4:40, 5:50}, {6:60, 7:70}]
    // with Int32 keys and non-null UInt32 values
    fn create_from_buffers() -> MapArray {
        // Construct key and values
        let keys_data = ArrayData::builder(DataType::Int32)
            .len(8)
            .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice()))
            .build()
            .unwrap();
        let values_data = ArrayData::builder(DataType::UInt32)
            .len(8)
            .add_buffer(Buffer::from(
                &[0u32, 10, 20, 30, 40, 50, 60, 70].to_byte_slice(),
            ))
            .build()
            .unwrap();
        // Construct a buffer for value offsets, for the nested array:
        // [[0, 1, 2], [3, 4, 5], [6, 7]]
        let entry_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice());
        let keys = Arc::new(Field::new("keys", DataType::Int32, false));
        let values = Arc::new(Field::new("values", DataType::UInt32, false));
        let entry_struct = StructArray::from(vec![
            (keys, make_array(keys_data)),
            (values, make_array(values_data)),
        ]);
        // Construct a map array from the above two
        let map_data_type = DataType::Map(
            Arc::new(Field::new(
                "entries",
                entry_struct.data_type().clone(),
                false,
            )),
            false,
        );
        let map_data = ArrayData::builder(map_data_type)
            .len(3)
            .add_buffer(entry_offsets)
            .add_child_data(entry_struct.into_data())
            .build()
            .unwrap();
        MapArray::from(map_data)
    }
    // Accessors (value, value_unchecked, offsets, lengths) on a map with
    // nullable values, both at offset 0 and after slicing
    #[test]
    fn test_map_array() {
        // Construct key and values
        let key_data = ArrayData::builder(DataType::Int32)
            .len(8)
            .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice()))
            .build()
            .unwrap();
        let value_data = ArrayData::builder(DataType::UInt32)
            .len(8)
            .add_buffer(Buffer::from(
                &[0u32, 10, 20, 0, 40, 0, 60, 70].to_byte_slice(),
            ))
            .null_bit_buffer(Some(Buffer::from(&[0b11010110])))
            .build()
            .unwrap();
        // Construct a buffer for value offsets, for the nested array:
        // [[0, 1, 2], [3, 4, 5], [6, 7]]
        let entry_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice());
        let keys_field = Arc::new(Field::new("keys", DataType::Int32, false));
        let values_field = Arc::new(Field::new("values", DataType::UInt32, true));
        let entry_struct = StructArray::from(vec![
            (keys_field.clone(), make_array(key_data)),
            (values_field.clone(), make_array(value_data.clone())),
        ]);
        // Construct a map array from the above two
        let map_data_type = DataType::Map(
            Arc::new(Field::new(
                "entries",
                entry_struct.data_type().clone(),
                false,
            )),
            false,
        );
        let map_data = ArrayData::builder(map_data_type)
            .len(3)
            .add_buffer(entry_offsets)
            .add_child_data(entry_struct.into_data())
            .build()
            .unwrap();
        let map_array = MapArray::from(map_data);
        assert_eq!(value_data, map_array.values().to_data());
        assert_eq!(&DataType::UInt32, map_array.value_type());
        assert_eq!(3, map_array.len());
        assert_eq!(0, map_array.null_count());
        assert_eq!(6, map_array.value_offsets()[2]);
        assert_eq!(2, map_array.value_length(2));
        let key_array = Arc::new(Int32Array::from(vec![0, 1, 2])) as ArrayRef;
        let value_array =
            Arc::new(UInt32Array::from(vec![None, Some(10u32), Some(20)])) as ArrayRef;
        let struct_array = StructArray::from(vec![
            (keys_field.clone(), key_array),
            (values_field.clone(), value_array),
        ]);
        assert_eq!(
            struct_array,
            StructArray::from(map_array.value(0).into_data())
        );
        assert_eq!(
            &struct_array,
            unsafe { map_array.value_unchecked(0) }
                .as_any()
                .downcast_ref::<StructArray>()
                .unwrap()
        );
        for i in 0..3 {
            assert!(map_array.is_valid(i));
            assert!(!map_array.is_null(i));
        }
        // Now test with a non-zero offset
        let map_array = map_array.slice(1, 2);
        assert_eq!(value_data, map_array.values().to_data());
        assert_eq!(&DataType::UInt32, map_array.value_type());
        assert_eq!(2, map_array.len());
        assert_eq!(0, map_array.null_count());
        assert_eq!(6, map_array.value_offsets()[1]);
        assert_eq!(2, map_array.value_length(1));
        let key_array = Arc::new(Int32Array::from(vec![3, 4, 5])) as ArrayRef;
        let value_array = Arc::new(UInt32Array::from(vec![None, Some(40), None])) as ArrayRef;
        let struct_array =
            StructArray::from(vec![(keys_field, key_array), (values_field, value_array)]);
        assert_eq!(
            &struct_array,
            map_array
                .value(0)
                .as_any()
                .downcast_ref::<StructArray>()
                .unwrap()
        );
        assert_eq!(
            &struct_array,
            unsafe { map_array.value_unchecked(0) }
                .as_any()
                .downcast_ref::<StructArray>()
                .unwrap()
        );
    }
    // Slicing a map should produce data equal to a map built from the
    // sliced contents; currently disabled (see #[ignore] note)
    #[test]
    #[ignore = "Test fails because slice of <list<struct>> is still buggy"]
    fn test_map_array_slice() {
        let map_array = create_from_buffers();
        let sliced_array = map_array.slice(1, 2);
        assert_eq!(2, sliced_array.len());
        assert_eq!(1, sliced_array.offset());
        let sliced_array_data = sliced_array.to_data();
        for array_data in sliced_array_data.child_data() {
            assert_eq!(array_data.offset(), 1);
        }
        // Check offset and length for each non-null value.
        let sliced_map_array = sliced_array.as_any().downcast_ref::<MapArray>().unwrap();
        assert_eq!(3, sliced_map_array.value_offsets()[0]);
        assert_eq!(3, sliced_map_array.value_length(0));
        assert_eq!(6, sliced_map_array.value_offsets()[1]);
        assert_eq!(2, sliced_map_array.value_length(1));
        // Construct key and values
        let keys_data = ArrayData::builder(DataType::Int32)
            .len(5)
            .add_buffer(Buffer::from(&[3, 4, 5, 6, 7].to_byte_slice()))
            .build()
            .unwrap();
        let values_data = ArrayData::builder(DataType::UInt32)
            .len(5)
            .add_buffer(Buffer::from(&[30u32, 40, 50, 60, 70].to_byte_slice()))
            .build()
            .unwrap();
        // Construct a buffer for value offsets, for the nested array:
        // [[3, 4, 5], [6, 7]]
        let entry_offsets = Buffer::from(&[0, 3, 5].to_byte_slice());
        let keys = Arc::new(Field::new("keys", DataType::Int32, false));
        let values = Arc::new(Field::new("values", DataType::UInt32, false));
        let entry_struct = StructArray::from(vec![
            (keys, make_array(keys_data)),
            (values, make_array(values_data)),
        ]);
        // Construct a map array from the above two
        let map_data_type = DataType::Map(
            Arc::new(Field::new(
                "entries",
                entry_struct.data_type().clone(),
                false,
            )),
            false,
        );
        let expected_map_data = ArrayData::builder(map_data_type)
            .len(2)
            .add_buffer(entry_offsets)
            .add_child_data(entry_struct.into_data())
            .build()
            .unwrap();
        let expected_map_array = MapArray::from(expected_map_data);
        assert_eq!(&expected_map_array, sliced_map_array)
    }
    // `value` must bounds-check and panic past the end
    #[test]
    #[should_panic(expected = "index out of bounds: the len is ")]
    fn test_map_array_index_out_of_bound() {
        let map_array = create_from_buffers();
        map_array.value(map_array.len());
    }
    #[test]
    #[should_panic(expected = "MapArray expected ArrayData with DataType::Map got Dictionary")]
    fn test_from_array_data_validation() {
        // A DictionaryArray has similar buffer layout to a MapArray
        // but the meaning of the values differs
        let struct_t = DataType::Struct(Fields::from(vec![
            Field::new("keys", DataType::Int32, true),
            Field::new("values", DataType::UInt32, true),
        ]));
        let dict_t = DataType::Dictionary(Box::new(DataType::Int32), Box::new(struct_t));
        let _ = MapArray::from(ArrayData::new_empty(&dict_t));
    }
    // Exercises the `new_from_strings` convenience constructor
    #[test]
    fn test_new_from_strings() {
        let keys = vec!["a", "b", "c", "d", "e", "f", "g", "h"];
        let values_data = UInt32Array::from(vec![0u32, 10, 20, 30, 40, 50, 60, 70]);
        // Construct a buffer for value offsets, for the nested array:
        // [[a, b, c], [d, e, f], [g, h]]
        let entry_offsets = [0, 3, 6, 8];
        let map_array =
            MapArray::new_from_strings(keys.clone().into_iter(), &values_data, &entry_offsets)
                .unwrap();
        assert_eq!(
            &values_data,
            map_array.values().as_primitive::<UInt32Type>()
        );
        assert_eq!(&DataType::UInt32, map_array.value_type());
        assert_eq!(3, map_array.len());
        assert_eq!(0, map_array.null_count());
        assert_eq!(6, map_array.value_offsets()[2]);
        assert_eq!(2, map_array.value_length(2));
        let key_array = Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef;
        let value_array = Arc::new(UInt32Array::from(vec![0u32, 10, 20])) as ArrayRef;
        let keys_field = Arc::new(Field::new("keys", DataType::Utf8, false));
        let values_field = Arc::new(Field::new("values", DataType::UInt32, false));
        let struct_array =
            StructArray::from(vec![(keys_field, key_array), (values_field, value_array)]);
        assert_eq!(
            struct_array,
            StructArray::from(map_array.value(0).into_data())
        );
        assert_eq!(
            &struct_array,
            unsafe { map_array.value_unchecked(0) }
                .as_any()
                .downcast_ref::<StructArray>()
                .unwrap()
        );
        for i in 0..3 {
            assert!(map_array.is_valid(i));
            assert!(!map_array.is_null(i));
        }
    }
    // Pins the exact error messages produced by `try_new` validation
    #[test]
    fn test_try_new() {
        let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into());
        let fields = Fields::from(vec![
            Field::new("key", DataType::Int32, false),
            Field::new("values", DataType::Int32, false),
        ]);
        let columns = vec![
            Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _,
            Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _,
        ];
        let entries = StructArray::new(fields.clone(), columns, None);
        let field = Arc::new(Field::new("entries", DataType::Struct(fields), false));
        MapArray::new(field.clone(), offsets.clone(), entries.clone(), None, false);
        let nulls = NullBuffer::new_null(3);
        MapArray::new(field.clone(), offsets, entries.clone(), Some(nulls), false);
        let nulls = NullBuffer::new_null(3);
        let offsets = OffsetBuffer::new(vec![0, 1, 2, 4, 5].into());
        let err = MapArray::try_new(
            field.clone(),
            offsets.clone(),
            entries.clone(),
            Some(nulls),
            false,
        )
        .unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Incorrect length of null buffer for MapArray, expected 4 got 3"
        );
        let err = MapArray::try_new(field, offsets.clone(), entries.slice(0, 2), None, false)
            .unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Max offset of 5 exceeds length of entries 2"
        );
        let field = Arc::new(Field::new("element", DataType::Int64, false));
        let err = MapArray::try_new(field, offsets.clone(), entries, None, false)
            .unwrap_err()
            .to_string();
        assert!(
            err.starts_with("Invalid argument error: MapArray expected data type Int64 got Struct"),
            "{err}"
        );
        let fields = Fields::from(vec![
            Field::new("a", DataType::Int32, false),
            Field::new("b", DataType::Int32, false),
            Field::new("c", DataType::Int32, false),
        ]);
        let columns = vec![
            Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _,
            Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _,
            Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _,
        ];
        let s = StructArray::new(fields.clone(), columns, None);
        let field = Arc::new(Field::new("entries", DataType::Struct(fields), false));
        let err = MapArray::try_new(field, offsets, s, None, false).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: MapArray entries must contain two children, got 3"
        );
    }
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,197 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Contains the `NullArray` type.
use crate::builder::NullBuilder;
use crate::{Array, ArrayRef};
use arrow_buffer::buffer::NullBuffer;
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
use std::any::Any;
use std::sync::Arc;
/// An array of [null values](https://arrow.apache.org/docs/format/Columnar.html#null-layout)
///
/// A `NullArray` is a simplified array where all values are null.
///
/// # Example: Create an array
///
/// ```
/// use arrow_array::{Array, NullArray};
///
/// let array = NullArray::new(10);
///
/// assert!(array.is_nullable());
/// assert_eq!(array.len(), 10);
/// assert_eq!(array.null_count(), 0);
/// assert_eq!(array.logical_nulls().unwrap().null_count(), 10);
/// ```
#[derive(Clone)]
pub struct NullArray {
    // The only state needed: every element is logically null, so no value
    // buffers or validity bitmap are stored
    len: usize,
}
impl NullArray {
    /// Create a new [`NullArray`] of the specified length
    ///
    /// *Note*: Use [`crate::array::new_null_array`] if you need an array of some
    /// other [`DataType`].
    ///
    pub fn new(length: usize) -> Self {
        Self { len: length }
    }
    /// Returns a zero-copy slice of this array with the indicated offset and length.
    ///
    /// # Panics
    ///
    /// Panics if `offset + len` exceeds the length of this array
    pub fn slice(&self, offset: usize, len: usize) -> Self {
        assert!(
            offset.saturating_add(len) <= self.len,
            // Fixed: the message previously said "BooleanBuffer", a copy-paste
            // from BooleanBuffer::slice, which misled users of NullArray
            "the length + offset of the sliced NullArray cannot exceed the existing length"
        );
        // No offset needs to be retained: all elements are identical (null)
        Self { len }
    }
    /// Returns a new null array builder
    ///
    /// Note that the `capacity` parameter to this function is _deprecated_. It
    /// now does nothing, and will be removed in a future version.
    pub fn builder(_capacity: usize) -> NullBuilder {
        NullBuilder::new()
    }
}
impl Array for NullArray {
fn as_any(&self) -> &dyn Any {
self
}
fn to_data(&self) -> ArrayData {
self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
fn data_type(&self) -> &DataType {
&DataType::Null
}
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
fn len(&self) -> usize {
self.len
}
fn is_empty(&self) -> bool {
self.len == 0
}
fn offset(&self) -> usize {
0
}
fn nulls(&self) -> Option<&NullBuffer> {
None
}
fn logical_nulls(&self) -> Option<NullBuffer> {
(self.len != 0).then(|| NullBuffer::new_null(self.len))
}
fn is_nullable(&self) -> bool {
!self.is_empty()
}
fn get_buffer_memory_size(&self) -> usize {
0
}
fn get_array_memory_size(&self) -> usize {
std::mem::size_of::<Self>()
}
}
impl From<ArrayData> for NullArray {
    /// Converts [`ArrayData`] into a [`NullArray`]
    ///
    /// Panics if `data` is not `DataType::Null` or carries buffers/nulls
    fn from(data: ArrayData) -> Self {
        assert_eq!(
            data.data_type(),
            &DataType::Null,
            "NullArray data type should be Null"
        );
        assert_eq!(
            data.buffers().len(),
            0,
            "NullArray data should contain 0 buffers"
        );
        assert!(
            data.nulls().is_none(),
            "NullArray data should not contain a null buffer, as no buffers are required"
        );
        // Only the length survives the conversion
        let len = data.len();
        Self { len }
    }
}
impl From<NullArray> for ArrayData {
    /// Converts a [`NullArray`] into [`ArrayData`] carrying only a length
    fn from(array: NullArray) -> Self {
        // SAFETY: a Null array of any length is trivially valid — it has no
        // buffers or child data to validate
        unsafe {
            ArrayDataBuilder::new(DataType::Null)
                .len(array.len)
                .build_unchecked()
        }
    }
}
impl std::fmt::Debug for NullArray {
    /// Formats as `NullArray(len)`; individual elements carry no information
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let len = self.len();
        write!(f, "NullArray({len})")
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // null_count() is 0 (no physical validity buffer) while logical_nulls()
    // reports every element as null
    #[test]
    fn test_null_array() {
        let null_arr = NullArray::new(32);
        assert_eq!(null_arr.len(), 32);
        assert_eq!(null_arr.null_count(), 0);
        assert_eq!(null_arr.logical_nulls().unwrap().null_count(), 32);
        assert!(null_arr.is_valid(0));
        assert!(null_arr.is_nullable());
    }
    // Slicing preserves the null-array semantics for the new length
    #[test]
    fn test_null_array_slice() {
        let array1 = NullArray::new(32);
        let array2 = array1.slice(8, 16);
        assert_eq!(array2.len(), 16);
        assert_eq!(array2.null_count(), 0);
        assert_eq!(array2.logical_nulls().unwrap().null_count(), 16);
        assert!(array2.is_valid(0));
        assert!(array2.is_nullable());
    }
    #[test]
    fn test_debug_null_array() {
        let array = NullArray::new(1024 * 1024);
        assert_eq!(format!("{array:?}"), "NullArray(1048576)");
    }
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,630 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::types::GenericStringType;
use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
use arrow_schema::{ArrowError, DataType};
/// A [`GenericByteArray`] for storing `str`
///
/// The `OffsetSize` parameter selects the offset width: `i32` for
/// [`StringArray`] or `i64` for [`LargeStringArray`]
pub type GenericStringArray<OffsetSize> = GenericByteArray<GenericStringType<OffsetSize>>;
impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
    /// Get the data type of the array.
    #[deprecated(note = "please use `Self::DATA_TYPE` instead")]
    pub const fn get_data_type() -> DataType {
        Self::DATA_TYPE
    }
    /// Returns the number of `Unicode Scalar Value` in the string at index `i`.
    /// # Performance
    /// This function has `O(n)` time complexity where `n` is the string length.
    /// If you can make sure that all chars in the string are in the range `U+0x0000` ~ `U+0x007F`,
    /// please use the function [`value_length`](#method.value_length) which has O(1) time complexity.
    pub fn num_chars(&self, i: usize) -> usize {
        self.value(i).chars().count()
    }
    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
    ///
    /// Indexes are resolved lazily as the returned iterator is consumed
    pub fn take_iter<'a>(
        &'a self,
        indexes: impl Iterator<Item = Option<usize>> + 'a,
    ) -> impl Iterator<Item = Option<&str>> + 'a {
        indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
    }
    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
    /// # Safety
    ///
    /// caller must ensure that the indexes in the iterator are less than the `array.len()`
    pub unsafe fn take_iter_unchecked<'a>(
        &'a self,
        indexes: impl Iterator<Item = Option<usize>> + 'a,
    ) -> impl Iterator<Item = Option<&str>> + 'a {
        // SAFETY: caller guarantees every yielded index is in bounds
        indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
    }
    /// Fallibly creates a [`GenericStringArray`] from a [`GenericBinaryArray`] returning
    /// an error if [`GenericBinaryArray`] contains invalid UTF-8 data
    ///
    /// Reuses the binary array's offsets, values and null buffer directly
    pub fn try_from_binary(v: GenericBinaryArray<OffsetSize>) -> Result<Self, ArrowError> {
        let (offsets, values, nulls) = v.into_parts();
        Self::try_new(offsets, values, nulls)
    }
}
impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
    for GenericStringArray<OffsetSize>
{
    /// Converts via an intermediate [`GenericBinaryArray`]; panics if the
    /// list does not describe valid UTF-8 byte sequences
    fn from(v: GenericListArray<OffsetSize>) -> Self {
        let binary = GenericBinaryArray::<OffsetSize>::from(v);
        binary.into()
    }
}
impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
    for GenericStringArray<OffsetSize>
{
    /// Panicking counterpart of [`GenericStringArray::try_from_binary`]
    fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
        let utf8 = Self::try_from_binary(v);
        utf8.unwrap()
    }
}
impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<&str>>> for GenericStringArray<OffsetSize> {
    /// Builds a nullable string array; `None` entries become nulls
    fn from(values: Vec<Option<&str>>) -> Self {
        Self::from_iter(values)
    }
}
impl<OffsetSize: OffsetSizeTrait> From<Vec<&str>> for GenericStringArray<OffsetSize> {
    /// Builds a string array with no nulls from a vector of string slices
    fn from(values: Vec<&str>) -> Self {
        Self::from_iter_values(values)
    }
}
impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<String>>> for GenericStringArray<OffsetSize> {
    /// Builds a nullable string array from owned strings; `None` becomes null
    fn from(values: Vec<Option<String>>) -> Self {
        Self::from_iter(values)
    }
}
impl<OffsetSize: OffsetSizeTrait> From<Vec<String>> for GenericStringArray<OffsetSize> {
    /// Builds a string array with no nulls from a vector of owned strings
    fn from(values: Vec<String>) -> Self {
        Self::from_iter_values(values)
    }
}
/// A [`GenericStringArray`] of `str` using `i32` offsets
///
/// Offsets are `i32`, limiting the total value data to `i32::MAX` bytes;
/// use [`LargeStringArray`] for larger payloads
///
/// # Examples
///
/// Construction
///
/// ```
/// # use arrow_array::StringArray;
/// // Create from Vec<Option<&str>>
/// let arr = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
/// // Create from Vec<&str>
/// let arr = StringArray::from(vec!["foo", "bar", "baz"]);
/// // Create from iter/collect (requires Option<&str>)
/// let arr: StringArray = std::iter::repeat(Some("foo")).take(10).collect();
/// ```
///
/// Construction and Access
///
/// ```
/// # use arrow_array::StringArray;
/// let array = StringArray::from(vec![Some("foo"), None, Some("bar")]);
/// assert_eq!(array.value(0), "foo");
/// ```
///
/// See [`GenericByteArray`] for more information and examples
pub type StringArray = GenericStringArray<i32>;
/// A [`GenericStringArray`] of `str` using `i64` offsets
///
/// Offsets are `i64`, allowing value data beyond the `i32::MAX` byte limit
/// of [`StringArray`], at the cost of a larger offsets buffer
///
/// # Examples
///
/// Construction
///
/// ```
/// # use arrow_array::LargeStringArray;
/// // Create from Vec<Option<&str>>
/// let arr = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
/// // Create from Vec<&str>
/// let arr = LargeStringArray::from(vec!["foo", "bar", "baz"]);
/// // Create from iter/collect (requires Option<&str>)
/// let arr: LargeStringArray = std::iter::repeat(Some("foo")).take(10).collect();
/// ```
///
/// Construction and Access
///
/// ```
/// use arrow_array::LargeStringArray;
/// let array = LargeStringArray::from(vec![Some("foo"), None, Some("bar")]);
/// assert_eq!(array.value(2), "bar");
/// ```
///
/// See [`GenericByteArray`] for more information and examples
pub type LargeStringArray = GenericStringArray<i64>;
#[cfg(test)]
mod tests {
use super::*;
use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
use crate::types::UInt8Type;
use crate::Array;
use arrow_buffer::Buffer;
use arrow_data::ArrayData;
use arrow_schema::Field;
use std::sync::Arc;
#[test]
fn test_string_array_from_u8_slice() {
let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
// Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
let string_array = StringArray::from(values);
assert_eq!(3, string_array.len());
assert_eq!(0, string_array.null_count());
assert_eq!("hello", string_array.value(0));
assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
assert_eq!("", string_array.value(1));
assert_eq!("", unsafe { string_array.value_unchecked(1) });
assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
string_array.value_unchecked(2)
});
assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
assert_eq!(8, string_array.num_chars(2));
for i in 0..3 {
assert!(string_array.is_valid(i));
assert!(!string_array.is_null(i));
}
}
#[test]
#[should_panic(expected = "StringArray expects DataType::Utf8")]
fn test_string_array_from_int() {
let array = LargeStringArray::from(vec!["a", "b"]);
drop(StringArray::from(array.into_data()));
}
#[test]
fn test_large_string_array_from_u8_slice() {
let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
// Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
let string_array = LargeStringArray::from(values);
assert_eq!(3, string_array.len());
assert_eq!(0, string_array.null_count());
assert_eq!("hello", string_array.value(0));
assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
assert_eq!("", string_array.value(1));
assert_eq!("", unsafe { string_array.value_unchecked(1) });
assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
string_array.value_unchecked(2)
});
assert_eq!(5, string_array.value_offsets()[2]);
assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
assert_eq!(8, string_array.num_chars(2));
for i in 0..3 {
assert!(string_array.is_valid(i));
assert!(!string_array.is_null(i));
}
}
#[test]
fn test_nested_string_array() {
    // Build a List<Utf8> with two lists: ["foo", "bar"] and ["foobar"].
    let mut builder = ListBuilder::new(StringBuilder::with_capacity(3, 10));
    builder.values().append_value("foo");
    builder.values().append_value("bar");
    builder.append(true);
    builder.values().append_value("foobar");
    builder.append(true);
    let lists = builder.finish();

    assert_eq!(lists.len(), 2);

    // First list holds two strings.
    let slot = lists.value(0);
    let strings = slot.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(strings.len(), 2);
    assert_eq!(strings.value(0), "foo");
    assert_eq!(unsafe { strings.value_unchecked(0) }, "foo");
    assert_eq!(strings.value(1), "bar");
    assert_eq!(unsafe { strings.value_unchecked(1) }, "bar");

    // Second list holds a single string.
    let slot = lists.value(1);
    let strings = slot.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(strings.len(), 1);
    assert_eq!(strings.value(0), "foobar");
    assert_eq!(unsafe { strings.value_unchecked(0) }, "foobar");
}
#[test]
#[should_panic(
    expected = "Trying to access an element at index 4 from a StringArray of length 3"
)]
fn test_string_array_get_value_index_out_of_bound() {
    // Raw UTF-8 bytes: "hello" (0..5), "" (5..5), "parquet" (5..12)
    let values: [u8; 12] = [
        b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
    ];
    let offsets: [i32; 4] = [0, 5, 5, 12];
    let array_data = ArrayData::builder(DataType::Utf8)
        .len(3)
        .add_buffer(Buffer::from_slice_ref(offsets))
        .add_buffer(Buffer::from_slice_ref(values))
        .build()
        .unwrap();
    let string_array = StringArray::from(array_data);
    // Index 4 is past the end of the 3-element array and must panic.
    string_array.value(4);
}
#[test]
fn test_string_array_fmt_debug() {
    // Debug output lists each value on its own indented line.
    let arr = StringArray::from(vec!["hello", "arrow"]);
    let expected = "StringArray\n[\n  \"hello\",\n  \"arrow\",\n]";
    assert_eq!(expected, format!("{arr:?}"));
}
#[test]
fn test_large_string_array_fmt_debug() {
    // Same as the Utf8 case, but the type name reflects LargeUtf8.
    let arr = LargeStringArray::from(vec!["hello", "arrow"]);
    let expected = "LargeStringArray\n[\n  \"hello\",\n  \"arrow\",\n]";
    assert_eq!(expected, format!("{arr:?}"));
}
#[test]
fn test_string_array_from_iter() {
    let data = [Some("hello"), None, Some("arrow")];

    // All four construction paths must yield an identical array:
    // from Vec<Option<&str>>
    let from_vec = StringArray::from(data.to_vec());
    // from Iterator<Item = Option<&str>>
    let from_iter: StringArray = data.iter().copied().collect();
    // from Iterator<Item = Option<String>>
    let from_owned: StringArray = data
        .iter()
        .copied()
        .map(|x| x.map(ToString::to_string))
        .collect();
    // from Iterator<Item = &Option<&str>>
    let from_refs: StringArray = data.iter().collect();

    assert_eq!(from_vec, from_iter);
    assert_eq!(from_iter, from_owned);
    assert_eq!(from_owned, from_refs);
}
#[test]
fn test_string_array_from_iter_values() {
    // from_iter_values accepts borrowed &str items...
    let strs = ["hello", "hello2"];
    let arr = StringArray::from_iter_values(strs.iter());
    assert_eq!("hello", arr.value(0));
    assert_eq!("hello2", arr.value(1));

    // ...and owned String items.
    let owned = ["goodbye".to_string(), "goodbye2".to_string()];
    let arr = StringArray::from_iter_values(owned.iter());
    assert_eq!("goodbye", arr.value(0));
    assert_eq!("goodbye2", arr.value(1));
}
#[test]
fn test_string_array_from_unbound_iter() {
    // Collecting must not trust the iterator's over-estimated upper size
    // hint: the scan below yields only 10 items even though take(100)
    // reports an upper bound of 100.
    // iterator that doesn't declare (upper) size bound
    let string_iter = (0..)
        .scan(0usize, |pos, i| {
            if *pos < 10 {
                *pos += 1;
                Some(Some(format!("value {i}")))
            } else {
                // actually returns up to 10 values
                None
            }
        })
        // limited using take()
        .take(100);

    let (_, upper_size_bound) = string_iter.size_hint();
    // the upper bound, defined by take above, is 100
    assert_eq!(upper_size_bound, Some(100));
    let string_array: StringArray = string_iter.collect();
    // but the actual number of items in the array should be 10
    assert_eq!(string_array.len(), 10);
}
#[test]
fn test_string_array_all_null() {
    // An array consisting solely of nulls must still produce valid data.
    let array = StringArray::from(vec![None::<&str>]);
    array
        .into_data()
        .validate_full()
        .expect("All null array has valid array data");
}
#[test]
fn test_large_string_array_all_null() {
    // Same as the Utf8 case, for the i64-offset variant.
    let array = LargeStringArray::from(vec![None::<&str>]);
    array
        .into_data()
        .validate_full()
        .expect("All null array has valid array data");
}
#[cfg(feature = "test_utils")]
#[test]
fn bad_size_collect_string() {
    use crate::util::test_util::BadIterator;
    let data = vec![Some("foo"), None, Some("bar")];
    let expected: StringArray = data.clone().into_iter().collect();

    // Collecting must be robust to iterators that lie about their length,
    // whether they over-report (10) or under-report (1) the true count (3).
    for claimed in [10, 1] {
        let arr: StringArray = BadIterator::new(3, claimed, data.clone()).collect();
        assert_eq!(expected, arr);
    }
}
#[cfg(feature = "test_utils")]
#[test]
fn bad_size_collect_large_string() {
    use crate::util::test_util::BadIterator;
    let data = vec![Some("foo"), None, Some("bar")];
    let expected: LargeStringArray = data.clone().into_iter().collect();

    // Collecting must be robust to iterators that lie about their length,
    // whether they over-report (10) or under-report (1) the true count (3).
    for claimed in [10, 1] {
        let arr: LargeStringArray = BadIterator::new(3, claimed, data.clone()).collect();
        assert_eq!(expected, arr);
    }
}
#[cfg(feature = "test_utils")]
#[test]
fn bad_size_iter_values_string() {
    use crate::util::test_util::BadIterator;
    let data = vec!["foo", "bar", "baz"];
    let expected: StringArray = data.clone().into_iter().map(Some).collect();

    // from_iter_values must tolerate both over- and under-reported sizes.
    for claimed in [10, 1] {
        let arr = StringArray::from_iter_values(BadIterator::new(3, claimed, data.clone()));
        assert_eq!(expected, arr);
    }
}
#[cfg(feature = "test_utils")]
#[test]
fn bad_size_iter_values_large_string() {
    use crate::util::test_util::BadIterator;
    let data = vec!["foo", "bar", "baz"];
    let expected: LargeStringArray = data.clone().into_iter().map(Some).collect();

    // from_iter_values must tolerate both over- and under-reported sizes.
    for claimed in [10, 1] {
        let arr = LargeStringArray::from_iter_values(BadIterator::new(3, claimed, data.clone()));
        assert_eq!(expected, arr);
    }
}
/// Shared driver: builds a List<u8> array (with a child offset and a
/// validity bitmap, and itself sliced by one slot) and checks that the
/// conversion to a string array preserves length, nulls and values.
fn _test_generic_string_array_from_list_array<O: OffsetSizeTrait>() {
    let values = b"HelloArrowAndParquet";
    // "ArrowAndParquet"
    let child_data = ArrayData::builder(DataType::UInt8)
        .len(15)
        .offset(5)
        .add_buffer(Buffer::from(&values[..]))
        .build()
        .unwrap();
    let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
    // 0b101: slots 0 and 2 valid, slot 1 null (before the offset below)
    let null_buffer = Buffer::from_slice_ref([0b101]);
    let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
        "item",
        DataType::UInt8,
        false,
    )));
    // [None, Some("Parquet")]
    let array_data = ArrayData::builder(data_type)
        .len(2)
        .offset(1)
        .add_buffer(Buffer::from_slice_ref(offsets))
        .null_bit_buffer(Some(null_buffer))
        .add_child_data(child_data)
        .build()
        .unwrap();
    let list_array = GenericListArray::<O>::from(array_data);
    let string_array = GenericStringArray::<O>::from(list_array);
    assert_eq!(2, string_array.len());
    assert_eq!(1, string_array.null_count());
    assert!(string_array.is_null(0));
    assert!(string_array.is_valid(1));
    assert_eq!("Parquet", string_array.value(1));
}
#[test]
fn test_string_array_from_list_array() {
    // i32 offsets => StringArray
    _test_generic_string_array_from_list_array::<i32>();
}
#[test]
fn test_large_string_array_from_list_array() {
    // i64 offsets => LargeStringArray
    _test_generic_string_array_from_list_array::<i64>();
}
/// Shared driver: conversion must reject a List<u8> whose child array
/// contains nulls, since a string array cannot represent byte-level nulls.
fn _test_generic_string_array_from_list_array_with_child_nulls_failed<O: OffsetSizeTrait>() {
    let values = b"HelloArrow";
    let child_data = ArrayData::builder(DataType::UInt8)
        .len(10)
        .add_buffer(Buffer::from(&values[..]))
        .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010])))
        .build()
        .unwrap();
    let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());

    // It is possible to create a null struct containing a non-nullable child
    // see https://github.com/apache/arrow-rs/pull/3244 for details
    let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
        "item",
        DataType::UInt8,
        true,
    )));

    // [None, Some(b"Parquet")]
    let array_data = ArrayData::builder(data_type)
        .len(2)
        .add_buffer(Buffer::from_slice_ref(offsets))
        .add_child_data(child_data)
        .build()
        .unwrap();
    let list_array = GenericListArray::<O>::from(array_data);
    drop(GenericStringArray::<O>::from(list_array));
}
#[test]
#[should_panic(expected = "The child array cannot contain null values.")]
fn test_string_array_from_list_array_with_child_nulls_failed() {
    // i32-offset variant of the child-null rejection check
    _test_generic_string_array_from_list_array_with_child_nulls_failed::<i32>();
}
#[test]
#[should_panic(expected = "The child array cannot contain null values.")]
fn test_large_string_array_from_list_array_with_child_nulls_failed() {
    // i64-offset variant of the child-null rejection check
    _test_generic_string_array_from_list_array_with_child_nulls_failed::<i64>();
}
/// Shared driver: conversion must reject a list whose child is not UInt8
/// (here List<u16>), since string arrays are backed by raw u8 bytes.
fn _test_generic_string_array_from_list_array_wrong_type<O: OffsetSizeTrait>() {
    let values = b"HelloArrow";
    let child_data = ArrayData::builder(DataType::UInt16)
        .len(5)
        .add_buffer(Buffer::from(&values[..]))
        .build()
        .unwrap();
    let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap());
    let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
        "item",
        DataType::UInt16,
        false,
    )));
    let array_data = ArrayData::builder(data_type)
        .len(2)
        .add_buffer(Buffer::from_slice_ref(offsets))
        .add_child_data(child_data)
        .build()
        .unwrap();
    let list_array = GenericListArray::<O>::from(array_data);
    drop(GenericStringArray::<O>::from(list_array));
}
#[test]
#[should_panic(
    expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
)]
fn test_string_array_from_list_array_wrong_type() {
    // i32-offset variant of the wrong-child-type rejection check
    _test_generic_string_array_from_list_array_wrong_type::<i32>();
}
#[test]
#[should_panic(
    expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
)]
fn test_large_string_array_from_list_array_wrong_type() {
    // i64-offset variant of the wrong-child-type rejection check
    _test_generic_string_array_from_list_array_wrong_type::<i64>();
}
#[test]
#[should_panic(
    expected = "Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 0"
)]
fn test_list_array_utf8_validation() {
    // 0xFF can never start a valid UTF-8 sequence, so converting the
    // resulting List<u8> into a StringArray must panic during validation.
    let mut list_builder = ListBuilder::new(PrimitiveBuilder::<UInt8Type>::new());
    list_builder.values().append_value(0xFF);
    list_builder.append(true);
    drop(StringArray::from(list_builder.finish()));
}
#[test]
fn test_empty_offsets() {
    // Zero-length buffers are normalized to a single `0` offset for both
    // offset widths.
    let data = ArrayData::builder(DataType::Utf8)
        .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
        .build()
        .unwrap();
    let string = StringArray::from(data);
    assert_eq!(string.len(), 0);
    assert_eq!(string.value_offsets(), &[0]);

    let data = ArrayData::builder(DataType::LargeUtf8)
        .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
        .build()
        .unwrap();
    let large = LargeStringArray::from(data);
    assert_eq!(large.len(), 0);
    assert_eq!(large.value_offsets(), &[0]);
}
#[test]
fn test_into_builder() {
    let array: StringArray = vec!["hello", "arrow"].into();

    // A uniquely-owned array can be turned back into a builder and extended.
    let mut builder = array.into_builder().unwrap();
    builder.append_value("rust");

    let rebuilt = builder.finish();
    let expected: StringArray = vec!["hello", "arrow", "rust"].into();
    assert_eq!(expected, rebuilt);
}
#[test]
fn test_into_builder_err() {
    let array: StringArray = vec!["hello", "arrow"].into();

    // A second reference to the underlying buffers prevents conversion:
    // the array is handed back unchanged as the error value.
    let shared = array.clone();
    assert_eq!(&array.into_builder().unwrap_err(), &shared);
}
}

Просмотреть файл

@ -0,0 +1,734 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::{make_array, new_null_array, Array, ArrayRef, RecordBatch};
use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, SchemaBuilder};
use std::sync::Arc;
use std::{any::Any, ops::Index};
/// An array of [structs](https://arrow.apache.org/docs/format/Columnar.html#struct-layout)
///
/// Each child (called *field*) is represented by a separate array.
///
/// # Comparison with [RecordBatch]
///
/// Both [`RecordBatch`] and [`StructArray`] represent a collection of columns / arrays with the
/// same length.
///
/// However, there are a couple of key differences:
///
/// * [`StructArray`] can be nested within other [`Array`], including itself
/// * [`RecordBatch`] can contain top-level metadata on its associated [`Schema`][arrow_schema::Schema]
/// * [`StructArray`] can contain top-level nulls, i.e. `null`
/// * [`RecordBatch`] can only represent nulls in its child columns, i.e. `{"field": null}`
///
/// [`StructArray`] is therefore a more general data container than [`RecordBatch`], and as such
/// code that needs to handle both will typically share an implementation in terms of
/// [`StructArray`] and convert to/from [`RecordBatch`] as necessary.
///
/// [`From`] implementations are provided to facilitate this conversion, however, converting
/// from a [`StructArray`] containing top-level nulls to a [`RecordBatch`] will panic, as there
/// is no way to preserve them.
///
/// # Example: Create an array from a vector of fields
///
/// ```
/// use std::sync::Arc;
/// use arrow_array::{Array, ArrayRef, BooleanArray, Int32Array, StructArray};
/// use arrow_schema::{DataType, Field};
///
/// let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true]));
/// let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31]));
///
/// let struct_array = StructArray::from(vec![
/// (
/// Arc::new(Field::new("b", DataType::Boolean, false)),
/// boolean.clone() as ArrayRef,
/// ),
/// (
/// Arc::new(Field::new("c", DataType::Int32, false)),
/// int.clone() as ArrayRef,
/// ),
/// ]);
/// assert_eq!(struct_array.column(0).as_ref(), boolean.as_ref());
/// assert_eq!(struct_array.column(1).as_ref(), int.as_ref());
/// assert_eq!(4, struct_array.len());
/// assert_eq!(0, struct_array.null_count());
/// assert_eq!(0, struct_array.offset());
/// ```
#[derive(Clone)]
pub struct StructArray {
    // Number of rows; every child array shares this length
    len: usize,
    // Always `DataType::Struct(fields)`
    data_type: DataType,
    // Top-level validity mask; `try_new` normalizes an all-valid mask to `None`
    nulls: Option<NullBuffer>,
    // One child array per field, each of length `len`
    fields: Vec<ArrayRef>,
}
impl StructArray {
    /// Create a new [`StructArray`] from the provided parts, panicking on failure
    ///
    /// # Panics
    ///
    /// Panics if [`Self::try_new`] returns an error
    pub fn new(fields: Fields, arrays: Vec<ArrayRef>, nulls: Option<NullBuffer>) -> Self {
        Self::try_new(fields, arrays, nulls).unwrap()
    }

    /// Create a new [`StructArray`] from the provided parts, returning an error on failure
    ///
    /// # Errors
    ///
    /// Errors if
    ///
    /// * `fields.len() != arrays.len()`
    /// * `fields[i].data_type() != arrays[i].data_type()`
    /// * `arrays[i].len() != arrays[j].len()`
    /// * `arrays[i].len() != nulls.len()`
    /// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())`
    pub fn try_new(
        fields: Fields,
        arrays: Vec<ArrayRef>,
        nulls: Option<NullBuffer>,
    ) -> Result<Self, ArrowError> {
        if fields.len() != arrays.len() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Incorrect number of arrays for StructArray fields, expected {} got {}",
                fields.len(),
                arrays.len()
            )));
        }
        // All children must share a common length; a field-less struct is empty
        let len = arrays.first().map(|x| x.len()).unwrap_or_default();

        if let Some(n) = nulls.as_ref() {
            if n.len() != len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Incorrect number of nulls for StructArray, expected {len} got {}",
                    n.len(),
                )));
            }
        }

        for (f, a) in fields.iter().zip(&arrays) {
            if f.data_type() != a.data_type() {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Incorrect datatype for StructArray field {:?}, expected {} got {}",
                    f.name(),
                    f.data_type(),
                    a.data_type()
                )));
            }

            if a.len() != len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Incorrect array length for StructArray field {:?}, expected {} got {}",
                    f.name(),
                    len,
                    a.len()
                )));
            }

            if !f.is_nullable() {
                // A non-nullable field may still contain child-level nulls,
                // provided each one is masked by the top-level null buffer
                if let Some(a) = a.logical_nulls() {
                    if !nulls.as_ref().map(|n| n.contains(&a)).unwrap_or_default() {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Found unmasked nulls for non-nullable StructArray field {:?}",
                            f.name()
                        )));
                    }
                }
            }
        }

        Ok(Self {
            len,
            data_type: DataType::Struct(fields),
            // Normalize an all-valid mask to `None`
            nulls: nulls.filter(|n| n.null_count() > 0),
            fields: arrays,
        })
    }

    /// Create a new [`StructArray`] of length `len` where all values are null
    pub fn new_null(fields: Fields, len: usize) -> Self {
        // Each child is an all-null array of the field's type
        let arrays = fields
            .iter()
            .map(|f| new_null_array(f.data_type(), len))
            .collect();

        Self {
            len,
            data_type: DataType::Struct(fields),
            nulls: Some(NullBuffer::new_null(len)),
            fields: arrays,
        }
    }

    /// Create a new [`StructArray`] from the provided parts without validation
    ///
    /// # Safety
    ///
    /// Safe if [`Self::new`] would not panic with the given arguments
    pub unsafe fn new_unchecked(
        fields: Fields,
        arrays: Vec<ArrayRef>,
        nulls: Option<NullBuffer>,
    ) -> Self {
        // Length is taken from the first child; the caller guarantees all
        // children (and `nulls`) agree on it
        let len = arrays.first().map(|x| x.len()).unwrap_or_default();
        Self {
            len,
            data_type: DataType::Struct(fields),
            nulls,
            fields: arrays,
        }
    }

    /// Create a new [`StructArray`] containing no fields
    ///
    /// # Panics
    ///
    /// If `len != nulls.len()`
    pub fn new_empty_fields(len: usize, nulls: Option<NullBuffer>) -> Self {
        if let Some(n) = &nulls {
            assert_eq!(len, n.len())
        }
        Self {
            len,
            data_type: DataType::Struct(Fields::empty()),
            fields: vec![],
            nulls,
        }
    }

    /// Deconstruct this array into its constituent parts
    pub fn into_parts(self) -> (Fields, Vec<ArrayRef>, Option<NullBuffer>) {
        // `data_type` is `Struct` by construction, so this cannot fail
        let f = match self.data_type {
            DataType::Struct(f) => f,
            _ => unreachable!(),
        };
        (f, self.fields, self.nulls)
    }

    /// Returns the field at `pos`.
    pub fn column(&self, pos: usize) -> &ArrayRef {
        &self.fields[pos]
    }

    /// Return the number of fields in this struct array
    pub fn num_columns(&self) -> usize {
        self.fields.len()
    }

    /// Returns the fields of the struct array
    pub fn columns(&self) -> &[ArrayRef] {
        &self.fields
    }

    /// Returns child array refs of the struct array
    #[deprecated(note = "Use columns().to_vec()")]
    pub fn columns_ref(&self) -> Vec<ArrayRef> {
        self.columns().to_vec()
    }

    /// Return field names in this struct array
    pub fn column_names(&self) -> Vec<&str> {
        match self.data_type() {
            DataType::Struct(fields) => fields
                .iter()
                .map(|f| f.name().as_str())
                .collect::<Vec<&str>>(),
            _ => unreachable!("Struct array's data type is not struct!"),
        }
    }

    /// Returns the [`Fields`] of this [`StructArray`]
    pub fn fields(&self) -> &Fields {
        match self.data_type() {
            DataType::Struct(f) => f,
            _ => unreachable!(),
        }
    }

    /// Return child array whose field name equals to column_name
    ///
    /// Note: A schema can currently have duplicate field names, in which case
    /// the first field will always be selected.
    /// This issue will be addressed in [ARROW-11178](https://issues.apache.org/jira/browse/ARROW-11178)
    pub fn column_by_name(&self, column_name: &str) -> Option<&ArrayRef> {
        self.column_names()
            .iter()
            .position(|c| c == &column_name)
            .map(|pos| self.column(pos))
    }

    /// Returns a zero-copy slice of this array with the indicated offset and length.
    pub fn slice(&self, offset: usize, len: usize) -> Self {
        // saturating_add avoids overflow when offset + len exceeds usize::MAX
        assert!(
            offset.saturating_add(len) <= self.len,
            "the length + offset of the sliced StructArray cannot exceed the existing length"
        );

        // The offset is pushed down into each child (and the null mask),
        // rather than stored on the StructArray itself
        let fields = self.fields.iter().map(|a| a.slice(offset, len)).collect();

        Self {
            len,
            data_type: self.data_type.clone(),
            nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)),
            fields,
        }
    }
}
impl From<ArrayData> for StructArray {
fn from(data: ArrayData) -> Self {
let fields = data
.child_data()
.iter()
.map(|cd| make_array(cd.clone()))
.collect();
Self {
len: data.len(),
data_type: data.data_type().clone(),
nulls: data.nulls().cloned(),
fields,
}
}
}
impl From<StructArray> for ArrayData {
    fn from(array: StructArray) -> Self {
        let builder = ArrayDataBuilder::new(array.data_type)
            .len(array.len)
            .nulls(array.nulls)
            .child_data(array.fields.iter().map(|x| x.to_data()).collect());

        // SAFETY: relies on the StructArray having been constructed with its
        // invariants upheld (lengths, types and null masks consistent), so
        // re-validating here would be redundant
        unsafe { builder.build_unchecked() }
    }
}
impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
type Error = ArrowError;
/// builds a StructArray from a vector of names and arrays.
fn try_from(values: Vec<(&str, ArrayRef)>) -> Result<Self, ArrowError> {
let (schema, arrays): (SchemaBuilder, _) = values
.into_iter()
.map(|(name, array)| {
(
Field::new(name, array.data_type().clone(), array.is_nullable()),
array,
)
})
.unzip();
StructArray::try_new(schema.finish().fields, arrays, None)
}
}
impl Array for StructArray {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn to_data(&self) -> ArrayData {
        self.clone().into()
    }

    fn into_data(self) -> ArrayData {
        self.into()
    }

    fn data_type(&self) -> &DataType {
        &self.data_type
    }

    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
        Arc::new(self.slice(offset, length))
    }

    fn len(&self) -> usize {
        self.len
    }

    fn is_empty(&self) -> bool {
        self.len == 0
    }

    fn offset(&self) -> usize {
        // Slicing pushes offsets down into the child arrays, so a
        // StructArray itself always reports offset 0
        0
    }

    fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }

    fn get_buffer_memory_size(&self) -> usize {
        // Sum of the children's buffer sizes plus the top-level null bitmap
        let mut size = self.fields.iter().map(|a| a.get_buffer_memory_size()).sum();
        if let Some(n) = self.nulls.as_ref() {
            size += n.buffer().capacity();
        }
        size
    }

    fn get_array_memory_size(&self) -> usize {
        // Like get_buffer_memory_size, but also counts this struct itself
        let mut size = self.fields.iter().map(|a| a.get_array_memory_size()).sum();
        size += std::mem::size_of::<Self>();
        if let Some(n) = self.nulls.as_ref() {
            size += n.buffer().capacity();
        }
        size
    }
}
impl From<Vec<(FieldRef, ArrayRef)>> for StructArray {
    fn from(v: Vec<(FieldRef, ArrayRef)>) -> Self {
        // Split the pairs into a schema and a column list, then validate
        let (builder, columns): (SchemaBuilder, Vec<ArrayRef>) = v.into_iter().unzip();
        StructArray::new(builder.finish().fields, columns, None)
    }
}
impl std::fmt::Debug for StructArray {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // Header, then one "-- child" section per column, then footer
        write!(f, "StructArray\n[\n")?;
        for (idx, name) in self.column_names().iter().enumerate() {
            let col = self.column(idx);
            writeln!(f, "-- child {idx}: \"{name}\" ({:?})", col.data_type())?;
            std::fmt::Debug::fmt(col, f)?;
            writeln!(f)?;
        }
        write!(f, "]")
    }
}
impl From<(Vec<(FieldRef, ArrayRef)>, Buffer)> for StructArray {
    fn from(pair: (Vec<(FieldRef, ArrayRef)>, Buffer)) -> Self {
        let (pairs, null_bits) = pair;
        // All children share one length; an empty field list means length 0
        let len = pairs.first().map_or(0, |(_, a)| a.len());
        let (builder, arrays): (SchemaBuilder, Vec<_>) = pairs.into_iter().unzip();
        let nulls = NullBuffer::new(BooleanBuffer::new(null_bits, 0, len));
        Self::new(builder.finish().fields, arrays, Some(nulls))
    }
}
impl From<RecordBatch> for StructArray {
    fn from(value: RecordBatch) -> Self {
        let data_type = DataType::Struct(value.schema().fields().clone());
        Self {
            len: value.num_rows(),
            data_type,
            // A RecordBatch cannot carry top-level nulls
            nulls: None,
            fields: value.columns().to_vec(),
        }
    }
}
impl Index<&str> for StructArray {
    type Output = ArrayRef;

    /// Get a reference to a column's array by name.
    ///
    /// Note: A schema can currently have duplicate field names, in which case
    /// the first field will always be selected.
    /// This issue will be addressed in [ARROW-11178](https://issues.apache.org/jira/browse/ARROW-11178)
    ///
    /// # Panics
    ///
    /// Panics if the name is not in the schema.
    fn index(&self, name: &str) -> &Self::Output {
        // The panic on a missing name is part of this impl's documented contract
        self.column_by_name(name).unwrap()
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, StringArray};
use arrow_buffer::ToByteSlice;
#[test]
fn test_struct_array_builder() {
    // Build a two-field struct directly from ArrayData and check that the
    // children round-trip unchanged.
    let booleans = BooleanArray::from(vec![false, false, true, true]);
    let ints = Int64Array::from(vec![42, 28, 19, 31]);

    let field_list = vec![
        Field::new("a", DataType::Boolean, false),
        Field::new("b", DataType::Int64, false),
    ];
    let data = ArrayData::builder(DataType::Struct(field_list.into()))
        .len(4)
        .add_child_data(booleans.to_data())
        .add_child_data(ints.to_data())
        .build()
        .unwrap();
    let struct_array = StructArray::from(data);

    assert_eq!(struct_array.column(0).as_ref(), &booleans);
    assert_eq!(struct_array.column(1).as_ref(), &ints);
}
#[test]
fn test_struct_array_from() {
    let boolean: ArrayRef = Arc::new(BooleanArray::from(vec![false, false, true, true]));
    let int: ArrayRef = Arc::new(Int32Array::from(vec![42, 28, 19, 31]));

    let struct_array = StructArray::from(vec![
        (
            Arc::new(Field::new("b", DataType::Boolean, false)),
            boolean.clone(),
        ),
        (
            Arc::new(Field::new("c", DataType::Int32, false)),
            int.clone(),
        ),
    ]);

    // Children are stored as-is and the metadata reflects the input.
    assert_eq!(struct_array.column(0).as_ref(), boolean.as_ref());
    assert_eq!(struct_array.column(1).as_ref(), int.as_ref());
    assert_eq!(4, struct_array.len());
    assert_eq!(0, struct_array.null_count());
    assert_eq!(0, struct_array.offset());
}
/// validates that struct can be accessed using `column_name` as index i.e. `struct_array["column_name"]`.
#[test]
fn test_struct_array_index_access() {
    let boolean: ArrayRef = Arc::new(BooleanArray::from(vec![false, false, true, true]));
    let int: ArrayRef = Arc::new(Int32Array::from(vec![42, 28, 19, 31]));

    let struct_array = StructArray::from(vec![
        (
            Arc::new(Field::new("b", DataType::Boolean, false)),
            boolean.clone(),
        ),
        (
            Arc::new(Field::new("c", DataType::Int32, false)),
            int.clone(),
        ),
    ]);

    // Indexing by field name returns the corresponding child array.
    assert_eq!(struct_array["b"].as_ref(), boolean.as_ref());
    assert_eq!(struct_array["c"].as_ref(), int.as_ref());
}
/// validates that the in-memory representation follows [the spec](https://arrow.apache.org/docs/format/Columnar.html#struct-layout)
#[test]
fn test_struct_array_from_vec() {
    let strings: ArrayRef = Arc::new(StringArray::from(vec![
        Some("joe"),
        None,
        None,
        Some("mark"),
    ]));
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)]));

    let arr =
        StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]).unwrap();

    let struct_data = arr.into_data();
    assert_eq!(4, struct_data.len());
    assert_eq!(0, struct_data.null_count());

    // 9 = 0b1001: rows 0 and 3 valid ("joe", "mark"); offsets skip the nulls
    let expected_string_data = ArrayData::builder(DataType::Utf8)
        .len(4)
        .null_bit_buffer(Some(Buffer::from(&[9_u8])))
        .add_buffer(Buffer::from(&[0, 3, 3, 3, 7].to_byte_slice()))
        .add_buffer(Buffer::from(b"joemark"))
        .build()
        .unwrap();

    // 11 = 0b1011: rows 0, 1 and 3 valid; the null slot stores a 0 placeholder
    let expected_int_data = ArrayData::builder(DataType::Int32)
        .len(4)
        .null_bit_buffer(Some(Buffer::from(&[11_u8])))
        .add_buffer(Buffer::from(&[1, 2, 0, 4].to_byte_slice()))
        .build()
        .unwrap();
    assert_eq!(expected_string_data, struct_data.child_data()[0]);
    assert_eq!(expected_int_data, struct_data.child_data()[1]);
}
#[test]
fn test_struct_array_from_vec_error() {
    // "f1" has 3 rows but "f2" has 4 — construction must fail.
    let strings: ArrayRef = Arc::new(StringArray::from(vec![Some("joe"), None, None]));
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)]));

    let err = StructArray::try_from(vec![("f1", strings), ("f2", ints)])
        .unwrap_err()
        .to_string();

    assert_eq!(
        err,
        "Invalid argument error: Incorrect array length for StructArray field \"f2\", expected 3 got 4"
    )
}
#[test]
#[should_panic(
    expected = "Incorrect datatype for StructArray field \\\"b\\\", expected Int16 got Boolean"
)]
fn test_struct_array_from_mismatched_types_single() {
    // Field declares Int16 but the array is Boolean — must panic.
    drop(StructArray::from(vec![(
        Arc::new(Field::new("b", DataType::Int16, false)),
        Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc<dyn Array>,
    )]));
}
#[test]
#[should_panic(
    expected = "Incorrect datatype for StructArray field \\\"b\\\", expected Int16 got Boolean"
)]
fn test_struct_array_from_mismatched_types_multiple() {
    // Validation reports the first mismatching field ("b").
    drop(StructArray::from(vec![
        (
            Arc::new(Field::new("b", DataType::Int16, false)),
            Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc<dyn Array>,
        ),
        (
            Arc::new(Field::new("c", DataType::Utf8, false)),
            Arc::new(Int32Array::from(vec![42, 28, 19, 31])),
        ),
    ]));
}
#[test]
fn test_struct_array_slice() {
    // Boolean child: values 0b00010000 (only row 4 true),
    // validity 0b00010001 (rows 0 and 4 valid)
    let boolean_data = ArrayData::builder(DataType::Boolean)
        .len(5)
        .add_buffer(Buffer::from([0b00010000]))
        .null_bit_buffer(Some(Buffer::from([0b00010001])))
        .build()
        .unwrap();
    // Int child: validity 0b00000110 (rows 1 and 2 valid)
    let int_data = ArrayData::builder(DataType::Int32)
        .len(5)
        .add_buffer(Buffer::from([0, 28, 42, 0, 0].to_byte_slice()))
        .null_bit_buffer(Some(Buffer::from([0b00000110])))
        .build()
        .unwrap();

    let field_types = vec![
        Field::new("a", DataType::Boolean, true),
        Field::new("b", DataType::Int32, true),
    ];
    // Struct-level validity 0b00010111: rows 0, 1, 2, 4 valid; row 3 null
    let struct_array_data = ArrayData::builder(DataType::Struct(field_types.into()))
        .len(5)
        .add_child_data(boolean_data.clone())
        .add_child_data(int_data.clone())
        .null_bit_buffer(Some(Buffer::from([0b00010111])))
        .build()
        .unwrap();
    let struct_array = StructArray::from(struct_array_data);

    assert_eq!(5, struct_array.len());
    assert_eq!(1, struct_array.null_count());
    assert!(struct_array.is_valid(0));
    assert!(struct_array.is_valid(1));
    assert!(struct_array.is_valid(2));
    assert!(struct_array.is_null(3));
    assert!(struct_array.is_valid(4));
    assert_eq!(boolean_data, struct_array.column(0).to_data());
    assert_eq!(int_data, struct_array.column(1).to_data());

    let c0 = struct_array.column(0);
    let c0 = c0.as_any().downcast_ref::<BooleanArray>().unwrap();
    assert_eq!(5, c0.len());
    assert_eq!(3, c0.null_count());
    assert!(c0.is_valid(0));
    assert!(!c0.value(0));
    assert!(c0.is_null(1));
    assert!(c0.is_null(2));
    assert!(c0.is_null(3));
    assert!(c0.is_valid(4));
    assert!(c0.value(4));

    let c1 = struct_array.column(1);
    let c1 = c1.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(5, c1.len());
    assert_eq!(3, c1.null_count());
    assert!(c1.is_null(0));
    assert!(c1.is_valid(1));
    assert_eq!(28, c1.value(1));
    assert!(c1.is_valid(2));
    assert_eq!(42, c1.value(2));
    assert!(c1.is_null(3));
    assert!(c1.is_null(4));

    // Slicing rows 2..5 must shift the validity and values of both children
    let sliced_array = struct_array.slice(2, 3);
    let sliced_array = sliced_array.as_any().downcast_ref::<StructArray>().unwrap();
    assert_eq!(3, sliced_array.len());
    assert_eq!(1, sliced_array.null_count());
    assert!(sliced_array.is_valid(0));
    assert!(sliced_array.is_null(1));
    assert!(sliced_array.is_valid(2));

    let sliced_c0 = sliced_array.column(0);
    let sliced_c0 = sliced_c0.as_any().downcast_ref::<BooleanArray>().unwrap();
    assert_eq!(3, sliced_c0.len());
    assert!(sliced_c0.is_null(0));
    assert!(sliced_c0.is_null(1));
    assert!(sliced_c0.is_valid(2));
    assert!(sliced_c0.value(2));

    let sliced_c1 = sliced_array.column(1);
    let sliced_c1 = sliced_c1.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(3, sliced_c1.len());
    assert!(sliced_c1.is_valid(0));
    assert_eq!(42, sliced_c1.value(0));
    assert!(sliced_c1.is_null(1));
    assert!(sliced_c1.is_null(2));
}
#[test]
#[should_panic(
    expected = "Incorrect array length for StructArray field \\\"c\\\", expected 1 got 2"
)]
fn test_invalid_struct_child_array_lengths() {
    // Children of differing lengths (1 vs 2) must be rejected.
    drop(StructArray::from(vec![
        (
            Arc::new(Field::new("b", DataType::Float32, false)),
            Arc::new(Float32Array::from(vec![1.1])) as Arc<dyn Array>,
        ),
        (
            Arc::new(Field::new("c", DataType::Float64, false)),
            Arc::new(Float64Array::from(vec![2.2, 3.3])),
        ),
    ]));
}
#[test]
fn test_struct_array_from_empty() {
    // A StructArray built from no fields has length zero.
    let empty = StructArray::from(vec![]);
    assert!(empty.is_empty())
}
#[test]
#[should_panic(expected = "Found unmasked nulls for non-nullable StructArray field \\\"c\\\"")]
fn test_struct_array_from_mismatched_nullability() {
    // A non-nullable field whose child contains nulls (and no top-level
    // mask covering them) must be rejected.
    drop(StructArray::from(vec![(
        Arc::new(Field::new("c", DataType::Int32, false)),
        Arc::new(Int32Array::from(vec![Some(42), None, Some(19)])) as ArrayRef,
    )]));
}
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,328 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::{ArrayBuilder, BooleanBufferBuilder};
use crate::{ArrayRef, BooleanArray};
use arrow_buffer::Buffer;
use arrow_buffer::NullBufferBuilder;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;
/// Builder for [`BooleanArray`]
///
/// # Example
///
/// Create a `BooleanArray` from a `BooleanBuilder`
///
/// ```
///
/// # use arrow_array::{Array, BooleanArray, builder::BooleanBuilder};
///
/// let mut b = BooleanBuilder::new();
/// b.append_value(true);
/// b.append_null();
/// b.append_value(false);
/// b.append_value(true);
/// let arr = b.finish();
///
/// assert_eq!(4, arr.len());
/// assert_eq!(1, arr.null_count());
/// assert_eq!(true, arr.value(0));
/// assert!(arr.is_valid(0));
/// assert!(!arr.is_null(0));
/// assert!(!arr.is_valid(1));
/// assert!(arr.is_null(1));
/// assert_eq!(false, arr.value(2));
/// assert!(arr.is_valid(2));
/// assert!(!arr.is_null(2));
/// assert_eq!(true, arr.value(3));
/// assert!(arr.is_valid(3));
/// assert!(!arr.is_null(3));
/// ```
#[derive(Debug)]
pub struct BooleanBuilder {
    // Bit-packed storage for the boolean values themselves.
    values_builder: BooleanBufferBuilder,
    // Validity bitmap under construction; tracks which slots are non-null.
    null_buffer_builder: NullBufferBuilder,
}
impl Default for BooleanBuilder {
    // Delegates to `new()`, which allocates with the default capacity (1024).
    fn default() -> Self {
        Self::new()
    }
}
impl BooleanBuilder {
    /// Creates a new boolean builder with a default initial capacity.
    pub fn new() -> Self {
        Self::with_capacity(1024)
    }

    /// Creates a new boolean builder with space for `capacity` elements without re-allocating
    pub fn with_capacity(capacity: usize) -> Self {
        let values_builder = BooleanBufferBuilder::new(capacity);
        let null_buffer_builder = NullBufferBuilder::new(capacity);
        Self {
            values_builder,
            null_buffer_builder,
        }
    }

    /// Returns the number of slots this builder can hold before re-allocating.
    pub fn capacity(&self) -> usize {
        self.values_builder.capacity()
    }

    /// Appends a non-null `bool` into the builder.
    #[inline]
    pub fn append_value(&mut self, v: bool) {
        self.values_builder.append(v);
        self.null_buffer_builder.append_non_null();
    }

    /// Appends a single null slot into the builder.
    #[inline]
    pub fn append_null(&mut self) {
        self.null_buffer_builder.append_null();
        // The value buffer still needs a (meaningless) slot for the null.
        self.values_builder.advance(1);
    }

    /// Appends `n` null slots into the builder.
    #[inline]
    pub fn append_nulls(&mut self, n: usize) {
        self.null_buffer_builder.append_n_nulls(n);
        self.values_builder.advance(n);
    }

    /// Appends `Some(v)` as a value and `None` as a null.
    #[inline]
    pub fn append_option(&mut self, v: Option<bool>) {
        if let Some(value) = v {
            self.append_value(value);
        } else {
            self.append_null();
        }
    }

    /// Appends every element of `v` as a non-null value.
    #[inline]
    pub fn append_slice(&mut self, v: &[bool]) {
        self.values_builder.append_slice(v);
        self.null_buffer_builder.append_n_non_nulls(v.len());
    }

    /// Appends values paired with a validity mask.
    ///
    /// # Errors
    ///
    /// Returns an error if the two slices differ in length.
    #[inline]
    pub fn append_values(&mut self, values: &[bool], is_valid: &[bool]) -> Result<(), ArrowError> {
        if values.len() != is_valid.len() {
            return Err(ArrowError::InvalidArgumentError(
                "Value and validity lengths must be equal".to_string(),
            ));
        }
        self.null_buffer_builder.append_slice(is_valid);
        self.values_builder.append_slice(values);
        Ok(())
    }

    /// Builds the [BooleanArray] and reset this builder.
    pub fn finish(&mut self) -> BooleanArray {
        let length = self.len();
        let values = self.values_builder.finish().into_inner();
        let nulls = self.null_buffer_builder.finish();
        let data_builder = ArrayData::builder(DataType::Boolean)
            .len(length)
            .add_buffer(values)
            .nulls(nulls);
        // SAFETY: presumably length, values, and nulls are kept consistent by
        // this builder's append methods, so validation can be skipped.
        let data = unsafe { data_builder.build_unchecked() };
        BooleanArray::from(data)
    }

    /// Builds the [BooleanArray] without resetting the builder.
    pub fn finish_cloned(&self) -> BooleanArray {
        let length = self.len();
        let values = Buffer::from_slice_ref(self.values_builder.as_slice());
        let nulls = self.null_buffer_builder.finish_cloned();
        let data_builder = ArrayData::builder(DataType::Boolean)
            .len(length)
            .add_buffer(values)
            .nulls(nulls);
        // SAFETY: same invariants as `finish`; buffers are copies of the
        // builder's internally-consistent state.
        let data = unsafe { data_builder.build_unchecked() };
        BooleanArray::from(data)
    }

    /// Returns the current values buffer as a slice
    ///
    /// Boolean values are bit-packed into bytes. To extract the i-th boolean
    /// from the bytes, you can use `arrow_buffer::bit_util::get_bit()`.
    pub fn values_slice(&self) -> &[u8] {
        self.values_builder.as_slice()
    }

    /// Returns the current null buffer as a slice
    pub fn validity_slice(&self) -> Option<&[u8]> {
        self.null_buffer_builder.as_slice()
    }
}
impl ArrayBuilder for BooleanBuilder {
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }

    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }

    /// Returns the number of array slots in the builder
    fn len(&self) -> usize {
        self.values_builder.len()
    }

    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        // Delegates to the inherent `finish`, then type-erases behind an Arc.
        Arc::new(self.finish())
    }

    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        Arc::new(self.finish_cloned())
    }
}
impl Extend<Option<bool>> for BooleanBuilder {
    /// Appends each optional boolean from `iter`, mapping `None` to a null slot.
    #[inline]
    fn extend<T: IntoIterator<Item = Option<bool>>>(&mut self, iter: T) {
        iter.into_iter().for_each(|item| self.append_option(item));
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Array;

    #[test]
    fn test_boolean_array_builder() {
        // Expected bit-packed layout: bits 3, 6 and 9 set.
        // 00000010 01001000
        let buf = Buffer::from([72_u8, 2_u8]);
        let mut builder = BooleanArray::builder(10);
        for i in 0..10 {
            if i == 3 || i == 6 || i == 9 {
                builder.append_value(true);
            } else {
                builder.append_value(false);
            }
        }
        let arr = builder.finish();
        // The finished array must match the hand-computed buffer exactly.
        assert_eq!(&buf, arr.values().inner());
        assert_eq!(10, arr.len());
        assert_eq!(0, arr.offset());
        assert_eq!(0, arr.null_count());
        for i in 0..10 {
            assert!(!arr.is_null(i));
            assert!(arr.is_valid(i));
            assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {i}")
        }
    }

    #[test]
    fn test_boolean_array_builder_append_slice() {
        // Mixing append_slice, append_null and append_value must equal the
        // array built directly from the equivalent Vec<Option<bool>>.
        let arr1 = BooleanArray::from(vec![Some(true), Some(false), None, None, Some(false)]);
        let mut builder = BooleanArray::builder(0);
        builder.append_slice(&[true, false]);
        builder.append_null();
        builder.append_null();
        builder.append_value(false);
        let arr2 = builder.finish();
        assert_eq!(arr1, arr2);
    }

    #[test]
    fn test_boolean_array_builder_append_slice_large() {
        // 513 values forces growth past the initial 512-slot capacity.
        let arr1 = BooleanArray::from(vec![true; 513]);
        let mut builder = BooleanArray::builder(512);
        builder.append_slice(&[true; 513]);
        let arr2 = builder.finish();
        assert_eq!(arr1, arr2);
    }

    #[test]
    fn test_boolean_array_builder_no_null() {
        // When every slot is valid, no null bitmap should be materialized.
        let mut builder = BooleanArray::builder(0);
        builder.append_option(Some(true));
        builder.append_value(false);
        builder.append_slice(&[true, false, true]);
        builder
            .append_values(&[false, false, true], &[true, true, true])
            .unwrap();
        let array = builder.finish();
        assert_eq!(0, array.null_count());
        assert!(array.nulls().is_none());
    }

    #[test]
    fn test_boolean_array_builder_finish_cloned() {
        // finish_cloned must leave the builder usable for further appends.
        let mut builder = BooleanArray::builder(16);
        builder.append_option(Some(true));
        builder.append_value(false);
        builder.append_slice(&[true, false, true]);
        let mut array = builder.finish_cloned();
        assert_eq!(3, array.true_count());
        assert_eq!(2, array.false_count());
        builder
            .append_values(&[false, false, true], &[true, true, true])
            .unwrap();
        array = builder.finish();
        assert_eq!(4, array.true_count());
        assert_eq!(4, array.false_count());
        assert_eq!(0, array.null_count());
        assert!(array.nulls().is_none());
    }

    #[test]
    fn test_extend() {
        // Two successive extend calls must concatenate in order.
        let mut builder = BooleanBuilder::new();
        builder.extend([false, false, true, false, false].into_iter().map(Some));
        builder.extend([true, true, false].into_iter().map(Some));
        let array = builder.finish();
        let values = array.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
        assert_eq!(
            &values,
            &[false, false, true, false, false, true, true, false]
        )
    }
}

Просмотреть файл

@ -0,0 +1,225 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
pub use arrow_buffer::BufferBuilder;
use half::f16;
use crate::types::*;
/// Buffer builder for signed 8-bit integer type.
pub type Int8BufferBuilder = BufferBuilder<i8>;
/// Buffer builder for signed 16-bit integer type.
pub type Int16BufferBuilder = BufferBuilder<i16>;
/// Buffer builder for signed 32-bit integer type.
pub type Int32BufferBuilder = BufferBuilder<i32>;
/// Buffer builder for signed 64-bit integer type.
pub type Int64BufferBuilder = BufferBuilder<i64>;
/// Buffer builder for unsigned 8-bit integer type.
pub type UInt8BufferBuilder = BufferBuilder<u8>;
/// Buffer builder for unsigned 16-bit integer type.
pub type UInt16BufferBuilder = BufferBuilder<u16>;
/// Buffer builder for unsigned 32-bit integer type.
pub type UInt32BufferBuilder = BufferBuilder<u32>;
/// Buffer builder for unsigned 64-bit integer type.
pub type UInt64BufferBuilder = BufferBuilder<u64>;
/// Buffer builder for 16-bit floating point type.
pub type Float16BufferBuilder = BufferBuilder<f16>;
/// Buffer builder for 32-bit floating point type.
pub type Float32BufferBuilder = BufferBuilder<f32>;
/// Buffer builder for 64-bit floating point type.
pub type Float64BufferBuilder = BufferBuilder<f64>;
/// Buffer builder for 128-bit decimal type.
pub type Decimal128BufferBuilder = BufferBuilder<<Decimal128Type as ArrowPrimitiveType>::Native>;
/// Buffer builder for 256-bit decimal type.
pub type Decimal256BufferBuilder = BufferBuilder<<Decimal256Type as ArrowPrimitiveType>::Native>;
/// Buffer builder for timestamp type of second unit.
pub type TimestampSecondBufferBuilder =
    BufferBuilder<<TimestampSecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for timestamp type of millisecond unit.
pub type TimestampMillisecondBufferBuilder =
    BufferBuilder<<TimestampMillisecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for timestamp type of microsecond unit.
pub type TimestampMicrosecondBufferBuilder =
    BufferBuilder<<TimestampMicrosecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for timestamp type of nanosecond unit.
pub type TimestampNanosecondBufferBuilder =
    BufferBuilder<<TimestampNanosecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for 32-bit date type.
pub type Date32BufferBuilder = BufferBuilder<<Date32Type as ArrowPrimitiveType>::Native>;
/// Buffer builder for 64-bit date type.
pub type Date64BufferBuilder = BufferBuilder<<Date64Type as ArrowPrimitiveType>::Native>;
/// Buffer builder for 32-bit elapsed time since midnight of second unit.
pub type Time32SecondBufferBuilder =
    BufferBuilder<<Time32SecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for 32-bit elapsed time since midnight of millisecond unit.
pub type Time32MillisecondBufferBuilder =
    BufferBuilder<<Time32MillisecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for 64-bit elapsed time since midnight of microsecond unit.
pub type Time64MicrosecondBufferBuilder =
    BufferBuilder<<Time64MicrosecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for 64-bit elapsed time since midnight of nanosecond unit.
pub type Time64NanosecondBufferBuilder =
    BufferBuilder<<Time64NanosecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for “calendar” interval in months.
pub type IntervalYearMonthBufferBuilder =
    BufferBuilder<<IntervalYearMonthType as ArrowPrimitiveType>::Native>;
/// Buffer builder for “calendar” interval in days and milliseconds.
pub type IntervalDayTimeBufferBuilder =
    BufferBuilder<<IntervalDayTimeType as ArrowPrimitiveType>::Native>;
/// Buffer builder for “calendar” interval in months, days, and nanoseconds.
pub type IntervalMonthDayNanoBufferBuilder =
    BufferBuilder<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native>;
/// Buffer builder for elapsed time of second unit.
pub type DurationSecondBufferBuilder =
    BufferBuilder<<DurationSecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for elapsed time of milliseconds unit.
pub type DurationMillisecondBufferBuilder =
    BufferBuilder<<DurationMillisecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for elapsed time of microseconds unit.
pub type DurationMicrosecondBufferBuilder =
    BufferBuilder<<DurationMicrosecondType as ArrowPrimitiveType>::Native>;
/// Buffer builder for elapsed time of nanoseconds unit.
pub type DurationNanosecondBufferBuilder =
    BufferBuilder<<DurationNanosecondType as ArrowPrimitiveType>::Native>;
#[cfg(test)]
mod tests {
    use crate::builder::{ArrayBuilder, Int32BufferBuilder, Int8Builder, UInt8BufferBuilder};
    use crate::Array;

    #[test]
    fn test_builder_i32_empty() {
        // A fresh builder is empty but already rounds capacity up to 16.
        let mut b = Int32BufferBuilder::new(5);
        assert_eq!(0, b.len());
        assert_eq!(16, b.capacity());
        let a = b.finish();
        assert_eq!(0, a.len());
    }

    #[test]
    fn test_builder_i32_alloc_zero_bytes() {
        // Zero-capacity builders must still accept appends.
        let mut b = Int32BufferBuilder::new(0);
        b.append(123);
        let a = b.finish();
        // Buffer length is in bytes: one i32 = 4 bytes.
        assert_eq!(4, a.len());
    }

    #[test]
    fn test_builder_i32() {
        let mut b = Int32BufferBuilder::new(5);
        for i in 0..5 {
            b.append(i);
        }
        assert_eq!(16, b.capacity());
        let a = b.finish();
        // 5 i32 values = 20 bytes.
        assert_eq!(20, a.len());
    }

    #[test]
    fn test_builder_i32_grow_buffer() {
        // Appending past capacity must reallocate (16 -> 32 slots).
        let mut b = Int32BufferBuilder::new(2);
        assert_eq!(16, b.capacity());
        for i in 0..20 {
            b.append(i);
        }
        assert_eq!(32, b.capacity());
        let a = b.finish();
        assert_eq!(80, a.len());
    }

    #[test]
    fn test_builder_finish() {
        // finish() hands off the buffer and resets the builder to empty.
        let mut b = Int32BufferBuilder::new(5);
        assert_eq!(16, b.capacity());
        for i in 0..10 {
            b.append(i);
        }
        let mut a = b.finish();
        assert_eq!(40, a.len());
        assert_eq!(0, b.len());
        assert_eq!(0, b.capacity());
        // Try build another buffer after cleaning up.
        for i in 0..20 {
            b.append(i)
        }
        assert_eq!(32, b.capacity());
        a = b.finish();
        assert_eq!(80, a.len());
    }

    #[test]
    fn test_reserve() {
        // reserve() is a no-op inside current capacity and doubles beyond it.
        let mut b = UInt8BufferBuilder::new(2);
        assert_eq!(64, b.capacity());
        b.reserve(64);
        assert_eq!(64, b.capacity());
        b.reserve(65);
        assert_eq!(128, b.capacity());
        let mut b = Int32BufferBuilder::new(2);
        assert_eq!(16, b.capacity());
        b.reserve(16);
        assert_eq!(16, b.capacity());
        b.reserve(17);
        assert_eq!(32, b.capacity());
    }

    #[test]
    fn test_append_slice() {
        // Slices are appended contiguously; lengths are byte counts.
        let mut b = UInt8BufferBuilder::new(0);
        b.append_slice(b"Hello, ");
        b.append_slice(b"World!");
        let buffer = b.finish();
        assert_eq!(13, buffer.len());
        let mut b = Int32BufferBuilder::new(0);
        b.append_slice(&[32, 54]);
        let buffer = b.finish();
        assert_eq!(8, buffer.len());
    }

    #[test]
    fn test_append_values() {
        let mut a = Int8Builder::new();
        a.append_value(1);
        a.append_null();
        a.append_value(-2);
        assert_eq!(a.len(), 3);
        // append values
        let values = &[1, 2, 3, 4];
        let is_valid = &[true, true, false, true];
        a.append_values(values, is_valid);
        assert_eq!(a.len(), 7);
        let array = a.finish();
        // Earlier appends and the masked batch must interleave correctly.
        assert_eq!(array.value(0), 1);
        assert!(array.is_null(1));
        assert_eq!(array.value(2), -2);
        assert_eq!(array.value(3), 1);
        assert_eq!(array.value(4), 2);
        assert!(array.is_null(5));
        assert_eq!(array.value(6), 4);
    }
}

Просмотреть файл

@ -0,0 +1,255 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::{ArrayBuilder, UInt8BufferBuilder};
use crate::{ArrayRef, FixedSizeBinaryArray};
use arrow_buffer::Buffer;
use arrow_buffer::NullBufferBuilder;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;
/// Builder for [`FixedSizeBinaryArray`]
/// ```
/// # use arrow_array::builder::FixedSizeBinaryBuilder;
/// # use arrow_array::Array;
/// #
/// let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5);
/// // [b"hello", null, b"arrow"]
/// builder.append_value(b"hello").unwrap();
/// builder.append_null();
/// builder.append_value(b"arrow").unwrap();
///
/// let array = builder.finish();
/// assert_eq!(array.value(0), b"hello");
/// assert!(array.is_null(1));
/// assert_eq!(array.value(2), b"arrow");
/// ```
#[derive(Debug)]
pub struct FixedSizeBinaryBuilder {
    // Raw bytes for all slots, including zeroed bytes for null slots.
    values_builder: UInt8BufferBuilder,
    // Validity bitmap under construction.
    null_buffer_builder: NullBufferBuilder,
    // Fixed byte width of every value; asserted non-negative at construction.
    value_length: i32,
}
impl FixedSizeBinaryBuilder {
    /// Creates a new [`FixedSizeBinaryBuilder`]
    ///
    /// # Panics
    ///
    /// Panics if `byte_width` is negative.
    pub fn new(byte_width: i32) -> Self {
        Self::with_capacity(1024, byte_width)
    }

    /// Creates a new [`FixedSizeBinaryBuilder`], `capacity` is the number of byte slices
    /// that can be appended without reallocating
    ///
    /// # Panics
    ///
    /// Panics if `byte_width` is negative.
    pub fn with_capacity(capacity: usize, byte_width: i32) -> Self {
        assert!(
            byte_width >= 0,
            "value length ({byte_width}) of the array must >= 0"
        );
        Self {
            values_builder: UInt8BufferBuilder::new(capacity * byte_width as usize),
            null_buffer_builder: NullBufferBuilder::new(capacity),
            value_length: byte_width,
        }
    }

    /// Appends a byte slice into the builder.
    ///
    /// Automatically update the null buffer to delimit the slice appended in as a
    /// distinct value element.
    ///
    /// # Errors
    ///
    /// Returns an error if `value` is not exactly `value_length` bytes long.
    #[inline]
    pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<(), ArrowError> {
        if self.value_length != value.as_ref().len() as i32 {
            Err(ArrowError::InvalidArgumentError(
                "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths"
                    .to_string(),
            ))
        } else {
            self.values_builder.append_slice(value.as_ref());
            self.null_buffer_builder.append_non_null();
            Ok(())
        }
    }

    /// Append a null value to the array.
    #[inline]
    pub fn append_null(&mut self) {
        // Write `value_length` zero bytes directly into the values buffer
        // instead of materializing a temporary `Vec<u8>` for every null.
        self.values_builder.append_n(self.value_length as usize, 0u8);
        self.null_buffer_builder.append_null();
    }

    /// Builds the [`FixedSizeBinaryArray`] and reset this builder.
    pub fn finish(&mut self) -> FixedSizeBinaryArray {
        let array_length = self.len();
        let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length))
            .add_buffer(self.values_builder.finish())
            .nulls(self.null_buffer_builder.finish())
            .len(array_length);
        // SAFETY: presumably the builder maintains consistent buffer/validity
        // lengths, so validation is skipped as in the other builders.
        let array_data = unsafe { array_data_builder.build_unchecked() };
        FixedSizeBinaryArray::from(array_data)
    }

    /// Builds the [`FixedSizeBinaryArray`] without resetting the builder.
    pub fn finish_cloned(&self) -> FixedSizeBinaryArray {
        let array_length = self.len();
        let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice());
        let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length))
            .add_buffer(values_buffer)
            .nulls(self.null_buffer_builder.finish_cloned())
            .len(array_length);
        // SAFETY: same invariants as `finish`, applied to copied buffers.
        let array_data = unsafe { array_data_builder.build_unchecked() };
        FixedSizeBinaryArray::from(array_data)
    }

    /// Returns the current null buffer as a slice
    pub fn validity_slice(&self) -> Option<&[u8]> {
        self.null_buffer_builder.as_slice()
    }
}
impl ArrayBuilder for FixedSizeBinaryBuilder {
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }

    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }

    /// Returns the number of array slots in the builder
    // Counted from the validity builder: the values builder tracks raw bytes,
    // not logical slots.
    fn len(&self) -> usize {
        self.null_buffer_builder.len()
    }

    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        Arc::new(self.finish())
    }

    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        Arc::new(self.finish_cloned())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Array;

    #[test]
    fn test_fixed_size_binary_builder() {
        let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5);
        // [b"hello", null, "arrow"]
        builder.append_value(b"hello").unwrap();
        builder.append_null();
        builder.append_value(b"arrow").unwrap();
        let array: FixedSizeBinaryArray = builder.finish();
        assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
        assert_eq!(3, array.len());
        assert_eq!(1, array.null_count());
        // Offsets advance by the fixed width even across the null slot.
        assert_eq!(10, array.value_offset(2));
        assert_eq!(5, array.value_length());
    }

    #[test]
    fn test_fixed_size_binary_builder_finish_cloned() {
        // finish_cloned snapshots state; the builder keeps accumulating.
        let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5);
        // [b"hello", null, "arrow"]
        builder.append_value(b"hello").unwrap();
        builder.append_null();
        builder.append_value(b"arrow").unwrap();
        let mut array: FixedSizeBinaryArray = builder.finish_cloned();
        assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
        assert_eq!(3, array.len());
        assert_eq!(1, array.null_count());
        assert_eq!(10, array.value_offset(2));
        assert_eq!(5, array.value_length());
        // [b"finis", null, "clone"]
        builder.append_value(b"finis").unwrap();
        builder.append_null();
        builder.append_value(b"clone").unwrap();
        array = builder.finish();
        assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
        assert_eq!(6, array.len());
        assert_eq!(2, array.null_count());
        assert_eq!(25, array.value_offset(5));
        assert_eq!(5, array.value_length());
    }

    #[test]
    fn test_fixed_size_binary_builder_with_zero_value_length() {
        // A zero byte-width is legal: every value is the empty slice.
        let mut builder = FixedSizeBinaryBuilder::new(0);
        builder.append_value(b"").unwrap();
        builder.append_null();
        builder.append_value(b"").unwrap();
        assert!(!builder.is_empty());
        let array: FixedSizeBinaryArray = builder.finish();
        assert_eq!(&DataType::FixedSizeBinary(0), array.data_type());
        assert_eq!(3, array.len());
        assert_eq!(1, array.null_count());
        assert_eq!(0, array.value_offset(2));
        assert_eq!(0, array.value_length());
        assert_eq!(b"", array.value(0));
        assert_eq!(b"", array.value(2));
    }

    #[test]
    #[should_panic(
        expected = "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths"
    )]
    fn test_fixed_size_binary_builder_with_inconsistent_value_length() {
        // 5-byte value into a 4-byte-wide builder: the unwrap panics with the
        // error message asserted above.
        let mut builder = FixedSizeBinaryBuilder::with_capacity(1, 4);
        builder.append_value(b"hello").unwrap();
    }

    #[test]
    fn test_fixed_size_binary_builder_empty() {
        // Finishing an untouched builder yields an empty, correctly-typed array.
        let mut builder = FixedSizeBinaryBuilder::new(5);
        assert!(builder.is_empty());
        let fixed_size_binary_array = builder.finish();
        assert_eq!(
            &DataType::FixedSizeBinary(5),
            fixed_size_binary_array.data_type()
        );
        assert_eq!(0, fixed_size_binary_array.len());
    }

    #[test]
    #[should_panic(expected = "value length (-1) of the array must >= 0")]
    fn test_fixed_size_binary_builder_invalid_value_length() {
        let _ = FixedSizeBinaryBuilder::with_capacity(15, -1);
    }
}

Просмотреть файл

@ -0,0 +1,492 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::ArrayBuilder;
use crate::{ArrayRef, FixedSizeListArray};
use arrow_buffer::NullBufferBuilder;
use arrow_schema::{Field, FieldRef};
use std::any::Any;
use std::sync::Arc;
/// Builder for [`FixedSizeListArray`]
/// ```
/// use arrow_array::{builder::{Int32Builder, FixedSizeListBuilder}, Array, Int32Array};
/// let values_builder = Int32Builder::new();
/// let mut builder = FixedSizeListBuilder::new(values_builder, 3);
///
/// // [[0, 1, 2], null, [3, null, 5], [6, 7, null]]
/// builder.values().append_value(0);
/// builder.values().append_value(1);
/// builder.values().append_value(2);
/// builder.append(true);
/// builder.values().append_null();
/// builder.values().append_null();
/// builder.values().append_null();
/// builder.append(false);
/// builder.values().append_value(3);
/// builder.values().append_null();
/// builder.values().append_value(5);
/// builder.append(true);
/// builder.values().append_value(6);
/// builder.values().append_value(7);
/// builder.values().append_null();
/// builder.append(true);
/// let list_array = builder.finish();
/// assert_eq!(
///     *list_array.value(0),
///     Int32Array::from(vec![Some(0), Some(1), Some(2)])
/// );
/// assert!(list_array.is_null(1));
/// assert_eq!(
///     *list_array.value(2),
///     Int32Array::from(vec![Some(3), None, Some(5)])
/// );
/// assert_eq!(
///     *list_array.value(3),
///     Int32Array::from(vec![Some(6), Some(7), None])
/// )
/// ```
///
#[derive(Debug)]
pub struct FixedSizeListBuilder<T: ArrayBuilder> {
    // Validity bitmap for the list slots themselves.
    null_buffer_builder: NullBufferBuilder,
    // Child builder accumulating the flattened element values.
    values_builder: T,
    // Number of child values per list slot.
    list_len: i32,
    // Optional override for the child field; defaults to a nullable "item".
    field: Option<FieldRef>,
}
impl<T: ArrayBuilder> FixedSizeListBuilder<T> {
    /// Creates a new [`FixedSizeListBuilder`] from a given values array builder
    /// `value_length` is the number of values within each array
    pub fn new(values_builder: T, value_length: i32) -> Self {
        // Derive an initial capacity from how many whole lists the child
        // builder already holds; `checked_div` guards a zero value_length.
        let initial_capacity = values_builder
            .len()
            .checked_div(value_length as usize)
            .unwrap_or(0);
        Self::with_capacity(values_builder, value_length, initial_capacity)
    }

    /// Creates a new [`FixedSizeListBuilder`] from a given values array builder
    /// `value_length` is the number of values within each array
    /// `capacity` is the number of items to pre-allocate space for in this builder
    pub fn with_capacity(values_builder: T, value_length: i32, capacity: usize) -> Self {
        Self {
            null_buffer_builder: NullBufferBuilder::new(capacity),
            values_builder,
            list_len: value_length,
            field: None,
        }
    }

    /// Override the field passed to [`FixedSizeListArray::new`]
    ///
    /// By default, a nullable field is created with the name `item`
    ///
    /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the
    /// field's data type does not match that of `T`
    pub fn with_field(mut self, field: impl Into<FieldRef>) -> Self {
        self.field = Some(field.into());
        self
    }
}
impl<T: ArrayBuilder> ArrayBuilder for FixedSizeListBuilder<T>
where
    T: 'static,
{
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }

    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }

    /// Returns the number of array slots in the builder
    // Counted from the validity builder: the child builder's length is in
    // flattened element values, not list slots.
    fn len(&self) -> usize {
        self.null_buffer_builder.len()
    }

    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        Arc::new(self.finish())
    }

    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        Arc::new(self.finish_cloned())
    }
}
impl<T: ArrayBuilder> FixedSizeListBuilder<T>
where
T: 'static,
{
/// Returns the child array builder as a mutable reference.
///
/// This mutable reference can be used to append values into the child array builder,
/// but you must call [`append`](#method.append) to delimit each distinct list value.
pub fn values(&mut self) -> &mut T {
&mut self.values_builder
}
/// Returns the length of the list
pub fn value_length(&self) -> i32 {
self.list_len
}
/// Finish the current fixed-length list array slot
#[inline]
pub fn append(&mut self, is_valid: bool) {
self.null_buffer_builder.append(is_valid);
}
/// Builds the [`FixedSizeListBuilder`] and reset this builder.
pub fn finish(&mut self) -> FixedSizeListArray {
let len = self.len();
let values = self.values_builder.finish();
let nulls = self.null_buffer_builder.finish();
assert_eq!(
values.len(), len * self.list_len as usize,
"Length of the child array ({}) must be the multiple of the value length ({}) and the array length ({}).",
values.len(),
self.list_len,
len,
);
let field = self
.field
.clone()
.unwrap_or_else(|| Arc::new(Field::new("item", values.data_type().clone(), true)));
FixedSizeListArray::new(field, self.list_len, values, nulls)
}
/// Builds the [`FixedSizeListBuilder`] without resetting the builder.
pub fn finish_cloned(&self) -> FixedSizeListArray {
let len = self.len();
let values = self.values_builder.finish_cloned();
let nulls = self.null_buffer_builder.finish_cloned();
assert_eq!(
values.len(), len * self.list_len as usize,
"Length of the child array ({}) must be the multiple of the value length ({}) and the array length ({}).",
values.len(),
self.list_len,
len,
);
let field = self
.field
.clone()
.unwrap_or_else(|| Arc::new(Field::new("item", values.data_type().clone(), true)));
FixedSizeListArray::new(field, self.list_len, values, nulls)
}
/// Returns the current null buffer as a slice
pub fn validity_slice(&self) -> Option<&[u8]> {
self.null_buffer_builder.as_slice()
}
}
#[cfg(test)]
mod tests {
use super::*;
use arrow_schema::DataType;
use crate::builder::Int32Builder;
use crate::Array;
use crate::Int32Array;
// Builds a 4-slot FixedSizeList<Int32, 3> fixture. The two flags control
// whether slot 2 is a null list and whether slot 3 contains a null element.
fn make_list_builder(
    include_null_element: bool,
    include_null_in_values: bool,
) -> FixedSizeListBuilder<crate::builder::PrimitiveBuilder<crate::types::Int32Type>> {
    let values_builder = Int32Builder::new();
    let mut builder = FixedSizeListBuilder::new(values_builder, 3);
    builder.values().append_value(0);
    builder.values().append_value(1);
    builder.values().append_value(2);
    builder.append(true);
    builder.values().append_value(2);
    builder.values().append_value(3);
    builder.values().append_value(4);
    builder.append(true);
    if include_null_element {
        // Child values for a null list slot are still appended (as nulls) to
        // keep the flattened child length a multiple of the list length.
        builder.values().append_null();
        builder.values().append_null();
        builder.values().append_null();
        builder.append(false);
    } else {
        builder.values().append_value(2);
        builder.values().append_value(3);
        builder.values().append_value(4);
        builder.append(true);
    }
    if include_null_in_values {
        builder.values().append_value(3);
        builder.values().append_null();
        builder.values().append_value(5);
        builder.append(true);
    } else {
        builder.values().append_value(3);
        builder.values().append_value(4);
        builder.values().append_value(5);
        builder.append(true);
    }
    builder
}
#[test]
fn test_fixed_size_list_array_builder() {
    // Fixture has a null list slot and a null element; only the list-level
    // null counts toward null_count.
    let mut builder = make_list_builder(true, true);
    let list_array = builder.finish();
    assert_eq!(DataType::Int32, list_array.value_type());
    assert_eq!(4, list_array.len());
    assert_eq!(1, list_array.null_count());
    assert_eq!(6, list_array.value_offset(2));
    assert_eq!(3, list_array.value_length());
}
#[test]
fn test_fixed_size_list_array_builder_with_field() {
    // A non-nullable custom field is accepted when the fixture has no nulls.
    let builder = make_list_builder(false, false);
    let mut builder = builder.with_field(Field::new("list_element", DataType::Int32, false));
    let list_array = builder.finish();
    assert_eq!(DataType::Int32, list_array.value_type());
    assert_eq!(4, list_array.len());
    assert_eq!(0, list_array.null_count());
    assert_eq!(6, list_array.value_offset(2));
    assert_eq!(3, list_array.value_length());
}
#[test]
fn test_fixed_size_list_array_builder_with_field_and_null() {
    // A null list slot is fine with a non-nullable element field, as long as
    // the element values themselves contain no unmasked nulls.
    let builder = make_list_builder(true, false);
    let mut builder = builder.with_field(Field::new("list_element", DataType::Int32, false));
    let list_array = builder.finish();
    assert_eq!(DataType::Int32, list_array.value_type());
    assert_eq!(4, list_array.len());
    assert_eq!(1, list_array.null_count());
    assert_eq!(6, list_array.value_offset(2));
    assert_eq!(3, list_array.value_length());
}
#[test]
#[should_panic(expected = "Found unmasked nulls for non-nullable FixedSizeListArray")]
fn test_fixed_size_list_array_builder_with_field_null_panic() {
    // Null element values under a non-nullable element field must panic.
    let builder = make_list_builder(true, true);
    let mut builder = builder.with_field(Field::new("list_item", DataType::Int32, false));
    builder.finish();
}
#[test]
#[should_panic(expected = "FixedSizeListArray expected data type Int64 got Int32")]
fn test_fixed_size_list_array_builder_with_field_type_panic() {
    // The child builder produces Int32 values, but the supplied field claims
    // Int64, so `finish` must panic on the mismatch.
    let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 3)
        .with_field(Field::new("list_item", DataType::Int64, true));

    // [[0, 1, 2], null, [3, 4, 5]]
    for v in [0, 1, 2] {
        builder.values().append_value(v);
    }
    builder.append(true);
    for _ in 0..3 {
        builder.values().append_null();
    }
    builder.append(false);
    for v in [3, 4, 5] {
        builder.values().append_value(v);
    }
    builder.append(true);

    builder.finish();
}
#[test]
fn test_fixed_size_list_array_builder_cloned_with_field() {
    // `finish_cloned` takes `&self`, so no `mut` binding is needed.
    let builder = make_list_builder(true, true)
        .with_field(Field::new("list_element", DataType::Int32, true));
    let arr = builder.finish_cloned();

    assert_eq!(arr.value_type(), DataType::Int32);
    assert_eq!(arr.len(), 4);
    assert_eq!(arr.null_count(), 1);
    assert_eq!(arr.value_offset(2), 6);
    assert_eq!(arr.value_length(), 3);
}
#[test]
#[should_panic(expected = "Found unmasked nulls for non-nullable FixedSizeListArray")]
fn test_fixed_size_list_array_builder_cloned_with_field_null_panic() {
    // Same unmasked-null scenario as the `finish` variant, exercised through
    // the non-destructive `finish_cloned` path.
    let builder = make_list_builder(true, true);
    let builder = builder.with_field(Field::new("list_item", DataType::Int32, false));
    builder.finish_cloned();
}
#[test]
fn test_fixed_size_list_array_builder_cloned_with_field_and_null() {
    // Mirror of `test_fixed_size_list_array_builder_with_field_and_null`, but
    // exercising the non-destructive `finish_cloned` path. (Fix: the original
    // body called `finish()`, leaving the cloned path untested despite the
    // test's name.)
    let builder = make_list_builder(true, false);
    let builder = builder.with_field(Field::new("list_element", DataType::Int32, false));
    let list_array = builder.finish_cloned();
    assert_eq!(DataType::Int32, list_array.value_type());
    assert_eq!(4, list_array.len());
    assert_eq!(1, list_array.null_count());
    assert_eq!(6, list_array.value_offset(2));
    assert_eq!(3, list_array.value_length());
}
#[test]
#[should_panic(expected = "FixedSizeListArray expected data type Int64 got Int32")]
fn test_fixed_size_list_array_builder_cloned_with_field_type_panic() {
    // Int32 child data with an Int64 field must also panic via `finish_cloned`.
    let builder = make_list_builder(false, false);
    let builder = builder.with_field(Field::new("list_item", DataType::Int64, true));
    builder.finish_cloned();
}
#[test]
fn test_fixed_size_list_array_builder_finish_cloned() {
    let mut builder = make_list_builder(true, true);

    // `finish_cloned` snapshots the builder without resetting it.
    let snapshot = builder.finish_cloned();
    assert_eq!(snapshot.value_type(), DataType::Int32);
    assert_eq!(snapshot.len(), 4);
    assert_eq!(snapshot.null_count(), 1);
    assert_eq!(snapshot.value_length(), 3);

    // Appending afterwards extends the contents already captured above.
    builder.values().append_value(6);
    builder.values().append_value(7);
    builder.values().append_null();
    builder.append(true);
    for _ in 0..3 {
        builder.values().append_null();
    }
    builder.append(false);

    let arr = builder.finish();
    assert_eq!(arr.value_type(), DataType::Int32);
    assert_eq!(arr.len(), 6);
    assert_eq!(arr.null_count(), 2);
    assert_eq!(arr.value_offset(2), 6);
    assert_eq!(arr.value_length(), 3);
}
#[test]
fn test_fixed_size_list_array_builder_with_field_empty() {
    // An empty builder with a custom field finishes to an empty array.
    let mut builder = FixedSizeListBuilder::new(Int32Array::builder(0), 3)
        .with_field(Field::new("list_item", DataType::Int32, false));
    assert!(builder.is_empty());

    let arr = builder.finish();
    assert_eq!(arr.len(), 0);
    assert_eq!(builder.len(), 0);
}
#[test]
fn test_fixed_size_list_array_builder_cloned_with_field_empty() {
    // As above, but through the non-destructive `finish_cloned` path.
    let builder = FixedSizeListBuilder::new(Int32Array::builder(0), 3)
        .with_field(Field::new("list_item", DataType::Int32, false));
    assert!(builder.is_empty());

    let arr = builder.finish_cloned();
    assert_eq!(arr.len(), 0);
    assert_eq!(builder.len(), 0);
}
#[test]
fn test_fixed_size_list_array_builder_empty() {
    // Finishing without appending anything yields a zero-length array and
    // leaves the builder empty.
    let mut builder = FixedSizeListBuilder::new(Int32Array::builder(5), 3);
    assert!(builder.is_empty());

    let arr = builder.finish();
    assert_eq!(arr.len(), 0);
    assert_eq!(builder.len(), 0);
}
#[test]
fn test_fixed_size_list_array_builder_finish() {
    let mut builder = FixedSizeListBuilder::new(Int32Array::builder(5), 3);
    for chunk in [[1, 2, 3], [4, 5, 6]] {
        builder.values().append_slice(&chunk);
        builder.append(true);
    }

    // `finish` hands back the accumulated rows and resets the builder.
    let first = builder.finish();
    assert_eq!(first.len(), 2);
    assert_eq!(builder.len(), 0);

    // The reset builder can be reused for a fresh batch.
    builder.values().append_slice(&[7, 8, 9]);
    builder.append(true);
    let second = builder.finish();
    assert_eq!(second.len(), 1);
    assert_eq!(builder.len(), 0);
}
#[test]
#[should_panic(
    expected = "Length of the child array (10) must be the multiple of the value length (3) and the array length (3)."
)]
fn test_fixed_size_list_array_builder_fail() {
    let values_builder = Int32Array::builder(5);
    let mut builder = FixedSizeListBuilder::new(values_builder, 3);
    builder.values().append_slice(&[1, 2, 3]);
    builder.append(true);
    builder.values().append_slice(&[4, 5, 6]);
    builder.append(true);
    // Four values for a fixed size of three: the child length (10) is no
    // longer a multiple of the list size, so `finish` must panic.
    builder.values().append_slice(&[7, 8, 9, 10]);
    builder.append(true);
    builder.finish();
}
}

Просмотреть файл

@ -0,0 +1,514 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::types::bytes::ByteArrayNativeType;
use std::{any::Any, sync::Arc};
use crate::{
types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type},
ArrayRef, ArrowPrimitiveType, RunArray,
};
use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
use arrow_buffer::ArrowNativeType;
/// Builder for [`RunArray`] of [`GenericByteArray`](crate::array::GenericByteArray)
///
/// # Example:
///
/// ```
///
/// # use arrow_array::builder::GenericByteRunBuilder;
/// # use arrow_array::{GenericByteArray, BinaryArray};
/// # use arrow_array::types::{BinaryType, Int16Type};
/// # use arrow_array::{Array, Int16Array};
/// # use arrow_array::cast::AsArray;
///
/// let mut builder =
/// GenericByteRunBuilder::<Int16Type, BinaryType>::new();
/// builder.extend([Some(b"abc"), Some(b"abc"), None, Some(b"def")].into_iter());
/// builder.append_value(b"def");
/// builder.append_null();
/// let array = builder.finish();
///
/// assert_eq!(array.run_ends().values(), &[2, 3, 5, 6]);
///
/// let av = array.values();
///
/// assert!(!av.is_null(0));
/// assert!(av.is_null(1));
/// assert!(!av.is_null(2));
/// assert!(av.is_null(3));
///
/// // Values are polymorphic and so require a downcast.
/// let ava: &BinaryArray = av.as_binary();
///
/// assert_eq!(ava.value(0), b"abc");
/// assert_eq!(ava.value(2), b"def");
/// ```
#[derive(Debug)]
pub struct GenericByteRunBuilder<R, V>
where
    R: ArrowPrimitiveType,
    V: ByteArrayType,
{
    // One run-end index per completed run.
    run_ends_builder: PrimitiveBuilder<R>,
    // One value per completed run (null when the run was a run of nulls).
    values_builder: GenericByteBuilder<V>,
    // Raw bytes of the value belonging to the currently open run.
    current_value: Vec<u8>,
    // Whether the open run holds a value; `false` means a run of nulls.
    has_current_value: bool,
    // Logical length appended so far, i.e. the run end of the open run.
    current_run_end_index: usize,
    // Run end recorded for the last *completed* run.
    prev_run_end_index: usize,
}
impl<R, V> Default for GenericByteRunBuilder<R, V>
where
    R: ArrowPrimitiveType,
    V: ByteArrayType,
{
    /// Equivalent to [`GenericByteRunBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl<R, V> GenericByteRunBuilder<R, V>
where
R: ArrowPrimitiveType,
V: ByteArrayType,
{
/// Creates a new `GenericByteRunBuilder`
pub fn new() -> Self {
Self {
run_ends_builder: PrimitiveBuilder::new(),
values_builder: GenericByteBuilder::<V>::new(),
current_value: Vec::new(),
has_current_value: false,
current_run_end_index: 0,
prev_run_end_index: 0,
}
}
/// Creates a new `GenericByteRunBuilder` with the provided capacity
///
/// `capacity`: the expected number of run-end encoded values.
/// `data_capacity`: the expected number of bytes of run end encoded values
pub fn with_capacity(capacity: usize, data_capacity: usize) -> Self {
Self {
run_ends_builder: PrimitiveBuilder::with_capacity(capacity),
values_builder: GenericByteBuilder::<V>::with_capacity(capacity, data_capacity),
current_value: Vec::new(),
has_current_value: false,
current_run_end_index: 0,
prev_run_end_index: 0,
}
}
}
// Type-erased [`ArrayBuilder`] API, delegating to the inherent methods.
impl<R, V> ArrayBuilder for GenericByteRunBuilder<R, V>
where
    R: RunEndIndexType,
    V: ByteArrayType,
{
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }
    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
    /// Returns the length of logical array encoded by
    /// the eventual runs array.
    fn len(&self) -> usize {
        self.current_run_end_index
    }
    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        // Resolves to the inherent `finish` returning `RunArray<R>`, not this
        // trait method (inherent methods take precedence).
        Arc::new(self.finish())
    }
    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        // Likewise resolves to the inherent `finish_cloned`.
        Arc::new(self.finish_cloned())
    }
}
impl<R, V> GenericByteRunBuilder<R, V>
where
    R: RunEndIndexType,
    V: ByteArrayType,
{
    /// Appends optional value to the logical array encoded by the RunArray.
    pub fn append_option(&mut self, input_value: Option<impl AsRef<V::Native>>) {
        match input_value {
            Some(value) => self.append_value(value),
            None => self.append_null(),
        }
    }

    /// Appends value to the logical array encoded by the RunArray.
    pub fn append_value(&mut self, input_value: impl AsRef<V::Native>) {
        let value: &[u8] = input_value.as_ref().as_ref();
        if !self.has_current_value {
            // The open run (if any) is a run of nulls: close it and start a
            // new run holding this value.
            self.append_run_end();
            self.current_value.extend_from_slice(value);
            self.has_current_value = true;
        } else if self.current_value.as_slice() != value {
            // Value changed: close the previous run and start a new one.
            self.append_run_end();
            self.current_value.clear();
            self.current_value.extend_from_slice(value);
        }
        // In every case the logical length grows by one.
        self.current_run_end_index += 1;
    }

    /// Appends null to the logical array encoded by the RunArray.
    pub fn append_null(&mut self) {
        if self.has_current_value {
            // Close the open run of values; the nulls form their own run.
            self.append_run_end();
            self.current_value.clear();
            self.has_current_value = false;
        }
        self.current_run_end_index += 1;
    }

    /// Creates the RunArray and resets the builder.
    /// Panics if RunArray cannot be built.
    pub fn finish(&mut self) -> RunArray<R> {
        // write the last (still open) run end to the array.
        self.append_run_end();

        // reset the run tracking state so the builder can be reused.
        self.current_value.clear();
        self.has_current_value = false;
        self.current_run_end_index = 0;
        self.prev_run_end_index = 0;

        // build the run encoded array by adding run_ends and values array as its children.
        let run_ends_array = self.run_ends_builder.finish();
        let values_array = self.values_builder.finish();
        RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap()
    }

    /// Creates the RunArray and without resetting the builder.
    /// Panics if RunArray cannot be built.
    pub fn finish_cloned(&self) -> RunArray<R> {
        let mut run_ends_array = self.run_ends_builder.finish_cloned();
        let mut values_array = self.values_builder.finish_cloned();

        // Add current run if one exists. The still-open run is not in the
        // child builders yet, so append it via temporary builders recovered
        // from the cloned arrays, leaving `self` untouched.
        if self.prev_run_end_index != self.current_run_end_index {
            let mut run_end_builder = run_ends_array.into_builder().unwrap();
            let mut values_builder = values_array.into_builder().unwrap();
            self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder);
            run_ends_array = run_end_builder.finish();
            values_array = values_builder.finish();
        }

        RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap()
    }

    // Appends the current run to the array.
    fn append_run_end(&mut self) {
        // empty array or the function called without appending any value.
        if self.prev_run_end_index == self.current_run_end_index {
            return;
        }
        let run_end_index = self.run_end_index_as_native();
        self.run_ends_builder.append_value(run_end_index);
        if self.has_current_value {
            let slice = self.current_value.as_slice();
            let native = unsafe {
                // Safety:
                // As self.current_value is created from V::Native. The value V::Native can be
                // built back from the bytes without validations
                V::Native::from_bytes_unchecked(slice)
            };
            self.values_builder.append_value(native);
        } else {
            // A run of nulls is encoded as a single null in the values child.
            self.values_builder.append_null();
        }
        self.prev_run_end_index = self.current_run_end_index;
    }

    // Similar to `append_run_end` but on custom builders.
    // Used in `finish_cloned` which is not supposed to mutate `self`.
    fn append_run_end_with_builders(
        &self,
        run_ends_builder: &mut PrimitiveBuilder<R>,
        values_builder: &mut GenericByteBuilder<V>,
    ) {
        let run_end_index = self.run_end_index_as_native();
        run_ends_builder.append_value(run_end_index);
        if self.has_current_value {
            let slice = self.current_value.as_slice();
            let native = unsafe {
                // Safety:
                // As self.current_value is created from V::Native. The value V::Native can be
                // built back from the bytes without validations
                V::Native::from_bytes_unchecked(slice)
            };
            values_builder.append_value(native);
        } else {
            values_builder.append_null();
        }
    }

    // Converts the current logical length to the run-end native type,
    // panicking if it does not fit (e.g. exceeds the Int16 range).
    fn run_end_index_as_native(&self) -> R::Native {
        R::Native::from_usize(self.current_run_end_index).unwrap_or_else(|| {
            panic!(
                "Cannot convert the value {} from `usize` to native form of arrow datatype {}",
                self.current_run_end_index,
                R::DATA_TYPE
            )
        })
    }
}
impl<R, V, S> Extend<Option<S>> for GenericByteRunBuilder<R, V>
where
    R: RunEndIndexType,
    V: ByteArrayType,
    S: AsRef<V::Native>,
{
    /// Appends each optional value in the iterator to the logical array.
    fn extend<T: IntoIterator<Item = Option<S>>>(&mut self, iter: T) {
        iter.into_iter().for_each(|value| self.append_option(value));
    }
}
/// Builder for [`RunArray`] of [`StringArray`](crate::array::StringArray)
///
/// See [`GenericByteRunBuilder`] for the full builder API.
///
/// ```
/// // Create a run-end encoded array with run-end indexes data type as `i16`.
/// // The encoded values are Strings.
///
/// # use arrow_array::builder::StringRunBuilder;
/// # use arrow_array::{Int16Array, StringArray};
/// # use arrow_array::types::Int16Type;
/// # use arrow_array::cast::AsArray;
/// #
/// let mut builder = StringRunBuilder::<Int16Type>::new();
///
/// // The builder builds the dictionary value by value
/// builder.append_value("abc");
/// builder.append_null();
/// builder.extend([Some("def"), Some("def"), Some("abc")]);
/// let array = builder.finish();
///
/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
///
/// // Values are polymorphic and so require a downcast.
/// let av = array.values();
/// let ava: &StringArray = av.as_string::<i32>();
///
/// assert_eq!(ava.value(0), "abc");
/// assert!(av.is_null(1));
/// assert_eq!(ava.value(2), "def");
/// assert_eq!(ava.value(3), "abc");
///
/// ```
pub type StringRunBuilder<K> = GenericByteRunBuilder<K, Utf8Type>;

/// Builder for [`RunArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
///
/// Same as [`StringRunBuilder`] but for the large-offset string type.
pub type LargeStringRunBuilder<K> = GenericByteRunBuilder<K, LargeUtf8Type>;
/// Builder for [`RunArray`] of [`BinaryArray`](crate::array::BinaryArray)
///
/// See [`GenericByteRunBuilder`] for the full builder API.
///
/// ```
/// // Create a run-end encoded array with run-end indexes data type as `i16`.
/// // The encoded data is binary values.
///
/// # use arrow_array::builder::BinaryRunBuilder;
/// # use arrow_array::{BinaryArray, Int16Array};
/// # use arrow_array::cast::AsArray;
/// # use arrow_array::types::Int16Type;
///
/// let mut builder = BinaryRunBuilder::<Int16Type>::new();
///
/// // The builder builds the dictionary value by value
/// builder.append_value(b"abc");
/// builder.append_null();
/// builder.extend([Some(b"def"), Some(b"def"), Some(b"abc")]);
/// let array = builder.finish();
///
/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
///
/// // Values are polymorphic and so require a downcast.
/// let av = array.values();
/// let ava: &BinaryArray = av.as_binary();
///
/// assert_eq!(ava.value(0), b"abc");
/// assert!(av.is_null(1));
/// assert_eq!(ava.value(2), b"def");
/// assert_eq!(ava.value(3), b"abc");
///
/// ```
pub type BinaryRunBuilder<K> = GenericByteRunBuilder<K, BinaryType>;

/// Builder for [`RunArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
///
/// Same as [`BinaryRunBuilder`] but for the large-offset binary type.
pub type LargeBinaryRunBuilder<K> = GenericByteRunBuilder<K, LargeBinaryType>;
#[cfg(test)]
mod tests {
    use super::*;
    use crate::array::Array;
    use crate::cast::AsArray;
    use crate::types::{Int16Type, Int32Type};
    use crate::GenericByteArray;
    use crate::Int16RunArray;

    // Shared driver: appends runs of values[0] x3, null x2, values[1] x2 and
    // values[2] x4, then checks the resulting run ends and run values.
    fn test_bytes_run_builder<T>(values: Vec<&T::Native>)
    where
        T: ByteArrayType,
        <T as ByteArrayType>::Native: PartialEq,
        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
    {
        let mut builder = GenericByteRunBuilder::<Int16Type, T>::new();
        builder.append_value(values[0]);
        builder.append_value(values[0]);
        builder.append_value(values[0]);
        builder.append_null();
        builder.append_null();
        builder.append_value(values[1]);
        builder.append_value(values[1]);
        builder.append_value(values[2]);
        builder.append_value(values[2]);
        builder.append_value(values[2]);
        builder.append_value(values[2]);
        let array = builder.finish();

        // Nulls are encoded in the values child, so the RunArray itself
        // reports a null count of zero.
        assert_eq!(array.len(), 11);
        assert_eq!(array.null_count(), 0);
        assert_eq!(array.run_ends().values(), &[3, 5, 7, 11]);

        // Values are polymorphic and so require a downcast.
        let av = array.values();
        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();

        assert_eq!(*ava.value(0), *values[0]);
        assert!(ava.is_null(1));
        assert_eq!(*ava.value(2), *values[1]);
        assert_eq!(*ava.value(3), *values[2]);
    }

    #[test]
    fn test_string_run_builder() {
        test_bytes_run_builder::<Utf8Type>(vec!["abc", "def", "ghi"]);
    }

    #[test]
    fn test_string_run_builder_with_empty_strings() {
        // Empty strings are ordinary values and must form their own runs.
        test_bytes_run_builder::<Utf8Type>(vec!["abc", "", "ghi"]);
    }

    #[test]
    fn test_binary_run_builder() {
        test_bytes_run_builder::<BinaryType>(vec![b"abc", b"def", b"ghi"]);
    }

    // Shared driver verifying that `finish_cloned` snapshots the still-open
    // run while leaving the builder usable for further appends.
    fn test_bytes_run_builder_finish_cloned<T>(values: Vec<&T::Native>)
    where
        T: ByteArrayType,
        <T as ByteArrayType>::Native: PartialEq,
        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
    {
        let mut builder = GenericByteRunBuilder::<Int16Type, T>::new();
        builder.append_value(values[0]);
        builder.append_null();
        builder.append_value(values[1]);
        builder.append_value(values[1]);
        builder.append_value(values[0]);
        let mut array: Int16RunArray = builder.finish_cloned();

        assert_eq!(array.len(), 5);
        assert_eq!(array.null_count(), 0);
        assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);

        // Values are polymorphic and so require a downcast.
        let av = array.values();
        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();

        assert_eq!(ava.value(0), values[0]);
        assert!(ava.is_null(1));
        assert_eq!(ava.value(2), values[1]);
        assert_eq!(ava.value(3), values[0]);

        // Append last value before `finish_cloned` (`value[0]`) again and ensure it has only
        // one entry in final output.
        builder.append_value(values[0]);
        builder.append_value(values[0]);
        builder.append_value(values[1]);
        array = builder.finish();

        assert_eq!(array.len(), 8);
        assert_eq!(array.null_count(), 0);
        assert_eq!(array.run_ends().values(), &[1, 2, 4, 7, 8]);

        // Values are polymorphic and so require a downcast.
        let av2 = array.values();
        let ava2: &GenericByteArray<T> =
            av2.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();

        assert_eq!(ava2.value(0), values[0]);
        assert!(ava2.is_null(1));
        assert_eq!(ava2.value(2), values[1]);
        // The value appended before and after `finish_cloned` has only one entry.
        assert_eq!(ava2.value(3), values[0]);
        assert_eq!(ava2.value(4), values[1]);
    }

    #[test]
    fn test_string_run_builder_finish_cloned() {
        test_bytes_run_builder_finish_cloned::<Utf8Type>(vec!["abc", "def", "ghi"]);
    }

    #[test]
    fn test_binary_run_builder_finish_cloned() {
        test_bytes_run_builder_finish_cloned::<BinaryType>(vec![b"abc", b"def", b"ghi"]);
    }

    #[test]
    fn test_extend() {
        // Runs may span multiple `extend` calls: the trailing "b" run of the
        // first call merges with the leading "b" of the second.
        let mut builder = StringRunBuilder::<Int32Type>::new();
        builder.extend(["a", "a", "a", "", "", "b", "b"].into_iter().map(Some));
        builder.extend(["b", "cupcakes", "cupcakes"].into_iter().map(Some));
        let array = builder.finish();

        assert_eq!(array.len(), 10);
        assert_eq!(array.run_ends().values(), &[3, 5, 8, 10]);

        let str_array = array.values().as_string::<i32>();
        assert_eq!(str_array.value(0), "a");
        assert_eq!(str_array.value(1), "");
        assert_eq!(str_array.value(2), "b");
        assert_eq!(str_array.value(3), "cupcakes");
    }
}

Просмотреть файл

@ -0,0 +1,506 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait};
use arrow_buffer::NullBufferBuilder;
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
use arrow_data::ArrayDataBuilder;
use std::any::Any;
use std::fmt::Write;
use std::sync::Arc;
/// Builder for [`GenericByteArray`]
///
/// For building strings, see docs on [`GenericStringBuilder`].
/// For building binary, see docs on [`GenericBinaryBuilder`].
pub struct GenericByteBuilder<T: ByteArrayType> {
    // Raw value bytes of all appended items, concatenated.
    value_builder: UInt8BufferBuilder,
    // Item boundaries into `value_builder`; always one more entry than items
    // (seeded with a leading 0 in `with_capacity`).
    offsets_builder: BufferBuilder<T::Offset>,
    // Validity bitmap; finishes to `None` when no null was appended.
    null_buffer_builder: NullBufferBuilder,
}
impl<T: ByteArrayType> GenericByteBuilder<T> {
    /// Creates a new [`GenericByteBuilder`].
    pub fn new() -> Self {
        Self::with_capacity(1024, 1024)
    }

    /// Creates a new [`GenericByteBuilder`].
    ///
    /// - `item_capacity` is the number of items to pre-allocate.
    ///   The size of the preallocated buffer of offsets is the number of items plus one.
    /// - `data_capacity` is the total number of bytes of data to pre-allocate
    ///   (for all items, not per item).
    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
        let mut offsets_builder = BufferBuilder::<T::Offset>::new(item_capacity + 1);
        // Seed the leading zero so item i spans offsets[i]..offsets[i + 1].
        offsets_builder.append(T::Offset::from_usize(0).unwrap());
        Self {
            value_builder: UInt8BufferBuilder::new(data_capacity),
            offsets_builder,
            null_buffer_builder: NullBufferBuilder::new(item_capacity),
        }
    }

    /// Creates a new [`GenericByteBuilder`] from buffers.
    ///
    /// # Safety
    /// This doesn't verify buffer contents as it assumes the buffers are from existing and
    /// valid [`GenericByteArray`].
    pub unsafe fn new_from_buffer(
        offsets_buffer: MutableBuffer,
        value_buffer: MutableBuffer,
        null_buffer: Option<MutableBuffer>,
    ) -> Self {
        let offsets_builder = BufferBuilder::<T::Offset>::new_from_buffer(offsets_buffer);
        let value_builder = BufferBuilder::<u8>::new_from_buffer(value_buffer);

        // The offsets buffer holds item-count + 1 entries, hence `len() - 1`.
        let null_buffer_builder = null_buffer
            .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1))
            .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1));

        Self {
            offsets_builder,
            value_builder,
            null_buffer_builder,
        }
    }

    // Offset at which the next appended item ends (= bytes written so far).
    #[inline]
    fn next_offset(&self) -> T::Offset {
        T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow")
    }

    /// Appends a value into the builder.
    ///
    /// # Panics
    ///
    /// Panics if the resulting length of [`Self::values_slice`] would exceed `T::Offset::MAX`
    #[inline]
    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
        self.value_builder.append_slice(value.as_ref().as_ref());
        self.null_buffer_builder.append(true);
        // Record the end offset after the bytes have been written.
        self.offsets_builder.append(self.next_offset());
    }

    /// Append an `Option` value into the builder.
    #[inline]
    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
        match value {
            None => self.append_null(),
            Some(v) => self.append_value(v),
        };
    }

    /// Append a null value into the builder.
    #[inline]
    pub fn append_null(&mut self) {
        self.null_buffer_builder.append(false);
        // Nulls still get an offset entry (a zero-length slot).
        self.offsets_builder.append(self.next_offset());
    }

    /// Builds the [`GenericByteArray`] and reset this builder.
    pub fn finish(&mut self) -> GenericByteArray<T> {
        let array_type = T::DATA_TYPE;
        let array_builder = ArrayDataBuilder::new(array_type)
            .len(self.len())
            .add_buffer(self.offsets_builder.finish())
            .add_buffer(self.value_builder.finish())
            .nulls(self.null_buffer_builder.finish());

        // The child builders were reset by `finish` above, so `next_offset()`
        // is 0 here: this re-seeds the leading zero offset for reuse.
        self.offsets_builder.append(self.next_offset());
        let array_data = unsafe { array_builder.build_unchecked() };
        GenericByteArray::from(array_data)
    }

    /// Builds the [`GenericByteArray`] without resetting the builder.
    pub fn finish_cloned(&self) -> GenericByteArray<T> {
        let array_type = T::DATA_TYPE;
        // Copy the current buffers; the builder keeps its state untouched.
        let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice());
        let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice());
        let array_builder = ArrayDataBuilder::new(array_type)
            .len(self.len())
            .add_buffer(offset_buffer)
            .add_buffer(value_buffer)
            .nulls(self.null_buffer_builder.finish_cloned());
        let array_data = unsafe { array_builder.build_unchecked() };
        GenericByteArray::from(array_data)
    }

    /// Returns the current values buffer as a slice
    pub fn values_slice(&self) -> &[u8] {
        self.value_builder.as_slice()
    }

    /// Returns the current offsets buffer as a slice
    pub fn offsets_slice(&self) -> &[T::Offset] {
        self.offsets_builder.as_slice()
    }

    /// Returns the current null buffer as a slice
    pub fn validity_slice(&self) -> Option<&[u8]> {
        self.null_buffer_builder.as_slice()
    }

    /// Returns the current null buffer as a mutable slice
    pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
        self.null_buffer_builder.as_slice_mut()
    }
}
impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
    /// Formats as `<offset-prefix><type-prefix>Builder { ... }`.
    // NOTE(review): the PREFIX strings come from the offset and byte types
    // (presumably e.g. "Large" + "String") — confirm in the types module.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?;
        f.debug_struct("")
            .field("value_builder", &self.value_builder)
            .field("offsets_builder", &self.offsets_builder)
            .field("null_buffer_builder", &self.null_buffer_builder)
            .finish()
    }
}
impl<T: ByteArrayType> Default for GenericByteBuilder<T> {
    /// Equivalent to [`GenericByteBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
// Type-erased [`ArrayBuilder`] API, delegating to the inherent methods.
impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> {
    /// Returns the number of binary slots in the builder
    fn len(&self) -> usize {
        self.null_buffer_builder.len()
    }
    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        // Resolves to the inherent `finish` returning `GenericByteArray<T>`.
        Arc::new(self.finish())
    }
    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        Arc::new(self.finish_cloned())
    }
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }
    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
}
impl<T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>> for GenericByteBuilder<T> {
    /// Appends each optional value from the iterator.
    #[inline]
    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
        iter.into_iter().for_each(|value| self.append_option(value));
    }
}
/// Array builder for [`GenericStringArray`][crate::GenericStringArray]
///
/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with
/// [`GenericByteBuilder::append_null`].
///
/// Additionally, implements [`std::fmt::Write`] with any written data included in the next
/// appended value. This allows use with [`std::fmt::Display`] without intermediate allocations
///
/// # Example
/// ```
/// # use std::fmt::Write;
/// # use arrow_array::builder::GenericStringBuilder;
/// let mut builder = GenericStringBuilder::<i32>::new();
///
/// // Write data
/// write!(builder, "foo").unwrap();
/// write!(builder, "bar").unwrap();
///
/// // Finish value
/// builder.append_value("baz");
///
/// // Write second value
/// write!(builder, "v2").unwrap();
/// builder.append_value("");
///
/// let array = builder.finish();
/// assert_eq!(array.value(0), "foobarbaz");
/// assert_eq!(array.value(1), "v2");
/// ```
///
/// See also [`GenericBinaryBuilder`] for building binary data.
pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>;
impl<O: OffsetSizeTrait> Write for GenericStringBuilder<O> {
    /// Buffers `s` into the in-progress value: the bytes become part of the
    /// next value completed via `append_value` (no offset is recorded here).
    fn write_str(&mut self, s: &str) -> std::fmt::Result {
        self.value_builder.append_slice(s.as_bytes());
        Ok(())
    }
}
/// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
///
/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with
/// [`GenericByteBuilder::append_null`].
///
/// # Example
/// ```
/// # use arrow_array::builder::GenericBinaryBuilder;
/// let mut builder = GenericBinaryBuilder::<i32>::new();
///
/// // Write data
/// builder.append_value("foo");
///
/// // Write second value
/// builder.append_value(&[0,1,2]);
///
/// let array = builder.finish();
/// // binary values
/// assert_eq!(array.value(0), b"foo");
/// assert_eq!(array.value(1), b"\x00\x01\x02");
/// ```
///
/// See also [`GenericStringBuilder`] for building UTF-8 string data.
pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>;
#[cfg(test)]
mod tests {
use super::*;
use crate::array::Array;
use crate::GenericStringArray;
fn _test_generic_binary_builder<O: OffsetSizeTrait>() {
    // Mix of non-empty, empty and null values.
    let mut builder = GenericBinaryBuilder::<O>::new();
    for value in [Some(&b"hello"[..]), Some(&b""[..]), None, Some(&b"rust"[..])] {
        builder.append_option(value);
    }
    let array = builder.finish();

    assert_eq!(4, array.len());
    assert_eq!(1, array.null_count());
    assert_eq!(b"hello", array.value(0));
    assert_eq!([] as [u8; 0], array.value(1));
    assert!(array.is_null(2));
    assert_eq!(b"rust", array.value(3));
    // Offsets: "hello" ends at 5; the null at index 2 adds a zero-length slot.
    assert_eq!(O::from_usize(5).unwrap(), array.value_offsets()[2]);
    assert_eq!(O::from_usize(4).unwrap(), array.value_length(3));
}
#[test]
fn test_binary_builder() {
    // i32 offsets (BinaryArray).
    _test_generic_binary_builder::<i32>()
}
#[test]
fn test_large_binary_builder() {
    // i64 offsets (LargeBinaryArray).
    _test_generic_binary_builder::<i64>()
}
fn _test_generic_binary_builder_all_nulls<O: OffsetSizeTrait>() {
    // A builder that only ever saw nulls still reports its length.
    let mut builder = GenericBinaryBuilder::<O>::new();
    for _ in 0..3 {
        builder.append_null();
    }
    assert_eq!(builder.len(), 3);
    assert!(!builder.is_empty());

    let array = builder.finish();
    assert_eq!(array.null_count(), 3);
    assert_eq!(array.len(), 3);
    for i in 0..3 {
        assert!(array.is_null(i));
    }
}
#[test]
fn test_binary_builder_all_nulls() {
    // i32 offsets (BinaryArray).
    _test_generic_binary_builder_all_nulls::<i32>()
}
#[test]
fn test_large_binary_builder_all_nulls() {
    // i64 offsets (LargeBinaryArray).
    _test_generic_binary_builder_all_nulls::<i64>()
}
fn _test_generic_binary_builder_reset<O: OffsetSizeTrait>() {
    // `finish` must reset the builder so it can be reused from scratch.
    let mut builder = GenericBinaryBuilder::<O>::new();
    builder.append_value(b"hello");
    builder.append_value(b"");
    builder.append_null();
    builder.append_value(b"rust");
    builder.finish();
    assert!(builder.is_empty());

    builder.append_value(b"parquet");
    builder.append_null();
    builder.append_value(b"arrow");
    builder.append_value(b"");
    let array = builder.finish();

    // Only the post-reset values are present.
    assert_eq!(4, array.len());
    assert_eq!(1, array.null_count());
    assert_eq!(b"parquet", array.value(0));
    assert!(array.is_null(1));
    assert_eq!(b"arrow", array.value(2));
    // Fix: the empty value was appended at index 3; the original asserted
    // `value(1)`, which is the null slot already covered by `is_null(1)` and
    // trivially empty, so the empty value was never actually checked.
    assert_eq!(b"", array.value(3));
    assert_eq!(O::zero(), array.value_offsets()[0]);
    assert_eq!(O::from_usize(7).unwrap(), array.value_offsets()[2]);
    assert_eq!(O::from_usize(5).unwrap(), array.value_length(2));
}
#[test]
fn test_binary_builder_reset() {
    // i32 offsets (BinaryArray).
    _test_generic_binary_builder_reset::<i32>()
}
#[test]
fn test_large_binary_builder_reset() {
    // i64 offsets (LargeBinaryArray).
    _test_generic_binary_builder_reset::<i64>()
}
fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
    let mut builder = GenericStringBuilder::<O>::new();
    let owned = "arrow".to_owned();

    // Exercise &str, &String, explicit null and both Option entry points.
    builder.append_value("hello");
    builder.append_value("");
    builder.append_value(&owned);
    builder.append_null();
    builder.append_option(Some("rust"));
    builder.append_option(None::<&str>);
    builder.append_option(None::<String>);
    assert_eq!(builder.len(), 7);

    let expected = GenericStringArray::<O>::from(vec![
        Some("hello"),
        Some(""),
        Some("arrow"),
        None,
        Some("rust"),
        None,
        None,
    ]);
    assert_eq!(expected, builder.finish());
}
// Run the mixed-append string test for both offset widths.
#[test]
fn test_string_array_builder() {
    _test_generic_string_array_builder::<i32>()
}
#[test]
fn test_large_string_array_builder() {
    _test_generic_string_array_builder::<i64>()
}
// Verifies that `finish` resets the builder (leaving only the single
// initial zero offset) and that a subsequent all-valid batch produces an
// array with no null buffer.
fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
    let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
    builder.append_value("hello");
    builder.append_value("rust");
    builder.append_null();
    builder.finish();
    assert!(builder.is_empty());
    assert_eq!(&[O::zero()], builder.offsets_slice());
    builder.append_value("arrow");
    builder.append_value("parquet");
    let arr = builder.finish();
    // array should not have a null buffer because there is no `null` value.
    assert!(arr.nulls().is_none());
    assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
}
// Run the finish/reset string test for both offset widths.
#[test]
fn test_string_array_builder_finish() {
    _test_generic_string_array_builder_finish::<i32>()
}
#[test]
fn test_large_string_array_builder_finish() {
    _test_generic_string_array_builder_finish::<i64>()
}
/// Checks that `finish_cloned` takes a snapshot without resetting the
/// builder, and that a later `finish` includes both the original entries
/// and the values appended after the snapshot.
fn _test_generic_string_array_builder_finish_cloned<O: OffsetSizeTrait>() {
    let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
    for value in ["hello", "rust"] {
        builder.append_value(value);
    }
    builder.append_null();
    let snapshot = builder.finish_cloned();
    assert!(!builder.is_empty());
    assert_eq!(snapshot.len(), 3);
    builder.append_value("arrow");
    builder.append_value("parquet");
    let arr = builder.finish();
    assert!(arr.nulls().is_some());
    assert_eq!(builder.offsets_slice(), &[O::zero()]);
    assert_eq!(arr.len(), 5);
}
// Run the finish_cloned string test for both offset widths.
#[test]
fn test_string_array_builder_finish_cloned() {
    _test_generic_string_array_builder_finish_cloned::<i32>()
}
#[test]
fn test_large_string_array_builder_finish_cloned() {
    _test_generic_string_array_builder_finish_cloned::<i64>()
}
#[test]
fn test_extend() {
    // Two successive `extend` calls accumulate into the same builder; the
    // offsets and concatenated data cover both batches.
    let mut builder = GenericStringBuilder::<i32>::new();
    let first = ["a", "b", "c", "", "a", "b", "c"];
    let second = ["d", "cupcakes", "hello"];
    builder.extend(first.into_iter().map(Some));
    builder.extend(second.into_iter().map(Some));
    let array = builder.finish();
    assert_eq!(array.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]);
    assert_eq!(array.value_data(), b"abcabcdcupcakeshello");
}
#[test]
fn test_write() {
    // The builder implements `std::fmt::Write`: formatted output accumulates
    // into the in-progress value until `append_value("")` seals it.
    let mut builder = GenericStringBuilder::<i32>::new();
    write!(builder, "foo").unwrap();
    builder.append_value("");
    writeln!(builder, "bar").unwrap();
    builder.append_value("");
    write!(builder, "fiz").unwrap();
    write!(builder, "buz").unwrap();
    builder.append_value("");
    let array = builder.finish();
    let values: Vec<_> = array.iter().flatten().collect();
    assert_eq!(values, &["foo", "bar\n", "fizbuz"])
}
}

Просмотреть файл

@ -0,0 +1,630 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray};
use arrow_buffer::ArrowNativeType;
use arrow_schema::{ArrowError, DataType};
use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
use std::any::Any;
use std::sync::Arc;
/// Builder for [`DictionaryArray`] of [`GenericByteArray`]
///
/// For example to map a set of byte indices to String values. Note that
/// the use of a `HashMap` here will not scale to very large arrays or
/// result in an ordered dictionary.
#[derive(Debug)]
pub struct GenericByteDictionaryBuilder<K, T>
where
    K: ArrowDictionaryKeyType,
    T: ByteArrayType,
{
    /// Hasher state used to hash dictionary value bytes directly.
    state: ahash::RandomState,
    /// Used to provide a lookup from string value to key type
    ///
    /// Note: usize's hash implementation is not used, instead the raw entry
    /// API is used to store keys w.r.t the hash of the strings themselves
    dedup: HashMap<usize, (), ()>,
    /// Builds the dictionary keys: one entry per appended logical value.
    keys_builder: PrimitiveBuilder<K>,
    /// Builds the distinct dictionary values referenced by the keys.
    values_builder: GenericByteBuilder<T>,
}
impl<K, T> Default for GenericByteDictionaryBuilder<K, T>
where
K: ArrowDictionaryKeyType,
T: ByteArrayType,
{
fn default() -> Self {
Self::new()
}
}
impl<K, T> GenericByteDictionaryBuilder<K, T>
where
    K: ArrowDictionaryKeyType,
    T: ByteArrayType,
{
    /// Creates a new `GenericByteDictionaryBuilder`
    pub fn new() -> Self {
        let keys_builder = PrimitiveBuilder::new();
        let values_builder = GenericByteBuilder::<T>::new();
        Self {
            state: Default::default(),
            // The unit hasher `()` is a placeholder: hashing goes through
            // `state` via the raw-entry API, never through the map's hasher.
            dedup: HashMap::with_capacity_and_hasher(keys_builder.capacity(), ()),
            keys_builder,
            values_builder,
        }
    }
    /// Creates a new `GenericByteDictionaryBuilder` with the provided capacities
    ///
    /// `keys_capacity`: the number of keys, i.e. length of array to build
    /// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
    /// `data_capacity`: the total number of bytes of all distinct bytes in the dictionary
    pub fn with_capacity(
        keys_capacity: usize,
        value_capacity: usize,
        data_capacity: usize,
    ) -> Self {
        Self {
            state: Default::default(),
            dedup: Default::default(),
            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
            values_builder: GenericByteBuilder::<T>::with_capacity(value_capacity, data_capacity),
        }
    }
    /// Creates a new `GenericByteDictionaryBuilder` from a keys capacity and a dictionary
    /// which is initialized with the given values.
    /// The indices of those dictionary values are used as keys.
    ///
    /// # Example
    ///
    /// ```
    /// # use arrow_array::builder::StringDictionaryBuilder;
    /// # use arrow_array::{Int16Array, StringArray};
    ///
    /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
    ///
    /// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap();
    /// builder.append("def").unwrap();
    /// builder.append_null();
    /// builder.append("abc").unwrap();
    ///
    /// let dictionary_array = builder.finish();
    ///
    /// let keys = dictionary_array.keys();
    ///
    /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
    /// ```
    pub fn new_with_dictionary(
        keys_capacity: usize,
        dictionary_values: &GenericByteArray<T>,
    ) -> Result<Self, ArrowError> {
        let state = ahash::RandomState::default();
        let dict_len = dictionary_values.len();
        let mut dedup = HashMap::with_capacity_and_hasher(dict_len, ());
        let values_len = dictionary_values.value_data().len();
        let mut values_builder = GenericByteBuilder::<T>::with_capacity(dict_len, values_len);
        // Fail early if the seed dictionary is already too large for `K`.
        K::Native::from_usize(dictionary_values.len())
            .ok_or(ArrowError::DictionaryKeyOverflowError)?;
        for (idx, maybe_value) in dictionary_values.iter().enumerate() {
            match maybe_value {
                Some(value) => {
                    let value_bytes: &[u8] = value.as_ref();
                    let hash = state.hash_one(value_bytes);
                    // Look up by hash, comparing against the bytes already
                    // copied into `values_builder` at the candidate index.
                    let entry = dedup.raw_entry_mut().from_hash(hash, |idx: &usize| {
                        value_bytes == get_bytes(&values_builder, *idx)
                    });
                    // Only the first occurrence of a value is registered in
                    // the dedup map; duplicates keep their original index.
                    if let RawEntryMut::Vacant(v) = entry {
                        v.insert_with_hasher(hash, idx, (), |idx| {
                            state.hash_one(get_bytes(&values_builder, *idx))
                        });
                    }
                    values_builder.append_value(value);
                }
                None => values_builder.append_null(),
            }
        }
        Ok(Self {
            state,
            dedup,
            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
            values_builder,
        })
    }
}
impl<K, T> ArrayBuilder for GenericByteDictionaryBuilder<K, T>
where
    K: ArrowDictionaryKeyType,
    T: ByteArrayType,
{
    /// Returns the builder as an non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }
    /// Returns the builder as an mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
    /// Returns the number of array slots in the builder
    fn len(&self) -> usize {
        self.keys_builder.len()
    }
    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        // Delegates to the inherent `finish`, erasing the concrete type.
        Arc::new(self.finish())
    }
    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        Arc::new(self.finish_cloned())
    }
}
impl<K, T> GenericByteDictionaryBuilder<K, T>
where
    K: ArrowDictionaryKeyType,
    T: ByteArrayType,
{
    /// Append a value to the array. Return an existing index
    /// if already present in the values array or a new index if the
    /// value is appended to the values array.
    ///
    /// Returns an error if the new index would overflow the key type.
    pub fn append(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
        let value_native: &T::Native = value.as_ref();
        let value_bytes: &[u8] = value_native.as_ref();
        let state = &self.state;
        let storage = &mut self.values_builder;
        let hash = state.hash_one(value_bytes);
        // Raw-entry lookup: hash the bytes directly, comparing candidates
        // against the bytes previously appended to `storage`.
        let entry = self
            .dedup
            .raw_entry_mut()
            .from_hash(hash, |idx| value_bytes == get_bytes(storage, *idx));
        let key = match entry {
            RawEntryMut::Occupied(entry) => K::Native::usize_as(*entry.into_key()),
            RawEntryMut::Vacant(entry) => {
                // First sighting: copy the bytes into the values array and
                // record the new index in the dedup map.
                let idx = storage.len();
                storage.append_value(value);
                entry.insert_with_hasher(hash, idx, (), |idx| {
                    state.hash_one(get_bytes(storage, *idx))
                });
                K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?
            }
        };
        self.keys_builder.append_value(key);
        Ok(key)
    }
    /// Infallibly append a value to this builder
    ///
    /// # Panics
    ///
    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
        self.append(value).expect("dictionary key overflow");
    }
    /// Appends a null slot into the builder
    #[inline]
    pub fn append_null(&mut self) {
        self.keys_builder.append_null()
    }
    /// Append an `Option` value into the builder
    ///
    /// # Panics
    ///
    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
    #[inline]
    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
        match value {
            None => self.append_null(),
            Some(v) => self.append_value(v),
        };
    }
    /// Builds the `DictionaryArray` and reset this builder.
    pub fn finish(&mut self) -> DictionaryArray<K> {
        // The dedup map holds indices into `values_builder`, which is
        // consumed below, so it must be cleared alongside it.
        self.dedup.clear();
        let values = self.values_builder.finish();
        let keys = self.keys_builder.finish();
        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
        let builder = keys
            .into_data()
            .into_builder()
            .data_type(data_type)
            .child_data(vec![values.into_data()]);
        DictionaryArray::from(unsafe { builder.build_unchecked() })
    }
    /// Builds the `DictionaryArray` without resetting the builder.
    pub fn finish_cloned(&self) -> DictionaryArray<K> {
        let values = self.values_builder.finish_cloned();
        let keys = self.keys_builder.finish_cloned();
        let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
        let builder = keys
            .into_data()
            .into_builder()
            .data_type(data_type)
            .child_data(vec![values.into_data()]);
        DictionaryArray::from(unsafe { builder.build_unchecked() })
    }
    /// Returns the current null buffer as a slice
    pub fn validity_slice(&self) -> Option<&[u8]> {
        self.keys_builder.validity_slice()
    }
}
impl<K: ArrowDictionaryKeyType, T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>>
    for GenericByteDictionaryBuilder<K, T>
{
    /// Appends every optional value from `iter`, panicking on key overflow.
    #[inline]
    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
        iter.into_iter().for_each(|value| self.append_option(value));
    }
}
/// Returns the byte slice of the dictionary value at `idx` within `values`,
/// delimited by the builder's current offsets.
fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[u8] {
    let offsets = values.offsets_slice();
    let start = offsets[idx].as_usize();
    let end = offsets[idx + 1].as_usize();
    &values.values_slice()[start..end]
}
/// Builder for [`DictionaryArray`] of [`StringArray`](crate::array::StringArray)
///
/// ```
/// // Create a dictionary array indexed by bytes whose values are Strings.
/// // It can thus hold up to 256 distinct string values.
///
/// # use arrow_array::builder::StringDictionaryBuilder;
/// # use arrow_array::{Int8Array, StringArray};
/// # use arrow_array::types::Int8Type;
///
/// let mut builder = StringDictionaryBuilder::<Int8Type>::new();
///
/// // The builder builds the dictionary value by value
/// builder.append("abc").unwrap();
/// builder.append_null();
/// builder.append("def").unwrap();
/// builder.append("def").unwrap();
/// builder.append("abc").unwrap();
/// let array = builder.finish();
///
/// assert_eq!(
/// array.keys(),
/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
/// );
///
/// // Values are polymorphic and so require a downcast.
/// let av = array.values();
/// let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
///
/// assert_eq!(ava.value(0), "abc");
/// assert_eq!(ava.value(1), "def");
///
/// ```
pub type StringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i32>>;
/// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
pub type LargeStringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i64>>;
/// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray)
///
/// ```
/// // Create a dictionary array indexed by bytes whose values are binary.
/// // It can thus hold up to 256 distinct binary values.
///
/// # use arrow_array::builder::BinaryDictionaryBuilder;
/// # use arrow_array::{BinaryArray, Int8Array};
/// # use arrow_array::types::Int8Type;
///
/// let mut builder = BinaryDictionaryBuilder::<Int8Type>::new();
///
/// // The builder builds the dictionary value by value
/// builder.append(b"abc").unwrap();
/// builder.append_null();
/// builder.append(b"def").unwrap();
/// builder.append(b"def").unwrap();
/// builder.append(b"abc").unwrap();
/// let array = builder.finish();
///
/// assert_eq!(
/// array.keys(),
/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
/// );
///
/// // Values are polymorphic and so require a downcast.
/// let av = array.values();
/// let ava: &BinaryArray = av.as_any().downcast_ref::<BinaryArray>().unwrap();
///
/// assert_eq!(ava.value(0), b"abc");
/// assert_eq!(ava.value(1), b"def");
///
/// ```
pub type BinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i32>>;
/// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
pub type LargeBinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i64>>;
#[cfg(test)]
mod tests {
use super::*;
use crate::array::Int8Array;
use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type};
use crate::{BinaryArray, StringArray};
/// Generic driver: appends two distinct values (with duplicates and a null)
/// and checks both the produced keys and the deduplicated dictionary values.
fn test_bytes_dictionary_builder<T>(values: Vec<&T::Native>)
where
    T: ByteArrayType,
    <T as ByteArrayType>::Native: PartialEq,
    <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
{
    let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
    builder.append(values[0]).unwrap();
    builder.append_null();
    builder.append(values[1]).unwrap();
    builder.append(values[1]).unwrap();
    builder.append(values[0]).unwrap();
    let array = builder.finish();
    let expected_keys = Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]);
    assert_eq!(array.keys(), &expected_keys);
    // Values are polymorphic and so require a downcast.
    let ava = array
        .values()
        .as_any()
        .downcast_ref::<GenericByteArray<T>>()
        .unwrap();
    assert_eq!(*ava.value(0), *values[0]);
    assert_eq!(*ava.value(1), *values[1]);
}
// Drive the shared dictionary-builder test for string and binary values.
#[test]
fn test_string_dictionary_builder() {
    test_bytes_dictionary_builder::<GenericStringType<i32>>(vec!["abc", "def"]);
}
#[test]
fn test_binary_dictionary_builder() {
    test_bytes_dictionary_builder::<GenericBinaryType<i32>>(vec![b"abc", b"def"]);
}
// `finish_cloned` must leave the builder usable: the second `finish` sees
// both the original five entries and the three appended afterwards, with
// dictionary keys stable across the snapshot.
fn test_bytes_dictionary_builder_finish_cloned<T>(values: Vec<&T::Native>)
where
    T: ByteArrayType,
    <T as ByteArrayType>::Native: PartialEq,
    <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
{
    let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
    builder.append(values[0]).unwrap();
    builder.append_null();
    builder.append(values[1]).unwrap();
    builder.append(values[1]).unwrap();
    builder.append(values[0]).unwrap();
    let mut array = builder.finish_cloned();
    assert_eq!(
        array.keys(),
        &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
    );
    // Values are polymorphic and so require a downcast.
    let av = array.values();
    let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
    assert_eq!(ava.value(0), values[0]);
    assert_eq!(ava.value(1), values[1]);
    // Keep appending after the snapshot; values[2] gets a fresh key (2).
    builder.append(values[0]).unwrap();
    builder.append(values[2]).unwrap();
    builder.append(values[1]).unwrap();
    array = builder.finish();
    assert_eq!(
        array.keys(),
        &Int8Array::from(vec![
            Some(0),
            None,
            Some(1),
            Some(1),
            Some(0),
            Some(0),
            Some(2),
            Some(1)
        ])
    );
    // Values are polymorphic and so require a downcast.
    let av2 = array.values();
    let ava2: &GenericByteArray<T> =
        av2.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
    assert_eq!(ava2.value(0), values[0]);
    assert_eq!(ava2.value(1), values[1]);
    assert_eq!(ava2.value(2), values[2]);
}
// Drive the finish_cloned dictionary test for string and binary values.
#[test]
fn test_string_dictionary_builder_finish_cloned() {
    test_bytes_dictionary_builder_finish_cloned::<GenericStringType<i32>>(vec![
        "abc", "def", "ghi",
    ]);
}
#[test]
fn test_binary_dictionary_builder_finish_cloned() {
    test_bytes_dictionary_builder_finish_cloned::<GenericBinaryType<i32>>(vec![
        b"abc", b"def", b"ghi",
    ]);
}
// Builder seeded with the dictionary [null, values[1], values[0]]: appended
// values reuse the seeded indices (values[0] -> 2, values[1] -> 1), while
// the unseen values[2] gets the next free index (3). The seeded null
// occupies value slot 0.
fn test_bytes_dictionary_builder_with_existing_dictionary<T>(
    dictionary: GenericByteArray<T>,
    values: Vec<&T::Native>,
) where
    T: ByteArrayType,
    <T as ByteArrayType>::Native: PartialEq,
    <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
{
    let mut builder =
        GenericByteDictionaryBuilder::<Int8Type, T>::new_with_dictionary(6, &dictionary)
            .unwrap();
    builder.append(values[0]).unwrap();
    builder.append_null();
    builder.append(values[1]).unwrap();
    builder.append(values[1]).unwrap();
    builder.append(values[0]).unwrap();
    builder.append(values[2]).unwrap();
    let array = builder.finish();
    assert_eq!(
        array.keys(),
        &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)])
    );
    // Values are polymorphic and so require a downcast.
    let av = array.values();
    let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
    assert!(!ava.is_valid(0));
    assert_eq!(ava.value(1), values[1]);
    assert_eq!(ava.value(2), values[0]);
    assert_eq!(ava.value(3), values[2]);
}
// Drive the pre-seeded-dictionary test for string and binary values.
#[test]
fn test_string_dictionary_builder_with_existing_dictionary() {
    test_bytes_dictionary_builder_with_existing_dictionary::<GenericStringType<i32>>(
        StringArray::from(vec![None, Some("def"), Some("abc")]),
        vec!["abc", "def", "ghi"],
    );
}
#[test]
fn test_binary_dictionary_builder_with_existing_dictionary() {
    let values: Vec<Option<&[u8]>> = vec![None, Some(b"def"), Some(b"abc")];
    test_bytes_dictionary_builder_with_existing_dictionary::<GenericBinaryType<i32>>(
        BinaryArray::from(values),
        vec![b"abc", b"def", b"ghi"],
    );
}
// A dictionary seeded with a single null reserves value index 0, so real
// values start at key 1. Null keys are zero-initialized but masked out by
// the validity buffer.
fn test_bytes_dictionary_builder_with_reserved_null_value<T>(
    dictionary: GenericByteArray<T>,
    values: Vec<&T::Native>,
) where
    T: ByteArrayType,
    <T as ByteArrayType>::Native: PartialEq,
    <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
{
    let mut builder =
        GenericByteDictionaryBuilder::<Int16Type, T>::new_with_dictionary(4, &dictionary)
            .unwrap();
    builder.append(values[0]).unwrap();
    builder.append_null();
    builder.append(values[1]).unwrap();
    builder.append(values[0]).unwrap();
    let array = builder.finish();
    assert!(array.is_null(1));
    assert!(!array.is_valid(1));
    let keys = array.keys();
    assert_eq!(keys.value(0), 1);
    assert!(keys.is_null(1));
    // zero initialization is currently guaranteed by Buffer allocation and resizing
    assert_eq!(keys.value(1), 0);
    assert_eq!(keys.value(2), 2);
    assert_eq!(keys.value(3), 1);
}
// Drive the reserved-null-value test for string and binary values.
#[test]
fn test_string_dictionary_builder_with_reserved_null_value() {
    let v: Vec<Option<&str>> = vec![None];
    test_bytes_dictionary_builder_with_reserved_null_value::<GenericStringType<i32>>(
        StringArray::from(v),
        vec!["abc", "def"],
    );
}
#[test]
fn test_binary_dictionary_builder_with_reserved_null_value() {
    let values: Vec<Option<&[u8]>> = vec![None];
    test_bytes_dictionary_builder_with_reserved_null_value::<GenericBinaryType<i32>>(
        BinaryArray::from(values),
        vec![b"abc", b"def"],
    );
}
#[test]
fn test_extend() {
    // Extending twice reuses existing dictionary entries ("a", "b", "c");
    // only "d" is added as a new value, so the dictionary holds 4 entries.
    let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
    for chunk in [&["a", "b", "c", "a", "b", "c"][..], &["c", "d", "a"][..]] {
        builder.extend(chunk.iter().copied().map(Some));
    }
    let dict = builder.finish();
    assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]);
    assert_eq!(dict.values().len(), 4);
}
}

Просмотреть файл

@ -0,0 +1,431 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use std::any::Any;
use std::marker::PhantomData;
use std::sync::Arc;
use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer};
use arrow_data::ByteView;
use arrow_schema::ArrowError;
use crate::builder::ArrayBuilder;
use crate::types::bytes::ByteArrayNativeType;
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
use crate::{ArrayRef, GenericByteViewArray};
/// Default allocation size, in bytes, for new string-data blocks (8 KiB).
const DEFAULT_BLOCK_SIZE: u32 = 8 * 1024;
/// A builder for [`GenericByteViewArray`]
///
/// A [`GenericByteViewArray`] consists of a list of data blocks containing string data,
/// and a list of views into those buffers.
///
/// This builder can be used in two ways
///
/// # Append Values
///
/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable
/// using [`GenericByteViewBuilder::with_block_size`]. [`GenericByteViewBuilder::append_value`]
/// writes values larger than 12 bytes to the current in-progress block, with values smaller
/// than 12 bytes inlined into the views. If a value is appended that will not fit in the
/// in-progress block, it will be closed, and a new block of sufficient size allocated
///
/// # Append Views
///
/// Some use-cases may wish to reuse an existing allocation containing string data, for example,
/// when parsing data from a parquet data page. In such a case entire blocks can be appended
/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended
/// using [`GenericByteViewBuilder::try_append_view`]
pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
    /// One `u128` view per logical value: either the value inlined (when at
    /// most 12 bytes) or length + prefix + (buffer index, offset).
    views_builder: BufferBuilder<u128>,
    /// Tracks validity (null/non-null) of each appended value.
    null_buffer_builder: NullBufferBuilder,
    /// Finished data blocks that out-of-line views reference by index.
    completed: Vec<Buffer>,
    /// The data block currently being filled by `append_value`.
    in_progress: Vec<u8>,
    /// Target allocation size for new in-progress blocks.
    block_size: u32,
    phantom: PhantomData<T>,
}
impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
    /// Creates a new [`GenericByteViewBuilder`].
    pub fn new() -> Self {
        Self::with_capacity(1024)
    }
    /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values.
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            views_builder: BufferBuilder::new(capacity),
            null_buffer_builder: NullBufferBuilder::new(capacity),
            completed: vec![],
            in_progress: vec![],
            block_size: DEFAULT_BLOCK_SIZE,
            phantom: Default::default(),
        }
    }
    /// Override the size of buffers to allocate for holding string data
    pub fn with_block_size(self, block_size: u32) -> Self {
        Self { block_size, ..self }
    }
    /// Append a new data block returning the new block offset
    ///
    /// Note: this will first flush any in-progress block
    ///
    /// This allows appending views from blocks added using [`Self::append_block`]. See
    /// [`Self::append_value`] for appending individual values
    ///
    /// ```
    /// # use arrow_array::builder::StringViewBuilder;
    /// let mut builder = StringViewBuilder::new();
    ///
    /// let block = builder.append_block(b"helloworldbingobongo".into());
    ///
    /// builder.try_append_view(block, 0, 5).unwrap();
    /// builder.try_append_view(block, 5, 5).unwrap();
    /// builder.try_append_view(block, 10, 5).unwrap();
    /// builder.try_append_view(block, 15, 5).unwrap();
    /// builder.try_append_view(block, 0, 15).unwrap();
    /// let array = builder.finish();
    ///
    /// let actual: Vec<_> = array.iter().flatten().collect();
    /// let expected = &["hello", "world", "bingo", "bongo", "helloworldbingo"];
    /// assert_eq!(actual, expected);
    /// ```
    pub fn append_block(&mut self, buffer: Buffer) -> u32 {
        assert!(buffer.len() < u32::MAX as usize);
        self.flush_in_progress();
        let offset = self.completed.len();
        self.push_completed(buffer);
        offset as u32
    }
    /// Append a view of the given `block`, `offset` and `length`
    ///
    /// # Safety
    /// (1) The block must have been added using [`Self::append_block`]
    /// (2) The range `offset..offset+length` must be within the bounds of the block
    /// (3) The data in the block must be valid of type `T`
    pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
        let b = self.completed.get_unchecked(block as usize);
        let start = offset as usize;
        let end = start.saturating_add(len as usize);
        let b = b.get_unchecked(start..end);
        if len <= 12 {
            // Short values are inlined into the view itself: 4 little-endian
            // length bytes followed by up to 12 bytes of data.
            let mut view_buffer = [0; 16];
            view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
            view_buffer[4..4 + b.len()].copy_from_slice(b);
            self.views_builder.append(u128::from_le_bytes(view_buffer));
        } else {
            // Longer values store a 4-byte prefix plus a (buffer, offset)
            // reference into `completed`.
            let view = ByteView {
                length: len,
                prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
                buffer_index: block,
                offset,
            };
            self.views_builder.append(view.into());
        }
        self.null_buffer_builder.append_non_null();
    }
    /// Try to append a view of the given `block`, `offset` and `length`
    ///
    /// See [`Self::append_block`]
    pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) -> Result<(), ArrowError> {
        let b = self.completed.get(block as usize).ok_or_else(|| {
            ArrowError::InvalidArgumentError(format!("No block found with index {block}"))
        })?;
        let start = offset as usize;
        let end = start.saturating_add(len as usize);
        let b = b.get(start..end).ok_or_else(|| {
            ArrowError::InvalidArgumentError(format!(
                "Range {start}..{end} out of bounds for block of length {}",
                b.len()
            ))
        })?;
        // Rejects byte ranges that are not valid for `T` (e.g. non-UTF-8
        // data for string views).
        if T::Native::from_bytes_checked(b).is_none() {
            return Err(ArrowError::InvalidArgumentError(
                "Invalid view data".to_string(),
            ));
        }
        // SAFETY: block existence, range bounds, and data validity were all
        // checked above.
        unsafe {
            self.append_view_unchecked(block, offset, len);
        }
        Ok(())
    }
    /// Flushes the in progress block if any
    #[inline]
    fn flush_in_progress(&mut self) {
        if !self.in_progress.is_empty() {
            let f = Buffer::from_vec(std::mem::take(&mut self.in_progress));
            self.push_completed(f)
        }
    }
    /// Append a block to `self.completed`, checking for overflow
    #[inline]
    fn push_completed(&mut self, block: Buffer) {
        assert!(block.len() < u32::MAX as usize, "Block too large");
        assert!(self.completed.len() < u32::MAX as usize, "Too many blocks");
        self.completed.push(block);
    }
    /// Appends a value into the builder
    ///
    /// # Panics
    ///
    /// Panics if
    /// - String buffer count exceeds `u32::MAX`
    /// - String length exceeds `u32::MAX`
    #[inline]
    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
        let v: &[u8] = value.as_ref().as_ref();
        let length: u32 = v.len().try_into().unwrap();
        if length <= 12 {
            // Short values are inlined into the view; no block storage used.
            let mut view_buffer = [0; 16];
            view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
            view_buffer[4..4 + v.len()].copy_from_slice(v);
            self.views_builder.append(u128::from_le_bytes(view_buffer));
            self.null_buffer_builder.append_non_null();
            return;
        }
        // If the in-progress block cannot hold this value, seal it and
        // reserve a new one large enough (at least `block_size`).
        let required_cap = self.in_progress.len() + v.len();
        if self.in_progress.capacity() < required_cap {
            self.flush_in_progress();
            let to_reserve = v.len().max(self.block_size as usize);
            self.in_progress.reserve(to_reserve);
        };
        let offset = self.in_progress.len() as u32;
        self.in_progress.extend_from_slice(v);
        let view = ByteView {
            length,
            prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
            // The in-progress block becomes `completed[completed.len()]`
            // once flushed, so that index is used here.
            buffer_index: self.completed.len() as u32,
            offset,
        };
        self.views_builder.append(view.into());
        self.null_buffer_builder.append_non_null();
    }
    /// Append an `Option` value into the builder
    #[inline]
    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
        match value {
            None => self.append_null(),
            Some(v) => self.append_value(v),
        };
    }
    /// Append a null value into the builder
    #[inline]
    pub fn append_null(&mut self) {
        self.null_buffer_builder.append_null();
        self.views_builder.append(0);
    }
    /// Builds the [`GenericByteViewArray`] and reset this builder
    pub fn finish(&mut self) -> GenericByteViewArray<T> {
        self.flush_in_progress();
        let completed = std::mem::take(&mut self.completed);
        let len = self.views_builder.len();
        let views = ScalarBuffer::new(self.views_builder.finish(), 0, len);
        let nulls = self.null_buffer_builder.finish();
        // SAFETY: valid by construction
        unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) }
    }
    /// Builds the [`GenericByteViewArray`] without resetting the builder
    pub fn finish_cloned(&self) -> GenericByteViewArray<T> {
        let mut completed = self.completed.clone();
        if !self.in_progress.is_empty() {
            // Snapshot the in-progress block without consuming it.
            completed.push(Buffer::from_slice_ref(&self.in_progress));
        }
        let len = self.views_builder.len();
        let views = Buffer::from_slice_ref(self.views_builder.as_slice());
        let views = ScalarBuffer::new(views, 0, len);
        let nulls = self.null_buffer_builder.finish_cloned();
        // SAFETY: valid by construction
        unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) }
    }
    /// Returns the current null buffer as a slice
    pub fn validity_slice(&self) -> Option<&[u8]> {
        self.null_buffer_builder.as_slice()
    }
}
impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> {
    /// Equivalent to [`GenericByteViewBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Prefix with the concrete view kind (e.g. "StringViewBuilder")
        // rather than the generic struct name.
        write!(f, "{}ViewBuilder", T::PREFIX)?;
        f.debug_struct("")
            .field("views_builder", &self.views_builder)
            .field("in_progress", &self.in_progress)
            .field("completed", &self.completed)
            .field("null_buffer_builder", &self.null_buffer_builder)
            .finish()
    }
}
impl<T: ByteViewType + ?Sized> ArrayBuilder for GenericByteViewBuilder<T> {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
    /// Number of logical values appended so far.
    fn len(&self) -> usize {
        self.null_buffer_builder.len()
    }
    /// Builds the array, resetting this builder.
    fn finish(&mut self) -> ArrayRef {
        Arc::new(self.finish())
    }
    /// Builds the array without resetting this builder.
    fn finish_cloned(&self) -> ArrayRef {
        Arc::new(self.finish_cloned())
    }
}
impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>>
    for GenericByteViewBuilder<T>
{
    /// Appends every optional value from `iter` into this builder.
    #[inline]
    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
        iter.into_iter().for_each(|value| self.append_option(value));
    }
}
/// Array builder for [`StringViewArray`][crate::StringViewArray]
///
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
/// [`GenericByteViewBuilder::append_null`] as normal.
///
/// See [`GenericByteViewBuilder`] for the full builder API.
pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>;
/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray]
///
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
/// [`GenericByteViewBuilder::append_null`] as normal.
///
/// See [`GenericByteViewBuilder`] for the full builder API.
pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>;
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Array;
    #[test]
    fn test_string_view() {
        // b1 layout: "world" at 0..5, a 0xFF byte at 5, "bananas" at 6..13,
        // and the 4-byte UTF-8 encoding of 😁 at 13..17 (total length 17,
        // matching the "block of length 17" error messages asserted below).
        let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81");
        let b2 = Buffer::from(b"cupcakes");
        let b3 = Buffer::from(b"Many strings are here contained of great length and verbosity");
        let mut v = StringViewBuilder::new();
        assert_eq!(v.append_block(b1), 0);
        // The two long inline values share an in-progress buffer which becomes
        // block 1 — hence b2 lands at index 2 and b3 at index 3 below.
        v.append_value("This is a very long string that exceeds the inline length");
        v.append_value("This is another very long string that exceeds the inline length");
        assert_eq!(v.append_block(b2), 2);
        assert_eq!(v.append_block(b3), 3);
        // Test short strings
        v.try_append_view(0, 0, 5).unwrap(); // world
        v.try_append_view(0, 6, 7).unwrap(); // bananas
        v.try_append_view(2, 3, 5).unwrap(); // cakes
        v.try_append_view(2, 0, 3).unwrap(); // cup
        v.try_append_view(2, 0, 8).unwrap(); // cupcakes
        v.try_append_view(0, 13, 4).unwrap(); // 😁
        v.try_append_view(0, 13, 0).unwrap(); // empty view
        // Test longer strings
        v.try_append_view(3, 0, 16).unwrap(); // Many strings are
        v.try_append_view(1, 0, 19).unwrap(); // This is a very long
        v.try_append_view(3, 13, 27).unwrap(); // are here contained of great
        v.append_value("I do so like long strings");
        let array = v.finish_cloned();
        array.to_data().validate_full().unwrap();
        assert_eq!(array.data_buffers().len(), 5);
        let actual: Vec<_> = array.iter().map(Option::unwrap).collect();
        assert_eq!(
            actual,
            &[
                "This is a very long string that exceeds the inline length",
                "This is another very long string that exceeds the inline length",
                "world",
                "bananas",
                "cakes",
                "cup",
                "cupcakes",
                "😁",
                "",
                "Many strings are",
                "This is a very long",
                "are here contained of great",
                "I do so like long strings"
            ]
        );
        // Out-of-range and invalid views must be rejected with precise errors.
        let err = v.try_append_view(0, u32::MAX, 1).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Range 4294967295..4294967296 out of bounds for block of length 17");
        let err = v.try_append_view(0, 1, u32::MAX).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 1..4294967296 out of bounds for block of length 17"
        );
        // Range 13..15 splits the 4-byte 😁 encoding — invalid UTF-8 boundary.
        let err = v.try_append_view(0, 13, 2).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Invalid view data");
        let err = v.try_append_view(0, 40, 0).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Range 40..40 out of bounds for block of length 17"
        );
        let err = v.try_append_view(5, 0, 0).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: No block found with index 5"
        );
    }
}

Просмотреть файл

@ -0,0 +1,806 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::{ArrayBuilder, BufferBuilder};
use crate::{Array, ArrayRef, GenericListArray, OffsetSizeTrait};
use arrow_buffer::NullBufferBuilder;
use arrow_buffer::{Buffer, OffsetBuffer};
use arrow_schema::{Field, FieldRef};
use std::any::Any;
use std::sync::Arc;
/// Builder for [`GenericListArray`]
///
/// Use [`ListBuilder`] to build [`ListArray`]s and [`LargeListBuilder`] to build [`LargeListArray`]s.
///
/// # Example
///
/// Here is code that constructs a ListArray with the contents:
/// `[[A,B,C], [], NULL, [D], [NULL, F]]`
///
/// ```
/// # use std::sync::Arc;
/// # use arrow_array::{builder::ListBuilder, builder::StringBuilder, ArrayRef, StringArray, Array};
/// #
/// let values_builder = StringBuilder::new();
/// let mut builder = ListBuilder::new(values_builder);
///
/// // [A, B, C]
/// builder.values().append_value("A");
/// builder.values().append_value("B");
/// builder.values().append_value("C");
/// builder.append(true);
///
/// // [ ] (empty list)
/// builder.append(true);
///
/// // Null
/// builder.values().append_value("?"); // irrelevant
/// builder.append(false);
///
/// // [D]
/// builder.values().append_value("D");
/// builder.append(true);
///
/// // [NULL, F]
/// builder.values().append_null();
/// builder.values().append_value("F");
/// builder.append(true);
///
/// // Build the array
/// let array = builder.finish();
///
/// // Values is a string array
/// // "A", "B" "C", "?", "D", NULL, "F"
/// assert_eq!(
/// array.values().as_ref(),
/// &StringArray::from(vec![
/// Some("A"), Some("B"), Some("C"),
/// Some("?"), Some("D"), None,
/// Some("F")
/// ])
/// );
///
/// // Offsets are indexes into the values array
/// assert_eq!(
/// array.value_offsets(),
/// &[0, 3, 3, 4, 5, 7]
/// );
/// ```
///
/// [`ListBuilder`]: crate::builder::ListBuilder
/// [`ListArray`]: crate::array::ListArray
/// [`LargeListBuilder`]: crate::builder::LargeListBuilder
/// [`LargeListArray`]: crate::array::LargeListArray
#[derive(Debug)]
pub struct GenericListBuilder<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> {
    // End offset of each list; always holds one more entry than there are
    // lists (a leading zero is appended in `with_capacity`).
    offsets_builder: BufferBuilder<OffsetSize>,
    // Validity bitmap: one bit per list slot.
    null_buffer_builder: NullBufferBuilder,
    // Builder for the child (values) array shared by all lists.
    values_builder: T,
    // Optional field override set via `with_field`; when `None`, `finish`
    // creates a nullable field named "item".
    field: Option<FieldRef>,
}
impl<O: OffsetSizeTrait, T: ArrayBuilder + Default> Default for GenericListBuilder<O, T> {
    /// Creates a new builder wrapping a default-constructed values builder
    fn default() -> Self {
        Self::new(T::default())
    }
}
impl<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> GenericListBuilder<OffsetSize, T> {
    /// Creates a new [`GenericListBuilder`] from a given values array builder
    pub fn new(values_builder: T) -> Self {
        let initial_capacity = values_builder.len();
        Self::with_capacity(values_builder, initial_capacity)
    }
    /// Creates a new [`GenericListBuilder`] from a given values array builder
    /// `capacity` is the number of items to pre-allocate space for in this builder
    pub fn with_capacity(values_builder: T, capacity: usize) -> Self {
        // One extra slot for the leading zero offset appended below.
        let mut offsets = BufferBuilder::<OffsetSize>::new(capacity + 1);
        offsets.append(OffsetSize::zero());
        Self {
            offsets_builder: offsets,
            null_buffer_builder: NullBufferBuilder::new(capacity),
            values_builder,
            field: None,
        }
    }
    /// Override the field passed to [`GenericListArray::new`]
    ///
    /// By default a nullable field is created with the name `item`
    ///
    /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the
    /// field's data type does not match that of `T`
    pub fn with_field(mut self, field: impl Into<FieldRef>) -> Self {
        self.field = Some(field.into());
        self
    }
}
impl<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> ArrayBuilder
    for GenericListBuilder<OffsetSize, T>
where
    T: 'static,
{
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }
    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
    /// Returns the number of array slots in the builder
    ///
    /// This is the number of lists appended so far (one validity bit per
    /// list), not the number of child values.
    fn len(&self) -> usize {
        self.null_buffer_builder.len()
    }
    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        // Delegates to the inherent `finish`, boxing the result as an `ArrayRef`.
        Arc::new(self.finish())
    }
    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        Arc::new(self.finish_cloned())
    }
}
impl<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> GenericListBuilder<OffsetSize, T>
where
    T: 'static,
{
    /// Returns the child array builder as a mutable reference.
    ///
    /// This mutable reference can be used to append values into the child array builder,
    /// but you must call [`append`](#method.append) to delimit each distinct list value.
    pub fn values(&mut self) -> &mut T {
        &mut self.values_builder
    }
    /// Returns the child array builder as an immutable reference
    pub fn values_ref(&self) -> &T {
        &self.values_builder
    }
    /// Finish the current variable-length list array slot
    ///
    /// # Panics
    ///
    /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX`
    #[inline]
    pub fn append(&mut self, is_valid: bool) {
        // Record the end offset of this list, then its validity bit.
        self.offsets_builder.append(self.next_offset());
        self.null_buffer_builder.append(is_valid);
    }
    /// Returns the next offset
    ///
    /// # Panics
    ///
    /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX`
    #[inline]
    fn next_offset(&self) -> OffsetSize {
        // The unwrap is the documented panic: the child length must fit in
        // the offset type (i32 for List, i64 for LargeList).
        OffsetSize::from_usize(self.values_builder.len()).unwrap()
    }
    /// Append a value to this [`GenericListBuilder`]
    ///
    /// ```
    /// # use arrow_array::builder::{Int32Builder, ListBuilder};
    /// # use arrow_array::cast::AsArray;
    /// # use arrow_array::{Array, Int32Array};
    /// # use arrow_array::types::Int32Type;
    /// let mut builder = ListBuilder::new(Int32Builder::new());
    ///
    /// builder.append_value([Some(1), Some(2), Some(3)]);
    /// builder.append_value([]);
    /// builder.append_value([None]);
    ///
    /// let array = builder.finish();
    /// assert_eq!(array.len(), 3);
    ///
    /// assert_eq!(array.value_offsets(), &[0, 3, 3, 4]);
    /// let values = array.values().as_primitive::<Int32Type>();
    /// assert_eq!(values, &Int32Array::from(vec![Some(1), Some(2), Some(3), None]));
    /// ```
    ///
    /// This is an alternative API to appending directly to [`Self::values`] and
    /// delimiting the result with [`Self::append`]
    ///
    /// ```
    /// # use arrow_array::builder::{Int32Builder, ListBuilder};
    /// # use arrow_array::cast::AsArray;
    /// # use arrow_array::{Array, Int32Array};
    /// # use arrow_array::types::Int32Type;
    /// let mut builder = ListBuilder::new(Int32Builder::new());
    ///
    /// builder.values().append_value(1);
    /// builder.values().append_value(2);
    /// builder.values().append_value(3);
    /// builder.append(true);
    /// builder.append(true);
    /// builder.values().append_null();
    /// builder.append(true);
    ///
    /// let array = builder.finish();
    /// assert_eq!(array.len(), 3);
    ///
    /// assert_eq!(array.value_offsets(), &[0, 3, 3, 4]);
    /// let values = array.values().as_primitive::<Int32Type>();
    /// assert_eq!(values, &Int32Array::from(vec![Some(1), Some(2), Some(3), None]));
    /// ```
    #[inline]
    pub fn append_value<I, V>(&mut self, i: I)
    where
        T: Extend<Option<V>>,
        I: IntoIterator<Item = Option<V>>,
    {
        // Delegates to the `Extend` impl below with a single `Some` element.
        self.extend(std::iter::once(Some(i)))
    }
    /// Append a null to this [`GenericListBuilder`]
    ///
    /// See [`Self::append_value`] for an example use.
    #[inline]
    pub fn append_null(&mut self) {
        // A null list still records an end offset (equal to the previous one).
        self.offsets_builder.append(self.next_offset());
        self.null_buffer_builder.append_null();
    }
    /// Appends an optional value into this [`GenericListBuilder`]
    ///
    /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`]
    #[inline]
    pub fn append_option<I, V>(&mut self, i: Option<I>)
    where
        T: Extend<Option<V>>,
        I: IntoIterator<Item = Option<V>>,
    {
        match i {
            Some(i) => self.append_value(i),
            None => self.append_null(),
        }
    }
    /// Builds the [`GenericListArray`] and reset this builder.
    pub fn finish(&mut self) -> GenericListArray<OffsetSize> {
        let values = self.values_builder.finish();
        let nulls = self.null_buffer_builder.finish();
        let offsets = self.offsets_builder.finish();
        // Safety: Safe by construction
        let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
        // Re-seed the emptied offsets builder with the leading zero so the
        // builder can be reused after `finish`.
        self.offsets_builder.append(OffsetSize::zero());
        let field = match &self.field {
            Some(f) => f.clone(),
            None => Arc::new(Field::new("item", values.data_type().clone(), true)),
        };
        GenericListArray::new(field, offsets, values, nulls)
    }
    /// Builds the [`GenericListArray`] without resetting the builder.
    pub fn finish_cloned(&self) -> GenericListArray<OffsetSize> {
        let values = self.values_builder.finish_cloned();
        let nulls = self.null_buffer_builder.finish_cloned();
        // Copy the offsets instead of consuming them, leaving the builder intact.
        let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice());
        // Safety: safe by construction
        let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
        let field = match &self.field {
            Some(f) => f.clone(),
            None => Arc::new(Field::new("item", values.data_type().clone(), true)),
        };
        GenericListArray::new(field, offsets, values, nulls)
    }
    /// Returns the current offsets buffer as a slice
    pub fn offsets_slice(&self) -> &[OffsetSize] {
        self.offsets_builder.as_slice()
    }
    /// Returns the current null buffer as a slice
    pub fn validity_slice(&self) -> Option<&[u8]> {
        self.null_buffer_builder.as_slice()
    }
}
impl<O, B, V, E> Extend<Option<V>> for GenericListBuilder<O, B>
where
    O: OffsetSizeTrait,
    B: ArrayBuilder + Extend<E>,
    V: IntoIterator<Item = E>,
{
    /// Appends each item: `Some` becomes a valid list of its elements,
    /// `None` becomes a null list slot.
    #[inline]
    fn extend<T: IntoIterator<Item = Option<V>>>(&mut self, iter: T) {
        for item in iter {
            if let Some(elements) = item {
                // Feed the child values, then delimit the list as valid.
                self.values_builder.extend(elements);
                self.append(true);
            } else {
                // Null list: no child values appended, slot marked invalid.
                self.append(false);
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::builder::{make_builder, Int32Builder, ListBuilder};
    use crate::cast::AsArray;
    use crate::types::Int32Type;
    use crate::Int32Array;
    use arrow_schema::DataType;
    // Shared body for the i32/i64 offset variants below.
    fn _test_generic_list_array_builder<O: OffsetSizeTrait>() {
        let values_builder = Int32Builder::with_capacity(10);
        let mut builder = GenericListBuilder::<O, _>::new(values_builder);
        // [[0, 1, 2], [3, 4, 5], [6, 7]]
        builder.values().append_value(0);
        builder.values().append_value(1);
        builder.values().append_value(2);
        builder.append(true);
        builder.values().append_value(3);
        builder.values().append_value(4);
        builder.values().append_value(5);
        builder.append(true);
        builder.values().append_value(6);
        builder.values().append_value(7);
        builder.append(true);
        let list_array = builder.finish();
        let list_values = list_array.values().as_primitive::<Int32Type>();
        assert_eq!(list_values.values(), &[0, 1, 2, 3, 4, 5, 6, 7]);
        assert_eq!(list_array.value_offsets(), [0, 3, 6, 8].map(O::usize_as));
        assert_eq!(DataType::Int32, list_array.value_type());
        assert_eq!(3, list_array.len());
        assert_eq!(0, list_array.null_count());
        assert_eq!(O::from_usize(6).unwrap(), list_array.value_offsets()[2]);
        assert_eq!(O::from_usize(2).unwrap(), list_array.value_length(2));
        for i in 0..3 {
            assert!(list_array.is_valid(i));
            assert!(!list_array.is_null(i));
        }
    }
    #[test]
    fn test_list_array_builder() {
        _test_generic_list_array_builder::<i32>()
    }
    #[test]
    fn test_large_list_array_builder() {
        _test_generic_list_array_builder::<i64>()
    }
    fn _test_generic_list_array_builder_nulls<O: OffsetSizeTrait>() {
        let values_builder = Int32Builder::with_capacity(10);
        let mut builder = GenericListBuilder::<O, _>::new(values_builder);
        // [[0, 1, 2], null, [3, null, 5], [6, 7]]
        builder.values().append_value(0);
        builder.values().append_value(1);
        builder.values().append_value(2);
        builder.append(true);
        builder.append(false);
        builder.values().append_value(3);
        builder.values().append_null();
        builder.values().append_value(5);
        builder.append(true);
        builder.values().append_value(6);
        builder.values().append_value(7);
        builder.append(true);
        let list_array = builder.finish();
        assert_eq!(DataType::Int32, list_array.value_type());
        assert_eq!(4, list_array.len());
        assert_eq!(1, list_array.null_count());
        assert_eq!(O::from_usize(3).unwrap(), list_array.value_offsets()[2]);
        assert_eq!(O::from_usize(3).unwrap(), list_array.value_length(2));
    }
    #[test]
    fn test_list_array_builder_nulls() {
        _test_generic_list_array_builder_nulls::<i32>()
    }
    #[test]
    fn test_large_list_array_builder_nulls() {
        _test_generic_list_array_builder_nulls::<i64>()
    }
    #[test]
    fn test_list_array_builder_finish() {
        // `finish` resets the builder, so it can be reused afterwards.
        let values_builder = Int32Array::builder(5);
        let mut builder = ListBuilder::new(values_builder);
        builder.values().append_slice(&[1, 2, 3]);
        builder.append(true);
        builder.values().append_slice(&[4, 5, 6]);
        builder.append(true);
        let mut arr = builder.finish();
        assert_eq!(2, arr.len());
        assert!(builder.is_empty());
        builder.values().append_slice(&[7, 8, 9]);
        builder.append(true);
        arr = builder.finish();
        assert_eq!(1, arr.len());
        assert!(builder.is_empty());
    }
    #[test]
    fn test_list_array_builder_finish_cloned() {
        // `finish_cloned` leaves the builder's contents intact.
        let values_builder = Int32Array::builder(5);
        let mut builder = ListBuilder::new(values_builder);
        builder.values().append_slice(&[1, 2, 3]);
        builder.append(true);
        builder.values().append_slice(&[4, 5, 6]);
        builder.append(true);
        let mut arr = builder.finish_cloned();
        assert_eq!(2, arr.len());
        assert!(!builder.is_empty());
        builder.values().append_slice(&[7, 8, 9]);
        builder.append(true);
        arr = builder.finish();
        assert_eq!(3, arr.len());
        assert!(builder.is_empty());
    }
    #[test]
    fn test_list_list_array_builder() {
        let primitive_builder = Int32Builder::with_capacity(10);
        let values_builder = ListBuilder::new(primitive_builder);
        let mut builder = ListBuilder::new(values_builder);
        // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]]
        builder.values().values().append_value(1);
        builder.values().values().append_value(2);
        builder.values().append(true);
        builder.values().values().append_value(3);
        builder.values().values().append_value(4);
        builder.values().append(true);
        builder.append(true);
        builder.values().values().append_value(5);
        builder.values().values().append_value(6);
        builder.values().values().append_value(7);
        builder.values().append(true);
        builder.values().append(false);
        builder.values().values().append_value(8);
        builder.values().append(true);
        builder.append(true);
        builder.append(false);
        builder.values().values().append_value(9);
        builder.values().values().append_value(10);
        builder.values().append(true);
        builder.append(true);
        let l1 = builder.finish();
        assert_eq!(4, l1.len());
        assert_eq!(1, l1.null_count());
        assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6]);
        let l2 = l1.values().as_list::<i32>();
        assert_eq!(6, l2.len());
        assert_eq!(1, l2.null_count());
        assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10]);
        let i1 = l2.values().as_primitive::<Int32Type>();
        assert_eq!(10, i1.len());
        assert_eq!(0, i1.null_count());
        assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
    }
    #[test]
    fn test_extend() {
        let mut builder = ListBuilder::new(Int32Builder::new());
        builder.extend([
            Some(vec![Some(1), Some(2), Some(7), None]),
            Some(vec![]),
            Some(vec![Some(4), Some(5)]),
            None,
        ]);
        let array = builder.finish();
        assert_eq!(array.value_offsets(), [0, 4, 4, 6, 6]);
        assert_eq!(array.null_count(), 1);
        assert!(array.is_null(3));
        let elements = array.values().as_primitive::<Int32Type>();
        assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]);
        assert_eq!(elements.null_count(), 1);
        assert!(elements.is_null(3));
    }
    #[test]
    fn test_boxed_primitive_array_builder() {
        let values_builder = make_builder(&DataType::Int32, 5);
        let mut builder = ListBuilder::new(values_builder);
        builder
            .values()
            .as_any_mut()
            .downcast_mut::<Int32Builder>()
            .expect("should be an Int32Builder")
            .append_slice(&[1, 2, 3]);
        builder.append(true);
        builder
            .values()
            .as_any_mut()
            .downcast_mut::<Int32Builder>()
            .expect("should be an Int32Builder")
            .append_slice(&[4, 5, 6]);
        builder.append(true);
        let arr = builder.finish();
        assert_eq!(2, arr.len());
        let elements = arr.values().as_primitive::<Int32Type>();
        assert_eq!(elements.values(), &[1, 2, 3, 4, 5, 6]);
    }
    #[test]
    fn test_boxed_list_list_array_builder() {
        // This test is same as `test_list_list_array_builder` but uses boxed builders.
        let values_builder = make_builder(
            &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
            10,
        );
        test_boxed_generic_list_generic_list_array_builder::<i32>(values_builder);
    }
    #[test]
    fn test_boxed_large_list_large_list_array_builder() {
        // This test is same as `test_list_list_array_builder` but uses boxed builders.
        let values_builder = make_builder(
            &DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))),
            10,
        );
        test_boxed_generic_list_generic_list_array_builder::<i64>(values_builder);
    }
    fn test_boxed_generic_list_generic_list_array_builder<O: OffsetSizeTrait + PartialEq>(
        values_builder: Box<dyn ArrayBuilder>,
    ) {
        // Downcasts the boxed child builder to the inner (Large)ListBuilder.
        fn inner<O: OffsetSizeTrait>(
            builder: &mut GenericListBuilder<O, Box<dyn ArrayBuilder>>,
        ) -> &mut GenericListBuilder<O, Box<dyn ArrayBuilder>> {
            builder
                .values()
                .as_any_mut()
                .downcast_mut::<GenericListBuilder<O, Box<dyn ArrayBuilder>>>()
                .expect("should be an (Large)ListBuilder")
        }
        // Appends `value` to the innermost Int32Builder.
        fn append_value<O: OffsetSizeTrait>(
            builder: &mut GenericListBuilder<O, Box<dyn ArrayBuilder>>,
            value: i32,
        ) {
            inner(builder)
                .values()
                .as_any_mut()
                .downcast_mut::<Int32Builder>()
                .expect("should be an Int32Builder")
                .append_value(value);
        }
        let mut builder: GenericListBuilder<O, Box<dyn ArrayBuilder>> =
            GenericListBuilder::<O, Box<dyn ArrayBuilder>>::new(values_builder);
        // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]]
        append_value(&mut builder, 1);
        append_value(&mut builder, 2);
        inner(&mut builder).append(true);
        append_value(&mut builder, 3);
        append_value(&mut builder, 4);
        inner(&mut builder).append(true);
        builder.append(true);
        append_value(&mut builder, 5);
        append_value(&mut builder, 6);
        append_value(&mut builder, 7);
        inner(&mut builder).append(true);
        inner(&mut builder).append(false);
        append_value(&mut builder, 8);
        inner(&mut builder).append(true);
        builder.append(true);
        builder.append(false);
        append_value(&mut builder, 9);
        append_value(&mut builder, 10);
        inner(&mut builder).append(true);
        builder.append(true);
        let l1 = builder.finish();
        assert_eq!(4, l1.len());
        assert_eq!(1, l1.null_count());
        assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6].map(O::usize_as));
        let l2 = l1.values().as_list::<O>();
        assert_eq!(6, l2.len());
        assert_eq!(1, l2.null_count());
        assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10].map(O::usize_as));
        let i1 = l2.values().as_primitive::<Int32Type>();
        assert_eq!(10, i1.len());
        assert_eq!(0, i1.null_count());
        assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
    }
    #[test]
    fn test_with_field() {
        let field = Arc::new(Field::new("bar", DataType::Int32, false));
        let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone());
        builder.append_value([Some(1), Some(2), Some(3)]);
        builder.append_null(); // This is fine as nullability refers to nullability of values
        builder.append_value([Some(4)]);
        let array = builder.finish();
        assert_eq!(array.len(), 3);
        assert_eq!(array.data_type(), &DataType::List(field.clone()));
        builder.append_value([Some(4), Some(5)]);
        let array = builder.finish();
        assert_eq!(array.data_type(), &DataType::List(field));
        assert_eq!(array.len(), 1);
    }
    #[test]
    #[should_panic(expected = "Non-nullable field of ListArray \\\"item\\\" cannot contain nulls")]
    fn test_checks_nullability() {
        let field = Arc::new(Field::new("item", DataType::Int32, false));
        let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone());
        builder.append_value([Some(1), None]);
        builder.finish();
    }
    #[test]
    #[should_panic(expected = "ListArray expected data type Int64 got Int32")]
    fn test_checks_data_type() {
        let field = Arc::new(Field::new("item", DataType::Int64, false));
        let mut builder = ListBuilder::new(Int32Builder::new()).with_field(field.clone());
        builder.append_value([Some(1)]);
        builder.finish();
    }
}

Просмотреть файл

@ -0,0 +1,380 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::{ArrayBuilder, BufferBuilder};
use crate::{Array, ArrayRef, MapArray, StructArray};
use arrow_buffer::Buffer;
use arrow_buffer::{NullBuffer, NullBufferBuilder};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType, Field, FieldRef};
use std::any::Any;
use std::sync::Arc;
/// Builder for [`MapArray`]
///
/// ```
/// # use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder};
/// # use arrow_array::{Int32Array, StringArray};
///
/// let string_builder = StringBuilder::new();
/// let int_builder = Int32Builder::with_capacity(4);
///
/// // Construct `[{"joe": 1}, {"blogs": 2, "foo": 4}, {}, null]`
/// let mut builder = MapBuilder::new(None, string_builder, int_builder);
///
/// builder.keys().append_value("joe");
/// builder.values().append_value(1);
/// builder.append(true).unwrap();
///
/// builder.keys().append_value("blogs");
/// builder.values().append_value(2);
/// builder.keys().append_value("foo");
/// builder.values().append_value(4);
/// builder.append(true).unwrap();
/// builder.append(true).unwrap();
/// builder.append(false).unwrap();
///
/// let array = builder.finish();
/// assert_eq!(array.value_offsets(), &[0, 1, 3, 3, 3]);
/// assert_eq!(array.values().as_ref(), &Int32Array::from(vec![1, 2, 4]));
/// assert_eq!(array.keys().as_ref(), &StringArray::from(vec!["joe", "blogs", "foo"]));
///
/// ```
#[derive(Debug)]
pub struct MapBuilder<K: ArrayBuilder, V: ArrayBuilder> {
    // End offset of each map's entry list; one more entry than there are maps
    // (a leading zero is appended in `with_capacity`).
    offsets_builder: BufferBuilder<i32>,
    // Validity bitmap: one bit per map slot.
    null_buffer_builder: NullBufferBuilder,
    // Names used for the generated entries/keys/values fields.
    field_names: MapFieldNames,
    // Builder for the keys array; `finish` asserts it contains no nulls.
    key_builder: K,
    // Builder for the values array.
    value_builder: V,
    // Optional override for the values field, set via `with_values_field`;
    // `None` means a nullable field named by `field_names.value` is created.
    value_field: Option<FieldRef>,
}
/// The [`Field`] names for a [`MapArray`]
#[derive(Debug, Clone)]
pub struct MapFieldNames {
    /// [`Field`] name for map entries (defaults to `entries`)
    pub entry: String,
    /// [`Field`] name for map key (defaults to `keys`)
    pub key: String,
    /// [`Field`] name for map value (defaults to `values`)
    pub value: String,
}
impl Default for MapFieldNames {
    /// Returns the conventional field names: `entries`, `keys` and `values`
    fn default() -> Self {
        MapFieldNames {
            entry: String::from("entries"),
            key: String::from("keys"),
            value: String::from("values"),
        }
    }
}
impl<K: ArrayBuilder, V: ArrayBuilder> MapBuilder<K, V> {
/// Creates a new `MapBuilder`
pub fn new(field_names: Option<MapFieldNames>, key_builder: K, value_builder: V) -> Self {
let capacity = key_builder.len();
Self::with_capacity(field_names, key_builder, value_builder, capacity)
}
/// Creates a new `MapBuilder` with capacity
pub fn with_capacity(
field_names: Option<MapFieldNames>,
key_builder: K,
value_builder: V,
capacity: usize,
) -> Self {
let mut offsets_builder = BufferBuilder::<i32>::new(capacity + 1);
offsets_builder.append(0);
Self {
offsets_builder,
null_buffer_builder: NullBufferBuilder::new(capacity),
field_names: field_names.unwrap_or_default(),
key_builder,
value_builder,
value_field: None,
}
}
/// Override the field passed to [`MapBuilder::new`]
///
/// By default a nullable field is created with the name `values`
///
/// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the
/// field's data type does not match that of `V`
pub fn with_values_field(self, field: impl Into<FieldRef>) -> Self {
Self {
value_field: Some(field.into()),
..self
}
}
/// Returns the key array builder of the map
pub fn keys(&mut self) -> &mut K {
&mut self.key_builder
}
/// Returns the value array builder of the map
pub fn values(&mut self) -> &mut V {
&mut self.value_builder
}
/// Returns both the key and value array builders of the map
pub fn entries(&mut self) -> (&mut K, &mut V) {
(&mut self.key_builder, &mut self.value_builder)
}
/// Finish the current map array slot
///
/// Returns an error if the key and values builders are in an inconsistent state.
#[inline]
pub fn append(&mut self, is_valid: bool) -> Result<(), ArrowError> {
if self.key_builder.len() != self.value_builder.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Cannot append to a map builder when its keys and values have unequal lengths of {} and {}",
self.key_builder.len(),
self.value_builder.len()
)));
}
self.offsets_builder.append(self.key_builder.len() as i32);
self.null_buffer_builder.append(is_valid);
Ok(())
}
/// Builds the [`MapArray`]
pub fn finish(&mut self) -> MapArray {
let len = self.len();
// Build the keys
let keys_arr = self.key_builder.finish();
let values_arr = self.value_builder.finish();
let offset_buffer = self.offsets_builder.finish();
self.offsets_builder.append(0);
let null_bit_buffer = self.null_buffer_builder.finish();
self.finish_helper(keys_arr, values_arr, offset_buffer, null_bit_buffer, len)
}
/// Builds the [`MapArray`] without resetting the builder.
pub fn finish_cloned(&self) -> MapArray {
let len = self.len();
// Build the keys
let keys_arr = self.key_builder.finish_cloned();
let values_arr = self.value_builder.finish_cloned();
let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice());
let nulls = self.null_buffer_builder.finish_cloned();
self.finish_helper(keys_arr, values_arr, offset_buffer, nulls, len)
}
/// Shared tail of [`Self::finish`] / [`Self::finish_cloned`]: assembles the
/// `MapArray` from already-built children, offsets and validity.
///
/// # Panics
///
/// Panics if `keys_arr` contains any null values, since map keys are
/// required to be non-nullable.
fn finish_helper(
    &self,
    keys_arr: Arc<dyn Array>,
    values_arr: Arc<dyn Array>,
    offset_buffer: Buffer,
    nulls: Option<NullBuffer>,
    len: usize,
) -> MapArray {
    assert!(
        keys_arr.null_count() == 0,
        "Keys array must have no null values, found {} null value(s)",
        keys_arr.null_count()
    );
    let keys_field = Arc::new(Field::new(
        self.field_names.key.as_str(),
        keys_arr.data_type().clone(),
        false, // always non-nullable
    ));
    // Use the caller-supplied value field if one was configured via
    // `with_values_field`, otherwise synthesize a nullable one.
    let values_field = match &self.value_field {
        Some(f) => f.clone(),
        None => Arc::new(Field::new(
            self.field_names.value.as_str(),
            values_arr.data_type().clone(),
            true,
        )),
    };
    // Map entries are represented as a struct of (key, value) children.
    let struct_array =
        StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]);
    let map_field = Arc::new(Field::new(
        self.field_names.entry.as_str(),
        struct_array.data_type().clone(),
        false, // always non-nullable
    ));
    let array_data = ArrayData::builder(DataType::Map(map_field, false)) // TODO: support sorted keys
        .len(len)
        .add_buffer(offset_buffer)
        .add_child_data(struct_array.into_data())
        .nulls(nulls);
    // SAFETY-NOTE(review): validity is not re-checked here; inputs come from
    // this builder's own internal state, which upholds the Map invariants.
    let array_data = unsafe { array_data.build_unchecked() };
    MapArray::from(array_data)
}
/// Returns the current null buffer as a slice
///
/// Returns `None` when no validity bitmap has been materialized
/// (presumably when no null has been appended yet — see
/// `NullBufferBuilder::as_slice`).
pub fn validity_slice(&self) -> Option<&[u8]> {
    self.null_buffer_builder.as_slice()
}
}
impl<K: ArrayBuilder, V: ArrayBuilder> ArrayBuilder for MapBuilder<K, V> {
    /// Returns the number of map entries appended so far (tracked by the
    /// validity builder, which grows once per `append`).
    fn len(&self) -> usize {
        self.null_buffer_builder.len()
    }
    /// Builds the array and resets the builder.
    fn finish(&mut self) -> ArrayRef {
        Arc::new(self.finish())
    }
    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        Arc::new(self.finish_cloned())
    }
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }
    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
}
#[cfg(test)]
mod tests {
    use crate::builder::{make_builder, Int32Builder, StringBuilder};
    use crate::{Int32Array, StringArray};
    use super::*;

    // Map keys are non-nullable: finishing with a null key must panic.
    #[test]
    #[should_panic(expected = "Keys array must have no null values, found 1 null value(s)")]
    fn test_map_builder_with_null_keys_panics() {
        let mut builder = MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
        builder.keys().append_null();
        builder.values().append_value(42);
        builder.append(true).unwrap();
        builder.finish();
    }

    // MapBuilder works with type-erased child builders created via
    // `make_builder`, downcast back to their concrete types for appends.
    #[test]
    fn test_boxed_map_builder() {
        let keys_builder = make_builder(&DataType::Utf8, 5);
        let values_builder = make_builder(&DataType::Int32, 5);
        let mut builder = MapBuilder::new(None, keys_builder, values_builder);
        builder
            .keys()
            .as_any_mut()
            .downcast_mut::<StringBuilder>()
            .expect("should be an StringBuilder")
            .append_value("1");
        builder
            .values()
            .as_any_mut()
            .downcast_mut::<Int32Builder>()
            .expect("should be an Int32Builder")
            .append_value(42);
        builder.append(true).unwrap();
        let map_array = builder.finish();
        assert_eq!(
            map_array
                .keys()
                .as_any()
                .downcast_ref::<StringArray>()
                .expect("should be an StringArray")
                .value(0),
            "1"
        );
        assert_eq!(
            map_array
                .values()
                .as_any()
                .downcast_ref::<Int32Array>()
                .expect("should be an Int32Array")
                .value(0),
            42
        );
    }

    // A custom value field (here non-nullable, named "bars") must be carried
    // through into the finished array's data type, including after the
    // builder is reused following a `finish`.
    #[test]
    fn test_with_values_field() {
        let value_field = Arc::new(Field::new("bars", DataType::Int32, false));
        let mut builder = MapBuilder::new(None, Int32Builder::new(), Int32Builder::new())
            .with_values_field(value_field.clone());
        builder.keys().append_value(1);
        builder.values().append_value(2);
        builder.append(true).unwrap();
        builder.append(false).unwrap(); // This is fine as nullability refers to nullability of values
        builder.keys().append_value(3);
        builder.values().append_value(4);
        builder.append(true).unwrap();
        let map = builder.finish();
        assert_eq!(map.len(), 3);
        assert_eq!(
            map.data_type(),
            &DataType::Map(
                Arc::new(Field::new(
                    "entries",
                    DataType::Struct(
                        vec![
                            Arc::new(Field::new("keys", DataType::Int32, false)),
                            value_field.clone()
                        ]
                        .into()
                    ),
                    false,
                )),
                false
            )
        );
        // Reuse the builder after `finish`: the custom field must persist.
        builder.keys().append_value(5);
        builder.values().append_value(6);
        builder.append(true).unwrap();
        let map = builder.finish();
        assert_eq!(map.len(), 1);
        assert_eq!(
            map.data_type(),
            &DataType::Map(
                Arc::new(Field::new(
                    "entries",
                    DataType::Struct(
                        vec![
                            Arc::new(Field::new("keys", DataType::Int32, false)),
                            value_field
                        ]
                        .into()
                    ),
                    false,
                )),
                false
            )
        );
    }
}

Просмотреть файл

@ -0,0 +1,325 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Defines push-based APIs for constructing arrays
//!
//! # Basic Usage
//!
//! Builders can be used to build simple, non-nested arrays
//!
//! ```
//! # use arrow_array::builder::Int32Builder;
//! # use arrow_array::PrimitiveArray;
//! let mut a = Int32Builder::new();
//! a.append_value(1);
//! a.append_null();
//! a.append_value(2);
//! let a = a.finish();
//!
//! assert_eq!(a, PrimitiveArray::from(vec![Some(1), None, Some(2)]));
//! ```
//!
//! ```
//! # use arrow_array::builder::StringBuilder;
//! # use arrow_array::{Array, StringArray};
//! let mut a = StringBuilder::new();
//! a.append_value("foo");
//! a.append_value("bar");
//! a.append_null();
//! let a = a.finish();
//!
//! assert_eq!(a, StringArray::from_iter([Some("foo"), Some("bar"), None]));
//! ```
//!
//! # Nested Usage
//!
//! Builders can also be used to build more complex nested arrays, such as lists
//!
//! ```
//! # use arrow_array::builder::{Int32Builder, ListBuilder};
//! # use arrow_array::ListArray;
//! # use arrow_array::types::Int32Type;
//! let mut a = ListBuilder::new(Int32Builder::new());
//! // [1, 2]
//! a.values().append_value(1);
//! a.values().append_value(2);
//! a.append(true);
//! // null
//! a.append(false);
//! // []
//! a.append(true);
//! // [3, null]
//! a.values().append_value(3);
//! a.values().append_null();
//! a.append(true);
//!
//! // [[1, 2], null, [], [3, null]]
//! let a = a.finish();
//!
//! assert_eq!(a, ListArray::from_iter_primitive::<Int32Type, _, _>([
//! Some(vec![Some(1), Some(2)]),
//! None,
//! Some(vec![]),
//! Some(vec![Some(3), None])]
//! ))
//! ```
//!
//! # Custom Builders
//!
//! It is common to have a collection of statically defined Rust types that
//! you want to convert to Arrow arrays.
//!
//! An example of doing so is below
//!
//! ```
//! # use std::any::Any;
//! # use arrow_array::builder::{ArrayBuilder, Int32Builder, ListBuilder, StringBuilder};
//! # use arrow_array::{ArrayRef, RecordBatch, StructArray};
//! # use arrow_schema::{DataType, Field};
//! # use std::sync::Arc;
//! /// A custom row representation
//! struct MyRow {
//! i32: i32,
//! optional_i32: Option<i32>,
//! string: Option<String>,
//! i32_list: Option<Vec<Option<i32>>>,
//! }
//!
//! /// Converts `Vec<Row>` into `StructArray`
//! #[derive(Debug, Default)]
//! struct MyRowBuilder {
//! i32: Int32Builder,
//! string: StringBuilder,
//! i32_list: ListBuilder<Int32Builder>,
//! }
//!
//! impl MyRowBuilder {
//! fn append(&mut self, row: &MyRow) {
//! self.i32.append_value(row.i32);
//! self.string.append_option(row.string.as_ref());
//! self.i32_list.append_option(row.i32_list.as_ref().map(|x| x.iter().copied()));
//! }
//!
//! /// Note: returns StructArray to allow nesting within another array if desired
//! fn finish(&mut self) -> StructArray {
//! let i32 = Arc::new(self.i32.finish()) as ArrayRef;
//! let i32_field = Arc::new(Field::new("i32", DataType::Int32, false));
//!
//! let string = Arc::new(self.string.finish()) as ArrayRef;
//! let string_field = Arc::new(Field::new("string", DataType::Utf8, false));
//!
//! let i32_list = Arc::new(self.i32_list.finish()) as ArrayRef;
//! let value_field = Arc::new(Field::new("item", DataType::Int32, true));
//! let i32_list_field = Arc::new(Field::new("i32_list", DataType::List(value_field), true));
//!
//! StructArray::from(vec![
//! (i32_field, i32),
//! (string_field, string),
//! (i32_list_field, i32_list),
//! ])
//! }
//! }
//!
//! impl<'a> Extend<&'a MyRow> for MyRowBuilder {
//! fn extend<T: IntoIterator<Item = &'a MyRow>>(&mut self, iter: T) {
//! iter.into_iter().for_each(|row| self.append(row));
//! }
//! }
//!
//! /// Converts a slice of [`MyRow`] to a [`RecordBatch`]
//! fn rows_to_batch(rows: &[MyRow]) -> RecordBatch {
//! let mut builder = MyRowBuilder::default();
//! builder.extend(rows);
//! RecordBatch::from(&builder.finish())
//! }
//! ```
pub use arrow_buffer::BooleanBufferBuilder;
mod boolean_builder;
pub use boolean_builder::*;
mod buffer_builder;
pub use buffer_builder::*;
mod fixed_size_binary_builder;
pub use fixed_size_binary_builder::*;
mod fixed_size_list_builder;
pub use fixed_size_list_builder::*;
mod generic_bytes_builder;
pub use generic_bytes_builder::*;
mod generic_list_builder;
pub use generic_list_builder::*;
mod map_builder;
pub use map_builder::*;
mod null_builder;
pub use null_builder::*;
mod primitive_builder;
pub use primitive_builder::*;
mod primitive_dictionary_builder;
pub use primitive_dictionary_builder::*;
mod primitive_run_builder;
pub use primitive_run_builder::*;
mod struct_builder;
pub use struct_builder::*;
mod generic_bytes_dictionary_builder;
pub use generic_bytes_dictionary_builder::*;
mod generic_byte_run_builder;
pub use generic_byte_run_builder::*;
mod generic_bytes_view_builder;
pub use generic_bytes_view_builder::*;
mod union_builder;
pub use union_builder::*;
use crate::ArrayRef;
use std::any::Any;
/// Trait for dealing with different array builders at runtime
///
/// # Example
///
/// ```
/// // Create
/// # use arrow_array::{ArrayRef, StringArray};
/// # use arrow_array::builder::{ArrayBuilder, Float64Builder, Int64Builder, StringBuilder};
///
/// let mut data_builders: Vec<Box<dyn ArrayBuilder>> = vec![
/// Box::new(Float64Builder::new()),
/// Box::new(Int64Builder::new()),
/// Box::new(StringBuilder::new()),
/// ];
///
/// // Fill
/// data_builders[0]
/// .as_any_mut()
/// .downcast_mut::<Float64Builder>()
/// .unwrap()
/// .append_value(3.14);
/// data_builders[1]
/// .as_any_mut()
/// .downcast_mut::<Int64Builder>()
/// .unwrap()
/// .append_value(-1);
/// data_builders[2]
/// .as_any_mut()
/// .downcast_mut::<StringBuilder>()
/// .unwrap()
/// .append_value("🍎");
///
/// // Finish
/// let array_refs: Vec<ArrayRef> = data_builders
/// .iter_mut()
/// .map(|builder| builder.finish())
/// .collect();
/// assert_eq!(array_refs[0].len(), 1);
/// assert_eq!(array_refs[1].is_null(0), false);
/// assert_eq!(
/// array_refs[2]
/// .as_any()
/// .downcast_ref::<StringArray>()
/// .unwrap()
/// .value(0),
/// "🍎"
/// );
/// ```
pub trait ArrayBuilder: Any + Send + Sync {
    /// Returns the number of array slots in the builder
    fn len(&self) -> usize;
    /// Returns whether number of array slots is zero
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
    /// Builds the array
    ///
    /// By convention this also resets the builder so it can be reused.
    fn finish(&mut self) -> ArrayRef;
    /// Builds the array without resetting the underlying builder.
    fn finish_cloned(&self) -> ArrayRef;
    /// Returns the builder as a non-mutable `Any` reference.
    ///
    /// This is most useful when one wants to call non-mutable APIs on a specific builder
    /// type. In this case, one can first cast this into a `Any`, and then use
    /// `downcast_ref` to get a reference on the specific builder.
    fn as_any(&self) -> &dyn Any;
    /// Returns the builder as a mutable `Any` reference.
    ///
    /// This is most useful when one wants to call mutable APIs on a specific builder
    /// type. In this case, one can first cast this into a `Any`, and then use
    /// `downcast_mut` to get a reference on the specific builder.
    fn as_any_mut(&mut self) -> &mut dyn Any;
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any>;
}
// Blanket delegation so a `Box<dyn ArrayBuilder>` is itself usable wherever
// an `ArrayBuilder` is expected (e.g. heterogeneous builder collections).
impl ArrayBuilder for Box<dyn ArrayBuilder> {
    fn len(&self) -> usize {
        (**self).len()
    }
    fn is_empty(&self) -> bool {
        (**self).is_empty()
    }
    fn finish(&mut self) -> ArrayRef {
        (**self).finish()
    }
    fn finish_cloned(&self) -> ArrayRef {
        (**self).finish_cloned()
    }
    fn as_any(&self) -> &dyn Any {
        (**self).as_any()
    }
    fn as_any_mut(&mut self) -> &mut dyn Any {
        (**self).as_any_mut()
    }
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        // Note: returns the outer `Box<Box<dyn ArrayBuilder>>` as `Any`,
        // not the inner builder — downcasts must target the nested type.
        self
    }
}
/// Builder for [`ListArray`](crate::array::ListArray) (`i32` offsets)
pub type ListBuilder<T> = GenericListBuilder<i32, T>;
/// Builder for [`LargeListArray`](crate::array::LargeListArray) (`i64` offsets)
pub type LargeListBuilder<T> = GenericListBuilder<i64, T>;
/// Builder for [`BinaryArray`](crate::array::BinaryArray)
///
/// See examples on [`GenericBinaryBuilder`]
pub type BinaryBuilder = GenericBinaryBuilder<i32>;
/// Builder for [`LargeBinaryArray`](crate::array::LargeBinaryArray)
///
/// See examples on [`GenericBinaryBuilder`]
pub type LargeBinaryBuilder = GenericBinaryBuilder<i64>;
/// Builder for [`StringArray`](crate::array::StringArray)
///
/// See examples on [`GenericStringBuilder`]
pub type StringBuilder = GenericStringBuilder<i32>;
/// Builder for [`LargeStringArray`](crate::array::LargeStringArray)
///
/// See examples on [`GenericStringBuilder`]
pub type LargeStringBuilder = GenericStringBuilder<i64>;

Просмотреть файл

@ -0,0 +1,182 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::ArrayBuilder;
use crate::{ArrayRef, NullArray};
use arrow_data::ArrayData;
use arrow_schema::DataType;
use std::any::Any;
use std::sync::Arc;
/// Builder for [`NullArray`]
///
/// # Example
///
/// Create a `NullArray` from a `NullBuilder`
///
/// ```
///
/// # use arrow_array::{Array, NullArray, builder::NullBuilder};
///
/// let mut b = NullBuilder::new();
/// b.append_empty_value();
/// b.append_null();
/// b.append_nulls(3);
/// b.append_empty_values(3);
/// let arr = b.finish();
///
/// assert_eq!(8, arr.len());
/// assert_eq!(0, arr.null_count());
/// ```
#[derive(Debug)]
pub struct NullBuilder {
    // Number of logical slots appended so far. This is the builder's only
    // state: a NullArray stores no value or validity buffers.
    len: usize,
}
impl Default for NullBuilder {
    /// Returns an empty builder (`len == 0`).
    fn default() -> Self {
        Self::new()
    }
}
impl NullBuilder {
    /// Creates a new null builder
    pub fn new() -> Self {
        Self { len: 0 }
    }

    /// Creates a new null builder with space for `capacity` elements without re-allocating
    #[deprecated = "there is no actual notion of capacity in the NullBuilder, so emulating it makes little sense"]
    pub fn with_capacity(_capacity: usize) -> Self {
        Self::new()
    }

    /// Returns the capacity of this builder measured in slots of type `T`
    #[deprecated = "there is no actual notion of capacity in the NullBuilder, so emulating it makes little sense"]
    pub fn capacity(&self) -> usize {
        self.len
    }

    /// Appends a null slot into the builder
    #[inline]
    pub fn append_null(&mut self) {
        self.len += 1;
    }

    /// Appends `n` `null`s into the builder.
    #[inline]
    pub fn append_nulls(&mut self, n: usize) {
        self.len += n;
    }

    /// Appends a null slot into the builder
    #[inline]
    pub fn append_empty_value(&mut self) {
        self.append_null();
    }

    /// Appends `n` `null`s into the builder.
    #[inline]
    pub fn append_empty_values(&mut self, n: usize) {
        self.append_nulls(n);
    }

    /// Builds the [NullArray] and resets this builder.
    pub fn finish(&mut self) -> NullArray {
        // Take the length while resetting it to 0, so `finish` matches both
        // its documented contract and the `ArrayBuilder` convention that
        // `finish` resets the builder. (Previously `len` was left untouched,
        // so a second `finish` re-emitted the same non-empty array.)
        let len = std::mem::take(&mut self.len);
        let builder = ArrayData::new_null(&DataType::Null, len).into_builder();
        // A Null-typed ArrayData of any length has no buffers to validate.
        let array_data = unsafe { builder.build_unchecked() };
        NullArray::from(array_data)
    }

    /// Builds the [NullArray] without resetting the builder.
    pub fn finish_cloned(&self) -> NullArray {
        let len = self.len;
        let builder = ArrayData::new_null(&DataType::Null, len).into_builder();
        // A Null-typed ArrayData of any length has no buffers to validate.
        let array_data = unsafe { builder.build_unchecked() };
        NullArray::from(array_data)
    }
}
impl ArrayBuilder for NullBuilder {
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }
    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
    /// Returns the number of array slots in the builder
    fn len(&self) -> usize {
        self.len
    }
    /// Builds the array (delegates to the inherent `finish`).
    fn finish(&mut self) -> ArrayRef {
        Arc::new(self.finish())
    }
    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        Arc::new(self.finish_cloned())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Array;

    // All append variants contribute to `len`; the resulting NullArray
    // reports zero nulls in `null_count` (Null arrays have no validity
    // bitmap) while still being logically all-null (`is_nullable`).
    #[test]
    fn test_null_array_builder() {
        let mut builder = NullArray::builder(10);
        builder.append_null();
        builder.append_nulls(4);
        builder.append_empty_value();
        builder.append_empty_values(4);
        let arr = builder.finish();
        assert_eq!(10, arr.len());
        assert_eq!(0, arr.offset());
        assert_eq!(0, arr.null_count());
        assert!(arr.is_nullable());
    }

    // `finish_cloned` must not reset the builder: appends after it
    // accumulate on top of the earlier slots.
    #[test]
    fn test_null_array_builder_finish_cloned() {
        let mut builder = NullArray::builder(16);
        builder.append_null();
        builder.append_empty_value();
        builder.append_empty_values(3);
        let mut array = builder.finish_cloned();
        assert_eq!(5, array.len());
        builder.append_empty_values(5);
        array = builder.finish();
        assert_eq!(10, array.len());
    }
}

Просмотреть файл

@ -0,0 +1,611 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::{ArrayBuilder, BufferBuilder};
use crate::types::*;
use crate::{ArrayRef, PrimitiveArray};
use arrow_buffer::NullBufferBuilder;
use arrow_buffer::{Buffer, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;
/// A signed 8-bit integer array builder.
pub type Int8Builder = PrimitiveBuilder<Int8Type>;
/// A signed 16-bit integer array builder.
pub type Int16Builder = PrimitiveBuilder<Int16Type>;
/// A signed 32-bit integer array builder.
pub type Int32Builder = PrimitiveBuilder<Int32Type>;
/// A signed 64-bit integer array builder.
pub type Int64Builder = PrimitiveBuilder<Int64Type>;
/// An unsigned 8-bit integer array builder.
pub type UInt8Builder = PrimitiveBuilder<UInt8Type>;
/// An unsigned 16-bit integer array builder.
pub type UInt16Builder = PrimitiveBuilder<UInt16Type>;
/// An unsigned 32-bit integer array builder.
pub type UInt32Builder = PrimitiveBuilder<UInt32Type>;
/// An unsigned 64-bit integer array builder.
pub type UInt64Builder = PrimitiveBuilder<UInt64Type>;
/// A 16-bit floating point array builder.
pub type Float16Builder = PrimitiveBuilder<Float16Type>;
/// A 32-bit floating point array builder.
pub type Float32Builder = PrimitiveBuilder<Float32Type>;
/// A 64-bit floating point array builder.
pub type Float64Builder = PrimitiveBuilder<Float64Type>;
/// A timestamp second array builder.
pub type TimestampSecondBuilder = PrimitiveBuilder<TimestampSecondType>;
/// A timestamp millisecond array builder.
pub type TimestampMillisecondBuilder = PrimitiveBuilder<TimestampMillisecondType>;
/// A timestamp microsecond array builder.
pub type TimestampMicrosecondBuilder = PrimitiveBuilder<TimestampMicrosecondType>;
/// A timestamp nanosecond array builder.
pub type TimestampNanosecondBuilder = PrimitiveBuilder<TimestampNanosecondType>;
/// A 32-bit date array builder.
pub type Date32Builder = PrimitiveBuilder<Date32Type>;
/// A 64-bit date array builder.
pub type Date64Builder = PrimitiveBuilder<Date64Type>;
/// A 32-bit elapsed time in seconds array builder.
pub type Time32SecondBuilder = PrimitiveBuilder<Time32SecondType>;
/// A 32-bit elapsed time in milliseconds array builder.
pub type Time32MillisecondBuilder = PrimitiveBuilder<Time32MillisecondType>;
/// A 64-bit elapsed time in microseconds array builder.
pub type Time64MicrosecondBuilder = PrimitiveBuilder<Time64MicrosecondType>;
/// A 64-bit elapsed time in nanoseconds array builder.
pub type Time64NanosecondBuilder = PrimitiveBuilder<Time64NanosecondType>;
/// A “calendar” interval in months array builder.
pub type IntervalYearMonthBuilder = PrimitiveBuilder<IntervalYearMonthType>;
/// A “calendar” interval in days and milliseconds array builder.
pub type IntervalDayTimeBuilder = PrimitiveBuilder<IntervalDayTimeType>;
/// A “calendar” interval in months, days, and nanoseconds array builder.
pub type IntervalMonthDayNanoBuilder = PrimitiveBuilder<IntervalMonthDayNanoType>;
/// An elapsed time in seconds array builder.
pub type DurationSecondBuilder = PrimitiveBuilder<DurationSecondType>;
/// An elapsed time in milliseconds array builder.
pub type DurationMillisecondBuilder = PrimitiveBuilder<DurationMillisecondType>;
/// An elapsed time in microseconds array builder.
pub type DurationMicrosecondBuilder = PrimitiveBuilder<DurationMicrosecondType>;
/// An elapsed time in nanoseconds array builder.
pub type DurationNanosecondBuilder = PrimitiveBuilder<DurationNanosecondType>;
/// A decimal 128 array builder
pub type Decimal128Builder = PrimitiveBuilder<Decimal128Type>;
/// A decimal 256 array builder
pub type Decimal256Builder = PrimitiveBuilder<Decimal256Type>;
/// Builder for [`PrimitiveArray`]
#[derive(Debug)]
pub struct PrimitiveBuilder<T: ArrowPrimitiveType> {
    // Raw value storage; null slots still occupy a (default/garbage) value
    // here, since `append_null` advances this builder too.
    values_builder: BufferBuilder<T::Native>,
    // Validity bitmap builder tracking which slots are null.
    null_buffer_builder: NullBufferBuilder,
    // Usually `T::DATA_TYPE`, but can be overridden via `with_data_type`
    // (e.g. timestamp timezone, decimal precision/scale).
    data_type: DataType,
}
impl<T: ArrowPrimitiveType> ArrayBuilder for PrimitiveBuilder<T> {
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }
    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
    /// Returns the number of array slots in the builder
    /// (taken from the values buffer, which grows on every append).
    fn len(&self) -> usize {
        self.values_builder.len()
    }
    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        Arc::new(self.finish())
    }
    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        Arc::new(self.finish_cloned())
    }
}
impl<T: ArrowPrimitiveType> Default for PrimitiveBuilder<T> {
    /// Delegates to [`Self::new`], which preallocates the default capacity.
    fn default() -> Self {
        Self::new()
    }
}
impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
    /// Creates a new primitive array builder with a default capacity of 1024 items
    pub fn new() -> Self {
        Self::with_capacity(1024)
    }
    /// Creates a new primitive array builder with space for `capacity` items
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            values_builder: BufferBuilder::<T::Native>::new(capacity),
            null_buffer_builder: NullBufferBuilder::new(capacity),
            data_type: T::DATA_TYPE,
        }
    }
    /// Creates a new primitive array builder from existing buffers
    ///
    /// If `null_buffer` is `None`, all existing values are treated as valid.
    pub fn new_from_buffer(
        values_buffer: MutableBuffer,
        null_buffer: Option<MutableBuffer>,
    ) -> Self {
        let values_builder = BufferBuilder::<T::Native>::new_from_buffer(values_buffer);
        let null_buffer_builder = null_buffer
            .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, values_builder.len()))
            .unwrap_or_else(|| NullBufferBuilder::new_with_len(values_builder.len()));
        Self {
            values_builder,
            null_buffer_builder,
            data_type: T::DATA_TYPE,
        }
    }
    /// By default [`PrimitiveBuilder`] uses [`ArrowPrimitiveType::DATA_TYPE`] as the
    /// data type of the generated array.
    ///
    /// This method allows overriding the data type, to allow specifying timezones
    /// for [`DataType::Timestamp`] or precision and scale for [`DataType::Decimal128`] and [`DataType::Decimal256`]
    ///
    /// # Panics
    ///
    /// This method panics if `data_type` is not [PrimitiveArray::is_compatible]
    pub fn with_data_type(self, data_type: DataType) -> Self {
        assert!(
            PrimitiveArray::<T>::is_compatible(&data_type),
            "incompatible data type for builder, expected {} got {}",
            T::DATA_TYPE,
            data_type
        );
        Self { data_type, ..self }
    }
    /// Returns the capacity of this builder measured in slots of type `T`
    pub fn capacity(&self) -> usize {
        self.values_builder.capacity()
    }
    /// Appends a value of type `T` into the builder
    #[inline]
    pub fn append_value(&mut self, v: T::Native) {
        self.null_buffer_builder.append_non_null();
        self.values_builder.append(v);
    }
    /// Appends a null slot into the builder
    #[inline]
    pub fn append_null(&mut self) {
        self.null_buffer_builder.append_null();
        // Keep the values buffer in lock-step: a null still occupies a slot.
        self.values_builder.advance(1);
    }
    /// Appends `n` no. of null's into the builder
    #[inline]
    pub fn append_nulls(&mut self, n: usize) {
        self.null_buffer_builder.append_n_nulls(n);
        self.values_builder.advance(n);
    }
    /// Appends an `Option<T>` into the builder
    #[inline]
    pub fn append_option(&mut self, v: Option<T::Native>) {
        match v {
            None => self.append_null(),
            Some(v) => self.append_value(v),
        };
    }
    /// Appends a slice of type `T` into the builder (all values valid)
    #[inline]
    pub fn append_slice(&mut self, v: &[T::Native]) {
        self.null_buffer_builder.append_n_non_nulls(v.len());
        self.values_builder.append_slice(v);
    }
    /// Appends values from a slice of type `T` and a validity boolean slice
    ///
    /// # Panics
    ///
    /// Panics if `values` and `is_valid` have different lengths
    #[inline]
    pub fn append_values(&mut self, values: &[T::Native], is_valid: &[bool]) {
        assert_eq!(
            values.len(),
            is_valid.len(),
            "Value and validity lengths must be equal"
        );
        self.null_buffer_builder.append_slice(is_valid);
        self.values_builder.append_slice(values);
    }
    /// Appends values from a trusted length iterator.
    ///
    /// # Safety
    /// This requires the iterator be a trusted length. This could instead require
    /// the iterator implement `TrustedLen` once that is stabilized.
    #[inline]
    pub unsafe fn append_trusted_len_iter(&mut self, iter: impl IntoIterator<Item = T::Native>) {
        let iter = iter.into_iter();
        // The upper bound of the size hint is trusted to be exact.
        let len = iter
            .size_hint()
            .1
            .expect("append_trusted_len_iter requires an upper bound");
        self.null_buffer_builder.append_n_non_nulls(len);
        self.values_builder.append_trusted_len_iter(iter);
    }
    /// Builds the [`PrimitiveArray`] and reset this builder.
    pub fn finish(&mut self) -> PrimitiveArray<T> {
        let len = self.len();
        let nulls = self.null_buffer_builder.finish();
        let builder = ArrayData::builder(self.data_type.clone())
            .len(len)
            .add_buffer(self.values_builder.finish())
            .nulls(nulls);
        // SAFETY-NOTE(review): buffers come from this builder's own state,
        // which keeps values and validity the same length.
        let array_data = unsafe { builder.build_unchecked() };
        PrimitiveArray::<T>::from(array_data)
    }
    /// Builds the [`PrimitiveArray`] without resetting the builder.
    pub fn finish_cloned(&self) -> PrimitiveArray<T> {
        let len = self.len();
        let nulls = self.null_buffer_builder.finish_cloned();
        // Copy the current values instead of draining the builder.
        let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice());
        let builder = ArrayData::builder(self.data_type.clone())
            .len(len)
            .add_buffer(values_buffer)
            .nulls(nulls);
        // SAFETY-NOTE(review): same invariants as `finish` above.
        let array_data = unsafe { builder.build_unchecked() };
        PrimitiveArray::<T>::from(array_data)
    }
    /// Returns the current values buffer as a slice
    pub fn values_slice(&self) -> &[T::Native] {
        self.values_builder.as_slice()
    }
    /// Returns the current values buffer as a mutable slice
    pub fn values_slice_mut(&mut self) -> &mut [T::Native] {
        self.values_builder.as_slice_mut()
    }
    /// Returns the current null buffer as a slice
    pub fn validity_slice(&self) -> Option<&[u8]> {
        self.null_buffer_builder.as_slice()
    }
    /// Returns the current null buffer as a mutable slice
    pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
        self.null_buffer_builder.as_slice_mut()
    }
    /// Returns the current values buffer and null buffer as a slice
    pub fn slices_mut(&mut self) -> (&mut [T::Native], Option<&mut [u8]>) {
        (
            self.values_builder.as_slice_mut(),
            self.null_buffer_builder.as_slice_mut(),
        )
    }
}
impl<P: DecimalType> PrimitiveBuilder<P> {
    /// Sets the precision and scale of the generated decimal data type
    ///
    /// # Errors
    ///
    /// Returns an error if the precision/scale combination is invalid for `P`.
    pub fn with_precision_and_scale(self, precision: u8, scale: i8) -> Result<Self, ArrowError> {
        validate_decimal_precision_and_scale::<P>(precision, scale)?;
        Ok(Self {
            data_type: P::TYPE_CONSTRUCTOR(precision, scale),
            ..self
        })
    }
}
impl<P: ArrowTimestampType> PrimitiveBuilder<P> {
    /// Sets the timezone of the generated timestamp data type
    pub fn with_timezone(self, timezone: impl Into<Arc<str>>) -> Self {
        self.with_timezone_opt(Some(timezone.into()))
    }
    /// Sets an optional timezone (`None` clears any timezone)
    pub fn with_timezone_opt<S: Into<Arc<str>>>(self, timezone: Option<S>) -> Self {
        Self {
            // Rebuild the data type with the same time unit but new timezone.
            data_type: DataType::Timestamp(P::UNIT, timezone.map(Into::into)),
            ..self
        }
    }
}
impl<P: ArrowPrimitiveType> Extend<Option<P::Native>> for PrimitiveBuilder<P> {
    /// Appends every optional value yielded by `iter`, preserving nulls.
    #[inline]
    fn extend<T: IntoIterator<Item = Option<P::Native>>>(&mut self, iter: T) {
        iter.into_iter().for_each(|value| self.append_option(value));
    }
}
#[cfg(test)]
mod tests {
use super::*;
use arrow_schema::TimeUnit;
use crate::array::Array;
use crate::array::BooleanArray;
use crate::array::Date32Array;
use crate::array::Int32Array;
use crate::array::TimestampSecondArray;
// Appending individual values yields a fully-valid array.
#[test]
fn test_primitive_array_builder_i32() {
    let mut builder = Int32Array::builder(5);
    for i in 0..5 {
        builder.append_value(i);
    }
    let arr = builder.finish();
    assert_eq!(5, arr.len());
    assert_eq!(0, arr.offset());
    assert_eq!(0, arr.null_count());
    for i in 0..5 {
        assert!(!arr.is_null(i));
        assert!(arr.is_valid(i));
        assert_eq!(i as i32, arr.value(i));
    }
}

// `append_trusted_len_iter` (unsafe fast path) matches per-value appends.
#[test]
fn test_primitive_array_builder_i32_append_iter() {
    let mut builder = Int32Array::builder(5);
    unsafe { builder.append_trusted_len_iter(0..5) };
    let arr = builder.finish();
    assert_eq!(5, arr.len());
    assert_eq!(0, arr.offset());
    assert_eq!(0, arr.null_count());
    for i in 0..5 {
        assert!(!arr.is_null(i));
        assert!(arr.is_valid(i));
        assert_eq!(i as i32, arr.value(i));
    }
}

// `append_nulls` produces an all-null array of the requested length.
#[test]
fn test_primitive_array_builder_i32_append_nulls() {
    let mut builder = Int32Array::builder(5);
    builder.append_nulls(5);
    let arr = builder.finish();
    assert_eq!(5, arr.len());
    assert_eq!(0, arr.offset());
    assert_eq!(5, arr.null_count());
    for i in 0..5 {
        assert!(arr.is_null(i));
        assert!(!arr.is_valid(i));
    }
}
// Temporal builders (Date32) behave like plain integer builders.
#[test]
fn test_primitive_array_builder_date32() {
    let mut builder = Date32Array::builder(5);
    for i in 0..5 {
        builder.append_value(i);
    }
    let arr = builder.finish();
    assert_eq!(5, arr.len());
    assert_eq!(0, arr.offset());
    assert_eq!(0, arr.null_count());
    for i in 0..5 {
        assert!(!arr.is_null(i));
        assert!(arr.is_valid(i));
        assert_eq!(i as i32, arr.value(i));
    }
}

// Timestamp builders store raw i64 values.
#[test]
fn test_primitive_array_builder_timestamp_second() {
    let mut builder = TimestampSecondArray::builder(5);
    for i in 0..5 {
        builder.append_value(i);
    }
    let arr = builder.finish();
    assert_eq!(5, arr.len());
    assert_eq!(0, arr.offset());
    assert_eq!(0, arr.null_count());
    for i in 0..5 {
        assert!(!arr.is_null(i));
        assert!(arr.is_valid(i));
        assert_eq!(i as i64, arr.value(i));
    }
}

// Booleans pack into a bitmap; check the exact backing bytes.
#[test]
fn test_primitive_array_builder_bool() {
    // 00000010 01001000
    let buf = Buffer::from([72_u8, 2_u8]);
    let mut builder = BooleanArray::builder(10);
    for i in 0..10 {
        if i == 3 || i == 6 || i == 9 {
            builder.append_value(true);
        } else {
            builder.append_value(false);
        }
    }
    let arr = builder.finish();
    assert_eq!(&buf, arr.values().inner());
    assert_eq!(10, arr.len());
    assert_eq!(0, arr.offset());
    assert_eq!(0, arr.null_count());
    for i in 0..10 {
        assert!(!arr.is_null(i));
        assert!(arr.is_valid(i));
        assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {i}")
    }
}
#[test]
fn test_primitive_array_builder_append_option() {
let arr1 = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]);
let mut builder = Int32Array::builder(5);
builder.append_option(Some(0));
builder.append_option(None);
builder.append_option(Some(2));
builder.append_option(None);
builder.append_option(Some(4));
let arr2 = builder.finish();
assert_eq!(arr1.len(), arr2.len());
assert_eq!(arr1.offset(), arr2.offset());
assert_eq!(arr1.null_count(), arr2.null_count());
for i in 0..5 {
assert_eq!(arr1.is_null(i), arr2.is_null(i));
assert_eq!(arr1.is_valid(i), arr2.is_valid(i));
if arr1.is_valid(i) {
assert_eq!(arr1.value(i), arr2.value(i));
}
}
}
#[test]
fn test_primitive_array_builder_append_null() {
    // Interleaving `append_value` and `append_null` must match `Int32Array::from`.
    let expected = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]);
    let mut builder = Int32Array::builder(5);
    builder.append_value(0);
    builder.append_value(2);
    for _ in 0..2 {
        builder.append_null();
    }
    builder.append_value(4);
    let built = builder.finish();
    assert_eq!(built.len(), expected.len());
    assert_eq!(built.offset(), expected.offset());
    assert_eq!(built.null_count(), expected.null_count());
    for i in 0..5 {
        assert_eq!(built.is_null(i), expected.is_null(i));
        assert_eq!(built.is_valid(i), expected.is_valid(i));
        if expected.is_valid(i) {
            assert_eq!(built.value(i), expected.value(i));
        }
    }
}
#[test]
fn test_primitive_array_builder_append_slice() {
    // `append_slice` followed by nulls and a value must match `Int32Array::from`.
    let expected = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]);
    let mut builder = Int32Array::builder(5);
    builder.append_slice(&[0, 2]);
    builder.append_null();
    builder.append_null();
    builder.append_value(4);
    let built = builder.finish();
    assert_eq!(built.len(), expected.len());
    assert_eq!(built.offset(), expected.offset());
    assert_eq!(built.null_count(), expected.null_count());
    for i in 0..5 {
        assert_eq!(built.is_null(i), expected.is_null(i));
        assert_eq!(built.is_valid(i), expected.is_valid(i));
        if expected.is_valid(i) {
            assert_eq!(built.value(i), expected.value(i));
        }
    }
}
#[test]
fn test_primitive_array_builder_finish() {
    // `finish` hands back the accumulated values and leaves the builder
    // empty, ready for reuse.
    let mut builder = Int32Builder::new();
    builder.append_slice(&[2, 4, 6, 8]);
    let first = builder.finish();
    assert_eq!(first.len(), 4);
    assert_eq!(builder.len(), 0);
    builder.append_slice(&[1, 3, 5, 7, 9]);
    let second = builder.finish();
    assert_eq!(second.len(), 5);
    assert_eq!(builder.len(), 0);
}
#[test]
fn test_primitive_array_builder_finish_cloned() {
    // `finish_cloned` snapshots the builder without clearing it, so later
    // appends keep accumulating on top of the earlier values.
    let mut builder = Int32Builder::new();
    builder.append_value(23);
    builder.append_value(45);
    assert_eq!(builder.finish_cloned(), Int32Array::from(vec![23, 45]));
    builder.append_value(56);
    assert_eq!(builder.finish_cloned(), Int32Array::from(vec![23, 45, 56]));
    // A plain `finish` then drains everything appended so far.
    builder.append_slice(&[2, 4, 6, 8]);
    let drained = builder.finish();
    assert_eq!(drained.len(), 7);
    assert_eq!(drained, Int32Array::from(vec![23, 45, 56, 2, 4, 6, 8]));
    assert_eq!(builder.len(), 0);
    builder.append_slice(&[1, 3, 5, 7, 9]);
    assert_eq!(builder.finish().len(), 5);
    assert_eq!(builder.len(), 0);
}
#[test]
fn test_primitive_array_builder_with_data_type() {
    // Decimal: precision/scale supplied through `with_data_type` reach the array.
    let mut decimal_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2));
    decimal_builder.append_value(1);
    let decimal = decimal_builder.finish();
    assert_eq!(decimal.precision(), 1);
    assert_eq!(decimal.scale(), 2);
    // Timestamp: the timezone is carried through to the finished array's type.
    let ts_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into()));
    let mut ts_builder = TimestampNanosecondBuilder::new().with_data_type(ts_type.clone());
    ts_builder.append_value(1);
    assert_eq!(ts_builder.finish().data_type(), &ts_type);
}
#[test]
#[should_panic(expected = "incompatible data type for builder, expected Int32 got Int64")]
fn test_invalid_with_data_type() {
    // `with_data_type` rejects a type incompatible with the builder's native type.
    Int32Builder::new().with_data_type(DataType::Int64);
}
#[test]
fn test_extend() {
    // Two successive `extend` calls accumulate into the same builder.
    let mut builder = PrimitiveBuilder::<Int16Type>::new();
    for chunk in [vec![1, 2, 3, 5, 2, 4, 4], vec![2, 4, 6, 2]] {
        builder.extend(chunk.into_iter().map(Some));
    }
    assert_eq!(
        builder.finish().values(),
        &[1, 2, 3, 5, 2, 4, 4, 2, 4, 6, 2]
    );
}
}

Просмотреть файл

@ -0,0 +1,402 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::{ArrayBuilder, PrimitiveBuilder};
use crate::types::ArrowDictionaryKeyType;
use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray};
use arrow_buffer::{ArrowNativeType, ToByteSlice};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;
/// Wraps a type implementing `ToByteSlice` implementing `Hash` and `Eq` for it
///
/// This is necessary to handle types such as f32, which don't natively implement these
#[derive(Debug)]
struct Value<T>(T);
impl<T: ToByteSlice> std::hash::Hash for Value<T> {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        // Hash the raw byte representation, consistent with the byte-wise
        // equality defined below.
        self.0.to_byte_slice().hash(state)
    }
}
impl<T: ToByteSlice> PartialEq for Value<T> {
    fn eq(&self, other: &Self) -> bool {
        // Byte-wise equality: values with identical bit patterns compare equal,
        // which is exactly what the dictionary de-duplication map needs.
        self.0.to_byte_slice().eq(other.0.to_byte_slice())
    }
}
// Byte-wise equality is reflexive, so `Eq` holds even for float payloads.
impl<T: ToByteSlice> Eq for Value<T> {}
/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray)
///
/// # Example:
///
/// ```
///
/// # use arrow_array::builder::PrimitiveDictionaryBuilder;
/// # use arrow_array::types::{UInt32Type, UInt8Type};
/// # use arrow_array::{Array, UInt32Array, UInt8Array};
///
/// let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();
/// builder.append(12345678).unwrap();
/// builder.append_null();
/// builder.append(22345678).unwrap();
/// let array = builder.finish();
///
/// assert_eq!(
/// array.keys(),
/// &UInt8Array::from(vec![Some(0), None, Some(1)])
/// );
///
/// // Values are polymorphic and so require a downcast.
/// let av = array.values();
/// let ava: &UInt32Array = av.as_any().downcast_ref::<UInt32Array>().unwrap();
/// let avs: &[u32] = ava.values();
///
/// assert!(!array.is_null(0));
/// assert!(array.is_null(1));
/// assert!(!array.is_null(2));
///
/// assert_eq!(avs, &[12345678, 22345678]);
/// ```
#[derive(Debug)]
pub struct PrimitiveDictionaryBuilder<K, V>
where
    K: ArrowPrimitiveType,
    V: ArrowPrimitiveType,
{
    // Builds the keys array: one entry (dictionary index or null) per logical slot
    keys_builder: PrimitiveBuilder<K>,
    // Builds the dictionary values array: each distinct value appears exactly once
    values_builder: PrimitiveBuilder<V>,
    // De-duplication map from value (compared byte-wise via `Value`) to its
    // index in `values_builder`
    map: HashMap<Value<V::Native>, usize>,
}
impl<K, V> Default for PrimitiveDictionaryBuilder<K, V>
where
    K: ArrowPrimitiveType,
    V: ArrowPrimitiveType,
{
    /// Equivalent to [`Self::new`]: an empty builder with default capacities.
    fn default() -> Self {
        Self::new()
    }
}
impl<K, V> PrimitiveDictionaryBuilder<K, V>
where
    K: ArrowPrimitiveType,
    V: ArrowPrimitiveType,
{
    /// Creates a new `PrimitiveDictionaryBuilder`.
    pub fn new() -> Self {
        Self {
            map: HashMap::new(),
            keys_builder: PrimitiveBuilder::new(),
            values_builder: PrimitiveBuilder::new(),
        }
    }

    /// Creates a new `PrimitiveDictionaryBuilder` from the provided keys and values builders.
    ///
    /// # Panics
    ///
    /// This method panics if `keys_builder` or `values_builder` is not empty.
    pub fn new_from_empty_builders(
        keys_builder: PrimitiveBuilder<K>,
        values_builder: PrimitiveBuilder<V>,
    ) -> Self {
        assert!(
            keys_builder.is_empty() && values_builder.is_empty(),
            "keys and values builders must be empty"
        );
        Self {
            map: HashMap::new(),
            keys_builder,
            values_builder,
        }
    }

    /// Creates a new `PrimitiveDictionaryBuilder` from existing `PrimitiveBuilder`s of keys and values.
    ///
    /// # Safety
    ///
    /// caller must ensure that the passed in builders are valid for DictionaryArray.
    pub unsafe fn new_from_builders(
        keys_builder: PrimitiveBuilder<K>,
        values_builder: PrimitiveBuilder<V>,
    ) -> Self {
        // Rebuild the de-duplication map from the existing key/value pairs so
        // subsequent appends keep re-using the indices already present.
        let mut map = HashMap::with_capacity(values_builder.values_slice().len());
        for (key, value) in keys_builder
            .values_slice()
            .iter()
            .zip(values_builder.values_slice())
        {
            map.insert(Value(*value), K::Native::to_usize(*key).unwrap());
        }
        Self {
            map,
            keys_builder,
            values_builder,
        }
    }

    /// Creates a new `PrimitiveDictionaryBuilder` with the provided capacities
    ///
    /// `keys_capacity`: the number of keys, i.e. length of array to build
    /// `values_capacity`: the number of distinct dictionary values, i.e. size of dictionary
    pub fn with_capacity(keys_capacity: usize, values_capacity: usize) -> Self {
        Self {
            map: HashMap::with_capacity(values_capacity),
            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
            values_builder: PrimitiveBuilder::with_capacity(values_capacity),
        }
    }
}
impl<K, V> ArrayBuilder for PrimitiveDictionaryBuilder<K, V>
where
    K: ArrowDictionaryKeyType,
    V: ArrowPrimitiveType,
{
    /// Returns the builder as an non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }
    /// Returns the builder as an mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
    /// Returns the number of array slots in the builder
    ///
    /// Both values and nulls occupy one key slot each, so the keys length is
    /// the logical length of the array being built.
    fn len(&self) -> usize {
        self.keys_builder.len()
    }
    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        // Inherent methods take precedence over trait methods, so this calls
        // the inherent `finish` returning `DictionaryArray<K>`.
        Arc::new(self.finish())
    }
    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        // Likewise delegates to the inherent `finish_cloned`.
        Arc::new(self.finish_cloned())
    }
}
impl<K, V> PrimitiveDictionaryBuilder<K, V>
where
    K: ArrowDictionaryKeyType,
    V: ArrowPrimitiveType,
{
    /// Append a primitive value to the array. Return an existing index
    /// if already present in the values array or a new index if the
    /// value is appended to the values array.
    ///
    /// # Errors
    ///
    /// Returns [`ArrowError::DictionaryKeyOverflowError`] if a new dictionary
    /// entry is required but its index does not fit in `K::Native`.
    #[inline]
    pub fn append(&mut self, value: V::Native) -> Result<K::Native, ArrowError> {
        let key = match self.map.entry(Value(value)) {
            Entry::Vacant(vacant) => {
                // First occurrence: append to the values array and remember its index.
                let key = self.values_builder.len();
                self.values_builder.append_value(value);
                vacant.insert(key);
                K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)?
            }
            // Already interned: re-use the existing dictionary index.
            Entry::Occupied(o) => K::Native::usize_as(*o.get()),
        };
        self.keys_builder.append_value(key);
        Ok(key)
    }
    /// Infallibly append a value to this builder
    ///
    /// # Panics
    ///
    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
    #[inline]
    pub fn append_value(&mut self, value: V::Native) {
        self.append(value).expect("dictionary key overflow");
    }
    /// Appends a null slot into the builder
    #[inline]
    pub fn append_null(&mut self) {
        // Nulls are encoded in the keys array only; no dictionary value is added.
        self.keys_builder.append_null()
    }
    /// Append an `Option` value into the builder
    ///
    /// # Panics
    ///
    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
    #[inline]
    pub fn append_option(&mut self, value: Option<V::Native>) {
        match value {
            None => self.append_null(),
            Some(v) => self.append_value(v),
        };
    }
    /// Builds the `DictionaryArray` and reset this builder.
    pub fn finish(&mut self) -> DictionaryArray<K> {
        self.map.clear();
        let values = self.values_builder.finish();
        let keys = self.keys_builder.finish();
        let data_type =
            DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
        let builder = keys
            .into_data()
            .into_builder()
            .data_type(data_type)
            .child_data(vec![values.into_data()]);
        // SAFETY: keys were produced by this builder and are valid indices into `values`.
        DictionaryArray::from(unsafe { builder.build_unchecked() })
    }
    /// Builds the `DictionaryArray` without resetting the builder.
    pub fn finish_cloned(&self) -> DictionaryArray<K> {
        let values = self.values_builder.finish_cloned();
        let keys = self.keys_builder.finish_cloned();
        // Use the finished values array's actual data type rather than
        // `V::DATA_TYPE`: the values builder may carry a parameterised type
        // (e.g. decimal precision/scale or timestamp timezone) supplied via
        // `with_data_type`, and `finish` already honours it — the two methods
        // must produce the same dictionary data type.
        let data_type =
            DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
        let builder = keys
            .into_data()
            .into_builder()
            .data_type(data_type)
            .child_data(vec![values.into_data()]);
        // SAFETY: keys were produced by this builder and are valid indices into `values`.
        DictionaryArray::from(unsafe { builder.build_unchecked() })
    }
    /// Returns the current dictionary values buffer as a slice
    pub fn values_slice(&self) -> &[V::Native] {
        self.values_builder.values_slice()
    }
    /// Returns the current dictionary values buffer as a mutable slice
    pub fn values_slice_mut(&mut self) -> &mut [V::Native] {
        self.values_builder.values_slice_mut()
    }
    /// Returns the current null buffer as a slice
    pub fn validity_slice(&self) -> Option<&[u8]> {
        self.keys_builder.validity_slice()
    }
}
impl<K: ArrowDictionaryKeyType, P: ArrowPrimitiveType> Extend<Option<P::Native>>
    for PrimitiveDictionaryBuilder<K, P>
{
    /// Appends every optional value yielded by `iter` to the builder.
    #[inline]
    fn extend<T: IntoIterator<Item = Option<P::Native>>>(&mut self, iter: T) {
        iter.into_iter().for_each(|value| self.append_option(value));
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::array::UInt32Array;
    use crate::array::UInt8Array;
    use crate::builder::Decimal128Builder;
    use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type};
    #[test]
    fn test_primitive_dictionary_builder() {
        let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
        builder.append(12345678).unwrap();
        builder.append_null();
        builder.append(22345678).unwrap();
        let array = builder.finish();
        // Distinct values receive consecutive dictionary indices; the null is
        // encoded as a null key slot.
        assert_eq!(
            array.keys(),
            &UInt8Array::from(vec![Some(0), None, Some(1)])
        );
        // Values are polymorphic and so require a downcast.
        let av = array.values();
        let ava: &UInt32Array = av.as_any().downcast_ref::<UInt32Array>().unwrap();
        let avs: &[u32] = ava.values();
        assert!(!array.is_null(0));
        assert!(array.is_null(1));
        assert!(!array.is_null(2));
        assert_eq!(avs, &[12345678, 22345678]);
    }
    #[test]
    fn test_extend() {
        let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
        builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some));
        builder.extend([4, 5, 1, 3, 1].into_iter().map(Some));
        let dict = builder.finish();
        // Repeated values are de-duplicated: only 5 distinct dictionary entries.
        assert_eq!(
            dict.keys().values(),
            &[0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 0, 2, 0]
        );
        assert_eq!(dict.values().len(), 5);
    }
    #[test]
    #[should_panic(expected = "DictionaryKeyOverflowError")]
    fn test_primitive_dictionary_overflow() {
        let mut builder =
            PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(257, 257);
        // 256 unique keys.
        for i in 0..256 {
            builder.append(i + 1000).unwrap();
        }
        // Special error if the key overflows (256th entry)
        builder.append(1257).unwrap();
    }
    #[test]
    fn test_primitive_dictionary_with_builders() {
        // The values builder's parameterised data type (Decimal128(1, 2)) must
        // survive into the finished dictionary array's type.
        let keys_builder = PrimitiveBuilder::<Int32Type>::new();
        let values_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2));
        let mut builder =
            PrimitiveDictionaryBuilder::<Int32Type, Decimal128Type>::new_from_empty_builders(
                keys_builder,
                values_builder,
            );
        let dict_array = builder.finish();
        assert_eq!(dict_array.value_type(), DataType::Decimal128(1, 2));
        assert_eq!(
            dict_array.data_type(),
            &DataType::Dictionary(
                Box::new(DataType::Int32),
                Box::new(DataType::Decimal128(1, 2)),
            )
        );
    }
}

Просмотреть файл

@ -0,0 +1,311 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use std::{any::Any, sync::Arc};
use crate::{types::RunEndIndexType, ArrayRef, ArrowPrimitiveType, RunArray};
use super::{ArrayBuilder, PrimitiveBuilder};
use arrow_buffer::ArrowNativeType;
/// Builder for [`RunArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray)
///
/// # Example:
///
/// ```
///
/// # use arrow_array::builder::PrimitiveRunBuilder;
/// # use arrow_array::cast::AsArray;
/// # use arrow_array::types::{UInt32Type, Int16Type};
/// # use arrow_array::{Array, UInt32Array, Int16Array};
///
/// let mut builder =
/// PrimitiveRunBuilder::<Int16Type, UInt32Type>::new();
/// builder.append_value(1234);
/// builder.append_value(1234);
/// builder.append_value(1234);
/// builder.append_null();
/// builder.append_value(5678);
/// builder.append_value(5678);
/// let array = builder.finish();
///
/// assert_eq!(array.run_ends().values(), &[3, 4, 6]);
///
/// let av = array.values();
///
/// assert!(!av.is_null(0));
/// assert!(av.is_null(1));
/// assert!(!av.is_null(2));
///
/// // Values are polymorphic and so require a downcast.
/// let ava: &UInt32Array = av.as_primitive::<UInt32Type>();
///
/// assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)]));
/// ```
#[derive(Debug)]
pub struct PrimitiveRunBuilder<R, V>
where
    R: RunEndIndexType,
    V: ArrowPrimitiveType,
{
    // Run ends (exclusive logical indices) of runs flushed so far
    run_ends_builder: PrimitiveBuilder<R>,
    // Value of each flushed run; a `None` run is stored as a null slot
    values_builder: PrimitiveBuilder<V>,
    // Value of the in-progress run, not yet flushed to the child builders
    current_value: Option<V::Native>,
    // Logical length so far, i.e. the run end of the in-progress run
    current_run_end_index: usize,
    // Run end recorded by the last flush; equals `current_run_end_index`
    // when there is no pending run
    prev_run_end_index: usize,
}
impl<R, V> Default for PrimitiveRunBuilder<R, V>
where
    R: RunEndIndexType,
    V: ArrowPrimitiveType,
{
    /// Equivalent to [`Self::new`]: an empty builder with default capacities.
    fn default() -> Self {
        Self::new()
    }
}
impl<R, V> PrimitiveRunBuilder<R, V>
where
R: RunEndIndexType,
V: ArrowPrimitiveType,
{
/// Creates a new `PrimitiveRunBuilder`
pub fn new() -> Self {
Self {
run_ends_builder: PrimitiveBuilder::new(),
values_builder: PrimitiveBuilder::new(),
current_value: None,
current_run_end_index: 0,
prev_run_end_index: 0,
}
}
/// Creates a new `PrimitiveRunBuilder` with the provided capacity
///
/// `capacity`: the expected number of run-end encoded values.
pub fn with_capacity(capacity: usize) -> Self {
Self {
run_ends_builder: PrimitiveBuilder::with_capacity(capacity),
values_builder: PrimitiveBuilder::with_capacity(capacity),
current_value: None,
current_run_end_index: 0,
prev_run_end_index: 0,
}
}
}
impl<R, V> ArrayBuilder for PrimitiveRunBuilder<R, V>
where
    R: RunEndIndexType,
    V: ArrowPrimitiveType,
{
    /// Returns the builder as a non-mutable `Any` reference.
    fn as_any(&self) -> &dyn Any {
        self
    }
    /// Returns the builder as a mutable `Any` reference.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
    /// Returns the length of logical array encoded by
    /// the eventual runs array.
    ///
    /// This includes the in-progress run, which has not yet been flushed to
    /// the child builders.
    fn len(&self) -> usize {
        self.current_run_end_index
    }
    /// Builds the array and reset this builder.
    fn finish(&mut self) -> ArrayRef {
        // Inherent methods take precedence over trait methods, so this calls
        // the inherent `finish` returning `RunArray<R>`.
        Arc::new(self.finish())
    }
    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        // Likewise delegates to the inherent `finish_cloned`.
        Arc::new(self.finish_cloned())
    }
}
impl<R, V> PrimitiveRunBuilder<R, V>
where
    R: RunEndIndexType,
    V: ArrowPrimitiveType,
{
    /// Appends optional value to the logical array encoded by the RunArray.
    pub fn append_option(&mut self, value: Option<V::Native>) {
        // The very first append just opens a run; nothing to flush yet.
        if self.current_run_end_index == 0 {
            self.current_run_end_index = 1;
            self.current_value = value;
            return;
        }
        if self.current_value != value {
            // Value changed: flush the completed run, then start a new one.
            self.append_run_end();
            self.current_value = value;
        }
        self.current_run_end_index += 1;
    }
    /// Appends value to the logical array encoded by the run-ends array.
    pub fn append_value(&mut self, value: V::Native) {
        self.append_option(Some(value))
    }
    /// Appends null to the logical array encoded by the run-ends array.
    pub fn append_null(&mut self) {
        self.append_option(None)
    }
    /// Creates the RunArray and resets the builder.
    /// Panics if RunArray cannot be built.
    pub fn finish(&mut self) -> RunArray<R> {
        // write the last run end to the array.
        self.append_run_end();
        // reset the run index to zero.
        self.current_value = None;
        self.current_run_end_index = 0;
        // build the run encoded array by adding run_ends and values array as its children.
        let run_ends_array = self.run_ends_builder.finish();
        let values_array = self.values_builder.finish();
        RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap()
    }
    /// Creates the RunArray and without resetting the builder.
    /// Panics if RunArray cannot be built.
    pub fn finish_cloned(&self) -> RunArray<R> {
        let mut run_ends_array = self.run_ends_builder.finish_cloned();
        let mut values_array = self.values_builder.finish_cloned();
        // Add current run if one exists. `self` must stay untouched, so the
        // pending run is appended onto clones of the child arrays instead.
        if self.prev_run_end_index != self.current_run_end_index {
            let mut run_end_builder = run_ends_array.into_builder().unwrap();
            let mut values_builder = values_array.into_builder().unwrap();
            self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder);
            run_ends_array = run_end_builder.finish();
            values_array = values_builder.finish();
        }
        RunArray::try_new(&run_ends_array, &values_array).unwrap()
    }
    // Appends the current run to the array.
    fn append_run_end(&mut self) {
        // empty array or the function called without appending any value.
        if self.prev_run_end_index == self.current_run_end_index {
            return;
        }
        let run_end_index = self.run_end_index_as_native();
        self.run_ends_builder.append_value(run_end_index);
        self.values_builder.append_option(self.current_value);
        self.prev_run_end_index = self.current_run_end_index;
    }
    // Similar to `append_run_end` but on custom builders.
    // Used in `finish_cloned` which is not supposed to mutate `self`.
    fn append_run_end_with_builders(
        &self,
        run_ends_builder: &mut PrimitiveBuilder<R>,
        values_builder: &mut PrimitiveBuilder<V>,
    ) {
        let run_end_index = self.run_end_index_as_native();
        run_ends_builder.append_value(run_end_index);
        values_builder.append_option(self.current_value);
    }
    // Converts the logical length to `R::Native`, panicking if the run end
    // overflows the run-end index type.
    fn run_end_index_as_native(&self) -> R::Native {
        R::Native::from_usize(self.current_run_end_index)
            .unwrap_or_else(|| panic!(
                "Cannot convert `current_run_end_index` {} from `usize` to native form of arrow datatype {}",
                self.current_run_end_index,
                R::DATA_TYPE
            ))
    }
}
impl<R, V> Extend<Option<V::Native>> for PrimitiveRunBuilder<R, V>
where
    R: RunEndIndexType,
    V: ArrowPrimitiveType,
{
    /// Appends every optional value yielded by `iter` to the builder.
    fn extend<T: IntoIterator<Item = Option<V::Native>>>(&mut self, iter: T) {
        iter.into_iter().for_each(|value| self.append_option(value));
    }
}
#[cfg(test)]
mod tests {
    use crate::builder::PrimitiveRunBuilder;
    use crate::cast::AsArray;
    use crate::types::{Int16Type, UInt32Type};
    use crate::{Array, UInt32Array};
    #[test]
    fn test_primitive_ree_array_builder() {
        let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::new();
        builder.append_value(1234);
        builder.append_value(1234);
        builder.append_value(1234);
        builder.append_null();
        builder.append_value(5678);
        builder.append_value(5678);
        let array = builder.finish();
        // The logical array has no nulls at the top level; the null lives in
        // the values child as the second run's value.
        assert_eq!(array.null_count(), 0);
        assert_eq!(array.len(), 6);
        // Three runs: [1234 x3], [null x1], [5678 x2].
        assert_eq!(array.run_ends().values(), &[3, 4, 6]);
        let av = array.values();
        assert!(!av.is_null(0));
        assert!(av.is_null(1));
        assert!(!av.is_null(2));
        // Values are polymorphic and so require a downcast.
        let ava: &UInt32Array = av.as_primitive::<UInt32Type>();
        assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)]));
    }
    #[test]
    fn test_extend() {
        let mut builder = PrimitiveRunBuilder::<Int16Type, Int16Type>::new();
        builder.extend([1, 2, 2, 5, 5, 4, 4].into_iter().map(Some));
        builder.extend([4, 4, 6, 2].into_iter().map(Some));
        let array = builder.finish();
        // The run of 4s spans the boundary between the two `extend` calls.
        assert_eq!(array.len(), 11);
        assert_eq!(array.null_count(), 0);
        assert_eq!(array.run_ends().values(), &[1, 3, 5, 9, 10, 11]);
        assert_eq!(
            array.values().as_primitive::<Int16Type>().values(),
            &[1, 2, 5, 4, 6, 2]
        );
    }
}

Просмотреть файл

@ -0,0 +1,730 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::*;
use crate::StructArray;
use arrow_buffer::NullBufferBuilder;
use arrow_schema::{DataType, Fields, IntervalUnit, SchemaBuilder, TimeUnit};
use std::sync::Arc;
/// Builder for [`StructArray`]
///
/// Note that callers should make sure that methods of all the child field builders are
/// properly called to maintain the consistency of the data structure.
///
///
/// Handling arrays with complex layouts, such as `List<Struct<List<Struct>>>`, in Rust can be challenging due to its strong typing system.
/// To construct a collection builder ([`ListBuilder`], [`LargeListBuilder`], or [`MapBuilder`]) using [`make_builder`], multiple calls are required. This complexity arises from the recursive approach utilized by [`StructBuilder::from_fields`].
///
/// Initially, [`StructBuilder::from_fields`] invokes [`make_builder`], which returns a `Box<dyn ArrayBuilder>`. To obtain the specific collection builder, one must first use [`StructBuilder::field_builder`] to get a `Collection<[Box<dyn ArrayBuilder>]>`. Subsequently, the `values()` result from this operation can be downcast to the desired builder type.
///
/// For example, when working with [`ListBuilder`], you would first call [`StructBuilder::field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>`] and then downcast the [`Box<dyn ArrayBuilder>`] to the specific [`StructBuilder`] you need.
///
/// For a practical example see the code below:
///
/// ```rust
/// use arrow_array::builder::{ArrayBuilder, ListBuilder, StringBuilder, StructBuilder};
/// use arrow_schema::{DataType, Field, Fields};
/// use std::sync::Arc;
///
/// // This is an example column that has a List<Struct<List<Struct>>> layout
/// let mut example_col = ListBuilder::new(StructBuilder::from_fields(
/// vec![Field::new(
/// "value_list",
/// DataType::List(Arc::new(Field::new(
/// "item",
/// DataType::Struct(Fields::from(vec![
/// Field::new("key", DataType::Utf8, true),
/// Field::new("value", DataType::Utf8, true),
/// ])), //In this example we are trying to get to this builder and insert key/value pairs
/// true,
/// ))),
/// true,
/// )],
/// 0,
/// ));
///
/// // We can obtain the StructBuilder without issues, because example_col was created with StructBuilder
/// let col_struct_builder: &mut StructBuilder = example_col.values();
///
/// // We can't obtain the ListBuilder<StructBuilder> with the expected generic types, because under the hood
/// // the StructBuilder was returned as a Box<dyn ArrayBuilder> and passed as such to the ListBuilder constructor
///
/// // This panics in runtime, even though we know that the builder is a ListBuilder<StructBuilder>.
/// // let sb = col_struct_builder
/// // .field_builder::<ListBuilder<StructBuilder>>(0)
/// // .as_mut()
/// // .unwrap();
///
/// //To keep in line with Rust's strong typing, we fetch a ListBuilder<Box<dyn ArrayBuilder>> from the column StructBuilder first...
/// let mut list_builder_option =
/// col_struct_builder.field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(0);
///
/// let list_builder = list_builder_option.as_mut().unwrap();
///
/// // ... and then downcast the key/value pair values to a StructBuilder
/// let struct_builder = list_builder
/// .values()
/// .as_any_mut()
/// .downcast_mut::<StructBuilder>()
/// .unwrap();
///
/// // We can now append values to the StructBuilder
/// let key_builder = struct_builder.field_builder::<StringBuilder>(0).unwrap();
/// key_builder.append_value("my key");
///
/// let value_builder = struct_builder.field_builder::<StringBuilder>(1).unwrap();
/// value_builder.append_value("my value");
///
/// struct_builder.append(true);
/// list_builder.append(true);
/// col_struct_builder.append(true);
/// example_col.append(true);
///
/// let array = example_col.finish();
///
/// println!("My array: {:?}", array);
/// ```
///
pub struct StructBuilder {
    // Schema of the struct being built
    fields: Fields,
    // One child builder per field, in the same order as `fields`
    field_builders: Vec<Box<dyn ArrayBuilder>>,
    // Struct-level validity (null slots of the struct itself)
    null_buffer_builder: NullBufferBuilder,
}
impl std::fmt::Debug for StructBuilder {
    // Manual impl: `field_builders` holds `dyn ArrayBuilder` trait objects,
    // which carry no `Debug` bound, so they are omitted from the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("StructBuilder")
            .field("fields", &self.fields)
            // NOTE(review): field name "bitmap_builder" predates the rename to
            // `null_buffer_builder` — kept for output compatibility.
            .field("bitmap_builder", &self.null_buffer_builder)
            .field("len", &self.len())
            .finish()
    }
}
impl ArrayBuilder for StructBuilder {
    /// Returns the number of array slots in the builder.
    ///
    /// This is the length of the struct-level null buffer; it is the caller's
    /// responsibility to keep every child field builder at this same length.
    fn len(&self) -> usize {
        self.null_buffer_builder.len()
    }
    /// Builds the array.
    fn finish(&mut self) -> ArrayRef {
        // Inherent methods take precedence over trait methods, so this calls
        // the inherent `finish` returning a `StructArray`.
        Arc::new(self.finish())
    }
    /// Builds the array without resetting the builder.
    fn finish_cloned(&self) -> ArrayRef {
        // Likewise delegates to the inherent `finish_cloned`.
        Arc::new(self.finish_cloned())
    }
    /// Returns the builder as a non-mutable `Any` reference.
    ///
    /// This is most useful when one wants to call non-mutable APIs on a specific builder
    /// type. In this case, one can first cast this into a `Any`, and then use
    /// `downcast_ref` to get a reference on the specific builder.
    fn as_any(&self) -> &dyn Any {
        self
    }
    /// Returns the builder as a mutable `Any` reference.
    ///
    /// This is most useful when one wants to call mutable APIs on a specific builder
    /// type. In this case, one can first cast this into a `Any`, and then use
    /// `downcast_mut` to get a reference on the specific builder.
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    /// Returns the boxed builder as a box of `Any`.
    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
        self
    }
}
/// Returns a builder with capacity `capacity` that corresponds to the datatype `DataType`
/// This function is useful to construct arrays from an arbitrary vectors with known/expected
/// schema.
///
/// See comments on StructBuilder on how to retrieve collection builders built by make_builder.
///
/// # Panics
///
/// Panics if `datatype` is not supported by this function, or if a `Map` field
/// does not contain a child `Struct` field.
pub fn make_builder(datatype: &DataType, capacity: usize) -> Box<dyn ArrayBuilder> {
    use crate::builder::*;
    match datatype {
        DataType::Null => Box::new(NullBuilder::new()),
        DataType::Boolean => Box::new(BooleanBuilder::with_capacity(capacity)),
        DataType::Int8 => Box::new(Int8Builder::with_capacity(capacity)),
        DataType::Int16 => Box::new(Int16Builder::with_capacity(capacity)),
        DataType::Int32 => Box::new(Int32Builder::with_capacity(capacity)),
        DataType::Int64 => Box::new(Int64Builder::with_capacity(capacity)),
        DataType::UInt8 => Box::new(UInt8Builder::with_capacity(capacity)),
        DataType::UInt16 => Box::new(UInt16Builder::with_capacity(capacity)),
        DataType::UInt32 => Box::new(UInt32Builder::with_capacity(capacity)),
        DataType::UInt64 => Box::new(UInt64Builder::with_capacity(capacity)),
        DataType::Float16 => Box::new(Float16Builder::with_capacity(capacity)),
        DataType::Float32 => Box::new(Float32Builder::with_capacity(capacity)),
        DataType::Float64 => Box::new(Float64Builder::with_capacity(capacity)),
        // Variable-length binary/string builders get a default 1024-byte value buffer.
        DataType::Binary => Box::new(BinaryBuilder::with_capacity(capacity, 1024)),
        DataType::LargeBinary => Box::new(LargeBinaryBuilder::with_capacity(capacity, 1024)),
        DataType::FixedSizeBinary(len) => {
            Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len))
        }
        // Decimal precision/scale must be forwarded via `with_data_type`.
        DataType::Decimal128(p, s) => Box::new(
            Decimal128Builder::with_capacity(capacity).with_data_type(DataType::Decimal128(*p, *s)),
        ),
        DataType::Decimal256(p, s) => Box::new(
            Decimal256Builder::with_capacity(capacity).with_data_type(DataType::Decimal256(*p, *s)),
        ),
        DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, 1024)),
        DataType::LargeUtf8 => Box::new(LargeStringBuilder::with_capacity(capacity, 1024)),
        DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)),
        DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)),
        DataType::Time32(TimeUnit::Second) => {
            Box::new(Time32SecondBuilder::with_capacity(capacity))
        }
        DataType::Time32(TimeUnit::Millisecond) => {
            Box::new(Time32MillisecondBuilder::with_capacity(capacity))
        }
        DataType::Time64(TimeUnit::Microsecond) => {
            Box::new(Time64MicrosecondBuilder::with_capacity(capacity))
        }
        DataType::Time64(TimeUnit::Nanosecond) => {
            Box::new(Time64NanosecondBuilder::with_capacity(capacity))
        }
        // Timestamps carry an optional timezone, forwarded via `with_data_type`.
        DataType::Timestamp(TimeUnit::Second, tz) => Box::new(
            TimestampSecondBuilder::with_capacity(capacity)
                .with_data_type(DataType::Timestamp(TimeUnit::Second, tz.clone())),
        ),
        DataType::Timestamp(TimeUnit::Millisecond, tz) => Box::new(
            TimestampMillisecondBuilder::with_capacity(capacity)
                .with_data_type(DataType::Timestamp(TimeUnit::Millisecond, tz.clone())),
        ),
        DataType::Timestamp(TimeUnit::Microsecond, tz) => Box::new(
            TimestampMicrosecondBuilder::with_capacity(capacity)
                .with_data_type(DataType::Timestamp(TimeUnit::Microsecond, tz.clone())),
        ),
        DataType::Timestamp(TimeUnit::Nanosecond, tz) => Box::new(
            TimestampNanosecondBuilder::with_capacity(capacity)
                .with_data_type(DataType::Timestamp(TimeUnit::Nanosecond, tz.clone())),
        ),
        DataType::Interval(IntervalUnit::YearMonth) => {
            Box::new(IntervalYearMonthBuilder::with_capacity(capacity))
        }
        DataType::Interval(IntervalUnit::DayTime) => {
            Box::new(IntervalDayTimeBuilder::with_capacity(capacity))
        }
        DataType::Interval(IntervalUnit::MonthDayNano) => {
            Box::new(IntervalMonthDayNanoBuilder::with_capacity(capacity))
        }
        DataType::Duration(TimeUnit::Second) => {
            Box::new(DurationSecondBuilder::with_capacity(capacity))
        }
        DataType::Duration(TimeUnit::Millisecond) => {
            Box::new(DurationMillisecondBuilder::with_capacity(capacity))
        }
        DataType::Duration(TimeUnit::Microsecond) => {
            Box::new(DurationMicrosecondBuilder::with_capacity(capacity))
        }
        DataType::Duration(TimeUnit::Nanosecond) => {
            Box::new(DurationNanosecondBuilder::with_capacity(capacity))
        }
        // Nested lists recurse into the element type.
        DataType::List(field) => {
            let builder = make_builder(field.data_type(), capacity);
            Box::new(ListBuilder::with_capacity(builder, capacity).with_field(field.clone()))
        }
        DataType::LargeList(field) => {
            let builder = make_builder(field.data_type(), capacity);
            Box::new(LargeListBuilder::with_capacity(builder, capacity).with_field(field.clone()))
        }
        // A Map's entries must be a Struct of (key, value); field names are
        // preserved from the schema rather than defaulted.
        DataType::Map(field, _) => match field.data_type() {
            DataType::Struct(fields) => {
                let map_field_names = MapFieldNames {
                    key: fields[0].name().clone(),
                    value: fields[1].name().clone(),
                    entry: field.name().clone(),
                };
                let key_builder = make_builder(fields[0].data_type(), capacity);
                let value_builder = make_builder(fields[1].data_type(), capacity);
                Box::new(
                    MapBuilder::with_capacity(
                        Some(map_field_names),
                        key_builder,
                        value_builder,
                        capacity,
                    )
                    .with_values_field(fields[1].clone()),
                )
            }
            t => panic!("The field of Map data type {t:?} should have a child Struct field"),
        },
        DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)),
        t => panic!("Data type {t:?} is not currently supported"),
    }
}
impl StructBuilder {
    /// Creates a new `StructBuilder` from pre-built child field builders.
    ///
    /// `fields` and `field_builders` must be of equal length, and each builder
    /// must produce data matching its field's type; this is validated when the
    /// array is finished (see [`Self::finish`]), not here.
    pub fn new(fields: impl Into<Fields>, field_builders: Vec<Box<dyn ArrayBuilder>>) -> Self {
        Self {
            field_builders,
            fields: fields.into(),
            null_buffer_builder: NullBufferBuilder::new(0),
        }
    }
    /// Creates a new `StructBuilder` from [`Fields`] and `capacity`,
    /// constructing a default builder for every child field via `make_builder`.
    pub fn from_fields(fields: impl Into<Fields>, capacity: usize) -> Self {
        let fields = fields.into();
        let mut builders = Vec::with_capacity(fields.len());
        for field in &fields {
            builders.push(make_builder(field.data_type(), capacity));
        }
        Self::new(fields, builders)
    }
    /// Returns a mutable reference to the child field builder at index `i`.
    ///
    /// Result will be `None` if the input type `T` provided doesn't match the actual
    /// field builder's type.
    pub fn field_builder<T: ArrayBuilder>(&mut self, i: usize) -> Option<&mut T> {
        self.field_builders[i].as_any_mut().downcast_mut::<T>()
    }
    /// Returns the number of fields for the struct this builder is building.
    pub fn num_fields(&self) -> usize {
        self.field_builders.len()
    }
    /// Appends an element (either null or non-null) to the struct. The actual elements
    /// should be appended for each child sub-array in a consistent way.
    #[inline]
    pub fn append(&mut self, is_valid: bool) {
        self.null_buffer_builder.append(is_valid);
    }
    /// Appends a null element to the struct.
    #[inline]
    pub fn append_null(&mut self) {
        self.append(false)
    }
    /// Builds the `StructArray` and reset this builder.
    ///
    /// # Panics
    ///
    /// Panics if the number of fields and builders differ, or if any child
    /// builder's length differs from the number of appended struct slots.
    pub fn finish(&mut self) -> StructArray {
        self.validate_content();
        // A struct with no fields still carries a length and a validity mask.
        if self.fields.is_empty() {
            return StructArray::new_empty_fields(self.len(), self.null_buffer_builder.finish());
        }
        let arrays = self.field_builders.iter_mut().map(|f| f.finish()).collect();
        let nulls = self.null_buffer_builder.finish();
        StructArray::new(self.fields.clone(), arrays, nulls)
    }
    /// Builds the `StructArray` without resetting the builder.
    pub fn finish_cloned(&self) -> StructArray {
        self.validate_content();
        if self.fields.is_empty() {
            return StructArray::new_empty_fields(
                self.len(),
                self.null_buffer_builder.finish_cloned(),
            );
        }
        let arrays = self
            .field_builders
            .iter()
            .map(|f| f.finish_cloned())
            .collect();
        let nulls = self.null_buffer_builder.finish_cloned();
        StructArray::new(self.fields.clone(), arrays, nulls)
    }
    /// Constructs and validates contents in the builder to ensure that
    /// - fields and field_builders are of equal length
    /// - the number of items in individual field_builders are equal to self.len()
    fn validate_content(&self) {
        if self.fields.len() != self.field_builders.len() {
            panic!("Number of fields is not equal to the number of field_builders.");
        }
        self.field_builders.iter().enumerate().for_each(|(idx, x)| {
            if x.len() != self.len() {
                let builder = SchemaBuilder::from(&self.fields);
                let schema = builder.finish();
                // `panic!` formats its arguments directly; the previous
                // `panic!("{}", format!(..))` built an intermediate String for
                // nothing (clippy::useless_format). Message text is unchanged.
                panic!(
                    "StructBuilder ({:?}) and field_builder with index {} ({:?}) are of unequal lengths: ({} != {}).",
                    schema,
                    idx,
                    self.fields[idx].data_type(),
                    self.len(),
                    x.len()
                );
            }
        });
    }
    /// Returns the current null buffer as a slice
    pub fn validity_slice(&self) -> Option<&[u8]> {
        self.null_buffer_builder.as_slice()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_buffer::Buffer;
    use arrow_data::ArrayData;
    use arrow_schema::Field;
    use crate::array::Array;
    // Builds a two-field (Utf8, Int32) struct with interleaved nulls and
    // checks the resulting top-level validity bitmap and both child buffers.
    #[test]
    fn test_struct_array_builder() {
        let string_builder = StringBuilder::new();
        let int_builder = Int32Builder::new();
        let fields = vec![
            Field::new("f1", DataType::Utf8, true),
            Field::new("f2", DataType::Int32, true),
        ];
        let field_builders = vec![
            Box::new(string_builder) as Box<dyn ArrayBuilder>,
            Box::new(int_builder) as Box<dyn ArrayBuilder>,
        ];
        let mut builder = StructBuilder::new(fields, field_builders);
        assert_eq!(2, builder.num_fields());
        let string_builder = builder
            .field_builder::<StringBuilder>(0)
            .expect("builder at field 0 should be string builder");
        string_builder.append_value("joe");
        string_builder.append_null();
        string_builder.append_null();
        string_builder.append_value("mark");
        let int_builder = builder
            .field_builder::<Int32Builder>(1)
            .expect("builder at field 1 should be int builder");
        int_builder.append_value(1);
        int_builder.append_value(2);
        int_builder.append_null();
        int_builder.append_value(4);
        builder.append(true);
        builder.append(true);
        builder.append_null();
        builder.append(true);
        let struct_data = builder.finish().into_data();
        assert_eq!(4, struct_data.len());
        assert_eq!(1, struct_data.null_count());
        // 11 == 0b1011: slots 0, 1 and 3 valid, slot 2 null.
        assert_eq!(&[11_u8], struct_data.nulls().unwrap().validity());
        let expected_string_data = ArrayData::builder(DataType::Utf8)
            .len(4)
            .null_bit_buffer(Some(Buffer::from(&[9_u8])))
            .add_buffer(Buffer::from_slice_ref([0, 3, 3, 3, 7]))
            .add_buffer(Buffer::from_slice_ref(b"joemark"))
            .build()
            .unwrap();
        let expected_int_data = ArrayData::builder(DataType::Int32)
            .len(4)
            .null_bit_buffer(Some(Buffer::from_slice_ref([11_u8])))
            .add_buffer(Buffer::from_slice_ref([1, 2, 0, 4]))
            .build()
            .unwrap();
        assert_eq!(expected_string_data, struct_data.child_data()[0]);
        assert_eq!(expected_int_data, struct_data.child_data()[1]);
    }
    // `finish` must reset the builder (length back to 0) so it can be reused
    // to produce a second, independent batch.
    #[test]
    fn test_struct_array_builder_finish() {
        let int_builder = Int32Builder::new();
        let bool_builder = BooleanBuilder::new();
        let fields = vec![
            Field::new("f1", DataType::Int32, false),
            Field::new("f2", DataType::Boolean, false),
        ];
        let field_builders = vec![
            Box::new(int_builder) as Box<dyn ArrayBuilder>,
            Box::new(bool_builder) as Box<dyn ArrayBuilder>,
        ];
        let mut builder = StructBuilder::new(fields, field_builders);
        builder
            .field_builder::<Int32Builder>(0)
            .unwrap()
            .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
        builder
            .field_builder::<BooleanBuilder>(1)
            .unwrap()
            .append_slice(&[
                false, true, false, true, false, true, false, true, false, true,
            ]);
        // Append slot values - all are valid.
        for _ in 0..10 {
            builder.append(true);
        }
        assert_eq!(10, builder.len());
        let arr = builder.finish();
        assert_eq!(10, arr.len());
        assert_eq!(0, builder.len());
        builder
            .field_builder::<Int32Builder>(0)
            .unwrap()
            .append_slice(&[1, 3, 5, 7, 9]);
        builder
            .field_builder::<BooleanBuilder>(1)
            .unwrap()
            .append_slice(&[false, true, false, true, false]);
        // Append slot values - all are valid.
        for _ in 0..5 {
            builder.append(true);
        }
        assert_eq!(5, builder.len());
        let arr = builder.finish();
        assert_eq!(5, arr.len());
        assert_eq!(0, builder.len());
    }
    // `finish_cloned` must NOT reset the builder: appending afterwards keeps
    // accumulating on top of the already-built contents.
    #[test]
    fn test_struct_array_builder_finish_cloned() {
        let int_builder = Int32Builder::new();
        let bool_builder = BooleanBuilder::new();
        let fields = vec![
            Field::new("f1", DataType::Int32, false),
            Field::new("f2", DataType::Boolean, false),
        ];
        let field_builders = vec![
            Box::new(int_builder) as Box<dyn ArrayBuilder>,
            Box::new(bool_builder) as Box<dyn ArrayBuilder>,
        ];
        let mut builder = StructBuilder::new(fields, field_builders);
        builder
            .field_builder::<Int32Builder>(0)
            .unwrap()
            .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
        builder
            .field_builder::<BooleanBuilder>(1)
            .unwrap()
            .append_slice(&[
                false, true, false, true, false, true, false, true, false, true,
            ]);
        // Append slot values - all are valid.
        for _ in 0..10 {
            builder.append(true);
        }
        assert_eq!(10, builder.len());
        let mut arr = builder.finish_cloned();
        assert_eq!(10, arr.len());
        assert_eq!(10, builder.len());
        builder
            .field_builder::<Int32Builder>(0)
            .unwrap()
            .append_slice(&[1, 3, 5, 7, 9]);
        builder
            .field_builder::<BooleanBuilder>(1)
            .unwrap()
            .append_slice(&[false, true, false, true, false]);
        // Append slot values - all are valid.
        for _ in 0..5 {
            builder.append(true);
        }
        assert_eq!(15, builder.len());
        arr = builder.finish();
        assert_eq!(15, arr.len());
        assert_eq!(0, builder.len());
    }
    // `from_fields` must recurse into nested struct fields, producing a
    // `StructBuilder` child for the nested struct.
    #[test]
    fn test_struct_array_builder_from_schema() {
        let mut fields = vec![
            Field::new("f1", DataType::Float32, false),
            Field::new("f2", DataType::Utf8, false),
        ];
        let sub_fields = vec![
            Field::new("g1", DataType::Int32, false),
            Field::new("g2", DataType::Boolean, false),
        ];
        let struct_type = DataType::Struct(sub_fields.into());
        fields.push(Field::new("f3", struct_type, false));
        let mut builder = StructBuilder::from_fields(fields, 5);
        assert_eq!(3, builder.num_fields());
        assert!(builder.field_builder::<Float32Builder>(0).is_some());
        assert!(builder.field_builder::<StringBuilder>(1).is_some());
        assert!(builder.field_builder::<StructBuilder>(2).is_some());
    }
    // Parameterized data types (decimal precision/scale, timestamp timezone)
    // must be preserved exactly on the finished columns.
    #[test]
    fn test_datatype_properties() {
        let fields = Fields::from(vec![
            Field::new("f1", DataType::Decimal128(1, 2), false),
            Field::new(
                "f2",
                DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
                false,
            ),
        ]);
        let mut builder = StructBuilder::from_fields(fields.clone(), 1);
        builder
            .field_builder::<Decimal128Builder>(0)
            .unwrap()
            .append_value(1);
        builder
            .field_builder::<TimestampMillisecondBuilder>(1)
            .unwrap()
            .append_value(1);
        builder.append(true);
        let array = builder.finish();
        assert_eq!(array.data_type(), &DataType::Struct(fields.clone()));
        assert_eq!(array.column(0).data_type(), fields[0].data_type());
        assert_eq!(array.column(1).data_type(), fields[1].data_type());
    }
    // Types without a default builder (e.g. Dictionary) make `from_fields`
    // panic via `make_builder`'s catch-all arm.
    #[test]
    #[should_panic(expected = "Data type Dictionary(Int32, Utf8) is not currently supported")]
    fn test_struct_array_builder_from_schema_unsupported_type() {
        let fields = vec![
            Field::new("f1", DataType::Int16, false),
            Field::new(
                "f2",
                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
                false,
            ),
        ];
        let _ = StructBuilder::from_fields(fields, 5);
    }
    // Downcasting to the wrong builder type yields None rather than panicking.
    #[test]
    fn test_struct_array_builder_field_builder_type_mismatch() {
        let int_builder = Int32Builder::with_capacity(10);
        let fields = vec![Field::new("f1", DataType::Int32, false)];
        let field_builders = vec![Box::new(int_builder) as Box<dyn ArrayBuilder>];
        let mut builder = StructBuilder::new(fields, field_builders);
        assert!(builder.field_builder::<BinaryBuilder>(0).is_none());
    }
    // `validate_content` must report which child builder's length disagrees
    // with the struct's slot count; the message is asserted verbatim.
    #[test]
    #[should_panic(
        expected = "StructBuilder (Schema { fields: [Field { name: \"f1\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"f2\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }) and field_builder with index 1 (Boolean) are of unequal lengths: (2 != 1)."
    )]
    fn test_struct_array_builder_unequal_field_builders_lengths() {
        let mut int_builder = Int32Builder::with_capacity(10);
        let mut bool_builder = BooleanBuilder::new();
        int_builder.append_value(1);
        int_builder.append_value(2);
        bool_builder.append_value(true);
        let fields = vec![
            Field::new("f1", DataType::Int32, false),
            Field::new("f2", DataType::Boolean, false),
        ];
        let field_builders = vec![
            Box::new(int_builder) as Box<dyn ArrayBuilder>,
            Box::new(bool_builder) as Box<dyn ArrayBuilder>,
        ];
        let mut builder = StructBuilder::new(fields, field_builders);
        builder.append(true);
        builder.append(true);
        builder.finish();
    }
    // Mismatched counts of fields vs builders must be caught at finish time.
    #[test]
    #[should_panic(expected = "Number of fields is not equal to the number of field_builders.")]
    fn test_struct_array_builder_unequal_field_field_builders() {
        let int_builder = Int32Builder::with_capacity(10);
        let fields = vec![
            Field::new("f1", DataType::Int32, false),
            Field::new("f2", DataType::Boolean, false),
        ];
        let field_builders = vec![Box::new(int_builder) as Box<dyn ArrayBuilder>];
        let mut builder = StructBuilder::new(fields, field_builders);
        builder.finish();
    }
    // A builder whose output type lacks the field's timezone must be rejected
    // by `StructArray::new` when finishing.
    #[test]
    #[should_panic(
        expected = "Incorrect datatype for StructArray field \\\"timestamp\\\", expected Timestamp(Nanosecond, Some(\\\"UTC\\\")) got Timestamp(Nanosecond, None)"
    )]
    fn test_struct_array_mismatch_builder() {
        let fields = vec![Field::new(
            "timestamp",
            DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_owned().into())),
            false,
        )];
        let field_builders: Vec<Box<dyn ArrayBuilder>> =
            vec![Box::new(TimestampNanosecondBuilder::new())];
        let mut sa = StructBuilder::new(fields, field_builders);
        sa.finish();
    }
    // Zero-field structs still track length and validity.
    #[test]
    fn test_empty() {
        let mut builder = StructBuilder::new(Fields::empty(), vec![]);
        builder.append(true);
        builder.append(false);
        let a1 = builder.finish_cloned();
        let a2 = builder.finish();
        assert_eq!(a1, a2);
        assert_eq!(a1.len(), 2);
        assert_eq!(a1.null_count(), 1);
        assert!(a1.is_valid(0));
        assert!(a1.is_null(1));
    }
}

Просмотреть файл

@ -0,0 +1,313 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::buffer_builder::{Int32BufferBuilder, Int8BufferBuilder};
use crate::builder::BufferBuilder;
use crate::{make_array, ArrowPrimitiveType, UnionArray};
use arrow_buffer::NullBufferBuilder;
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_data::ArrayDataBuilder;
use arrow_schema::{ArrowError, DataType, Field};
use std::any::Any;
use std::collections::BTreeMap;
use std::sync::Arc;
/// `FieldData` is a helper struct to track the state of the fields in the `UnionBuilder`.
#[derive(Debug)]
struct FieldData {
    /// The type id for this field
    /// (assigned as the number of fields present when it was first seen)
    type_id: i8,
    /// The Arrow data type represented in the `values_buffer`, which is untyped
    data_type: DataType,
    /// A buffer containing the values for this field in raw bytes
    values_buffer: Box<dyn FieldDataValues>,
    /// The number of array slots represented by the buffer
    /// (nulls included; kept in lock-step with `null_buffer_builder`)
    slots: usize,
    /// A builder for the null bitmap
    null_buffer_builder: NullBufferBuilder,
}
/// A type-erased [`BufferBuilder`] used by [`FieldData`]
trait FieldDataValues: std::fmt::Debug {
    /// Allows downcasting back to the concrete `BufferBuilder<T>`.
    fn as_mut_any(&mut self) -> &mut dyn Any;
    /// Reserves one slot for a null entry (validity is tracked separately
    /// in [`FieldData::null_buffer_builder`]).
    fn append_null(&mut self);
    /// Consumes the accumulated bytes into a [`Buffer`].
    fn finish(&mut self) -> Buffer;
}
impl<T: ArrowNativeType> FieldDataValues for BufferBuilder<T> {
    fn as_mut_any(&mut self) -> &mut dyn Any {
        self
    }
    fn append_null(&mut self) {
        // A null occupies one (arbitrary-valued) slot in the values buffer;
        // the validity bit lives in the owning FieldData's null bitmap.
        self.advance(1)
    }
    fn finish(&mut self) -> Buffer {
        // Delegates to the inherent `BufferBuilder::finish` — inherent methods
        // take precedence over trait methods, so this is not recursive.
        self.finish()
    }
}
impl FieldData {
    /// Creates a new `FieldData` with the given `type_id` and a typed values
    /// buffer pre-allocated for `capacity` slots.
    fn new<T: ArrowPrimitiveType>(type_id: i8, data_type: DataType, capacity: usize) -> Self {
        Self {
            type_id,
            data_type,
            slots: 0,
            values_buffer: Box::new(BufferBuilder::<T::Native>::new(capacity)),
            null_buffer_builder: NullBufferBuilder::new(capacity),
        }
    }
    /// Appends a single value to this `FieldData`'s `values_buffer`.
    ///
    /// Panics if `T::Native` differs from the type the buffer was created
    /// with — callers guard against this via the `data_type` check in
    /// `UnionBuilder::append_option`.
    fn append_value<T: ArrowPrimitiveType>(&mut self, v: T::Native) {
        self.values_buffer
            .as_mut_any()
            .downcast_mut::<BufferBuilder<T::Native>>()
            .expect("Tried to append unexpected type")
            .append(v);
        self.null_buffer_builder.append(true);
        self.slots += 1;
    }
    /// Appends a null to this `FieldData`.
    fn append_null(&mut self) {
        self.values_buffer.append_null();
        self.null_buffer_builder.append(false);
        self.slots += 1;
    }
}
/// Builder for [`UnionArray`]
///
/// Example: **Dense Memory Layout**
///
/// ```
/// # use arrow_array::builder::UnionBuilder;
/// # use arrow_array::types::{Float64Type, Int32Type};
///
/// let mut builder = UnionBuilder::new_dense();
/// builder.append::<Int32Type>("a", 1).unwrap();
/// builder.append::<Float64Type>("b", 3.0).unwrap();
/// builder.append::<Int32Type>("a", 4).unwrap();
/// let union = builder.build().unwrap();
///
/// assert_eq!(union.type_id(0), 0);
/// assert_eq!(union.type_id(1), 1);
/// assert_eq!(union.type_id(2), 0);
///
/// assert_eq!(union.value_offset(0), 0);
/// assert_eq!(union.value_offset(1), 0);
/// assert_eq!(union.value_offset(2), 1);
/// ```
///
/// Example: **Sparse Memory Layout**
/// ```
/// # use arrow_array::builder::UnionBuilder;
/// # use arrow_array::types::{Float64Type, Int32Type};
///
/// let mut builder = UnionBuilder::new_sparse();
/// builder.append::<Int32Type>("a", 1).unwrap();
/// builder.append::<Float64Type>("b", 3.0).unwrap();
/// builder.append::<Int32Type>("a", 4).unwrap();
/// let union = builder.build().unwrap();
///
/// assert_eq!(union.type_id(0), 0);
/// assert_eq!(union.type_id(1), 1);
/// assert_eq!(union.type_id(2), 0);
///
/// assert_eq!(union.value_offset(0), 0);
/// assert_eq!(union.value_offset(1), 1);
/// assert_eq!(union.value_offset(2), 2);
/// ```
#[derive(Debug)]
pub struct UnionBuilder {
    /// The current number of slots in the array
    len: usize,
    /// Maps field names to `FieldData` instances which track the builders for that field
    fields: BTreeMap<String, FieldData>,
    /// Builder to keep track of type ids
    type_id_builder: Int8BufferBuilder,
    /// Builder to keep track of offsets (`None` for sparse unions)
    value_offset_builder: Option<Int32BufferBuilder>,
    /// Capacity used when allocating each field's buffers on first use
    initial_capacity: usize,
}
impl UnionBuilder {
    /// Creates a new dense array builder.
    pub fn new_dense() -> Self {
        Self::with_capacity_dense(1024)
    }
    /// Creates a new sparse array builder.
    pub fn new_sparse() -> Self {
        Self::with_capacity_sparse(1024)
    }
    /// Creates a new dense array builder with capacity.
    pub fn with_capacity_dense(capacity: usize) -> Self {
        Self {
            len: 0,
            fields: Default::default(),
            type_id_builder: Int8BufferBuilder::new(capacity),
            value_offset_builder: Some(Int32BufferBuilder::new(capacity)),
            initial_capacity: capacity,
        }
    }
    /// Creates a new sparse array builder with capacity.
    pub fn with_capacity_sparse(capacity: usize) -> Self {
        Self {
            len: 0,
            fields: Default::default(),
            type_id_builder: Int8BufferBuilder::new(capacity),
            // A `None` offset builder is what marks this builder as sparse
            // throughout the rest of the implementation.
            value_offset_builder: None,
            initial_capacity: capacity,
        }
    }
    /// Appends a null to this builder, encoding the null in the array
    /// of the `type_name` child / field.
    ///
    /// Since `UnionArray` encodes nulls as an entry in its children
    /// (it doesn't have a validity bitmap itself), and where the null
    /// is part of the final array, appending a NULL requires
    /// specifying which field (child) to use.
    #[inline]
    pub fn append_null<T: ArrowPrimitiveType>(
        &mut self,
        type_name: &str,
    ) -> Result<(), ArrowError> {
        self.append_option::<T>(type_name, None)
    }
    /// Appends a value to this builder.
    #[inline]
    pub fn append<T: ArrowPrimitiveType>(
        &mut self,
        type_name: &str,
        v: T::Native,
    ) -> Result<(), ArrowError> {
        self.append_option::<T>(type_name, Some(v))
    }
    // Shared implementation for `append` / `append_null`. Appends one slot,
    // updating the type-id buffer, the (dense) offset buffer or (sparse)
    // sibling padding, and the target field's own buffers.
    fn append_option<T: ArrowPrimitiveType>(
        &mut self,
        type_name: &str,
        v: Option<T::Native>,
    ) -> Result<(), ArrowError> {
        let type_name = type_name.to_string();
        // Temporarily remove this field's state so it can be mutated while
        // `self.fields` is iterated below; it is re-inserted before returning.
        let mut field_data = match self.fields.remove(&type_name) {
            Some(data) => {
                // A given field name must always be appended with the same
                // primitive type.
                if data.data_type != T::DATA_TYPE {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "Attempt to write col \"{}\" with type {} doesn't match existing type {}",
                        type_name,
                        T::DATA_TYPE,
                        data.data_type
                    )));
                }
                data
            }
            // First appearance of this field: allocate its FieldData, using
            // the current field count as its type id.
            None => match self.value_offset_builder {
                Some(_) => FieldData::new::<T>(
                    self.fields.len() as i8,
                    T::DATA_TYPE,
                    self.initial_capacity,
                ),
                // In the case of a sparse union, we should pass the maximum of the currently length and the capacity.
                None => {
                    let mut fd = FieldData::new::<T>(
                        self.fields.len() as i8,
                        T::DATA_TYPE,
                        self.len.max(self.initial_capacity),
                    );
                    // Sparse children must all be `len` slots long: backfill
                    // nulls for every slot appended before this field existed.
                    for _ in 0..self.len {
                        fd.append_null();
                    }
                    fd
                }
            },
        };
        self.type_id_builder.append(field_data.type_id);
        match &mut self.value_offset_builder {
            // Dense Union: record where the new value lands within its child.
            Some(offset_builder) => {
                offset_builder.append(field_data.slots as i32);
            }
            // Sparse Union: pad every other child with a null so all children
            // keep identical lengths.
            None => {
                for (_, fd) in self.fields.iter_mut() {
                    // Append to all bar the FieldData currently being appended to
                    fd.append_null();
                }
            }
        }
        match v {
            Some(v) => field_data.append_value::<T>(v),
            None => field_data.append_null(),
        }
        self.fields.insert(type_name, field_data);
        self.len += 1;
        Ok(())
    }
    /// Builds this builder creating a new `UnionArray`.
    pub fn build(self) -> Result<UnionArray, ArrowError> {
        let mut children = Vec::with_capacity(self.fields.len());
        let union_fields = self
            .fields
            .into_iter()
            .map(
                |(
                    name,
                    FieldData {
                        type_id,
                        data_type,
                        mut values_buffer,
                        slots,
                        mut null_buffer_builder,
                    },
                )| {
                    // SAFETY(review): the values buffer and null bitmap were
                    // advanced in lock-step with `slots` by FieldData's
                    // append_value/append_null, so the unchecked build is
                    // presumed length-consistent — relies on those invariants.
                    let array_ref = make_array(unsafe {
                        ArrayDataBuilder::new(data_type.clone())
                            .add_buffer(values_buffer.finish())
                            .len(slots)
                            .nulls(null_buffer_builder.finish())
                            .build_unchecked()
                    });
                    children.push(array_ref);
                    (type_id, Arc::new(Field::new(name, data_type, false)))
                },
            )
            .collect();
        // Final cross-child validation happens inside `UnionArray::try_new`.
        UnionArray::try_new(
            union_fields,
            self.type_id_builder.into(),
            self.value_offset_builder.map(Into::into),
            children,
        )
    }
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,285 @@
// MIT License
//
// Copyright (c) 2020-2022 Oliver Margetts
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// Copied from chronoutil crate
//! Contains utility functions for shifting Date objects.
use chrono::{DateTime, Datelike, Days, Months, TimeZone};
use std::cmp::Ordering;
/// Shift a date by the given number of months.
///
/// Positive `months` move the date forward, negative values move it backward,
/// and zero returns the date unchanged; day-of-month clamping (e.g. Jan 31 +
/// 1 month -> Feb 29) is handled by chrono's `Months` arithmetic.
pub(crate) fn shift_months<D>(date: D, months: i32) -> D
where
    D: Datelike + std::ops::Add<Months, Output = D> + std::ops::Sub<Months, Output = D>,
{
    if months > 0 {
        date + Months::new(months as u32)
    } else if months < 0 {
        date - Months::new(months.unsigned_abs())
    } else {
        date
    }
}
/// Add the given number of months to the given datetime.
///
/// Negative `months` shift the datetime backwards.
/// Returns `None` when it will result in overflow.
pub(crate) fn add_months_datetime<Tz: TimeZone>(
    dt: DateTime<Tz>,
    months: i32,
) -> Option<DateTime<Tz>> {
    if months == 0 {
        return Some(dt);
    }
    if months > 0 {
        dt.checked_add_months(Months::new(months as u32))
    } else {
        dt.checked_sub_months(Months::new(months.unsigned_abs()))
    }
}
/// Add the given number of days to the given datetime.
///
/// Negative `days` shift the datetime backwards.
/// Returns `None` when it will result in overflow.
pub(crate) fn add_days_datetime<Tz: TimeZone>(dt: DateTime<Tz>, days: i32) -> Option<DateTime<Tz>> {
    if days == 0 {
        return Some(dt);
    }
    if days > 0 {
        dt.checked_add_days(Days::new(days as u64))
    } else {
        dt.checked_sub_days(Days::new(days.unsigned_abs() as u64))
    }
}
/// Subtract the given number of months from the given datetime.
///
/// Negative `months` shift the datetime forwards.
/// Returns `None` when it will result in overflow.
pub(crate) fn sub_months_datetime<Tz: TimeZone>(
    dt: DateTime<Tz>,
    months: i32,
) -> Option<DateTime<Tz>> {
    match months.cmp(&0) {
        Ordering::Equal => Some(dt),
        Ordering::Greater => dt.checked_sub_months(Months::new(months as u32)),
        Ordering::Less => dt.checked_add_months(Months::new(months.unsigned_abs())),
    }
}
/// Subtract the given number of days from the given datetime.
///
/// Negative `days` shift the datetime forwards.
/// Returns `None` when it will result in overflow.
pub(crate) fn sub_days_datetime<Tz: TimeZone>(dt: DateTime<Tz>, days: i32) -> Option<DateTime<Tz>> {
    match days.cmp(&0) {
        Ordering::Equal => Some(dt),
        Ordering::Greater => dt.checked_sub_days(Days::new(days as u64)),
        Ordering::Less => dt.checked_add_days(Days::new(days.unsigned_abs() as u64)),
    }
}
#[cfg(test)]
mod tests {
    use chrono::naive::{NaiveDate, NaiveDateTime, NaiveTime};
    use super::*;
    // Walks 2020-01-31 forward and backward one month at a time, checking
    // that out-of-range days clamp to the last day of the target month.
    #[test]
    fn test_shift_months() {
        let base = NaiveDate::from_ymd_opt(2020, 1, 31).unwrap();
        assert_eq!(
            shift_months(base, 0),
            NaiveDate::from_ymd_opt(2020, 1, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 1),
            NaiveDate::from_ymd_opt(2020, 2, 29).unwrap()
        );
        assert_eq!(
            shift_months(base, 2),
            NaiveDate::from_ymd_opt(2020, 3, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 3),
            NaiveDate::from_ymd_opt(2020, 4, 30).unwrap()
        );
        assert_eq!(
            shift_months(base, 4),
            NaiveDate::from_ymd_opt(2020, 5, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 5),
            NaiveDate::from_ymd_opt(2020, 6, 30).unwrap()
        );
        assert_eq!(
            shift_months(base, 6),
            NaiveDate::from_ymd_opt(2020, 7, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 7),
            NaiveDate::from_ymd_opt(2020, 8, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 8),
            NaiveDate::from_ymd_opt(2020, 9, 30).unwrap()
        );
        assert_eq!(
            shift_months(base, 9),
            NaiveDate::from_ymd_opt(2020, 10, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 10),
            NaiveDate::from_ymd_opt(2020, 11, 30).unwrap()
        );
        assert_eq!(
            shift_months(base, 11),
            NaiveDate::from_ymd_opt(2020, 12, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 12),
            NaiveDate::from_ymd_opt(2021, 1, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 13),
            NaiveDate::from_ymd_opt(2021, 2, 28).unwrap()
        );
        assert_eq!(
            shift_months(base, -1),
            NaiveDate::from_ymd_opt(2019, 12, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, -2),
            NaiveDate::from_ymd_opt(2019, 11, 30).unwrap()
        );
        assert_eq!(
            shift_months(base, -3),
            NaiveDate::from_ymd_opt(2019, 10, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, -4),
            NaiveDate::from_ymd_opt(2019, 9, 30).unwrap()
        );
        assert_eq!(
            shift_months(base, -5),
            NaiveDate::from_ymd_opt(2019, 8, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, -6),
            NaiveDate::from_ymd_opt(2019, 7, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, -7),
            NaiveDate::from_ymd_opt(2019, 6, 30).unwrap()
        );
        assert_eq!(
            shift_months(base, -8),
            NaiveDate::from_ymd_opt(2019, 5, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, -9),
            NaiveDate::from_ymd_opt(2019, 4, 30).unwrap()
        );
        assert_eq!(
            shift_months(base, -10),
            NaiveDate::from_ymd_opt(2019, 3, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, -11),
            NaiveDate::from_ymd_opt(2019, 2, 28).unwrap()
        );
        assert_eq!(
            shift_months(base, -12),
            NaiveDate::from_ymd_opt(2019, 1, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, -13),
            NaiveDate::from_ymd_opt(2018, 12, 31).unwrap()
        );
        // Large shift spanning many years.
        assert_eq!(
            shift_months(base, 1265),
            NaiveDate::from_ymd_opt(2125, 6, 30).unwrap()
        );
    }
    // Shifts from a year-end date, crossing year boundaries in both directions.
    #[test]
    fn test_shift_months_with_overflow() {
        let base = NaiveDate::from_ymd_opt(2020, 12, 31).unwrap();
        assert_eq!(shift_months(base, 0), base);
        assert_eq!(
            shift_months(base, 1),
            NaiveDate::from_ymd_opt(2021, 1, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 2),
            NaiveDate::from_ymd_opt(2021, 2, 28).unwrap()
        );
        assert_eq!(
            shift_months(base, 12),
            NaiveDate::from_ymd_opt(2021, 12, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 18),
            NaiveDate::from_ymd_opt(2022, 6, 30).unwrap()
        );
        assert_eq!(
            shift_months(base, -1),
            NaiveDate::from_ymd_opt(2020, 11, 30).unwrap()
        );
        assert_eq!(
            shift_months(base, -2),
            NaiveDate::from_ymd_opt(2020, 10, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, -10),
            NaiveDate::from_ymd_opt(2020, 2, 29).unwrap()
        );
        assert_eq!(
            shift_months(base, -12),
            NaiveDate::from_ymd_opt(2019, 12, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, -18),
            NaiveDate::from_ymd_opt(2019, 6, 30).unwrap()
        );
    }
    // Shifting a datetime must shift only the date part and leave the
    // time-of-day untouched.
    #[test]
    fn test_shift_months_datetime() {
        let date = NaiveDate::from_ymd_opt(2020, 1, 31).unwrap();
        let o_clock = NaiveTime::from_hms_opt(1, 2, 3).unwrap();
        let base = NaiveDateTime::new(date, o_clock);
        assert_eq!(
            shift_months(base, 0).date(),
            NaiveDate::from_ymd_opt(2020, 1, 31).unwrap()
        );
        assert_eq!(
            shift_months(base, 1).date(),
            NaiveDate::from_ymd_opt(2020, 2, 29).unwrap()
        );
        assert_eq!(
            shift_months(base, 2).date(),
            NaiveDate::from_ymd_opt(2020, 3, 31).unwrap()
        );
        assert_eq!(shift_months(base, 0).time(), o_clock);
        assert_eq!(shift_months(base, 1).time(), o_clock);
        assert_eq!(shift_months(base, 2).time(), o_clock);
    }
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,547 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Contains declarations to bind to the [C Stream Interface](https://arrow.apache.org/docs/format/CStreamInterface.html).
//!
//! This module has two main interfaces:
//! One interface maps C ABI to native Rust types, i.e. convert c-pointers, c_char, to native rust.
//! This is handled by [FFI_ArrowArrayStream].
//!
//! The second interface is used to import `FFI_ArrowArrayStream` as Rust implementation `RecordBatch` reader.
//! This is handled by `ArrowArrayStreamReader`.
//!
//! ```ignore
//! # use std::fs::File;
//! # use std::sync::Arc;
//! # use arrow::error::Result;
//! # use arrow::ffi_stream::{export_reader_into_raw, ArrowArrayStreamReader, FFI_ArrowArrayStream};
//! # use arrow::ipc::reader::FileReader;
//! # use arrow::record_batch::RecordBatchReader;
//! # fn main() -> Result<()> {
//! // create an record batch reader natively
//! let file = File::open("arrow_file").unwrap();
//! let reader = Box::new(FileReader::try_new(file).unwrap());
//!
//! // export it
//! let mut stream = FFI_ArrowArrayStream::empty();
//! unsafe { export_reader_into_raw(reader, &mut stream) };
//!
//! // consumed and used by something else...
//!
//! // import it
//! let stream_reader = unsafe { ArrowArrayStreamReader::from_raw(&mut stream).unwrap() };
//! let imported_schema = stream_reader.schema();
//!
//! let mut produced_batches = vec![];
//! for batch in stream_reader {
//! produced_batches.push(batch.unwrap());
//! }
//! Ok(())
//! }
//! ```
use arrow_schema::DataType;
use std::ffi::CStr;
use std::ptr::addr_of;
use std::{
ffi::CString,
os::raw::{c_char, c_int, c_void},
sync::Arc,
};
use arrow_data::ffi::FFI_ArrowArray;
use arrow_schema::{ffi::FFI_ArrowSchema, ArrowError, Schema, SchemaRef};
use crate::array::Array;
use crate::array::StructArray;
use crate::ffi::from_ffi_and_data_type;
use crate::record_batch::{RecordBatch, RecordBatchReader};
/// Module-local result alias using [`ArrowError`] for the error case.
type Result<T> = std::result::Result<T, ArrowError>;
// POSIX errno values reported back through the C stream interface callbacks.
const ENOMEM: i32 = 12; // out of memory
const EIO: i32 = 5; // I/O error
const EINVAL: i32 = 22; // invalid argument
const ENOSYS: i32 = 78; // function not implemented
/// ABI-compatible struct for `ArrayStream` from C Stream Interface
/// See <https://arrow.apache.org/docs/format/CStreamInterface.html#structure-definitions>
/// This was created by bindgen
#[repr(C)]
#[derive(Debug)]
#[allow(missing_docs)]
pub struct FFI_ArrowArrayStream {
    /// Per the spec: writes the stream's schema into `out`; returns 0 on
    /// success or an errno-compatible error code.
    pub get_schema: Option<
        unsafe extern "C" fn(arg1: *mut FFI_ArrowArrayStream, out: *mut FFI_ArrowSchema) -> c_int,
    >,
    /// Per the spec: writes the next chunk into `out` (a released `out`
    /// signals end of stream); returns 0 on success or an errno code.
    pub get_next: Option<
        unsafe extern "C" fn(arg1: *mut FFI_ArrowArrayStream, out: *mut FFI_ArrowArray) -> c_int,
    >,
    /// Per the spec: returns a description of the last error, or null.
    pub get_last_error:
        Option<unsafe extern "C" fn(arg1: *mut FFI_ArrowArrayStream) -> *const c_char>,
    /// Per the spec: releases the stream's resources; must tolerate being
    /// called on an already-released stream.
    pub release: Option<unsafe extern "C" fn(arg1: *mut FFI_ArrowArrayStream)>,
    /// Opaque producer-owned state, only touched through the callbacks above.
    pub private_data: *mut c_void,
}
// SAFETY(review): for exported streams `private_data` is a `Box<StreamPrivateData>`
// whose reader is `dyn RecordBatchReader + Send`. For imported streams this relies
// on the producer's callbacks being callable from any thread, as required by the
// C Stream Interface contract — confirm for any non-conforming producers.
unsafe impl Send for FFI_ArrowArrayStream {}
// callback used to drop [FFI_ArrowArrayStream] when it is exported.
//
// Clears every callback, reclaims the boxed `StreamPrivateData`, and finally
// sets `release` to None, which marks the stream as released per the C Stream
// Interface contract. A NULL `stream` pointer is a no-op.
unsafe extern "C" fn release_stream(stream: *mut FFI_ArrowArrayStream) {
    if stream.is_null() {
        return;
    }
    let stream = &mut *stream;
    stream.get_schema = None;
    stream.get_next = None;
    stream.get_last_error = None;
    // Reconstruct the Box created in `FFI_ArrowArrayStream::new` so the
    // private data is dropped exactly once.
    let private_data = Box::from_raw(stream.private_data as *mut StreamPrivateData);
    drop(private_data);
    // Clearing `release` last marks the stream as fully released.
    stream.release = None;
}
/// Producer-side state stored behind `FFI_ArrowArrayStream::private_data`
/// for streams exported via [`FFI_ArrowArrayStream::new`].
struct StreamPrivateData {
    // The Rust reader whose schema and batches are served through the stream.
    batch_reader: Box<dyn RecordBatchReader + Send>,
    // Message of the most recent error, kept alive here so `get_last_error`
    // can hand out a stable `*const c_char` that the consumer may borrow.
    last_error: Option<CString>,
}
// The callback used to get array schema
//
// Installed as `FFI_ArrowArrayStream::get_schema` for exported streams;
// delegates to [`ExportedArrayStream::get_schema`].
unsafe extern "C" fn get_schema(
    stream: *mut FFI_ArrowArrayStream,
    schema: *mut FFI_ArrowSchema,
) -> c_int {
    ExportedArrayStream { stream }.get_schema(schema)
}
// The callback used to get next array
//
// Installed as `FFI_ArrowArrayStream::get_next` for exported streams;
// delegates to [`ExportedArrayStream::get_next`].
unsafe extern "C" fn get_next(
    stream: *mut FFI_ArrowArrayStream,
    array: *mut FFI_ArrowArray,
) -> c_int {
    ExportedArrayStream { stream }.get_next(array)
}
// The callback used to get the error from last operation on the `FFI_ArrowArrayStream`
//
// Returns a pointer into the `CString` stored in `StreamPrivateData`, or NULL
// when no error has been recorded.
unsafe extern "C" fn get_last_error(stream: *mut FFI_ArrowArrayStream) -> *const c_char {
    let mut ffi_stream = ExportedArrayStream { stream };
    // The consumer should not take ownership of this string, we should return
    // a const pointer to it.
    match ffi_stream.get_last_error() {
        Some(err_string) => err_string.as_ptr(),
        None => std::ptr::null(),
    }
}
impl Drop for FFI_ArrowArrayStream {
    /// Invokes the stream's `release` callback, if one is still installed,
    /// as required by the C Stream Interface ownership rules. A stream whose
    /// `release` is `None` has already been released and needs no cleanup.
    fn drop(&mut self) {
        if let Some(release) = self.release {
            unsafe { release(self) };
        }
    }
}
impl FFI_ArrowArrayStream {
    /// Creates a new [`FFI_ArrowArrayStream`].
    ///
    /// Exports `batch_reader` through the C Stream Interface: the returned
    /// struct's callbacks serve the reader's schema and batches, and its
    /// `release` callback frees the boxed reader.
    pub fn new(batch_reader: Box<dyn RecordBatchReader + Send>) -> Self {
        let private_data = Box::new(StreamPrivateData {
            batch_reader,
            last_error: None,
        });
        Self {
            get_schema: Some(get_schema),
            get_next: Some(get_next),
            get_last_error: Some(get_last_error),
            release: Some(release_stream),
            // Ownership of the box moves into the stream; `release_stream`
            // reconstructs and drops it exactly once.
            private_data: Box::into_raw(private_data) as *mut c_void,
        }
    }
    /// Takes ownership of the pointed to [`FFI_ArrowArrayStream`]
    ///
    /// This acts to [move] the data out of `raw_stream`, setting the release callback to NULL
    ///
    /// # Safety
    ///
    /// * `raw_stream` must be [valid] for reads and writes
    /// * `raw_stream` must be properly aligned
    /// * `raw_stream` must point to a properly initialized value of [`FFI_ArrowArrayStream`]
    ///
    /// [move]: https://arrow.apache.org/docs/format/CDataInterface.html#moving-an-array
    /// [valid]: https://doc.rust-lang.org/std/ptr/index.html#safety
    pub unsafe fn from_raw(raw_stream: *mut FFI_ArrowArrayStream) -> Self {
        // `replace` leaves an empty (released) stream behind, preventing a
        // double-release when the caller later frees `raw_stream`.
        std::ptr::replace(raw_stream, Self::empty())
    }
    /// Creates a new empty [FFI_ArrowArrayStream]. Used to import from the C Stream Interface.
    pub fn empty() -> Self {
        Self {
            get_schema: None,
            get_next: None,
            get_last_error: None,
            // A NULL release callback marks the stream as released/empty.
            release: None,
            private_data: std::ptr::null_mut(),
        }
    }
}
/// Internal view over an exported stream pointer, used by the extern "C"
/// callbacks above to reach the `StreamPrivateData` stored behind it.
struct ExportedArrayStream {
    stream: *mut FFI_ArrowArrayStream,
}
impl ExportedArrayStream {
    /// Returns a mutable reference to the private data stored behind the stream pointer.
    fn get_private_data(&mut self) -> &mut StreamPrivateData {
        // Safety: `private_data` is always a `Box<StreamPrivateData>` allocated
        // by `FFI_ArrowArrayStream::new` for an exported stream.
        unsafe { &mut *((*self.stream).private_data as *mut StreamPrivateData) }
    }
    /// Writes the reader's schema to `out`, returning 0 on success or an
    /// errno-style code on failure (with the message recorded for
    /// `get_last_error`).
    pub fn get_schema(&mut self, out: *mut FFI_ArrowSchema) -> i32 {
        let private_data = self.get_private_data();
        let reader = &private_data.batch_reader;
        match FFI_ArrowSchema::try_from(reader.schema().as_ref()) {
            Ok(schema) => {
                // `out` is supplied by a foreign consumer and may not satisfy
                // Rust's alignment rules, so use an unaligned write (matching
                // `get_next` below). `write_unaligned` takes ownership of
                // `schema`, so no `mem::forget` is needed — unlike the previous
                // `ptr::copy` + `forget`, which also assumed an aligned `out`.
                unsafe { std::ptr::write_unaligned(out, schema) };
                0
            }
            Err(ref err) => {
                private_data.last_error = Some(
                    CString::new(err.to_string()).expect("Error string has a null byte in it."),
                );
                get_error_code(err)
            }
        }
    }
    /// Writes the next batch (as a struct array) to `out`. An empty/released
    /// array signals end-of-stream. Returns 0 on success or an errno-style
    /// code on failure.
    pub fn get_next(&mut self, out: *mut FFI_ArrowArray) -> i32 {
        let private_data = self.get_private_data();
        let reader = &mut private_data.batch_reader;
        match reader.next() {
            None => {
                // Marks ArrowArray released to indicate reaching the end of stream.
                unsafe { std::ptr::write_unaligned(out, FFI_ArrowArray::empty()) }
                0
            }
            Some(Ok(batch)) => {
                let struct_array = StructArray::from(batch);
                let array = FFI_ArrowArray::new(&struct_array.to_data());
                unsafe { std::ptr::write_unaligned(out, array) };
                0
            }
            Some(Err(ref err)) => {
                private_data.last_error = Some(
                    CString::new(err.to_string()).expect("Error string has a null byte in it."),
                );
                get_error_code(err)
            }
        }
    }
    /// Returns the error message recorded by the last failing operation, if any.
    pub fn get_last_error(&mut self) -> Option<&CString> {
        self.get_private_data().last_error.as_ref()
    }
}
/// Maps an [`ArrowError`] to an errno-style status code for the C Stream Interface.
fn get_error_code(err: &ArrowError) -> i32 {
    if matches!(err, ArrowError::NotYetImplemented(_)) {
        ENOSYS
    } else if matches!(err, ArrowError::MemoryError(_)) {
        ENOMEM
    } else if matches!(err, ArrowError::IoError(_, _)) {
        EIO
    } else {
        // Any other error variant is reported as an invalid-argument failure.
        EINVAL
    }
}
/// A `RecordBatchReader` which imports Arrays from `FFI_ArrowArrayStream`.
/// Struct used to fetch `RecordBatch` from the C Stream Interface.
/// Its main responsibility is to expose `RecordBatchReader` functionality
/// that requires [FFI_ArrowArrayStream].
#[derive(Debug)]
pub struct ArrowArrayStreamReader {
    // The imported stream; its callbacks are invoked on every `next()`.
    stream: FFI_ArrowArrayStream,
    // Schema fetched once at construction and served by `RecordBatchReader::schema`.
    schema: SchemaRef,
}
/// Gets schema from a raw pointer of `FFI_ArrowArrayStream`. This is used when constructing
/// `ArrowArrayStreamReader` to cache schema.
///
/// Returns an error if the producer's `get_schema` callback reports a non-zero
/// code, or if the FFI schema cannot be converted to a Rust [`Schema`].
fn get_stream_schema(stream_ptr: *mut FFI_ArrowArrayStream) -> Result<SchemaRef> {
    let mut schema = FFI_ArrowSchema::empty();
    // NOTE(review): the unwrap assumes an unreleased stream always populates
    // `get_schema` (required by the C Stream Interface); it panics on a
    // non-conforming producer.
    let ret_code = unsafe { (*stream_ptr).get_schema.unwrap()(stream_ptr, &mut schema) };
    if ret_code == 0 {
        let schema = Schema::try_from(&schema)?;
        Ok(Arc::new(schema))
    } else {
        Err(ArrowError::CDataInterface(format!(
            "Cannot get schema from input stream. Error code: {ret_code:?}"
        )))
    }
}
impl ArrowArrayStreamReader {
    /// Creates a new `ArrowArrayStreamReader` from a `FFI_ArrowArrayStream`.
    /// This is used to import from the C Stream Interface.
    ///
    /// Returns an error if the stream has already been released (its `release`
    /// callback is NULL) or if its schema cannot be fetched and converted.
    #[allow(dead_code)]
    pub fn try_new(mut stream: FFI_ArrowArrayStream) -> Result<Self> {
        if stream.release.is_none() {
            return Err(ArrowError::CDataInterface(
                "input stream is already released".to_string(),
            ));
        }
        // Cache the schema up front so `RecordBatchReader::schema` is infallible.
        let schema = get_stream_schema(&mut stream)?;
        Ok(Self { stream, schema })
    }
    /// Creates a new `ArrowArrayStreamReader` from a raw pointer of `FFI_ArrowArrayStream`.
    ///
    /// Assumes that the pointer represents valid C Stream Interfaces.
    /// This function copies the content from the raw pointer and cleans up it to prevent
    /// double-dropping. The caller is responsible for freeing up the memory allocated for
    /// the pointer.
    ///
    /// # Safety
    ///
    /// See [`FFI_ArrowArrayStream::from_raw`]
    pub unsafe fn from_raw(raw_stream: *mut FFI_ArrowArrayStream) -> Result<Self> {
        Self::try_new(FFI_ArrowArrayStream::from_raw(raw_stream))
    }
    /// Get the last error from `ArrowArrayStreamReader`
    ///
    /// Returns `None` when the producer supplied no `get_last_error` callback
    /// or the callback returned NULL.
    fn get_stream_last_error(&mut self) -> Option<String> {
        let get_last_error = self.stream.get_last_error?;
        let error_str = unsafe { get_last_error(&mut self.stream) };
        if error_str.is_null() {
            return None;
        }
        // Copy the message out (lossily, if not valid UTF-8); the pointer
        // itself remains owned by the producer.
        let error_str = unsafe { CStr::from_ptr(error_str) };
        Some(error_str.to_string_lossy().to_string())
    }
}
impl Iterator for ArrowArrayStreamReader {
    type Item = Result<RecordBatch>;
    /// Fetches the next `RecordBatch` from the underlying C stream.
    fn next(&mut self) -> Option<Self::Item> {
        let mut array = FFI_ArrowArray::empty();
        // NOTE(review): assumes `get_next` is populated for an imported
        // stream (required by the C Stream Interface); panics otherwise.
        let ret_code = unsafe { self.stream.get_next.unwrap()(&mut self.stream, &mut array) };
        if ret_code == 0 {
            // A released array marks the end of the stream.
            if array.is_released() {
                return None;
            }
            // Import the FFI array as a struct array matching the cached
            // schema, then convert it into a `RecordBatch`.
            let result = unsafe {
                from_ffi_and_data_type(array, DataType::Struct(self.schema().fields().clone()))
            };
            Some(result.map(|data| RecordBatch::from(StructArray::from(data))))
        } else {
            // The producer is not required to supply an error message; the
            // previous `unwrap()` panicked when `get_last_error` was absent or
            // returned NULL. Fall back to a descriptive message instead.
            let last_error = self.get_stream_last_error().unwrap_or_else(|| {
                format!("C stream returned error code {ret_code} with no error message")
            });
            Some(Err(ArrowError::CDataInterface(last_error)))
        }
    }
}
impl RecordBatchReader for ArrowArrayStreamReader {
    /// Returns the schema cached from the stream at construction time.
    fn schema(&self) -> SchemaRef {
        self.schema.clone()
    }
}
/// Exports a record batch reader to raw pointer of the C Stream Interface provided by the consumer.
///
/// # Safety
/// Assumes that the pointer represents valid C Stream Interfaces, both in memory
/// representation and lifetime via the `release` mechanism.
#[deprecated(note = "Use FFI_ArrowArrayStream::new")]
pub unsafe fn export_reader_into_raw(
    reader: Box<dyn RecordBatchReader + Send>,
    out_stream: *mut FFI_ArrowArrayStream,
) {
    let stream = FFI_ArrowArrayStream::new(reader);
    // Unaligned write: `out_stream` comes from a foreign allocator and may
    // not meet Rust's alignment requirements.
    std::ptr::write_unaligned(out_stream, stream);
}
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::Field;
    use crate::array::Int32Array;
    use crate::ffi::from_ffi;
    // A minimal `RecordBatchReader` backed by a boxed iterator, used to drive
    // export/import round trips through the C Stream Interface.
    struct TestRecordBatchReader {
        schema: SchemaRef,
        iter: Box<dyn Iterator<Item = Result<RecordBatch>> + Send>,
    }
    impl TestRecordBatchReader {
        pub fn new(
            schema: SchemaRef,
            iter: Box<dyn Iterator<Item = Result<RecordBatch>> + Send>,
        ) -> Box<TestRecordBatchReader> {
            Box::new(TestRecordBatchReader { schema, iter })
        }
    }
    impl Iterator for TestRecordBatchReader {
        type Item = Result<RecordBatch>;
        fn next(&mut self) -> Option<Self::Item> {
            self.iter.next()
        }
    }
    impl RecordBatchReader for TestRecordBatchReader {
        fn schema(&self) -> SchemaRef {
            self.schema.clone()
        }
    }
    // Exports two copies of a batch through the raw C callbacks and checks
    // the consumer side reproduces the schema and both batches.
    fn _test_round_trip_export(arrays: Vec<Arc<dyn Array>>) -> Result<()> {
        let schema = Arc::new(Schema::new(vec![
            Field::new("a", arrays[0].data_type().clone(), true),
            Field::new("b", arrays[1].data_type().clone(), true),
            Field::new("c", arrays[2].data_type().clone(), true),
        ]));
        let batch = RecordBatch::try_new(schema.clone(), arrays).unwrap();
        let iter = Box::new(vec![batch.clone(), batch.clone()].into_iter().map(Ok)) as _;
        let reader = TestRecordBatchReader::new(schema.clone(), iter);
        // Export a `RecordBatchReader` through `FFI_ArrowArrayStream`
        let mut ffi_stream = FFI_ArrowArrayStream::new(reader);
        // Get schema from `FFI_ArrowArrayStream`
        let mut ffi_schema = FFI_ArrowSchema::empty();
        let ret_code = unsafe { get_schema(&mut ffi_stream, &mut ffi_schema) };
        assert_eq!(ret_code, 0);
        let exported_schema = Schema::try_from(&ffi_schema).unwrap();
        assert_eq!(&exported_schema, schema.as_ref());
        // Get array from `FFI_ArrowArrayStream`
        let mut produced_batches = vec![];
        loop {
            let mut ffi_array = FFI_ArrowArray::empty();
            let ret_code = unsafe { get_next(&mut ffi_stream, &mut ffi_array) };
            assert_eq!(ret_code, 0);
            // The end of stream has been reached
            if ffi_array.is_released() {
                break;
            }
            let array = unsafe { from_ffi(ffi_array, &ffi_schema) }.unwrap();
            let record_batch = RecordBatch::from(StructArray::from(array));
            produced_batches.push(record_batch);
        }
        assert_eq!(produced_batches, vec![batch.clone(), batch]);
        Ok(())
    }
    // Imports an exported stream via `ArrowArrayStreamReader` and checks the
    // schema and batches survive the round trip.
    fn _test_round_trip_import(arrays: Vec<Arc<dyn Array>>) -> Result<()> {
        let schema = Arc::new(Schema::new(vec![
            Field::new("a", arrays[0].data_type().clone(), true),
            Field::new("b", arrays[1].data_type().clone(), true),
            Field::new("c", arrays[2].data_type().clone(), true),
        ]));
        let batch = RecordBatch::try_new(schema.clone(), arrays).unwrap();
        let iter = Box::new(vec![batch.clone(), batch.clone()].into_iter().map(Ok)) as _;
        let reader = TestRecordBatchReader::new(schema.clone(), iter);
        // Import through `FFI_ArrowArrayStream` as `ArrowArrayStreamReader`
        let stream = FFI_ArrowArrayStream::new(reader);
        let stream_reader = ArrowArrayStreamReader::try_new(stream).unwrap();
        let imported_schema = stream_reader.schema();
        assert_eq!(imported_schema, schema);
        let mut produced_batches = vec![];
        for batch in stream_reader {
            produced_batches.push(batch.unwrap());
        }
        assert_eq!(produced_batches, vec![batch.clone(), batch]);
        Ok(())
    }
    #[test]
    fn test_stream_round_trip_export() -> Result<()> {
        let array = Int32Array::from(vec![Some(2), None, Some(1), None]);
        let array: Arc<dyn Array> = Arc::new(array);
        _test_round_trip_export(vec![array.clone(), array.clone(), array])
    }
    #[test]
    fn test_stream_round_trip_import() -> Result<()> {
        let array = Int32Array::from(vec![Some(2), None, Some(1), None]);
        let array: Arc<dyn Array> = Arc::new(array);
        _test_round_trip_import(vec![array.clone(), array.clone(), array])
    }
    // Verifies that a reader error is surfaced as an `Err` item by the
    // importing iterator rather than being lost or panicking.
    #[test]
    fn test_error_import() -> Result<()> {
        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
        let iter = Box::new(vec![Err(ArrowError::MemoryError("".to_string()))].into_iter());
        let reader = TestRecordBatchReader::new(schema.clone(), iter);
        // Import through `FFI_ArrowArrayStream` as `ArrowArrayStreamReader`
        let stream = FFI_ArrowArrayStream::new(reader);
        let stream_reader = ArrowArrayStreamReader::try_new(stream).unwrap();
        let imported_schema = stream_reader.schema();
        assert_eq!(imported_schema, schema);
        let mut produced_batches = vec![];
        for batch in stream_reader {
            produced_batches.push(batch);
        }
        // The results should outlive the lifetime of the stream itself.
        assert_eq!(produced_batches.len(), 1);
        assert!(produced_batches[0].is_err());
        Ok(())
    }
}

Просмотреть файл

@ -0,0 +1,266 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Idiomatic iterators for [`Array`](crate::Array)
use crate::array::{
ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray,
GenericStringArray, PrimitiveArray,
};
use crate::{FixedSizeListArray, MapArray};
use arrow_buffer::NullBuffer;
/// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`]
///
/// # Performance
///
/// [`ArrayIter`] provides an idiomatic way to iterate over an array, however, this
/// comes at the cost of performance. In particular the interleaved handling of
/// the null mask is often sub-optimal.
///
/// If performing an infallible operation, it is typically faster to perform the operation
/// on every index of the array, and handle the null mask separately. For [`PrimitiveArray`]
/// this functionality is provided by [`compute::unary`]
///
/// If performing a fallible operation, it isn't possible to perform the operation independently
/// of the null mask, as this might result in a spurious failure on a null index. However,
/// there are more efficient ways to iterate over just the non-null indices, this functionality
/// is provided by [`compute::try_unary`]
///
/// [`PrimitiveArray`]: crate::PrimitiveArray
/// [`compute::unary`]: https://docs.rs/arrow/latest/arrow/compute/fn.unary.html
/// [`compute::try_unary`]: https://docs.rs/arrow/latest/arrow/compute/fn.try_unary.html
#[derive(Debug)]
pub struct ArrayIter<T: ArrayAccessor> {
    // The array being iterated.
    array: T,
    // Logical nulls, computed once in `new` to avoid recomputing per element.
    logical_nulls: Option<NullBuffer>,
    // Front cursor: index of the next element yielded by `next`.
    current: usize,
    // Back cursor: one past the next element yielded by `next_back`.
    current_end: usize,
}
impl<T: ArrayAccessor> ArrayIter<T> {
    /// Creates a new iterator over `array`, caching its logical null buffer
    /// and initializing the front/back cursors to span the full array.
    pub fn new(array: T) -> Self {
        let end = array.len();
        let nulls = array.logical_nulls();
        Self {
            array,
            logical_nulls: nulls,
            current: 0,
            current_end: end,
        }
    }

    /// Returns `true` if the element at `idx` is logically null.
    /// An absent null buffer means every element is valid.
    #[inline]
    fn is_null(&self, idx: usize) -> bool {
        match &self.logical_nulls {
            Some(nulls) => nulls.is_null(idx),
            None => false,
        }
    }
}
impl<T: ArrayAccessor> Iterator for ArrayIter<T> {
    type Item = Option<T::Item>;

    /// Yields `Some(None)` for null slots and `Some(Some(value))` otherwise,
    /// until the front cursor meets the back cursor.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        if self.current == self.current_end {
            None
        } else if self.is_null(self.current) {
            self.current += 1;
            Some(None)
        } else {
            let old = self.current;
            self.current += 1;
            // Safety:
            // we just checked bounds in `self.current == self.current_end`
            // this is safe on the premise that this struct is initialized with
            // current_end = array.len()
            // and that current_end is only ever decremented
            unsafe { Some(Some(self.array.value_unchecked(old))) }
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // The remaining count must account for elements consumed from BOTH
        // ends: `current` advances via `next` and `current_end` retreats via
        // `next_back`. The previous `array.len() - current` over-reported the
        // length after any `next_back` call, violating the exact-length
        // contract of the `ExactSizeIterator` impl below.
        let remaining = self.current_end - self.current;
        (remaining, Some(remaining))
    }
}
impl<T: ArrayAccessor> DoubleEndedIterator for ArrayIter<T> {
    // Yields elements from the back by retreating `current_end` until it
    // meets the front cursor.
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.current_end == self.current {
            None
        } else {
            self.current_end -= 1;
            Some(if self.is_null(self.current_end) {
                None
            } else {
                // Safety:
                // we just checked bounds in `self.current_end == self.current`
                // this is safe on the premise that this struct is initialized with
                // current_end = array.len()
                // and that current_end is only ever decremented
                unsafe { Some(self.array.value_unchecked(self.current_end)) }
            })
        }
    }
}
/// all arrays have known size.
// NOTE(review): relies on `size_hint` returning the exact remaining count.
impl<T: ArrayAccessor> ExactSizeIterator for ArrayIter<T> {}

/// an iterator that returns Some(T) or None, that can be used on any PrimitiveArray
pub type PrimitiveIter<'a, T> = ArrayIter<&'a PrimitiveArray<T>>;
/// an iterator that returns Some(T) or None, that can be used on any BooleanArray
pub type BooleanIter<'a> = ArrayIter<&'a BooleanArray>;
/// an iterator that returns Some(T) or None, that can be used on any Utf8Array
pub type GenericStringIter<'a, T> = ArrayIter<&'a GenericStringArray<T>>;
/// an iterator that returns Some(T) or None, that can be used on any BinaryArray
pub type GenericBinaryIter<'a, T> = ArrayIter<&'a GenericBinaryArray<T>>;
/// an iterator that returns Some(T) or None, that can be used on any FixedSizeBinaryArray
pub type FixedSizeBinaryIter<'a> = ArrayIter<&'a FixedSizeBinaryArray>;
/// an iterator that returns Some(T) or None, that can be used on any FixedSizeListArray
pub type FixedSizeListIter<'a> = ArrayIter<&'a FixedSizeListArray>;
/// an iterator that returns Some(T) or None, that can be used on any ListArray
pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray<O>>;
/// an iterator that returns Some(T) or None, that can be used on any MapArray
pub type MapArrayIter<'a> = ArrayIter<&'a MapArray>;
#[cfg(test)]
mod tests {
    use std::sync::Arc;
    use crate::array::{ArrayRef, BinaryArray, BooleanArray, Int32Array, StringArray};
    // Round-trips each array type through `ArrayIter`, and exercises the
    // DoubleEndedIterator / ExactSizeIterator impls.
    #[test]
    fn test_primitive_array_iter_round_trip() {
        let array = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]);
        let array = Arc::new(array) as ArrayRef;
        let array = array.as_any().downcast_ref::<Int32Array>().unwrap();
        // to and from iter, with a +1
        let result: Int32Array = array.iter().map(|e| e.map(|e| e + 1)).collect();
        let expected = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]);
        assert_eq!(result, expected);
        // check if DoubleEndedIterator is implemented
        let result: Int32Array = array.iter().rev().collect();
        let rev_array = Int32Array::from(vec![Some(4), None, Some(2), None, Some(0)]);
        assert_eq!(result, rev_array);
        // check if ExactSizeIterator is implemented
        let _ = array.iter().rposition(|opt_b| opt_b == Some(1));
    }
    // Interleaves `next` and `next_back` until the cursors meet.
    #[test]
    fn test_double_ended() {
        let array = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]);
        let mut a = array.iter();
        assert_eq!(a.next(), Some(Some(0)));
        assert_eq!(a.next(), Some(None));
        assert_eq!(a.next_back(), Some(Some(4)));
        assert_eq!(a.next_back(), Some(None));
        assert_eq!(a.next_back(), Some(Some(2)));
        // the two sides have met: None is returned by both
        assert_eq!(a.next_back(), None);
        assert_eq!(a.next(), None);
    }
    #[test]
    fn test_string_array_iter_round_trip() {
        let array = StringArray::from(vec![Some("a"), None, Some("aaa"), None, Some("aaaaa")]);
        let array = Arc::new(array) as ArrayRef;
        let array = array.as_any().downcast_ref::<StringArray>().unwrap();
        // to and from iter, with a +1
        let result: StringArray = array
            .iter()
            .map(|e| {
                e.map(|e| {
                    let mut a = e.to_string();
                    a.push('b');
                    a
                })
            })
            .collect();
        let expected =
            StringArray::from(vec![Some("ab"), None, Some("aaab"), None, Some("aaaaab")]);
        assert_eq!(result, expected);
        // check if DoubleEndedIterator is implemented
        let result: StringArray = array.iter().rev().collect();
        let rev_array = StringArray::from(vec![Some("aaaaa"), None, Some("aaa"), None, Some("a")]);
        assert_eq!(result, rev_array);
        // check if ExactSizeIterator is implemented
        let _ = array.iter().rposition(|opt_b| opt_b == Some("a"));
    }
    #[test]
    fn test_binary_array_iter_round_trip() {
        let array = BinaryArray::from(vec![
            Some(b"a" as &[u8]),
            None,
            Some(b"aaa"),
            None,
            Some(b"aaaaa"),
        ]);
        // to and from iter
        let result: BinaryArray = array.iter().collect();
        assert_eq!(result, array);
        // check if DoubleEndedIterator is implemented
        let result: BinaryArray = array.iter().rev().collect();
        let rev_array = BinaryArray::from(vec![
            Some(b"aaaaa" as &[u8]),
            None,
            Some(b"aaa"),
            None,
            Some(b"a"),
        ]);
        assert_eq!(result, rev_array);
        // check if ExactSizeIterator is implemented
        let _ = array.iter().rposition(|opt_b| opt_b == Some(&[9]));
    }
    #[test]
    fn test_boolean_array_iter_round_trip() {
        let array = BooleanArray::from(vec![Some(true), None, Some(false)]);
        // to and from iter
        let result: BooleanArray = array.iter().collect();
        assert_eq!(result, array);
        // check if DoubleEndedIterator is implemented
        let result: BooleanArray = array.iter().rev().collect();
        let rev_array = BooleanArray::from(vec![Some(false), None, Some(true)]);
        assert_eq!(result, rev_array);
        // check if ExactSizeIterator is implemented
        let _ = array.iter().rposition(|opt_b| opt_b == Some(true));
    }
}

Просмотреть файл

@ -0,0 +1,243 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! The central type in Apache Arrow are arrays, which are a known-length sequence of values
//! all having the same type. This crate provides concrete implementations of each type, as
//! well as an [`Array`] trait that can be used for type-erasure.
//!
//! # Building an Array
//!
//! Most [`Array`] implementations can be constructed directly from iterators or [`Vec`]
//!
//! ```
//! # use arrow_array::{Int32Array, ListArray, StringArray};
//! # use arrow_array::types::Int32Type;
//! #
//! Int32Array::from(vec![1, 2]);
//! Int32Array::from(vec![Some(1), None]);
//! Int32Array::from_iter([1, 2, 3, 4]);
//! Int32Array::from_iter([Some(1), Some(2), None, Some(4)]);
//!
//! StringArray::from(vec!["foo", "bar"]);
//! StringArray::from(vec![Some("foo"), None]);
//! StringArray::from_iter([Some("foo"), None]);
//! StringArray::from_iter_values(["foo", "bar"]);
//!
//! ListArray::from_iter_primitive::<Int32Type, _, _>([
//! Some(vec![Some(1), None, Some(3)]),
//! None,
//! Some(vec![])
//! ]);
//! ```
//!
//! Additionally [`ArrayBuilder`](builder::ArrayBuilder) implementations can be
//! used to construct arrays with a push-based interface
//!
//! ```
//! # use arrow_array::Int16Array;
//! #
//! // Create a new builder with a capacity of 100
//! let mut builder = Int16Array::builder(100);
//!
//! // Append a single primitive value
//! builder.append_value(1);
//! // Append a null value
//! builder.append_null();
//! // Append a slice of primitive values
//! builder.append_slice(&[2, 3, 4]);
//!
//! // Build the array
//! let array = builder.finish();
//!
//! assert_eq!(5, array.len());
//! assert_eq!(2, array.value(2));
//! assert_eq!(&array.values()[3..5], &[3, 4])
//! ```
//!
//! # Low-level API
//!
//! Internally, arrays consist of one or more shared memory regions backed by a [`Buffer`],
//! the number and meaning of which depend on the arrays data type, as documented in
//! the [Arrow specification].
//!
//! For example, the type [`Int16Array`] represents an array of 16-bit integers and consists of:
//!
//! * An optional [`NullBuffer`] identifying any null values
//! * A contiguous [`ScalarBuffer<i16>`] of values
//!
//! Similarly, the type [`StringArray`] represents an array of UTF-8 strings and consists of:
//!
//! * An optional [`NullBuffer`] identifying any null values
//! * An offsets [`OffsetBuffer<i32>`] identifying valid UTF-8 sequences within the values buffer
//! * A values [`Buffer`] of UTF-8 encoded string data
//!
//! Array constructors such as [`PrimitiveArray::try_new`] provide the ability to cheaply
//! construct an array from these parts, with functions such as [`PrimitiveArray::into_parts`]
//! providing the reverse operation.
//!
//! ```
//! # use arrow_array::{Array, Int32Array, StringArray};
//! # use arrow_buffer::OffsetBuffer;
//! #
//! // Create a Int32Array from Vec without copying
//! let array = Int32Array::new(vec![1, 2, 3].into(), None);
//! assert_eq!(array.values(), &[1, 2, 3]);
//! assert_eq!(array.null_count(), 0);
//!
//! // Create a StringArray from parts
//! let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
//! let array = StringArray::new(offsets, b"helloworld".into(), None);
//! let values: Vec<_> = array.iter().map(|x| x.unwrap()).collect();
//! assert_eq!(values, &["hello", "world"]);
//! ```
//!
//! As [`Buffer`], and its derivatives, can be created from [`Vec`] without copying, this provides
//! an efficient way to not only interoperate with other Rust code, but also implement kernels
//! optimised for the arrow data layout - e.g. by handling buffers instead of values.
//!
//! # Zero-Copy Slicing
//!
//! Given an [`Array`] of arbitrary length, it is possible to create an owned slice of this
//! data. Internally this just increments some ref-counts, and so is incredibly cheap
//!
//! ```rust
//! # use arrow_array::Int32Array;
//! let array = Int32Array::from_iter([1, 2, 3]);
//!
//! // Slice with offset 1 and length 2
//! let sliced = array.slice(1, 2);
//! assert_eq!(sliced.values(), &[2, 3]);
//! ```
//!
//! # Downcasting an Array
//!
//! Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`].
//! For example, [`RecordBatch`](`crate::RecordBatch`) stores columns as [`ArrayRef`].
//!
//! Whilst these arrays can be passed directly to the [`compute`], [`csv`], [`json`], etc... APIs,
//! it is often the case that you wish to interact with the concrete arrays directly.
//!
//! This requires downcasting to the concrete type of the array:
//!
//! ```
//! # use arrow_array::{Array, Float32Array, Int32Array};
//!
//! // Safely downcast an `Array` to an `Int32Array` and compute the sum
//! // using native i32 values
//! fn sum_int32(array: &dyn Array) -> i32 {
//! let integers: &Int32Array = array.as_any().downcast_ref().unwrap();
//! integers.iter().map(|val| val.unwrap_or_default()).sum()
//! }
//!
//! // Safely downcasts the array to a `Float32Array` and returns a &[f32] view of the data
//! // Note: the values for positions corresponding to nulls will be arbitrary (but still valid f32)
//! fn as_f32_slice(array: &dyn Array) -> &[f32] {
//! array.as_any().downcast_ref::<Float32Array>().unwrap().values()
//! }
//! ```
//!
//! The [`cast::AsArray`] extension trait can make this more ergonomic
//!
//! ```
//! # use arrow_array::Array;
//! # use arrow_array::cast::{AsArray, as_primitive_array};
//! # use arrow_array::types::Float32Type;
//!
//! fn as_f32_slice(array: &dyn Array) -> &[f32] {
//! array.as_primitive::<Float32Type>().values()
//! }
//! ```
//!
//! [`ScalarBuffer<T>`]: arrow_buffer::ScalarBuffer
//! [`ScalarBuffer<i16>`]: arrow_buffer::ScalarBuffer
//! [`OffsetBuffer<i32>`]: arrow_buffer::OffsetBuffer
//! [`NullBuffer`]: arrow_buffer::NullBuffer
//! [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html
//! [`&dyn Array`]: Array
//! [`NullBuffer`]: arrow_buffer::NullBuffer
//! [`Buffer`]: arrow_buffer::Buffer
//! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
//! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html
//! [`csv`]: https://docs.rs/arrow/latest/arrow/csv/index.html
#![deny(rustdoc::broken_intra_doc_links)]
#![warn(missing_docs)]
pub mod array;
pub use array::*;
mod record_batch;
pub use record_batch::{
    RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, RecordBatchWriter,
};
mod arithmetic;
pub use arithmetic::ArrowNativeTypeOp;
mod numeric;
pub use numeric::*;
mod scalar;
pub use scalar::*;
pub mod builder;
pub mod cast;
mod delta;
// FFI support (C Data / C Stream Interface) is opt-in via the `ffi` feature.
#[cfg(feature = "ffi")]
pub mod ffi;
#[cfg(feature = "ffi")]
pub mod ffi_stream;
pub mod iterator;
pub mod run_iterator;
pub mod temporal_conversions;
pub mod timezone;
mod trusted_len;
pub mod types;
#[cfg(test)]
mod tests {
    use crate::builder::*;
    // Smoke test: every `BufferBuilder` type alias re-exported from `builder`
    // (including the microsecond-precision timestamp/duration builders) can
    // be instantiated.
    #[test]
    fn test_buffer_builder_availability() {
        let _builder = Int8BufferBuilder::new(10);
        let _builder = Int16BufferBuilder::new(10);
        let _builder = Int32BufferBuilder::new(10);
        let _builder = Int64BufferBuilder::new(10);
        let _builder = UInt16BufferBuilder::new(10);
        let _builder = UInt32BufferBuilder::new(10);
        let _builder = Float32BufferBuilder::new(10);
        let _builder = Float64BufferBuilder::new(10);
        let _builder = TimestampSecondBufferBuilder::new(10);
        let _builder = TimestampMillisecondBufferBuilder::new(10);
        let _builder = TimestampMicrosecondBufferBuilder::new(10);
        let _builder = TimestampNanosecondBufferBuilder::new(10);
        let _builder = Date32BufferBuilder::new(10);
        let _builder = Date64BufferBuilder::new(10);
        let _builder = Time32SecondBufferBuilder::new(10);
        let _builder = Time32MillisecondBufferBuilder::new(10);
        let _builder = Time64MicrosecondBufferBuilder::new(10);
        let _builder = Time64NanosecondBufferBuilder::new(10);
        let _builder = IntervalYearMonthBufferBuilder::new(10);
        let _builder = IntervalDayTimeBufferBuilder::new(10);
        let _builder = IntervalMonthDayNanoBufferBuilder::new(10);
        let _builder = DurationSecondBufferBuilder::new(10);
        let _builder = DurationMillisecondBufferBuilder::new(10);
        let _builder = DurationMicrosecondBufferBuilder::new(10);
        let _builder = DurationNanosecondBufferBuilder::new(10);
    }
}

Просмотреть файл

@ -0,0 +1,23 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::ArrowPrimitiveType;
/// A subtype of primitive type that represents numeric values.
pub trait ArrowNumericType: ArrowPrimitiveType {}
impl<T: ArrowPrimitiveType> ArrowNumericType for T {}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,384 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Idiomatic iterator for [`RunArray`](crate::RunArray)
use crate::{array::ArrayAccessor, types::RunEndIndexType, Array, TypedRunArray};
use arrow_buffer::ArrowNativeType;
/// The [`RunArrayIter`] provides an idiomatic way to iterate over the run array.
/// It returns Some(T) if there is a value or None if the value is null.
///
/// The iterator comes with a cost as it has to iterate over three arrays to determine
/// the value to be returned. The run_ends array is used to determine the index of the value.
/// The nulls array is used to determine if the value is null and the values array is used to
/// get the value.
///
/// Unlike other iterators in this crate, [`RunArrayIter`] does not use [`ArrayAccessor`]
/// because the run array accessor does binary search to access each value which is too slow.
/// The run array iterator can determine the next value in constant time.
///
#[derive(Debug)]
pub struct RunArrayIter<'a, R, V>
where
    R: RunEndIndexType,
    V: Sync + Send,
    &'a V: ArrayAccessor,
    <&'a V as ArrayAccessor>::Item: Default,
{
    array: TypedRunArray<'a, R, V>,
    // Logical index of the next element to be yielded from the front.
    current_front_logical: usize,
    // Physical index (into the values/run_ends arrays) of the run that
    // contains `current_front_logical`.
    current_front_physical: usize,
    // One past the logical index of the next element yielded from the back.
    current_back_logical: usize,
    // Physical index of the run that contains the back cursor.
    current_back_physical: usize,
}
impl<'a, R, V> RunArrayIter<'a, R, V>
where
    R: RunEndIndexType,
    V: Sync + Send,
    &'a V: ArrayAccessor,
    <&'a V as ArrayAccessor>::Item: Default,
{
    /// Creates a new iterator positioned at both ends of `array`.
    ///
    /// The front cursor starts at the array's logical offset and the first
    /// physical run it overlaps; the back cursor starts one past the last
    /// logical element and one past its physical run.
    pub fn new(array: TypedRunArray<'a, R, V>) -> Self {
        let front_physical = array.run_array().get_start_physical_index();
        let back_physical = array.run_array().get_end_physical_index() + 1;
        let front_logical = array.offset();
        let back_logical = front_logical + array.len();
        RunArrayIter {
            array,
            current_front_logical: front_logical,
            current_front_physical: front_physical,
            current_back_logical: back_logical,
            current_back_physical: back_physical,
        }
    }
}
impl<'a, R, V> Iterator for RunArrayIter<'a, R, V>
where
    R: RunEndIndexType,
    V: Sync + Send,
    &'a V: ArrayAccessor,
    <&'a V as ArrayAccessor>::Item: Default,
{
    type Item = Option<<&'a V as ArrayAccessor>::Item>;
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Front and back cursors have met: iteration is exhausted.
        if self.current_front_logical == self.current_back_logical {
            return None;
        }
        // If the current logical index has reached the end of the current
        // physical run, advance to the next run.
        let run_ends = self.array.run_ends().values();
        if self.current_front_logical >= run_ends[self.current_front_physical].as_usize() {
            // As run_ends is expected to be strictly increasing, there
            // should be at least one logical entry in one physical entry. Because of this
            // reason the next value can be accessed by incrementing the physical index once.
            self.current_front_physical += 1;
        }
        if self.array.values().is_null(self.current_front_physical) {
            self.current_front_logical += 1;
            Some(None)
        } else {
            self.current_front_logical += 1;
            // Safety:
            // self.current_front_physical is kept within the bounds implied by
            // self.current_front_logical, which itself cannot advance past
            // self.current_back_logical because of the equality check above.
            unsafe {
                Some(Some(
                    self.array
                        .values()
                        .value_unchecked(self.current_front_physical),
                ))
            }
        }
    }
    // Exact remaining length: the logical distance between the two cursors.
    fn size_hint(&self) -> (usize, Option<usize>) {
        (
            self.current_back_logical - self.current_front_logical,
            Some(self.current_back_logical - self.current_front_logical),
        )
    }
}
impl<'a, R, V> DoubleEndedIterator for RunArrayIter<'a, R, V>
where
    R: RunEndIndexType,
    V: Sync + Send,
    &'a V: ArrayAccessor,
    <&'a V as ArrayAccessor>::Item: Default,
{
    fn next_back(&mut self) -> Option<Self::Item> {
        // Back cursor has met the front cursor: iteration is exhausted.
        if self.current_back_logical == self.current_front_logical {
            return None;
        }
        self.current_back_logical -= 1;
        // If the back logical index dropped below the previous run's end,
        // the cursor has left the current physical run: step back one run.
        let run_ends = self.array.run_ends().values();
        if self.current_back_physical > 0
            && self.current_back_logical < run_ends[self.current_back_physical - 1].as_usize()
        {
            // As run_ends is expected to be strictly increasing, there
            // should be at least one logical entry in one physical entry. Because of this
            // reason the next value can be accessed by decrementing the physical index once.
            self.current_back_physical -= 1;
        }
        Some(if self.array.values().is_null(self.current_back_physical) {
            None
        } else {
            // Safety:
            // The check `self.current_back_physical > 0` above ensures the index
            // does not underflow. self.current_back_physical starts one past the
            // last physical run and decreases in step with the bounds of
            // self.current_back_logical, so it stays within the values array.
            unsafe {
                Some(
                    self.array
                        .values()
                        .value_unchecked(self.current_back_physical),
                )
            }
        })
    }
}
/// [`RunArrayIter`] is an [`ExactSizeIterator`]: all arrays have known size,
/// so the remaining length reported by `size_hint` is always exact.
impl<'a, R, V> ExactSizeIterator for RunArrayIter<'a, R, V>
where
    R: RunEndIndexType,
    V: Sync + Send,
    &'a V: ArrayAccessor,
    <&'a V as ArrayAccessor>::Item: Default,
{
}
#[cfg(test)]
mod tests {
    use rand::{seq::SliceRandom, thread_rng, Rng};
    use crate::{
        array::{Int32Array, StringArray},
        builder::PrimitiveRunBuilder,
        types::{Int16Type, Int32Type},
        Array, Int64RunArray, PrimitiveArray, RunArray,
    };
    /// Builds a randomized input of length `size` by shuffling a fixed seed
    /// set and repeating each value a random number of times, producing runs
    /// suitable for run-end encoding.
    fn build_input_array(size: usize) -> Vec<Option<i32>> {
        // The input array is created by shuffling and repeating
        // the seed values random number of times.
        let mut seed: Vec<Option<i32>> = vec![
            None,
            None,
            None,
            Some(1),
            Some(2),
            Some(3),
            Some(4),
            Some(5),
            Some(6),
            Some(7),
            Some(8),
            Some(9),
        ];
        let mut result: Vec<Option<i32>> = Vec::with_capacity(size);
        let mut ix = 0;
        let mut rng = thread_rng();
        // run length can go up to 8. Cap the max run length for smaller arrays to size / 2.
        let max_run_length = 8_usize.min(1_usize.max(size / 2));
        while result.len() < size {
            // shuffle the seed array if all the values are iterated.
            if ix == 0 {
                seed.shuffle(&mut rng);
            }
            // Repeat the current item between 1 and max_run_length times.
            // Reuse the RNG created above (creating a fresh thread_rng per
            // iteration was wasteful) — gen_range already bounds the result
            // by max_run_length, so no extra `.min()` is needed.
            let num = rng.gen_range(1..=max_run_length);
            for _ in 0..num {
                result.push(seed[ix]);
            }
            ix += 1;
            if ix == seed.len() {
                ix = 0
            }
        }
        // The last run may overshoot `size`; truncate to the exact length.
        result.resize(size, None);
        result
    }
    #[test]
    fn test_primitive_array_iter_round_trip() {
        let mut input_vec = vec![
            Some(32),
            Some(32),
            None,
            Some(64),
            Some(64),
            Some(64),
            Some(72),
        ];
        let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
        builder.extend(input_vec.iter().copied());
        let ree_array = builder.finish();
        let ree_array = ree_array.downcast::<Int32Array>().unwrap();
        // Forward iteration must reproduce the original logical values.
        let output_vec: Vec<Option<i32>> = ree_array.into_iter().collect();
        assert_eq!(input_vec, output_vec);
        // Reverse iteration must match the reversed input.
        let rev_output_vec: Vec<Option<i32>> = ree_array.into_iter().rev().collect();
        input_vec.reverse();
        assert_eq!(input_vec, rev_output_vec);
    }
    #[test]
    fn test_double_ended() {
        let input_vec = vec![
            Some(32),
            Some(32),
            None,
            Some(64),
            Some(64),
            Some(64),
            Some(72),
        ];
        let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
        builder.extend(input_vec);
        let ree_array = builder.finish();
        let ree_array = ree_array.downcast::<Int32Array>().unwrap();
        // Alternate front/back consumption; the cursors must meet cleanly.
        let mut iter = ree_array.into_iter();
        assert_eq!(Some(Some(32)), iter.next());
        assert_eq!(Some(Some(72)), iter.next_back());
        assert_eq!(Some(Some(32)), iter.next());
        assert_eq!(Some(Some(64)), iter.next_back());
        assert_eq!(Some(None), iter.next());
        assert_eq!(Some(Some(64)), iter.next_back());
        assert_eq!(Some(Some(64)), iter.next());
        assert_eq!(None, iter.next_back());
        assert_eq!(None, iter.next());
    }
    #[test]
    fn test_run_iterator_comprehensive() {
        // Test forward and backward iterator for different array lengths.
        let logical_lengths = vec![1_usize, 2, 3, 4, 15, 16, 17, 63, 64, 65];
        for logical_len in logical_lengths {
            let input_array = build_input_array(logical_len);
            let mut run_array_builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
            run_array_builder.extend(input_array.iter().copied());
            let run_array = run_array_builder.finish();
            let typed_array = run_array.downcast::<Int32Array>().unwrap();
            // test forward iterator
            let mut input_iter = input_array.iter().copied();
            let mut run_array_iter = typed_array.into_iter();
            for _ in 0..logical_len {
                assert_eq!(input_iter.next(), run_array_iter.next());
            }
            assert_eq!(None, run_array_iter.next());
            // test reverse iterator
            let mut input_iter = input_array.iter().rev().copied();
            let mut run_array_iter = typed_array.into_iter().rev();
            for _ in 0..logical_len {
                assert_eq!(input_iter.next(), run_array_iter.next());
            }
            assert_eq!(None, run_array_iter.next());
        }
    }
    #[test]
    fn test_string_array_iter_round_trip() {
        let input_vec = vec!["ab", "ab", "ba", "cc", "cc"];
        let input_ree_array: Int64RunArray = input_vec.into_iter().collect();
        let string_ree_array = input_ree_array.downcast::<StringArray>().unwrap();
        // to and from iter, with a +1
        let result: Vec<Option<String>> = string_ree_array
            .into_iter()
            .map(|e| {
                e.map(|e| {
                    let mut a = e.to_string();
                    a.push('b');
                    a
                })
            })
            .collect();
        let result_asref: Vec<Option<&str>> = result.iter().map(|f| f.as_deref()).collect();
        let expected_vec = vec![
            Some("abb"),
            Some("abb"),
            Some("bab"),
            Some("ccb"),
            Some("ccb"),
        ];
        assert_eq!(expected_vec, result_asref);
    }
    #[test]
    #[cfg_attr(miri, ignore)] // Takes too long
    fn test_sliced_run_array_iterator() {
        let total_len = 80;
        let input_array = build_input_array(total_len);
        // Encode the input_array to run array
        let mut builder =
            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
        builder.extend(input_array.iter().copied());
        let run_array = builder.finish();
        // test for all slice lengths.
        for slice_len in 1..=total_len {
            // test for offset = 0, slice length = slice_len
            let sliced_run_array: RunArray<Int16Type> =
                run_array.slice(0, slice_len).into_data().into();
            let sliced_typed_run_array = sliced_run_array
                .downcast::<PrimitiveArray<Int32Type>>()
                .unwrap();
            // Iterate on sliced typed run array
            let actual: Vec<Option<i32>> = sliced_typed_run_array.into_iter().collect();
            let expected: Vec<Option<i32>> = input_array.iter().take(slice_len).copied().collect();
            assert_eq!(expected, actual);
            // test for offset = total_len - slice_len, length = slice_len
            let sliced_run_array: RunArray<Int16Type> = run_array
                .slice(total_len - slice_len, slice_len)
                .into_data()
                .into();
            let sliced_typed_run_array = sliced_run_array
                .downcast::<PrimitiveArray<Int32Type>>()
                .unwrap();
            // Iterate on sliced typed run array
            let actual: Vec<Option<i32>> = sliced_typed_run_array.into_iter().collect();
            let expected: Vec<Option<i32>> = input_array
                .iter()
                .skip(total_len - slice_len)
                .copied()
                .collect();
            assert_eq!(expected, actual);
        }
    }
}

Просмотреть файл

@ -0,0 +1,152 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::Array;
/// A possibly [`Scalar`] [`Array`]
///
/// This allows optimised binary kernels where one or more arguments are constant
///
/// ```
/// # use arrow_array::*;
/// # use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer};
/// # use arrow_schema::ArrowError;
/// #
/// fn eq_impl<T: ArrowPrimitiveType>(
///     a: &PrimitiveArray<T>,
///     a_scalar: bool,
///     b: &PrimitiveArray<T>,
///     b_scalar: bool,
/// ) -> BooleanArray {
///     let (array, scalar) = match (a_scalar, b_scalar) {
///         (true, true) | (false, false) => {
///             let len = a.len().min(b.len());
///             let nulls = NullBuffer::union(a.nulls(), b.nulls());
///             let buffer = BooleanBuffer::collect_bool(len, |idx| a.value(idx) == b.value(idx));
///             return BooleanArray::new(buffer, nulls);
///         }
///         (true, false) => (b, (a.null_count() == 0).then(|| a.value(0))),
///         (false, true) => (a, (b.null_count() == 0).then(|| b.value(0))),
///     };
///     match scalar {
///         Some(v) => {
///             let len = array.len();
///             let nulls = array.nulls().cloned();
///             let buffer = BooleanBuffer::collect_bool(len, |idx| array.value(idx) == v);
///             BooleanArray::new(buffer, nulls)
///         }
///         None => BooleanArray::new_null(array.len()),
///     }
/// }
///
/// pub fn eq(l: &dyn Datum, r: &dyn Datum) -> Result<BooleanArray, ArrowError> {
///     let (l_array, l_scalar) = l.get();
///     let (r_array, r_scalar) = r.get();
///     downcast_primitive_array!(
///         (l_array, r_array) => Ok(eq_impl(l_array, l_scalar, r_array, r_scalar)),
///         (a, b) => Err(ArrowError::NotYetImplemented(format!("{a} == {b}"))),
///     )
/// }
///
/// // Comparison of two arrays
/// let a = Int32Array::from(vec![1, 2, 3, 4, 5]);
/// let b = Int32Array::from(vec![1, 2, 4, 7, 3]);
/// let r = eq(&a, &b).unwrap();
/// let values: Vec<_> = r.values().iter().collect();
/// assert_eq!(values, &[true, true, false, false, false]);
///
/// // Comparison of an array and a scalar
/// let a = Int32Array::from(vec![1, 2, 3, 4, 5]);
/// let b = Int32Array::new_scalar(1);
/// let r = eq(&a, &b).unwrap();
/// let values: Vec<_> = r.values().iter().collect();
/// assert_eq!(values, &[true, false, false, false, false]);
/// ```
pub trait Datum {
    /// Returns the value for this [`Datum`] and a boolean indicating if the value is scalar
    fn get(&self) -> (&dyn Array, bool);
}
impl<T: Array> Datum for T {
    // Any concrete array is a non-scalar Datum by default.
    fn get(&self) -> (&dyn Array, bool) {
        (self, false)
    }
}
impl Datum for dyn Array {
    // Type-erased arrays are likewise non-scalar.
    fn get(&self) -> (&dyn Array, bool) {
        (self, false)
    }
}
impl Datum for &dyn Array {
    // Dereference the extra indirection and report non-scalar.
    fn get(&self) -> (&dyn Array, bool) {
        (*self, false)
    }
}
/// A wrapper around a single value [`Array`] that implements
/// [`Datum`] and indicates [compute] kernels should treat this array
/// as a scalar value (a single value).
///
/// Using a [`Scalar`] is often much more efficient than creating an
/// [`Array`] with the same (repeated) value.
///
/// See [`Datum`] for more information.
///
/// # Example
///
/// ```rust
/// # use arrow_array::{Scalar, Int32Array, ArrayRef};
/// # fn get_array() -> ArrayRef { std::sync::Arc::new(Int32Array::from(vec![42])) }
/// // Create a (typed) scalar for Int32Array for the value 42
/// let scalar = Scalar::new(Int32Array::from(vec![42]));
///
/// // Create a scalar using PrimtiveArray::scalar
/// let scalar = Int32Array::new_scalar(42);
///
/// // create a scalar from an ArrayRef (for dynamic typed Arrays)
/// let array: ArrayRef = get_array();
/// let scalar = Scalar::new(array);
/// ```
///
/// [compute]: https://docs.rs/arrow/latest/arrow/compute/index.html
#[derive(Debug, Copy, Clone)]
pub struct Scalar<T: Array>(T);
impl<T: Array> Scalar<T> {
    /// Create a new [`Scalar`] from an [`Array`]
    ///
    /// # Panics
    ///
    /// Panics if `array.len() != 1`
    pub fn new(array: T) -> Self {
        assert_eq!(array.len(), 1);
        Self(array)
    }
    /// Returns the inner array
    #[inline]
    pub fn into_inner(self) -> T {
        self.0
    }
}
impl<T: Array> Datum for Scalar<T> {
    // A Scalar reports its single-element array with the scalar flag set.
    fn get(&self) -> (&dyn Array, bool) {
        (&self.0, true)
    }
}

Просмотреть файл

@ -0,0 +1,351 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Conversion methods for dates and times.
use crate::timezone::Tz;
use crate::ArrowPrimitiveType;
use arrow_schema::{DataType, TimeUnit};
use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Timelike, Utc};
/// Number of seconds in a day
pub const SECONDS_IN_DAY: i64 = 86_400;
/// Number of milliseconds in a second
pub const MILLISECONDS: i64 = 1_000;
/// Number of microseconds in a second
pub const MICROSECONDS: i64 = 1_000_000;
/// Number of nanoseconds in a second
pub const NANOSECONDS: i64 = 1_000_000_000;
/// Number of milliseconds in a day
pub const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS;
/// Number of microseconds in a day
pub const MICROSECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MICROSECONDS;
/// Number of nanoseconds in a day
pub const NANOSECONDS_IN_DAY: i64 = SECONDS_IN_DAY * NANOSECONDS;
/// Number of days between 0001-01-01 and 1970-01-01 (i.e. the offset between
/// the common-era day count and the Unix epoch)
pub const EPOCH_DAYS_FROM_CE: i32 = 719_163;
/// converts a `i32` representing a `date32` (days since the Unix epoch)
/// to [`NaiveDateTime`] at midnight UTC
#[inline]
pub fn date32_to_datetime(v: i32) -> Option<NaiveDateTime> {
    let seconds = v as i64 * SECONDS_IN_DAY;
    DateTime::from_timestamp(seconds, 0).map(|dt| dt.naive_utc())
}
/// converts a `i64` representing a `date64` (milliseconds since the Unix
/// epoch) to [`NaiveDateTime`]
#[inline]
pub fn date64_to_datetime(v: i64) -> Option<NaiveDateTime> {
    // Euclidean split keeps the sub-second part non-negative for negative inputs.
    let (sec, milli_sec) = split_second(v, MILLISECONDS);
    // Scale the leftover milliseconds to nanoseconds (1 ms = 1_000_000 ns).
    DateTime::from_timestamp(sec, milli_sec * 1_000_000).map(|dt| dt.naive_utc())
}
/// converts a `i32` representing a `time32(s)` to [`NaiveTime`]
///
/// Returns `None` for out-of-range values; a negative `v` wraps to a large
/// `u32` via the cast and is rejected by
/// `from_num_seconds_from_midnight_opt`.
#[inline]
pub fn time32s_to_time(v: i32) -> Option<NaiveTime> {
    NaiveTime::from_num_seconds_from_midnight_opt(v as u32, 0)
}
/// converts a `i32` representing a `time32(ms)` to [`NaiveTime`]
#[inline]
pub fn time32ms_to_time(v: i32) -> Option<NaiveTime> {
    let millis = v as i64;
    // Whole seconds since midnight.
    let secs = (millis / MILLISECONDS) as u32;
    // Leftover milliseconds scaled to nanoseconds (1 ms = 1_000_000 ns).
    let nanos = (millis % MILLISECONDS * MICROSECONDS) as u32;
    NaiveTime::from_num_seconds_from_midnight_opt(secs, nanos)
}
/// converts a `i64` representing a `time64(us)` to [`NaiveTime`]
#[inline]
pub fn time64us_to_time(v: i64) -> Option<NaiveTime> {
    // Whole seconds since midnight.
    let secs = (v / MICROSECONDS) as u32;
    // Leftover microseconds scaled to nanoseconds (1 us = 1_000 ns).
    let nanos = (v % MICROSECONDS * MILLISECONDS) as u32;
    NaiveTime::from_num_seconds_from_midnight_opt(secs, nanos)
}
/// converts a `i64` representing a `time64(ns)` to [`NaiveTime`]
#[inline]
pub fn time64ns_to_time(v: i64) -> Option<NaiveTime> {
    // Whole seconds since midnight, plus the nanosecond remainder.
    let secs = (v / NANOSECONDS) as u32;
    let nanos = (v % NANOSECONDS) as u32;
    NaiveTime::from_num_seconds_from_midnight_opt(secs, nanos)
}
/// converts [`NaiveTime`] to a `i32` representing a `time32(s)`
///
/// Sub-second precision is discarded; seconds-from-midnight always fits in `i32`.
#[inline]
pub fn time_to_time32s(v: NaiveTime) -> i32 {
    v.num_seconds_from_midnight() as i32
}
/// converts [`NaiveTime`] to a `i32` representing a `time32(ms)`
#[inline]
pub fn time_to_time32ms(v: NaiveTime) -> i32 {
    let whole_secs_ms = v.num_seconds_from_midnight() as i64 * MILLISECONDS;
    // Sub-second part truncated to millisecond precision.
    let sub_sec_ms = v.nanosecond() as i64 * MILLISECONDS / NANOSECONDS;
    (whole_secs_ms + sub_sec_ms) as i32
}
/// converts [`NaiveTime`] to a `i64` representing a `time64(us)`
#[inline]
pub fn time_to_time64us(v: NaiveTime) -> i64 {
    let whole_secs_us = v.num_seconds_from_midnight() as i64 * MICROSECONDS;
    // Sub-second part truncated to microsecond precision.
    let sub_sec_us = v.nanosecond() as i64 * MICROSECONDS / NANOSECONDS;
    whole_secs_us + sub_sec_us
}
/// converts [`NaiveTime`] to a `i64` representing a `time64(ns)`
#[inline]
pub fn time_to_time64ns(v: NaiveTime) -> i64 {
    let whole_secs_ns = v.num_seconds_from_midnight() as i64 * NANOSECONDS;
    whole_secs_ns + v.nanosecond() as i64
}
/// converts a `i64` representing a `timestamp(s)` to [`NaiveDateTime`]
#[inline]
pub fn timestamp_s_to_datetime(v: i64) -> Option<NaiveDateTime> {
    DateTime::from_timestamp(v, 0).map(|dt| dt.naive_utc())
}
/// converts a `i64` representing a `timestamp(ms)` to [`NaiveDateTime`]
#[inline]
pub fn timestamp_ms_to_datetime(v: i64) -> Option<NaiveDateTime> {
    // Euclidean split keeps the sub-second part non-negative for negative inputs.
    let (sec, milli_sec) = split_second(v, MILLISECONDS);
    // Scale the leftover milliseconds to nanoseconds (1 ms = 1_000_000 ns).
    DateTime::from_timestamp(sec, milli_sec * 1_000_000).map(|dt| dt.naive_utc())
}
/// converts a `i64` representing a `timestamp(us)` to [`NaiveDateTime`]
#[inline]
pub fn timestamp_us_to_datetime(v: i64) -> Option<NaiveDateTime> {
    // Euclidean split keeps the sub-second part non-negative for negative inputs.
    let (sec, micro_sec) = split_second(v, MICROSECONDS);
    // Scale the leftover microseconds to nanoseconds (1 us = 1_000 ns).
    DateTime::from_timestamp(sec, micro_sec * 1_000).map(|dt| dt.naive_utc())
}
/// converts a `i64` representing a `timestamp(ns)` to [`NaiveDateTime`]
#[inline]
pub fn timestamp_ns_to_datetime(v: i64) -> Option<NaiveDateTime> {
    // Euclidean split keeps the sub-second part non-negative for negative inputs;
    // the remainder is already in nanoseconds, so no further scaling is needed.
    let (sec, nano_sec) = split_second(v, NANOSECONDS);
    DateTime::from_timestamp(sec, nano_sec).map(|dt| dt.naive_utc())
}
/// Splits `v`, expressed in units of `1/base` seconds, into whole seconds
/// and a non-negative sub-second remainder in `0..base`.
///
/// Euclidean division is used so that negative inputs round toward negative
/// infinity (e.g. `-1` ns is one second before the epoch plus `base - 1` units).
#[inline]
pub(crate) fn split_second(v: i64, base: i64) -> (i64, u32) {
    let seconds = v.div_euclid(base);
    let sub_second = v.rem_euclid(base) as u32;
    (seconds, sub_second)
}
/// converts a `i64` representing a `duration(s)` to [`Duration`]
///
/// # Panics
///
/// Panics if `v` seconds is outside the range representable by [`Duration`]
/// (`Duration::try_seconds` returns `None`).
#[inline]
pub fn duration_s_to_duration(v: i64) -> Duration {
    Duration::try_seconds(v).unwrap()
}
/// converts a `i64` representing a `duration(ms)` to [`Duration`]
///
/// # Panics
///
/// Panics if `v` milliseconds is outside the range representable by
/// [`Duration`] (`Duration::try_milliseconds` returns `None`).
#[inline]
pub fn duration_ms_to_duration(v: i64) -> Duration {
    Duration::try_milliseconds(v).unwrap()
}
/// converts a `i64` representing a `duration(us)` to [`Duration`]
///
/// Infallible: the non-`try` constructor accepts any `i64` microsecond count.
#[inline]
pub fn duration_us_to_duration(v: i64) -> Duration {
    Duration::microseconds(v)
}
/// converts a `i64` representing a `duration(ns)` to [`Duration`]
///
/// Infallible: the non-`try` constructor accepts any `i64` nanosecond count.
#[inline]
pub fn duration_ns_to_duration(v: i64) -> Duration {
    Duration::nanoseconds(v)
}
/// Converts an [`ArrowPrimitiveType`] to [`NaiveDateTime`]
///
/// The interpretation of `v` (days, milliseconds, or a timestamp in the
/// appropriate unit) is determined by `T::DATA_TYPE`. Returns `None` for
/// time-of-day, interval, and non-temporal types.
pub fn as_datetime<T: ArrowPrimitiveType>(v: i64) -> Option<NaiveDateTime> {
    match T::DATA_TYPE {
        DataType::Date32 => date32_to_datetime(v as i32),
        DataType::Date64 => date64_to_datetime(v),
        // Times carry no date component, so no datetime can be produced.
        DataType::Time32(_) | DataType::Time64(_) => None,
        DataType::Timestamp(unit, _) => match unit {
            TimeUnit::Second => timestamp_s_to_datetime(v),
            TimeUnit::Millisecond => timestamp_ms_to_datetime(v),
            TimeUnit::Microsecond => timestamp_us_to_datetime(v),
            TimeUnit::Nanosecond => timestamp_ns_to_datetime(v),
        },
        // interval is not yet fully documented [ARROW-3097]
        DataType::Interval(_) => None,
        _ => None,
    }
}
/// Converts an [`ArrowPrimitiveType`] to [`DateTime<Tz>`]
///
/// The value is interpreted as UTC and then converted to the given timezone.
pub fn as_datetime_with_timezone<T: ArrowPrimitiveType>(v: i64, tz: Tz) -> Option<DateTime<Tz>> {
    as_datetime::<T>(v).map(|naive| Utc.from_utc_datetime(&naive).with_timezone(&tz))
}
/// Converts an [`ArrowPrimitiveType`] to [`NaiveDate`]
pub fn as_date<T: ArrowPrimitiveType>(v: i64) -> Option<NaiveDate> {
    // Drop the time-of-day component of the converted datetime.
    Some(as_datetime::<T>(v)?.date())
}
/// Converts an [`ArrowPrimitiveType`] to [`NaiveTime`]
///
/// The interpretation of `v` is determined by `T::DATA_TYPE`: time types are
/// converted directly, dates map to midnight, and timestamps contribute their
/// time-of-day component. Returns `None` for intervals, unsupported time
/// units, and non-temporal types.
pub fn as_time<T: ArrowPrimitiveType>(v: i64) -> Option<NaiveTime> {
    match T::DATA_TYPE {
        DataType::Time32(unit) => {
            // safe to immediately cast to u32 as `self.value(i)` is positive i32
            let v = v as u32;
            match unit {
                TimeUnit::Second => time32s_to_time(v as i32),
                TimeUnit::Millisecond => time32ms_to_time(v as i32),
                // Time32 only supports second/millisecond units.
                _ => None,
            }
        }
        DataType::Time64(unit) => match unit {
            TimeUnit::Microsecond => time64us_to_time(v),
            TimeUnit::Nanosecond => time64ns_to_time(v),
            // Time64 only supports microsecond/nanosecond units.
            _ => None,
        },
        DataType::Timestamp(_, _) => as_datetime::<T>(v).map(|datetime| datetime.time()),
        // Dates have no time-of-day: report midnight.
        DataType::Date32 | DataType::Date64 => NaiveTime::from_hms_opt(0, 0, 0),
        DataType::Interval(_) => None,
        _ => None,
    }
}
/// Converts an [`ArrowPrimitiveType`] to [`Duration`]
///
/// Returns `None` unless `T::DATA_TYPE` is a duration type.
pub fn as_duration<T: ArrowPrimitiveType>(v: i64) -> Option<Duration> {
    if let DataType::Duration(unit) = T::DATA_TYPE {
        let duration = match unit {
            TimeUnit::Second => duration_s_to_duration(v),
            TimeUnit::Millisecond => duration_ms_to_duration(v),
            TimeUnit::Microsecond => duration_us_to_duration(v),
            TimeUnit::Nanosecond => duration_ns_to_duration(v),
        };
        Some(duration)
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
    use crate::temporal_conversions::{
        date64_to_datetime, split_second, timestamp_ms_to_datetime, timestamp_ns_to_datetime,
        timestamp_us_to_datetime, NANOSECONDS,
    };
    use chrono::DateTime;
    // Negative timestamps must round toward negative infinity: the sub-second
    // component is expressed as a positive offset from the previous whole second.
    #[test]
    fn negative_input_timestamp_ns_to_datetime() {
        assert_eq!(
            timestamp_ns_to_datetime(-1),
            DateTime::from_timestamp(-1, 999_999_999).map(|x| x.naive_utc())
        );
        assert_eq!(
            timestamp_ns_to_datetime(-1_000_000_001),
            DateTime::from_timestamp(-2, 999_999_999).map(|x| x.naive_utc())
        );
    }
    #[test]
    fn negative_input_timestamp_us_to_datetime() {
        assert_eq!(
            timestamp_us_to_datetime(-1),
            DateTime::from_timestamp(-1, 999_999_000).map(|x| x.naive_utc())
        );
        assert_eq!(
            timestamp_us_to_datetime(-1_000_001),
            DateTime::from_timestamp(-2, 999_999_000).map(|x| x.naive_utc())
        );
    }
    #[test]
    fn negative_input_timestamp_ms_to_datetime() {
        assert_eq!(
            timestamp_ms_to_datetime(-1),
            DateTime::from_timestamp(-1, 999_000_000).map(|x| x.naive_utc())
        );
        assert_eq!(
            timestamp_ms_to_datetime(-1_001),
            DateTime::from_timestamp(-2, 999_000_000).map(|x| x.naive_utc())
        );
    }
    // date64 values are milliseconds, so they behave like timestamp(ms).
    #[test]
    fn negative_input_date64_to_datetime() {
        assert_eq!(
            date64_to_datetime(-1),
            DateTime::from_timestamp(-1, 999_000_000).map(|x| x.naive_utc())
        );
        assert_eq!(
            date64_to_datetime(-1_001),
            DateTime::from_timestamp(-2, 999_000_000).map(|x| x.naive_utc())
        );
    }
    // split_second: positive values split directly; negative values borrow a
    // second so the remainder stays non-negative.
    #[test]
    fn test_split_seconds() {
        let (sec, nano_sec) = split_second(100, NANOSECONDS);
        assert_eq!(sec, 0);
        assert_eq!(nano_sec, 100);
        let (sec, nano_sec) = split_second(123_000_000_456, NANOSECONDS);
        assert_eq!(sec, 123);
        assert_eq!(nano_sec, 456);
        let (sec, nano_sec) = split_second(-1, NANOSECONDS);
        assert_eq!(sec, -1);
        assert_eq!(nano_sec, 999_999_999);
        let (sec, nano_sec) = split_second(-123_000_000_001, NANOSECONDS);
        assert_eq!(sec, -124);
        assert_eq!(nano_sec, 999_999_999);
    }
}

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше