Upgrade DataFusion to arrow-rs/parquet 58.0.0 / object_store 0.13.1 (#19728)

## Which issue does this PR close?

- Follow-up to https://github.com/apache/datafusion/pull/19355
- Related to https://github.com/apache/arrow-rs/issues/8466
- Closes https://github.com/apache/datafusion/issues/17455

## Rationale for this change

Keep DataFusion up to date with the latest arrow-rs, parquet, and object_store releases (and exercise the new Arrow release against DataFusion's test suite).

## What changes are included in this PR?

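<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->

- Bump `arrow` / `parquet` (and the `arrow-*` sub-crates) from 57.x to 58.0.0 and `object_store` from 0.12.5 to 0.13.1 in `Cargo.toml` and `Cargo.lock`.
- Update `ObjectStore` implementations for the new trait surface: `delete` becomes `delete_stream`, `copy` / `copy_if_not_exists` become `copy_opts`, the explicit `head` / `get` overrides are removed, and `ObjectStoreExt` is imported where needed.
- Replace `WriterProperties::set_max_row_group_size(n)` with `set_max_row_group_row_count(Some(n))`.
- Replace `ArrowReaderOptions::with_page_index(true)` with `with_page_index_policy(PageIndexPolicy::Required)`.
- Update expected test output (Parquet metadata/file sizes and the GCS PEM error message) to match the new library versions.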

## Are these changes tested?

<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code

If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->

## Are there any user-facing changes?

<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
-->

<!--
If there are any breaking changes to public APIs, please add the `api
change` label.
-->

---------

Co-authored-by: Daniël Heres <danielheres@gmail.com>
This commit is contained in:
Andrew Lamb
2026-02-28 05:43:51 -05:00
committed by GitHub
parent acec058cb5
commit 73fbd48070
55 changed files with 439 additions and 461 deletions
Generated
+71 -79
View File
@@ -232,9 +232,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "arrow"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8"
checksum = "602268ce9f569f282cedb9a9f6bac569b680af47b9b077d515900c03c5d190da"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -255,9 +255,9 @@ dependencies = [
[[package]]
name = "arrow-arith"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b"
checksum = "cd53c6bf277dea91f136ae8e3a5d7041b44b5e489e244e637d00ae302051f56f"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -269,9 +269,9 @@ dependencies = [
[[package]]
name = "arrow-array"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef"
checksum = "e53796e07a6525edaf7dc28b540d477a934aff14af97967ad1d5550878969b9e"
dependencies = [
"ahash",
"arrow-buffer",
@@ -288,9 +288,9 @@ dependencies = [
[[package]]
name = "arrow-buffer"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2"
checksum = "f2c1a85bb2e94ee10b76531d8bc3ce9b7b4c0d508cabfb17d477f63f2617bd20"
dependencies = [
"bytes",
"half",
@@ -300,9 +300,9 @@ dependencies = [
[[package]]
name = "arrow-cast"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5"
checksum = "89fb245db6b0e234ed8e15b644edb8664673fefe630575e94e62cd9d489a8a26"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -322,9 +322,9 @@ dependencies = [
[[package]]
name = "arrow-csv"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a"
checksum = "d374882fb465a194462527c0c15a93aa19a554cf690a6b77a26b2a02539937a7"
dependencies = [
"arrow-array",
"arrow-cast",
@@ -337,9 +337,9 @@ dependencies = [
[[package]]
name = "arrow-data"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304"
checksum = "189d210bc4244c715fa3ed9e6e22864673cccb73d5da28c2723fb2e527329b33"
dependencies = [
"arrow-buffer",
"arrow-schema",
@@ -350,9 +350,9 @@ dependencies = [
[[package]]
name = "arrow-flight"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58c5b083668e6230eae3eab2fc4b5fb989974c845d0aa538dde61a4327c78675"
checksum = "b4f5cdf00ee0003ba0768d3575d0afc47d736b29673b14c3c228fdffa9a3fb29"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -378,9 +378,9 @@ dependencies = [
[[package]]
name = "arrow-ipc"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1"
checksum = "7968c2e5210c41f4909b2ef76f6e05e172b99021c2def5edf3cc48fdd39d1d6c"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -394,9 +394,9 @@ dependencies = [
[[package]]
name = "arrow-json"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86"
checksum = "92111dba5bf900f443488e01f00d8c4ddc2f47f5c50039d18120287b580baa22"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -418,9 +418,9 @@ dependencies = [
[[package]]
name = "arrow-ord"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b"
checksum = "211136cb253577ee1a6665f741a13136d4e563f64f5093ffd6fb837af90b9495"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -431,9 +431,9 @@ dependencies = [
[[package]]
name = "arrow-row"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0"
checksum = "8e0f20145f9f5ea3fe383e2ba7a7487bf19be36aa9dbf5dd6a1f92f657179663"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -444,9 +444,9 @@ dependencies = [
[[package]]
name = "arrow-schema"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68"
checksum = "1b47e0ca91cc438d2c7879fe95e0bca5329fff28649e30a88c6f760b1faeddcb"
dependencies = [
"bitflags",
"serde",
@@ -456,9 +456,9 @@ dependencies = [
[[package]]
name = "arrow-select"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b"
checksum = "750a7d1dda177735f5e82a314485b6915c7cccdbb278262ac44090f4aba4a325"
dependencies = [
"ahash",
"arrow-array",
@@ -470,9 +470,9 @@ dependencies = [
[[package]]
name = "arrow-string"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8"
checksum = "e1eab1208bc4fe55d768cdc9b9f3d9df5a794cdb3ee2586bf89f9b30dc31ad8c"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -796,9 +796,9 @@ dependencies = [
[[package]]
name = "aws-smithy-http"
version = "0.63.5"
version = "0.63.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d619373d490ad70966994801bc126846afaa0d1ee920697a031f0cf63f2568e7"
checksum = "af4a8a5fe3e4ac7ee871237c340bbce13e982d37543b65700f4419e039f5d78e"
dependencies = [
"aws-smithy-runtime-api",
"aws-smithy-types",
@@ -817,9 +817,9 @@ dependencies = [
[[package]]
name = "aws-smithy-http-client"
version = "1.1.11"
version = "1.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00ccbb08c10f6bcf912f398188e42ee2eab5f1767ce215a02a73bc5df1bbdd95"
checksum = "0709f0083aa19b704132684bc26d3c868e06bd428ccc4373b0b55c3e8748a58b"
dependencies = [
"aws-smithy-async",
"aws-smithy-runtime-api",
@@ -869,9 +869,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime"
version = "1.10.2"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22ccf7f6eba8b2dcf8ce9b74806c6c185659c311665c4bf8d6e71ebd454db6bf"
checksum = "8fd3dfc18c1ce097cf81fced7192731e63809829c6cbf933c1ec47452d08e1aa"
dependencies = [
"aws-smithy-async",
"aws-smithy-http",
@@ -1536,9 +1536,9 @@ dependencies = [
[[package]]
name = "criterion"
version = "0.8.2"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "950046b2aa2492f9a536f5f4f9a3de7b9e2476e575e05bd6c333371add4d98f3"
checksum = "4d883447757bb0ee46f233e9dc22eb84d93a9508c9b868687b274fc431d886bf"
dependencies = [
"alloca",
"anes",
@@ -1563,9 +1563,9 @@ dependencies = [
[[package]]
name = "criterion-plot"
version = "0.8.2"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8d80a2f4f5b554395e47b5d8305bc3d27813bacb73493eb1001e8f76dae29ea"
checksum = "ed943f81ea2faa8dcecbbfa50164acf95d555afec96a27871663b300e387b2e4"
dependencies = [
"cast",
"itertools 0.13.0",
@@ -2746,7 +2746,7 @@ dependencies = [
"libc",
"option-ext",
"redox_users",
"windows-sys 0.61.2",
"windows-sys 0.59.0",
]
[[package]]
@@ -2856,9 +2856,9 @@ dependencies = [
[[package]]
name = "env_filter"
version = "1.0.0"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a1c3cc8e57274ec99de65301228b537f1e4eedc1b8e0f9411c6caac8ae7308f"
checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2"
dependencies = [
"log",
"regex",
@@ -2866,9 +2866,9 @@ dependencies = [
[[package]]
name = "env_logger"
version = "0.11.9"
version = "0.11.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d"
checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f"
dependencies = [
"anstream",
"anstyle",
@@ -2890,7 +2890,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
dependencies = [
"libc",
"windows-sys 0.61.2",
"windows-sys 0.59.0",
]
[[package]]
@@ -3283,6 +3283,8 @@ dependencies = [
"cfg-if",
"crunchy",
"num-traits",
"rand 0.9.2",
"rand_distr",
"zerocopy",
]
@@ -3691,9 +3693,9 @@ dependencies = [
[[package]]
name = "indicatif"
version = "0.18.4"
version = "0.18.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88"
dependencies = [
"console 0.16.2",
"portable-atomic",
@@ -4149,7 +4151,7 @@ version = "0.50.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
dependencies = [
"windows-sys 0.61.2",
"windows-sys 0.59.0",
]
[[package]]
@@ -4263,9 +4265,9 @@ dependencies = [
[[package]]
name = "object_store"
version = "0.12.5"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00"
checksum = "c2858065e55c148d294a9f3aae3b0fa9458edadb41a108397094566f4e3c0dfb"
dependencies = [
"async-trait",
"base64 0.22.1",
@@ -4285,7 +4287,7 @@ dependencies = [
"rand 0.9.2",
"reqwest",
"ring",
"rustls-pemfile",
"rustls-pki-types",
"serde",
"serde_json",
"serde_urlencoded",
@@ -4384,14 +4386,13 @@ dependencies = [
[[package]]
name = "parquet"
version = "57.3.0"
version = "58.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb"
checksum = "3f491d0ef1b510194426ee67ddc18a9b747ef3c42050c19322a2cd2e1666c29b"
dependencies = [
"ahash",
"arrow-array",
"arrow-buffer",
"arrow-cast",
"arrow-data",
"arrow-ipc",
"arrow-schema",
@@ -4768,7 +4769,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7"
dependencies = [
"heck",
"itertools 0.14.0",
"itertools 0.13.0",
"log",
"multimap",
"petgraph",
@@ -4787,7 +4788,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b"
dependencies = [
"anyhow",
"itertools 0.14.0",
"itertools 0.13.0",
"proc-macro2",
"quote",
"syn 2.0.117",
@@ -4889,7 +4890,7 @@ dependencies = [
"once_cell",
"socket2",
"tracing",
"windows-sys 0.60.2",
"windows-sys 0.59.0",
]
[[package]]
@@ -5077,9 +5078,9 @@ dependencies = [
[[package]]
name = "regex"
version = "1.12.3"
version = "1.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
dependencies = [
"aho-corasick",
"memchr",
@@ -5256,7 +5257,7 @@ dependencies = [
"errno",
"libc",
"linux-raw-sys",
"windows-sys 0.61.2",
"windows-sys 0.59.0",
]
[[package]]
@@ -5287,15 +5288,6 @@ dependencies = [
"security-framework",
]
[[package]]
name = "rustls-pemfile"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50"
dependencies = [
"rustls-pki-types",
]
[[package]]
name = "rustls-pki-types"
version = "1.13.2"
@@ -5965,15 +5957,15 @@ dependencies = [
[[package]]
name = "tempfile"
version = "3.25.0"
version = "3.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1"
checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c"
dependencies = [
"fastrand",
"getrandom 0.4.1",
"getrandom 0.3.4",
"once_cell",
"rustix",
"windows-sys 0.61.2",
"windows-sys 0.59.0",
]
[[package]]
@@ -5989,9 +5981,9 @@ dependencies = [
[[package]]
name = "testcontainers"
version = "0.27.0"
version = "0.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3fdcea723c64cc08dbc533b3761e345a15bf1222cbe6cb611de09b43f17a168"
checksum = "c1c0624faaa317c56d6d19136580be889677259caf5c897941c6f446b4655068"
dependencies = [
"astral-tokio-tar",
"async-trait",
@@ -6263,9 +6255,9 @@ dependencies = [
[[package]]
name = "tonic"
version = "0.14.5"
version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fec7c61a0695dc1887c1b53952990f3ad2e3a31453e1f49f10e75424943a93ec"
checksum = "a286e33f82f8a1ee2df63f4fa35c0becf4a85a0cb03091a15fd7bf0b402dc94a"
dependencies = [
"async-trait",
"axum",
@@ -6931,7 +6923,7 @@ version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [
"windows-sys 0.61.2",
"windows-sys 0.59.0",
]
[[package]]
+8 -8
View File
@@ -91,19 +91,19 @@ ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
apache-avro = { version = "0.21", default-features = false }
arrow = { version = "57.3.0", features = [
arrow = { version = "58.0.0", features = [
"prettyprint",
"chrono-tz",
] }
arrow-buffer = { version = "57.2.0", default-features = false }
arrow-flight = { version = "57.3.0", features = [
arrow-buffer = { version = "58.0.0", default-features = false }
arrow-flight = { version = "58.0.0", features = [
"flight-sql-experimental",
] }
arrow-ipc = { version = "57.2.0", default-features = false, features = [
arrow-ipc = { version = "58.0.0", default-features = false, features = [
"lz4",
] }
arrow-ord = { version = "57.2.0", default-features = false }
arrow-schema = { version = "57.2.0", default-features = false }
arrow-ord = { version = "58.0.0", default-features = false }
arrow-schema = { version = "58.0.0", default-features = false }
async-trait = "0.1.89"
bigdecimal = "0.4.8"
bytes = "1.11"
@@ -165,9 +165,9 @@ liblzma = { version = "0.4.6", features = ["static"] }
log = "^0.4"
memchr = "2.8.0"
num-traits = { version = "0.2" }
object_store = { version = "0.12.5", default-features = false }
object_store = { version = "0.13.1", default-features = false }
parking_lot = "0.12"
parquet = { version = "57.3.0", default-features = false, features = [
parquet = { version = "58.0.0", default-features = false, features = [
"arrow",
"async",
"object_store",
+5 -5
View File
@@ -521,6 +521,7 @@ mod tests {
use datafusion::common::plan_err;
use datafusion::prelude::SessionContext;
use datafusion_common::assert_contains;
use url::Url;
async fn create_external_table_test(location: &str, sql: &str) -> Result<()> {
@@ -714,7 +715,7 @@ mod tests {
let err = create_external_table_test(location, &sql)
.await
.unwrap_err();
assert!(err.to_string().contains("os error 2"));
assert_contains!(err.to_string(), "os error 2");
// for service_account_key
let sql = format!(
@@ -722,9 +723,8 @@ mod tests {
);
let err = create_external_table_test(location, &sql)
.await
.unwrap_err()
.to_string();
assert!(err.contains("No RSA key found in pem file"), "{err}");
.unwrap_err();
assert_contains!(err.to_string(), "Error reading pem file: no items found");
// for application_credentials_path
let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET
@@ -732,7 +732,7 @@ mod tests {
let err = create_external_table_test(location, &sql)
.await
.unwrap_err();
assert!(err.to_string().contains("os error 2"));
assert_contains!(err.to_string(), "os error 2");
Ok(())
}
+6 -6
View File
@@ -617,8 +617,8 @@ mod tests {
| filename | file_size_bytes | metadata_size_bytes | hits | extra |
+-----------------------------------+-----------------+---------------------+------+------------------+
| alltypes_plain.parquet | 1851 | 8882 | 2 | page_index=false |
| alltypes_tiny_pages.parquet | 454233 | 269266 | 2 | page_index=true |
| lz4_raw_compressed_larger.parquet | 380836 | 1347 | 2 | page_index=false |
| alltypes_tiny_pages.parquet | 454233 | 269074 | 2 | page_index=true |
| lz4_raw_compressed_larger.parquet | 380836 | 1339 | 2 | page_index=false |
+-----------------------------------+-----------------+---------------------+------+------------------+
");
@@ -648,8 +648,8 @@ mod tests {
| filename | file_size_bytes | metadata_size_bytes | hits | extra |
+-----------------------------------+-----------------+---------------------+------+------------------+
| alltypes_plain.parquet | 1851 | 8882 | 5 | page_index=false |
| alltypes_tiny_pages.parquet | 454233 | 269266 | 2 | page_index=true |
| lz4_raw_compressed_larger.parquet | 380836 | 1347 | 3 | page_index=false |
| alltypes_tiny_pages.parquet | 454233 | 269074 | 2 | page_index=true |
| lz4_raw_compressed_larger.parquet | 380836 | 1339 | 3 | page_index=false |
+-----------------------------------+-----------------+---------------------+------+------------------+
");
@@ -841,8 +841,8 @@ mod tests {
+---------------------+-----------+-----------------+------+
| metadata_size_bytes | filename | file_size_bytes | etag |
+---------------------+-----------+-----------------+------+
| 212 | 0.parquet | 3645 | 0 |
| 212 | 1.parquet | 3645 | 1 |
| 212 | 0.parquet | 3642 | 0 |
| 212 | 1.parquet | 3642 | 1 |
+---------------------+-----------+-----------------+------+
");
@@ -36,10 +36,11 @@ use datafusion::{
execution::object_store::{DefaultObjectStoreRegistry, ObjectStoreRegistry},
};
use futures::stream::{BoxStream, Stream};
use futures::{StreamExt, TryStreamExt};
use object_store::{
GetOptions, GetRange, GetResult, ListResult, MultipartUpload, ObjectMeta,
ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result,
path::Path,
CopyOptions, GetOptions, GetRange, GetResult, ListResult, MultipartUpload,
ObjectMeta, ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, PutPayload,
PutResult, Result, path::Path,
};
use parking_lot::{Mutex, RwLock};
use url::Url;
@@ -230,16 +231,26 @@ impl InstrumentedObjectStore {
let timestamp = Utc::now();
let range = options.range.clone();
let head = options.head;
let start = Instant::now();
let ret = self.inner.get_opts(location, options).await?;
let elapsed = start.elapsed();
let (op, size) = if head {
(Operation::Head, None)
} else {
(
Operation::Get,
Some((ret.range.end - ret.range.start) as usize),
)
};
self.requests.lock().push(RequestDetails {
op: Operation::Get,
op,
path: location.clone(),
timestamp,
duration: Some(elapsed),
size: Some((ret.range.end - ret.range.start) as usize),
size,
range,
extra_display: None,
});
@@ -247,23 +258,30 @@ impl InstrumentedObjectStore {
Ok(ret)
}
async fn instrumented_delete(&self, location: &Path) -> Result<()> {
fn instrumented_delete_stream(
&self,
locations: BoxStream<'static, Result<Path>>,
) -> BoxStream<'static, Result<Path>> {
let requests_captured = Arc::clone(&self.requests);
let timestamp = Utc::now();
let start = Instant::now();
self.inner.delete(location).await?;
let elapsed = start.elapsed();
self.requests.lock().push(RequestDetails {
op: Operation::Delete,
path: location.clone(),
timestamp,
duration: Some(elapsed),
size: None,
range: None,
extra_display: None,
});
Ok(())
self.inner
.delete_stream(locations)
.and_then(move |location| {
let elapsed = start.elapsed();
requests_captured.lock().push(RequestDetails {
op: Operation::Delete,
path: location.clone(),
timestamp,
duration: Some(elapsed),
size: None,
range: None,
extra_display: None,
});
futures::future::ok(location)
})
.boxed()
}
fn instrumented_list(
@@ -361,25 +379,6 @@ impl InstrumentedObjectStore {
Ok(())
}
async fn instrumented_head(&self, location: &Path) -> Result<ObjectMeta> {
let timestamp = Utc::now();
let start = Instant::now();
let ret = self.inner.head(location).await?;
let elapsed = start.elapsed();
self.requests.lock().push(RequestDetails {
op: Operation::Head,
path: location.clone(),
timestamp,
duration: Some(elapsed),
size: None,
range: None,
extra_display: None,
});
Ok(ret)
}
}
impl fmt::Display for InstrumentedObjectStore {
@@ -429,12 +428,15 @@ impl ObjectStore for InstrumentedObjectStore {
self.inner.get_opts(location, options).await
}
async fn delete(&self, location: &Path) -> Result<()> {
fn delete_stream(
&self,
locations: BoxStream<'static, Result<Path>>,
) -> BoxStream<'static, Result<Path>> {
if self.enabled() {
return self.instrumented_delete(location).await;
return self.instrumented_delete_stream(locations);
}
self.inner.delete(location).await
self.inner.delete_stream(locations)
}
fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result<ObjectMeta>> {
@@ -453,28 +455,24 @@ impl ObjectStore for InstrumentedObjectStore {
self.inner.list_with_delimiter(prefix).await
}
async fn copy(&self, from: &Path, to: &Path) -> Result<()> {
async fn copy_opts(
&self,
from: &Path,
to: &Path,
options: CopyOptions,
) -> Result<()> {
if self.enabled() {
return self.instrumented_copy(from, to).await;
return match options.mode {
object_store::CopyMode::Create => {
self.instrumented_copy_if_not_exists(from, to).await
}
object_store::CopyMode::Overwrite => {
self.instrumented_copy(from, to).await
}
};
}
self.inner.copy(from, to).await
}
async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> {
if self.enabled() {
return self.instrumented_copy_if_not_exists(from, to).await;
}
self.inner.copy_if_not_exists(from, to).await
}
async fn head(&self, location: &Path) -> Result<ObjectMeta> {
if self.enabled() {
return self.instrumented_head(location).await;
}
self.inner.head(location).await
self.inner.copy_opts(from, to, options).await
}
}
@@ -69,7 +69,7 @@ use datafusion_proto::protobuf::{
};
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectStore, PutPayload};
use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
use serde::{Deserialize, Serialize};
/// Example showing how to preserve custom adapter information during plan serialization.
@@ -36,7 +36,7 @@ use datafusion::{
use datafusion::datasource::physical_plan::FileScanConfigBuilder;
use datafusion_examples::utils::datasets::ExampleDataset;
use futures::StreamExt;
use object_store::{ObjectStore, local::LocalFileSystem, memory::InMemory};
use object_store::{ObjectStoreExt, local::LocalFileSystem, memory::InMemory};
/// This example demonstrates using the low level [`FileStream`] / [`FileOpener`] APIs to directly
/// read data from (CSV/JSON) into Arrow RecordBatches.
@@ -40,7 +40,7 @@ use datafusion_physical_expr_adapter::{
};
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectStore, PutPayload};
use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
// Example showing how to implement custom casting rules to adapt file schemas.
// This example enforces that casts must be strictly widening: if the file type is Int64 and the table type is Int32, it will error
@@ -48,7 +48,7 @@ use datafusion_physical_expr_adapter::{
use futures::StreamExt;
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectStore, PutPayload};
use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
// Metadata key for storing default values in field metadata
const DEFAULT_VALUE_METADATA_KEY: &str = "example.default_value";
@@ -79,7 +79,7 @@ pub async fn default_column_values() -> Result<()> {
let mut buf = vec![];
let props = WriterProperties::builder()
.set_max_row_group_size(2)
.set_max_row_group_row_count(Some(2))
.build();
let mut writer =
@@ -47,7 +47,7 @@ use datafusion_physical_expr_adapter::{
};
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectStore, PutPayload};
use object_store::{ObjectStoreExt, PutPayload};
// Example showing how to implement custom filter rewriting for JSON shredding.
//
@@ -76,7 +76,7 @@ pub async fn json_shredding() -> Result<()> {
let mut buf = vec![];
let props = WriterProperties::builder()
.set_max_row_group_size(2)
.set_max_row_group_row_count(Some(2))
.build();
let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), Some(props))
@@ -43,7 +43,7 @@ use datafusion::parquet::arrow::arrow_reader::{
ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowSelection, RowSelector,
};
use datafusion::parquet::arrow::async_reader::{AsyncFileReader, ParquetObjectReader};
use datafusion::parquet::file::metadata::ParquetMetaData;
use datafusion::parquet::file::metadata::{PageIndexPolicy, ParquetMetaData};
use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties};
use datafusion::parquet::schema::types::ColumnPath;
use datafusion::physical_expr::PhysicalExpr;
@@ -410,7 +410,7 @@ impl IndexedFile {
let options = ArrowReaderOptions::new()
// Load the page index when reading metadata to cache
// so it is available to interpret row selections
.with_page_index(true);
.with_page_index_policy(PageIndexPolicy::Required);
let reader =
ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?;
let metadata = reader.metadata().clone();
@@ -567,7 +567,7 @@ impl ParquetFileReaderFactory for CachedParquetFileReaderFactory {
.object_meta
.location
.parts()
.last()
.next_back()
.expect("No path in location")
.as_ref()
.to_string();
@@ -659,7 +659,7 @@ fn make_demo_file(path: impl AsRef<Path>, value_range: Range<i32>) -> Result<()>
// enable page statistics for the tag column,
// for everything else.
let props = WriterProperties::builder()
.set_max_row_group_size(100)
.set_max_row_group_row_count(Some(100))
// compute column chunk (per row group) statistics by default
.set_statistics_enabled(EnabledStatistics::Chunk)
// compute column page statistics for the tag column
+1 -1
View File
@@ -84,7 +84,7 @@ mod tests {
.build();
// Verify the expected options propagated down to parquet crate WriterProperties struct
assert_eq!(properties.max_row_group_size(), 123);
assert_eq!(properties.max_row_group_row_count(), Some(123));
assert_eq!(properties.data_page_size_limit(), 123);
assert_eq!(properties.write_batch_size(), 123);
assert_eq!(properties.writer_version(), WriterVersion::PARQUET_2_0);
@@ -222,7 +222,7 @@ impl ParquetOptions {
.and_then(|s| parse_statistics_string(s).ok())
.unwrap_or(DEFAULT_STATISTICS_ENABLED),
)
.set_max_row_group_size(*max_row_group_size)
.set_max_row_group_row_count(Some(*max_row_group_size))
.set_created_by(created_by.clone())
.set_column_index_truncate_length(*column_index_truncate_length)
.set_statistics_truncate_length(*statistics_truncate_length)
@@ -393,7 +393,7 @@ mod tests {
use parquet::basic::Compression;
use parquet::file::properties::{
BloomFilterProperties, DEFAULT_BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_NDV,
EnabledStatistics,
DEFAULT_MAX_ROW_GROUP_ROW_COUNT, EnabledStatistics,
};
use std::collections::HashMap;
@@ -536,7 +536,9 @@ mod tests {
write_batch_size: props.write_batch_size(),
writer_version: props.writer_version().into(),
dictionary_page_size_limit: props.dictionary_page_size_limit(),
max_row_group_size: props.max_row_group_size(),
max_row_group_size: props
.max_row_group_row_count()
.unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT),
created_by: props.created_by().to_string(),
column_index_truncate_length: props.column_index_truncate_length(),
statistics_truncate_length: props.statistics_truncate_length(),
+2 -2
View File
@@ -45,7 +45,7 @@ const NUM_BATCHES: usize = 2048;
/// The number of rows in each record batch to write
const WRITE_RECORD_BATCH_SIZE: usize = 1024;
/// The number of rows in a row group
const ROW_GROUP_SIZE: usize = 1024 * 1024;
const ROW_GROUP_ROW_COUNT: usize = 1024 * 1024;
/// The number of row groups expected
const EXPECTED_ROW_GROUPS: usize = 2;
@@ -154,7 +154,7 @@ fn generate_file() -> NamedTempFile {
let properties = WriterProperties::builder()
.set_writer_version(WriterVersion::PARQUET_2_0)
.set_max_row_group_size(ROW_GROUP_SIZE)
.set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT))
.build();
let mut writer =
@@ -40,7 +40,7 @@ const NUM_BATCHES: usize = 128;
/// The number of rows in each record batch to write
const WRITE_RECORD_BATCH_SIZE: usize = 4096;
/// The number of rows in a row group
const ROW_GROUP_SIZE: usize = 65536;
const ROW_GROUP_ROW_COUNT: usize = 65536;
/// The number of row groups expected
const EXPECTED_ROW_GROUPS: usize = 8;
/// The range for random string lengths
@@ -114,7 +114,7 @@ fn generate_file() -> NamedTempFile {
let properties = WriterProperties::builder()
.set_writer_version(WriterVersion::PARQUET_2_0)
.set_max_row_group_size(ROW_GROUP_SIZE)
.set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT))
.build();
let mut writer =
+1 -1
View File
@@ -25,9 +25,9 @@ use datafusion_execution::object_store::ObjectStoreUrl;
use datafusion_physical_optimizer::PhysicalOptimizerRule;
use datafusion_physical_optimizer::filter_pushdown::FilterPushdown;
use datafusion_physical_plan::ExecutionPlan;
use object_store::ObjectStore;
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectStore, ObjectStoreExt};
use parquet::arrow::ArrowWriter;
use std::sync::Arc;
+1 -1
View File
@@ -31,7 +31,7 @@ use datafusion::{
use datafusion_execution::runtime_env::RuntimeEnv;
use itertools::Itertools;
use object_store::{
ObjectStore,
ObjectStore, ObjectStoreExt,
memory::InMemory,
path::Path,
throttle::{ThrottleConfig, ThrottledStore},
@@ -65,7 +65,8 @@ mod tests {
use object_store::path::Path;
use object_store::{
Attributes, GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload,
ObjectMeta, ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult,
ObjectMeta, ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions,
PutPayload, PutResult,
};
use regex::Regex;
use rstest::*;
@@ -104,10 +105,6 @@ mod tests {
unimplemented!()
}
async fn get(&self, location: &Path) -> object_store::Result<GetResult> {
self.get_opts(location, GetOptions::default()).await
}
async fn get_opts(
&self,
location: &Path,
@@ -147,14 +144,6 @@ mod tests {
unimplemented!()
}
async fn head(&self, _location: &Path) -> object_store::Result<ObjectMeta> {
unimplemented!()
}
async fn delete(&self, _location: &Path) -> object_store::Result<()> {
unimplemented!()
}
fn list(
&self,
_prefix: Option<&Path>,
@@ -169,17 +158,21 @@ mod tests {
unimplemented!()
}
async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> {
unimplemented!()
}
async fn copy_if_not_exists(
async fn copy_opts(
&self,
_from: &Path,
_to: &Path,
_options: object_store::CopyOptions,
) -> object_store::Result<()> {
unimplemented!()
}
/// Stub: this mock store does not implement bulk deletion.
///
/// Any call panics via `unimplemented!()`.
fn delete_stream(
&self,
_locations: BoxStream<'static, object_store::Result<Path>>,
) -> BoxStream<'static, object_store::Result<Path>> {
unimplemented!()
}
}
impl VariableStream {
@@ -156,8 +156,8 @@ mod tests {
use futures::StreamExt;
use futures::stream::BoxStream;
use insta::assert_snapshot;
use object_store::ObjectMeta;
use object_store::local::LocalFileSystem;
use object_store::{CopyOptions, ObjectMeta};
use object_store::{
GetOptions, GetResult, ListResult, MultipartUpload, ObjectStore,
PutMultipartOptions, PutOptions, PutPayload, PutResult, path::Path,
@@ -165,7 +165,8 @@ mod tests {
use parquet::arrow::ParquetRecordBatchStreamBuilder;
use parquet::arrow::arrow_reader::ArrowReaderOptions;
use parquet::file::metadata::{
KeyValue, ParquetColumnIndex, ParquetMetaData, ParquetOffsetIndex,
KeyValue, PageIndexPolicy, ParquetColumnIndex, ParquetMetaData,
ParquetOffsetIndex,
};
use parquet::file::page_index::column_index::ColumnIndexMetaData;
use tokio::fs::File;
@@ -310,7 +311,7 @@ mod tests {
_payload: PutPayload,
_opts: PutOptions,
) -> object_store::Result<PutResult> {
Err(object_store::Error::NotImplemented)
unimplemented!()
}
async fn put_multipart_opts(
@@ -318,7 +319,7 @@ mod tests {
_location: &Path,
_opts: PutMultipartOptions,
) -> object_store::Result<Box<dyn MultipartUpload>> {
Err(object_store::Error::NotImplemented)
unimplemented!()
}
async fn get_opts(
@@ -330,40 +331,34 @@ mod tests {
self.inner.get_opts(location, options).await
}
async fn head(&self, _location: &Path) -> object_store::Result<ObjectMeta> {
Err(object_store::Error::NotImplemented)
}
async fn delete(&self, _location: &Path) -> object_store::Result<()> {
Err(object_store::Error::NotImplemented)
fn delete_stream(
&self,
_locations: BoxStream<'static, object_store::Result<Path>>,
) -> BoxStream<'static, object_store::Result<Path>> {
unimplemented!()
}
fn list(
&self,
_prefix: Option<&Path>,
) -> BoxStream<'static, object_store::Result<ObjectMeta>> {
Box::pin(futures::stream::once(async {
Err(object_store::Error::NotImplemented)
}))
unimplemented!()
}
async fn list_with_delimiter(
&self,
_prefix: Option<&Path>,
) -> object_store::Result<ListResult> {
Err(object_store::Error::NotImplemented)
unimplemented!()
}
async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> {
Err(object_store::Error::NotImplemented)
}
async fn copy_if_not_exists(
async fn copy_opts(
&self,
_from: &Path,
_to: &Path,
_options: CopyOptions,
) -> object_store::Result<()> {
Err(object_store::Error::NotImplemented)
unimplemented!()
}
}
@@ -1105,7 +1100,8 @@ mod tests {
let testdata = datafusion_common::test_util::parquet_test_data();
let path = format!("{testdata}/alltypes_tiny_pages.parquet");
let file = File::open(path).await?;
let options = ArrowReaderOptions::new().with_page_index(true);
let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let builder =
ParquetRecordBatchStreamBuilder::new_with_options(file, options.clone())
.await?
@@ -31,7 +31,7 @@ mod tests {
use datafusion_datasource::TableSchema;
use datafusion_datasource_csv::CsvFormat;
use object_store::ObjectStore;
use object_store::{ObjectStore, ObjectStoreExt};
use crate::datasource::file_format::FileFormat;
use crate::prelude::CsvReadOptions;
+33 -34
View File
@@ -27,6 +27,7 @@ use crate::{
prelude::SessionContext,
};
use futures::{FutureExt, stream::BoxStream};
use object_store::{CopyOptions, ObjectStoreExt};
use std::{
fmt::{Debug, Display, Formatter},
sync::Arc,
@@ -130,39 +131,40 @@ impl ObjectStore for BlockingObjectStore {
location: &Path,
options: GetOptions,
) -> object_store::Result<GetResult> {
self.inner.get_opts(location, options).await
}
async fn head(&self, location: &Path) -> object_store::Result<ObjectMeta> {
println!(
"{} received head call for {location}",
BlockingObjectStore::NAME
);
// Wait until the expected number of concurrent calls is reached, but timeout after 1 second to avoid hanging failing tests.
let wait_result = timeout(Duration::from_secs(1), self.barrier.wait()).await;
match wait_result {
Ok(_) => println!(
"{} barrier reached for {location}",
if options.head {
println!(
"{} received head call for {location}",
BlockingObjectStore::NAME
),
Err(_) => {
let error_message = format!(
"{} barrier wait timed out for {location}",
);
// Wait until the expected number of concurrent calls is reached, but timeout after 1 second to avoid hanging failing tests.
let wait_result = timeout(Duration::from_secs(1), self.barrier.wait()).await;
match wait_result {
Ok(_) => println!(
"{} barrier reached for {location}",
BlockingObjectStore::NAME
);
log::error!("{error_message}");
return Err(Error::Generic {
store: BlockingObjectStore::NAME,
source: error_message.into(),
});
),
Err(_) => {
let error_message = format!(
"{} barrier wait timed out for {location}",
BlockingObjectStore::NAME
);
log::error!("{error_message}");
return Err(Error::Generic {
store: BlockingObjectStore::NAME,
source: error_message.into(),
});
}
}
}
// Forward the call to the inner object store.
self.inner.head(location).await
}
async fn delete(&self, location: &Path) -> object_store::Result<()> {
self.inner.delete(location).await
// Forward the call to the inner object store.
self.inner.get_opts(location, options).await
}
/// Hands the stream of locations to delete straight to the wrapped store
/// and returns whatever stream it produces.
fn delete_stream(
    &self,
    locations: BoxStream<'static, object_store::Result<Path>>,
) -> BoxStream<'static, object_store::Result<Path>> {
    let wrapped = &self.inner;
    wrapped.delete_stream(locations)
}
fn list(
@@ -179,15 +181,12 @@ impl ObjectStore for BlockingObjectStore {
self.inner.list_with_delimiter(prefix).await
}
async fn copy(&self, from: &Path, to: &Path) -> object_store::Result<()> {
self.inner.copy(from, to).await
}
async fn copy_if_not_exists(
async fn copy_opts(
&self,
from: &Path,
to: &Path,
options: CopyOptions,
) -> object_store::Result<()> {
self.inner.copy_if_not_exists(from, to).await
self.inner.copy_opts(from, to, options).await
}
}
@@ -19,7 +19,7 @@ use std::sync::Arc;
use arrow_schema::DataType;
use futures::{FutureExt, StreamExt as _, TryStreamExt as _};
use object_store::{ObjectStore as _, memory::InMemory, path::Path};
use object_store::{ObjectStoreExt, memory::InMemory, path::Path};
use datafusion::execution::SessionStateBuilder;
use datafusion_catalog_listing::helpers::{
+6 -3
View File
@@ -6534,7 +6534,7 @@ async fn test_fill_null_all_columns() -> Result<()> {
async fn test_insert_into_casting_support() -> Result<()> {
// Testing case1:
// Inserting query schema mismatch: Expected table field 'a' with type Float16, but got 'a' with type Binary.
// And the cast is not supported from Utf8 to Float16.
// And the cast is not supported from Binary to Float16.
// Create a new schema with one field called "a" of type Float16, and setting nullable to false
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float16, false)]));
@@ -6545,7 +6545,10 @@ async fn test_insert_into_casting_support() -> Result<()> {
let initial_table = Arc::new(MemTable::try_new(schema.clone(), vec![vec![]])?);
session_ctx.register_table("t", initial_table.clone())?;
let mut write_df = session_ctx.sql("values ('a123'), ('b456')").await.unwrap();
let mut write_df = session_ctx
.sql("values (x'a123'), (x'b456')")
.await
.unwrap();
write_df = write_df
.clone()
@@ -6559,7 +6562,7 @@ async fn test_insert_into_casting_support() -> Result<()> {
assert_contains!(
e.to_string(),
"Inserting query schema mismatch: Expected table field 'a' with type Float16, but got 'a' with type Utf8."
"Inserting query schema mismatch: Expected table field 'a' with type Float16, but got 'a' with type Binary."
);
// Testing case2:
@@ -36,8 +36,9 @@ use insta::assert_snapshot;
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{
GetOptions, GetRange, GetResult, ListResult, MultipartUpload, ObjectMeta,
ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult,
CopyOptions, GetOptions, GetRange, GetResult, ListResult, MultipartUpload,
ObjectMeta, ObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, PutPayload,
PutResult,
};
use parking_lot::Mutex;
use std::fmt;
@@ -54,8 +55,8 @@ async fn create_single_csv_file() {
@r"
RequestCountingObjectStore()
Total Requests: 2
- HEAD path=csv_table.csv
- GET path=csv_table.csv
- GET (opts) path=csv_table.csv head=true
- GET (opts) path=csv_table.csv
"
);
}
@@ -76,7 +77,7 @@ async fn query_single_csv_file() {
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 2
- HEAD path=csv_table.csv
- GET (opts) path=csv_table.csv head=true
- GET (opts) path=csv_table.csv
"
);
@@ -91,9 +92,9 @@ async fn create_multi_file_csv_file() {
RequestCountingObjectStore()
Total Requests: 4
- LIST prefix=data
- GET path=data/file_0.csv
- GET path=data/file_1.csv
- GET path=data/file_2.csv
- GET (opts) path=data/file_0.csv
- GET (opts) path=data/file_1.csv
- GET (opts) path=data/file_2.csv
"
);
}
@@ -351,8 +352,8 @@ async fn create_single_parquet_file_default() {
@r"
RequestCountingObjectStore()
Total Requests: 2
- HEAD path=parquet_table.parquet
- GET (range) range=0-2994 path=parquet_table.parquet
- GET (opts) path=parquet_table.parquet head=true
- GET (opts) path=parquet_table.parquet range=0-2994
"
);
}
@@ -370,8 +371,8 @@ async fn create_single_parquet_file_prefetch() {
@r"
RequestCountingObjectStore()
Total Requests: 2
- HEAD path=parquet_table.parquet
- GET (range) range=1994-2994 path=parquet_table.parquet
- GET (opts) path=parquet_table.parquet head=true
- GET (opts) path=parquet_table.parquet range=1994-2994
"
);
}
@@ -399,10 +400,10 @@ async fn create_single_parquet_file_too_small_prefetch() {
@r"
RequestCountingObjectStore()
Total Requests: 4
- HEAD path=parquet_table.parquet
- GET (range) range=2494-2994 path=parquet_table.parquet
- GET (range) range=2264-2986 path=parquet_table.parquet
- GET (range) range=2124-2264 path=parquet_table.parquet
- GET (opts) path=parquet_table.parquet head=true
- GET (opts) path=parquet_table.parquet range=2494-2994
- GET (opts) path=parquet_table.parquet range=2264-2986
- GET (opts) path=parquet_table.parquet range=2124-2264
"
);
}
@@ -431,9 +432,9 @@ async fn create_single_parquet_file_small_prefetch() {
@r"
RequestCountingObjectStore()
Total Requests: 3
- HEAD path=parquet_table.parquet
- GET (range) range=2254-2994 path=parquet_table.parquet
- GET (range) range=2124-2264 path=parquet_table.parquet
- GET (opts) path=parquet_table.parquet head=true
- GET (opts) path=parquet_table.parquet range=2254-2994
- GET (opts) path=parquet_table.parquet range=2124-2264
"
);
}
@@ -455,8 +456,8 @@ async fn create_single_parquet_file_no_prefetch() {
@r"
RequestCountingObjectStore()
Total Requests: 2
- HEAD path=parquet_table.parquet
- GET (range) range=0-2994 path=parquet_table.parquet
- GET (opts) path=parquet_table.parquet head=true
- GET (opts) path=parquet_table.parquet range=0-2994
"
);
}
@@ -476,7 +477,7 @@ async fn query_single_parquet_file() {
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 3
- HEAD path=parquet_table.parquet
- GET (opts) path=parquet_table.parquet head=true
- GET (ranges) path=parquet_table.parquet ranges=4-534,534-1064
- GET (ranges) path=parquet_table.parquet ranges=1064-1594,1594-2124
"
@@ -500,7 +501,7 @@ async fn query_single_parquet_file_with_single_predicate() {
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 2
- HEAD path=parquet_table.parquet
- GET (opts) path=parquet_table.parquet head=true
- GET (ranges) path=parquet_table.parquet ranges=1064-1481,1481-1594,1594-2011,2011-2124
"
);
@@ -524,7 +525,7 @@ async fn query_single_parquet_file_multi_row_groups_multiple_predicates() {
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 3
- HEAD path=parquet_table.parquet
- GET (opts) path=parquet_table.parquet head=true
- GET (ranges) path=parquet_table.parquet ranges=4-421,421-534,534-951,951-1064
- GET (ranges) path=parquet_table.parquet ranges=1064-1481,1481-1594,1594-2011,2011-2124
"
@@ -701,7 +702,7 @@ impl Test {
let mut buffer = vec![];
let props = parquet::file::properties::WriterProperties::builder()
.set_max_row_group_size(100)
.set_max_row_group_row_count(Some(100))
.build();
let mut writer = parquet::arrow::ArrowWriter::try_new(
&mut buffer,
@@ -752,11 +753,8 @@ impl Test {
/// Details of individual requests made through the [`RequestCountingObjectStore`]
#[derive(Clone, Debug)]
enum RequestDetails {
Get { path: Path },
GetOpts { path: Path, get_options: GetOptions },
GetRanges { path: Path, ranges: Vec<Range<u64>> },
GetRange { path: Path, range: Range<u64> },
Head { path: Path },
List { prefix: Option<Path> },
ListWithDelimiter { prefix: Option<Path> },
ListWithOffset { prefix: Option<Path>, offset: Path },
@@ -774,9 +772,6 @@ fn display_range(range: &Range<u64>) -> impl Display + '_ {
impl Display for RequestDetails {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
RequestDetails::Get { path } => {
write!(f, "GET path={path}")
}
RequestDetails::GetOpts { path, get_options } => {
write!(f, "GET (opts) path={path}")?;
if let Some(range) = &get_options.range {
@@ -814,13 +809,6 @@ impl Display for RequestDetails {
}
Ok(())
}
RequestDetails::GetRange { path, range } => {
let range = display_range(range);
write!(f, "GET (range) range={range} path={path}")
}
RequestDetails::Head { path } => {
write!(f, "HEAD path={path}")
}
RequestDetails::List { prefix } => {
write!(f, "LIST")?;
if let Some(prefix) = prefix {
@@ -893,7 +881,7 @@ impl ObjectStore for RequestCountingObjectStore {
_payload: PutPayload,
_opts: PutOptions,
) -> object_store::Result<PutResult> {
Err(object_store::Error::NotImplemented)
unimplemented!()
}
async fn put_multipart_opts(
@@ -901,15 +889,7 @@ impl ObjectStore for RequestCountingObjectStore {
_location: &Path,
_opts: PutMultipartOptions,
) -> object_store::Result<Box<dyn MultipartUpload>> {
Err(object_store::Error::NotImplemented)
}
async fn get(&self, location: &Path) -> object_store::Result<GetResult> {
let result = self.inner.get(location).await?;
self.requests.lock().push(RequestDetails::Get {
path: location.to_owned(),
});
Ok(result)
unimplemented!()
}
async fn get_opts(
@@ -925,19 +905,6 @@ impl ObjectStore for RequestCountingObjectStore {
Ok(result)
}
async fn get_range(
&self,
location: &Path,
range: Range<u64>,
) -> object_store::Result<Bytes> {
let result = self.inner.get_range(location, range.clone()).await?;
self.requests.lock().push(RequestDetails::GetRange {
path: location.to_owned(),
range: range.clone(),
});
Ok(result)
}
async fn get_ranges(
&self,
location: &Path,
@@ -951,18 +918,6 @@ impl ObjectStore for RequestCountingObjectStore {
Ok(result)
}
async fn head(&self, location: &Path) -> object_store::Result<ObjectMeta> {
let result = self.inner.head(location).await?;
self.requests.lock().push(RequestDetails::Head {
path: location.to_owned(),
});
Ok(result)
}
async fn delete(&self, _location: &Path) -> object_store::Result<()> {
Err(object_store::Error::NotImplemented)
}
fn list(
&self,
prefix: Option<&Path>,
@@ -998,15 +953,19 @@ impl ObjectStore for RequestCountingObjectStore {
self.inner.list_with_delimiter(prefix).await
}
async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> {
Err(object_store::Error::NotImplemented)
fn delete_stream(
&self,
_locations: BoxStream<'static, object_store::Result<Path>>,
) -> BoxStream<'static, object_store::Result<Path>> {
unimplemented!()
}
async fn copy_if_not_exists(
async fn copy_opts(
&self,
_from: &Path,
_to: &Path,
_options: CopyOptions,
) -> object_store::Result<()> {
Err(object_store::Error::NotImplemented)
unimplemented!()
}
}
+3 -1
View File
@@ -31,7 +31,9 @@ use datafusion_execution::object_store::ObjectStoreUrl;
use datafusion_physical_expr::PhysicalExpr;
use datafusion_physical_plan::{ExecutionPlan, collect, filter::FilterExec};
use itertools::Itertools;
use object_store::{ObjectStore, PutPayload, memory::InMemory, path::Path};
use object_store::{
ObjectStore, ObjectStoreExt, PutPayload, memory::InMemory, path::Path,
};
use parquet::{
arrow::ArrowWriter,
file::properties::{EnabledStatistics, WriterProperties},
@@ -31,7 +31,7 @@ use datafusion_execution::object_store::ObjectStoreUrl;
use itertools::Itertools;
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectStore, PutPayload};
use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
use parquet::arrow::ArrowWriter;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
@@ -43,7 +43,7 @@ use futures::{FutureExt, TryFutureExt};
use insta::assert_snapshot;
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectMeta, ObjectStore};
use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt};
use parquet::arrow::ArrowWriter;
use parquet::arrow::arrow_reader::ArrowReaderOptions;
use parquet::arrow::async_reader::AsyncFileReader;
@@ -37,7 +37,7 @@ use datafusion_physical_expr_adapter::{
DefaultPhysicalExprAdapter, DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter,
PhysicalExprAdapterFactory,
};
use object_store::{ObjectStore, memory::InMemory, path::Path};
use object_store::{ObjectStore, ObjectStoreExt, memory::InMemory, path::Path};
use parquet::arrow::ArrowWriter;
async fn write_parquet(batch: RecordBatch, store: Arc<dyn ObjectStore>, path: &str) {
@@ -409,7 +409,7 @@ fn get_test_data() -> TestData {
.expect("tempfile creation");
let props = WriterProperties::builder()
.set_max_row_group_size(row_per_group)
.set_max_row_group_row_count(Some(row_per_group))
.build();
let batches = create_data_batch(scenario);
@@ -63,7 +63,7 @@ async fn single_file() {
// Set the row group size smaller so we can test with fewer rows
let props = WriterProperties::builder()
.set_max_row_group_size(1024)
.set_max_row_group_row_count(Some(1024))
.build();
// Only create the parquet file once as it is fairly large
@@ -230,7 +230,7 @@ async fn single_file_small_data_pages() {
// Set a low row count limit to improve page filtering
let props = WriterProperties::builder()
.set_max_row_group_size(2048)
.set_max_row_group_row_count(Some(2048))
.set_data_page_row_count_limit(512)
.set_write_batch_size(512)
.build();
+1 -1
View File
@@ -1148,7 +1148,7 @@ async fn make_test_file_rg(
.expect("tempfile creation");
let props = WriterProperties::builder()
.set_max_row_group_size(row_per_group)
.set_max_row_group_row_count(Some(row_per_group))
.set_bloom_filter_enabled(true)
.set_statistics_enabled(EnabledStatistics::Page)
.build();
@@ -49,7 +49,7 @@ use datafusion_physical_plan::{
collect, displayable, ExecutionPlan, Partitioning,
};
use object_store::ObjectStore;
use object_store::ObjectStoreExt;
use object_store::memory::InMemory;
use rstest::rstest;
use url::Url;
+35 -30
View File
@@ -20,7 +20,6 @@
use std::collections::BTreeSet;
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::ops::Range;
use std::sync::Arc;
use arrow::datatypes::DataType;
@@ -43,9 +42,12 @@ use datafusion_execution::config::SessionConfig;
use async_trait::async_trait;
use bytes::Bytes;
use chrono::{TimeZone, Utc};
use futures::StreamExt;
use futures::stream::{self, BoxStream};
use insta::assert_snapshot;
use object_store::{Attributes, MultipartUpload, PutMultipartOptions, PutPayload};
use object_store::{
Attributes, CopyOptions, GetRange, MultipartUpload, PutMultipartOptions, PutPayload,
};
use object_store::{
GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore,
PutOptions, PutResult, path::Path,
@@ -620,7 +622,7 @@ async fn create_partitioned_alltypes_parquet_table(
}
#[derive(Debug)]
/// An object store implem that is mirrors a given file to multiple paths.
/// An object store implem that mirrors a given file to multiple paths.
pub struct MirroringObjectStore {
/// The paths of the files that "exist" in the store
files: Vec<Path>,
@@ -669,12 +671,13 @@ impl ObjectStore for MirroringObjectStore {
async fn get_opts(
&self,
location: &Path,
_options: GetOptions,
options: GetOptions,
) -> object_store::Result<GetResult> {
self.files.iter().find(|x| *x == location).unwrap();
let path = std::path::PathBuf::from(&self.mirrored_file);
let file = File::open(&path).unwrap();
let metadata = file.metadata().unwrap();
let meta = ObjectMeta {
location: location.clone(),
last_modified: metadata.modified().map(chrono::DateTime::from).unwrap(),
@@ -683,37 +686,35 @@ impl ObjectStore for MirroringObjectStore {
version: None,
};
let payload = if options.head {
// no content for head requests
GetResultPayload::Stream(stream::empty().boxed())
} else if let Some(range) = options.range {
let GetRange::Bounded(range) = range else {
unimplemented!("Unbounded range not supported in MirroringObjectStore");
};
let mut file = File::open(path).unwrap();
file.seek(SeekFrom::Start(range.start)).unwrap();
let to_read = range.end - range.start;
let to_read: usize = to_read.try_into().unwrap();
let mut data = Vec::with_capacity(to_read);
let read = file.take(to_read as u64).read_to_end(&mut data).unwrap();
assert_eq!(read, to_read);
let stream = stream::once(async move { Ok(Bytes::from(data)) }).boxed();
GetResultPayload::Stream(stream)
} else {
GetResultPayload::File(file, path)
};
Ok(GetResult {
range: 0..meta.size,
payload: GetResultPayload::File(file, path),
payload,
meta,
attributes: Attributes::default(),
})
}
async fn get_range(
&self,
location: &Path,
range: Range<u64>,
) -> object_store::Result<Bytes> {
self.files.iter().find(|x| *x == location).unwrap();
let path = std::path::PathBuf::from(&self.mirrored_file);
let mut file = File::open(path).unwrap();
file.seek(SeekFrom::Start(range.start)).unwrap();
let to_read = range.end - range.start;
let to_read: usize = to_read.try_into().unwrap();
let mut data = Vec::with_capacity(to_read);
let read = file.take(to_read as u64).read_to_end(&mut data).unwrap();
assert_eq!(read, to_read);
Ok(data.into())
}
async fn delete(&self, _location: &Path) -> object_store::Result<()> {
unimplemented!()
}
fn list(
&self,
prefix: Option<&Path>,
@@ -783,14 +784,18 @@ impl ObjectStore for MirroringObjectStore {
})
}
async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> {
fn delete_stream(
&self,
_locations: BoxStream<'static, object_store::Result<Path>>,
) -> BoxStream<'static, object_store::Result<Path>> {
unimplemented!()
}
async fn copy_if_not_exists(
async fn copy_opts(
&self,
_from: &Path,
_to: &Path,
_options: CopyOptions,
) -> object_store::Result<()> {
unimplemented!()
}
@@ -18,10 +18,11 @@
//! Object store implementation used for testing
use crate::tracing::asserting_tracer::assert_traceability;
use futures::StreamExt;
use futures::stream::BoxStream;
use object_store::{
GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore,
PutMultipartOptions, PutOptions, PutPayload, PutResult, path::Path,
CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta,
ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, path::Path,
};
use std::fmt::{Debug, Display, Formatter};
use std::sync::Arc;
@@ -83,14 +84,17 @@ impl ObjectStore for TraceableObjectStore {
self.inner.get_opts(location, options).await
}
async fn head(&self, location: &Path) -> object_store::Result<ObjectMeta> {
assert_traceability().await;
self.inner.head(location).await
}
async fn delete(&self, location: &Path) -> object_store::Result<()> {
assert_traceability().await;
self.inner.delete(location).await
/// Forwards the delete stream to the inner store and runs the traceability
/// assertion once for every result the inner stream yields.
///
/// Each result is passed through unchanged after the assertion completes.
fn delete_stream(
    &self,
    locations: BoxStream<'static, object_store::Result<Path>>,
) -> BoxStream<'static, object_store::Result<Path>> {
    self.inner
        .delete_stream(locations)
        .then(|res| async move {
            // Await the check instead of driving it with a nested
            // `block_on`: blocking inside a future stalls the executor
            // thread that is polling this stream.
            assert_traceability().await;
            res
        })
        .boxed()
}
fn list(
@@ -109,17 +113,13 @@ impl ObjectStore for TraceableObjectStore {
self.inner.list_with_delimiter(prefix).await
}
async fn copy(&self, from: &Path, to: &Path) -> object_store::Result<()> {
assert_traceability().await;
self.inner.copy(from, to).await
}
async fn copy_if_not_exists(
async fn copy_opts(
&self,
from: &Path,
to: &Path,
options: CopyOptions,
) -> object_store::Result<()> {
assert_traceability().await;
self.inner.copy_if_not_exists(from, to).await
self.inner.copy_opts(from, to, options).await
}
}
@@ -63,7 +63,8 @@ use datafusion_session::Session;
use futures::StreamExt;
use futures::stream::BoxStream;
use object_store::{
GetOptions, GetRange, GetResultPayload, ObjectMeta, ObjectStore, path::Path,
GetOptions, GetRange, GetResultPayload, ObjectMeta, ObjectStore, ObjectStoreExt,
path::Path,
};
use tokio::io::AsyncWriteExt;
+1 -1
View File
@@ -52,7 +52,7 @@ use datafusion_datasource::file_stream::FileOpenFuture;
use datafusion_datasource::file_stream::FileOpener;
use futures::StreamExt;
use itertools::Itertools;
use object_store::{GetOptions, GetRange, GetResultPayload, ObjectStore};
use object_store::{GetOptions, GetRange, GetResultPayload, ObjectStore, ObjectStoreExt};
/// Enum indicating which Arrow IPC format to use
#[derive(Clone, Copy, Debug)]
@@ -41,7 +41,7 @@ use datafusion_physical_plan::ExecutionPlan;
use datafusion_session::Session;
use async_trait::async_trait;
use object_store::{GetResultPayload, ObjectMeta, ObjectStore};
use object_store::{GetResultPayload, ObjectMeta, ObjectStore, ObjectStoreExt};
#[derive(Default)]
/// Factory struct used to create [`AvroFormat`]
+1 -1
View File
@@ -147,7 +147,7 @@ mod private {
use bytes::Buf;
use datafusion_datasource::{PartitionedFile, file_stream::FileOpenFuture};
use futures::StreamExt;
use object_store::{GetResultPayload, ObjectStore};
use object_store::{GetResultPayload, ObjectStore, ObjectStoreExt};
pub struct AvroOpener {
pub config: Arc<AvroSource>,
+3 -1
View File
@@ -60,7 +60,9 @@ use bytes::{Buf, Bytes};
use datafusion_datasource::source::DataSourceExec;
use futures::stream::BoxStream;
use futures::{Stream, StreamExt, TryStreamExt, pin_mut};
use object_store::{ObjectMeta, ObjectStore, delimited::newline_delimited_stream};
use object_store::{
ObjectMeta, ObjectStore, ObjectStoreExt, delimited::newline_delimited_stream,
};
use regex::Regex;
#[derive(Default)]
@@ -61,7 +61,7 @@ use datafusion_session::Session;
use crate::utils::JsonArrayToNdjsonReader;
use async_trait::async_trait;
use object_store::{GetResultPayload, ObjectMeta, ObjectStore};
use object_store::{GetResultPayload, ObjectMeta, ObjectStore, ObjectStoreExt};
#[derive(Default)]
/// Factory struct used to create [JsonFormat]
+1 -1
View File
@@ -509,9 +509,9 @@ mod tests {
use bytes::Bytes;
use datafusion_datasource::FileRange;
use futures::TryStreamExt;
use object_store::PutPayload;
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectStoreExt, PutPayload};
/// Helper to create a test schema
fn test_schema() -> SchemaRef {
@@ -34,9 +34,9 @@ use parquet::arrow::{ArrowWriter, ProjectionMask};
use parquet::file::properties::WriterProperties;
use tempfile::TempDir;
const ROW_GROUP_SIZE: usize = 10_000;
const ROW_GROUP_ROW_COUNT: usize = 10_000;
const TOTAL_ROW_GROUPS: usize = 10;
const TOTAL_ROWS: usize = ROW_GROUP_SIZE * TOTAL_ROW_GROUPS;
const TOTAL_ROWS: usize = ROW_GROUP_ROW_COUNT * TOTAL_ROW_GROUPS;
const TARGET_VALUE: &str = "target_value";
const COLUMN_NAME: &str = "list_col";
const PAYLOAD_COLUMN_NAME: &str = "payload";
@@ -69,7 +69,7 @@ fn parquet_nested_filter_pushdown(c: &mut Criterion) {
b.iter(|| {
let matched = scan_with_predicate(&dataset_path, &predicate, false)
.expect("baseline parquet scan with filter succeeded");
assert_eq!(matched, ROW_GROUP_SIZE);
assert_eq!(matched, ROW_GROUP_ROW_COUNT);
});
});
@@ -79,7 +79,7 @@ fn parquet_nested_filter_pushdown(c: &mut Criterion) {
b.iter(|| {
let matched = scan_with_predicate(&dataset_path, &predicate, true)
.expect("pushdown parquet scan with filter succeeded");
assert_eq!(matched, ROW_GROUP_SIZE);
assert_eq!(matched, ROW_GROUP_ROW_COUNT);
});
});
@@ -170,7 +170,7 @@ fn create_dataset() -> datafusion_common::Result<BenchmarkDataset> {
]));
let writer_props = WriterProperties::builder()
.set_max_row_group_size(ROW_GROUP_SIZE)
.set_max_row_group_row_count(Some(ROW_GROUP_ROW_COUNT))
.build();
let mut writer = ArrowWriter::try_new(
@@ -195,7 +195,7 @@ fn create_dataset() -> datafusion_common::Result<BenchmarkDataset> {
];
for value in sorted_values {
let batch = build_list_batch(&schema, value, ROW_GROUP_SIZE)?;
let batch = build_list_batch(&schema, value, ROW_GROUP_ROW_COUNT)?;
writer.write(&batch)?;
}
@@ -70,7 +70,7 @@ use futures::future::BoxFuture;
use futures::{FutureExt, StreamExt, TryStreamExt};
use object_store::buffered::BufWriter;
use object_store::path::Path;
use object_store::{ObjectMeta, ObjectStore};
use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt};
use parquet::arrow::arrow_writer::{
ArrowColumnChunk, ArrowColumnWriter, ArrowLeafColumn, ArrowRowGroupWriterFactory,
ArrowWriterOptions, compute_leaves,
@@ -82,7 +82,9 @@ use parquet::basic::Type;
use parquet::encryption::encrypt::FileEncryptionProperties;
use parquet::errors::ParquetError;
use parquet::file::metadata::{ParquetMetaData, SortingColumn};
use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder};
use parquet::file::properties::{
DEFAULT_MAX_ROW_GROUP_ROW_COUNT, WriterProperties, WriterPropertiesBuilder,
};
use parquet::file::writer::SerializedFileWriter;
use parquet::schema::types::SchemaDescriptor;
use tokio::io::{AsyncWrite, AsyncWriteExt};
@@ -1587,7 +1589,9 @@ fn spawn_parquet_parallel_serialization_task(
) -> SpawnedTask<Result<(), DataFusionError>> {
SpawnedTask::spawn(async move {
let max_buffer_rb = parallel_options.max_buffered_record_batches_per_stream;
let max_row_group_rows = writer_props.max_row_group_size();
let max_row_group_rows = writer_props
.max_row_group_row_count()
.unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT);
let mut row_group_index = 0;
let col_writers =
row_group_writer_factory.create_column_writers(row_group_index)?;
+6 -5
View File
@@ -348,7 +348,8 @@ impl FileOpener for ParquetOpener {
// unnecessary I/O. We decide later if it is needed to evaluate the
// pruning predicates. Thus default to not requesting it from the
// underlying reader.
let mut options = ArrowReaderOptions::new().with_page_index(false);
let mut options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Skip);
#[cfg(feature = "parquet_encryption")]
if let Some(fd_val) = file_decryption_properties {
options = options.with_file_decryption_properties(Arc::clone(&fd_val));
@@ -1037,7 +1038,7 @@ mod test {
};
use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
use futures::{Stream, StreamExt};
use object_store::{ObjectStore, memory::InMemory, path::Path};
use object_store::{ObjectStore, ObjectStoreExt, memory::InMemory, path::Path};
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;
@@ -1734,7 +1735,7 @@ mod test {
// Write parquet file with multiple row groups
// Force small row groups by setting max_row_group_row_count
let props = WriterProperties::builder()
.set_max_row_group_size(3) // Force each batch into its own row group
.set_max_row_group_row_count(Some(3)) // Force each batch into its own row group
.build();
let data_len = write_parquet_batches(
@@ -1834,7 +1835,7 @@ mod test {
.unwrap(); // 4 rows
let props = WriterProperties::builder()
.set_max_row_group_size(4)
.set_max_row_group_row_count(Some(4))
.build();
let data_len = write_parquet_batches(
@@ -1921,7 +1922,7 @@ mod test {
let batch3 = record_batch!(("a", Int32, vec![Some(7), Some(8)])).unwrap();
let props = WriterProperties::builder()
.set_max_row_group_size(2)
.set_max_row_group_row_count(Some(2))
.build();
let data_len = write_parquet_batches(
@@ -662,6 +662,7 @@ mod tests {
use datafusion_expr::{Expr, cast, col, lit};
use datafusion_physical_expr::planner::logical2physical;
use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
use object_store::ObjectStoreExt;
use parquet::arrow::ArrowSchemaConverter;
use parquet::arrow::async_reader::ParquetObjectReader;
use parquet::basic::LogicalType;
@@ -1752,7 +1753,7 @@ mod tests {
pruning_predicate: &PruningPredicate,
) -> Result<RowGroupAccessPlanFilter> {
use datafusion_datasource::PartitionedFile;
use object_store::{ObjectMeta, ObjectStore};
use object_store::ObjectMeta;
let object_meta = ObjectMeta {
location: object_store::path::Path::parse(file_name).expect("creating path"),
+1 -1
View File
@@ -573,7 +573,7 @@ mod tests {
use datafusion_execution::object_store::{
DefaultObjectStoreRegistry, ObjectStoreRegistry,
};
use object_store::{local::LocalFileSystem, path::Path};
use object_store::{ObjectStoreExt, local::LocalFileSystem, path::Path};
use std::{collections::HashMap, ops::Not, sync::Arc};
use url::Url;
+19 -23
View File
@@ -30,7 +30,7 @@ use itertools::Itertools;
use log::debug;
use object_store::path::DELIMITER;
use object_store::path::Path;
use object_store::{ObjectMeta, ObjectStore};
use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt};
use url::Url;
/// A parsed URL identifying files for a listing table, see [`ListingTableUrl::parse`]
@@ -521,8 +521,8 @@ mod tests {
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
use datafusion_physical_plan::ExecutionPlan;
use object_store::{
GetOptions, GetResult, ListResult, MultipartUpload, PutMultipartOptions,
PutPayload,
CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload,
PutMultipartOptions, PutPayload,
};
use std::any::Any;
use std::collections::HashMap;
@@ -1108,7 +1108,14 @@ mod tests {
location: &Path,
options: GetOptions,
) -> object_store::Result<GetResult> {
self.in_mem.get_opts(location, options).await
if options.head && self.forbidden_paths.contains(location) {
Err(object_store::Error::PermissionDenied {
path: location.to_string(),
source: "forbidden".into(),
})
} else {
self.in_mem.get_opts(location, options).await
}
}
async fn get_ranges(
@@ -1119,19 +1126,11 @@ mod tests {
self.in_mem.get_ranges(location, ranges).await
}
async fn head(&self, location: &Path) -> object_store::Result<ObjectMeta> {
if self.forbidden_paths.contains(location) {
Err(object_store::Error::PermissionDenied {
path: location.to_string(),
source: "forbidden".into(),
})
} else {
self.in_mem.head(location).await
}
}
async fn delete(&self, location: &Path) -> object_store::Result<()> {
self.in_mem.delete(location).await
fn delete_stream(
&self,
locations: BoxStream<'static, object_store::Result<Path>>,
) -> BoxStream<'static, object_store::Result<Path>> {
self.in_mem.delete_stream(locations)
}
fn list(
@@ -1148,16 +1147,13 @@ mod tests {
self.in_mem.list_with_delimiter(prefix).await
}
async fn copy(&self, from: &Path, to: &Path) -> object_store::Result<()> {
self.in_mem.copy(from, to).await
}
async fn copy_if_not_exists(
async fn copy_opts(
&self,
from: &Path,
to: &Path,
options: CopyOptions,
) -> object_store::Result<()> {
self.in_mem.copy_if_not_exists(from, to).await
self.in_mem.copy_opts(from, to, options).await
}
}
+20 -5
View File
@@ -392,20 +392,27 @@ impl Range {
}
let stop = if !self.include_upper_bound {
Date32Type::subtract_month_day_nano(stop, step)
Date32Type::subtract_month_day_nano_opt(stop, step).ok_or_else(|| {
exec_datafusion_err!(
"Cannot generate date range where stop {} - {step:?}) overflows",
date32_to_string(stop)
)
})?
} else {
stop
};
let neg = months < 0 || days < 0;
let mut new_date = start;
let mut new_date = Some(start);
let values = from_fn(|| {
if (neg && new_date < stop) || (!neg && new_date > stop) {
let Some(current_date) = new_date else {
return None; // previous overflow
};
if (neg && current_date < stop) || (!neg && current_date > stop) {
None
} else {
let current_date = new_date;
new_date = Date32Type::add_month_day_nano(new_date, step);
new_date = Date32Type::add_month_day_nano_opt(current_date, step);
Some(Some(current_date))
}
});
@@ -578,3 +585,11 @@ fn parse_tz(tz: &Option<&str>) -> Result<Tz> {
Tz::from_str(tz)
.map_err(|op| exec_datafusion_err!("failed to parse timezone {tz}: {:?}", op))
}
/// Renders a `Date32` day count for use in error messages.
///
/// The output is the raw integer followed, in parentheses, by the date that
/// `Date32Type::to_naive_date_opt` converts it to, or by `unknown date` when
/// that conversion returns `None`.
fn date32_to_string(value: i32) -> String {
    match Date32Type::to_naive_date_opt(value) {
        Some(date) => format!("{value} ({date})"),
        None => format!("{value} (unknown date)"),
    }
}
@@ -114,7 +114,11 @@ impl ScalarUDFImpl for SparkLastDay {
}
fn spark_last_day(days: i32) -> Result<i32> {
let date = Date32Type::to_naive_date(days);
let date = Date32Type::to_naive_date_opt(days).ok_or_else(|| {
exec_datafusion_err!(
"Spark `last_day`: Unable to convert days value {days} to date"
)
})?;
let (year, month) = (date.year(), date.month());
let (next_year, next_month) = if month == 12 {
@@ -213,7 +213,7 @@ where
}
fn spark_next_day(days: i32, day_of_week: &str) -> Option<i32> {
let date = Date32Type::to_naive_date(days);
let date = Date32Type::to_naive_date_opt(days)?;
let day_of_week = day_of_week.trim().to_uppercase();
let day_of_week = match day_of_week.as_str() {
@@ -104,7 +104,7 @@ Plan with Metrics
03)----ProjectionExec: expr=[id@0 as id, value@1 as v, value@1 + id@0 as name], metrics=[output_rows=10, <slt:ignore>]
04)------FilterExec: value@1 > 3, metrics=[output_rows=10, <slt:ignore>, selectivity=100% (10/10)]
05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1, metrics=[output_rows=10, <slt:ignore>]
06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet]]}, projection=[id, value], file_type=parquet, predicate=value@1 > 3 AND DynamicFilter [ value@1 IS NULL OR value@1 > 800 ], pruning_predicate=value_null_count@1 != row_count@2 AND value_max@0 > 3 AND (value_null_count@1 > 0 OR value_null_count@1 != row_count@2 AND value_max@0 > 800), required_guarantees=[], metrics=[output_rows=10, <slt:ignore>, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched -> 1 fully matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=210, metadata_load_time=<slt:ignore>, scan_efficiency_ratio=18% (210/1.16 K)]
06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/dynamic_filter_pushdown_config/test_data.parquet]]}, projection=[id, value], file_type=parquet, predicate=value@1 > 3 AND DynamicFilter [ value@1 IS NULL OR value@1 > 800 ], pruning_predicate=value_null_count@1 != row_count@2 AND value_max@0 > 3 AND (value_null_count@1 > 0 OR value_null_count@1 != row_count@2 AND value_max@0 > 800), required_guarantees=[], metrics=[output_rows=10, elapsed_compute=1ns, output_bytes=80.0 B, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched -> 1 fully matched, row_groups_pruned_bloom_filter=1 total → 1 matched, page_index_pages_pruned=1 total → 1 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=210, metadata_load_time=<slt:ignore>, scan_efficiency_ratio=18% (210/1.15 K)]
statement ok
set datafusion.explain.analyze_level = dev;
@@ -268,7 +268,7 @@ physical_plan
06)┌─────────────┴─────────────┐
07)│ DataSourceExec │
08)│ -------------------- │
09)│ bytes: 1040
09)│ bytes: 1024 │
10)│ format: memory │
11)│ rows: 2 │
12)└───────────────────────────┘
@@ -345,7 +345,7 @@ physical_plan
15)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
16)│ DataSourceExec ││ ProjectionExec │
17)│ -------------------- ││ -------------------- │
18)│ bytes: 520 ││ date_col: date_col │
18)│ bytes: 512 ││ date_col: date_col │
19)│ format: memory ││ int_col: int_col │
20)│ rows: 1 ││ │
21)│ ││ string_col: │
@@ -592,7 +592,7 @@ physical_plan
07)┌─────────────┴─────────────┐
08)│ DataSourceExec │
09)│ -------------------- │
10)│ bytes: 520
10)│ bytes: 512 │
11)│ format: memory │
12)│ rows: 1 │
13)└───────────────────────────┘
@@ -954,7 +954,7 @@ physical_plan
13)┌─────────────┴─────────────┐
14)│ DataSourceExec │
15)│ -------------------- │
16)│ bytes: 520
16)│ bytes: 512 │
17)│ format: memory │
18)│ rows: 1 │
19)└───────────────────────────┘
@@ -1305,7 +1305,7 @@ physical_plan
42)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
43)│ DataSourceExec ││ DataSourceExec │
44)│ -------------------- ││ -------------------- │
45)│ bytes: 296 ││ bytes: 288
45)│ bytes: 288 ││ bytes: 280
46)│ format: memory ││ format: memory │
47)│ rows: 1 ││ rows: 1 │
48)└───────────────────────────┘└───────────────────────────┘
@@ -1324,14 +1324,14 @@ physical_plan
04)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
05)│ DataSourceExec ││ ProjectionExec │
06)│ -------------------- ││ -------------------- │
07)│ bytes: 296 ││ id: CAST(id AS Int32) │
07)│ bytes: 288 ││ id: CAST(id AS Int32) │
08)│ format: memory ││ name: name │
09)│ rows: 1 ││ │
10)└───────────────────────────┘└─────────────┬─────────────┘
11)-----------------------------┌─────────────┴─────────────┐
12)-----------------------------│ DataSourceExec │
13)-----------------------------│ -------------------- │
14)-----------------------------│ bytes: 288
14)-----------------------------│ bytes: 280
15)-----------------------------│ format: memory │
16)-----------------------------│ rows: 1 │
17)-----------------------------└───────────────────────────┘
@@ -90,21 +90,15 @@ SELECT make_interval(0, 0, 0, 0, 2147483647, 1, 0.0);
----
NULL
# Intervals being rendered as empty string, see issue:
# https://github.com/apache/datafusion/issues/17455
# We expect something like 0.00 secs with query ?
query T
SELECT make_interval(0, 0, 0, 0, 0, 0, 0.0) || '';
----
(empty)
0 secs
# Intervals being rendered as empty string, see issue:
# https://github.com/apache/datafusion/issues/17455
# We expect something like 0.00 secs with query ?
query T
SELECT make_interval() || '';
----
(empty)
0 secs
query ?
SELECT INTERVAL '1' SECOND AS iv;
+1 -1
View File
@@ -99,7 +99,7 @@ mod test {
use datafusion_physical_plan::collect;
use datafusion_sql::parser::DFParser;
use futures::{StreamExt, TryStreamExt, stream};
use object_store::{ObjectStore, PutPayload, memory::InMemory, path::Path};
use object_store::{ObjectStoreExt, PutPayload, memory::InMemory, path::Path};
use url::Url;
use wasm_bindgen_test::wasm_bindgen_test;
@@ -28,6 +28,17 @@
[#19692]: https://github.com/apache/datafusion/issues/19692
### Upgrade arrow/parquet to 58.0.0 and object_store to 0.13.0
DataFusion 53.0.0 uses `arrow` and `parquet` 58.0.0, and `object_store` 0.13.0.
This may require updates to your Cargo.toml if you have direct dependencies on
these crates.
See the [Arrow 58.0.0 release notes] and the [object_store 0.13.0 upgrade guide] for details on breaking changes in those versions.
[arrow 58.0.0 release notes]: https://github.com/apache/arrow-rs/releases/tag/58.0.0
[object_store 0.13.0 upgrade guide]: https://github.com/apache/arrow-rs-object-store/releases/tag/v0.13.0
### `ExecutionPlan::properties` now returns `&Arc<PlanProperties>`
Now `ExecutionPlan::properties()` returns `&Arc<PlanProperties>` instead of a