From 2c84f243b882eff69806cd7294d38bf422fdb24a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 20 Dec 2024 16:18:55 -0500 Subject: [PATCH 001/503] [object_store]: Version and Changelog for 0.11.2 (#6908) * [object_store]: Version and Changelog for 0.11.2 * increment version * update script * changelog * tweaks * Update object_store/CHANGELOG.md Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/CHANGELOG-old.md | 39 ++++++++++++++ object_store/CHANGELOG.md | 51 ++++++++++--------- object_store/Cargo.toml | 2 +- object_store/dev/release/README.md | 5 +- object_store/dev/release/update_change_log.sh | 4 +- 5 files changed, 72 insertions(+), 29 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index 28dbde4e7b7f..c42689240dd9 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,45 @@ # Historical Changelog + +## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1) + +**Implemented enhancements:** + +- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb)) +- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval)) +- object\_store: enable lint `unreachable_pub` [\#6512](https://github.com/apache/arrow-rs/pull/6512) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer)) +- \[object-store\] Require tokio 1.29.0. 
[\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin)) +- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe)) +- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) +- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin)) +- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu)) +- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace)) +- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- docs: Add parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo)) +- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) + + + ## [object_store_0.11.0](https://github.com/apache/arrow-rs/tree/object_store_0.11.0) (2024-08-12) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.2...object_store_0.11.0) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 95585983572c..0e834c5e2ef2 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,41 +19,42 @@ # Changelog -## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) +## [object_store_0.11.2](https://github.com/apache/arrow-rs/tree/object_store_0.11.2) (2024-12-20) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.1...object_store_0.11.2) **Implemented enhancements:** -- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store's AzureClient should protect against multiple streams performing put\_block in parallel for the same BLOB path [\#6868](https://github.com/apache/arrow-rs/issues/6868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support S3 Put IfMatch [\#6799](https://github.com/apache/arrow-rs/issues/6799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store Azure Government using OAuth [\#6759](https://github.com/apache/arrow-rs/issues/6759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support for AWS Requester Pays buckets [\#6716](https://github.com/apache/arrow-rs/issues/6716) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object-store\]: Implement credential\_process support for S3 [\#6422](https://github.com/apache/arrow-rs/issues/6422) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Conditional put and rename\_if\_not\_exist on S3 [\#6285](https://github.com/apache/arrow-rs/issues/6285) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `object_store` errors when `reqwest` `gzip` feature is enabled [\#6842](https://github.com/apache/arrow-rs/issues/6842) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Multi-part s3 uploads fail when using checksum [\#6793](https://github.com/apache/arrow-rs/issues/6793) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `with_unsigned_payload` shouldn't generate payload hash [\#6697](https://github.com/apache/arrow-rs/issues/6697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[Object\_store\] min\_ttl is too high for GKE tokens [\#6625](https://github.com/apache/arrow-rs/issues/6625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store `test_private_bucket` fails - store: "S3", source: BucketNotFound { bucket: "bloxbender" } [\#6600](https://github.com/apache/arrow-rs/issues/6600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- 
S3 endpoint and trailing slash result in weird/invalid requests [\#6580](https://github.com/apache/arrow-rs/issues/6580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb)) -- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval)) -- object\_store: enable lint `unreachable_pub` [\#6512](https://github.com/apache/arrow-rs/pull/6512) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) -- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer)) -- \[object-store\] Require tokio 1.29.0. [\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin)) -- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) -- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe)) -- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) -- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin)) -- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu)) -- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace)) -- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) -- object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) -- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) -- docs: Add 
parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo)) -- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) -- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- Use randomized content ID for Azure multipart uploads [\#6869](https://github.com/apache/arrow-rs/pull/6869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avarnon](https://github.com/avarnon)) +- Always explicitly disable `gzip` automatic decompression on reqwest client used by object\_store [\#6843](https://github.com/apache/arrow-rs/pull/6843) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([phillipleblanc](https://github.com/phillipleblanc)) +- object-store: remove S3ConditionalPut::ETagPutIfNotExists [\#6802](https://github.com/apache/arrow-rs/pull/6802) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch)) +- Fix multipart uploads with checksums on object locked buckets [\#6794](https://github.com/apache/arrow-rs/pull/6794) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Add AuthorityHost to AzureConfigKey [\#6773](https://github.com/apache/arrow-rs/pull/6773) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([zadeluca](https://github.com/zadeluca)) +- object\_store: Add support for requester pays buckets [\#6768](https://github.com/apache/arrow-rs/pull/6768) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylebarron](https://github.com/kylebarron)) +- check sign\_payload instead of skip\_signature before computing checksum [\#6698](https://github.com/apache/arrow-rs/pull/6698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mherrerarendon](https://github.com/mherrerarendon)) +- Update quick-xml requirement from 0.36.0 to 0.37.0 in /object\_store [\#6687](https://github.com/apache/arrow-rs/pull/6687) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Support native S3 conditional writes [\#6682](https://github.com/apache/arrow-rs/pull/6682) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch)) +- \[object\_store\] fix S3 endpoint and trailing slash result in invalid requests [\#6641](https://github.com/apache/arrow-rs/pull/6641) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adbmal](https://github.com/adbmal)) +- Lower GCP token min\_ttl to 4 minutes and add backoff to token refresh logic [\#6638](https://github.com/apache/arrow-rs/pull/6638) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mwylde](https://github.com/mwylde)) +- Remove `test_private_bucket` object\_store test [\#6601](https://github.com/apache/arrow-rs/pull/6601) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 
bcc8e0b92243..bf254b3a0bbd 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.11.1" +version = "0.11.2" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/README.md b/object_store/dev/release/README.md index 912ff4cd8bac..2dd1f6243c09 100644 --- a/object_store/dev/release/README.md +++ b/object_store/dev/release/README.md @@ -24,7 +24,10 @@ This file documents the release process for the `object_store` crate. -At the time of writing, we release a new version of `object_store` on demand rather than on a regular schedule. +We release a new version of `object_store` according to the schedule listed in +the [main README.md] + +[main README.md]: https://github.com/apache/arrow-rs?tab=readme-ov-file#object_store-crate As we are still in an early phase, we use the 0.x version scheme. If any code has been merged to main that has a breaking API change, as defined in [Rust RFC 1105] diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index 30724478ae1e..2797b62c0010 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.11.0" -FUTURE_RELEASE="object_store_0.11.1" +SINCE_TAG="object_store_0.11.1" +FUTURE_RELEASE="object_store_0.11.2" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 10cf03c35808eedc191a9b3330bc7c268b7b71c1 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 24 Dec 2024 22:22:33 +0800 Subject: [PATCH 002/503] feat(parquet): Add next_row_group API for ParquetRecordBatchStream (#6907) * feat(parquet): Add next_row_group API for ParquetRecordBatchStream Signed-off-by: Xuanwo * chore: Returning error instead of using unreachable Signed-off-by: Xuanwo --------- Signed-off-by: Xuanwo --- parquet/src/arrow/async_reader/mod.rs | 132 ++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index c408456df147..96715e1164b2 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -613,6 +613,9 @@ impl std::fmt::Debug for StreamState { /// An asynchronous [`Stream`](https://docs.rs/futures/latest/futures/stream/trait.Stream.html) of [`RecordBatch`] /// for a parquet file that can be constructed using [`ParquetRecordBatchStreamBuilder`]. +/// +/// `ParquetRecordBatchStream` also provides [`ParquetRecordBatchStream::next_row_group`] for fetching row groups, +/// allowing users to decode record batches separately from I/O. pub struct ParquetRecordBatchStream { metadata: Arc, @@ -654,6 +657,70 @@ impl ParquetRecordBatchStream { } } +impl ParquetRecordBatchStream +where + T: AsyncFileReader + Unpin + Send + 'static, +{ + /// Fetches the next row group from the stream. + /// + /// Users can continue to call this function to get row groups and decode them concurrently. + /// + /// ## Notes + /// + /// ParquetRecordBatchStream should be used either as a `Stream` or with `next_row_group`; they should not be used simultaneously. + /// + /// ## Returns + /// + /// - `Ok(None)` if the stream has ended. + /// - `Err(error)` if the stream has errored. All subsequent calls will return `Ok(None)`. + /// - `Ok(Some(reader))` which holds all the data for the row group. 
+ pub async fn next_row_group(&mut self) -> Result> { + loop { + match &mut self.state { + StreamState::Decoding(_) | StreamState::Reading(_) => { + return Err(ParquetError::General( + "Cannot combine the use of next_row_group with the Stream API".to_string(), + )) + } + StreamState::Init => { + let row_group_idx = match self.row_groups.pop_front() { + Some(idx) => idx, + None => return Ok(None), + }; + + let row_count = self.metadata.row_group(row_group_idx).num_rows() as usize; + + let selection = self.selection.as_mut().map(|s| s.split_off(row_count)); + + let reader_factory = self.reader.take().expect("lost reader"); + + let (reader_factory, maybe_reader) = reader_factory + .read_row_group( + row_group_idx, + selection, + self.projection.clone(), + self.batch_size, + ) + .await + .map_err(|err| { + self.state = StreamState::Error; + err + })?; + self.reader = Some(reader_factory); + + if let Some(reader) = maybe_reader { + return Ok(Some(reader)); + } else { + // All rows skipped, read next row group + continue; + } + } + StreamState::Error => return Ok(None), // Ends the stream as error happens. + } + } + } +} + impl Stream for ParquetRecordBatchStream where T: AsyncFileReader + Unpin + Send + 'static, @@ -1020,6 +1087,71 @@ mod tests { ); } + #[tokio::test] + async fn test_async_reader_with_next_row_group() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/alltypes_plain.parquet"); + let data = Bytes::from(std::fs::read(path).unwrap()); + + let metadata = ParquetMetaDataReader::new() + .parse_and_finish(&data) + .unwrap(); + let metadata = Arc::new(metadata); + + assert_eq!(metadata.num_row_groups(), 1); + + let async_reader = TestReader { + data: data.clone(), + metadata: metadata.clone(), + requests: Default::default(), + }; + + let requests = async_reader.requests.clone(); + let builder = ParquetRecordBatchStreamBuilder::new(async_reader) + .await + .unwrap(); + + let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![1, 2]); + let mut stream = builder + .with_projection(mask.clone()) + .with_batch_size(1024) + .build() + .unwrap(); + + let mut readers = vec![]; + while let Some(reader) = stream.next_row_group().await.unwrap() { + readers.push(reader); + } + + let async_batches: Vec<_> = readers + .into_iter() + .flat_map(|r| r.map(|v| v.unwrap()).collect::>()) + .collect(); + + let sync_batches = ParquetRecordBatchReaderBuilder::try_new(data) + .unwrap() + .with_projection(mask) + .with_batch_size(104) + .build() + .unwrap() + .collect::>>() + .unwrap(); + + assert_eq!(async_batches, sync_batches); + + let requests = requests.lock().unwrap(); + let (offset_1, length_1) = metadata.row_group(0).column(1).byte_range(); + let (offset_2, length_2) = metadata.row_group(0).column(2).byte_range(); + + assert_eq!( + &requests[..], + &[ + offset_1 as usize..(offset_1 + length_1) as usize, + offset_2 as usize..(offset_2 + length_2) as usize + ] + ); + } + #[tokio::test] async fn test_async_reader_with_index() { let testdata = arrow::util::test_util::parquet_test_data(); From 63899b7ad9d093e86acc6c98604aa66431102993 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 26 Dec 2024 16:49:36 +0200 Subject: [PATCH 003/503] chore(arrow-ord): move `can_rank` to the `rank` file (#6910) --- arrow-ord/src/rank.rs | 9 +++++++++ arrow-ord/src/sort.rs | 11 +---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arrow-ord/src/rank.rs b/arrow-ord/src/rank.rs index 
ecc693bab4e4..e61cebef38ec 100644 --- a/arrow-ord/src/rank.rs +++ b/arrow-ord/src/rank.rs @@ -24,6 +24,15 @@ use arrow_buffer::NullBuffer; use arrow_schema::{ArrowError, DataType, SortOptions}; use std::cmp::Ordering; +/// Whether `arrow_ord::rank` can rank an array of given data type. +pub(crate) fn can_rank(data_type: &DataType) -> bool { + data_type.is_primitive() + || matches!( + data_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary + ) +} + /// Assigns a rank to each value in `array` based on its position in the sorted order /// /// Where values are equal, they will be assigned the highest of their ranks, diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 60fc4a918525..51a6659e631b 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -30,7 +30,7 @@ use arrow_select::take::take; use std::cmp::Ordering; use std::sync::Arc; -use crate::rank::rank; +use crate::rank::{can_rank, rank}; pub use arrow_schema::SortOptions; /// Sort the `ArrayRef` using `SortOptions`. @@ -190,15 +190,6 @@ fn partition_validity(array: &dyn Array) -> (Vec, Vec) { } } -/// Whether `arrow_ord::rank` can rank an array of given data type. -fn can_rank(data_type: &DataType) -> bool { - data_type.is_primitive() - || matches!( - data_type, - DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary - ) -} - /// Whether `sort_to_indices` can sort an array of given data type. fn can_sort_to_indices(data_type: &DataType) -> bool { data_type.is_primitive() From 191a9ec1e058c3c1ed31fb9bf0ed2d62f29ddd61 Mon Sep 17 00:00:00 2001 From: Tom Forbes Date: Sun, 29 Dec 2024 14:22:46 +0000 Subject: [PATCH 004/503] Fix error message typos with Parquet compression (#6918) --- parquet/src/basic.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 97e8c22f1b2f..99f122fe4c3e 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -426,14 +426,19 @@ fn split_compression_string(str_setting: &str) -> Result<(&str, Option), Pa fn check_level_is_none(level: &Option) -> Result<(), ParquetError> { if level.is_some() { - return Err(ParquetError::General("level is not support".to_string())); + return Err(ParquetError::General( + "compression level is not supported".to_string(), + )); } Ok(()) } fn require_level(codec: &str, level: Option) -> Result { - level.ok_or(ParquetError::General(format!("{} require level", codec))) + level.ok_or(ParquetError::General(format!( + "{} requires a compression level", + codec + ))) } impl FromStr for Compression { From 1d2696d3284266cddb48e0974bd61fc23882a893 Mon Sep 17 00:00:00 2001 From: wiedld Date: Mon, 30 Dec 2024 05:57:59 -0500 Subject: [PATCH 005/503] chore: expose arrow-schema methods, for use when writing parquet outside of ArrowWriter (#6916) --- parquet/src/arrow/mod.rs | 4 ++-- parquet/src/arrow/schema/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 6777e00fb05c..1305bbac83f0 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -123,8 +123,8 @@ use arrow_schema::{FieldRef, Schema}; pub use self::schema::arrow_to_parquet_schema; pub use self::schema::{ - parquet_to_arrow_field_levels, parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, - ArrowSchemaConverter, FieldLevels, + add_encoded_arrow_schema_to_metadata, encode_arrow_schema, parquet_to_arrow_field_levels, + parquet_to_arrow_schema, 
parquet_to_arrow_schema_by_columns, ArrowSchemaConverter, FieldLevels, }; /// Schema metadata key used to store serialized Arrow IPC schema diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 212dec525833..8be2439002be 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -170,7 +170,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { } /// Encodes the Arrow schema into the IPC format, and base64 encodes it -fn encode_arrow_schema(schema: &Schema) -> String { +pub fn encode_arrow_schema(schema: &Schema) -> String { let options = writer::IpcWriteOptions::default(); #[allow(deprecated)] let mut dictionary_tracker = @@ -192,7 +192,7 @@ fn encode_arrow_schema(schema: &Schema) -> String { /// Mutates writer metadata by storing the encoded Arrow schema. /// If there is an existing Arrow schema metadata, it is replaced. -pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterProperties) { +pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterProperties) { let encoded = encode_arrow_schema(schema); let schema_kv = KeyValue { From 7d9bb379a6676c317907452bc4879bb65809006f Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Mon, 30 Dec 2024 19:58:56 +0900 Subject: [PATCH 006/503] Improve error message for unsupported cast between struct and other types (#6919) --- arrow-cast/src/cast/mod.rs | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index ba470635c6cd..7aff32b0beee 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -1189,12 +1189,12 @@ pub fn cast_with_options( let array = StructArray::try_new(to_fields.clone(), fields, array.nulls().cloned())?; Ok(Arc::new(array) as ArrayRef) } - (Struct(_), _) => Err(ArrowError::CastError( - "Cannot cast from struct to other types except struct".to_string(), - )), - (_, Struct(_)) => Err(ArrowError::CastError( - "Cannot cast to struct from other types except struct".to_string(), - )), + (Struct(_), _) => Err(ArrowError::CastError(format!( + "Casting from {from_type:?} to {to_type:?} not supported" + ))), + (_, Struct(_)) => Err(ArrowError::CastError(format!( + "Casting from {from_type:?} to {to_type:?} not supported" + ))), (_, Boolean) => match from_type { UInt8 => cast_numeric_to_bool::(array), UInt16 => cast_numeric_to_bool::(array), @@ -9941,6 +9941,32 @@ mod tests { ); } + #[test] + fn test_cast_struct_to_non_struct() { + let boolean = Arc::new(BooleanArray::from(vec![true, false])); + let struct_array = StructArray::from(vec![( + Arc::new(Field::new("a", DataType::Boolean, false)), + boolean.clone() as ArrayRef, + )]); + let to_type = DataType::Utf8; + let result = cast(&struct_array, &to_type); + assert_eq!( + r#"Cast error: Casting from Struct([Field { name: "a", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]) to Utf8 not supported"#, + result.unwrap_err().to_string() + ); + } + + #[test] + fn test_cast_non_struct_to_struct() { + let array = StringArray::from(vec!["a", "b"]); + let to_type = DataType::Struct(vec![Field::new("a", DataType::Boolean, false)].into()); + let result = cast(&array, &to_type); + assert_eq!( + r#"Cast error: Casting from Utf8 to Struct([Field { name: "a", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]) not supported"#, + result.unwrap_err().to_string() + ); + } + #[test] 
fn test_decimal_to_decimal_throw_error_on_precision_overflow_same_scale() { let array = vec![Some(123456789)]; From 5249f99e0e6671fc06996365dd4cf22a8427348c Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Mon, 30 Dec 2024 19:42:37 +0700 Subject: [PATCH 007/503] [arrow-string] Implement string view support for `regexp_match` (#6849) * [arrow-string] Implement string view suport for regexp match Signed-off-by: Tai Le Manh * update unit tests * fix clippy warnings * Add test cases Signed-off-by: Tai Le Manh --------- Signed-off-by: Tai Le Manh --- arrow-string/src/regexp.rs | 640 ++++++++++++++++++++++++++----------- 1 file changed, 452 insertions(+), 188 deletions(-) diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index d14662be7280..f3893cd5bd13 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -20,7 +20,9 @@ use crate::like::StringArrayType; -use arrow_array::builder::{BooleanBufferBuilder, GenericStringBuilder, ListBuilder}; +use arrow_array::builder::{ + BooleanBufferBuilder, GenericStringBuilder, ListBuilder, StringViewBuilder, +}; use arrow_array::cast::AsArray; use arrow_array::*; use arrow_buffer::NullBuffer; @@ -243,78 +245,96 @@ where Ok(BooleanArray::from(data)) } -fn regexp_array_match( - array: &GenericStringArray, - regex_array: &GenericStringArray, - flags_array: Option<&GenericStringArray>, -) -> Result { - let mut patterns: HashMap = HashMap::new(); - let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); - let mut list_builder = ListBuilder::new(builder); +macro_rules! process_regexp_array_match { + ($array:expr, $regex_array:expr, $flags_array:expr, $list_builder:expr) => { + let mut patterns: HashMap = HashMap::new(); - let complete_pattern = match flags_array { - Some(flags) => Box::new( - regex_array - .iter() - .zip(flags.iter()) - .map(|(pattern, flags)| { + let complete_pattern = match $flags_array { + Some(flags) => Box::new($regex_array.iter().zip(flags.iter()).map( + |(pattern, flags)| { pattern.map(|pattern| match flags { Some(value) => format!("(?{value}){pattern}"), None => pattern.to_string(), }) - }), - ) as Box>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; + }, + )) as Box>>, + None => Box::new( + $regex_array + .iter() + .map(|pattern| pattern.map(|pattern| pattern.to_string())), + ), + }; - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - // Required for Postgres compatibility: - // SELECT regexp_match('foobarbequebaz', ''); = {""} - (Some(_), Some(pattern)) if pattern == *"" => { - list_builder.values().append_value(""); - list_builder.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re, - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {e:?}" - )) - })?; - patterns.entry(pattern).or_insert(re) - } - }; - match re.captures(value) { - Some(caps) => { - let mut iter = caps.iter(); - if caps.len() > 1 { - iter.next(); - } - for m in iter.flatten() { - list_builder.values().append_value(m.as_str()); + $array + .iter() + .zip(complete_pattern) + .map(|(value, pattern)| { + match (value, pattern) { + // Required for Postgres compatibility: + // SELECT regexp_match('foobarbequebaz', ''); = {""} + (Some(_), Some(pattern)) if pattern == *"" => { + 
$list_builder.values().append_value(""); + $list_builder.append(true); + } + (Some(value), Some(pattern)) => { + let existing_pattern = patterns.get(&pattern); + let re = match existing_pattern { + Some(re) => re, + None => { + let re = Regex::new(pattern.as_str()).map_err(|e| { + ArrowError::ComputeError(format!( + "Regular expression did not compile: {e:?}" + )) + })?; + patterns.entry(pattern).or_insert(re) } + }; + match re.captures(value) { + Some(caps) => { + let mut iter = caps.iter(); + if caps.len() > 1 { + iter.next(); + } + for m in iter.flatten() { + $list_builder.values().append_value(m.as_str()); + } - list_builder.append(true); + $list_builder.append(true); + } + None => $list_builder.append(false), } - None => list_builder.append(false), } + _ => $list_builder.append(false), } - _ => list_builder.append(false), - } - Ok(()) - }) - .collect::, ArrowError>>()?; + Ok(()) + }) + .collect::, ArrowError>>()?; + }; +} + +fn regexp_array_match( + array: &GenericStringArray, + regex_array: &GenericStringArray, + flags_array: Option<&GenericStringArray>, +) -> Result { + let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); + let mut list_builder = ListBuilder::new(builder); + + process_regexp_array_match!(array, regex_array, flags_array, list_builder); + + Ok(Arc::new(list_builder.finish())) +} + +fn regexp_array_match_utf8view( + array: &StringViewArray, + regex_array: &StringViewArray, + flags_array: Option<&StringViewArray>, +) -> Result { + let builder = StringViewBuilder::with_capacity(0); + let mut list_builder = ListBuilder::new(builder); + + process_regexp_array_match!(array, regex_array, flags_array, list_builder); + Ok(Arc::new(list_builder.finish())) } @@ -333,6 +353,54 @@ fn get_scalar_pattern_flag<'a, OffsetSize: OffsetSizeTrait>( } } +fn get_scalar_pattern_flag_utf8view<'a>( + regex_array: &'a dyn Array, + flag_array: Option<&'a dyn Array>, +) -> (Option<&'a str>, Option<&'a str>) { + let regex = regex_array.as_string_view(); + let regex = regex.is_valid(0).then(|| regex.value(0)); + + if let Some(flag_array) = flag_array { + let flag = flag_array.as_string_view(); + (regex, flag.is_valid(0).then(|| flag.value(0))) + } else { + (regex, None) + } +} + +macro_rules! process_regexp_match { + ($array:expr, $regex:expr, $list_builder:expr) => { + $array + .iter() + .map(|value| { + match value { + // Required for Postgres compatibility: + // SELECT regexp_match('foobarbequebaz', ''); = {""} + Some(_) if $regex.as_str().is_empty() => { + $list_builder.values().append_value(""); + $list_builder.append(true); + } + Some(value) => match $regex.captures(value) { + Some(caps) => { + let mut iter = caps.iter(); + if caps.len() > 1 { + iter.next(); + } + for m in iter.flatten() { + $list_builder.values().append_value(m.as_str()); + } + $list_builder.append(true); + } + None => $list_builder.append(false), + }, + None => $list_builder.append(false), + } + Ok(()) + }) + .collect::, ArrowError>>()? 
+ }; +} + fn regexp_scalar_match( array: &GenericStringArray, regex: &Regex, @@ -340,35 +408,19 @@ fn regexp_scalar_match( let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); let mut list_builder = ListBuilder::new(builder); - array - .iter() - .map(|value| { - match value { - // Required for Postgres compatibility: - // SELECT regexp_match('foobarbequebaz', ''); = {""} - Some(_) if regex.as_str() == "" => { - list_builder.values().append_value(""); - list_builder.append(true); - } - Some(value) => match regex.captures(value) { - Some(caps) => { - let mut iter = caps.iter(); - if caps.len() > 1 { - iter.next(); - } - for m in iter.flatten() { - list_builder.values().append_value(m.as_str()); - } + process_regexp_match!(array, regex, list_builder); - list_builder.append(true); - } - None => list_builder.append(false), - }, - _ => list_builder.append(false), - } - Ok(()) - }) - .collect::, ArrowError>>()?; + Ok(Arc::new(list_builder.finish())) +} + +fn regexp_scalar_match_utf8view( + array: &StringViewArray, + regex: &Regex, +) -> Result { + let builder = StringViewBuilder::with_capacity(0); + let mut list_builder = ListBuilder::new(builder); + + process_regexp_match!(array, regex, list_builder); Ok(Arc::new(list_builder.finish())) } @@ -406,7 +458,7 @@ pub fn regexp_match( if array.data_type() != rhs.data_type() { return Err(ArrowError::ComputeError( - "regexp_match() requires both array and pattern to be either Utf8 or LargeUtf8" + "regexp_match() requires both array and pattern to be either Utf8, Utf8View or LargeUtf8" .to_string(), )); } @@ -428,7 +480,7 @@ pub fn regexp_match( if flags_array.is_some() && rhs.data_type() != flags.unwrap().data_type() { return Err(ArrowError::ComputeError( - "regexp_match() requires both pattern and flags to be either string or largestring" + "regexp_match() requires both pattern and flags to be either Utf8, Utf8View or LargeUtf8" .to_string(), )); } @@ -436,11 +488,13 @@ pub fn regexp_match( if is_rhs_scalar { // Regex and flag is scalars let (regex, flag) = match rhs.data_type() { + DataType::Utf8View => get_scalar_pattern_flag_utf8view(rhs, flags), DataType::Utf8 => get_scalar_pattern_flag::(rhs, flags), DataType::LargeUtf8 => get_scalar_pattern_flag::(rhs, flags), _ => { return Err(ArrowError::ComputeError( - "regexp_match() requires pattern to be either Utf8 or LargeUtf8".to_string(), + "regexp_match() requires pattern to be either Utf8, Utf8View or LargeUtf8" + .to_string(), )); } }; @@ -468,14 +522,21 @@ pub fn regexp_match( })?; match array.data_type() { + DataType::Utf8View => regexp_scalar_match_utf8view(array.as_string_view(), &re), DataType::Utf8 => regexp_scalar_match(array.as_string::(), &re), DataType::LargeUtf8 => regexp_scalar_match(array.as_string::(), &re), _ => Err(ArrowError::ComputeError( - "regexp_match() requires array to be either Utf8 or LargeUtf8".to_string(), + "regexp_match() requires array to be either Utf8, Utf8View or LargeUtf8" + .to_string(), )), } } else { match array.data_type() { + DataType::Utf8View => { + let regex_array = rhs.as_string_view(); + let flags_array = flags.map(|flags| flags.as_string_view()); + regexp_array_match_utf8view(array.as_string_view(), regex_array, flags_array) + } DataType::Utf8 => { let regex_array = rhs.as_string(); let flags_array = flags.map(|flags| flags.as_string()); @@ -487,7 +548,8 @@ pub fn regexp_match( regexp_array_match(array.as_string::(), regex_array, flags_array) } _ => Err(ArrowError::ComputeError( - "regexp_match() requires array to be either Utf8 
or LargeUtf8".to_string(), + "regexp_match() requires array to be either Utf8, Utf8View or LargeUtf8" + .to_string(), )), } } @@ -497,114 +559,316 @@ pub fn regexp_match( mod tests { use super::*; - #[test] - fn match_single_group() { - let values = vec![ + macro_rules! test_match_single_group { + ($test_name:ident, $values:expr, $patterns:expr, $arr_type:ty, $builder_type:ty, $expected:expr) => { + #[test] + fn $test_name() { + let array: $arr_type = <$arr_type>::from($values); + let pattern: $arr_type = <$arr_type>::from($patterns); + + let actual = regexp_match(&array, &pattern, None).unwrap(); + + let elem_builder: $builder_type = <$builder_type>::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + + for val in $expected { + match val { + Some(v) => { + expected_builder.values().append_value(v); + expected_builder.append(true); + } + None => expected_builder.append(false), + } + } + + let expected = expected_builder.finish(); + let result = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(&expected, result); + } + }; + } + + test_match_single_group!( + match_single_group_string, + vec![ Some("abc-005-def"), Some("X-7-5"), Some("X545"), None, Some("foobarbequebaz"), Some("foobarbequebaz"), - ]; - let array = StringArray::from(values); - let mut pattern_values = vec![r".*-(\d*)-.*"; 4]; - pattern_values.push(r"(bar)(bequ1e)"); - pattern_values.push(""); - let pattern = GenericStringArray::::from(pattern_values); - let actual = regexp_match(&array, &pattern, None).unwrap(); - let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.values().append_value("005"); - expected_builder.append(true); - expected_builder.values().append_value("7"); - expected_builder.append(true); - expected_builder.append(false); - expected_builder.append(false); - expected_builder.append(false); - expected_builder.values().append_value(""); - expected_builder.append(true); - let expected = expected_builder.finish(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); - } + ], + vec![ + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r"(bar)(bequ1e)", + "" + ], + StringArray, + GenericStringBuilder, + [Some("005"), Some("7"), None, None, None, Some("")] + ); + test_match_single_group!( + match_single_group_string_view, + vec![ + Some("abc-005-def"), + Some("X-7-5"), + Some("X545"), + None, + Some("foobarbequebaz"), + Some("foobarbequebaz"), + ], + vec![ + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r"(bar)(bequ1e)", + "" + ], + StringViewArray, + StringViewBuilder, + [Some("005"), Some("7"), None, None, None, Some("")] + ); + + macro_rules! 
test_match_single_group_with_flags { + ($test_name:ident, $values:expr, $patterns:expr, $flags:expr, $array_type:ty, $builder_type:ty, $expected:expr) => { + #[test] + fn $test_name() { + let array: $array_type = <$array_type>::from($values); + let pattern: $array_type = <$array_type>::from($patterns); + let flags: $array_type = <$array_type>::from($flags); + + let actual = regexp_match(&array, &pattern, Some(&flags)).unwrap(); - #[test] - fn match_single_group_with_flags() { - let values = vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None]; - let array = StringArray::from(values); - let pattern = StringArray::from(vec![r"x.*-(\d*)-.*"; 4]); - let flags = StringArray::from(vec!["i"; 4]); - let actual = regexp_match(&array, &pattern, Some(&flags)).unwrap(); - let elem_builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.append(false); - expected_builder.values().append_value("7"); - expected_builder.append(true); - expected_builder.append(false); - expected_builder.append(false); - let expected = expected_builder.finish(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); + let elem_builder: $builder_type = <$builder_type>::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + + for val in $expected { + match val { + Some(v) => { + expected_builder.values().append_value(v); + expected_builder.append(true); + } + None => { + expected_builder.append(false); + } + } + } + + let expected = expected_builder.finish(); + let result = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(&expected, result); + } + }; } - #[test] - fn match_scalar_pattern() { - let values = vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None]; - let array = StringArray::from(values); - let pattern = Scalar::new(StringArray::from(vec![r"x.*-(\d*)-.*"; 1])); - let flags = Scalar::new(StringArray::from(vec!["i"; 1])); - let actual = regexp_match(&array, &pattern, Some(&flags)).unwrap(); - let elem_builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.append(false); - expected_builder.values().append_value("7"); - expected_builder.append(true); - expected_builder.append(false); - expected_builder.append(false); - let expected = expected_builder.finish(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); - - // No flag - let values = vec![Some("abc-005-def"), Some("x-7-5"), Some("X545"), None]; - let array = StringArray::from(values); - let actual = regexp_match(&array, &pattern, None).unwrap(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); + test_match_single_group_with_flags!( + match_single_group_with_flags_string, + vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None], + vec![r"x.*-(\d*)-.*"; 4], + vec!["i"; 4], + StringArray, + GenericStringBuilder, + [None, Some("7"), None, None] + ); + test_match_single_group_with_flags!( + match_single_group_with_flags_stringview, + vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None], + vec![r"x.*-(\d*)-.*"; 4], + vec!["i"; 4], + StringViewArray, + StringViewBuilder, + [None, Some("7"), None, None] + ); + + macro_rules! 
test_match_scalar_pattern { + ($test_name:ident, $values:expr, $pattern:expr, $flag:expr, $array_type:ty, $builder_type:ty, $expected:expr) => { + #[test] + fn $test_name() { + let array: $array_type = <$array_type>::from($values); + + let pattern_scalar = Scalar::new(<$array_type>::from(vec![$pattern; 1])); + let flag_scalar = Scalar::new(<$array_type>::from(vec![$flag; 1])); + + let actual = regexp_match(&array, &pattern_scalar, Some(&flag_scalar)).unwrap(); + + let elem_builder: $builder_type = <$builder_type>::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + + for val in $expected { + match val { + Some(v) => { + expected_builder.values().append_value(v); + expected_builder.append(true); + } + None => expected_builder.append(false), + } + } + + let expected = expected_builder.finish(); + let result = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(&expected, result); + } + }; } - #[test] - fn match_scalar_no_pattern() { - let values = vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None]; - let array = StringArray::from(values); - let pattern = Scalar::new(new_null_array(&DataType::Utf8, 1)); - let actual = regexp_match(&array, &pattern, None).unwrap(); - let elem_builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.append(false); - expected_builder.append(false); - expected_builder.append(false); - expected_builder.append(false); - let expected = expected_builder.finish(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); + test_match_scalar_pattern!( + match_scalar_pattern_string_with_flags, + vec![ + Some("abc-005-def"), + Some("x-7-5"), + Some("X-0-Y"), + Some("X545"), + None + ], + r"x.*-(\d*)-.*", + Some("i"), + StringArray, + GenericStringBuilder, + [None, Some("7"), Some("0"), None, None] + ); + test_match_scalar_pattern!( + match_scalar_pattern_stringview_with_flags, + vec![ + Some("abc-005-def"), + Some("x-7-5"), + Some("X-0-Y"), + Some("X545"), + None + ], + r"x.*-(\d*)-.*", + Some("i"), + StringViewArray, + StringViewBuilder, + [None, Some("7"), Some("0"), None, None] + ); + + test_match_scalar_pattern!( + match_scalar_pattern_string_no_flags, + vec![ + Some("abc-005-def"), + Some("x-7-5"), + Some("X-0-Y"), + Some("X545"), + None + ], + r"x.*-(\d*)-.*", + None::<&str>, + StringArray, + GenericStringBuilder, + [None, Some("7"), None, None, None] + ); + test_match_scalar_pattern!( + match_scalar_pattern_stringview_no_flags, + vec![ + Some("abc-005-def"), + Some("x-7-5"), + Some("X-0-Y"), + Some("X545"), + None + ], + r"x.*-(\d*)-.*", + None::<&str>, + StringViewArray, + StringViewBuilder, + [None, Some("7"), None, None, None] + ); + + macro_rules! 
test_match_scalar_no_pattern { + ($test_name:ident, $values:expr, $array_type:ty, $pattern_type:expr, $builder_type:ty, $expected:expr) => { + #[test] + fn $test_name() { + let array: $array_type = <$array_type>::from($values); + let pattern = Scalar::new(new_null_array(&$pattern_type, 1)); + + let actual = regexp_match(&array, &pattern, None).unwrap(); + + let elem_builder: $builder_type = <$builder_type>::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + + for val in $expected { + match val { + Some(v) => { + expected_builder.values().append_value(v); + expected_builder.append(true); + } + None => expected_builder.append(false), + } + } + + let expected = expected_builder.finish(); + let result = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(&expected, result); + } + }; } - #[test] - fn test_single_group_not_skip_match() { - let array = StringArray::from(vec![Some("foo"), Some("bar")]); - let pattern = GenericStringArray::::from(vec![r"foo"]); - let actual = regexp_match(&array, &pattern, None).unwrap(); - let result = actual.as_any().downcast_ref::().unwrap(); - let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.values().append_value("foo"); - expected_builder.append(true); - let expected = expected_builder.finish(); - assert_eq!(&expected, result); + test_match_scalar_no_pattern!( + match_scalar_no_pattern_string, + vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None], + StringArray, + DataType::Utf8, + GenericStringBuilder, + [None::<&str>, None, None, None] + ); + test_match_scalar_no_pattern!( + match_scalar_no_pattern_stringview, + vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None], + StringViewArray, + DataType::Utf8View, + StringViewBuilder, + [None::<&str>, None, None, None] + ); + + macro_rules! test_match_single_group_not_skip { + ($test_name:ident, $values:expr, $pattern:expr, $array_type:ty, $builder_type:ty, $expected:expr) => { + #[test] + fn $test_name() { + let array: $array_type = <$array_type>::from($values); + let pattern: $array_type = <$array_type>::from(vec![$pattern]); + + let actual = regexp_match(&array, &pattern, None).unwrap(); + + let elem_builder: $builder_type = <$builder_type>::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + + for val in $expected { + match val { + Some(v) => { + expected_builder.values().append_value(v); + expected_builder.append(true); + } + None => expected_builder.append(false), + } + } + + let expected = expected_builder.finish(); + let result = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(&expected, result); + } + }; } + test_match_single_group_not_skip!( + match_single_group_not_skip_string, + vec![Some("foo"), Some("bar")], + r"foo", + StringArray, + GenericStringBuilder, + [Some("foo")] + ); + test_match_single_group_not_skip!( + match_single_group_not_skip_stringview, + vec![Some("foo"), Some("bar")], + r"foo", + StringViewArray, + StringViewBuilder, + [Some("foo")] + ); + macro_rules! 
test_flag_utf8 { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] From fbee05f22a80d2e874ded122dce0efb36ef66c8a Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 1 Jan 2025 06:25:04 -0800 Subject: [PATCH 008/503] Add doctest example for `Buffer::from_bytes` (#6920) * Add doctest example for * Remove typo * Update arrow-buffer/src/buffer/immutable.rs --------- Co-authored-by: Andrew Lamb --- arrow-buffer/src/buffer/immutable.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index d0c8ffa39783..cf1d6f366751 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -60,6 +60,14 @@ unsafe impl Sync for Buffer where Bytes: Sync {} impl Buffer { /// Auxiliary method to create a new Buffer + /// + /// This can be used with a [`bytes::Bytes`] via `into()`: + /// + /// ``` + /// # use arrow_buffer::Buffer; + /// let bytes = bytes::Bytes::from_static(b"foo"); + /// let buffer = Buffer::from_bytes(bytes.into()); + /// ``` #[inline] pub fn from_bytes(bytes: Bytes) -> Self { let length = bytes.len(); From 4a0bdde24f48d0fc1222d936493f798d9ea4789d Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 2 Jan 2025 10:09:34 +0100 Subject: [PATCH 009/503] object_store: Add enabled-by-default "fs" feature (#6636) --- .github/workflows/object_store.yml | 4 ++++ object_store/Cargo.toml | 4 +++- object_store/src/chunked.rs | 4 ++++ object_store/src/lib.rs | 17 +++++++++++------ object_store/src/limit.rs | 1 + object_store/src/parse.rs | 4 ++-- object_store/src/throttle.rs | 2 ++ object_store/src/util.rs | 2 +- 8 files changed, 28 insertions(+), 10 deletions(-) diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 93f809aaabd4..899318f01324 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -54,6 +54,10 @@ jobs: # targets. 
- name: Run clippy with default features run: cargo clippy -- -D warnings + - name: Run clippy without default features + run: cargo clippy --no-default-features -- -D warnings + - name: Run clippy with fs features + run: cargo clippy --no-default-features --features fs -- -D warnings - name: Run clippy with aws feature run: cargo clippy --features aws -- -D warnings - name: Run clippy with gcp feature diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index bf254b3a0bbd..a127be3602ef 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -41,7 +41,7 @@ percent-encoding = "2.1" snafu = { version = "0.8", default-features = false, features = ["std", "rust_1_61"] } tracing = { version = "0.1" } url = "2.2" -walkdir = "2" +walkdir = { version = "2", optional = true } # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } @@ -61,8 +61,10 @@ httparse = { version = "1.8.0", default-features = false, features = ["std"], op nix = { version = "0.29.0", features = ["fs"] } [features] +default = ["fs"] cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud", "httparse"] +fs = ["walkdir"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud", "md-5"] http = ["cloud"] diff --git a/object_store/src/chunked.rs b/object_store/src/chunked.rs index 98cc20498013..3f83c1336dc4 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -86,6 +86,7 @@ impl ObjectStore for ChunkedStore { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let r = self.inner.get_opts(location, options).await?; let stream = match r.payload { + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(file, path) => { crate::local::chunked_stream(file, path, r.range.clone(), self.chunk_size) } @@ -178,7 +179,9 @@ impl ObjectStore for ChunkedStore { mod tests { use futures::StreamExt; + #[cfg(feature = "fs")] use crate::integration::*; + #[cfg(feature = "fs")] use crate::local::LocalFileSystem; use crate::memory::InMemory; use crate::path::Path; @@ -209,6 +212,7 @@ mod tests { } } + #[cfg(feature = "fs")] #[tokio::test] async fn test_chunked() { let temporary = tempfile::tempdir().unwrap(); diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 4d8d8f02a0bc..6f5733226922 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -66,10 +66,13 @@ //! By default, this crate provides the following implementations: //! //! * Memory: [`InMemory`](memory::InMemory) -//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) //! //! Feature flags are used to enable support for other implementations: //! +#![cfg_attr( + feature = "fs", + doc = "* Local filesystem: [`LocalFileSystem`](local::LocalFileSystem)" +)] #![cfg_attr( feature = "gcp", doc = "* [`gcp`]: [Google Cloud Storage](https://cloud.google.com/storage/) support. 
See [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" @@ -513,7 +516,7 @@ pub mod gcp; #[cfg(feature = "http")] pub mod http; pub mod limit; -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] pub mod local; pub mod memory; pub mod path; @@ -557,7 +560,7 @@ pub use upload::*; pub use util::{coalesce_ranges, collect_bytes, GetRange, OBJECT_STORE_COALESCE_DEFAULT}; use crate::path::Path; -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use crate::util::maybe_spawn_blocking; use async_trait::async_trait; use bytes::Bytes; @@ -565,7 +568,7 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use snafu::Snafu; use std::fmt::{Debug, Formatter}; -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use std::sync::Arc; @@ -1028,6 +1031,7 @@ pub struct GetResult { /// be able to optimise the case of a file already present on local disk pub enum GetResultPayload { /// The file, path + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] File(std::fs::File, std::path::PathBuf), /// An opaque stream of bytes Stream(BoxStream<'static, Result>), @@ -1036,6 +1040,7 @@ pub enum GetResultPayload { impl Debug for GetResultPayload { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] Self::File(_, _) => write!(f, "GetResultPayload(File)"), Self::Stream(_) => write!(f, "GetResultPayload(Stream)"), } @@ -1047,7 +1052,7 @@ impl GetResult { pub async fn bytes(self) -> Result { let len = self.range.end - self.range.start; match self.payload { - #[cfg(not(target_arch = "wasm32"))] + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(mut file, path) => { maybe_spawn_blocking(move || { file.seek(SeekFrom::Start(self.range.start as _)) @@ -1087,7 +1092,7 @@ impl GetResult { /// no additional complexity or overheads pub fn into_stream(self) -> BoxStream<'static, Result> { match self.payload { - #[cfg(not(target_arch = "wasm32"))] + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(file, path) => { const CHUNK_SIZE: usize = 8 * 1024; local::chunked_stream(file, path, self.range, CHUNK_SIZE) diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index 64b96ad1a96c..6a3c3b574e62 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -199,6 +199,7 @@ impl ObjectStore for LimitStore { fn permit_get_result(r: GetResult, permit: OwnedSemaphorePermit) -> GetResult { let payload = match r.payload { + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] v @ GetResultPayload::File(_, _) => v, GetResultPayload::Stream(s) => { GetResultPayload::Stream(PermitWrapper::new(s, permit).boxed()) diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs index debc9e529312..a3919305281d 100644 --- a/object_store/src/parse.rs +++ b/object_store/src/parse.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
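// The `LocalFileSystem` import below is now gated on the `fs` feature in addition to
// the existing non-wasm32 check, matching the new `fs = ["walkdir"]` flag added to
// Cargo.toml in this patch.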
-#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use crate::local::LocalFileSystem; use crate::memory::InMemory; use crate::path::Path; @@ -179,7 +179,7 @@ where let path = Path::parse(path)?; let store = match scheme { - #[cfg(not(target_arch = "wasm32"))] + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] ObjectStoreScheme::Local => Box::new(LocalFileSystem::new()) as _, ObjectStoreScheme::Memory => Box::new(InMemory::new()) as _, #[cfg(feature = "aws")] diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index d07276c3dcad..b9dff5c6d1d2 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -307,8 +307,10 @@ fn usize_to_u32_saturate(x: usize) -> u32 { } fn throttle_get(result: GetResult, wait_get_per_byte: Duration) -> GetResult { + #[allow(clippy::infallible_destructuring_match)] let s = match result.payload { GetResultPayload::Stream(s) => s, + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(_, _) => unimplemented!(), }; diff --git a/object_store/src/util.rs b/object_store/src/util.rs index ecf90f95d7c7..99102a99e61e 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -75,7 +75,7 @@ where } } -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] /// Takes a function and spawns it to a tokio blocking pool if available pub(crate) async fn maybe_spawn_blocking(f: F) -> Result where From f7263e253655b2ee613be97f9d00e063444d3df5 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 2 Jan 2025 23:25:44 +0100 Subject: [PATCH 010/503] object_store: Migrate from `snafu` to `thiserror` (#6266) * object_store: Add `thiserror` dependency * object_store/memory: Migrate from `snafu` to `thiserror` * object_store/parse: Migrate from `snafu` to `thiserror` * object_store/util: Migrate from `snafu` to `thiserror` * object_store/local: Migrate from `snafu` to `thiserror` * object_store/delimited: Migrate from `snafu` to `thiserror` * object_store/path/parts: Migrate from `snafu` to `thiserror` * object_store/path: Migrate from `snafu` to `thiserror` * object_store/http: Migrate from `snafu` to `thiserror` * object_store/client: Migrate from `snafu` to `thiserror` * object_store/aws: Migrate from `snafu` to `thiserror` * object_store/azure: Migrate from `snafu` to `thiserror` * object_store/gcp: Migrate from `snafu` to `thiserror` * object_store/lib: Migrate from `snafu` to `thiserror` * Remove `snafu` dependency --- object_store/Cargo.toml | 2 +- object_store/src/aws/builder.rs | 52 +++++--- object_store/src/aws/client.rs | 87 ++++++------ object_store/src/aws/credential.rs | 17 ++- object_store/src/aws/resolve.rs | 30 ++--- object_store/src/azure/builder.rs | 65 +++++---- object_store/src/azure/client.rs | 93 +++++++------ object_store/src/azure/credential.rs | 41 +++--- object_store/src/client/get.rs | 97 +++++++------- object_store/src/client/header.rs | 54 +++++--- object_store/src/client/retry.rs | 13 +- object_store/src/delimited.rs | 15 ++- object_store/src/gcp/builder.rs | 48 ++++--- object_store/src/gcp/client.rs | 91 +++++++------ object_store/src/gcp/credential.rs | 57 ++++---- object_store/src/http/client.rs | 52 +++++--- object_store/src/http/mod.rs | 13 +- object_store/src/lib.rs | 36 +++-- object_store/src/local.rs | 191 ++++++++++++--------------- object_store/src/memory.rs | 31 ++--- object_store/src/parse.rs | 12 +- object_store/src/path/mod.rs | 35 +++-- object_store/src/path/parts.rs | 7 +- 
object_store/src/util.rs | 9 +- 24 files changed, 620 insertions(+), 528 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index a127be3602ef..6f5e9db1bc70 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -38,7 +38,7 @@ humantime = "2.1" itertools = "0.13.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" -snafu = { version = "0.8", default-features = false, features = ["std", "rust_1_61"] } +thiserror = "2.0.2" tracing = { version = "0.1" } url = "2.2" walkdir = { version = "2", optional = true } diff --git a/object_store/src/aws/builder.rs b/object_store/src/aws/builder.rs index 840245a7b5d4..d29fa782e8ff 100644 --- a/object_store/src/aws/builder.rs +++ b/object_store/src/aws/builder.rs @@ -32,7 +32,6 @@ use itertools::Itertools; use md5::{Digest, Md5}; use reqwest::header::{HeaderMap, HeaderValue}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -43,46 +42,46 @@ use url::Url; static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Missing bucket name"))] + #[error("Missing bucket name")] MissingBucketName, - #[snafu(display("Missing AccessKeyId"))] + #[error("Missing AccessKeyId")] MissingAccessKeyId, - #[snafu(display("Missing SecretAccessKey"))] + #[error("Missing SecretAccessKey")] MissingSecretAccessKey, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display( + #[error( "Unknown url scheme cannot be parsed into storage location: {}", scheme - ))] + )] UnknownUrlScheme { scheme: String }, - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + #[error("URL did not match any known pattern for scheme: {}", url)] UrlNotRecognised { url: String }, - #[snafu(display("Configuration key: '{}' is not known.", key))] + #[error("Configuration key: '{}' is not known.", key)] UnknownConfigurationKey { key: String }, - #[snafu(display("Invalid Zone suffix for bucket '{bucket}'"))] + #[error("Invalid Zone suffix for bucket '{bucket}'")] ZoneSuffix { bucket: String }, - #[snafu(display("Invalid encryption type: {}. Valid values are \"AES256\", \"sse:kms\", \"sse:kms:dsse\" and \"sse-c\".", passed))] + #[error("Invalid encryption type: {}. Valid values are \"AES256\", \"sse:kms\", \"sse:kms:dsse\" and \"sse-c\".", passed)] InvalidEncryptionType { passed: String }, - #[snafu(display( + #[error( "Invalid encryption header values. 
Header: {}, source: {}", header, source - ))] + )] InvalidEncryptionHeader { header: &'static str, source: Box, @@ -603,8 +602,15 @@ impl AmazonS3Builder { /// This is a separate member function to allow fallible computation to /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let parsed = Url::parse(url).map_err(|source| { + let url = url.into(); + Error::UnableToParseUrl { url, source } + })?; + + let host = parsed + .host_str() + .ok_or_else(|| Error::UrlNotRecognised { url: url.into() })?; + match parsed.scheme() { "s3" | "s3a" => self.bucket_name = Some(host.to_string()), "https" => match host.splitn(4, '.').collect_tuple() { @@ -630,9 +636,12 @@ impl AmazonS3Builder { self.bucket_name = Some(bucket.into()); } } - _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), + _ => return Err(Error::UrlNotRecognised { url: url.into() }.into()), }, - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + scheme => { + let scheme = scheme.into(); + return Err(Error::UnknownUrlScheme { scheme }.into()); + } }; Ok(()) } @@ -875,7 +884,7 @@ impl AmazonS3Builder { self.parse_url(&url)?; } - let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; + let bucket = self.bucket_name.ok_or(Error::MissingBucketName)?; let region = self.region.unwrap_or_else(|| "us-east-1".to_string()); let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; @@ -957,7 +966,10 @@ impl AmazonS3Builder { let (session_provider, zonal_endpoint) = match self.s3_express.get()? 
{ true => { - let zone = parse_bucket_az(&bucket).context(ZoneSuffixSnafu { bucket: &bucket })?; + let zone = parse_bucket_az(&bucket).ok_or_else(|| { + let bucket = bucket.clone(); + Error::ZoneSuffix { bucket } + })?; // https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-express-Regions-and-Zones.html let endpoint = format!("https://{bucket}.s3express-{zone}.{region}.amazonaws.com"); diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 81015e82b39c..25fdd3311c95 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -56,7 +56,6 @@ use reqwest::{Client as ReqwestClient, Method, RequestBuilder, Response}; use ring::digest; use ring::digest::Context; use serde::{Deserialize, Serialize}; -use snafu::{ResultExt, Snafu}; use std::sync::Arc; const VERSION_HEADER: &str = "x-amz-version-id"; @@ -65,56 +64,56 @@ const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-amz-meta-"; const ALGORITHM: &str = "x-amz-checksum-algorithm"; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("Error performing DeleteObjects request: {}", source))] + #[error("Error performing DeleteObjects request: {}", source)] DeleteObjectsRequest { source: crate::client::retry::Error }, - #[snafu(display( + #[error( "DeleteObjects request failed for key {}: {} (code: {})", path, message, code - ))] + )] DeleteFailed { path: String, code: String, message: String, }, - #[snafu(display("Error getting DeleteObjects response body: {}", source))] + #[error("Error getting DeleteObjects response body: {}", source)] DeleteObjectsResponse { source: reqwest::Error }, - #[snafu(display("Got invalid DeleteObjects response: {}", source))] + #[error("Got invalid DeleteObjects response: {}", source)] InvalidDeleteObjectsResponse { source: Box, }, - #[snafu(display("Error performing list request: {}", source))] + #[error("Error performing list request: {}", source)] ListRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting list response body: {}", source))] + #[error("Error getting list response body: {}", source)] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Error getting create multipart response body: {}", source))] + #[error("Error getting create multipart response body: {}", source)] CreateMultipartResponseBody { source: reqwest::Error }, - #[snafu(display("Error performing complete multipart request: {}: {}", path, source))] + #[error("Error performing complete multipart request: {}: {}", path, source)] CompleteMultipartRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error getting complete multipart response body: {}", source))] + #[error("Error getting complete multipart response body: {}", source)] CompleteMultipartResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid list response: {}", source))] + #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Got invalid multipart response: {}", source))] + #[error("Got invalid multipart response: {}", source)] InvalidMultipartResponse { source: quick_xml::de::DeError }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: crate::client::header::Error, }, @@ -263,10 +262,15 @@ impl SessionCredential<'_> { } } -#[derive(Debug, Snafu)] 
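// The same snafu -> thiserror mapping recurs throughout this patch: display strings
// move from `#[snafu(display(...))]` to `#[error(...)]`, `#[snafu(context(false))]`
// becomes `#[error(transparent)]` with a `#[from]` source field, and call sites replace
// `.context(FooSnafu)` with `.map_err(|source| Error::Foo { source })`. A minimal
// sketch of the resulting shape (illustrative only, not part of the patch):
//
//     #[derive(Debug, thiserror::Error)]
//     enum ExampleError {
//         #[error("request failed: {source}")]
//         Request { source: std::io::Error },
//         #[error(transparent)]
//         Other(#[from] std::fmt::Error),
//     }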
+#[derive(Debug, thiserror::Error)] pub enum RequestError { - #[snafu(context(false))] - Generic { source: crate::Error }, + #[error(transparent)] + Generic { + #[from] + source: crate::Error, + }, + + #[error("Retry")] Retry { source: crate::client::retry::Error, path: String, @@ -426,12 +430,16 @@ impl<'a> Request<'a> { .payload(self.payload) .send() .await - .context(RetrySnafu { path }) + .map_err(|source| { + let path = path.into(); + RequestError::Retry { source, path } + }) } pub(crate) async fn do_put(self) -> Result { let response = self.send().await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } } @@ -535,10 +543,10 @@ impl S3Client { .with_aws_sigv4(credential.authorizer(), Some(digest.as_ref())) .send_retry(&self.config.retry_config) .await - .context(DeleteObjectsRequestSnafu {})? + .map_err(|source| Error::DeleteObjectsRequest { source })? .bytes() .await - .context(DeleteObjectsResponseSnafu {})?; + .map_err(|source| Error::DeleteObjectsResponse { source })?; let response: BatchDeleteResponse = quick_xml::de::from_reader(response.reader()).map_err(|err| { @@ -635,10 +643,10 @@ impl S3Client { .await? .bytes() .await - .context(CreateMultipartResponseBodySnafu)?; + .map_err(|source| Error::CreateMultipartResponseBody { source })?; - let response: InitiateMultipartUploadResult = - quick_xml::de::from_reader(response.reader()).context(InvalidMultipartResponseSnafu)?; + let response: InitiateMultipartUploadResult = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidMultipartResponse { source })?; Ok(response.upload_id) } @@ -683,14 +691,14 @@ impl S3Client { .map(|v| v.to_string()); let e_tag = match is_copy { - false => get_etag(response.headers()).context(MetadataSnafu)?, + false => get_etag(response.headers()).map_err(|source| Error::Metadata { source })?, true => { let response = response .bytes() .await - .context(CreateMultipartResponseBodySnafu)?; + .map_err(|source| Error::CreateMultipartResponseBody { source })?; let response: CopyPartResult = quick_xml::de::from_reader(response.reader()) - .context(InvalidMultipartResponseSnafu)?; + .map_err(|source| Error::InvalidMultipartResponse { source })?; response.e_tag } }; @@ -764,19 +772,21 @@ impl S3Client { .retry_error_body(true) .send() .await - .context(CompleteMultipartRequestSnafu { - path: location.as_ref(), + .map_err(|source| Error::CompleteMultipartRequest { + source, + path: location.as_ref().to_string(), })?; - let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; + let version = get_version(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?; let data = response .bytes() .await - .context(CompleteMultipartResponseBodySnafu)?; + .map_err(|source| Error::CompleteMultipartResponseBody { source })?; - let response: CompleteMultipartUploadResult = - quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; + let response: CompleteMultipartUploadResult = quick_xml::de::from_reader(data.reader()) + .map_err(|source| Error::InvalidMultipartResponse { source })?; Ok(PutResult { e_tag: Some(response.e_tag), @@ -884,13 +894,14 @@ impl ListClient for S3Client { .with_aws_sigv4(credential.authorizer(), None) .send_retry(&self.config.retry_config) .await - .context(ListRequestSnafu)? + .map_err(|source| Error::ListRequest { source })? 
.bytes() .await - .context(ListResponseBodySnafu)?; + .map_err(|source| Error::ListResponseBody { source })?; + + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidListResponse { source })?; - let mut response: ListResponse = - quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_continuation_token.take(); Ok((response.try_into()?, token)) diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index ee2f8e2ec953..9c74e1c6526a 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -29,23 +29,22 @@ use percent_encoding::utf8_percent_encode; use reqwest::header::{HeaderMap, HeaderValue, AUTHORIZATION}; use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; -use snafu::{ResultExt, Snafu}; use std::collections::BTreeMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::warn; use url::Url; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] #[allow(clippy::enum_variant_names)] enum Error { - #[snafu(display("Error performing CreateSession request: {source}"))] + #[error("Error performing CreateSession request: {source}")] CreateSessionRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting CreateSession response: {source}"))] + #[error("Error getting CreateSession response: {source}")] CreateSessionResponse { source: reqwest::Error }, - #[snafu(display("Invalid CreateSessionOutput response: {source}"))] + #[error("Invalid CreateSessionOutput response: {source}")] CreateSessionOutput { source: quick_xml::DeError }, } @@ -726,13 +725,13 @@ impl TokenProvider for SessionProvider { .with_aws_sigv4(Some(authorizer), None) .send_retry(retry) .await - .context(CreateSessionRequestSnafu)? + .map_err(|source| Error::CreateSessionRequest { source })? 
.bytes() .await - .context(CreateSessionResponseSnafu)?; + .map_err(|source| Error::CreateSessionResponse { source })?; - let resp: CreateSessionOutput = - quick_xml::de::from_reader(bytes.reader()).context(CreateSessionOutputSnafu)?; + let resp: CreateSessionOutput = quick_xml::de::from_reader(bytes.reader()) + .map_err(|source| Error::CreateSessionOutput { source })?; let creds = resp.credentials; Ok(TemporaryToken { diff --git a/object_store/src/aws/resolve.rs b/object_store/src/aws/resolve.rs index 25bc74f32f29..db899ea989e3 100644 --- a/object_store/src/aws/resolve.rs +++ b/object_store/src/aws/resolve.rs @@ -17,21 +17,20 @@ use crate::aws::STORE; use crate::{ClientOptions, Result}; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Bucket '{}' not found", bucket))] + #[error("Bucket '{}' not found", bucket)] BucketNotFound { bucket: String }, - #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] + #[error("Failed to resolve region for bucket '{}'", bucket)] ResolveRegion { bucket: String, source: reqwest::Error, }, - #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] + #[error("Failed to parse the region for bucket '{}'", bucket)] RegionParse { bucket: String }, } @@ -54,22 +53,23 @@ pub async fn resolve_bucket_region(bucket: &str, client_options: &ClientOptions) let client = client_options.client()?; - let response = client - .head(&endpoint) - .send() - .await - .context(ResolveRegionSnafu { bucket })?; + let response = client.head(&endpoint).send().await.map_err(|source| { + let bucket = bucket.into(); + Error::ResolveRegion { bucket, source } + })?; - ensure!( - response.status() != StatusCode::NOT_FOUND, - BucketNotFoundSnafu { bucket } - ); + if response.status() == StatusCode::NOT_FOUND { + let bucket = bucket.into(); + return Err(Error::BucketNotFound { bucket }.into()); + } let region = response .headers() .get("x-amz-bucket-region") .and_then(|x| x.to_str().ok()) - .context(RegionParseSnafu { bucket })?; + .ok_or_else(|| Error::RegionParse { + bucket: bucket.into(), + })?; Ok(region.to_string()) } diff --git a/object_store/src/azure/builder.rs b/object_store/src/azure/builder.rs index 08c9a232393d..f0572ebe6358 100644 --- a/object_store/src/azure/builder.rs +++ b/object_store/src/azure/builder.rs @@ -26,7 +26,6 @@ use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; use url::Url; @@ -45,48 +44,48 @@ const EMULATOR_ACCOUNT_KEY: &str = const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; /// A specialized `Error` for Azure builder-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. 
Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display( + #[error( "Unable parse emulator url {}={}, Error: {}", env_name, env_value, source - ))] + )] UnableToParseEmulatorUrl { env_name: String, env_value: String, source: url::ParseError, }, - #[snafu(display("Account must be specified"))] + #[error("Account must be specified")] MissingAccount {}, - #[snafu(display("Container name must be specified"))] + #[error("Container name must be specified")] MissingContainerName {}, - #[snafu(display( + #[error( "Unknown url scheme cannot be parsed into storage location: {}", scheme - ))] + )] UnknownUrlScheme { scheme: String }, - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + #[error("URL did not match any known pattern for scheme: {}", url)] UrlNotRecognised { url: String }, - #[snafu(display("Failed parsing an SAS key"))] + #[error("Failed parsing an SAS key")] DecodeSasKey { source: std::str::Utf8Error }, - #[snafu(display("Missing component in SAS query pair"))] + #[error("Missing component in SAS query pair")] MissingSasComponent {}, - #[snafu(display("Configuration key: '{}' is not known.", key))] + #[error("Configuration key: '{}' is not known.", key)] UnknownConfigurationKey { key: String }, } @@ -642,11 +641,17 @@ impl MicrosoftAzureBuilder { /// This is a separate member function to allow fallible computation to /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let parsed = Url::parse(url).map_err(|source| { + let url = url.into(); + Error::UnableToParseUrl { url, source } + })?; + + let host = parsed + .host_str() + .ok_or_else(|| Error::UrlNotRecognised { url: url.into() })?; let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), + true => Err(Error::UrlNotRecognised { url: url.into() }), false => Ok(s.to_string()), }; @@ -665,7 +670,7 @@ impl MicrosoftAzureBuilder { self.account_name = Some(validate(a)?); self.use_fabric_endpoint = true.into(); } else { - return Err(UrlNotRecognisedSnafu { url }.build().into()); + return Err(Error::UrlNotRecognised { url: url.into() }.into()); } } "https" => match host.split_once('.') { @@ -689,9 +694,12 @@ impl MicrosoftAzureBuilder { } self.use_fabric_endpoint = true.into(); } - _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), + _ => return Err(Error::UrlNotRecognised { url: url.into() }.into()), }, - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + scheme => { + let scheme = scheme.into(); + return Err(Error::UnknownUrlScheme { scheme }.into()); + } } Ok(()) } @@ -924,8 +932,10 @@ impl MicrosoftAzureBuilder { }, }; - let url = - Url::parse(&account_url).context(UnableToParseUrlSnafu { url: account_url })?; + let url = Url::parse(&account_url).map_err(|source| { + let url = account_url.clone(); + Error::UnableToParseUrl { url, source } + })?; let credential = if let Some(credential) = self.credentials { credential @@ -1030,10 +1040,13 @@ impl MicrosoftAzureBuilder { /// if present, otherwise falls back to default_url fn url_from_env(env_name: &str, default_url: &str) -> Result { let url = match std::env::var(env_name) { - Ok(env_value) => Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { - env_name, - env_value, - })?, + Ok(env_value) => { + 
Url::parse(&env_value).map_err(|source| Error::UnableToParseEmulatorUrl { + env_name: env_name.into(), + env_value, + source, + })? + } Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), }; Ok(url) @@ -1042,7 +1055,7 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result { fn split_sas(sas: &str) -> Result, Error> { let sas = percent_decode_str(sas) .decode_utf8() - .context(DecodeSasKeySnafu {})?; + .map_err(|source| Error::DecodeSasKey { source })?; let kv_str_pairs = sas .trim_start_matches('?') .split('&') diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 69ff39526bef..ea3a5faf3ad8 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -42,7 +42,6 @@ use reqwest::{ Client as ReqwestClient, Method, RequestBuilder, Response, }; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; @@ -60,84 +59,84 @@ static MS_CONTENT_LANGUAGE: HeaderName = HeaderName::from_static("x-ms-blob-cont static TAGS_HEADER: HeaderName = HeaderName::from_static("x-ms-tags"); /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("Error performing get request {}: {}", path, source))] + #[error("Error performing get request {}: {}", path, source)] GetRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing put request {}: {}", path, source))] + #[error("Error performing put request {}: {}", path, source)] PutRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing delete request {}: {}", path, source))] + #[error("Error performing delete request {}: {}", path, source)] DeleteRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing bulk delete request: {}", source))] + #[error("Error performing bulk delete request: {}", source)] BulkDeleteRequest { source: crate::client::retry::Error }, - #[snafu(display("Error receiving bulk delete request body: {}", source))] + #[error("Error receiving bulk delete request body: {}", source)] BulkDeleteRequestBody { source: reqwest::Error }, - #[snafu(display( + #[error( "Bulk delete request failed due to invalid input: {} (code: {})", reason, code - ))] + )] BulkDeleteRequestInvalidInput { code: String, reason: String }, - #[snafu(display("Got invalid bulk delete response: {}", reason))] + #[error("Got invalid bulk delete response: {}", reason)] InvalidBulkDeleteResponse { reason: String }, - #[snafu(display( + #[error( "Bulk delete request failed for key {}: {} (code: {})", path, reason, code - ))] + )] DeleteFailed { path: String, code: String, reason: String, }, - #[snafu(display("Error performing list request: {}", source))] + #[error("Error performing list request: {}", source)] ListRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting list response body: {}", source))] + #[error("Error getting list response body: {}", source)] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid list response: {}", source))] + #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: 
crate::client::header::Error, }, - #[snafu(display("ETag required for conditional update"))] + #[error("ETag required for conditional update")] MissingETag, - #[snafu(display("Error requesting user delegation key: {}", source))] + #[error("Error requesting user delegation key: {}", source)] DelegationKeyRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting user delegation key response body: {}", source))] + #[error("Error getting user delegation key response body: {}", source)] DelegationKeyResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid user delegation key response: {}", source))] + #[error("Got invalid user delegation key response: {}", source)] DelegationKeyResponse { source: quick_xml::de::DeError }, - #[snafu(display("Generating SAS keys with SAS tokens auth is not supported"))] + #[error("Generating SAS keys with SAS tokens auth is not supported")] SASforSASNotSupported, - #[snafu(display("Generating SAS keys while skipping signatures is not supported"))] + #[error("Generating SAS keys while skipping signatures is not supported")] SASwithSkipSignature, } @@ -268,8 +267,9 @@ impl<'a> PutRequest<'a> { .payload(Some(self.payload)) .send() .await - .context(PutRequestSnafu { - path: self.path.as_ref(), + .map_err(|source| { + let path = self.path.as_ref().into(); + Error::PutRequest { path, source } })?; Ok(response) @@ -544,13 +544,14 @@ impl AzureClient { PutMode::Overwrite => builder.idempotent(true), PutMode::Create => builder.header(&IF_NONE_MATCH, "*"), PutMode::Update(v) => { - let etag = v.e_tag.as_ref().context(MissingETagSnafu)?; + let etag = v.e_tag.as_ref().ok_or(Error::MissingETag)?; builder.header(&IF_MATCH, etag) } }; let response = builder.header(&BLOB_TYPE, "BlockBlob").send().await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } /// PUT a block @@ -595,7 +596,8 @@ impl AzureClient { .send() .await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } /// Make an Azure Delete request @@ -620,8 +622,9 @@ impl AzureClient { .sensitive(sensitive) .send() .await - .context(DeleteRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::DeleteRequest { source, path } })?; Ok(()) @@ -693,14 +696,14 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .context(BulkDeleteRequestSnafu {})?; + .map_err(|source| Error::BulkDeleteRequest { source })?; let boundary = parse_multipart_response_boundary(&batch_response)?; let batch_body = batch_response .bytes() .await - .context(BulkDeleteRequestBodySnafu {})?; + .map_err(|source| Error::BulkDeleteRequestBody { source })?; let results = parse_blob_batch_delete_body(batch_body, boundary, &paths).await?; @@ -780,13 +783,13 @@ impl AzureClient { .idempotent(true) .send() .await - .context(DelegationKeyRequestSnafu)? + .map_err(|source| Error::DelegationKeyRequest { source })? 
.bytes() .await - .context(DelegationKeyResponseBodySnafu)?; + .map_err(|source| Error::DelegationKeyResponseBody { source })?; - let response: UserDelegationKey = - quick_xml::de::from_reader(response.reader()).context(DelegationKeyResponseSnafu)?; + let response: UserDelegationKey = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::DelegationKeyResponse { source })?; Ok(response) } @@ -842,9 +845,11 @@ impl AzureClient { .sensitive(sensitive) .send() .await - .context(GetRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::GetRequest { source, path } })?; + Ok(response) } } @@ -900,8 +905,9 @@ impl GetClient for AzureClient { .sensitive(sensitive) .send() .await - .context(GetRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::GetRequest { source, path } })?; match response.headers().get("x-ms-resource-type") { @@ -962,13 +968,14 @@ impl ListClient for AzureClient { .sensitive(sensitive) .send() .await - .context(ListRequestSnafu)? + .map_err(|source| Error::ListRequest { source })? .bytes() .await - .context(ListResponseBodySnafu)?; + .map_err(|source| Error::ListResponseBody { source })?; + + let mut response: ListResultInternal = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidListResponse { source })?; - let mut response: ListResultInternal = - quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_marker.take(); Ok((to_list_result(response, prefix)?, token)) diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 2832eed72256..c9e6ac640b4a 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -32,7 +32,6 @@ use reqwest::header::{ }; use reqwest::{Client, Method, Request, RequestBuilder}; use serde::Deserialize; -use snafu::{ResultExt, Snafu}; use std::borrow::Cow; use std::collections::HashMap; use std::fmt::Debug; @@ -71,27 +70,27 @@ const AZURE_STORAGE_SCOPE: &str = "https://storage.azure.com/.default"; /// const AZURE_STORAGE_RESOURCE: &str = "https://storage.azure.com"; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Error performing token request: {}", source))] + #[error("Error performing token request: {}", source)] TokenRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting token response body: {}", source))] + #[error("Error getting token response body: {}", source)] TokenResponseBody { source: reqwest::Error }, - #[snafu(display("Error reading federated token file "))] + #[error("Error reading federated token file ")] FederatedTokenFile, - #[snafu(display("Invalid Access Key: {}", source))] + #[error("Invalid Access Key: {}", source)] InvalidAccessKey { source: base64::DecodeError }, - #[snafu(display("'az account get-access-token' command failed: {message}"))] + #[error("'az account get-access-token' command failed: {message}")] AzureCli { message: String }, - #[snafu(display("Failed to parse azure cli response: {source}"))] + #[error("Failed to parse azure cli response: {source}")] AzureCliResponse { source: serde_json::Error }, - #[snafu(display("Generating SAS keys with SAS tokens auth is not supported"))] + #[error("Generating SAS keys with SAS tokens auth is not supported")] SASforSASNotSupported, } @@ -113,7 +112,10 @@ pub struct AzureAccessKey(Vec); impl AzureAccessKey { /// Create a new 
[`AzureAccessKey`], checking it for validity pub fn try_new(key: &str) -> Result { - let key = BASE64_STANDARD.decode(key).context(InvalidAccessKeySnafu)?; + let key = BASE64_STANDARD + .decode(key) + .map_err(|source| Error::InvalidAccessKey { source })?; + Ok(Self(key)) } } @@ -636,10 +638,10 @@ impl TokenProvider for ClientSecretOAuthProvider { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), @@ -744,10 +746,10 @@ impl TokenProvider for ImdsManagedIdentityProvider { let response: ImdsTokenResponse = builder .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), @@ -820,10 +822,10 @@ impl TokenProvider for WorkloadIdentityOAuthProvider { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), @@ -900,7 +902,8 @@ impl AzureCliCredential { })?; let token_response = serde_json::from_str::(output) - .context(AzureCliResponseSnafu)?; + .map_err(|source| Error::AzureCliResponse { source })?; + if !token_response.token_type.eq_ignore_ascii_case("bearer") { return Err(Error::AzureCli { message: format!( @@ -1033,10 +1036,10 @@ impl TokenProvider for FabricTokenOAuthProvider { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? 
.text() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; let exp_in = Self::validate_and_get_expiry(&access_token) .map_or(3600, |expiry| expiry - Self::get_current_timestamp()); Ok(TemporaryToken { diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index 5dd62cbece5a..57aca8956452 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -29,7 +29,6 @@ use hyper::header::{ use hyper::StatusCode; use reqwest::header::ToStrError; use reqwest::Response; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; /// A client that can perform a get request #[async_trait] @@ -95,49 +94,51 @@ impl ContentRange { } /// A specialized `Error` for get-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum GetResultError { - #[snafu(context(false))] + #[error(transparent)] Header { + #[from] source: crate::client::header::Error, }, - #[snafu(transparent)] + #[error(transparent)] InvalidRangeRequest { + #[from] source: crate::util::InvalidGetRange, }, - #[snafu(display("Received non-partial response when range requested"))] + #[error("Received non-partial response when range requested")] NotPartial, - #[snafu(display("Content-Range header not present in partial response"))] + #[error("Content-Range header not present in partial response")] NoContentRange, - #[snafu(display("Failed to parse value for CONTENT_RANGE header: \"{value}\""))] + #[error("Failed to parse value for CONTENT_RANGE header: \"{value}\"")] ParseContentRange { value: String }, - #[snafu(display("Content-Range header contained non UTF-8 characters"))] + #[error("Content-Range header contained non UTF-8 characters")] InvalidContentRange { source: ToStrError }, - #[snafu(display("Cache-Control header contained non UTF-8 characters"))] + #[error("Cache-Control header contained non UTF-8 characters")] InvalidCacheControl { source: ToStrError }, - #[snafu(display("Content-Disposition header contained non UTF-8 characters"))] + #[error("Content-Disposition header contained non UTF-8 characters")] InvalidContentDisposition { source: ToStrError }, - #[snafu(display("Content-Encoding header contained non UTF-8 characters"))] + #[error("Content-Encoding header contained non UTF-8 characters")] InvalidContentEncoding { source: ToStrError }, - #[snafu(display("Content-Language header contained non UTF-8 characters"))] + #[error("Content-Language header contained non UTF-8 characters")] InvalidContentLanguage { source: ToStrError }, - #[snafu(display("Content-Type header contained non UTF-8 characters"))] + #[error("Content-Type header contained non UTF-8 characters")] InvalidContentType { source: ToStrError }, - #[snafu(display("Metadata value for \"{key:?}\" contained non UTF-8 characters"))] + #[error("Metadata value for \"{key:?}\" contained non UTF-8 characters")] InvalidMetadata { key: String }, - #[snafu(display("Requested {expected:?}, got {actual:?}"))] + #[error("Requested {expected:?}, got {actual:?}")] UnexpectedRange { expected: Range, actual: Range, @@ -153,17 +154,24 @@ fn get_result( // ensure that we receive the range we asked for let range = if let Some(expected) = range { - ensure!( - response.status() == StatusCode::PARTIAL_CONTENT, - NotPartialSnafu - ); + if response.status() != StatusCode::PARTIAL_CONTENT { + return Err(GetResultError::NotPartial); + } + let val = response .headers() .get(CONTENT_RANGE) - .context(NoContentRangeSnafu)?; + .ok_or(GetResultError::NoContentRange)?; + + let value = val + 
.to_str() + .map_err(|source| GetResultError::InvalidContentRange { source })?; + + let value = ContentRange::from_str(value).ok_or_else(|| { + let value = value.into(); + GetResultError::ParseContentRange { value } + })?; - let value = val.to_str().context(InvalidContentRangeSnafu)?; - let value = ContentRange::from_str(value).context(ParseContentRangeSnafu { value })?; let actual = value.range; // Update size to reflect full size of object (#5272) @@ -171,10 +179,9 @@ fn get_result( let expected = expected.as_range(meta.size)?; - ensure!( - actual == expected, - UnexpectedRangeSnafu { expected, actual } - ); + if actual != expected { + return Err(GetResultError::UnexpectedRange { expected, actual }); + } actual } else { @@ -182,11 +189,11 @@ fn get_result( }; macro_rules! parse_attributes { - ($headers:expr, $(($header:expr, $attr:expr, $err:expr)),*) => {{ + ($headers:expr, $(($header:expr, $attr:expr, $map_err:expr)),*) => {{ let mut attributes = Attributes::new(); $( if let Some(x) = $headers.get($header) { - let x = x.to_str().context($err)?; + let x = x.to_str().map_err($map_err)?; attributes.insert($attr, x.to_string().into()); } )* @@ -196,31 +203,23 @@ fn get_result( let mut attributes = parse_attributes!( response.headers(), - ( - CACHE_CONTROL, - Attribute::CacheControl, - InvalidCacheControlSnafu - ), + (CACHE_CONTROL, Attribute::CacheControl, |source| { + GetResultError::InvalidCacheControl { source } + }), ( CONTENT_DISPOSITION, Attribute::ContentDisposition, - InvalidContentDispositionSnafu - ), - ( - CONTENT_ENCODING, - Attribute::ContentEncoding, - InvalidContentEncodingSnafu + |source| GetResultError::InvalidContentDisposition { source } ), - ( - CONTENT_LANGUAGE, - Attribute::ContentLanguage, - InvalidContentLanguageSnafu - ), - ( - CONTENT_TYPE, - Attribute::ContentType, - InvalidContentTypeSnafu - ) + (CONTENT_ENCODING, Attribute::ContentEncoding, |source| { + GetResultError::InvalidContentEncoding { source } + }), + (CONTENT_LANGUAGE, Attribute::ContentLanguage, |source| { + GetResultError::InvalidContentLanguage { source } + }), + (CONTENT_TYPE, Attribute::ContentType, |source| { + GetResultError::InvalidContentType { source } + }) ); // Add attributes that match the user-defined metadata prefix (e.g. 
x-amz-meta-) diff --git a/object_store/src/client/header.rs b/object_store/src/client/header.rs index 07c04c11945a..db06da6345d5 100644 --- a/object_store/src/client/header.rs +++ b/object_store/src/client/header.rs @@ -22,7 +22,6 @@ use crate::ObjectMeta; use chrono::{DateTime, TimeZone, Utc}; use hyper::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; use hyper::HeaderMap; -use snafu::{OptionExt, ResultExt, Snafu}; #[derive(Debug, Copy, Clone)] /// Configuration for header extraction @@ -44,27 +43,27 @@ pub(crate) struct HeaderConfig { pub user_defined_metadata_prefix: Option<&'static str>, } -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("ETag Header missing from response"))] + #[error("ETag Header missing from response")] MissingEtag, - #[snafu(display("Received header containing non-ASCII data"))] + #[error("Received header containing non-ASCII data")] BadHeader { source: reqwest::header::ToStrError }, - #[snafu(display("Last-Modified Header missing from response"))] + #[error("Last-Modified Header missing from response")] MissingLastModified, - #[snafu(display("Content-Length Header missing from response"))] + #[error("Content-Length Header missing from response")] MissingContentLength, - #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] + #[error("Invalid last modified '{}': {}", last_modified, source)] InvalidLastModified { last_modified: String, source: chrono::ParseError, }, - #[snafu(display("Invalid content length '{}': {}", content_length, source))] + #[error("Invalid content length '{}': {}", content_length, source)] InvalidContentLength { content_length: String, source: std::num::ParseIntError, @@ -86,7 +85,11 @@ pub(crate) fn get_put_result( #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub(crate) fn get_version(headers: &HeaderMap, version: &str) -> Result, Error> { Ok(match headers.get(version) { - Some(x) => Some(x.to_str().context(BadHeaderSnafu)?.to_string()), + Some(x) => Some( + x.to_str() + .map_err(|source| Error::BadHeader { source })? + .to_string(), + ), None => None, }) } @@ -94,7 +97,10 @@ pub(crate) fn get_version(headers: &HeaderMap, version: &str) -> Result Result { let e_tag = headers.get(ETAG).ok_or(Error::MissingEtag)?; - Ok(e_tag.to_str().context(BadHeaderSnafu)?.to_string()) + Ok(e_tag + .to_str() + .map_err(|source| Error::BadHeader { source })? + .to_string()) } /// Extracts [`ObjectMeta`] from the provided [`HeaderMap`] @@ -105,9 +111,15 @@ pub(crate) fn header_meta( ) -> Result { let last_modified = match headers.get(LAST_MODIFIED) { Some(last_modified) => { - let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; + let last_modified = last_modified + .to_str() + .map_err(|source| Error::BadHeader { source })?; + DateTime::parse_from_rfc2822(last_modified) - .context(InvalidLastModifiedSnafu { last_modified })? + .map_err(|source| Error::InvalidLastModified { + last_modified: last_modified.into(), + source, + })? 
.with_timezone(&Utc) } None if cfg.last_modified_required => return Err(Error::MissingLastModified), @@ -122,15 +134,25 @@ pub(crate) fn header_meta( let content_length = headers .get(CONTENT_LENGTH) - .context(MissingContentLengthSnafu)?; + .ok_or(Error::MissingContentLength)?; + + let content_length = content_length + .to_str() + .map_err(|source| Error::BadHeader { source })?; - let content_length = content_length.to_str().context(BadHeaderSnafu)?; let size = content_length .parse() - .context(InvalidContentLengthSnafu { content_length })?; + .map_err(|source| Error::InvalidContentLength { + content_length: content_length.into(), + source, + })?; let version = match cfg.version_header.and_then(|h| headers.get(h)) { - Some(v) => Some(v.to_str().context(BadHeaderSnafu)?.to_string()), + Some(v) => Some( + v.to_str() + .map_err(|source| Error::BadHeader { source })? + .to_string(), + ), None => None, }; diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index a8a8e58de4d0..8938b0861cca 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -22,30 +22,29 @@ use crate::PutPayload; use futures::future::BoxFuture; use reqwest::header::LOCATION; use reqwest::{Client, Request, Response, StatusCode}; -use snafu::Error as SnafuError; -use snafu::Snafu; +use std::error::Error as StdError; use std::time::{Duration, Instant}; use tracing::info; /// Retry request error -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Received redirect without LOCATION, this normally indicates an incorrectly configured region"))] + #[error("Received redirect without LOCATION, this normally indicates an incorrectly configured region")] BareRedirect, - #[snafu(display("Server error, body contains Error, with status {status}: {}", body.as_deref().unwrap_or("No Body")))] + #[error("Server error, body contains Error, with status {status}: {}", body.as_deref().unwrap_or("No Body"))] Server { status: StatusCode, body: Option, }, - #[snafu(display("Client error with status {status}: {}", body.as_deref().unwrap_or("No Body")))] + #[error("Client error with status {status}: {}", body.as_deref().unwrap_or("No Body"))] Client { status: StatusCode, body: Option, }, - #[snafu(display("Error after {retries} retries in {elapsed:?}, max_retries:{max_retries}, retry_timeout:{retry_timeout:?}, source:{source}"))] + #[error("Error after {retries} retries in {elapsed:?}, max_retries:{max_retries}, retry_timeout:{retry_timeout:?}, source:{source}")] Reqwest { retries: usize, max_retries: usize, diff --git a/object_store/src/delimited.rs b/object_store/src/delimited.rs index 96f88bf41ff7..5b11a0bf7eb1 100644 --- a/object_store/src/delimited.rs +++ b/object_store/src/delimited.rs @@ -21,16 +21,15 @@ use std::collections::VecDeque; use bytes::Bytes; use futures::{Stream, StreamExt}; -use snafu::{ensure, Snafu}; use super::Result; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("encountered unterminated string"))] + #[error("encountered unterminated string")] UnterminatedString, - #[snafu(display("encountered trailing escape character"))] + #[error("encountered trailing escape character")] TrailingEscape, } @@ -125,8 +124,12 @@ impl LineDelimiter { /// Returns `true` if there is no remaining data to be read fn finish(&mut self) -> Result { if !self.remainder.is_empty() { - ensure!(!self.is_quote, UnterminatedStringSnafu); - ensure!(!self.is_escape, TrailingEscapeSnafu); + if self.is_quote { + 
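            // snafu's `ensure!` macro has no thiserror equivalent, so the checks are
            // written out explicitly and the error is propagated with `?`.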
Err(Error::UnterminatedString)?; + } + if self.is_escape { + Err(Error::TrailingEscape)?; + } self.complete .push_back(Bytes::from(std::mem::take(&mut self.remainder))) diff --git a/object_store/src/gcp/builder.rs b/object_store/src/gcp/builder.rs index fac923c4b9a0..cc5c1e1a0745 100644 --- a/object_store/src/gcp/builder.rs +++ b/object_store/src/gcp/builder.rs @@ -27,7 +27,6 @@ use crate::gcp::{ }; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -37,33 +36,33 @@ use super::credential::{AuthorizedUserSigningCredentials, InstanceSigningCredent const TOKEN_MIN_TTL: Duration = Duration::from_secs(4 * 60); -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Missing bucket name"))] + #[error("Missing bucket name")] MissingBucketName {}, - #[snafu(display("One of service account path or service account key may be provided."))] + #[error("One of service account path or service account key may be provided.")] ServiceAccountPathAndKeyProvided, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display( + #[error( "Unknown url scheme cannot be parsed into storage location: {}", scheme - ))] + )] UnknownUrlScheme { scheme: String }, - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + #[error("URL did not match any known pattern for scheme: {}", url)] UrlNotRecognised { url: String }, - #[snafu(display("Configuration key: '{}' is not known.", key))] + #[error("Configuration key: '{}' is not known.", key)] UnknownConfigurationKey { key: String }, - #[snafu(display("GCP credential error: {}", source))] + #[error("GCP credential error: {}", source)] Credential { source: credential::Error }, } @@ -319,12 +318,21 @@ impl GoogleCloudStorageBuilder { /// This is a separate member function to allow fallible computation to /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let parsed = Url::parse(url).map_err(|source| Error::UnableToParseUrl { + source, + url: url.to_string(), + })?; + + let host = parsed.host_str().ok_or_else(|| Error::UrlNotRecognised { + url: url.to_string(), + })?; match parsed.scheme() { "gs" => self.bucket_name = Some(host.to_string()), - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + scheme => { + let scheme = scheme.to_string(); + return Err(Error::UnknownUrlScheme { scheme }.into()); + } } Ok(()) } @@ -428,12 +436,14 @@ impl GoogleCloudStorageBuilder { // First try to initialize from the service account information. let service_account_credentials = match (self.service_account_path, self.service_account_key) { - (Some(path), None) => { - Some(ServiceAccountCredentials::from_file(path).context(CredentialSnafu)?) - } - (None, Some(key)) => { - Some(ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?) 
- } + (Some(path), None) => Some( + ServiceAccountCredentials::from_file(path) + .map_err(|source| Error::Credential { source })?, + ), + (None, Some(key)) => Some( + ServiceAccountCredentials::from_key(&key) + .map_err(|source| Error::Credential { source })?, + ), (None, None) => None, (Some(_), Some(_)) => return Err(Error::ServiceAccountPathAndKeyProvided.into()), }; diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs index ccc9c341f2fe..1928d13b4739 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -44,7 +44,6 @@ use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::header::HeaderName; use reqwest::{Client, Method, RequestBuilder, Response, StatusCode}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::sync::Arc; const VERSION_HEADER: &str = "x-goog-generation"; @@ -53,62 +52,62 @@ const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-goog-meta-"; static VERSION_MATCH: HeaderName = HeaderName::from_static("x-goog-if-generation-match"); -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Error performing list request: {}", source))] + #[error("Error performing list request: {}", source)] ListRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting list response body: {}", source))] + #[error("Error getting list response body: {}", source)] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid list response: {}", source))] + #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Error performing get request {}: {}", path, source))] + #[error("Error performing get request {}: {}", path, source)] GetRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing request {}: {}", path, source))] + #[error("Error performing request {}: {}", path, source)] Request { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error getting put response body: {}", source))] + #[error("Error getting put response body: {}", source)] PutResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid put request: {}", source))] + #[error("Got invalid put request: {}", source)] InvalidPutRequest { source: quick_xml::se::SeError }, - #[snafu(display("Got invalid put response: {}", source))] + #[error("Got invalid put response: {}", source)] InvalidPutResponse { source: quick_xml::de::DeError }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: crate::client::header::Error, }, - #[snafu(display("Version required for conditional update"))] + #[error("Version required for conditional update")] MissingVersion, - #[snafu(display("Error performing complete multipart request: {}", source))] + #[error("Error performing complete multipart request: {}", source)] CompleteMultipartRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting complete multipart response body: {}", source))] + #[error("Error getting complete multipart response body: {}", source)] CompleteMultipartResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid multipart response: {}", source))] + #[error("Got invalid multipart response: {}", source)] InvalidMultipartResponse { source: quick_xml::de::DeError }, - #[snafu(display("Error signing blob: {}", 
source))] + #[error("Error signing blob: {}", source)] SignBlobRequest { source: crate::client::retry::Error }, - #[snafu(display("Got invalid signing blob response: {}", source))] + #[error("Got invalid signing blob response: {}", source)] InvalidSignBlobResponse { source: reqwest::Error }, - #[snafu(display("Got invalid signing blob signature: {}", source))] + #[error("Got invalid signing blob signature: {}", source)] InvalidSignBlobSignature { source: base64::DecodeError }, } @@ -236,15 +235,17 @@ impl<'a> Request<'a> { .payload(self.payload) .send() .await - .context(RequestSnafu { - path: self.path.as_ref(), + .map_err(|source| { + let path = self.path.as_ref().into(); + Error::Request { source, path } })?; Ok(resp) } async fn do_put(self) -> Result { let response = self.send().await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } } @@ -336,17 +337,17 @@ impl GoogleCloudStorageClient { .idempotent(true) .send() .await - .context(SignBlobRequestSnafu)?; + .map_err(|source| Error::SignBlobRequest { source })?; //If successful, the signature is returned in the signedBlob field in the response. let response = response .json::() .await - .context(InvalidSignBlobResponseSnafu)?; + .map_err(|source| Error::InvalidSignBlobResponse { source })?; let signed_blob = BASE64_STANDARD .decode(response.signed_blob) - .context(InvalidSignBlobSignatureSnafu)?; + .map_err(|source| Error::InvalidSignBlobSignature { source })?; Ok(hex_encode(&signed_blob)) } @@ -389,7 +390,7 @@ impl GoogleCloudStorageClient { PutMode::Overwrite => builder.idempotent(true), PutMode::Create => builder.header(&VERSION_MATCH, "0"), PutMode::Update(v) => { - let etag = v.version.as_ref().context(MissingVersionSnafu)?; + let etag = v.version.as_ref().ok_or(Error::MissingVersion)?; builder.header(&VERSION_MATCH, etag) } }; @@ -443,9 +444,14 @@ impl GoogleCloudStorageClient { .send() .await?; - let data = response.bytes().await.context(PutResponseBodySnafu)?; + let data = response + .bytes() + .await + .map_err(|source| Error::PutResponseBody { source })?; + let result: InitiateMultipartUploadResult = - quick_xml::de::from_reader(data.as_ref().reader()).context(InvalidPutResponseSnafu)?; + quick_xml::de::from_reader(data.as_ref().reader()) + .map_err(|source| Error::InvalidPutResponse { source })?; Ok(result.upload_id) } @@ -467,8 +473,9 @@ impl GoogleCloudStorageClient { .query(&[("uploadId", multipart_id)]) .send_retry(&self.config.retry_config) .await - .context(RequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::Request { source, path } })?; Ok(()) @@ -498,7 +505,7 @@ impl GoogleCloudStorageClient { let credential = self.get_credential().await?; let data = quick_xml::se::to_string(&upload_info) - .context(InvalidPutRequestSnafu)? + .map_err(|source| Error::InvalidPutRequest { source })? 
// We cannot disable the escaping that transforms "/" to ""e;" :( // https://github.com/tafia/quick-xml/issues/362 // https://github.com/tafia/quick-xml/issues/350 @@ -514,17 +521,18 @@ impl GoogleCloudStorageClient { .idempotent(true) .send() .await - .context(CompleteMultipartRequestSnafu)?; + .map_err(|source| Error::CompleteMultipartRequest { source })?; - let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; + let version = get_version(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?; let data = response .bytes() .await - .context(CompleteMultipartResponseBodySnafu)?; + .map_err(|source| Error::CompleteMultipartResponseBody { source })?; - let response: CompleteMultipartUploadResult = - quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; + let response: CompleteMultipartUploadResult = quick_xml::de::from_reader(data.reader()) + .map_err(|source| Error::InvalidMultipartResponse { source })?; Ok(PutResult { e_tag: Some(response.e_tag), @@ -615,8 +623,9 @@ impl GetClient for GoogleCloudStorageClient { .with_get_options(options) .send_retry(&self.config.retry_config) .await - .context(GetRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::GetRequest { source, path } })?; Ok(response) @@ -665,13 +674,13 @@ impl ListClient for GoogleCloudStorageClient { .bearer_auth(&credential.bearer) .send_retry(&self.config.retry_config) .await - .context(ListRequestSnafu)? + .map_err(|source| Error::ListRequest { source })? .bytes() .await - .context(ListResponseBodySnafu)?; + .map_err(|source| Error::ListResponseBody { source })?; - let mut response: ListResponse = - quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidListResponse { source })?; let token = response.next_continuation_token.take(); Ok((response.try_into()?, token)) diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index 155a80b343b2..4b21ad1d3eab 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -33,7 +33,6 @@ use percent_encoding::utf8_percent_encode; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use serde::Deserialize; -use snafu::{ResultExt, Snafu}; use std::collections::BTreeMap; use std::env; use std::fs::File; @@ -54,36 +53,39 @@ const DEFAULT_GCS_SIGN_BLOB_HOST: &str = "storage.googleapis.com"; const DEFAULT_METADATA_HOST: &str = "metadata.google.internal"; const DEFAULT_METADATA_IP: &str = "169.254.169.254"; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Unable to open service account file from {}: {}", path.display(), source))] + #[error("Unable to open service account file from {}: {}", path.display(), source)] OpenCredentials { source: std::io::Error, path: PathBuf, }, - #[snafu(display("Unable to decode service account file: {}", source))] + #[error("Unable to decode service account file: {}", source)] DecodeCredentials { source: serde_json::Error }, - #[snafu(display("No RSA key found in pem file"))] + #[error("No RSA key found in pem file")] MissingKey, - #[snafu(display("Invalid RSA key: {}", source), context(false))] - InvalidKey { source: ring::error::KeyRejected }, + #[error("Invalid RSA key: {}", source)] + InvalidKey { + #[from] + source: ring::error::KeyRejected, + }, - 
#[snafu(display("Error signing: {}", source))] + #[error("Error signing: {}", source)] Sign { source: ring::error::Unspecified }, - #[snafu(display("Error encoding jwt payload: {}", source))] + #[error("Error encoding jwt payload: {}", source)] Encode { source: serde_json::Error }, - #[snafu(display("Unsupported key encoding: {}", encoding))] + #[error("Unsupported key encoding: {}", encoding)] UnsupportedKey { encoding: String }, - #[snafu(display("Error performing token request: {}", source))] + #[error("Error performing token request: {}", source)] TokenRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting token response body: {}", source))] + #[error("Error getting token response body: {}", source)] TokenResponseBody { source: reqwest::Error }, } @@ -153,7 +155,7 @@ impl ServiceAccountKey { string_to_sign.as_bytes(), &mut signature, ) - .context(SignSnafu)?; + .map_err(|source| Error::Sign { source })?; Ok(hex_encode(&signature)) } @@ -289,7 +291,7 @@ impl TokenProvider for SelfSignedJwt { message.as_bytes(), &mut sig_bytes, ) - .context(SignSnafu)?; + .map_err(|source| Error::Sign { source })?; let signature = BASE64_URL_SAFE_NO_PAD.encode(sig_bytes); let bearer = [message, signature].join("."); @@ -305,11 +307,12 @@ fn read_credentials_file(service_account_path: impl AsRef) - where T: serde::de::DeserializeOwned, { - let file = File::open(&service_account_path).context(OpenCredentialsSnafu { - path: service_account_path.as_ref().to_owned(), + let file = File::open(&service_account_path).map_err(|source| { + let path = service_account_path.as_ref().to_owned(); + Error::OpenCredentials { source, path } })?; let reader = BufReader::new(file); - serde_json::from_reader(reader).context(DecodeCredentialsSnafu) + serde_json::from_reader(reader).map_err(|source| Error::DecodeCredentials { source }) } /// A deserialized `service-account-********.json`-file. @@ -341,7 +344,7 @@ impl ServiceAccountCredentials { /// Create a new [`ServiceAccountCredentials`] from a string. pub(crate) fn from_key(key: &str) -> Result { - serde_json::from_str(key).context(DecodeCredentialsSnafu) + serde_json::from_str(key).map_err(|source| Error::DecodeCredentials { source }) } /// Create a [`SelfSignedJwt`] from this credentials struct. @@ -380,7 +383,7 @@ fn seconds_since_epoch() -> u64 { } fn b64_encode_obj(obj: &T) -> Result { - let string = serde_json::to_string(obj).context(EncodeSnafu)?; + let string = serde_json::to_string(obj).map_err(|source| Error::Encode { source })?; Ok(BASE64_URL_SAFE_NO_PAD.encode(string)) } @@ -404,10 +407,10 @@ async fn make_metadata_request( .query(&[("audience", "https://www.googleapis.com/oauth2/v4/token")]) .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(response) } @@ -467,10 +470,10 @@ async fn make_metadata_request_for_email( .header("Metadata-Flavor", "Google") .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .text() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(response) } @@ -608,10 +611,10 @@ impl AuthorizedUserSigningCredentials { .query(&[("access_token", &self.credential.refresh_token)]) .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? 
.json::() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(response.email) } @@ -659,10 +662,10 @@ impl TokenProvider for AuthorizedUserCredentials { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json::() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(GcpCredential { diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index eeb7e5694228..41e6464c1999 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -32,42 +32,41 @@ use hyper::header::{ use percent_encoding::percent_decode_str; use reqwest::{Method, Response, StatusCode}; use serde::Deserialize; -use snafu::{OptionExt, ResultExt, Snafu}; use url::Url; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Request error: {}", source))] + #[error("Request error: {}", source)] Request { source: retry::Error }, - #[snafu(display("Request error: {}", source))] + #[error("Request error: {}", source)] Reqwest { source: reqwest::Error }, - #[snafu(display("Range request not supported by {}", href))] + #[error("Range request not supported by {}", href)] RangeNotSupported { href: String }, - #[snafu(display("Error decoding PROPFIND response: {}", source))] + #[error("Error decoding PROPFIND response: {}", source)] InvalidPropFind { source: quick_xml::de::DeError }, - #[snafu(display("Missing content size for {}", href))] + #[error("Missing content size for {}", href)] MissingSize { href: String }, - #[snafu(display("Error getting properties of \"{}\" got \"{}\"", href, status))] + #[error("Error getting properties of \"{}\" got \"{}\"", href, status)] PropStatus { href: String, status: String }, - #[snafu(display("Failed to parse href \"{}\": {}", href, source))] + #[error("Failed to parse href \"{}\": {}", href, source)] InvalidHref { href: String, source: url::ParseError, }, - #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] + #[error("Path \"{}\" contained non-unicode characters: {}", path, source)] NonUnicode { path: String, source: std::str::Utf8Error, }, - #[snafu(display("Encountered invalid path \"{}\": {}", path, source))] + #[error("Encountered invalid path \"{}\": {}", path, source)] InvalidPath { path: String, source: crate::path::Error, @@ -129,7 +128,7 @@ impl Client { .request(method, url) .send_retry(&self.retry_config) .await - .context(RequestSnafu)?; + .map_err(|source| Error::Request { source })?; Ok(()) } @@ -236,7 +235,10 @@ impl Client { .await; let response = match result { - Ok(result) => result.bytes().await.context(ReqwestSnafu)?, + Ok(result) => result + .bytes() + .await + .map_err(|source| Error::Reqwest { source })?, Err(e) if matches!(e.status(), Some(StatusCode::NOT_FOUND)) => { return match depth { "0" => { @@ -255,7 +257,9 @@ impl Client { Err(source) => return Err(Error::Request { source }.into()), }; - let status = quick_xml::de::from_reader(response.reader()).context(InvalidPropFindSnafu)?; + let status = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidPropFind { source })?; + Ok(status) } @@ -397,14 +401,23 @@ impl MultiStatusResponse { let url = Url::options() .base_url(Some(base_url)) .parse(&self.href) - .context(InvalidHrefSnafu { href: &self.href })?; + .map_err(|source| Error::InvalidHref { + href: self.href.clone(), + 
source, + })?; // Reverse any percent encoding let path = percent_decode_str(url.path()) .decode_utf8() - .context(NonUnicodeSnafu { path: url.path() })?; + .map_err(|source| Error::NonUnicode { + path: url.path().into(), + source, + })?; - Ok(Path::parse(path.as_ref()).context(InvalidPathSnafu { path })?) + Ok(Path::parse(path.as_ref()).map_err(|source| { + let path = path.into(); + Error::InvalidPath { path, source } + })?) } fn size(&self) -> Result { @@ -412,7 +425,10 @@ impl MultiStatusResponse { .prop_stat .prop .content_length - .context(MissingSizeSnafu { href: &self.href })?; + .ok_or_else(|| Error::MissingSize { + href: self.href.clone(), + })?; + Ok(size) } diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 4b1c927e74f5..417f72856722 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -35,7 +35,6 @@ use async_trait::async_trait; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; -use snafu::{OptionExt, ResultExt, Snafu}; use url::Url; use crate::client::get::GetClientExt; @@ -49,18 +48,18 @@ use crate::{ mod client; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Must specify a URL"))] + #[error("Must specify a URL")] MissingUrl, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: crate::client::header::Error, }, @@ -235,8 +234,8 @@ impl HttpBuilder { /// Build an [`HttpStore`] with the configured options pub fn build(self) -> Result { - let url = self.url.context(MissingUrlSnafu)?; - let parsed = Url::parse(&url).context(UnableToParseUrlSnafu { url })?; + let url = self.url.ok_or(Error::MissingUrl)?; + let parsed = Url::parse(&url).map_err(|source| Error::UnableToParseUrl { url, source })?; Ok(HttpStore { client: Client::new(parsed, self.client_options, self.retry_config)?, diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 6f5733226922..987ffacc6e49 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -566,7 +566,6 @@ use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; -use snafu::Snafu; use std::fmt::{Debug, Formatter}; #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use std::io::{Read, Seek, SeekFrom}; @@ -1229,11 +1228,11 @@ pub struct PutResult { pub type Result = std::result::Result; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] #[non_exhaustive] pub enum Error { /// A fallback error type when no variant matches - #[snafu(display("Generic {} error: {}", store, source))] + #[error("Generic {} error: {}", store, source)] Generic { /// The store this error originated from store: &'static str, @@ -1242,7 +1241,7 @@ pub enum Error { }, /// Error when the object is not found at given location - #[snafu(display("Object at location {} not found: {}", path, source))] + #[error("Object at location {} not found: {}", path, source)] NotFound { /// The path to file path: String, @@ -1251,31 +1250,30 @@ pub enum Error { }, /// Error for invalid path - #[snafu( - display("Encountered object with invalid path: {}", source), - 
context(false) - )] + #[error("Encountered object with invalid path: {}", source)] InvalidPath { /// The wrapped error + #[from] source: path::Error, }, /// Error when `tokio::spawn` failed - #[snafu(display("Error joining spawned task: {}", source), context(false))] + #[error("Error joining spawned task: {}", source)] JoinError { /// The wrapped error + #[from] source: tokio::task::JoinError, }, /// Error when the attempted operation is not supported - #[snafu(display("Operation not supported: {}", source))] + #[error("Operation not supported: {}", source)] NotSupported { /// The wrapped error source: Box, }, /// Error when the object already exists - #[snafu(display("Object at location {} already exists: {}", path, source))] + #[error("Object at location {} already exists: {}", path, source)] AlreadyExists { /// The path to the path: String, @@ -1284,7 +1282,7 @@ pub enum Error { }, /// Error when the required conditions failed for the operation - #[snafu(display("Request precondition failure for path {}: {}", path, source))] + #[error("Request precondition failure for path {}: {}", path, source)] Precondition { /// The path to the file path: String, @@ -1293,7 +1291,7 @@ pub enum Error { }, /// Error when the object at the location isn't modified - #[snafu(display("Object at location {} not modified: {}", path, source))] + #[error("Object at location {} not modified: {}", path, source)] NotModified { /// The path to the file path: String, @@ -1302,16 +1300,16 @@ pub enum Error { }, /// Error when an operation is not implemented - #[snafu(display("Operation not yet implemented."))] + #[error("Operation not yet implemented.")] NotImplemented, /// Error when the used credentials don't have enough permission /// to perform the requested operation - #[snafu(display( + #[error( "The operation lacked the necessary privileges to complete for path {}: {}", path, source - ))] + )] PermissionDenied { /// The path to the file path: String, @@ -1320,11 +1318,11 @@ pub enum Error { }, /// Error when the used credentials lack valid authentication - #[snafu(display( + #[error( "The operation lacked valid authentication credentials for path {}: {}", path, source - ))] + )] Unauthenticated { /// The path to the file path: String, @@ -1333,7 +1331,7 @@ pub enum Error { }, /// Error when a configuration key is invalid for the store used - #[snafu(display("Configuration key: '{}' is not valid for store '{}'.", key, store))] + #[error("Configuration key: '{}' is not valid for store '{}'.", key, store)] UnknownConfigurationKey { /// The object store used store: &'static str, diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 78fce9c26224..b193481ae7b8 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -30,7 +30,6 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use futures::{FutureExt, TryStreamExt}; use parking_lot::Mutex; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; use url::Url; use walkdir::{DirEntry, WalkDir}; @@ -43,117 +42,80 @@ use crate::{ }; /// A specialized `Error` for filesystem object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("File size for {} did not fit in a usize: {}", path, source))] + #[error("File size for {} did not fit in a usize: {}", path, source)] FileSizeOverflowedUsize { source: std::num::TryFromIntError, path: String, }, - #[snafu(display("Unable to walk dir: {}", source))] - UnableToWalkDir { - source: walkdir::Error, - }, 
+ #[error("Unable to walk dir: {}", source)] + UnableToWalkDir { source: walkdir::Error }, - #[snafu(display("Unable to access metadata for {}: {}", path, source))] + #[error("Unable to access metadata for {}: {}", path, source)] Metadata { source: Box, path: String, }, - #[snafu(display("Unable to copy data to file: {}", source))] - UnableToCopyDataToFile { - source: io::Error, - }, + #[error("Unable to copy data to file: {}", source)] + UnableToCopyDataToFile { source: io::Error }, - #[snafu(display("Unable to rename file: {}", source))] - UnableToRenameFile { - source: io::Error, - }, + #[error("Unable to rename file: {}", source)] + UnableToRenameFile { source: io::Error }, - #[snafu(display("Unable to create dir {}: {}", path.display(), source))] - UnableToCreateDir { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to create dir {}: {}", path.display(), source)] + UnableToCreateDir { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to create file {}: {}", path.display(), source))] - UnableToCreateFile { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to create file {}: {}", path.display(), source)] + UnableToCreateFile { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to delete file {}: {}", path.display(), source))] - UnableToDeleteFile { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to delete file {}: {}", path.display(), source)] + UnableToDeleteFile { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to open file {}: {}", path.display(), source))] - UnableToOpenFile { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to open file {}: {}", path.display(), source)] + UnableToOpenFile { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to read data from file {}: {}", path.display(), source))] - UnableToReadBytes { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to read data from file {}: {}", path.display(), source)] + UnableToReadBytes { source: io::Error, path: PathBuf }, - #[snafu(display("Out of range of file {}, expected: {}, actual: {}", path.display(), expected, actual))] + #[error("Out of range of file {}, expected: {}, actual: {}", path.display(), expected, actual)] OutOfRange { path: PathBuf, expected: usize, actual: usize, }, - #[snafu(display("Requested range was invalid"))] - InvalidRange { - source: InvalidGetRange, - }, + #[error("Requested range was invalid")] + InvalidRange { source: InvalidGetRange }, - #[snafu(display("Unable to copy file from {} to {}: {}", from.display(), to.display(), source))] + #[error("Unable to copy file from {} to {}: {}", from.display(), to.display(), source)] UnableToCopyFile { from: PathBuf, to: PathBuf, source: io::Error, }, - NotFound { - path: PathBuf, - source: io::Error, - }, + #[error("NotFound")] + NotFound { path: PathBuf, source: io::Error }, - #[snafu(display("Error seeking file {}: {}", path.display(), source))] - Seek { - source: io::Error, - path: PathBuf, - }, + #[error("Error seeking file {}: {}", path.display(), source)] + Seek { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] - InvalidUrl { - url: Url, - }, + #[error("Unable to convert URL \"{}\" to filesystem path", url)] + InvalidUrl { url: Url }, - AlreadyExists { - path: String, - source: io::Error, - }, + #[error("AlreadyExists")] + AlreadyExists { path: String, source: io::Error }, - #[snafu(display("Unable to canonicalize filesystem root: {}", path.display()))] - UnableToCanonicalize { - 
path: PathBuf, - source: io::Error, - }, + #[error("Unable to canonicalize filesystem root: {}", path.display())] + UnableToCanonicalize { path: PathBuf, source: io::Error }, - #[snafu(display("Filenames containing trailing '/#\\d+/' are not supported: {}", path))] - InvalidPath { - path: String, - }, + #[error("Filenames containing trailing '/#\\d+/' are not supported: {}", path)] + InvalidPath { path: String }, - #[snafu(display("Upload aborted"))] + #[error("Upload aborted")] Aborted, } @@ -276,8 +238,9 @@ impl LocalFileSystem { /// Returns an error if the path does not exist /// pub fn new_with_prefix(prefix: impl AsRef) -> Result { - let path = std::fs::canonicalize(&prefix).context(UnableToCanonicalizeSnafu { - path: prefix.as_ref(), + let path = std::fs::canonicalize(&prefix).map_err(|source| { + let path = prefix.as_ref().into(); + Error::UnableToCanonicalize { source, path } })?; Ok(Self { @@ -290,12 +253,12 @@ impl LocalFileSystem { /// Return an absolute filesystem path of the given file location pub fn path_to_filesystem(&self, location: &Path) -> Result { - ensure!( - is_valid_file_path(location), - InvalidPathSnafu { - path: location.as_ref() - } - ); + if !is_valid_file_path(location) { + let path = location.as_ref().into(); + let error = Error::InvalidPath { path }; + return Err(error.into()); + } + let path = self.config.prefix_to_filesystem(location)?; #[cfg(target_os = "windows")] @@ -451,7 +414,9 @@ impl ObjectStore for LocalFileSystem { options.check_preconditions(&meta)?; let range = match options.range { - Some(r) => r.as_range(meta.size).context(InvalidRangeSnafu)?, + Some(r) => r + .as_range(meta.size) + .map_err(|source| Error::InvalidRange { source })?, None => 0..meta.size, }; @@ -721,12 +686,15 @@ impl ObjectStore for LocalFileSystem { /// Creates the parent directories of `path` or returns an error based on `source` if no parent fn create_parent_dirs(path: &std::path::Path, source: io::Error) -> Result<()> { - let parent = path.parent().ok_or_else(|| Error::UnableToCreateFile { - path: path.to_path_buf(), - source, + let parent = path.parent().ok_or_else(|| { + let path = path.to_path_buf(); + Error::UnableToCreateFile { path, source } })?; - std::fs::create_dir_all(parent).context(UnableToCreateDirSnafu { path: parent })?; + std::fs::create_dir_all(parent).map_err(|source| { + let path = parent.into(); + Error::UnableToCreateDir { source, path } + })?; Ok(()) } @@ -796,12 +764,14 @@ impl MultipartUpload for LocalUpload { let s = Arc::clone(&self.state); maybe_spawn_blocking(move || { let mut file = s.file.lock(); - file.seek(SeekFrom::Start(offset)) - .context(SeekSnafu { path: &s.dest })?; + file.seek(SeekFrom::Start(offset)).map_err(|source| { + let path = s.dest.clone(); + Error::Seek { source, path } + })?; data.iter() .try_for_each(|x| file.write_all(x)) - .context(UnableToCopyDataToFileSnafu)?; + .map_err(|source| Error::UnableToCopyDataToFile { source })?; Ok(()) }) @@ -809,12 +779,13 @@ impl MultipartUpload for LocalUpload { } async fn complete(&mut self) -> Result { - let src = self.src.take().context(AbortedSnafu)?; + let src = self.src.take().ok_or(Error::Aborted)?; let s = Arc::clone(&self.state); maybe_spawn_blocking(move || { // Ensure no inflight writes let file = s.file.lock(); - std::fs::rename(&src, &s.dest).context(UnableToRenameFileSnafu)?; + std::fs::rename(&src, &s.dest) + .map_err(|source| Error::UnableToRenameFile { source })?; let metadata = file.metadata().map_err(|e| Error::Metadata { source: e.into(), path: 
src.to_string_lossy().to_string(), @@ -829,9 +800,10 @@ impl MultipartUpload for LocalUpload { } async fn abort(&mut self) -> Result<()> { - let src = self.src.take().context(AbortedSnafu)?; + let src = self.src.take().ok_or(Error::Aborted)?; maybe_spawn_blocking(move || { - std::fs::remove_file(&src).context(UnableToDeleteFileSnafu { path: &src })?; + std::fs::remove_file(&src) + .map_err(|source| Error::UnableToDeleteFile { source, path: src })?; Ok(()) }) .await @@ -898,22 +870,30 @@ pub(crate) fn chunked_stream( pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { let to_read = range.end - range.start; file.seek(SeekFrom::Start(range.start as u64)) - .context(SeekSnafu { path })?; + .map_err(|source| { + let path = path.into(); + Error::Seek { source, path } + })?; let mut buf = Vec::with_capacity(to_read); let read = file .take(to_read as u64) .read_to_end(&mut buf) - .context(UnableToReadBytesSnafu { path })?; + .map_err(|source| { + let path = path.into(); + Error::UnableToReadBytes { source, path } + })?; - ensure!( - read == to_read, - OutOfRangeSnafu { - path, + if read != to_read { + let error = Error::OutOfRange { + path: path.into(), expected: to_read, - actual: read - } - ); + actual: read, + }; + + return Err(error.into()); + } + Ok(buf.into()) } @@ -982,8 +962,9 @@ fn get_etag(metadata: &Metadata) -> String { fn convert_metadata(metadata: Metadata, location: Path) -> Result { let last_modified = last_modified(&metadata); - let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { - path: location.as_ref(), + let size = usize::try_from(metadata.len()).map_err(|source| { + let path = location.as_ref().into(); + Error::FileSizeOverflowedUsize { source, path } })?; Ok(ObjectMeta { diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index a467e3b88a26..3f3cff3390db 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -25,7 +25,6 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use parking_lot::RwLock; -use snafu::{OptionExt, ResultExt, Snafu}; use crate::multipart::{MultipartStore, PartId}; use crate::util::InvalidGetRange; @@ -37,24 +36,24 @@ use crate::{ use crate::{GetOptions, PutPayload}; /// A specialized `Error` for in-memory object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("No data in memory found. Location: {path}"))] + #[error("No data in memory found. 
Location: {path}")] NoDataInMemory { path: String }, - #[snafu(display("Invalid range: {source}"))] + #[error("Invalid range: {source}")] Range { source: InvalidGetRange }, - #[snafu(display("Object already exists at that location: {path}"))] + #[error("Object already exists at that location: {path}")] AlreadyExists { path: String }, - #[snafu(display("ETag required for conditional update"))] + #[error("ETag required for conditional update")] MissingETag, - #[snafu(display("MultipartUpload not found: {id}"))] + #[error("MultipartUpload not found: {id}")] UploadNotFound { id: String }, - #[snafu(display("Missing part at index: {part}"))] + #[error("Missing part at index: {part}")] MissingPart { part: usize }, } @@ -158,7 +157,7 @@ impl Storage { }), Some(e) => { let existing = e.e_tag.to_string(); - let expected = v.e_tag.context(MissingETagSnafu)?; + let expected = v.e_tag.ok_or(Error::MissingETag)?; if existing == expected { *e = entry; Ok(()) @@ -177,7 +176,7 @@ impl Storage { .parse() .ok() .and_then(|x| self.uploads.get_mut(&x)) - .context(UploadNotFoundSnafu { id })?; + .ok_or_else(|| Error::UploadNotFound { id: id.into() })?; Ok(parts) } @@ -186,7 +185,7 @@ impl Storage { .parse() .ok() .and_then(|x| self.uploads.remove(&x)) - .context(UploadNotFoundSnafu { id })?; + .ok_or_else(|| Error::UploadNotFound { id: id.into() })?; Ok(parts) } } @@ -250,7 +249,9 @@ impl ObjectStore for InMemory { let (range, data) = match options.range { Some(range) => { - let r = range.as_range(entry.data.len()).context(RangeSnafu)?; + let r = range + .as_range(entry.data.len()) + .map_err(|source| Error::Range { source })?; (r.clone(), entry.data.slice(r)) } None => (0..entry.data.len(), entry.data), @@ -272,7 +273,7 @@ impl ObjectStore for InMemory { .map(|range| { let r = GetRange::Bounded(range.clone()) .as_range(entry.data.len()) - .context(RangeSnafu)?; + .map_err(|source| Error::Range { source })?; Ok(entry.data.slice(r)) }) @@ -435,7 +436,7 @@ impl MultipartStore for InMemory { let mut cap = 0; for (part, x) in upload.parts.iter().enumerate() { - cap += x.as_ref().context(MissingPartSnafu { part })?.len(); + cap += x.as_ref().ok_or(Error::MissingPart { part })?.len(); } let mut buf = Vec::with_capacity(cap); for x in &upload.parts { @@ -474,7 +475,7 @@ impl InMemory { .map .get(location) .cloned() - .context(NoDataInMemorySnafu { + .ok_or_else(|| Error::NoDataInMemory { path: location.to_string(), })?; diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs index a3919305281d..bc65a0b8d1c8 100644 --- a/object_store/src/parse.rs +++ b/object_store/src/parse.rs @@ -20,16 +20,18 @@ use crate::local::LocalFileSystem; use crate::memory::InMemory; use crate::path::Path; use crate::ObjectStore; -use snafu::Snafu; use url::Url; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Unable to recognise URL \"{}\"", url))] + #[error("Unable to recognise URL \"{}\"", url)] Unrecognised { url: Url }, - #[snafu(context(false))] - Path { source: crate::path::Error }, + #[error(transparent)] + Path { + #[from] + source: crate::path::Error, + }, } impl From for super::Error { diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index 4c9bb5f05186..f8affe8dfbb9 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -19,7 +19,6 @@ use itertools::Itertools; use percent_encoding::percent_decode; -use snafu::{ensure, ResultExt, Snafu}; use std::fmt::Formatter; #[cfg(not(target_arch = "wasm32"))] use url::Url; @@ -35,18 
+34,18 @@ mod parts; pub use parts::{InvalidPart, PathPart}; /// Error returned by [`Path::parse`] -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] #[non_exhaustive] pub enum Error { /// Error when there's an empty segment between two slashes `/` in the path - #[snafu(display("Path \"{}\" contained empty path segment", path))] + #[error("Path \"{}\" contained empty path segment", path)] EmptySegment { /// The source path path: String, }, /// Error when an invalid segment is encountered in the given path - #[snafu(display("Error parsing Path \"{}\": {}", path, source))] + #[error("Error parsing Path \"{}\": {}", path, source)] BadSegment { /// The source path path: String, @@ -55,7 +54,7 @@ pub enum Error { }, /// Error when path cannot be canonicalized - #[snafu(display("Failed to canonicalize path \"{}\": {}", path.display(), source))] + #[error("Failed to canonicalize path \"{}\": {}", path.display(), source)] Canonicalize { /// The source path path: std::path::PathBuf, @@ -64,14 +63,14 @@ pub enum Error { }, /// Error when the path is not a valid URL - #[snafu(display("Unable to convert path \"{}\" to URL", path.display()))] + #[error("Unable to convert path \"{}\" to URL", path.display())] InvalidPath { /// The source path path: std::path::PathBuf, }, /// Error when a path contains non-unicode characters - #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] + #[error("Path \"{}\" contained non-unicode characters: {}", path, source)] NonUnicode { /// The source path path: String, @@ -80,7 +79,7 @@ pub enum Error { }, /// Error when the a path doesn't start with given prefix - #[snafu(display("Path {} does not start with prefix {}", path, prefix))] + #[error("Path {} does not start with prefix {}", path, prefix)] PrefixMismatch { /// The source path path: String, @@ -173,8 +172,14 @@ impl Path { let stripped = stripped.strip_suffix(DELIMITER).unwrap_or(stripped); for segment in stripped.split(DELIMITER) { - ensure!(!segment.is_empty(), EmptySegmentSnafu { path }); - PathPart::parse(segment).context(BadSegmentSnafu { path })?; + if segment.is_empty() { + return Err(Error::EmptySegment { path: path.into() }); + } + + PathPart::parse(segment).map_err(|source| { + let path = path.into(); + Error::BadSegment { source, path } + })?; } Ok(Self { @@ -190,8 +195,9 @@ impl Path { /// /// Note: this will canonicalize the provided path, resolving any symlinks pub fn from_filesystem_path(path: impl AsRef) -> Result { - let absolute = std::fs::canonicalize(&path).context(CanonicalizeSnafu { - path: path.as_ref(), + let absolute = std::fs::canonicalize(&path).map_err(|source| { + let path = path.as_ref().into(); + Error::Canonicalize { source, path } })?; Self::from_absolute_path(absolute) @@ -241,7 +247,10 @@ impl Path { let path = path.as_ref(); let decoded = percent_decode(path.as_bytes()) .decode_utf8() - .context(NonUnicodeSnafu { path })?; + .map_err(|source| { + let path = path.into(); + Error::NonUnicode { source, path } + })?; Self::parse(decoded) } diff --git a/object_store/src/path/parts.rs b/object_store/src/path/parts.rs index de2e1a75c955..9c6612bf9331 100644 --- a/object_store/src/path/parts.rs +++ b/object_store/src/path/parts.rs @@ -19,15 +19,14 @@ use percent_encoding::{percent_encode, AsciiSet, CONTROLS}; use std::borrow::Cow; use crate::path::DELIMITER_BYTE; -use snafu::Snafu; /// Error returned by [`PathPart::parse`] -#[derive(Debug, Snafu)] -#[snafu(display( +#[derive(Debug, thiserror::Error)] +#[error( "Encountered illegal character 
sequence \"{}\" whilst parsing path segment \"{}\"", illegal, segment -))] +)] #[allow(missing_copy_implementations)] pub struct InvalidPart { segment: String, diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 99102a99e61e..6d638f3cb2b8 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -24,7 +24,6 @@ use std::{ use super::Result; use bytes::Bytes; use futures::{stream::StreamExt, Stream, TryStreamExt}; -use snafu::Snafu; #[cfg(any(feature = "azure", feature = "http"))] pub(crate) static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; @@ -204,14 +203,12 @@ pub enum GetRange { Suffix(usize), } -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum InvalidGetRange { - #[snafu(display( - "Wanted range starting at {requested}, but object was only {length} bytes long" - ))] + #[error("Wanted range starting at {requested}, but object was only {length} bytes long")] StartTooLarge { requested: usize, length: usize }, - #[snafu(display("Range started at {start} and ended at {end}"))] + #[error("Range started at {start} and ended at {end}")] Inconsistent { start: usize, end: usize }, } From f2d9aadb14c6e5eef9f3bca8c7537de9751b0bab Mon Sep 17 00:00:00 2001 From: Kikkon <19528375+Kikkon@users.noreply.github.com> Date: Fri, 3 Jan 2025 19:44:34 +0800 Subject: [PATCH 011/503] feat: add GenericListViewBuilder (#6552) * feat: add GenericListViewBuilder * remove uszie * fix tests * remove static * lint * chore: add comment for should fail test * Update arrow-array/src/builder/generic_list_view_builder.rs Co-authored-by: Marco Neumann * Update arrow-array/src/builder/generic_list_view_builder.rs Co-authored-by: Marco Neumann * fix name & lint --------- Co-authored-by: Marco Neumann --- .../src/builder/generic_list_view_builder.rs | 707 ++++++++++++++++++ arrow-array/src/builder/mod.rs | 8 + arrow-array/src/builder/struct_builder.rs | 10 + arrow-array/src/cast.rs | 16 + 4 files changed, 741 insertions(+) create mode 100644 arrow-array/src/builder/generic_list_view_builder.rs diff --git a/arrow-array/src/builder/generic_list_view_builder.rs b/arrow-array/src/builder/generic_list_view_builder.rs new file mode 100644 index 000000000000..5aaf9efefe24 --- /dev/null +++ b/arrow-array/src/builder/generic_list_view_builder.rs @@ -0,0 +1,707 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::builder::ArrayBuilder; +use crate::{ArrayRef, GenericListViewArray, OffsetSizeTrait}; +use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; +use arrow_schema::{Field, FieldRef}; +use std::any::Any; +use std::sync::Arc; + +/// Builder for [`GenericListViewArray`] +#[derive(Debug)] +pub struct GenericListViewBuilder { + offsets_builder: BufferBuilder, + sizes_builder: BufferBuilder, + null_buffer_builder: NullBufferBuilder, + values_builder: T, + field: Option, + current_offset: OffsetSize, +} + +impl Default for GenericListViewBuilder { + fn default() -> Self { + Self::new(T::default()) + } +} + +impl ArrayBuilder + for GenericListViewBuilder +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl GenericListViewBuilder { + /// Creates a new [`GenericListViewBuilder`] from a given values array builder + pub fn new(values_builder: T) -> Self { + let capacity = values_builder.len(); + Self::with_capacity(values_builder, capacity) + } + + /// Creates a new [`GenericListViewBuilder`] from a given values array builder + /// `capacity` is the number of items to pre-allocate space for in this builder + pub fn with_capacity(values_builder: T, capacity: usize) -> Self { + let offsets_builder = BufferBuilder::::new(capacity); + let sizes_builder = BufferBuilder::::new(capacity); + Self { + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, + sizes_builder, + field: None, + current_offset: OffsetSize::zero(), + } + } + + /// + /// By default a nullable field is created with the name `item` + /// + /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the + /// field's data type does not match that of `T` + pub fn with_field(self, field: impl Into) -> Self { + Self { + field: Some(field.into()), + ..self + } + } +} + +impl GenericListViewBuilder +where + T: 'static, +{ + /// Returns the child array builder as a mutable reference. + /// + /// This mutable reference can be used to append values into the child array builder, + /// but you must call [`append`](#method.append) to delimit each distinct list value. 
+ pub fn values(&mut self) -> &mut T { + &mut self.values_builder + } + + /// Returns the child array builder as an immutable reference + pub fn values_ref(&self) -> &T { + &self.values_builder + } + + /// Finish the current variable-length list array slot + /// + /// # Panics + /// + /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` + #[inline] + pub fn append(&mut self, is_valid: bool) { + self.offsets_builder.append(self.current_offset); + self.sizes_builder.append( + OffsetSize::from_usize( + self.values_builder.len() - self.current_offset.to_usize().unwrap(), + ) + .unwrap(), + ); + self.null_buffer_builder.append(is_valid); + self.current_offset = OffsetSize::from_usize(self.values_builder.len()).unwrap(); + } + + /// Append value into this [`GenericListViewBuilder`] + #[inline] + pub fn append_value(&mut self, i: I) + where + T: Extend>, + I: IntoIterator>, + { + self.extend(std::iter::once(Some(i))) + } + + /// Append a null to this [`GenericListViewBuilder`] + /// + /// See [`Self::append_value`] for an example use. + #[inline] + pub fn append_null(&mut self) { + self.offsets_builder.append(self.current_offset); + self.sizes_builder + .append(OffsetSize::from_usize(0).unwrap()); + self.null_buffer_builder.append_null(); + } + + /// Appends an optional value into this [`GenericListViewBuilder`] + /// + /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`] + #[inline] + pub fn append_option(&mut self, i: Option) + where + T: Extend>, + I: IntoIterator>, + { + match i { + Some(i) => self.append_value(i), + None => self.append_null(), + } + } + + /// Builds the [`GenericListViewArray`] and reset this builder. + pub fn finish(&mut self) -> GenericListViewArray { + let values = self.values_builder.finish(); + let nulls = self.null_buffer_builder.finish(); + let offsets = self.offsets_builder.finish(); + self.current_offset = OffsetSize::zero(); + + // Safety: Safe by construction + let offsets = ScalarBuffer::from(offsets); + let sizes = self.sizes_builder.finish(); + let sizes = ScalarBuffer::from(sizes); + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + GenericListViewArray::new(field, offsets, sizes, values, nulls) + } + + /// Builds the [`GenericListViewArray`] without resetting the builder. 
+ pub fn finish_cloned(&self) -> GenericListViewArray { + let values = self.values_builder.finish_cloned(); + let nulls = self.null_buffer_builder.finish_cloned(); + + let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + // Safety: safe by construction + let offsets = ScalarBuffer::from(offsets); + + let sizes = Buffer::from_slice_ref(self.sizes_builder.as_slice()); + let sizes = ScalarBuffer::from(sizes); + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + + GenericListViewArray::new(field, offsets, sizes, values, nulls) + } + + /// Returns the current offsets buffer as a slice + pub fn offsets_slice(&self) -> &[OffsetSize] { + self.offsets_builder.as_slice() + } +} + +impl Extend> for GenericListViewBuilder +where + O: OffsetSizeTrait, + B: ArrayBuilder + Extend, + V: IntoIterator, +{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + match v { + Some(elements) => { + self.values_builder.extend(elements); + self.append(true); + } + None => self.append(false), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::builder::{make_builder, Int32Builder, ListViewBuilder}; + use crate::cast::AsArray; + use crate::types::Int32Type; + use crate::{Array, Int32Array}; + use arrow_schema::DataType; + + fn test_generic_list_view_array_builder_impl() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListViewBuilder::::new(values_builder); + + // [[0, 1, 2], [3, 4, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true); + builder.values().append_value(3); + builder.values().append_value(4); + builder.values().append_value(5); + builder.append(true); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true); + let list_array = builder.finish(); + + let list_values = list_array.values().as_primitive::(); + assert_eq!(list_values.values(), &[0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(list_array.value_offsets(), [0, 3, 6].map(O::usize_as)); + assert_eq!(list_array.value_sizes(), [3, 3, 2].map(O::usize_as)); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(O::from_usize(6).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(2).unwrap(), list_array.value_sizes()[2]); + for i in 0..2 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_list_view_array_builder() { + test_generic_list_view_array_builder_impl::() + } + + #[test] + fn test_large_list_view_array_builder() { + test_generic_list_view_array_builder_impl::() + } + + fn test_generic_list_view_array_builder_nulls_impl() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListViewBuilder::::new(values_builder); + + // [[0, 1, 2], null, [3, null, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true); + builder.append(false); + builder.values().append_value(3); + builder.values().append_null(); + builder.values().append_value(5); + builder.append(true); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true); + + let list_array = builder.finish(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(4, list_array.len()); + 
assert_eq!(1, list_array.null_count()); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_sizes()[2]); + } + + #[test] + fn test_list_view_array_builder_nulls() { + test_generic_list_view_array_builder_nulls_impl::() + } + + #[test] + fn test_large_list_view_array_builder_nulls() { + test_generic_list_view_array_builder_nulls_impl::() + } + + #[test] + fn test_list_view_array_builder_finish() { + let values_builder = Int32Array::builder(5); + let mut builder = ListViewBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert!(builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_view_array_builder_finish_cloned() { + let values_builder = Int32Array::builder(5); + let mut builder = ListViewBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + + let mut arr = builder.finish_cloned(); + assert_eq!(2, arr.len()); + assert!(!builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true); + arr = builder.finish(); + assert_eq!(3, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_view_list_view_array_builder() { + let primitive_builder = Int32Builder::with_capacity(10); + let values_builder = ListViewBuilder::new(primitive_builder); + let mut builder = ListViewBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true); + builder.values().values().append_value(3); + builder.values().values().append_value(4); + builder.values().append(true); + builder.append(true); + + builder.values().values().append_value(5); + builder.values().values().append_value(6); + builder.values().values().append_value(7); + builder.values().append(true); + builder.values().append(false); + builder.values().values().append_value(8); + builder.values().append(true); + builder.append(true); + + builder.append(false); + + builder.values().values().append_value(9); + builder.values().values().append_value(10); + builder.values().append(true); + builder.append(true); + + let l1 = builder.finish(); + + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5]); + assert_eq!(l1.value_sizes(), &[2, 3, 0, 1]); + + let l2 = l1.values().as_list_view::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8]); + assert_eq!(l2.value_sizes(), &[2, 2, 3, 0, 1, 2]); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_extend() { + let mut builder = ListViewBuilder::new(Int32Builder::new()); + builder.extend([ + Some(vec![Some(1), Some(2), Some(7), None]), + Some(vec![]), + Some(vec![Some(4), Some(5)]), + None, + ]); + + let array = builder.finish(); + assert_eq!(array.value_offsets(), [0, 4, 4, 6]); + assert_eq!(array.value_sizes(), [4, 0, 2, 0]); + 
assert_eq!(array.null_count(), 1); + assert!(array.is_null(3)); + let elements = array.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]); + assert_eq!(elements.null_count(), 1); + assert!(elements.is_null(3)); + } + + #[test] + fn test_boxed_primitive_array_builder() { + let values_builder = make_builder(&DataType::Int32, 5); + let mut builder = ListViewBuilder::new(values_builder); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[1, 2, 3]); + builder.append(true); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[4, 5, 6]); + builder.append(true); + + let arr = builder.finish(); + assert_eq!(2, arr.len()); + + let elements = arr.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 3, 4, 5, 6]); + } + + #[test] + fn test_boxed_list_view_list_view_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::ListView(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); + } + + #[test] + fn test_boxed_large_list_view_large_list_view_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); + } + + fn test_boxed_generic_list_view_generic_list_view_array_builder( + values_builder: Box, + ) where + O: OffsetSizeTrait + PartialEq, + { + let mut builder: GenericListViewBuilder> = + GenericListViewBuilder::>::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(1); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(3); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(4); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true); + builder.append(true); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(5); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(6); + builder + .values() + .as_any_mut() + 
.downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an (Large)ListViewBuilder") + .append_value(7); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(false); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(8); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true); + builder.append(true); + + builder.append(false); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(9); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(10); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true); + builder.append(true); + + let l1 = builder.finish(); + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5].map(O::usize_as)); + assert_eq!(l1.value_sizes(), &[2, 3, 0, 1].map(O::usize_as)); + + let l2 = l1.values().as_list_view::(); + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8].map(O::usize_as)); + assert_eq!(l2.value_sizes(), &[2, 2, 3, 0, 1, 2].map(O::usize_as)); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_with_field() { + let field = Arc::new(Field::new("bar", DataType::Int32, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), Some(2), Some(3)]); + builder.append_null(); // This is fine as nullability refers to nullability of values + builder.append_value([Some(4)]); + let array = builder.finish(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::ListView(field.clone())); + + builder.append_value([Some(4), Some(5)]); + let array = builder.finish(); + assert_eq!(array.data_type(), &DataType::ListView(field)); + assert_eq!(array.len(), 1); + } + + #[test] + #[should_panic( + expected = r#"Non-nullable field of ListViewArray \"item\" cannot contain nulls"# + )] + // If a non-nullable type is declared but a null value is used, it will be intercepted by the null check. + fn test_checks_nullability() { + let field = Arc::new(Field::new("item", DataType::Int32, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), None]); + builder.finish(); + } + + #[test] + #[should_panic(expected = "ListViewArray expected data type Int64 got Int32")] + // If the declared type does not match the actual appended type, it will be intercepted by type checking in the finish function. 
+ fn test_checks_data_type() { + let field = Arc::new(Field::new("item", DataType::Int64, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1)]); + builder.finish(); + } +} diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 89a96280eb87..982e8788b90d 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -180,6 +180,8 @@ mod generic_byte_run_builder; pub use generic_byte_run_builder::*; mod generic_bytes_view_builder; pub use generic_bytes_view_builder::*; +mod generic_list_view_builder; +pub use generic_list_view_builder::*; mod union_builder; pub use union_builder::*; @@ -304,6 +306,12 @@ pub type ListBuilder = GenericListBuilder; /// Builder for [`LargeListArray`](crate::array::LargeListArray) pub type LargeListBuilder = GenericListBuilder; +/// Builder for [`ListViewArray`](crate::array::ListViewArray) +pub type ListViewBuilder = GenericListViewBuilder; + +/// Builder for [`LargeListViewArray`](crate::array::LargeListViewArray) +pub type LargeListViewBuilder = GenericListViewBuilder; + /// Builder for [`BinaryArray`](crate::array::BinaryArray) /// /// See examples on [`GenericBinaryBuilder`] diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 2b288445c74b..4a40c2201746 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -270,6 +270,16 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { + let builder = make_builder(field.data_type(), capacity); + Box::new(ListViewBuilder::with_capacity(builder, capacity).with_field(field.clone())) + } + DataType::LargeListView(field) => { + let builder = make_builder(field.data_type(), capacity); + Box::new( + LargeListViewBuilder::with_capacity(builder, capacity).with_field(field.clone()), + ) + } DataType::Map(field, _) => match field.data_type() { DataType::Struct(fields) => { let map_field_names = MapFieldNames { diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index fc657f94c6a6..d871431593b6 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -832,6 +832,14 @@ pub trait AsArray: private::Sealed { self.as_list_opt().expect("list array") } + /// Downcast this to a [`GenericListViewArray`] returning `None` if not possible + fn as_list_view_opt(&self) -> Option<&GenericListViewArray>; + + /// Downcast this to a [`GenericListViewArray`] panicking if not possible + fn as_list_view(&self) -> &GenericListViewArray { + self.as_list_view_opt().expect("list view array") + } + /// Downcast this to a [`FixedSizeBinaryArray`] returning `None` if not possible fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray>; @@ -905,6 +913,10 @@ impl AsArray for dyn Array + '_ { self.as_any().downcast_ref() } + fn as_list_view_opt(&self) -> Option<&GenericListViewArray> { + self.as_any().downcast_ref() + } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { self.as_any().downcast_ref() } @@ -960,6 +972,10 @@ impl AsArray for ArrayRef { self.as_ref().as_list_opt() } + fn as_list_view_opt(&self) -> Option<&GenericListViewArray> { + self.as_ref().as_list_view_opt() + } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { self.as_ref().as_fixed_size_binary_opt() } From b77d38d022079b106449ead3996f373edc906327 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 Jan 2025 
13:29:36 +0100 Subject: [PATCH 012/503] Update itertools requirement from 0.13.0 to 0.14.0 in /object_store (#6925) Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version. - [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-itertools/itertools/compare/v0.13.0...v0.14.0) --- updated-dependencies: - dependency-name: itertools dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- object_store/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 6f5e9db1bc70..992ae6662cdb 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -35,7 +35,7 @@ bytes = "1.0" chrono = { version = "0.4.34", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" -itertools = "0.13.0" +itertools = "0.14.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" thiserror = "2.0.2" From 1f639f6e7670698627454927024643641d7c2bd0 Mon Sep 17 00:00:00 2001 From: wiedld Date: Sat, 4 Jan 2025 05:21:59 -0500 Subject: [PATCH 013/503] Document how to use Extend for generic methods on ArrayBuilders (#6932) * chore: add docs for how to use Extend for generic methods on ArrayBuilders * chore: move to mod docs and add more examples --- arrow-array/src/builder/mod.rs | 69 ++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 982e8788b90d..29d75024ea72 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -78,6 +78,73 @@ //! )) //! ``` //! +//! # Using the [`Extend`] trait to append values from an iterable: +//! +//! ``` +//! # use arrow_array::{Array}; +//! # use arrow_array::builder::{ArrayBuilder, StringBuilder}; +//! +//! let mut builder = StringBuilder::new(); +//! builder.extend(vec![Some("🍐"), Some("🍎"), None]); +//! assert_eq!(builder.finish().len(), 3); +//! ``` +//! +//! # Using the [`Extend`] trait to write generic functions: +//! +//! ``` +//! # use arrow_array::{Array, ArrayRef, StringArray}; +//! # use arrow_array::builder::{ArrayBuilder, Int32Builder, ListBuilder, StringBuilder}; +//! +//! // For generic methods that fill a list of values for an [`ArrayBuilder`], use the [`Extend`] trait. +//! fn filter_and_fill>(builder: &mut impl Extend, values: I, filter: V) +//! where V: PartialEq +//! { +//! builder.extend(values.into_iter().filter(|v| *v == filter)); +//! } +//! let mut string_builder = StringBuilder::new(); +//! filter_and_fill( +//! &mut string_builder, +//! vec![Some("🍐"), Some("🍎"), None], +//! Some("🍎"), +//! ); +//! assert_eq!(string_builder.finish().len(), 1); +//! +//! let mut int_builder = Int32Builder::new(); +//! filter_and_fill( +//! &mut int_builder, +//! vec![Some(11), Some(42), None], +//! Some(42), +//! ); +//! assert_eq!(int_builder.finish().len(), 1); +//! +//! // For generic methods that fill lists-of-lists for an [`ArrayBuilder`], use the [`Extend`] trait. +//! fn filter_and_fill_if_contains>>( +//! list_builder: &mut impl Extend>, +//! values: I, +//! filter: Option, +//! ) where +//! T: PartialEq, +//! for<'a> &'a V: IntoIterator>, +//! { +//! list_builder.extend(values.into_iter().filter(|string: &Option| { +//! string +//! .as_ref() +//! .map(|str: &V| str.into_iter().any(|ch: &Option| ch == &filter)) +//! 
.unwrap_or(false) +//! })); +//! } +//! let builder = StringBuilder::new(); +//! let mut list_builder = ListBuilder::new(builder); +//! let pear_pear = vec![Some("🍐"),Some("🍐")]; +//! let pear_app = vec![Some("🍐"),Some("🍎")]; +//! filter_and_fill_if_contains( +//! &mut list_builder, +//! vec![Some(pear_pear), Some(pear_app), None], +//! Some("🍎"), +//! ); +//! assert_eq!(list_builder.finish().len(), 1); +//! ``` +//! //! # Custom Builders //! //! It is common to have a collection of statically defined Rust types that @@ -134,6 +201,8 @@ //! } //! } //! +//! /// For building arrays in generic code, use Extend instead of the append_* methods +//! /// e.g. append_value, append_option, append_null //! impl<'a> Extend<&'a MyRow> for MyRowBuilder { //! fn extend>(&mut self, iter: T) { //! iter.into_iter().for_each(|row| self.append(row)); From debc5bf85729f7ebeb3789c6094a784abca95f73 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Sat, 4 Jan 2025 05:23:12 -0500 Subject: [PATCH 014/503] [Parquet] Add projection utility functions (#6931) * projection utilities * improve docs --------- Co-authored-by: Andrew Lamb --- parquet/src/arrow/mod.rs | 101 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 1305bbac83f0..35f5897c18f8 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -281,6 +281,45 @@ impl ProjectionMask { pub fn leaf_included(&self, leaf_idx: usize) -> bool { self.mask.as_ref().map(|m| m[leaf_idx]).unwrap_or(true) } + + /// Union two projection masks + /// + /// Example: + /// ```text + /// mask1 = [true, false, true] + /// mask2 = [false, true, true] + /// union(mask1, mask2) = [true, true, true] + /// ``` + pub fn union(&mut self, other: &Self) { + match (self.mask.as_ref(), other.mask.as_ref()) { + (None, _) | (_, None) => self.mask = None, + (Some(a), Some(b)) => { + debug_assert_eq!(a.len(), b.len()); + let mask = a.iter().zip(b.iter()).map(|(&a, &b)| a || b).collect(); + self.mask = Some(mask); + } + } + } + + /// Intersect two projection masks + /// + /// Example: + /// ```text + /// mask1 = [true, false, true] + /// mask2 = [false, true, true] + /// intersect(mask1, mask2) = [false, false, true] + /// ``` + pub fn intersect(&mut self, other: &Self) { + match (self.mask.as_ref(), other.mask.as_ref()) { + (None, _) => self.mask = other.mask.clone(), + (_, None) => {} + (Some(a), Some(b)) => { + debug_assert_eq!(a.len(), b.len()); + let mask = a.iter().zip(b.iter()).map(|(&a, &b)| a && b).collect(); + self.mask = Some(mask); + } + } + } } /// Lookups up the parquet column by name @@ -551,4 +590,66 @@ mod test { let mask = ProjectionMask::columns(&schema, ["a", "e"]); assert_eq!(mask.mask.unwrap(), [true, false, true, false, true]); } + + #[test] + fn test_projection_mask_union() { + let mut mask1 = ProjectionMask { + mask: Some(vec![true, false, true]), + }; + let mask2 = ProjectionMask { + mask: Some(vec![false, true, true]), + }; + mask1.union(&mask2); + assert_eq!(mask1.mask, Some(vec![true, true, true])); + + let mut mask1 = ProjectionMask { mask: None }; + let mask2 = ProjectionMask { + mask: Some(vec![false, true, true]), + }; + mask1.union(&mask2); + assert_eq!(mask1.mask, None); + + let mut mask1 = ProjectionMask { + mask: Some(vec![true, false, true]), + }; + let mask2 = ProjectionMask { mask: None }; + mask1.union(&mask2); + assert_eq!(mask1.mask, None); + + let mut mask1 = ProjectionMask { mask: None }; + let mask2 = ProjectionMask { mask: None }; + 
mask1.union(&mask2); + assert_eq!(mask1.mask, None); + } + + #[test] + fn test_projection_mask_intersect() { + let mut mask1 = ProjectionMask { + mask: Some(vec![true, false, true]), + }; + let mask2 = ProjectionMask { + mask: Some(vec![false, true, true]), + }; + mask1.intersect(&mask2); + assert_eq!(mask1.mask, Some(vec![false, false, true])); + + let mut mask1 = ProjectionMask { mask: None }; + let mask2 = ProjectionMask { + mask: Some(vec![false, true, true]), + }; + mask1.intersect(&mask2); + assert_eq!(mask1.mask, Some(vec![false, true, true])); + + let mut mask1 = ProjectionMask { + mask: Some(vec![true, false, true]), + }; + let mask2 = ProjectionMask { mask: None }; + mask1.intersect(&mask2); + assert_eq!(mask1.mask, Some(vec![true, false, true])); + + let mut mask1 = ProjectionMask { mask: None }; + let mask2 = ProjectionMask { mask: None }; + mask1.intersect(&mask2); + assert_eq!(mask1.mask, None); + } } From 27992680d5677b629d7b4ca11d3f94c2fe905f1a Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sat, 4 Jan 2025 12:24:48 +0200 Subject: [PATCH 015/503] feat(arrow-select): `concat` kernel will merge dictionary values for list of dictionaries (#6893) * feat(arrow-select): make list of dictionary merge dictionary keys TODO: - [ ] Add support to nested lists - [ ] Add more tests - [ ] Fix failing test * fix concat lists of dictionaries * format * remove unused import * improve test helper * feat: add merge offset buffers into one * format * add reproduction tst * recommit * fix clippy * fix clippy * fix clippy * improve offsets code according to code review * use concat dictionaries * add specialize code to concat lists to be able to use the concat dictionary logic * remove the use of ArrayData --- arrow-buffer/src/buffer/offset.rs | 52 ++++++++ arrow-select/src/concat.rs | 191 ++++++++++++++++++++++++++++-- 2 files changed, 232 insertions(+), 11 deletions(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index a6be2b67af84..164af6f01d0e 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -133,6 +133,38 @@ impl OffsetBuffer { Self(out.into()) } + /// Get an Iterator over the lengths of this [`OffsetBuffer`] + /// + /// ``` + /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer}; + /// let offsets = OffsetBuffer::<_>::new(ScalarBuffer::::from(vec![0, 1, 4, 9])); + /// assert_eq!(offsets.lengths().collect::>(), vec![1, 3, 5]); + /// ``` + /// + /// Empty [`OffsetBuffer`] will return an empty iterator + /// ``` + /// # use arrow_buffer::OffsetBuffer; + /// let offsets = OffsetBuffer::::new_empty(); + /// assert_eq!(offsets.lengths().count(), 0); + /// ``` + /// + /// This can be used to merge multiple [`OffsetBuffer`]s to one + /// ``` + /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer}; + /// + /// let buffer1 = OffsetBuffer::::from_lengths([2, 6, 3, 7, 2]); + /// let buffer2 = OffsetBuffer::::from_lengths([1, 3, 5, 7, 9]); + /// + /// let merged = OffsetBuffer::::from_lengths( + /// vec![buffer1, buffer2].iter().flat_map(|x| x.lengths()) + /// ); + /// + /// assert_eq!(merged.lengths().collect::>(), &[2, 6, 3, 7, 2, 1, 3, 5, 7, 9]); + /// ``` + pub fn lengths(&self) -> impl ExactSizeIterator + '_ { + self.0.windows(2).map(|x| x[1].as_usize() - x[0].as_usize()) + } + /// Free up unused memory. 
pub fn shrink_to_fit(&mut self) { self.0.shrink_to_fit(); @@ -244,4 +276,24 @@ mod tests { fn from_lengths_usize_overflow() { OffsetBuffer::::from_lengths([usize::MAX, 1]); } + + #[test] + fn get_lengths() { + let offsets = OffsetBuffer::::new(ScalarBuffer::::from(vec![0, 1, 4, 9])); + assert_eq!(offsets.lengths().collect::>(), vec![1, 3, 5]); + } + + #[test] + fn get_lengths_should_be_with_fixed_size() { + let offsets = OffsetBuffer::::new(ScalarBuffer::::from(vec![0, 1, 4, 9])); + let iter = offsets.lengths(); + assert_eq!(iter.size_hint(), (3, Some(3))); + assert_eq!(iter.len(), 3); + } + + #[test] + fn get_lengths_from_empty_offset_buffer_should_be_empty_iterator() { + let offsets = OffsetBuffer::::new_empty(); + assert_eq!(offsets.lengths().collect::>(), vec![]); + } } diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 129b90ee0470..4855e0087cc6 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -34,9 +34,9 @@ use crate::dictionary::{merge_dictionary_values, should_merge_dictionary_values} use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer}; +use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, OffsetBuffer}; use arrow_data::transform::{Capacities, MutableArrayData}; -use arrow_schema::{ArrowError, DataType, SchemaRef}; +use arrow_schema::{ArrowError, DataType, FieldRef, SchemaRef}; use std::sync::Arc; fn binary_capacity(arrays: &[&dyn Array]) -> Capacities { @@ -129,6 +129,54 @@ fn concat_dictionaries( Ok(Arc::new(array)) } +fn concat_lists( + arrays: &[&dyn Array], + field: &FieldRef, +) -> Result { + let mut output_len = 0; + let mut list_has_nulls = false; + + let lists = arrays + .iter() + .map(|x| x.as_list::()) + .inspect(|l| { + output_len += l.len(); + list_has_nulls |= l.null_count() != 0; + }) + .collect::>(); + + let lists_nulls = list_has_nulls.then(|| { + let mut nulls = BooleanBufferBuilder::new(output_len); + for l in &lists { + match l.nulls() { + Some(n) => nulls.append_buffer(n.inner()), + None => nulls.append_n(l.len(), true), + } + } + NullBuffer::new(nulls.finish()) + }); + + let values: Vec<&dyn Array> = lists + .iter() + .map(|x| x.values().as_ref()) + .collect::>(); + + let concatenated_values = concat(values.as_slice())?; + + // Merge value offsets from the lists + let value_offset_buffer = + OffsetBuffer::::from_lengths(lists.iter().flat_map(|x| x.offsets().lengths())); + + let array = GenericListArray::::try_new( + Arc::clone(field), + value_offset_buffer, + concatenated_values, + lists_nulls, + )?; + + Ok(Arc::new(array)) +} + macro_rules! dict_helper { ($t:ty, $arrays:expr) => { return Ok(Arc::new(concat_dictionaries::<$t>($arrays)?) as _) @@ -163,14 +211,20 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { "It is not possible to concatenate arrays of different data types.".to_string(), )); } - if let DataType::Dictionary(k, _) = d { - downcast_integer! { - k.as_ref() => (dict_helper, arrays), - _ => unreachable!("illegal dictionary key type {k}") - }; - } else { - let capacity = get_capacity(arrays, d); - concat_fallback(arrays, capacity) + + match d { + DataType::Dictionary(k, _) => { + downcast_integer! 
{ + k.as_ref() => (dict_helper, arrays), + _ => unreachable!("illegal dictionary key type {k}") + } + } + DataType::List(field) => concat_lists::(arrays, field), + DataType::LargeList(field) => concat_lists::(arrays, field), + _ => { + let capacity = get_capacity(arrays, d); + concat_fallback(arrays, capacity) + } } } @@ -228,8 +282,9 @@ pub fn concat_batches<'a>( #[cfg(test)] mod tests { use super::*; - use arrow_array::builder::StringDictionaryBuilder; + use arrow_array::builder::{GenericListBuilder, StringDictionaryBuilder}; use arrow_schema::{Field, Schema}; + use std::fmt::Debug; #[test] fn test_concat_empty_vec() { @@ -851,4 +906,118 @@ mod tests { assert_eq!(array.null_count(), 10); assert_eq!(array.logical_null_count(), 10); } + + #[test] + fn concat_dictionary_list_array_simple() { + let scalars = vec![ + create_single_row_list_of_dict(vec![Some("a")]), + create_single_row_list_of_dict(vec![Some("a")]), + create_single_row_list_of_dict(vec![Some("b")]), + ]; + + let arrays = scalars + .iter() + .map(|a| a as &(dyn Array)) + .collect::>(); + let concat_res = concat(arrays.as_slice()).unwrap(); + + let expected_list = create_list_of_dict(vec![ + // Row 1 + Some(vec![Some("a")]), + Some(vec![Some("a")]), + Some(vec![Some("b")]), + ]); + + let list = concat_res.as_list::(); + + // Assert that the list is equal to the expected list + list.iter().zip(expected_list.iter()).for_each(|(a, b)| { + assert_eq!(a, b); + }); + + assert_dictionary_has_unique_values::<_, StringArray>( + list.values().as_dictionary::(), + ); + } + + #[test] + fn concat_many_dictionary_list_arrays() { + let number_of_unique_values = 8; + let scalars = (0..80000) + .map(|i| { + create_single_row_list_of_dict(vec![Some( + (i % number_of_unique_values).to_string(), + )]) + }) + .collect::>(); + + let arrays = scalars + .iter() + .map(|a| a as &(dyn Array)) + .collect::>(); + let concat_res = concat(arrays.as_slice()).unwrap(); + + let expected_list = create_list_of_dict( + (0..80000) + .map(|i| Some(vec![Some((i % number_of_unique_values).to_string())])) + .collect::>(), + ); + + let list = concat_res.as_list::(); + + // Assert that the list is equal to the expected list + list.iter().zip(expected_list.iter()).for_each(|(a, b)| { + assert_eq!(a, b); + }); + + assert_dictionary_has_unique_values::<_, StringArray>( + list.values().as_dictionary::(), + ); + } + + fn create_single_row_list_of_dict( + list_items: Vec>>, + ) -> GenericListArray { + let rows = list_items.into_iter().map(Some).collect(); + + create_list_of_dict(vec![rows]) + } + + fn create_list_of_dict( + rows: Vec>>>>, + ) -> GenericListArray { + let mut builder = + GenericListBuilder::::new(StringDictionaryBuilder::::new()); + + for row in rows { + builder.append_option(row); + } + + builder.finish() + } + + fn assert_dictionary_has_unique_values<'a, K, V>(array: &'a DictionaryArray) + where + K: ArrowDictionaryKeyType, + V: Sync + Send + 'static, + &'a V: ArrayAccessor + IntoIterator, + + <&'a V as ArrayAccessor>::Item: Default + Clone + PartialEq + Debug + Ord, + <&'a V as IntoIterator>::Item: Clone + PartialEq + Debug + Ord, + { + let dict = array.downcast_dict::().unwrap(); + let mut values = dict.values().into_iter().collect::>(); + + // remove duplicates must be sorted first so we can compare + values.sort(); + + let mut unique_values = values.clone(); + + unique_values.dedup(); + + assert_eq!( + values, unique_values, + "There are duplicates in the value list (the value list here is sorted which is only for the assertion)" + ); + } } From 
133e13b12f73259ecaa74eb8378ab2d540adefd6 Mon Sep 17 00:00:00 2001 From: Himadri Pal Date: Sat, 4 Jan 2025 11:21:37 -0800 Subject: [PATCH 016/503] remove println (#6935) --- arrow-cast/src/cast/mod.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 7aff32b0beee..440d0a8becde 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -3767,7 +3767,6 @@ mod tests { Arc::new(StringViewArray::from(vec![Some("1.5"), Some("2.5"), None])); for array in inputs { - println!("type: {}", array.data_type()); assert!(can_cast_types(array.data_type(), &DataType::Utf8View)); let arr = cast(&array, &DataType::Utf8View).unwrap(); assert_eq!(expected.as_ref(), arr.as_ref()); @@ -9971,7 +9970,6 @@ mod tests { fn test_decimal_to_decimal_throw_error_on_precision_overflow_same_scale() { let array = vec![Some(123456789)]; let array = create_decimal_array(array, 24, 2).unwrap(); - println!("{:?}", array); let input_type = DataType::Decimal128(24, 2); let output_type = DataType::Decimal128(6, 2); assert!(can_cast_types(&input_type, &output_type)); @@ -9988,8 +9986,7 @@ mod tests { #[test] fn test_decimal_to_decimal_throw_error_on_precision_overflow_lower_scale() { let array = vec![Some(123456789)]; - let array = create_decimal_array(array, 24, 2).unwrap(); - println!("{:?}", array); + let array = create_decimal_array(array, 24, 4).unwrap(); let input_type = DataType::Decimal128(24, 4); let output_type = DataType::Decimal128(6, 2); assert!(can_cast_types(&input_type, &output_type)); @@ -10000,14 +9997,13 @@ mod tests { }; let result = cast_with_options(&array, &output_type, &options); assert_eq!(result.unwrap_err().to_string(), - "Invalid argument error: 123456790 is too large to store in a Decimal128 of precision 6. Max is 999999"); + "Invalid argument error: 1234568 is too large to store in a Decimal128 of precision 6. Max is 999999"); } #[test] fn test_decimal_to_decimal_throw_error_on_precision_overflow_greater_scale() { let array = vec![Some(123456789)]; let array = create_decimal_array(array, 24, 2).unwrap(); - println!("{:?}", array); let input_type = DataType::Decimal128(24, 2); let output_type = DataType::Decimal128(6, 3); assert!(can_cast_types(&input_type, &output_type)); @@ -10025,7 +10021,6 @@ mod tests { fn test_decimal_to_decimal_throw_error_on_precision_overflow_diff_type() { let array = vec![Some(123456789)]; let array = create_decimal_array(array, 24, 2).unwrap(); - println!("{:?}", array); let input_type = DataType::Decimal128(24, 2); let output_type = DataType::Decimal256(6, 2); assert!(can_cast_types(&input_type, &output_type)); From 24a6bff6769a5c6062aafe52b6702459086d3b94 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 4 Jan 2025 14:32:17 -0500 Subject: [PATCH 017/503] Minor: improve `zip` kernel docs, add examples (#6928) * Minor: improve `zip` kernel docs` * Add example for zip with scalar --- arrow-select/src/zip.rs | 66 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/arrow-select/src/zip.rs b/arrow-select/src/zip.rs index acb31dfa3bc2..2efd2e749921 100644 --- a/arrow-select/src/zip.rs +++ b/arrow-select/src/zip.rs @@ -15,20 +15,72 @@ // specific language governing permissions and limitations // under the License. -//! Zip two arrays by some boolean mask. Where the mask evaluates `true` values of `truthy` +//! 
[`zip`]: Combine values from two arrays based on boolean mask use crate::filter::SlicesIterator; use arrow_array::*; use arrow_data::transform::MutableArrayData; use arrow_schema::ArrowError; -/// Zip two arrays by some boolean mask. Where the mask evaluates `true` values of `truthy` -/// are taken, where the mask evaluates `false` values of `falsy` are taken. +/// Zip two arrays by some boolean mask. /// -/// # Arguments -/// * `mask` - Boolean values used to determine from which array to take the values. -/// * `truthy` - Values of this array are taken if mask evaluates `true` -/// * `falsy` - Values of this array are taken if mask evaluates `false` +/// - Where `mask` is `true`, values of `truthy` are taken +/// - Where `mask` is `false` or `NULL`, values of `falsy` are taken +/// +/// # Example: `zip` two arrays +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, BooleanArray, Int32Array}; +/// # use arrow_select::zip::zip; +/// // mask: [true, true, false, NULL, true] +/// let mask = BooleanArray::from(vec![ +/// Some(true), Some(true), Some(false), None, Some(true) +/// ]); +/// // truthy array: [1, NULL, 3, 4, 5] +/// let truthy = Int32Array::from(vec![ +/// Some(1), None, Some(3), Some(4), Some(5) +/// ]); +/// // falsy array: [10, 20, 30, 40, 50] +/// let falsy = Int32Array::from(vec![ +/// Some(10), Some(20), Some(30), Some(40), Some(50) +/// ]); +/// // zip with this mask select the first, second and last value from `truthy` +/// // and the third and fourth value from `falsy` +/// let result = zip(&mask, &truthy, &falsy).unwrap(); +/// // Expected: [1, NULL, 30, 40, 5] +/// let expected: ArrayRef = Arc::new(Int32Array::from(vec![ +/// Some(1), None, Some(30), Some(40), Some(5) +/// ])); +/// assert_eq!(&result, &expected); +/// ``` +/// +/// # Example: `zip` and array with a scalar +/// +/// Use `zip` to replace certain values in an array with a scalar +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, BooleanArray, Int32Array}; +/// # use arrow_select::zip::zip; +/// // mask: [true, true, false, NULL, true] +/// let mask = BooleanArray::from(vec![ +/// Some(true), Some(true), Some(false), None, Some(true) +/// ]); +/// // array: [1, NULL, 3, 4, 5] +/// let arr = Int32Array::from(vec![ +/// Some(1), None, Some(3), Some(4), Some(5) +/// ]); +/// // scalar: 42 +/// let scalar = Int32Array::new_scalar(42); +/// // zip the array with the mask select the first, second and last value from `arr` +/// // and fill the third and fourth value with the scalar 42 +/// let result = zip(&mask, &arr, &scalar).unwrap(); +/// // Expected: [1, NULL, 42, 42, 5] +/// let expected: ArrayRef = Arc::new(Int32Array::from(vec![ +/// Some(1), None, Some(42), Some(42), Some(5) +/// ])); +/// assert_eq!(&result, &expected); +/// ``` pub fn zip( mask: &BooleanArray, truthy: &dyn Datum, From 2b452048e46ae7e8bf2b2a4b504b535197d56c21 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Sun, 5 Jan 2025 15:54:14 +0530 Subject: [PATCH 018/503] Minor clippy fixes (#6942) --- arrow-cast/src/parse.rs | 14 ++++---------- arrow-flight/src/encode.rs | 2 +- object_store/src/aws/client.rs | 2 +- object_store/src/azure/client.rs | 2 +- object_store/src/gcp/client.rs | 2 +- parquet/src/arrow/async_reader/mod.rs | 2 +- parquet/src/column/writer/mod.rs | 4 ++-- 7 files changed, 11 insertions(+), 17 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index f4c4639c1c08..4e93e9787cc8 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -881,7 +881,7 
@@ pub fn parse_decimal( for (_, b) in bs.by_ref() { if !b.is_ascii_digit() { if *b == b'e' || *b == b'E' { - result = match parse_e_notation::( + result = parse_e_notation::( s, digits as u16, fractionals as i16, @@ -889,10 +889,7 @@ pub fn parse_decimal( point_index, precision as u16, scale as i16, - ) { - Err(e) => return Err(e), - Ok(v) => v, - }; + )?; is_e_notation = true; @@ -926,7 +923,7 @@ pub fn parse_decimal( } } b'e' | b'E' => { - result = match parse_e_notation::( + result = parse_e_notation::( s, digits as u16, fractionals as i16, @@ -934,10 +931,7 @@ pub fn parse_decimal( index, precision as u16, scale as i16, - ) { - Err(e) => return Err(e), - Ok(v) => v, - }; + )?; is_e_notation = true; diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 315b7b3cb6e5..19fe42474405 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -1833,7 +1833,7 @@ mod tests { .flight_descriptor .as_ref() .map(|descriptor| { - let path_len: usize = descriptor.path.iter().map(|p| p.as_bytes().len()).sum(); + let path_len: usize = descriptor.path.iter().map(|p| p.len()).sum(); std::mem::size_of_val(descriptor) + descriptor.cmd.len() + path_len }) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 25fdd3311c95..b81be0c0efad 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -299,7 +299,7 @@ pub(crate) struct Request<'a> { retry_error_body: bool, } -impl<'a> Request<'a> { +impl Request<'_> { pub(crate) fn query(self, query: &T) -> Self { let builder = self.builder.query(query); Self { builder, ..self } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index ea3a5faf3ad8..bd72d0c6aee1 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -198,7 +198,7 @@ struct PutRequest<'a> { idempotent: bool, } -impl<'a> PutRequest<'a> { +impl PutRequest<'_> { fn header(self, k: &HeaderName, v: &str) -> Self { let builder = self.builder.header(k, v); Self { builder, ..self } diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs index 1928d13b4739..d6f89ca71740 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -173,7 +173,7 @@ pub(crate) struct Request<'a> { idempotent: bool, } -impl<'a> Request<'a> { +impl Request<'_> { fn header(self, k: &HeaderName, v: &str) -> Self { let builder = self.builder.header(k, v); Self { builder, ..self } diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 96715e1164b2..4f3befe42662 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -792,7 +792,7 @@ struct InMemoryRowGroup<'a> { row_count: usize, } -impl<'a> InMemoryRowGroup<'a> { +impl InMemoryRowGroup<'_> { /// Fetches the necessary column data into memory async fn fetch( &mut self, diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8dc1d0db4476..5f34f34cbb7a 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -3275,8 +3275,8 @@ mod tests { fn test_truncate_utf8() { // No-op let data = "❤️🧡💛💚💙💜"; - let r = truncate_utf8(data, data.as_bytes().len()).unwrap(); - assert_eq!(r.len(), data.as_bytes().len()); + let r = truncate_utf8(data, data.len()).unwrap(); + assert_eq!(r.len(), data.len()); assert_eq!(&r, data.as_bytes()); // We slice it away from the UTF8 boundary From fc245b55ba85d3535f0702706e886ce67e95f1d2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb 
Date: Mon, 6 Jan 2025 16:03:51 -0500 Subject: [PATCH 019/503] Improve `Buffer` documentation, deprecate `Buffer::from_bytes` add `From` and `From` impls (#6939) * Improve Bytes documentation * Improve Buffer documentation, add From and From impls * avoid linking to private docs * Deprecate `Buffer::from_bytes` * Apply suggestions from code review Co-authored-by: Jeffrey Vo --------- Co-authored-by: Jeffrey Vo --- arrow-buffer/src/buffer/immutable.rs | 118 ++++++++++++++---- arrow-buffer/src/buffer/mutable.rs | 2 +- arrow-buffer/src/bytes.rs | 8 +- arrow-flight/src/decode.rs | 2 +- arrow-flight/src/sql/client.rs | 2 +- .../src/arrow/array_reader/byte_view_array.rs | 10 +- 6 files changed, 104 insertions(+), 38 deletions(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index cf1d6f366751..fd145ce2306e 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -28,8 +28,43 @@ use crate::{bit_util, bytes::Bytes, native::ArrowNativeType}; use super::ops::bitwise_unary_op_helper; use super::{MutableBuffer, ScalarBuffer}; -/// Buffer represents a contiguous memory region that can be shared with other buffers and across -/// thread boundaries. +/// A contiguous memory region that can be shared with other buffers and across +/// thread boundaries that stores Arrow data. +/// +/// `Buffer`s can be sliced and cloned without copying the underlying data and can +/// be created from memory allocated by non-Rust sources such as C/C++. +/// +/// # Example: Create a `Buffer` from a `Vec` (without copying) +/// ``` +/// # use arrow_buffer::Buffer; +/// let vec: Vec = vec![1, 2, 3]; +/// let buffer = Buffer::from(vec); +/// ``` +/// +/// # Example: Convert a `Buffer` to a `Vec` (without copying) +/// +/// Use [`Self::into_vec`] to convert a `Buffer` back into a `Vec` if there are +/// no other references and the types are aligned correctly. +/// ``` +/// # use arrow_buffer::Buffer; +/// # let vec: Vec = vec![1, 2, 3]; +/// # let buffer = Buffer::from(vec); +/// // convert the buffer back into a Vec of u32 +/// // note this will fail if the buffer is shared or not aligned correctly +/// let vec: Vec = buffer.into_vec().unwrap(); +/// ``` +/// +/// # Example: Create a `Buffer` from a [`bytes::Bytes`] (without copying) +/// +/// [`bytes::Bytes`] is a common type in the Rust ecosystem for shared memory +/// regions. You can create a buffer from a `Bytes` instance using the `From` +/// implementation, also without copying. +/// +/// ``` +/// # use arrow_buffer::Buffer; +/// let bytes = bytes::Bytes::from("hello"); +/// let buffer = Buffer::from(bytes); +///``` #[derive(Clone, Debug)] pub struct Buffer { /// the internal byte buffer. @@ -59,24 +94,15 @@ unsafe impl Send for Buffer where Bytes: Send {} unsafe impl Sync for Buffer where Bytes: Sync {} impl Buffer { - /// Auxiliary method to create a new Buffer + /// Create a new Buffer from a (internal) `Bytes` /// - /// This can be used with a [`bytes::Bytes`] via `into()`: + /// NOTE despite the same name, `Bytes` is an internal struct in arrow-rs + /// and is different than [`bytes::Bytes`]. /// - /// ``` - /// # use arrow_buffer::Buffer; - /// let bytes = bytes::Bytes::from_static(b"foo"); - /// let buffer = Buffer::from_bytes(bytes.into()); - /// ``` - #[inline] + /// See examples on [`Buffer`] for ways to create a buffer from a [`bytes::Bytes`]. 
+ #[deprecated(since = "54.1.0", note = "Use Buffer::from instead")] pub fn from_bytes(bytes: Bytes) -> Self { - let length = bytes.len(); - let ptr = bytes.as_ptr(); - Buffer { - data: Arc::new(bytes), - ptr, - length, - } + Self::from(bytes) } /// Returns the offset, in bytes, of `Self::ptr` to `Self::data` @@ -107,8 +133,11 @@ impl Buffer { buffer.into() } - /// Creates a buffer from an existing memory region. Ownership of the memory is tracked via reference counting - /// and the memory will be freed using the `drop` method of [crate::alloc::Allocation] when the reference count reaches zero. + /// Creates a buffer from an existing memory region. + /// + /// Ownership of the memory is tracked via reference counting + /// and the memory will be freed using the `drop` method of + /// [crate::alloc::Allocation] when the reference count reaches zero. /// /// # Arguments /// @@ -155,7 +184,7 @@ impl Buffer { self.data.capacity() } - /// Tried to shrink the capacity of the buffer as much as possible, freeing unused memory. + /// Tries to shrink the capacity of the buffer as much as possible, freeing unused memory. /// /// If the buffer is shared, this is a no-op. /// @@ -190,7 +219,7 @@ impl Buffer { } } - /// Returns whether the buffer is empty. + /// Returns true if the buffer is empty. #[inline] pub fn is_empty(&self) -> bool { self.length == 0 @@ -206,7 +235,9 @@ impl Buffer { } /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`. - /// Doing so allows the same memory region to be shared between buffers. + /// + /// This function is `O(1)` and does not copy any data, allowing the + /// same memory region to be shared between buffers. /// /// # Panics /// @@ -240,7 +271,10 @@ impl Buffer { /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`, /// with `length` bytes. - /// Doing so allows the same memory region to be shared between buffers. + /// + /// This function is `O(1)` and does not copy any data, allowing the same + /// memory region to be shared between buffers. + /// /// # Panics /// Panics iff `(offset + length)` is larger than the existing length. pub fn slice_with_length(&self, offset: usize, length: usize) -> Self { @@ -328,10 +362,16 @@ impl Buffer { }) } - /// Returns `Vec` for mutating the buffer + /// Converts self into a `Vec`, if possible. + /// + /// This can be used to reuse / mutate the underlying data. /// - /// Returns `Err(self)` if this buffer does not have the same [`Layout`] as - /// the destination Vec or contains a non-zero offset + /// # Errors + /// + /// Returns `Err(self)` if + /// 1. this buffer does not have the same [`Layout`] as the destination Vec + /// 2. contains a non-zero offset + /// 3. 
The buffer is shared pub fn into_vec(self) -> Result, Self> { let layout = match self.data.deallocation() { Deallocation::Standard(l) => l, @@ -414,7 +454,29 @@ impl From> for Buffer { } } -/// Creating a `Buffer` instance by storing the boolean values into the buffer +/// Convert from internal `Bytes` (not [`bytes::Bytes`]) to `Buffer` +impl From for Buffer { + #[inline] + fn from(bytes: Bytes) -> Self { + let length = bytes.len(); + let ptr = bytes.as_ptr(); + Self { + data: Arc::new(bytes), + ptr, + length, + } + } +} + +/// Convert from [`bytes::Bytes`], not internal `Bytes` to `Buffer` +impl From for Buffer { + fn from(bytes: bytes::Bytes) -> Self { + let bytes: Bytes = bytes.into(); + Self::from(bytes) + } +} + +/// Create a `Buffer` instance by storing the boolean values into the buffer impl FromIterator for Buffer { fn from_iter(iter: I) -> Self where @@ -447,7 +509,9 @@ impl From> for Buffer { impl Buffer { /// Creates a [`Buffer`] from an [`Iterator`] with a trusted (upper) length. + /// /// Prefer this to `collect` whenever possible, as it is ~60% faster. + /// /// # Example /// ``` /// # use arrow_buffer::buffer::Buffer; diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index c4315a1d64cd..5ad55e306e2a 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -328,7 +328,7 @@ impl MutableBuffer { pub(super) fn into_buffer(self) -> Buffer { let bytes = unsafe { Bytes::new(self.data, self.len, Deallocation::Standard(self.layout)) }; std::mem::forget(self); - Buffer::from_bytes(bytes) + Buffer::from(bytes) } /// View this buffer as a mutable slice of a specific type. diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index 77724137aef7..b811bd2c6b40 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -28,14 +28,18 @@ use crate::buffer::dangling_ptr; /// A continuous, fixed-size, immutable memory region that knows how to de-allocate itself. /// -/// This structs' API is inspired by the `bytes::Bytes`, but it is not limited to using rust's -/// global allocator nor u8 alignment. +/// Note that this structure is an internal implementation detail of the +/// arrow-rs crate. While it has the same name and similar API as +/// [`bytes::Bytes`] it is not limited to rust's global allocator nor u8 +/// alignment. It is possible to create a `Bytes` from `bytes::Bytes` using the +/// `From` implementation. /// /// In the most common case, this buffer is allocated using [`alloc`](std::alloc::alloc) /// with an alignment of [`ALIGNMENT`](crate::alloc::ALIGNMENT) /// /// When the region is allocated by a different allocator, [Deallocation::Custom], this calls the /// custom deallocator to deallocate the region when it is no longer needed. 
+/// pub struct Bytes { /// The raw pointer to be beginning of the region ptr: NonNull, diff --git a/arrow-flight/src/decode.rs b/arrow-flight/src/decode.rs index 7bafc384306b..760fc926fca6 100644 --- a/arrow-flight/src/decode.rs +++ b/arrow-flight/src/decode.rs @@ -295,7 +295,7 @@ impl FlightDataDecoder { )); }; - let buffer = Buffer::from_bytes(data.data_body.into()); + let buffer = Buffer::from(data.data_body); let dictionary_batch = message.header_as_dictionary_batch().ok_or_else(|| { FlightError::protocol( "Could not get dictionary batch from DictionaryBatch message", diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index a6e228737b3f..6d3ac3dbe610 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -721,7 +721,7 @@ pub fn arrow_data_from_flight_data( let dictionaries_by_field = HashMap::new(); let record_batch = read_record_batch( - &Buffer::from_bytes(flight_data.data_body.into()), + &Buffer::from(flight_data.data_body), ipc_record_batch, arrow_schema_ref.clone(), &dictionaries_by_field, diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index 5845e2c08cec..92a8b0592d0d 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ -316,9 +316,8 @@ impl ByteViewArrayDecoderPlain { } pub fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result { - // Here we convert `bytes::Bytes` into `arrow_buffer::Bytes`, which is zero copy - // Then we convert `arrow_buffer::Bytes` into `arrow_buffer:Buffer`, which is also zero copy - let buf = arrow_buffer::Buffer::from_bytes(self.buf.clone().into()); + // Zero copy convert `bytes::Bytes` into `arrow_buffer::Buffer` + let buf = arrow_buffer::Buffer::from(self.buf.clone()); let block_id = output.append_block(buf); let to_read = len.min(self.max_remaining_values); @@ -549,9 +548,8 @@ impl ByteViewArrayDecoderDeltaLength { let src_lengths = &self.lengths[self.length_offset..self.length_offset + to_read]; - // Here we convert `bytes::Bytes` into `arrow_buffer::Bytes`, which is zero copy - // Then we convert `arrow_buffer::Bytes` into `arrow_buffer:Buffer`, which is also zero copy - let bytes = arrow_buffer::Buffer::from_bytes(self.data.clone().into()); + // Zero copy convert `bytes::Bytes` into `arrow_buffer::Buffer` + let bytes = Buffer::from(self.data.clone()); let block_id = output.append_block(bytes); let mut current_offset = self.data_offset; From a160e94aa8f1845a264ef208a2ab0fb8d9137240 Mon Sep 17 00:00:00 2001 From: Jinpeng Date: Mon, 6 Jan 2025 16:09:05 -0500 Subject: [PATCH 020/503] Convert some panics that happen on invalid parquet files to error results (#6738) * Reduce panics * t pushmove integer logical type from format.rs to schema type.rs * remove some changes as per reviews * use wrapping_shl * fix typo in error message * return error for invalid decimal length --------- Co-authored-by: jp0317 Co-authored-by: Andrew Lamb --- parquet/src/errors.rs | 7 ++++ parquet/src/file/metadata/reader.rs | 26 ++++++------- parquet/src/file/serialized_reader.rs | 53 ++++++++++++++++++++++---- parquet/src/file/statistics.rs | 26 +++++++++++++ parquet/src/schema/types.rs | 25 +++++++++++- parquet/src/thrift.rs | 35 ++++++++++++++--- parquet/tests/arrow_reader/bad_data.rs | 2 +- 7 files changed, 146 insertions(+), 28 deletions(-) diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index 8dc97f4ca2e6..d749287bba62 100644 --- a/parquet/src/errors.rs +++ 
b/parquet/src/errors.rs @@ -17,6 +17,7 @@ //! Common Parquet errors and macros. +use core::num::TryFromIntError; use std::error::Error; use std::{cell, io, result, str}; @@ -81,6 +82,12 @@ impl Error for ParquetError { } } +impl From for ParquetError { + fn from(e: TryFromIntError) -> ParquetError { + ParquetError::General(format!("Integer overflow: {e}")) + } +} + impl From for ParquetError { fn from(e: io::Error) -> ParquetError { ParquetError::External(Box::new(e)) diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index ec2cd1094d3a..c6715a33b5ae 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -627,7 +627,8 @@ impl ParquetMetaDataReader { for rg in t_file_metadata.row_groups { row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); } - let column_orders = Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr); + let column_orders = + Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; let file_metadata = FileMetaData::new( t_file_metadata.version, @@ -645,15 +646,13 @@ impl ParquetMetaDataReader { fn parse_column_orders( t_column_orders: Option>, schema_descr: &SchemaDescriptor, - ) -> Option> { + ) -> Result>> { match t_column_orders { Some(orders) => { // Should always be the case - assert_eq!( - orders.len(), - schema_descr.num_columns(), - "Column order length mismatch" - ); + if orders.len() != schema_descr.num_columns() { + return Err(general_err!("Column order length mismatch")); + }; let mut res = Vec::new(); for (i, column) in schema_descr.columns().iter().enumerate() { match orders[i] { @@ -667,9 +666,9 @@ impl ParquetMetaDataReader { } } } - Some(res) + Ok(Some(res)) } - None => None, + None => Ok(None), } } } @@ -741,7 +740,7 @@ mod tests { ]); assert_eq!( - ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr), + ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr).unwrap(), Some(vec![ ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED) @@ -750,20 +749,21 @@ mod tests { // Test when no column orders are defined. 
assert_eq!( - ParquetMetaDataReader::parse_column_orders(None, &schema_descr), + ParquetMetaDataReader::parse_column_orders(None, &schema_descr).unwrap(), None ); } #[test] - #[should_panic(expected = "Column order length mismatch")] fn test_metadata_column_orders_len_mismatch() { let schema = SchemaType::group_type_builder("schema").build().unwrap(); let schema_descr = SchemaDescriptor::new(Arc::new(schema)); let t_column_orders = Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); - ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr); + let res = ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr); + assert!(res.is_err()); + assert!(format!("{:?}", res.unwrap_err()).contains("Column order length mismatch")); } #[test] diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 06f3cf9fb23f..a942481f7e4d 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -435,7 +435,7 @@ pub(crate) fn decode_page( let is_sorted = dict_header.is_sorted.unwrap_or(false); Page::DictionaryPage { buf: buffer, - num_values: dict_header.num_values as u32, + num_values: dict_header.num_values.try_into()?, encoding: Encoding::try_from(dict_header.encoding)?, is_sorted, } @@ -446,7 +446,7 @@ pub(crate) fn decode_page( .ok_or_else(|| ParquetError::General("Missing V1 data page header".to_string()))?; Page::DataPage { buf: buffer, - num_values: header.num_values as u32, + num_values: header.num_values.try_into()?, encoding: Encoding::try_from(header.encoding)?, def_level_encoding: Encoding::try_from(header.definition_level_encoding)?, rep_level_encoding: Encoding::try_from(header.repetition_level_encoding)?, @@ -460,12 +460,12 @@ pub(crate) fn decode_page( let is_compressed = header.is_compressed.unwrap_or(true); Page::DataPageV2 { buf: buffer, - num_values: header.num_values as u32, + num_values: header.num_values.try_into()?, encoding: Encoding::try_from(header.encoding)?, - num_nulls: header.num_nulls as u32, - num_rows: header.num_rows as u32, - def_levels_byte_len: header.definition_levels_byte_length as u32, - rep_levels_byte_len: header.repetition_levels_byte_length as u32, + num_nulls: header.num_nulls.try_into()?, + num_rows: header.num_rows.try_into()?, + def_levels_byte_len: header.definition_levels_byte_length.try_into()?, + rep_levels_byte_len: header.repetition_levels_byte_length.try_into()?, is_compressed, statistics: statistics::from_thrift(physical_type, header.statistics)?, } @@ -578,6 +578,27 @@ impl Iterator for SerializedPageReader { } } +fn verify_page_header_len(header_len: usize, remaining_bytes: usize) -> Result<()> { + if header_len > remaining_bytes { + return Err(eof_err!("Invalid page header")); + } + Ok(()) +} + +fn verify_page_size( + compressed_size: i32, + uncompressed_size: i32, + remaining_bytes: usize, +) -> Result<()> { + // The page's compressed size should not exceed the remaining bytes that are + // available to read. The page's uncompressed size is the expected size + // after decompression, which can never be negative. 
+ if compressed_size < 0 || compressed_size as usize > remaining_bytes || uncompressed_size < 0 { + return Err(eof_err!("Invalid page header")); + } + Ok(()) +} + impl PageReader for SerializedPageReader { fn get_next_page(&mut self) -> Result> { loop { @@ -596,10 +617,16 @@ impl PageReader for SerializedPageReader { *header } else { let (header_len, header) = read_page_header_len(&mut read)?; + verify_page_header_len(header_len, *remaining)?; *offset += header_len; *remaining -= header_len; header }; + verify_page_size( + header.compressed_page_size, + header.uncompressed_page_size, + *remaining, + )?; let data_len = header.compressed_page_size as usize; *offset += data_len; *remaining -= data_len; @@ -683,6 +710,7 @@ impl PageReader for SerializedPageReader { } else { let mut read = self.reader.get_read(*offset as u64)?; let (header_len, header) = read_page_header_len(&mut read)?; + verify_page_header_len(header_len, *remaining_bytes)?; *offset += header_len; *remaining_bytes -= header_len; let page_meta = if let Ok(page_meta) = (&header).try_into() { @@ -733,12 +761,23 @@ impl PageReader for SerializedPageReader { next_page_header, } => { if let Some(buffered_header) = next_page_header.take() { + verify_page_size( + buffered_header.compressed_page_size, + buffered_header.uncompressed_page_size, + *remaining_bytes, + )?; // The next page header has already been peeked, so just advance the offset *offset += buffered_header.compressed_page_size as usize; *remaining_bytes -= buffered_header.compressed_page_size as usize; } else { let mut read = self.reader.get_read(*offset as u64)?; let (header_len, header) = read_page_header_len(&mut read)?; + verify_page_header_len(header_len, *remaining_bytes)?; + verify_page_size( + header.compressed_page_size, + header.uncompressed_page_size, + *remaining_bytes, + )?; let data_page_size = header.compressed_page_size as usize; *offset += header_len + data_page_size; *remaining_bytes -= header_len + data_page_size; diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 2e05b83369cf..b7522a76f0fc 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -157,6 +157,32 @@ pub fn from_thrift( stats.max_value }; + fn check_len(min: &Option>, max: &Option>, len: usize) -> Result<()> { + if let Some(min) = min { + if min.len() < len { + return Err(ParquetError::General( + "Insufficient bytes to parse min statistic".to_string(), + )); + } + } + if let Some(max) = max { + if max.len() < len { + return Err(ParquetError::General( + "Insufficient bytes to parse max statistic".to_string(), + )); + } + } + Ok(()) + } + + match physical_type { + Type::BOOLEAN => check_len(&min, &max, 1), + Type::INT32 | Type::FLOAT => check_len(&min, &max, 4), + Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8), + Type::INT96 => check_len(&min, &max, 12), + _ => Ok(()), + }?; + // Values are encoded using PLAIN encoding definition, except that // variable-length byte arrays do not include a length prefix. 
// diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index d168e46de047..d9e9b22e809f 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -556,7 +556,11 @@ impl<'a> PrimitiveTypeBuilder<'a> { } } PhysicalType::FIXED_LEN_BYTE_ARRAY => { - let max_precision = (2f64.powi(8 * self.length - 1) - 1f64).log10().floor() as i32; + let length = self + .length + .checked_mul(8) + .ok_or(general_err!("Invalid length {} for Decimal", self.length))?; + let max_precision = (2f64.powi(length - 1) - 1f64).log10().floor() as i32; if self.precision > max_precision { return Err(general_err!( @@ -1171,9 +1175,25 @@ pub fn from_thrift(elements: &[SchemaElement]) -> Result { )); } + if !schema_nodes[0].is_group() { + return Err(general_err!("Expected root node to be a group type")); + } + Ok(schema_nodes.remove(0)) } +/// Checks if the logical type is valid. +fn check_logical_type(logical_type: &Option) -> Result<()> { + if let Some(LogicalType::Integer { bit_width, .. }) = *logical_type { + if bit_width != 8 && bit_width != 16 && bit_width != 32 && bit_width != 64 { + return Err(general_err!( + "Bit width must be 8, 16, 32, or 64 for Integer logical type" + )); + } + } + Ok(()) +} + /// Constructs a new Type from the `elements`, starting at index `index`. /// The first result is the starting index for the next Type after this one. If it is /// equal to `elements.len()`, then this Type is the last one. @@ -1198,6 +1218,9 @@ fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize .logical_type .as_ref() .map(|value| LogicalType::from(value.clone())); + + check_logical_type(&logical_type)?; + let field_id = elements[index].field_id; match elements[index].num_children { // From parquet-format: diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs index ceb6b1c29fe8..b216fec6f3e7 100644 --- a/parquet/src/thrift.rs +++ b/parquet/src/thrift.rs @@ -67,7 +67,7 @@ impl<'a> TCompactSliceInputProtocol<'a> { let mut shift = 0; loop { let byte = self.read_byte()?; - in_progress |= ((byte & 0x7F) as u64) << shift; + in_progress |= ((byte & 0x7F) as u64).wrapping_shl(shift); shift += 7; if byte & 0x80 == 0 { return Ok(in_progress); @@ -96,13 +96,22 @@ impl<'a> TCompactSliceInputProtocol<'a> { } } +macro_rules! 
thrift_unimplemented { + () => { + Err(thrift::Error::Protocol(thrift::ProtocolError { + kind: thrift::ProtocolErrorKind::NotImplemented, + message: "not implemented".to_string(), + })) + }; +} + impl TInputProtocol for TCompactSliceInputProtocol<'_> { fn read_message_begin(&mut self) -> thrift::Result { unimplemented!() } fn read_message_end(&mut self) -> thrift::Result<()> { - unimplemented!() + thrift_unimplemented!() } fn read_struct_begin(&mut self) -> thrift::Result> { @@ -147,7 +156,21 @@ impl TInputProtocol for TCompactSliceInputProtocol<'_> { ), _ => { if field_delta != 0 { - self.last_read_field_id += field_delta as i16; + self.last_read_field_id = self + .last_read_field_id + .checked_add(field_delta as i16) + .map_or_else( + || { + Err(thrift::Error::Protocol(thrift::ProtocolError { + kind: thrift::ProtocolErrorKind::InvalidData, + message: format!( + "cannot add {} to {}", + field_delta, self.last_read_field_id + ), + })) + }, + Ok, + )?; } else { self.last_read_field_id = self.read_i16()?; }; @@ -226,15 +249,15 @@ impl TInputProtocol for TCompactSliceInputProtocol<'_> { } fn read_set_begin(&mut self) -> thrift::Result { - unimplemented!() + thrift_unimplemented!() } fn read_set_end(&mut self) -> thrift::Result<()> { - unimplemented!() + thrift_unimplemented!() } fn read_map_begin(&mut self) -> thrift::Result { - unimplemented!() + thrift_unimplemented!() } fn read_map_end(&mut self) -> thrift::Result<()> { diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index 74342031432a..cfd61e82d32b 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -106,7 +106,7 @@ fn test_arrow_rs_gh_6229_dict_header() { let err = read_file("ARROW-RS-GH-6229-DICTHEADER.parquet").unwrap_err(); assert_eq!( err.to_string(), - "External: Parquet argument error: EOF: eof decoding byte array" + "External: Parquet argument error: Parquet error: Integer overflow: out of range integral type conversion attempted" ); } From 4f1f6e57c568fae8233ab9da7d7c7acdaea4112a Mon Sep 17 00:00:00 2001 From: June <61218022+itsjunetime@users.noreply.github.com> Date: Mon, 6 Jan 2025 14:13:56 -0700 Subject: [PATCH 021/503] Update MSRVs to be accurate (#6742) * Update most MSRVs * Make cargo-msrv verify every package in repo instead of just a select few and purposefully break arrow-flight msrv * Add test to ensure workspace rust version is being used at least somewhere * Fix exit1 => exit 1 * Make arrow-flight work, at the very least, with 'cargo metadata' * Fix arrow-flight/gen rust-version to make CI pass now * Get rid of pretty msrv logging as it can't all be displayed * Do '-mindepth 2' with find to prevent running cargo msrv on the workspace as a whole * Use correct MSRV for object_store * remove workspace msrv check * revert msrv * push object_store MSRV back down to 1.62.1 * Revert unrelated formatting changes * Fix object_store msrv --------- Co-authored-by: Andrew Lamb Co-authored-by: Jeffrey Vo --- .github/workflows/rust.yml | 28 +++++--------------- Cargo.toml | 2 +- arrow-flight/gen/Cargo.toml | 2 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-pyarrow-integration-testing/Cargo.toml | 2 +- arrow-schema/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- parquet/Cargo.toml | 2 +- 8 files changed, 14 insertions(+), 28 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 044250b70435..ca0d2441ceae 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -123,23 +123,6 @@ jobs: uses: 
./.github/actions/setup-builder - name: Install cargo-msrv run: cargo install cargo-msrv - - name: Downgrade arrow dependencies - run: cargo update -p ahash --precise 0.8.7 - - name: Check arrow - working-directory: arrow - run: | - # run `cd arrow; cargo msrv verify` to see problematic dependencies - cargo msrv verify --output-format=json - - name: Check parquet - working-directory: parquet - run: | - # run `cd parquet; cargo msrv verify` to see problematic dependencies - cargo msrv verify --output-format=json - - name: Check arrow-flight - working-directory: arrow-flight - run: | - # run `cd arrow-flight; cargo msrv verify` to see problematic dependencies - cargo msrv verify --output-format=json - name: Downgrade object_store dependencies working-directory: object_store # Necessary because tokio 1.30.0 updates MSRV to 1.63 @@ -147,8 +130,11 @@ jobs: run: | cargo update -p tokio --precise 1.29.1 cargo update -p url --precise 2.5.0 - - name: Check object_store - working-directory: object_store + - name: Check all packages run: | - # run `cd object_store; cargo msrv verify` to see problematic dependencies - cargo msrv verify --output-format=json + # run `cargo msrv verify --manifest-path "path/to/Cargo.toml"` to see problematic dependencies + find . -mindepth 2 -name Cargo.toml | while read -r dir + do + echo "Checking package '$dir'" + cargo msrv verify --manifest-path "$dir" --output-format=json || exit 1 + done diff --git a/Cargo.toml b/Cargo.toml index 75ba410f12a6..39e3c0bca99a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ include = [ "Cargo.toml", ] edition = "2021" -rust-version = "1.62" +rust-version = "1.70" [workspace.dependencies] arrow = { version = "54.0.0", path = "./arrow", default-features = false } diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 6358227a8912..e52efbf67e21 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -20,7 +20,7 @@ name = "gen" description = "Code generation for arrow-flight" version = "0.1.0" edition = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.71.1" authors = { workspace = true } homepage = { workspace = true } repository = { workspace = true } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 8654b4b92734..26cb05fae1c2 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -25,7 +25,7 @@ authors = { workspace = true } license = { workspace = true } edition = { workspace = true } publish = false -rust-version = { workspace = true } +rust-version = "1.75.0" [lib] crate-type = ["lib", "cdylib"] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 03d08df30959..4ead95fcb912 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -25,7 +25,7 @@ authors = ["Apache Arrow "] license = "Apache-2.0" keywords = [ "arrow" ] edition = "2021" -rust-version = "1.62" +rust-version = "1.70" publish = false [lib] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 1e1f9fbde0e4..d1bcf046b7ca 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -26,7 +26,7 @@ license = { workspace = true } keywords = { workspace = true } include = { workspace = true } edition = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.64" [lib] name = "arrow_schema" diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 
8860cd61c5b3..a1c9c0ab2113 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -31,7 +31,7 @@ include = [ "Cargo.toml", ] edition = { workspace = true } -rust-version = "1.70.0" +rust-version = { workspace = true } [lib] name = "arrow" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 19f890710778..e4085472ea20 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -26,7 +26,7 @@ authors = { workspace = true } keywords = ["arrow", "parquet", "hadoop"] readme = "README.md" edition = { workspace = true } -rust-version = "1.70.0" +rust-version = { workspace = true } [target.'cfg(target_arch = "wasm32")'.dependencies] ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } From f18dadd7093cbed66ee42738d6564950168d3fe3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 8 Jan 2025 09:02:23 -0500 Subject: [PATCH 022/503] Document the `ParquetRecordBatchStream` buffering (#6947) * Document the ParquetRecordBatchStream buffering * Update parquet/src/arrow/async_reader/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/src/arrow/async_reader/mod.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 4f3befe42662..5323251b07e7 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -611,11 +611,23 @@ impl std::fmt::Debug for StreamState { } } -/// An asynchronous [`Stream`](https://docs.rs/futures/latest/futures/stream/trait.Stream.html) of [`RecordBatch`] -/// for a parquet file that can be constructed using [`ParquetRecordBatchStreamBuilder`]. +/// An asynchronous [`Stream`]of [`RecordBatch`] constructed using [`ParquetRecordBatchStreamBuilder`] to read parquet files. /// /// `ParquetRecordBatchStream` also provides [`ParquetRecordBatchStream::next_row_group`] for fetching row groups, /// allowing users to decode record batches separately from I/O. +/// +/// # I/O Buffering +/// +/// `ParquetRecordBatchStream` buffers *all* data pages selected after predicates +/// (projection + filtering, etc) and decodes the rows from those buffered pages. +/// +/// For example, if all rows and columns are selected, the entire row group is +/// buffered in memory during decode. 
This minimizes the number of IO operations +/// required, which is especially important for object stores, where IO operations +/// have latencies in the hundreds of milliseconds +/// +/// +/// [`Stream`]: https://docs.rs/futures/latest/futures/stream/trait.Stream.html pub struct ParquetRecordBatchStream { metadata: Arc, From 74499c0e7846cfbc498bf9fd7a2c1a4c8731c897 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 8 Jan 2025 06:06:13 -0800 Subject: [PATCH 023/503] Return `BoxStream` with `'static` lifetime from `ObjectStore::list` (#6619) Co-authored-by: Andrew Lamb --- object_store/src/aws/client.rs | 2 +- object_store/src/aws/mod.rs | 4 +-- object_store/src/azure/client.rs | 2 +- object_store/src/azure/mod.rs | 7 ++-- object_store/src/chunked.rs | 4 +-- object_store/src/client/list.rs | 19 +++++----- object_store/src/client/pagination.rs | 50 ++++++++++++++++----------- object_store/src/gcp/client.rs | 2 +- object_store/src/gcp/mod.rs | 4 +-- object_store/src/http/mod.rs | 15 ++++---- object_store/src/lib.rs | 8 ++--- object_store/src/limit.rs | 14 ++++---- object_store/src/local.rs | 2 +- object_store/src/memory.rs | 2 +- object_store/src/prefix.rs | 32 ++++++++++++++--- object_store/src/throttle.rs | 16 +++++---- object_store/tests/get_range_file.rs | 2 +- 17 files changed, 113 insertions(+), 72 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index b81be0c0efad..246f2779dd07 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -855,7 +855,7 @@ impl GetClient for S3Client { } #[async_trait] -impl ListClient for S3Client { +impl ListClient for Arc { /// Make an S3 List request async fn list_request( &self, diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 7f449c49963c..82ef909de984 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -273,7 +273,7 @@ impl ObjectStore for AmazonS3 { .boxed() } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.client.list(prefix) } @@ -281,7 +281,7 @@ impl ObjectStore for AmazonS3 { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { if self.client.config.is_s3_express() { let offset = offset.clone(); // S3 Express does not support start-after diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index bd72d0c6aee1..fa5412c455fc 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -925,7 +925,7 @@ impl GetClient for AzureClient { } #[async_trait] -impl ListClient for AzureClient { +impl ListClient for Arc { /// Make an Azure List request async fn list_request( &self, diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 81b6667bc058..ea4dd8f567a9 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -119,6 +119,9 @@ impl ObjectStore for MicrosoftAzure { self.client.delete_request(location, &()).await } + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { + self.client.list(prefix) + } fn delete_stream<'a>( &'a self, locations: BoxStream<'a, Result>, @@ -139,10 +142,6 @@ impl ObjectStore for MicrosoftAzure { .boxed() } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { - self.client.list(prefix) - } - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.client.list_with_delimiter(prefix).await } diff --git 
a/object_store/src/chunked.rs b/object_store/src/chunked.rs index 3f83c1336dc4..4998e9f2a04d 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -150,7 +150,7 @@ impl ObjectStore for ChunkedStore { self.inner.delete(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.inner.list(prefix) } @@ -158,7 +158,7 @@ impl ObjectStore for ChunkedStore { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.inner.list_with_offset(prefix, offset) } diff --git a/object_store/src/client/list.rs b/object_store/src/client/list.rs index 4445d0d17533..fe9bfebf768d 100644 --- a/object_store/src/client/list.rs +++ b/object_store/src/client/list.rs @@ -44,37 +44,38 @@ pub(crate) trait ListClientExt { prefix: Option<&Path>, delimiter: bool, offset: Option<&Path>, - ) -> BoxStream<'_, Result>; + ) -> BoxStream<'static, Result>; - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result>; #[allow(unused)] fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result>; + ) -> BoxStream<'static, Result>; async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; } #[async_trait] -impl ListClientExt for T { +impl ListClientExt for T { fn list_paginated( &self, prefix: Option<&Path>, delimiter: bool, offset: Option<&Path>, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let offset = offset.map(|x| x.to_string()); let prefix = prefix .filter(|x| !x.as_ref().is_empty()) .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)); stream_paginated( + self.clone(), (prefix, offset), - move |(prefix, offset), token| async move { - let (r, next_token) = self + move |client, (prefix, offset), token| async move { + let (r, next_token) = client .list_request( prefix.as_deref(), delimiter, @@ -88,7 +89,7 @@ impl ListClientExt for T { .boxed() } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.list_paginated(prefix, false, None) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() @@ -99,7 +100,7 @@ impl ListClientExt for T { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.list_paginated(prefix, false, Some(offset)) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() diff --git a/object_store/src/client/pagination.rs b/object_store/src/client/pagination.rs index 77b2a3d8e2f2..d789c7431d8c 100644 --- a/object_store/src/client/pagination.rs +++ b/object_store/src/client/pagination.rs @@ -35,9 +35,14 @@ use std::future::Future; /// finish, otherwise it will continue to call `op(state, token)` with the values returned by the /// previous call to `op`, until a continuation token of `None` is returned /// -pub(crate) fn stream_paginated(state: S, op: F) -> impl Stream> +pub(crate) fn stream_paginated( + client: C, + state: S, + op: F, +) -> impl Stream> where - F: Fn(S, Option) -> Fut + Copy, + C: Clone, + F: Fn(C, S, Option) -> Fut + Copy, Fut: Future)>>, { enum PaginationState { @@ -46,27 +51,30 @@ where Done, } - futures::stream::unfold(PaginationState::Start(state), move |state| async move { - let (s, page_token) = match state { - PaginationState::Start(s) => (s, None), - 
PaginationState::HasMore(s, page_token) if !page_token.is_empty() => { - (s, Some(page_token)) - } - _ => { - return None; - } - }; + futures::stream::unfold(PaginationState::Start(state), move |state| { + let client = client.clone(); + async move { + let (s, page_token) = match state { + PaginationState::Start(s) => (s, None), + PaginationState::HasMore(s, page_token) if !page_token.is_empty() => { + (s, Some(page_token)) + } + _ => { + return None; + } + }; - let (resp, s, continuation) = match op(s, page_token).await { - Ok(resp) => resp, - Err(e) => return Some((Err(e), PaginationState::Done)), - }; + let (resp, s, continuation) = match op(client, s, page_token).await { + Ok(resp) => resp, + Err(e) => return Some((Err(e), PaginationState::Done)), + }; - let next_state = match continuation { - Some(token) => PaginationState::HasMore(s, token), - None => PaginationState::Done, - }; + let next_state = match continuation { + Some(token) => PaginationState::HasMore(s, token), + None => PaginationState::Done, + }; - Some((Ok(resp), next_state)) + Some((Ok(resp), next_state)) + } }) } diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs index d6f89ca71740..8dd1c69802a8 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -633,7 +633,7 @@ impl GetClient for GoogleCloudStorageClient { } #[async_trait] -impl ListClient for GoogleCloudStorageClient { +impl ListClient for Arc { /// Perform a list request async fn list_request( &self, diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 5199135ba6b0..a2f512415a8d 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -183,7 +183,7 @@ impl ObjectStore for GoogleCloudStorage { self.client.delete_request(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.client.list(prefix) } @@ -191,7 +191,7 @@ impl ObjectStore for GoogleCloudStorage { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.client.list_with_offset(prefix, offset) } diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 417f72856722..899740d36db9 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -31,6 +31,8 @@ //! [rfc2518]: https://datatracker.ietf.org/doc/html/rfc2518 //! 
[WebDAV]: https://en.wikipedia.org/wiki/WebDAV +use std::sync::Arc; + use async_trait::async_trait; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; @@ -79,7 +81,7 @@ impl From for crate::Error { /// See [`crate::http`] for more information #[derive(Debug)] pub struct HttpStore { - client: Client, + client: Arc, } impl std::fmt::Display for HttpStore { @@ -130,19 +132,20 @@ impl ObjectStore for HttpStore { self.client.delete(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let prefix_len = prefix.map(|p| p.as_ref().len()).unwrap_or_default(); let prefix = prefix.cloned(); + let client = Arc::clone(&self.client); futures::stream::once(async move { - let status = self.client.list(prefix.as_ref(), "infinity").await?; + let status = client.list(prefix.as_ref(), "infinity").await?; let iter = status .response .into_iter() .filter(|r| !r.is_dir()) - .map(|response| { + .map(move |response| { response.check_ok()?; - response.object_meta(self.client.base_url()) + response.object_meta(client.base_url()) }) // Filter out exact prefix matches .filter_ok(move |r| r.location.as_ref().len() > prefix_len); @@ -238,7 +241,7 @@ impl HttpBuilder { let parsed = Url::parse(&url).map_err(|source| Error::UnableToParseUrl { url, source })?; Ok(HttpStore { - client: Client::new(parsed, self.client_options, self.retry_config)?, + client: Arc::new(Client::new(parsed, self.client_options, self.retry_config)?), }) } } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 987ffacc6e49..53eda5a82fd5 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -722,7 +722,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// `foo/bar_baz/x`. List is recursive, i.e. `foo/bar/more/x` will be included. /// /// Note: the order of returned [`ObjectMeta`] is not guaranteed - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result>; /// List all the objects with the given prefix and a location greater than `offset` /// @@ -734,7 +734,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let offset = offset.clone(); self.list(prefix) .try_filter(move |f| futures::future::ready(f.location > offset)) @@ -847,7 +847,7 @@ macro_rules! as_ref_impl { self.as_ref().delete_stream(locations) } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.as_ref().list(prefix) } @@ -855,7 +855,7 @@ macro_rules! 
as_ref_impl { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.as_ref().list_with_offset(prefix, offset) } diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index 6a3c3b574e62..77f72a0e11a1 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -45,7 +45,7 @@ use tokio::sync::{OwnedSemaphorePermit, Semaphore}; /// #[derive(Debug)] pub struct LimitStore { - inner: T, + inner: Arc, max_requests: usize, semaphore: Arc, } @@ -56,7 +56,7 @@ impl LimitStore { /// `max_requests` pub fn new(inner: T, max_requests: usize) -> Self { Self { - inner, + inner: Arc::new(inner), max_requests, semaphore: Arc::new(Semaphore::new(max_requests)), } @@ -144,12 +144,13 @@ impl ObjectStore for LimitStore { self.inner.delete_stream(locations) } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let prefix = prefix.cloned(); + let inner = Arc::clone(&self.inner); let fut = Arc::clone(&self.semaphore) .acquire_owned() .map(move |permit| { - let s = self.inner.list(prefix.as_ref()); + let s = inner.list(prefix.as_ref()); PermitWrapper::new(s, permit.unwrap()) }); fut.into_stream().flatten().boxed() @@ -159,13 +160,14 @@ impl ObjectStore for LimitStore { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let prefix = prefix.cloned(); let offset = offset.clone(); + let inner = Arc::clone(&self.inner); let fut = Arc::clone(&self.semaphore) .acquire_owned() .map(move |permit| { - let s = self.inner.list_with_offset(prefix.as_ref(), &offset); + let s = inner.list_with_offset(prefix.as_ref(), &offset); PermitWrapper::new(s, permit.unwrap()) }); fut.into_stream().flatten().boxed() diff --git a/object_store/src/local.rs b/object_store/src/local.rs index b193481ae7b8..364026459a03 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -488,7 +488,7 @@ impl ObjectStore for LocalFileSystem { .await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let config = Arc::clone(&self.config); let root_path = match prefix { diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 3f3cff3390db..6402f924346f 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -297,7 +297,7 @@ impl ObjectStore for InMemory { Ok(()) } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let root = Path::default(); let prefix = prefix.unwrap_or(&root); diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index 227887d78fd7..a0b67ca4b58e 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -74,6 +74,28 @@ impl PrefixStore { } } +// Note: This is a relative hack to move these two functions to pure functions so they don't rely +// on the `self` lifetime. Expected to be cleaned up before merge. 
+// +/// Strip the constant prefix from a given path +fn strip_prefix(prefix: &Path, path: Path) -> Path { + // Note cannot use match because of borrow checker + if let Some(suffix) = path.prefix_match(prefix) { + return suffix.collect(); + } + path +} + +/// Strip the constant prefix from a given ObjectMeta +fn strip_meta(prefix: &Path, meta: ObjectMeta) -> ObjectMeta { + ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: strip_prefix(prefix, meta.location), + e_tag: meta.e_tag, + version: None, + } +} #[async_trait::async_trait] impl ObjectStore for PrefixStore { async fn put(&self, location: &Path, payload: PutPayload) -> Result { @@ -136,21 +158,23 @@ impl ObjectStore for PrefixStore { self.inner.delete(&full_path).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let prefix = self.full_path(prefix.unwrap_or(&Path::default())); let s = self.inner.list(Some(&prefix)); - s.map_ok(|meta| self.strip_meta(meta)).boxed() + let slf_prefix = self.prefix.clone(); + s.map_ok(move |meta| strip_meta(&slf_prefix, meta)).boxed() } fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let offset = self.full_path(offset); let prefix = self.full_path(prefix.unwrap_or(&Path::default())); let s = self.inner.list_with_offset(Some(&prefix), &offset); - s.map_ok(|meta| self.strip_meta(meta)).boxed() + let slf_prefix = self.prefix.clone(); + s.map_ok(move |meta| strip_meta(&slf_prefix, meta)).boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index b9dff5c6d1d2..29cd32705ccc 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -237,11 +237,13 @@ impl ObjectStore for ThrottledStore { self.inner.delete(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let stream = self.inner.list(prefix); + let config = Arc::clone(&self.config); futures::stream::once(async move { - let wait_list_per_entry = self.config().wait_list_per_entry; - sleep(self.config().wait_list_per_call).await; + let config = *config.lock(); + let wait_list_per_entry = config.wait_list_per_entry; + sleep(config.wait_list_per_call).await; throttle_stream(stream, move |_| wait_list_per_entry) }) .flatten() @@ -252,11 +254,13 @@ impl ObjectStore for ThrottledStore { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let stream = self.inner.list_with_offset(prefix, offset); + let config = Arc::clone(&self.config); futures::stream::once(async move { - let wait_list_per_entry = self.config().wait_list_per_entry; - sleep(self.config().wait_list_per_call).await; + let config = *config.lock(); + let wait_list_per_entry = config.wait_list_per_entry; + sleep(config.wait_list_per_call).await; throttle_stream(stream, move |_| wait_list_per_entry) }) .flatten() diff --git a/object_store/tests/get_range_file.rs b/object_store/tests/get_range_file.rs index c5550ac21728..e500fc8ac87d 100644 --- a/object_store/tests/get_range_file.rs +++ b/object_store/tests/get_range_file.rs @@ -62,7 +62,7 @@ impl ObjectStore for MyStore { todo!() } - fn list(&self, _: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, _: Option<&Path>) -> BoxStream<'static, Result> { todo!() } 
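
The `'static` bound introduced above means the listing stream no longer borrows the store, so it can be handed off to a separately spawned task. The following minimal sketch is illustrative only and not part of the patch; it assumes the post-change `ObjectStore::list` signature, the built-in `InMemory` store, and a tokio runtime with the `macros` and `rt-multi-thread` features.

use std::sync::Arc;

use futures::TryStreamExt;
use object_store::{memory::InMemory, ObjectStore};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());

    // `list` now returns a BoxStream<'static, _>, so the stream does not
    // borrow `store` and can be moved into a spawned task on its own.
    let stream = store.list(None);

    let handle = tokio::spawn(async move {
        // Collect the metadata of every object under the root prefix.
        stream.try_collect::<Vec<_>>().await
    });

    let objects = handle.await.expect("join error")?;
    println!("listed {} objects", objects.len());
    Ok(())
}

With the previous `BoxStream<'_, _>` signature the stream borrowed the store, so a caller wanting a detached task had to move the `Arc<dyn ObjectStore>` into the task and call `list` from inside it; the `'static` return type removes that restriction.
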
From a47d9967be152b246e9fde1fe12ee512bcd5a856 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Wed, 8 Jan 2025 09:28:05 -0500 Subject: [PATCH 024/503] [Parquet] Reuse buffer in `ByteViewArrayDecoderPlain` (#6930) * reuse buffer in view array * Update parquet/src/arrow/array_reader/byte_view_array.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * use From instead --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- .../src/arrow/array_reader/byte_view_array.rs | 38 ++++++++++++++++--- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index 92a8b0592d0d..0e16642940d2 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ -290,7 +290,7 @@ impl ByteViewArrayDecoder { /// Decoder from [`Encoding::PLAIN`] data to [`ViewBuffer`] pub struct ByteViewArrayDecoderPlain { - buf: Bytes, + buf: Buffer, offset: usize, validate_utf8: bool, @@ -308,7 +308,7 @@ impl ByteViewArrayDecoderPlain { validate_utf8: bool, ) -> Self { Self { - buf, + buf: Buffer::from(buf), offset: 0, max_remaining_values: num_values.unwrap_or(num_levels), validate_utf8, @@ -316,9 +316,15 @@ impl ByteViewArrayDecoderPlain { } pub fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result { - // Zero copy convert `bytes::Bytes` into `arrow_buffer::Buffer` - let buf = arrow_buffer::Buffer::from(self.buf.clone()); - let block_id = output.append_block(buf); + // avoid creating a new buffer if the last buffer is the same as the current buffer + // This is especially useful when row-level filtering is applied, where we call lots of small `read` over the same buffer. 
+ let block_id = { + if output.buffers.last().is_some_and(|x| x.ptr_eq(&self.buf)) { + output.buffers.len() as u32 - 1 + } else { + output.append_block(self.buf.clone()) + } + }; let to_read = len.min(self.max_remaining_values); @@ -690,12 +696,13 @@ mod tests { use crate::{ arrow::{ - array_reader::test_util::{byte_array_all_encodings, utf8_column}, + array_reader::test_util::{byte_array_all_encodings, encode_byte_array, utf8_column}, buffer::view_buffer::ViewBuffer, record_reader::buffer::ValuesBuffer, }, basic::Encoding, column::reader::decoder::ColumnValueDecoder, + data_type::ByteArray, }; use super::*; @@ -746,4 +753,23 @@ mod tests { ); } } + + #[test] + fn test_byte_view_array_plain_decoder_reuse_buffer() { + let byte_array = vec!["hello", "world", "large payload over 12 bytes", "b"]; + let byte_array: Vec = byte_array.into_iter().map(|x| x.into()).collect(); + let pages = encode_byte_array(Encoding::PLAIN, &byte_array); + + let column_desc = utf8_column(); + let mut decoder = ByteViewArrayColumnValueDecoder::new(&column_desc); + + let mut view_buffer = ViewBuffer::default(); + decoder.set_data(Encoding::PLAIN, pages, 4, None).unwrap(); + decoder.read(&mut view_buffer, 1).unwrap(); + decoder.read(&mut view_buffer, 1).unwrap(); + assert_eq!(view_buffer.buffers.len(), 1); + + decoder.read(&mut view_buffer, 1).unwrap(); + assert_eq!(view_buffer.buffers.len(), 1); + } } From 485dbb1e0f692c7bfa47376d477bb62e7d801a8c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 8 Jan 2025 09:54:39 -0800 Subject: [PATCH 025/503] regenerate arrow-ipc/src/gen with patched flatbuffers (#6426) * regenerate arrow-ipc/src/gen with patched flatbuffers * use git repo instead of local path * add backticks * expand allowed overage to accommodate more alignment padding * re-enable nanoarrow integration test * add assertions that struct alignment is correct * remove struct alignment assertions * apply a patch to generated code rather than requiring patched flatc * point to google/flatbuffers with pub PushAlignment * add license header to gen.patch * use flatbuffers 24.12.23 * remove unnecessary gen.patch --- .github/workflows/integration.yml | 3 +- arrow-flight/src/encode.rs | 14 +- arrow-ipc/Cargo.toml | 2 +- arrow-ipc/regen.sh | 90 +++---- arrow-ipc/src/gen/File.rs | 26 +- arrow-ipc/src/gen/Message.rs | 66 ++--- arrow-ipc/src/gen/Schema.rs | 397 +++++++++++++++--------------- arrow-ipc/src/gen/SparseTensor.rs | 182 +++++++++++--- arrow-ipc/src/gen/Tensor.rs | 150 +++++++++-- arrow-ipc/src/lib.rs | 11 + 10 files changed, 609 insertions(+), 332 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 9b23b1b5ad2e..a47195d1becf 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -65,8 +65,7 @@ jobs: ARROW_INTEGRATION_JAVA: ON ARROW_INTEGRATION_JS: ON ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS: "rust" - # Disable nanoarrow integration, due to https://github.com/apache/arrow-rs/issues/5052 - ARCHERY_INTEGRATION_WITH_NANOARROW: "0" + ARCHERY_INTEGRATION_WITH_NANOARROW: "1" # https://github.com/apache/arrow/pull/38403/files#r1371281630 ARCHERY_INTEGRATION_WITH_RUST: "1" # These are necessary because the github runner overrides $HOME diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 19fe42474405..57ac9f3173fe 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -1708,7 +1708,7 @@ mod tests { ]) .unwrap(); - verify_encoded_split(batch, 112).await; + verify_encoded_split(batch, 
120).await; } #[tokio::test] @@ -1719,7 +1719,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 4304).await; + verify_encoded_split(batch, 4312).await; } #[tokio::test] @@ -1755,7 +1755,7 @@ mod tests { // 5k over limit (which is 2x larger than limit of 5k) // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 5800).await; + verify_encoded_split(batch, 5808).await; } #[tokio::test] @@ -1771,7 +1771,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - verify_encoded_split(batch, 48).await; + verify_encoded_split(batch, 56).await; } #[tokio::test] @@ -1785,7 +1785,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 3328).await; + verify_encoded_split(batch, 3336).await; } #[tokio::test] @@ -1799,7 +1799,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 5280).await; + verify_encoded_split(batch, 5288).await; } #[tokio::test] @@ -1824,7 +1824,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 4128).await; + verify_encoded_split(batch, 4136).await; } /// Return size, in memory of flight data diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index cf91b3a3415f..4988eed4a5ed 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -38,7 +38,7 @@ arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } -flatbuffers = { version = "24.3.25", default-features = false } +flatbuffers = { version = "24.12.23", default-features = false } lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"], optional = true } zstd = { version = "0.13.0", default-features = false, optional = true } diff --git a/arrow-ipc/regen.sh b/arrow-ipc/regen.sh index 8d8862ccc7f4..b368bd1bc7cc 100755 --- a/arrow-ipc/regen.sh +++ b/arrow-ipc/regen.sh @@ -21,33 +21,36 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" # Change to the toplevel `arrow-rs` directory pushd $DIR/../ -echo "Build flatc from source ..." - -FB_URL="https://github.com/google/flatbuffers" -FB_DIR="arrow/.flatbuffers" -FLATC="$FB_DIR/bazel-bin/flatc" - -if [ -z $(which bazel) ]; then - echo "bazel is required to build flatc" - exit 1 -fi - -echo "Bazel version: $(bazel version | head -1 | awk -F':' '{print $2}')" - -if [ ! -e $FB_DIR ]; then - echo "git clone $FB_URL ..." - git clone -b master --no-tag --depth 1 $FB_URL $FB_DIR +if [ -z "$FLATC" ]; then + echo "Build flatc from source ..." + + FB_URL="https://github.com/google/flatbuffers" + FB_DIR="arrow/.flatbuffers" + FLATC="$FB_DIR/bazel-bin/flatc" + + if [ -z $(which bazel) ]; then + echo "bazel is required to build flatc" + exit 1 + fi + + echo "Bazel version: $(bazel version | head -1 | awk -F':' '{print $2}')" + + if [ ! -e $FB_DIR ]; then + echo "git clone $FB_URL ..." + git clone -b master --no-tag --depth 1 $FB_URL $FB_DIR + else + echo "git pull $FB_URL ..." + git -C $FB_DIR pull + fi + + pushd $FB_DIR + echo "run: bazel build :flatc ..." + bazel build :flatc + popd else - echo "git pull $FB_URL ..." - git -C $FB_DIR pull + echo "Using flatc $FLATC ..." fi -pushd $FB_DIR -echo "run: bazel build :flatc ..." 
-bazel build :flatc -popd - - # Execute the code generation: $FLATC --filename-suffix "" --rust -o arrow-ipc/src/gen/ format/*.fbs @@ -99,37 +102,38 @@ for f in `ls *.rs`; do fi echo "Modifying: $f" - sed -i '' '/extern crate flatbuffers;/d' $f - sed -i '' '/use self::flatbuffers::EndianScalar;/d' $f - sed -i '' '/\#\[allow(unused_imports, dead_code)\]/d' $f - sed -i '' '/pub mod org {/d' $f - sed -i '' '/pub mod apache {/d' $f - sed -i '' '/pub mod arrow {/d' $f - sed -i '' '/pub mod flatbuf {/d' $f - sed -i '' '/} \/\/ pub mod flatbuf/d' $f - sed -i '' '/} \/\/ pub mod arrow/d' $f - sed -i '' '/} \/\/ pub mod apache/d' $f - sed -i '' '/} \/\/ pub mod org/d' $f - sed -i '' '/use core::mem;/d' $f - sed -i '' '/use core::cmp::Ordering;/d' $f - sed -i '' '/use self::flatbuffers::{EndianScalar, Follow};/d' $f + sed --in-place='' '/extern crate flatbuffers;/d' $f + sed --in-place='' '/use self::flatbuffers::EndianScalar;/d' $f + sed --in-place='' '/\#\[allow(unused_imports, dead_code)\]/d' $f + sed --in-place='' '/pub mod org {/d' $f + sed --in-place='' '/pub mod apache {/d' $f + sed --in-place='' '/pub mod arrow {/d' $f + sed --in-place='' '/pub mod flatbuf {/d' $f + sed --in-place='' '/} \/\/ pub mod flatbuf/d' $f + sed --in-place='' '/} \/\/ pub mod arrow/d' $f + sed --in-place='' '/} \/\/ pub mod apache/d' $f + sed --in-place='' '/} \/\/ pub mod org/d' $f + sed --in-place='' '/use core::mem;/d' $f + sed --in-place='' '/use core::cmp::Ordering;/d' $f + sed --in-place='' '/use self::flatbuffers::{EndianScalar, Follow};/d' $f # required by flatc 1.12.0+ - sed -i '' "/\#\!\[allow(unused_imports, dead_code)\]/d" $f + sed --in-place='' "/\#\!\[allow(unused_imports, dead_code)\]/d" $f for name in ${names[@]}; do - sed -i '' "/use crate::${name}::\*;/d" $f - sed -i '' "s/use self::flatbuffers::Verifiable;/use flatbuffers::Verifiable;/g" $f + sed --in-place='' "/use crate::${name}::\*;/d" $f + sed --in-place='' "s/use self::flatbuffers::Verifiable;/use flatbuffers::Verifiable;/g" $f done # Replace all occurrences of "type__" with "type_", "TYPE__" with "TYPE_". 
- sed -i '' 's/type__/type_/g' $f - sed -i '' 's/TYPE__/TYPE_/g' $f + sed --in-place='' 's/type__/type_/g' $f + sed --in-place='' 's/TYPE__/TYPE_/g' $f # Some files need prefixes if [[ $f == "File.rs" ]]; then # Now prefix the file with the static contents echo -e "${PREFIX}" "${SCHEMA_IMPORT}" | cat - $f > temp && mv temp $f elif [[ $f == "Message.rs" ]]; then + sed --in-place='' 's/List/\`List\`/g' $f echo -e "${PREFIX}" "${SCHEMA_IMPORT}" "${SPARSE_TENSOR_IMPORT}" "${TENSOR_IMPORT}" | cat - $f > temp && mv temp $f elif [[ $f == "SparseTensor.rs" ]]; then echo -e "${PREFIX}" "${SCHEMA_IMPORT}" "${TENSOR_IMPORT}" | cat - $f > temp && mv temp $f diff --git a/arrow-ipc/src/gen/File.rs b/arrow-ipc/src/gen/File.rs index c0c2fb183237..427cf75de096 100644 --- a/arrow-ipc/src/gen/File.rs +++ b/arrow-ipc/src/gen/File.rs @@ -23,6 +23,8 @@ use flatbuffers::EndianScalar; use std::{cmp::Ordering, mem}; // automatically generated by the FlatBuffers compiler, do not modify +// @generated + // struct Block, aligned to 8 #[repr(transparent)] #[derive(Clone, Copy, PartialEq)] @@ -64,6 +66,10 @@ impl<'b> flatbuffers::Push for Block { let src = ::core::slice::from_raw_parts(self as *const Block as *const u8, Self::size()); dst.copy_from_slice(src); } + #[inline] + fn alignment() -> flatbuffers::PushAlignment { + flatbuffers::PushAlignment::new(8) + } } impl<'a> flatbuffers::Verifiable for Block { @@ -211,8 +217,8 @@ impl<'a> Footer<'a> { Footer { _tab: table } } #[allow(unused_mut)] - pub fn create<'bldr: 'args, 'args: 'mut_bldr, 'mut_bldr>( - _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr>, + pub fn create<'bldr: 'args, 'args: 'mut_bldr, 'mut_bldr, A: flatbuffers::Allocator + 'bldr>( + _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr, A>, args: &'args FooterArgs<'args>, ) -> flatbuffers::WIPOffset> { let mut builder = FooterBuilder::new(_fbb); @@ -344,11 +350,11 @@ impl<'a> Default for FooterArgs<'a> { } } -pub struct FooterBuilder<'a: 'b, 'b> { - fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, +pub struct FooterBuilder<'a: 'b, 'b, A: flatbuffers::Allocator + 'a> { + fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a, A>, start_: flatbuffers::WIPOffset, } -impl<'a: 'b, 'b> FooterBuilder<'a, 'b> { +impl<'a: 'b, 'b, A: flatbuffers::Allocator + 'a> FooterBuilder<'a, 'b, A> { #[inline] pub fn add_version(&mut self, version: MetadataVersion) { self.fbb_ @@ -388,7 +394,7 @@ impl<'a: 'b, 'b> FooterBuilder<'a, 'b> { ); } #[inline] - pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> FooterBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a, A>) -> FooterBuilder<'a, 'b, A> { let start = _fbb.start_table(); FooterBuilder { fbb_: _fbb, @@ -474,16 +480,16 @@ pub unsafe fn size_prefixed_root_as_footer_unchecked(buf: &[u8]) -> Footer { flatbuffers::size_prefixed_root_unchecked::