sparrow 2.4.0
C++20 idiomatic APIs for the Apache Arrow Columnar Format
Loading...
Searching...
No Matches
run_end_encoded_array.hpp
Go to the documentation of this file.
1// Copyright 2024 Man Group Operations Limited
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#pragma once
16
17#include "sparrow/array_api.hpp"
22
23namespace sparrow
24{
26
// Body of the proxy-reference class whose constructor is named
// run_end_encoded_reference. The class-head line is not present in this
// listing (the embedded numbering jumps from 26 to 28).
28 {
29 public:
30
33
// Builds a reference bound to `array` at position `index`, with the run
// position supplied by the caller.
// NOTE(review): presumably `run_index` is the run that contains `index` --
// confirm against the definition in the .cpp file.
36 run_end_encoded_reference(run_end_encoded_array& array, std::size_t index, std::size_t run_index);
37
40
44
// Returns a bool whose result must not be discarded.
// NOTE(review): presumably true when the referenced element is non-null -- confirm.
45 [[nodiscard]] SPARROW_API bool has_value() const;
// Implicit (non-explicit) conversion to const_reference, so the proxy can be
// read wherever a const_reference is expected.
46 [[nodiscard]] SPARROW_API operator const_reference() const;
47
48 private:
49
// Private helper returning a const_reference; defined out of line.
50 [[nodiscard]] SPARROW_API const_reference current_value() const;
51
// Raw pointer to an array, null by default.
// NOTE(review): presumably non-owning and set from the constructor's `array` -- confirm.
52 run_end_encoded_array* p_array = nullptr;
// Index stored by the reference, zero by default.
53 std::size_t m_index = 0;
// Declared `mutable`, so const member functions are allowed to update it.
// NOTE(review): looks like a cached run position -- confirm.
54 mutable std::size_t m_run_index = 0;
55 };
56}
57
58#include "sparrow/layout/run_end_encoded_iterator.hpp"
59
60namespace sparrow
61{
62
63 namespace detail
64 {
// Explicit specialization inside namespace detail exposing a static constexpr
// get() that returns a sparrow::data_type.
// The struct head (listing line 66) and the return statement (listing line 70)
// are not present in this listing.
// NOTE(review): presumably the data_type metafunction specialized for
// run_end_encoded_array -- confirm against the real header.
65 template <>
67 {
68 [[nodiscard]] static constexpr sparrow::data_type get()
69 {
71 }
72 };
73 }
74
// Compile-time check: true only when T is exactly run_end_encoded_array.
// std::same_as compares the types as written, so cv-qualified or reference
// variants of the type yield false.
78 template <class T>
79 constexpr bool is_run_end_encoded_array_v = std::same_as<T, run_end_encoded_array>;
80
101 {
102 public:
103
105 using size_type = std::size_t;
112 using reverse_iterator = std::reverse_iterator<iterator>;
113 using const_reverse_iterator = std::reverse_iterator<const_iterator>;
114
126
// Generic constructor: perfectly forwards `args` to create_proxy() and
// delegates to another run_end_encoded_array constructor with the result.
// Marked explicit, so it never takes part in implicit conversions.
// Listing line 140 is not present here.
// NOTE(review): presumably a constraint that keeps this template from
// hijacking copy/move construction -- confirm against the real header.
139 template <class... Args>
141 explicit run_end_encoded_array(Args&&... args)
142 : run_end_encoded_array(create_proxy(std::forward<Args>(args)...))
143 {
144 }
145
156
170
173
180 [[nodiscard]] SPARROW_API reference operator[](std::uint64_t i);
181
188 [[nodiscard]] SPARROW_API array_traits::const_reference operator[](std::uint64_t i) const;
189
195 [[nodiscard]] SPARROW_API iterator begin();
196
197
203 [[nodiscard]] SPARROW_API iterator end();
204
210 [[nodiscard]] SPARROW_API const_iterator begin() const;
211
217 [[nodiscard]] SPARROW_API const_iterator end() const;
218
224 [[nodiscard]] SPARROW_API const_iterator cbegin() const;
225
231 [[nodiscard]] SPARROW_API const_iterator cend() const;
232
239
246
253
260
267
274
280 [[nodiscard]] SPARROW_API reference front();
281
288
294 [[nodiscard]] SPARROW_API reference back();
295
302
309
319
// Inserts the elements of [first, last) before `pos`, one logical value at a
// time, and returns an iterator to the logical position where the insertion
// started. For an empty input range nothing is inserted.
// Requires that value_type be constructible from the iterator's value type.
320 template <std::input_iterator InputIt>
321 requires std::constructible_from<value_type, typename std::iterator_traits<InputIt>::value_type>
322 iterator insert(const_iterator pos, InputIt first, InputIt last)
323 {
// Convert the position to an index up front; later steps work with indices.
324 const size_type index = static_cast<size_type>(std::distance(cbegin(), pos));
325 size_type inserted_count = 0;
326 while (first != last)
327 {
// Read the element before advancing so that `first == last` below tells us
// whether this is the final element.
328 value_type current_value(*first);
329 ++first;
// The third argument is the helper's `refresh_state` flag; it is true only
// for the last element, so earlier insertions pass false.
330 insert_logical_value(index + inserted_count, current_value, first == last);
331 ++inserted_count;
332 }
// begin() is taken again after the insertions rather than reusing `pos`.
333 return sparrow::next(begin(), static_cast<std::ptrdiff_t>(index));
334 }
335
// Range overload: forwards to the iterator-pair insert() using the range's
// begin and end. Requires that value_type be constructible from the range's
// value type.
// The signature line (listing line 338) is not present in this listing; the
// body uses the names `pos` and `range`.
// NOTE(review): `range` is forwarded twice, and std::ranges::begin/end do not
// accept rvalues of non-borrowed ranges, so passing a temporary container
// likely fails to compile. Calling begin/end on the lvalue `range` would
// avoid both issues -- verify.
336 template <std::ranges::input_range R>
337 requires std::constructible_from<value_type, std::ranges::range_value_t<R>>
339 {
340 return insert(
341 pos,
342 std::ranges::begin(std::forward<R>(range)),
343 std::ranges::end(std::forward<R>(range))
344 );
345 }
346
353
364
365 SPARROW_API void push_back(const value_type& value);
366
368
369 SPARROW_API void resize(size_type new_length, const value_type& value);
370
372
378 [[nodiscard]] SPARROW_API bool empty() const;
379
385 [[nodiscard]] SPARROW_API size_type size() const;
386
392 [[nodiscard]] std::optional<std::string_view> name() const;
393
399 [[nodiscard]] std::optional<key_value_view> metadata() const;
400
401 private:
402
413 template <input_metadata_container METADATA_RANGE = std::vector<metadata_pair>>
414 [[nodiscard]] static auto create_proxy(
415 array&& acc_lengths,
416 array&& encoded_values,
417 std::optional<std::string_view> name = std::nullopt,
418 std::optional<METADATA_RANGE> metadata = std::nullopt
419 ) -> arrow_proxy;
420
421 using acc_length_ptr_variant_type = std::variant<const std::int16_t*, const std::int32_t*, const std::int64_t*>;
422
431 [[nodiscard]] SPARROW_API static std::pair<std::int64_t, std::int64_t>
432 extract_length_and_null_count(const array& acc_lengths_arr, const array& encoded_values_arr);
433
440 [[nodiscard]] SPARROW_API static acc_length_ptr_variant_type get_acc_lengths_ptr(const array& ar);
441
448 [[nodiscard]] SPARROW_API std::uint64_t get_acc_length(std::uint64_t run_index) const;
449
455 [[nodiscard]] SPARROW_API arrow_proxy& get_arrow_proxy();
456
462 [[nodiscard]] SPARROW_API const arrow_proxy& get_arrow_proxy() const;
463
464 [[nodiscard]] SPARROW_API size_type find_run_index(std::uint64_t logical_index) const;
465
466 [[nodiscard]] SPARROW_API std::uint64_t run_start(size_type run_index) const;
467
468 [[nodiscard]] SPARROW_API std::uint64_t run_end(size_type run_index) const;
469
470 [[nodiscard]] SPARROW_API const_reference encoded_value(size_type run_index) const;
471
472 [[nodiscard]] SPARROW_API bool
473 encoded_values_equal(size_type lhs_run_index, size_type rhs_run_index) const;
474
475 [[nodiscard]] SPARROW_API bool encoded_value_equals(size_type run_index, const value_type& value) const;
476
477 SPARROW_API void insert_encoded_value(size_type run_index, const value_type& value);
478
479 SPARROW_API void erase_encoded_values(size_type run_index, size_type count);
480
481 SPARROW_API void rebind_children_from_proxy();
482
483 SPARROW_API void insert_acc_length(size_type pos, std::uint64_t value);
484
485 SPARROW_API void erase_acc_lengths(size_type pos, size_type count);
486
487 SPARROW_API void set_acc_length(size_type index, std::uint64_t value);
488
489 SPARROW_API void shift_acc_lengths(size_type start_index, std::int64_t delta);
490
491 SPARROW_API void merge_adjacent_runs(size_type left_run_index);
492
493 SPARROW_API void refresh_and_merge_adjacent_runs(std::optional<size_type> merge_candidate);
494
495 SPARROW_API void
496 insert_logical_values(size_type index, const value_type& value, size_type count, bool refresh_state = true);
497
498 SPARROW_API void erase_logical_values(size_type index, size_type count, bool refresh_state = true);
499
500 SPARROW_API void
501 insert_logical_value(size_type index, const value_type& value, bool refresh_state = true);
502
503 SPARROW_API void erase_logical_value(size_type index, bool refresh_state = true);
504
505 SPARROW_API void replace_logical_value(size_type index, const value_type& value);
506
507 SPARROW_API void refresh_cache();
508
509 SPARROW_API void refresh_after_mutation();
510
511 SPARROW_API void finalize_mutation(bool refresh_state);
512
513 SPARROW_API void throw_if_sliced_for_mutation(const char* operation) const;
514
515 [[nodiscard]] SPARROW_API static value_type materialize_value(const const_reference& value);
516
// Arrow proxy held by value by this array.
518 arrow_proxy m_proxy;
// NOTE(review): presumably the number of runs (length of the child arrays) -- confirm.
520 std::uint64_t m_encoded_length;
521
// Both children are stored as `array` objects by value, despite the `p_` prefix.
// NOTE(review): presumably the run-end (accumulated length) child and the
// values child respectively -- confirm.
523 array p_acc_lengths_array;
525 array p_encoded_values_array;
// Variant over const int16/int32/int64 pointers (see acc_length_ptr_variant_type).
527 acc_length_ptr_variant_type m_acc_lengths;
528
// Both iterator instantiations get access to the private members.
529 // friend classes
530 friend class run_encoded_array_iterator<false>;
531 friend class run_encoded_array_iterator<true>;
534 };
535
538
// Builds the arrow_proxy for an array with Arrow format string "+r" from two
// child arrays: `acc_lengths` becomes child 0 and `encoded_values` child 1.
// `name` and `metadata` are moved into the schema. Both input arrays are
// consumed (moved from).
539 template <input_metadata_container METADATA_RANGE>
540 auto run_end_encoded_array::create_proxy(
541 array&& acc_lengths,
542 array&& encoded_values,
543 std::optional<std::string_view> name,
544 std::optional<METADATA_RANGE> metadata
545 ) -> arrow_proxy
546 {
// The parent schema reuses the flags of the encoded-values child. They are
// read here, before `encoded_values` is moved from below.
547 const auto flags = detail::array_access::get_arrow_proxy(encoded_values).flags();
// NOTE(review): the pair is bound as (null_count, length), although the
// helper's name lists length first. Both elements are std::int64_t, so a
// swapped order would compile silently -- confirm against the helper's definition.
548 auto [null_count, length] = extract_length_and_null_count(acc_lengths, encoded_values);
549
// Take the raw ArrowArray / ArrowSchema structures out of both inputs.
550 auto [acc_length_array, acc_length_schema] = extract_arrow_structures(std::move(acc_lengths));
551 auto [encoded_values_array, encoded_values_schema] = extract_arrow_structures(std::move(encoded_values));
552
// The child pointer tables and the children themselves are allocated with raw
// new and handed to make_arrow_schema / make_arrow_array as raw pointers.
// NOTE(review): nothing visible here guards these allocations; if a later
// `new` throws, the earlier ones appear to leak -- confirm whether that matters.
553 constexpr auto n_children = 2;
554 ArrowSchema** child_schemas = new ArrowSchema*[n_children];
555 ArrowArray** child_arrays = new ArrowArray*[n_children];
556
557 child_schemas[0] = new ArrowSchema(std::move(acc_length_schema));
558 child_schemas[1] = new ArrowSchema(std::move(encoded_values_schema));
559
560 child_arrays[0] = new ArrowArray(std::move(acc_length_array));
561 child_arrays[1] = new ArrowArray(std::move(encoded_values_array));
562
// Ownership flag `true` repeated once per child, passed to both factory calls.
// ("ownserhip" is a misspelling of "ownership" in the local's name.)
563 const repeat_view<bool> children_ownserhip{true, n_children};
564
// Listing line 565 is not present here. The argument list below and the use
// of `schema` in the return statement indicate that it opens the
// make_arrow_schema call that initializes `schema`.
566 std::string("+r"),
567 std::move(name), // name
568 std::move(metadata), // metadata
569 flags, // flags,
570 child_schemas, // children
571 children_ownserhip, // children ownership
572 nullptr, // dictionary
573 true // dictionary ownership
574 );
575
// The parent array is given an empty buffer list; its data is carried by the children.
576 std::vector<buffer<std::uint8_t>> arr_buffs = {};
577
578 ArrowArray arr = make_arrow_array(
579 static_cast<std::int64_t>(length), // length
580 static_cast<int64_t>(null_count),
581 0, // offset
582 std::move(arr_buffs),
583 child_arrays, // children
584 children_ownserhip, // children ownership
585 nullptr, // dictionary
586 true // dictionary ownership
587 );
588
589 return arrow_proxy{std::move(arr), std::move(schema)};
590 }
591
592} // namespace sparrow
593
594
595#if defined(__cpp_lib_format)
596
// std::formatter specialization so run_end_encoded_array can be used with
// std::format. It is only compiled when __cpp_lib_format is defined (see the
// enclosing #if).
597template <>
598struct std::formatter<sparrow::run_end_encoded_array>
599{
// Consumes nothing from the format spec and returns its start unchanged, so
// no format specifiers are interpreted.
// NOTE(review): a non-empty spec such as "{:x}" is presumably rejected by the
// formatting library -- confirm.
600 constexpr auto parse(std::format_parse_context& ctx)
601 {
602 return ctx.begin(); // Simple implementation
603 }
604
// Writes `ar` to the context's output iterator and returns that iterator
// type. Declared only; the definition is not in this header.
605 SPARROW_API auto format(const sparrow::run_end_encoded_array& ar, std::format_context& ctx) const
606 -> decltype(ctx.out());
607};
608
// Stream insertion for run_end_encoded_array. Declared only; the definition
// is not in this header. The declaration sits inside the __cpp_lib_format
// guard, so it is unavailable when that macro is not defined.
609namespace sparrow
610{
611 SPARROW_API std::ostream& operator<<(std::ostream& os, const run_end_encoded_array& value);
612}
613
614#endif
Dynamically typed array encapsulating an Arrow layout.
Definition array_api.hpp:50
SPARROW_API std::unordered_set< ArrowFlag > flags() const
Gets the Arrow flags set for this array.
static const sparrow::arrow_proxy & get_arrow_proxy(const ARRAY &array)
A view that repeats a value a given number of times.
SPARROW_API const_iterator end() const
Gets a constant iterator to the end of the array.
SPARROW_API const_reverse_iterator rend() const
Gets a constant reverse iterator to the end of the reversed array.
SPARROW_API void push_back(const value_type &value)
SPARROW_API iterator insert(const_iterator pos, const value_type &value)
Inserts a single logical value before pos.
std::optional< key_value_view > metadata() const
Gets the metadata of the array.
array_traits::const_reference const_reference
std::reverse_iterator< iterator > reverse_iterator
run_encoded_array_iterator< true > const_iterator
array_traits::inner_value_type inner_value_type
SPARROW_API size_type size() const
Gets the number of elements in the array.
SPARROW_API reference front()
Gets a mutable reference to the first element.
SPARROW_API array_traits::const_reference back() const
Gets a constant reference to the last element.
SPARROW_API bool empty() const
Checks if the array is empty.
SPARROW_API const_reverse_iterator rbegin() const
Gets a constant reverse iterator to the beginning of the reversed array.
SPARROW_API iterator end()
Gets an iterator to the end of the array.
SPARROW_API iterator begin()
Gets an iterator to the beginning of the array.
SPARROW_API const_iterator begin() const
Gets a constant iterator to the beginning of the array.
self_type & operator=(self_type &&)=default
SPARROW_API array_traits::const_reference operator[](std::uint64_t i) const
Constant access operator for getting element at index.
SPARROW_API run_end_encoded_array(const self_type &)
Copy constructor.
run_end_encoded_array(self_type &&)=default
SPARROW_API run_end_encoded_array(arrow_proxy proxy)
Constructs run-end encoded array from Arrow proxy.
SPARROW_API iterator erase(const_iterator first, const_iterator last)
Erases the logical range [first, last).
std::optional< std::string_view > name() const
Gets the name of the array.
std::reverse_iterator< const_iterator > const_reverse_iterator
SPARROW_API iterator insert(const_iterator pos, const value_type &value, size_type count)
Inserts count copies of value before pos.
SPARROW_API const_reverse_iterator crbegin() const
Gets a constant reverse iterator to the beginning of the reversed array.
SPARROW_API void pop_back()
SPARROW_API reference operator[](std::uint64_t i)
Mutable access operator for updating the logical value at index.
SPARROW_API reference back()
Gets a mutable reference to the last element.
SPARROW_API reverse_iterator rend()
Gets a reverse iterator to the end of the reversed array.
SPARROW_API void resize(size_type new_length, const value_type &value)
iterator insert(const_iterator pos, InputIt first, InputIt last)
SPARROW_API array_traits::const_reference front() const
Gets a constant reference to the first element.
SPARROW_API const_iterator cend() const
Gets a constant iterator to the end of the array.
SPARROW_API self_type & operator=(const self_type &)
Copy assignment operator.
iterator insert(const_iterator pos, R &&range)
run_end_encoded_array(Args &&... args)
Generic constructor for creating run-end encoded array.
SPARROW_API reverse_iterator rbegin()
Gets a reverse iterator to the beginning of the reversed array.
SPARROW_API const_reverse_iterator crend() const
Gets a constant reverse iterator to the end of the reversed array.
SPARROW_API const_iterator cbegin() const
Gets a constant iterator to the beginning of the array.
SPARROW_API iterator erase(const_iterator pos)
Erases the logical value at pos.
run_encoded_array_iterator< false > iterator
run_end_encoded_reference(run_end_encoded_reference &&) noexcept=default
run_end_encoded_reference(const run_end_encoded_reference &)=default
SPARROW_API run_end_encoded_reference(run_end_encoded_array &array, std::size_t index)
SPARROW_API run_end_encoded_reference(run_end_encoded_array &array, std::size_t index, std::size_t run_index)
SPARROW_API bool has_value() const
array_traits::const_reference const_reference
#define SPARROW_API
Definition config.hpp:38
std::ostream & operator<<(std::ostream &stream, primesum::uint128_t n)
The __int128_t type (GCC/Clang) is not well supported by the C++ standard library (in 2016) so we hav...
Definition int128_t.hpp:48
constexpr bool excludes_copy_and_move_ctor_v
Convenience variable template for excludes_copy_and_move_ctor.
ArrowSchema make_arrow_schema(F format, N name, std::optional< M > metadata, std::optional< std::unordered_set< ArrowFlag > > flags, ArrowSchema **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowSchema *dictionary, bool dictionary_ownership)
Creates an ArrowSchema owned by a unique_ptr and holding the provided data.
SPARROW_API bool operator==(const array &lhs, const array &rhs)
Compares the content of two arrays.
constexpr InputIt next(InputIt it, Distance n)
Definition iterator.hpp:605
constexpr bool is_run_end_encoded_array_v
Checks whether T is a run_end_encoded_array type.
std::pair< ArrowArray, ArrowSchema > extract_arrow_structures(A &&a)
Extracts the internal ArrowArray and ArrowSchema structures from the given array or typed layout.
Definition array.hpp:110
ArrowArray make_arrow_array(int64_t length, int64_t null_count, int64_t offset, B buffers, ArrowArray **children, const CHILDREN_OWNERSHIP &children_ownership, ArrowArray *dictionary, bool dictionary_ownership)
Creates an ArrowArray.
data_type
Runtime identifier of arrow data types, usually associated with raw bytes with the associated value.
Extensions to the C++ standard library.
mpl::rename< mpl::unique< mpl::transform< detail::array_const_reference_t, all_base_types_t > >, nullable_variant > const_reference
mpl::rename< all_base_types_t, std::variant > inner_value_type
mpl::rename< mpl::transform< detail::array_value_type_t, all_base_types_t >, nullable_variant > value_type
Metafunction for retrieving the data_type of a typed array.