7 #include <sparrow/record_batch.hpp>
42 const sparrow::record_batch& record_batch,
43 const std::vector<record_batch_block>& dictionary_blocks,
44 const std::vector<record_batch_block>& record_batch_blocks,
106 template <writable_stream TStream>
129 void write(
const sparrow::record_batch& rb);
150 template <std::ranges::input_range R>
151 requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
155 if (std::ranges::empty(record_batches))
162 throw std::runtime_error(
"Cannot write to a file serializer that has been ended");
176 const auto reserve_function = [&record_batches, &compressed_buffers_cache,
this]()
184 compressed_buffers_cache
198 for (
const auto& rb : record_batches)
202 throw std::invalid_argument(
"Record batch schema does not match file serializer schema");
209 throw std::runtime_error(
210 "Arrow file format does not support multiple non-delta dictionary batches "
211 "for the same dictionary id"
215 const int64_t dict_offset =
static_cast<int64_t
>(
m_stream.size());
222 compressed_buffers_cache
226 dict_block_info.metadata_length,
227 dict_block_info.body_length
232 const int64_t offset =
static_cast<int64_t
>(
m_stream.size());
281 template <std::ranges::input_range R>
282 requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
285 write(record_batches);
Type-erased wrapper for any stream-like object.
Tracks dictionaries during serialization.
A class for serializing Apache Arrow record batches to an output stream.
void end()
Finalizes the serialization process by writing the end-of-stream marker.
A class for serializing Apache Arrow record batches to the IPC file format.
std::vector< record_batch_block > m_record_batch_blocks
stream_file_serializer & operator<<(stream_file_serializer &(*manip)(stream_file_serializer &))
std::vector< sparrow::data_type > m_dtypes
std::optional< CompressionType > m_compression
void write(const R &record_batches)
Writes a collection of record batches to the file.
std::optional< sparrow::record_batch > m_first_record_batch
~stream_file_serializer()
Destructor for the stream_file_serializer.
stream_file_serializer & operator<<(const sparrow::record_batch &rb)
void write(const sparrow::record_batch &rb)
Writes a single record batch to the file.
stream_file_serializer(TStream &stream, std::optional< CompressionType > compression=std::nullopt)
Constructs a stream_file_serializer object with a reference to a stream.
void end()
Finalizes the file serialization by writing the footer and trailing magic bytes.
stream_file_serializer & operator<<(const R &record_batches)
dictionary_tracker m_dict_tracker
std::vector< record_batch_block > m_dictionary_blocks
any_output_stream m_stream
std::size_t calculate_serializer_reserve_size(const R &record_batches, std::size_t current_stream_size, bool schema_received, std::optional< CompressionType > compression, const dictionary_tracker &dict_tracker, std::optional< std::reference_wrapper< CompressionCache > > cache=std::nullopt)
void for_each_pending_dictionary(const sparrow::record_batch &record_batch, dictionary_tracker &tracker, Func visitor)
SPARROW_IPC_API serialized_record_batch_info serialize_record_batch(const sparrow::record_batch &record_batch, any_output_stream &stream, std::optional< CompressionType > compression, std::optional< std::reference_wrapper< CompressionCache > > cache)
Serializes a record batch into a binary format following the Arrow IPC specification.
SPARROW_IPC_API size_t write_footer(const sparrow::record_batch &record_batch, const std::vector< record_batch_block > &dictionary_blocks, const std::vector< record_batch_block > &record_batch_blocks, any_output_stream &stream)
Writes the Arrow IPC file footer.
SPARROW_IPC_API void serialize_schema_message(const sparrow::record_batch &record_batch, any_output_stream &stream)
Serializes a schema message for a record batch into a byte buffer.
SPARROW_IPC_API serialized_record_batch_info serialize_dictionary_batch(int64_t dictionary_id, const sparrow::record_batch &record_batch, bool is_delta, any_output_stream &stream, std::optional< CompressionType > compression, std::optional< std::reference_wrapper< CompressionCache > > cache)
Serializes a dictionary batch into a binary format following the Arrow IPC specification.
SPARROW_IPC_API std::vector< sparrow::record_batch > deserialize_file(std::span< const uint8_t > data)
Deserializes Arrow IPC file format into a vector of record batches.
constexpr std::array< std::uint8_t, 8 > arrow_file_header_magic
Magic bytes with padding for the file header (8 bytes total for alignment).
stream_file_serializer & end_file(stream_file_serializer &serializer)
SPARROW_IPC_API std::vector< sparrow::data_type > get_column_dtypes(const sparrow::record_batch &rb)
Information about a dictionary used for encoding.
bool is_delta
Whether this is a delta update.
sparrow::record_batch data
Dictionary values as a single-column record batch.
int64_t id
Dictionary identifier.
Represents a block entry in the Arrow IPC file footer.
int64_t body_length
Length of the record batch body (data buffers).
int32_t metadata_length
Length of the metadata (FlatBuffer message).
int64_t offset
Offset from the start of the file to the record batch message.