3#include <sparrow/record_batch.hpp>
48 template <writable_stream TStream>
49 serializer(TStream& stream, std::optional<CompressionType> compression = std::nullopt)
51 , m_compression(compression)
69 void write(
const sparrow::record_batch& rb);
86 template <std::ranges::input_range R>
87 requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
88 void write(
const R& record_batches)
91 if (std::ranges::empty(record_batches))
98 throw std::runtime_error(
"Cannot append to a serializer that has been ended");
104 const auto reserve_function = [&record_batches, &compressed_buffers_cache,
this]()
112 compressed_buffers_cache
116 m_stream.reserve(reserve_function);
118 if (!m_schema_received)
120 m_schema_received =
true;
121 m_dtypes = get_column_dtypes(*record_batches.begin());
125 for (
const auto& rb : record_batches)
127 if (get_column_dtypes(rb) != m_dtypes)
129 throw std::invalid_argument(
"Record batch schema does not match serializer schema");
140 compressed_buffers_cache
188 template <std::ranges::input_range R>
189 requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
192 write(record_batches);
229 static std::vector<sparrow::data_type> get_column_dtypes(
const sparrow::record_batch& rb);
231 bool m_schema_received{
false};
232 std::vector<sparrow::data_type> m_dtypes;
235 std::optional<CompressionType> m_compression;
236 dictionary_tracker m_dict_tracker;
Type-erased wrapper for any stream-like object.
A class for serializing Apache Arrow record batches to an output stream.
void write(const sparrow::record_batch &rb)
Writes a single record batch to the serializer's output stream.
serializer(TStream &stream, std::optional< CompressionType > compression=std::nullopt)
Constructs a serializer object with a reference to a stream.
void write(const R &record_batches)
Writes a collection of record batches to the stream.
void end()
Finalizes the serialization process by writing end-of-stream marker.
serializer & operator<<(const sparrow::record_batch &rb)
serializer & operator<<(const R &record_batches)
~serializer()
Destructor for the serializer.
serializer & operator<<(serializer &(*manip)(serializer &))
std::size_t calculate_serializer_reserve_size(const R &record_batches, std::size_t current_stream_size, bool schema_received, std::optional< CompressionType > compression, const dictionary_tracker &dict_tracker, std::optional< std::reference_wrapper< CompressionCache > > cache=std::nullopt)
void for_each_pending_dictionary(const sparrow::record_batch &record_batch, dictionary_tracker &tracker, Func visitor)
SPARROW_IPC_API serialized_record_batch_info serialize_record_batch(const sparrow::record_batch &record_batch, any_output_stream &stream, std::optional< CompressionType > compression, std::optional< std::reference_wrapper< CompressionCache > > cache)
Serializes a record batch into a binary format following the Arrow IPC specification.
SPARROW_IPC_API void serialize_schema_message(const sparrow::record_batch &record_batch, any_output_stream &stream)
Serializes a schema message for a record batch into the given output stream.
SPARROW_IPC_API serialized_record_batch_info serialize_dictionary_batch(int64_t dictionary_id, const sparrow::record_batch &record_batch, bool is_delta, any_output_stream &stream, std::optional< CompressionType > compression, std::optional< std::reference_wrapper< CompressionCache > > cache)
Serializes a dictionary batch into a binary format following the Arrow IPC specification.
serializer & end_stream(serializer &serializer)
Information about a dictionary used for encoding.
bool is_delta
Whether this is a delta update.
sparrow::record_batch data
Dictionary values as a single-column record batch.
int64_t id
Dictionary identifier.