sparrow-ipc/stream__file__serializer_8hpp_source.html

#pragma once


#include <concepts>
#include <cstddef>
#include <cstdint>
#include <optional>
#include <ranges>
#include <stdexcept>
#include <vector>

#include <sparrow/record_batch.hpp>

#include "sparrow_ipc/any_output_stream.hpp"
#include "sparrow_ipc/compression.hpp"
#include "sparrow_ipc/config/config.hpp"
#include "sparrow_ipc/dictionary_iteration.hpp"
#include "sparrow_ipc/dictionary_tracker.hpp"
#include "sparrow_ipc/magic_values.hpp"
#include "sparrow_ipc/serialize.hpp"
#include "sparrow_ipc/serialize_utils.hpp"
#include "sparrow_ipc/serializer_reserve.hpp"


namespace sparrow_ipc

{


    /**
     * @brief Represents a block entry in the Arrow IPC file footer.
     *
     * Every member has a default initializer so that a default-constructed block
     * holds zeros instead of indeterminate values. The type stays an aggregate, so
     * `record_batch_block{offset, metadata_length, body_length}` keeps working.
     */
    struct record_batch_block
    {
        /// Offset from the start of the file to the record batch message.
        int64_t offset{0};

        /// Length of the metadata (FlatBuffer message).
        int32_t metadata_length{0};

        /// Length of the record batch body (data buffers).
        int64_t body_length{0};
    };


    /**
     * @brief Writes the Arrow IPC file footer.
     *
     * Declaration only; the definition lives outside this header.
     *
     * @param record_batch A record batch; presumably the one whose schema is stored
     *                     in the footer (TODO confirm against the definition).
     * @param dictionary_blocks Block entries describing the dictionary batches.
     * @param record_batch_blocks Block entries describing the record batches.
     * @param stream Output stream the footer is written to.
     * @return A size value; presumably the number of footer bytes written
     *         (TODO confirm against the definition).
     */
    SPARROW_IPC_API size_t write_footer(

        const sparrow::record_batch& record_batch,

        const std::vector<record_batch_block>& dictionary_blocks,

        const std::vector<record_batch_block>& record_batch_blocks,

        any_output_stream& stream

    );


    /**
     * @brief A class for serializing Apache Arrow record batches to the IPC file format.
     *
     * The file header magic is written once, either by the schema-taking constructor
     * or by the first non-empty range write. The schema message is emitted from the
     * first record batch seen, and every later batch must have the same column data
     * types. For each dictionary batch and record batch written by the range
     * overload of write(), the stream position before the message and the lengths
     * returned by the serialization call are recorded as a record_batch_block.
     *
     * The destructor, the single-batch write() and end() are defined out of line.
     */
    class SPARROW_IPC_API stream_file_serializer
    {
    public:

        /**
         * @brief Constructs a stream_file_serializer object with a reference to a stream.
         *
         * Nothing is written at construction; the header and the schema are emitted
         * by the first non-empty write.
         *
         * @tparam TStream Type satisfying the writable_stream concept.
         * @param stream Destination stream, wrapped in an any_output_stream.
         * @param compression Optional compression applied to serialized batches.
         */
        template <writable_stream TStream>
        stream_file_serializer(TStream& stream, std::optional<CompressionType> compression = std::nullopt)
            : m_stream(stream)
            , m_compression(compression)
        {
        }

        /**
         * @brief Constructs a stream_file_serializer object with a reference to a
         *        stream and a schema.
         *
         * Immediately writes the file header magic followed by padding, then records
         * @p schema_batch as the first record batch, stores its column data types
         * and serializes the schema message. No record batch data is written here.
         *
         * @tparam TStream Type satisfying the writable_stream concept.
         * @param stream Destination stream, wrapped in an any_output_stream.
         * @param schema_batch Record batch from which the schema is taken.
         * @param compression Optional compression applied to serialized batches.
         */
        template <writable_stream TStream>
        stream_file_serializer(
            TStream& stream,
            const sparrow::record_batch& schema_batch,
            std::optional<CompressionType> compression = std::nullopt
        )
            : m_stream(stream)
            , m_compression(compression)
        {
            // Write file header magic
            m_stream.write(arrow_file_header_magic);
            m_stream.add_padding();
            m_header_written = true;

            // Establish schema
            m_schema_received = true;
            m_first_record_batch = schema_batch;
            m_dtypes = get_column_dtypes(schema_batch);
            serialize_schema_message(schema_batch, m_stream);
        }

        /**
         * @brief Destructor for the stream_file_serializer (defined out of line).
         */
        ~stream_file_serializer();

        /**
         * @brief Writes a single record batch to the file (defined out of line).
         *
         * @param rb Record batch to write.
         */
        void write(const sparrow::record_batch& rb);

        /**
         * @brief Writes a collection of record batches to the file.
         *
         * An empty range is a no-op, even on a serializer that has been ended.
         * Otherwise the header magic is written if it has not been yet, stream
         * capacity is reserved, the schema is taken from the first batch if none was
         * established, and then each batch is written preceded by its pending
         * dictionary batches.
         *
         * Batches are written one by one: if an exception is thrown part-way
         * through, the batches processed before it remain in the stream.
         *
         * @tparam R Input range whose value type is sparrow::record_batch.
         * @param record_batches Record batches to write.
         * @throws std::runtime_error if the serializer has been ended, or if a
         *         non-delta dictionary batch is pending for an id already emitted.
         * @throws std::invalid_argument if a batch's column data types differ from
         *         the established schema.
         */
        template <std::ranges::input_range R>
            requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
        void write(const R& record_batches)
        {
            if (std::ranges::empty(record_batches))
            {
                return;
            }

            if (m_ended)
            {
                throw std::runtime_error("Cannot write to a file serializer that has been ended");
            }

            // Write file header magic on first write
            if (!m_header_written)
            {
                m_stream.write(arrow_file_header_magic);
                m_stream.add_padding();
                m_header_written = true;
            }

            // Constructed only once we know there is something to write. It is shared
            // between the size estimation below and the serialization calls.
            CompressionCache compressed_buffers_cache;

            // NOTE `reserve_function` is making us store a cache for the compressed buffers at this level.
            // The benefit of capacity allocation should be evaluated vs storing a cache of compressed buffers
            // of record batches.
            const auto reserve_function = [&record_batches, &compressed_buffers_cache, this]()
            {
                return calculate_serializer_reserve_size(
                    record_batches,
                    m_stream.size(),
                    m_schema_received,
                    m_compression,
                    m_dict_tracker,
                    compressed_buffers_cache
                );
            };

            m_stream.reserve(reserve_function);

            if (!m_schema_received)
            {
                // Obtain the first element exactly once. The iterator is kept alive in
                // a named variable and the reference binding extends the lifetime of
                // the element when the range yields it by value.
                auto first_it = std::ranges::begin(record_batches);
                const sparrow::record_batch& first_batch = *first_it;

                m_schema_received = true;
                m_first_record_batch = first_batch;
                m_dtypes = get_column_dtypes(first_batch);
                serialize_schema_message(first_batch, m_stream);
            }

            for (const auto& rb : record_batches)
            {
                if (get_column_dtypes(rb) != m_dtypes)
                {
                    throw std::invalid_argument("Record batch schema does not match file serializer schema");
                }

                for_each_pending_dictionary(
                    rb,
                    m_dict_tracker,
                    [this, &compressed_buffers_cache](const dictionary_info& dict_info)
                    {
                        if (m_dict_tracker.is_emitted(dict_info.id) && !dict_info.is_delta)
                        {
                            throw std::runtime_error(
                                "Arrow file format does not support multiple non-delta dictionary batches "
                                "for the same dictionary id"
                            );
                        }

                        // Stream position before the dictionary message is written.
                        const int64_t dict_offset = static_cast<int64_t>(m_stream.size());
                        const auto dict_block_info = serialize_dictionary_batch(
                            dict_info.id,
                            dict_info.data,
                            dict_info.is_delta,
                            m_stream,
                            m_compression,
                            compressed_buffers_cache
                        );
                        m_dictionary_blocks.emplace_back(
                            dict_offset,
                            dict_block_info.metadata_length,
                            dict_block_info.body_length
                        );
                    }
                );

                // Offset is from the start of the file to the record batch message
                const int64_t offset = static_cast<int64_t>(m_stream.size());

                // Serialize and get block info
                const auto info = serialize_record_batch(rb, m_stream, m_compression, compressed_buffers_cache);

                m_record_batch_blocks.emplace_back(offset, info.metadata_length, info.body_length);
            }
        }

        /**
         * @brief Stream-style equivalent of write(rb).
         *
         * @param rb Record batch to write.
         * @return Reference to this serializer, to allow chaining.
         */
        stream_file_serializer& operator<<(const sparrow::record_batch& rb)
        {
            write(rb);
            return *this;
        }

        /**
         * @brief Stream-style equivalent of write(record_batches).
         *
         * @tparam R Input range whose value type is sparrow::record_batch.
         * @param record_batches Record batches to write.
         * @return Reference to this serializer, to allow chaining.
         */
        template <std::ranges::input_range R>
            requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
        stream_file_serializer& operator<<(const R& record_batches)
        {
            write(record_batches);
            return *this;
        }

        /**
         * @brief Applies a manipulator function to this serializer.
         *
         * @param manip Function taking and returning a serializer reference.
         * @return Whatever @p manip returns.
         */
        stream_file_serializer& operator<<(stream_file_serializer& (*manip)(stream_file_serializer&) )
        {
            return manip(*this);
        }

        /**
         * @brief Finalizes the file serialization by writing footer and trailing
         *        magic bytes (defined out of line).
         */
        void end();

        // NOTE(review): the state below is publicly accessible as declared in this
        // header; confirm whether a `private:` specifier was intended here.

        /// True once the file header magic has been written.
        bool m_header_written{false};

        /// True once a schema has been established and its message serialized.
        bool m_schema_received{false};

        /// Copy of the batch the schema was taken from.
        std::optional<sparrow::record_batch> m_first_record_batch;

        /// Column data types every written batch is compared against.
        std::vector<sparrow::data_type> m_dtypes;

        /// Type-erased wrapper around the destination stream.
        any_output_stream m_stream;

        /// Checked by write(); a range write throws once this is true.
        bool m_ended{false};

        /// Optional compression forwarded to the serialization calls.
        std::optional<CompressionType> m_compression;

        /// Tracks dictionaries during serialization.
        dictionary_tracker m_dict_tracker;

        /// One entry per dictionary batch written by the range write().
        std::vector<record_batch_block> m_dictionary_blocks;

        /// One entry per record batch written by the range write().
        std::vector<record_batch_block> m_record_batch_blocks;
    };


    /**
     * @brief Manipulator that finalizes a file serializer.
     *
     * Its signature matches the function-pointer overload of
     * stream_file_serializer::operator<<, so it can be used as
     * `serializer << batch << end_file;`.
     *
     * @param serializer Serializer on which end() is called.
     * @return The same serializer that was passed in.
     */
    inline stream_file_serializer& end_file(stream_file_serializer& serializer)
    {
        stream_file_serializer& finished = serializer;
        finished.end();
        return finished;
    }


}

any_output_stream.hpp

sparrow_ipc::CompressionCache
Definition compression.hpp:25

sparrow_ipc::any_output_stream
Type-erased wrapper for any stream-like object.
Definition any_output_stream.hpp:55

sparrow_ipc::dictionary_tracker
Tracks dictionaries during serialization.
Definition dictionary_tracker.hpp:38

sparrow_ipc::serializer
A class for serializing Apache Arrow record batches to an output stream.
Definition serializer.hpp:37

sparrow_ipc::serializer::end
void end()
Finalizes the serialization process by writing end-of-stream marker.

sparrow_ipc::stream_file_serializer
A class for serializing Apache Arrow record batches to the IPC file format.
Definition stream_file_serializer.hpp:71

sparrow_ipc::stream_file_serializer::m_record_batch_blocks
std::vector< record_batch_block > m_record_batch_blocks
Definition stream_file_serializer.hpp:344

sparrow_ipc::stream_file_serializer::operator<<
stream_file_serializer & operator<<(stream_file_serializer &(*manip)(stream_file_serializer &))
Definition stream_file_serializer.hpp:312

sparrow_ipc::stream_file_serializer::m_dtypes
std::vector< sparrow::data_type > m_dtypes
Definition stream_file_serializer.hpp:338

sparrow_ipc::stream_file_serializer::m_compression
std::optional< CompressionType > m_compression
Definition stream_file_serializer.hpp:341

sparrow_ipc::stream_file_serializer::m_header_written
bool m_header_written
Definition stream_file_serializer.hpp:335

sparrow_ipc::stream_file_serializer::write
void write(const R &record_batches)
Writes a collection of record batches to the file.
Definition stream_file_serializer.hpp:161

sparrow_ipc::stream_file_serializer::m_first_record_batch
std::optional< sparrow::record_batch > m_first_record_batch
Definition stream_file_serializer.hpp:337

sparrow_ipc::stream_file_serializer::m_schema_received
bool m_schema_received
Definition stream_file_serializer.hpp:336

sparrow_ipc::stream_file_serializer::~stream_file_serializer
~stream_file_serializer()
Destructor for the stream_file_serializer.

sparrow_ipc::stream_file_serializer::operator<<
stream_file_serializer & operator<<(const sparrow::record_batch &rb)
Definition stream_file_serializer.hpp:266

sparrow_ipc::stream_file_serializer::write
void write(const sparrow::record_batch &rb)
Writes a single record batch to the file.

sparrow_ipc::stream_file_serializer::stream_file_serializer
stream_file_serializer(TStream &stream, std::optional< CompressionType > compression=std::nullopt)
Constructs a stream_file_serializer object with a reference to a stream.
Definition stream_file_serializer.hpp:83

sparrow_ipc::stream_file_serializer::end
void end()
Finalizes the file serialization by writing footer and trailing magic bytes.

sparrow_ipc::stream_file_serializer::operator<<
stream_file_serializer & operator<<(const R &record_batches)
Definition stream_file_serializer.hpp:292

sparrow_ipc::stream_file_serializer::m_ended
bool m_ended
Definition stream_file_serializer.hpp:340

sparrow_ipc::stream_file_serializer::m_dict_tracker
dictionary_tracker m_dict_tracker
Definition stream_file_serializer.hpp:342

sparrow_ipc::stream_file_serializer::m_dictionary_blocks
std::vector< record_batch_block > m_dictionary_blocks
Definition stream_file_serializer.hpp:343

sparrow_ipc::stream_file_serializer::m_stream
any_output_stream m_stream
Definition stream_file_serializer.hpp:339

sparrow_ipc::stream_file_serializer::stream_file_serializer
stream_file_serializer(TStream &stream, const sparrow::record_batch &schema_batch, std::optional< CompressionType > compression=std::nullopt)
Constructs a stream_file_serializer object with a reference to a stream and a schema.
Definition stream_file_serializer.hpp:102

compression.hpp

config.hpp

SPARROW_IPC_API
#define SPARROW_IPC_API
Definition config.hpp:12

dictionary_iteration.hpp

dictionary_tracker.hpp

magic_values.hpp

sparrow_ipc
Definition any_output_stream.hpp:13

sparrow_ipc::calculate_serializer_reserve_size
std::size_t calculate_serializer_reserve_size(const R &record_batches, std::size_t current_stream_size, bool schema_received, std::optional< CompressionType > compression, const dictionary_tracker &dict_tracker, std::optional< std::reference_wrapper< CompressionCache > > cache=std::nullopt)
Definition serializer_reserve.hpp:17

sparrow_ipc::for_each_pending_dictionary
void for_each_pending_dictionary(const sparrow::record_batch &record_batch, dictionary_tracker &tracker, Func visitor)
Definition dictionary_iteration.hpp:11

sparrow_ipc::serialize_record_batch
SPARROW_IPC_API serialized_record_batch_info serialize_record_batch(const sparrow::record_batch &record_batch, any_output_stream &stream, std::optional< CompressionType > compression, std::optional< std::reference_wrapper< CompressionCache > > cache)
Serializes a record batch into a binary format following the Arrow IPC specification.

sparrow_ipc::write_footer
SPARROW_IPC_API size_t write_footer(const sparrow::record_batch &record_batch, const std::vector< record_batch_block > &dictionary_blocks, const std::vector< record_batch_block > &record_batch_blocks, any_output_stream &stream)
Writes the Arrow IPC file footer.

sparrow_ipc::serialize_schema_message
SPARROW_IPC_API void serialize_schema_message(const sparrow::record_batch &record_batch, any_output_stream &stream)
Serializes a schema message for a record batch into a byte buffer.

sparrow_ipc::serialize_dictionary_batch
SPARROW_IPC_API serialized_record_batch_info serialize_dictionary_batch(int64_t dictionary_id, const sparrow::record_batch &record_batch, bool is_delta, any_output_stream &stream, std::optional< CompressionType > compression, std::optional< std::reference_wrapper< CompressionCache > > cache)
Serializes a dictionary batch into a binary format following the Arrow IPC specification.

sparrow_ipc::arrow_file_header_magic
constexpr std::array< std::uint8_t, 8 > arrow_file_header_magic
Magic bytes with padding for file header (8 bytes total for alignment)
Definition magic_values.hpp:34

sparrow_ipc::end_file
stream_file_serializer & end_file(stream_file_serializer &serializer)
Definition stream_file_serializer.hpp:360

sparrow_ipc::get_column_dtypes
SPARROW_IPC_API std::vector< sparrow::data_type > get_column_dtypes(const sparrow::record_batch &rb)

serialize.hpp

serialize_utils.hpp

serializer_reserve.hpp

sparrow_ipc::dictionary_info
Information about a dictionary used for encoding.
Definition dictionary_tracker.hpp:19

sparrow_ipc::dictionary_info::is_delta
bool is_delta
Whether this is a delta update.
Definition dictionary_tracker.hpp:23

sparrow_ipc::dictionary_info::data
sparrow::record_batch data
Dictionary values as a single-column record batch.
Definition dictionary_tracker.hpp:21

sparrow_ipc::dictionary_info::id
int64_t id
Dictionary identifier.
Definition dictionary_tracker.hpp:20

sparrow_ipc::record_batch_block
Represents a block entry in the Arrow IPC file footer.
Definition stream_file_serializer.hpp:27

sparrow_ipc::record_batch_block::body_length
int64_t body_length
Length of the record batch body (data buffers)
Definition stream_file_serializer.hpp:30

sparrow_ipc::record_batch_block::metadata_length
int32_t metadata_length
Length of the metadata (FlatBuffer message)
Definition stream_file_serializer.hpp:29

sparrow_ipc::record_batch_block::offset
int64_t offset
Offset from the start of the file to the record batch message.
Definition stream_file_serializer.hpp:28