mirror of
https://github.com/langchain-ai/datafusion.git
synced 2026-07-01 21:24:06 -04:00
fe2591b3ce
Remove MaterializedField, MajorType, RepeatedTypes Add code to convert from FlatBuf representation to Pojo also adds tests to test the conversion
211 lines
5.1 KiB
Plaintext
211 lines
5.1 KiB
Plaintext
// Every declaration in this schema is placed in the org.apache.arrow.flatbuf
// namespace.
namespace org.apache.arrow.flatbuf;
|
|
|
|
/// ----------------------------------------------------------------------
/// Logical types and their metadata (if any)
///
/// These are stored in the flatbuffer in the Type union below
|
|
|
|
/// Logical Null type. The table declares no metadata fields; it exists so that
/// it can be selected as a member of the Type union.
table Null {
}
|
|
|
|
/// A Tuple in the flatbuffer metadata is the same as an Arrow Struct
/// (according to the physical memory layout). We use Tuple here because
/// "struct" is a reserved word in Flatbuffers.
table Tuple {
}
|
|
|
|
/// Logical List type. The table declares no metadata fields of its own; it is
/// selected through the Type union.
table List {
}
|
|
|
|
/// Mode of a Union type, stored in Union.mode. Values are numbered in
/// declaration order starting at 0.
enum UnionMode : int {
  Sparse,
  Dense
}
|
|
|
|
/// Logical Union type.
table Union {
  /// Sparse or Dense. No default is declared, so an unset field reads as the
  /// first enum value (Sparse).
  mode: UnionMode;
}
|
|
|
|
/// Logical integer type, described by a bit width and a signedness flag.
table Int {
  /// Width in bits; restricted to 8, 16, 32, and 64 in v1.
  bitWidth: int;

  /// Signedness flag. No default is declared, so an unset field reads as false.
  is_signed: bool;
}
|
|
|
|
/// Precision of a FloatingPoint type, stored in FloatingPoint.precision.
/// Values are numbered in declaration order starting at 0.
enum Precision : int {
  SINGLE,
  DOUBLE
}
|
|
|
|
/// Logical floating-point type.
table FloatingPoint {
  /// SINGLE or DOUBLE. No default is declared, so an unset field reads as the
  /// first enum value (SINGLE).
  precision: Precision;
}
|
|
|
|
/// Logical Utf8 type. The table declares no metadata fields; it is selected
/// through the Type union.
table Utf8 {
}
|
|
|
|
/// Logical Binary type. The table declares no metadata fields; it is selected
/// through the Type union.
table Binary {
}
|
|
|
|
/// Logical Bool type. The table declares no metadata fields; it is selected
/// through the Type union.
table Bool {
}
|
|
|
|
/// Logical Decimal type, parameterized by a precision and a scale.
table Decimal {
  /// NOTE(review): presumably the total number of decimal digits - confirm.
  precision: int;

  /// NOTE(review): presumably the number of digits after the decimal point -
  /// confirm.
  scale: int;
}
|
|
|
|
/// Logical Date type. The table declares no metadata fields; it is selected
/// through the Type union.
table Date {
}
|
|
|
|
/// Logical Time type. The table declares no metadata fields; it is selected
/// through the Type union.
table Time {
}
|
|
|
|
/// Logical Timestamp type.
table Timestamp {
  /// Time zone of the timestamp values. The field is not marked required, so
  /// it may be absent.
  /// NOTE(review): the expected string format is not specified here - confirm.
  timezone: string;
}
|
|
|
|
/// Logical IntervalDay type. The table declares no metadata fields; it is
/// selected through the Type union.
table IntervalDay {
}
|
|
|
|
/// Logical IntervalYear type. The table declares no metadata fields; it is
/// selected through the Type union.
table IntervalYear {
}
|
|
|
|
/// Logical JSONScalar type.
table JSONScalar {
  /// Defaults to true when the field is not written.
  /// NOTE(review): the meaning of "dense" for this type is not documented in
  /// this schema - confirm.
  dense: bool = true;
}
|
|
|
|
/// ----------------------------------------------------------------------
/// Top-level Type value, enabling extensible type-specific metadata. We can
/// add new logical types to Type without breaking backwards compatibility

// Member order determines each member's union discriminant, so the order below
// must not change; new logical types are appended at the end.
union Type {
  Null,
  Int,
  FloatingPoint,
  Binary,
  Utf8,
  Bool,
  Decimal,
  Date,
  Time,
  Timestamp,
  IntervalDay,
  IntervalYear,
  List,
  Tuple,
  Union,
  JSONScalar
}
|
|
|
|
/// ----------------------------------------------------------------------
/// A field represents a named column in a record / row batch or child of a
/// nested type.
///
/// - children is only for nested Arrow arrays
/// - For primitive types, children will have length 0
/// - nullable should default to true in general

table Field {
  // Name is not required, e.g. in a List
  name: string;

  // No schema-level default is declared, so an unset value reads as false.
  // Writers that follow the "nullable should default to true" guidance above
  // must therefore set this field explicitly.
  nullable: bool;

  // Logical type of this field, selected from the Type union.
  type: Type;

  // Child fields of a nested type; empty for primitive types.
  children: [Field];
}
|
|
|
|
/// ----------------------------------------------------------------------
/// Endianness of the platform that produces the RecordBatch

enum Endianness : int {
  Little,
  Big
}
|
|
|
|
/// ----------------------------------------------------------------------
/// A Schema describes the columns in a row batch

table Schema {

  /// Endianness of the buffer.
  /// It is Little Endian by default.
  /// If the endianness doesn't match the underlying system then the vectors
  /// need to be converted.
  endianness: Endianness = Little;

  // The columns of the row batch, one Field per column.
  fields: [Field];
}
|
|
|
|
/// ----------------------------------------------------------------------
/// Data structures for describing a table row batch (a collection of
/// equal-length Arrow arrays)
|
|
|
|
/// A Buffer represents a single contiguous memory segment
struct Buffer {
  /// The shared memory page id where this buffer is located. Currently this is
  /// not used
  page: int;

  /// The relative offset into the shared memory page where the bytes for this
  /// buffer start
  offset: long;

  /// The absolute length (in bytes) of the memory buffer. The memory is found
  /// from offset (inclusive) to offset + length (non-inclusive).
  length: long;
}
|
|
|
|
/// Metadata about a field at some level of a nested type tree (but not
/// its children).
///
/// For example, a List<Int16> with values [[1, 2, 3], null, [4], [5, 6], null]
/// would have {length: 5, null_count: 2} for its List node, and {length: 6,
/// null_count: 0} for its Int16 node, as separate FieldNode structs
struct FieldNode {
  /// The number of value slots in the Arrow array at this level of a nested
  /// tree
  length: int;

  /// The number of observed nulls. Fields with null_count == 0 may choose not
  /// to write their physical validity bitmap out as a materialized buffer,
  /// instead setting the length of the bitmap buffer to 0.
  null_count: int;
}
|
|
|
|
/// A data header describing the shared memory layout of a "record" or "row"
/// batch. Some systems call this a "row batch" internally and others a "record
/// batch".
table RecordBatch {
  /// Number of records / rows. The arrays in the batch should all have this
  /// length
  length: int;

  /// Nodes correspond to the pre-ordered flattened logical schema
  nodes: [FieldNode];

  /// Buffers correspond to the pre-ordered flattened buffer tree
  ///
  /// The number of buffers appended to this list depends on the schema. For
  /// example, most primitive arrays will have 2 buffers, 1 for the validity
  /// bitmap and 1 for the values. For struct arrays, there will only be a
  /// single buffer for the validity (nulls) bitmap
  buffers: [Buffer];
}
|
|
|
|
/// ----------------------------------------------------------------------
/// For sending dictionary encoding information. Any Field can be
/// dictionary-encoded, but in this case none of its children may be
/// dictionary-encoded.
///
/// TODO(wesm): To be documented in more detail

table DictionaryBatch {
  // Identifier of the dictionary.
  // NOTE(review): how an id is associated with a Field is not defined in this
  // schema - confirm.
  id: long;

  // The dictionary contents, laid out as a RecordBatch.
  data: RecordBatch;
}
|
|
|
|
/// ----------------------------------------------------------------------
/// The root Message type

/// This union enables us to easily send different message types without
/// redundant storage, and in the future we can easily add new message types.
// Member order determines each member's union discriminant, so existing
// members must keep their positions; new message types are appended at the end.
union MessageHeader {
  Schema,
  DictionaryBatch,
  RecordBatch
}
|
|
|
|
/// Envelope for a single message. It is declared as the root type of this
/// schema.
table Message {
  /// The message payload description: a Schema, DictionaryBatch, or
  /// RecordBatch.
  header: MessageHeader;

  /// NOTE(review): presumably the length in bytes of the message body that
  /// accompanies this metadata - confirm.
  bodyLength: long;
}
|
|
|
|
// Message is the root table of a buffer serialized with this schema.
root_type Message;
|