vortex_array/array/varbin/
arrow.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
use std::sync::Arc;

use arrow_array::{ArrayRef, BinaryArray, LargeBinaryArray, LargeStringArray, StringArray};
use vortex_dtype::{DType, PType};
use vortex_error::{vortex_bail, VortexResult};

use crate::array::VarBinArray;
use crate::compute::try_cast;
use crate::validity::ArrayValidity;
use crate::variants::PrimitiveArrayTrait;
use crate::{ArrayDType, IntoArrayVariant, ToArrayData};

/// Convert the array to Arrow variable length binary array type.
pub(crate) fn varbin_to_arrow(varbin_array: &VarBinArray) -> VortexResult<ArrayRef> {
    let offsets = varbin_array
        .offsets()
        .into_primitive()
        .map_err(|err| err.with_context("Failed to canonicalize offsets"))?;
    let offsets = match offsets.ptype() {
        PType::I32 | PType::I64 => offsets,
        PType::U64 => offsets.reinterpret_cast(PType::I64),
        PType::U32 => offsets.reinterpret_cast(PType::I32),

        // Unless it's u64, everything else can be converted into an i32.
        _ => try_cast(offsets.to_array(), PType::I32.into())
            .and_then(|a| a.into_primitive())
            .map_err(|err| err.with_context("Failed to cast offsets to PrimitiveArray of i32"))?,
    };
    let nulls = varbin_array
        .logical_validity()
        .to_null_buffer()
        .map_err(|err| err.with_context("Failed to get null buffer from logical validity"))?;

    let data = varbin_array
        .bytes()
        .into_primitive()
        .map_err(|err| err.with_context("Failed to canonicalize bytes"))?;
    if data.dtype() != &DType::BYTES {
        vortex_bail!("Expected bytes to be of type U8, got {}", data.ptype());
    }
    let data = data.byte_buffer();

    // Switch on Arrow DType.
    Ok(match varbin_array.dtype() {
        DType::Binary(_) => match offsets.ptype() {
            PType::I32 => Arc::new(unsafe {
                BinaryArray::new_unchecked(
                    offsets.buffer::<i32>().into_arrow_offset_buffer(),
                    data.clone().into_arrow_buffer(),
                    nulls,
                )
            }),
            PType::I64 => Arc::new(unsafe {
                LargeBinaryArray::new_unchecked(
                    offsets.buffer::<i64>().into_arrow_offset_buffer(),
                    data.clone().into_arrow_buffer(),
                    nulls,
                )
            }),
            _ => vortex_bail!("Invalid offsets type {}", offsets.ptype()),
        },
        DType::Utf8(_) => match offsets.ptype() {
            PType::I32 => Arc::new(unsafe {
                StringArray::new_unchecked(
                    offsets.buffer::<i32>().into_arrow_offset_buffer(),
                    data.clone().into_arrow_buffer(),
                    nulls,
                )
            }),
            PType::I64 => Arc::new(unsafe {
                LargeStringArray::new_unchecked(
                    offsets.buffer::<i64>().into_arrow_offset_buffer(),
                    data.clone().into_arrow_buffer(),
                    nulls,
                )
            }),
            _ => vortex_bail!("Invalid offsets type {}", offsets.ptype()),
        },
        _ => vortex_bail!(
            "expected utf8 or binary instead of {}",
            varbin_array.dtype()
        ),
    })
}