vortex_datafusion/memory/
statistics.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
use datafusion_common::stats::Precision;
use datafusion_common::{ColumnStatistics, Result as DFResult, ScalarValue, Statistics};
use itertools::Itertools;
use vortex_array::array::ChunkedArray;
use vortex_array::stats::{ArrayStatistics, Stat};
use vortex_array::variants::StructArrayTrait;
use vortex_array::ArrayLen;
use vortex_error::{vortex_err, VortexExpect, VortexResult};

pub fn chunked_array_df_stats(array: &ChunkedArray, projection: &[usize]) -> DFResult<Statistics> {
    let mut nbytes: usize = 0;
    let column_statistics = projection
        .iter()
        .map(|i| {
            array
                .maybe_null_field_by_idx(*i)
                .ok_or_else(|| vortex_err!("Projection references unknown field {i}"))
        })
        .map_ok(|arr| {
            nbytes += arr.nbytes();
            ColumnStatistics {
                null_count: arr
                    .statistics()
                    .get_as::<u64>(Stat::NullCount)
                    .map(|n| n as usize)
                    .map(Precision::Exact)
                    .unwrap_or(Precision::Absent),
                max_value: arr
                    .statistics()
                    .get(Stat::Max)
                    .map(|n| {
                        ScalarValue::try_from(n).vortex_expect("cannot convert scalar to df scalar")
                    })
                    .map(Precision::Exact)
                    .unwrap_or(Precision::Absent),
                min_value: arr
                    .statistics()
                    .get(Stat::Min)
                    .map(|n| {
                        ScalarValue::try_from(n).vortex_expect("cannot convert scalar to df scalar")
                    })
                    .map(Precision::Exact)
                    .unwrap_or(Precision::Absent),
                distinct_count: Precision::Absent,
            }
        })
        .collect::<VortexResult<Vec<_>>>()?;

    Ok(Statistics {
        num_rows: Precision::Exact(array.len()),
        total_byte_size: Precision::Exact(nbytes),
        column_statistics,
    })
}