vortex_file/write/
stats_accumulator.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
//! Metadata accumulators track the per-chunk-of-a-column metadata, layout locations, and row counts.

use itertools::Itertools;
use vortex_array::array::StructArray;
use vortex_array::builders::{builder_with_capacity, ArrayBuilder, ArrayBuilderExt};
use vortex_array::stats::{ArrayStatistics as _, Stat};
use vortex_array::validity::{ArrayValidity, Validity};
use vortex_array::{ArrayData, IntoArrayData};
use vortex_dtype::DType;
use vortex_error::VortexResult;

/// Accumulates a fixed set of statistics, one entry per pushed chunk.
///
/// Each call to `push_chunk` appends one value (or null) per tracked stat; `into_array` then
/// finishes the builders and assembles the non-empty stat columns into a struct array.
pub struct StatsAccumulator {
    /// Stats being tracked, sorted by their `u8` ordinal in `new`.
    stats: Vec<Stat>,
    /// One nullable builder per entry of `stats`, in the same order.
    builders: Vec<Box<dyn ArrayBuilder>>,
    /// Number of chunks pushed so far; used as the length of the resulting struct array.
    length: usize,
}

impl StatsAccumulator {
    /// Creates an accumulator that collects `stats` for chunks of type `dtype`.
    ///
    /// The requested stats are sorted by their `u8` ordinal and duplicates are removed, so every
    /// stat is backed by exactly one builder and contributes at most one output field.
    pub fn new(dtype: &DType, mut stats: Vec<Stat>) -> Self {
        // Sort stats by their ordinal so we can recreate their dtype from bitset
        stats.sort_by_key(|s| u8::from(*s));
        // A repeated stat would otherwise get a second builder and, later, a second struct field
        // with the same name. Sorting made duplicates adjacent, so they can be dropped here.
        stats.dedup_by_key(|s| u8::from(*s));

        // Builders are nullable because a stat may be unavailable for an individual chunk.
        let builders = stats
            .iter()
            .map(|s| builder_with_capacity(&s.dtype(dtype).as_nullable(), 1024))
            .collect();

        Self {
            stats,
            builders,
            length: 0,
        }
    }

    /// Records the statistics of one chunk.
    ///
    /// For every tracked stat, the value computed from `array` is cast to the builder's dtype
    /// and appended; a null is appended when the stat cannot be computed.
    ///
    /// # Errors
    ///
    /// Returns an error if casting or appending a stat value fails. In that case builders
    /// visited before the failure have already received an entry for this chunk, while the
    /// chunk count has not been incremented.
    pub fn push_chunk(&mut self, array: &ArrayData) -> VortexResult<()> {
        for (s, builder) in self.stats.iter().zip_eq(self.builders.iter_mut()) {
            if let Some(v) = array.statistics().compute(*s) {
                builder.append_scalar(&v.cast(builder.dtype())?)?;
            } else {
                builder.append_null();
            }
        }
        self.length += 1;
        Ok(())
    }

    /// Finishes all builders and assembles the collected stats into a [`StatArray`].
    ///
    /// Stat columns that contain only nulls are dropped. Returns `Ok(None)` when no column
    /// remains after that filtering.
    ///
    /// # Errors
    ///
    /// Returns an error if a builder cannot be finished, if the null count of a column cannot
    /// be determined, or if the struct array cannot be constructed.
    pub fn into_array(mut self) -> VortexResult<Option<StatArray>> {
        let capacity = self.stats.len();
        let mut names = Vec::with_capacity(capacity);
        let mut fields = Vec::with_capacity(capacity);
        let mut stats = Vec::with_capacity(capacity);

        // `zip_eq` (as in `push_chunk`) so a stats/builders length mismatch fails loudly
        // instead of silently truncating the output.
        for (stat, builder) in self.stats.iter().zip_eq(self.builders.iter_mut()) {
            let values = builder
                .finish()
                .map_err(|e| e.with_context(format!("Failed to finish stat builder for {stat}")))?;

            // We drop any all-null stats columns
            if values.logical_validity().null_count()? == values.len() {
                continue;
            }

            stats.push(*stat);
            names.push(stat.to_string().into());
            fields.push(values);
        }

        if names.is_empty() {
            return Ok(None);
        }

        Ok(Some(StatArray(
            StructArray::try_new(names.into(), fields, self.length, Validity::NonNullable)?
                .into_array(),
            stats,
        )))
    }
}

/// Output of `StatsAccumulator::into_array`.
///
/// Field `0` is a struct array with one field per retained stat, named after the stat.
/// Field `1` lists those stats in the same order as the struct's fields.
pub struct StatArray(pub ArrayData, pub Vec<Stat>);