Skip to content

Grouping and Aggregation

Therismos provides a grouping and aggregation system for modeling SQL-like GROUP BY operations with aggregation functions as object structures.

Quick Start

from therismos.grouping import GroupSpec, Aggregation, AggregationFunction

spec = GroupSpec(
    group_by=["category", "region"],
    aggregations=[
        Aggregation("total", AggregationFunction.COUNT),
        Aggregation("min_price", AggregationFunction.MIN, "price"),
        Aggregation("avg_price", AggregationFunction.AVERAGE, "price"),
    ],
)

from therismos.grouping.visitors import StringVisitor
print(spec.accept(StringVisitor()))
# ("category,region", "total:count,min_price:min:price,avg_price:average:price")

Aggregation Functions

Function Description
COUNT Count of items (the field argument is optional; if supplied, it is silently ignored)
SUM Sum of values
MIN Minimum value
MAX Maximum value
AVERAGE Mean value
STDDEV Standard deviation
MEDIAN Median value
Q1 25th percentile
Q3 75th percentile
P01, P05, P10 1st, 5th, 10th percentiles
P90, P95, P99 90th, 95th, 99th percentiles

All functions except COUNT require a field.

Creating Grouping Specifications

# Single grouping field with percentile aggregations
spec = GroupSpec(
    group_by=["service"],
    aggregations=[
        Aggregation("p95_latency", AggregationFunction.P95, "latency"),
        Aggregation("p99_latency", AggregationFunction.P99, "latency"),
        Aggregation("median_latency", AggregationFunction.MEDIAN, "latency"),
    ],
)

# Global aggregation (no grouping)
spec = GroupSpec(
    group_by=[],
    aggregations=[
        Aggregation("total_count", AggregationFunction.COUNT),
        Aggregation("overall_avg", AggregationFunction.AVERAGE, "score"),
    ],
)

Optimization

The optimizer removes duplicate grouping fields and duplicate aggregation IDs, keeping the last occurrence of each:

from therismos.grouping.optimizer import optimize

spec = GroupSpec(
    group_by=["category", "region", "category"],  # duplicate field
    aggregations=[
        Aggregation("total", AggregationFunction.COUNT),
        Aggregation("min_price", AggregationFunction.MIN, "price"),
        Aggregation("total", AggregationFunction.MAX, "quantity"),  # duplicate ID
    ],
)

optimized, records = optimize(spec)
# group_by=["region", "category"], last "total" definition kept

Built-in Visitors

from therismos.grouping.visitors import StringVisitor, DictVisitor, FieldGathererVisitor

spec = GroupSpec(
    group_by=["category", "region"],
    aggregations=[
        Aggregation("count", AggregationFunction.COUNT),
        Aggregation("avg_price", AggregationFunction.AVERAGE, "price"),
    ],
)

print(spec.accept(StringVisitor()))
# ("category,region", "count:count,avg_price:average:price")

result = spec.accept(DictVisitor())
# {
#     "group_by": ["category", "region"],
#     "aggregations": [
#         {"id": "count", "function": "count", "field": None},
#         {"id": "avg_price", "function": "average", "field": "price"}
#     ]
# }

field_visitor = FieldGathererVisitor()
spec.accept(field_visitor)
print(field_visitor.field_names)
# {"category", "region", "price"}

Custom Visitors

from therismos.grouping import GroupSpec

class PandasVisitor:
    """Render a GroupSpec as a string of pandas aggregation code.

    The visitor only builds source text; it never imports or runs pandas.
    The result is ``df.groupby([...]).agg({...})`` when the spec has grouping
    fields and ``df.agg({...})`` when it has none.

    Aggregations are skipped (not rendered) when their function has no entry
    in ``_PANDAS_FUNCTIONS`` or when they carry no field, so a field-less
    COUNT does not appear in the output.

    NOTE(review): the mapping values are ``(column, function)`` tuples, which
    pandas documents as keyword-style named aggregation
    (``agg(**mapping)``) -- confirm the emitted ``agg({...})`` form against
    the pandas version in use before executing the generated code.
    """

    # AggregationFunction value -> pandas aggregation name.
    # "sum" presumably matches AggregationFunction.SUM.value, following the
    # lowercase pattern of the other members -- verify against the enum.
    _PANDAS_FUNCTIONS = {
        "count": "count",
        "sum": "sum",
        "min": "min",
        "max": "max",
        "average": "mean",
        "stddev": "std",
        "median": "median",
    }

    def visit_group_spec(self, spec: "GroupSpec") -> str:
        """Return the pandas expression string for ``spec``.

        Args:
            spec: Object exposing ``group_by`` (iterable of field names) and
                ``aggregations`` (mapping whose values are aggregations).

        Returns:
            ``"df.agg({...})"`` for an empty ``group_by``; otherwise
            ``"df.groupby([...]).agg({...})"``.
        """
        agg_dict = self._build_agg_dict(spec.aggregations.values())
        if not spec.group_by:
            # Global aggregation: no groupby() call is emitted.
            return f"df.agg({agg_dict})"
        group_fields = list(spec.group_by)
        return f"df.groupby({group_fields}).agg({agg_dict})"

    def _build_agg_dict(self, aggregations) -> dict:
        """Map each renderable aggregation id to ``(field, pandas_function)``.

        Args:
            aggregations: Iterable of objects with ``id``, ``field`` and
                ``function.value`` attributes.

        Returns:
            A dict keyed by aggregation id, in iteration order. Entries with
            an unmapped function or a falsy field are left out.
        """
        pandas_names = self._PANDAS_FUNCTIONS
        return {
            agg.id: (agg.field, pandas_names[agg.function.value])
            for agg in aggregations
            if agg.function.value in pandas_names and agg.field
        }

result = spec.accept(PandasVisitor())
# "df.groupby(['category', 'region']).agg({...})"