# Grouping and Aggregation
Therismos provides a grouping and aggregation system for modeling SQL-like GROUP BY operations with aggregation functions as object structures.
## Quick Start
from therismos.grouping import GroupSpec, Aggregation, AggregationFunction
# Group by category and region; compute an item count plus the minimum and
# mean price for each group.
spec = GroupSpec(
    group_by=["category", "region"],
    aggregations=[
        Aggregation("total", AggregationFunction.COUNT),
        Aggregation("min_price", AggregationFunction.MIN, "price"),
        Aggregation("avg_price", AggregationFunction.AVERAGE, "price"),
    ],
)
from therismos.grouping.visitors import StringVisitor
# Render the spec with StringVisitor; the expected output is shown below.
print(spec.accept(StringVisitor()))
# ("category,region", "total:count,min_price:min:price,avg_price:average:price")
## Aggregation Functions
| Function | Description |
|---|---|
| COUNT | Count of items (field optional, silently ignored) |
| SUM | Sum of values |
| MIN | Minimum value |
| MAX | Maximum value |
| AVERAGE | Mean value |
| STDDEV | Standard deviation |
| MEDIAN | Median value |
| Q1 | 25th percentile |
| Q3 | 75th percentile |
| P01, P05, P10 | 1st, 5th, 10th percentiles |
| P90, P95, P99 | 90th, 95th, 99th percentiles |
All functions except COUNT require a field.
## Creating Grouping Specifications
# Single grouping field with percentile and median aggregations
spec = GroupSpec(
    group_by=["service"],
    aggregations=[
        Aggregation("p95_latency", AggregationFunction.P95, "latency"),
        Aggregation("p99_latency", AggregationFunction.P99, "latency"),
        Aggregation("median_latency", AggregationFunction.MEDIAN, "latency"),
    ],
)
# Global aggregation: an empty group_by aggregates without any grouping
spec = GroupSpec(
    group_by=[],
    aggregations=[
        Aggregation("total_count", AggregationFunction.COUNT),
        Aggregation("overall_avg", AggregationFunction.AVERAGE, "score"),
    ],
)
## Optimization
`optimize` removes duplicate grouping fields and duplicate aggregation IDs, keeping the last occurrence of each:
from therismos.grouping.optimizer import optimize
# A spec that deliberately repeats a grouping field and an aggregation ID so
# the effect of optimize() is visible.
spec = GroupSpec(
    group_by=["category", "region", "category"],  # duplicate field
    aggregations=[
        Aggregation("total", AggregationFunction.COUNT),
        Aggregation("min_price", AggregationFunction.MIN, "price"),
        Aggregation("total", AggregationFunction.MAX, "quantity"),  # duplicate ID
    ],
)
optimized, records = optimize(spec)
# group_by=["region", "category"], last "total" definition kept
## Built-in Visitors
from therismos.grouping.visitors import StringVisitor, DictVisitor, FieldGathererVisitor
# Spec shared by the visitor examples that follow: two grouping fields, an
# item count, and a mean price.
spec = GroupSpec(
    group_by=["category", "region"],
    aggregations=[
        Aggregation("count", AggregationFunction.COUNT),
        Aggregation("avg_price", AggregationFunction.AVERAGE, "price"),
    ],
)
# StringVisitor: string rendering of the spec (expected output below).
print(spec.accept(StringVisitor()))
# ("category,region", "count:count,avg_price:average:price")
# DictVisitor: dictionary rendering of the spec (expected value below).
result = spec.accept(DictVisitor())
# {
#     "group_by": ["category", "region"],
#     "aggregations": [
#         {"id": "count", "function": "count", "field": None},
#         {"id": "avg_price", "function": "average", "field": "price"}
#     ]
# }
# FieldGathererVisitor: accumulates field names on the visitor instance; read
# them from `field_names` after the visit.
field_visitor = FieldGathererVisitor()
spec.accept(field_visitor)
print(field_visitor.field_names)
# {"category", "region", "price"}
## Custom Visitors
from therismos.grouping import GroupSpec
class PandasVisitor:
    """Render a ``GroupSpec`` as the source text of a pandas aggregation call.

    ``visit_group_spec`` only builds a string such as
    ``df.groupby([...]).agg({...})``; nothing is executed against a DataFrame.

    Only aggregations that name a field and whose function has an entry in
    ``_PANDAS_FUNCS`` are emitted. Everything else (for example a COUNT
    without a field, or a function with no entry in the map) is skipped
    silently.
    """

    # Maps ``AggregationFunction`` values to pandas aggregation names.
    # Kept at class level so the table is built once rather than on every call.
    _PANDAS_FUNCS = {
        "count": "count",
        "sum": "sum",
        "min": "min",
        "max": "max",
        "average": "mean",
        "stddev": "std",
        "median": "median",
    }

    def visit_group_spec(self, spec: GroupSpec) -> str:
        """Return the pandas expression string for ``spec``.

        Args:
            spec: The grouping specification to render. Its ``aggregations``
                attribute is read through ``.values()``.

        Returns:
            ``"df.agg({...})"`` when ``spec.group_by`` is empty, otherwise
            ``"df.groupby([...]).agg({...})"``.
        """
        agg_dict = self._build_agg_dict(spec.aggregations.values())
        # NOTE(review): pandas named aggregation is normally passed as keyword
        # arguments (``.agg(**mapping)``) - confirm the emitted dict form is
        # what consumers of this string expect.
        if not spec.group_by:
            return f"df.agg({agg_dict})"
        return f"df.groupby({list(spec.group_by)}).agg({agg_dict})"

    def _build_agg_dict(self, aggregations):
        """Map each supported aggregation ID to ``(field, pandas_function)``.

        Args:
            aggregations: Iterable of aggregation objects exposing ``id``,
                ``field`` and ``function.value``.

        Returns:
            A dict keyed by aggregation ID, in iteration order. Entries with
            a falsy ``field`` or an unmapped function are left out.
        """
        return {
            agg.id: (agg.field, self._PANDAS_FUNCS[agg.function.value])
            for agg in aggregations
            if agg.function.value in self._PANDAS_FUNCS and agg.field
        }
# Render the spec with the custom visitor; the result is a pandas expression
# string of the shape shown below.
result = spec.accept(PandasVisitor())
# "df.groupby(['category', 'region']).agg({...})"