Enable alert severity overrides #617

Open · wants to merge 1 commit into main
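
This PR adds two optional fields, highSeverity and lowSeverity, to the alerting section of a ServiceLevelObjective so the severity label on the generated burn-rate alerts can be overridden per objective. A minimal usage sketch, based on the CRD schema and the test fixture added below (the metric names are only illustrative):

apiVersion: pyrra.dev/v1alpha1
kind: ServiceLevelObjective
metadata:
  name: custom-severity
  namespace: monitoring
spec:
  target: 99
  window: 2w
  indicator:
    ratio:
      errors:
        metric: prometheus_operator_reconcile_errors_total
      total:
        metric: prometheus_operator_reconcile_operations_total
  alerting:
    highSeverity: high # label value for the fast-burn alerts; defaults to "critical"
    lowSeverity: low   # label value for the slow-burn alerts; defaults to "warning"
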
8 changes: 8 additions & 0 deletions config/crd/bases/pyrra.dev_servicelevelobjectives.json
@@ -48,6 +48,14 @@
"description": "Disabled is used to disable the generation of alerts. Recording rules are still generated.",
"type": "boolean"
},
"highSeverity": {
"description": "HighSeverity is used as the severity label value for high severity generated alerts. Defaults to \"critical\".",
"type": "string"
},
"lowSeverity": {
"description": "LowSeverity is used as the severity label value for low severity generated alerts. Defaults to \"warning\".",
"type": "string"
},
"name": {
"description": "Name is used as the name of the alert generated by Pyrra. Defaults to \"ErrorBudgetBurn\".",
"type": "string"
8 changes: 8 additions & 0 deletions config/crd/bases/pyrra.dev_servicelevelobjectives.yaml
@@ -45,6 +45,14 @@ spec:
description: Disabled is used to disable the generation of alerts.
Recording rules are still generated.
type: boolean
highSeverity:
description: HighSeverity is used as the severity label value
for high severity generated alerts. Defaults to "critical".
type: string
lowSeverity:
description: LowSeverity is used as the severity label value for
low severity generated alerts. Defaults to "warning".
type: string
name:
description: Name is used as the name of the alert generated by
Pyrra. Defaults to "ErrorBudgetBurn".
16 changes: 16 additions & 0 deletions kubernetes/api/v1alpha1/servicelevelobjective_types.go
@@ -106,6 +106,14 @@ type Alerting struct {
// +optional
// Name is used as the name of the alert generated by Pyrra. Defaults to "ErrorBudgetBurn".
Name string `json:"name,omitempty"`

// +optional
// HighSeverity is used as the severity label value for high severity generated alerts. Defaults to "critical".
HighSeverity string `json:"highSeverity,omitempty"`

// +optional
// LowSeverity is used as the severity label value for low severity generated alerts. Defaults to "warning".
LowSeverity string `json:"lowSeverity,omitempty"`
}

type RatioIndicator struct {
@@ -162,6 +170,14 @@ func (in ServiceLevelObjective) Internal() (slo.Objective, error) {
alerting.Name = in.Spec.Alerting.Name
}

if in.Spec.Alerting.HighSeverity != "" {
alerting.HighSev = slo.Severity(in.Spec.Alerting.HighSeverity)
}

if in.Spec.Alerting.LowSeverity != "" {
alerting.LowSev = slo.Severity(in.Spec.Alerting.LowSeverity)
}

if in.Spec.ServiceLevelIndicator.Ratio != nil && in.Spec.ServiceLevelIndicator.Latency != nil {
return slo.Objective{}, fmt.Errorf("cannot have ratio and latency indicators at the same time")
}
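
Note for reviewers: the rule generation in slo/rules.go below calls o.AlertHighSeverity() and o.AlertLowSeverity(), which are not part of this diff. A rough sketch of what those accessors are assumed to look like in package slo, falling back to the exported defaults when no override is configured:

// Hypothetical sketch, not included in this diff: resolve the configured
// severity override for an objective, or fall back to the package defaults.
func (o Objective) AlertHighSeverity() Severity {
	if o.Alerting.HighSev != "" {
		return o.Alerting.HighSev
	}
	return critical
}

func (o Objective) AlertLowSeverity() Severity {
	if o.Alerting.LowSev != "" {
		return o.Alerting.LowSev
	}
	return warning
}
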
52 changes: 52 additions & 0 deletions kubernetes/api/v1alpha1/servicelevelobjective_types_test.go
@@ -322,6 +322,58 @@ spec:
},
},
},
{
config: `
apiVersion: pyrra.dev/v1alpha1
kind: ServiceLevelObjective
metadata:
name: custom-severity
namespace: monitoring
labels:
prometheus: k8s
role: alert-rules
spec:
target: 99
window: 2w
indicator:
ratio:
errors:
metric: prometheus_operator_reconcile_errors_total
total:
metric: prometheus_operator_reconcile_operations_total
alerting:
highSeverity: high
lowSeverity: low
`,
objective: slo.Objective{
Labels: labels.FromStrings(
labels.MetricName, "custom-severity",
"namespace", "monitoring",
),
Target: 0.99,
Window: model.Duration(14 * 24 * time.Hour),
Indicator: slo.Indicator{
Ratio: &slo.RatioIndicator{
Errors: slo.Metric{
Name: "prometheus_operator_reconcile_errors_total",
LabelMatchers: []*labels.Matcher{
{Type: labels.MatchEqual, Name: labels.MetricName, Value: "prometheus_operator_reconcile_errors_total"},
},
},
Total: slo.Metric{
Name: "prometheus_operator_reconcile_operations_total",
LabelMatchers: []*labels.Matcher{
{Type: labels.MatchEqual, Name: labels.MetricName, Value: "prometheus_operator_reconcile_operations_total"},
},
},
},
},
Alerting: slo.Alerting{
HighSev: "high",
LowSev: "low",
},
},
},
}

func TestServiceLevelObjective_Internal(t *testing.T) {
8 changes: 8 additions & 0 deletions slo/promql_test.go
@@ -38,6 +38,14 @@ var (
},
}
}
objectiveHTTPRatioWithCustomSeverity = func() Objective {
o := objectiveHTTPRatio()
o.Alerting = Alerting{
HighSev: "high",
LowSev: "low",
}
return o
}
objectiveHTTPRatioGrouping = func() Objective {
o := objectiveHTTPRatio()
o.Indicator.Ratio.Grouping = []string{"job", "handler"}
30 changes: 15 additions & 15 deletions slo/rules.go
@@ -27,7 +27,7 @@ type MultiBurnRateAlert struct {
}

func (o Objective) Alerts() ([]MultiBurnRateAlert, error) {
ws := Windows(time.Duration(o.Window))
ws := Windows(time.Duration(o.Window), o.AlertHighSeverity(), o.AlertLowSeverity())

mbras := make([]MultiBurnRateAlert, len(ws))
for i, w := range ws {
@@ -57,7 +57,7 @@ func (o Objective) Alerts() ([]MultiBurnRateAlert, error) {
func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {
sloName := o.Labels.Get(labels.MetricName)

ws := Windows(time.Duration(o.Window))
ws := Windows(time.Duration(o.Window), o.AlertHighSeverity(), o.AlertLowSeverity())
burnrates := burnratesFromWindows(ws)
rules := make([]monitoringv1.Rule, 0, len(burnrates))

@@ -581,7 +581,7 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
alertLabels[k] = v
}
// Add severity label for alerts
alertLabels["severity"] = string(critical)
alertLabels["severity"] = string(o.AlertHighSeverity())

rules = append(rules, monitoringv1.Rule{
Alert: "SLOMetricAbsent",
@@ -727,7 +727,7 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
alertLabels[k] = v
}
// Add severity label for alerts
alertLabels["severity"] = string(critical)
alertLabels["severity"] = string(o.AlertHighSeverity())

rules = append(rules, monitoringv1.Rule{
Alert: "SLOMetricAbsent",
@@ -752,7 +752,7 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
alertLabelsLe[k] = v
}
// Add severity label for alerts
alertLabelsLe["severity"] = string(critical)
alertLabelsLe["severity"] = string(o.AlertHighSeverity())

rules = append(rules, monitoringv1.Rule{
Alert: "SLOMetricAbsent",
@@ -857,7 +857,7 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
alertLabels[k] = v
}
// Add severity label for alerts
alertLabels["severity"] = string(critical)
alertLabels["severity"] = string(o.AlertHighSeverity())

rules = append(rules, monitoringv1.Rule{
Alert: "SLOMetricAbsent",
@@ -899,47 +899,47 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
}, nil
}

type severity string
type Severity string

const (
critical severity = "critical"
warning severity = "warning"
critical Severity = "critical"
warning Severity = "warning"
)

type Window struct {
Severity severity
Severity Severity
For time.Duration
Long time.Duration
Short time.Duration
Factor float64
}

func Windows(sloWindow time.Duration) []Window {
func Windows(sloWindow time.Duration, highSev, lowSev Severity) []Window {
// TODO: I'm still not sure if For, Long, Short should really be based on the 28 days ratio...

round := time.Minute // TODO: Change based on sloWindow

// long and short rates are calculated based on the ratio for 28 days.
return []Window{{
Severity: critical,
Severity: highSev,
For: (sloWindow / (28 * 24 * (60 / 2))).Round(round), // 2m for 28d - half short
Long: (sloWindow / (28 * 24)).Round(round), // 1h for 28d
Short: (sloWindow / (28 * 24 * (60 / 5))).Round(round), // 5m for 28d
Factor: 14, // error budget burn: 50% within a day
}, {
Severity: critical,
Severity: highSev,
For: (sloWindow / (28 * 24 * (60 / 15))).Round(round), // 15m for 28d - half short
Long: (sloWindow / (28 * (24 / 6))).Round(round), // 6h for 28d
Short: (sloWindow / (28 * 24 * (60 / 30))).Round(round), // 30m for 28d
Factor: 7, // error budget burn: 20% within a day / 100% within 5 days
}, {
Severity: warning,
Severity: lowSev,
For: (sloWindow / (28 * 24)).Round(round), // 1h for 28d - half short
Long: (sloWindow / 28).Round(round), // 1d for 28d
Short: (sloWindow / (28 * (24 / 2))).Round(round), // 2h for 28d
Factor: 2, // error budget burn: 10% within a day / 100% within 10 days
}, {
Severity: warning,
Severity: lowSev,
For: (sloWindow / (28 * (24 / 3))).Round(round), // 3h for 28d - half short
Long: (sloWindow / 7).Round(round), // 4d for 28d
Short: (sloWindow / (28 * (24 / 6))).Round(round), // 6h for 28d
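
Windows now takes the two severities as explicit parameters: callers that want the previous behaviour pass the exported defaults (as Test_windows below now does), while the rule generators pass the per-objective values. A sketch of both call patterns, assuming the accessor methods sketched above and package slo context:

// Sketch only: the two ways Windows is now invoked.
func exampleWindows(o Objective) ([]Window, []Window) {
	defaults := Windows(28*24*time.Hour, critical, warning)                                     // unchanged default behaviour
	overridden := Windows(time.Duration(o.Window), o.AlertHighSeverity(), o.AlertLowSeverity()) // per-SLO overrides
	return defaults, overridden
}
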
79 changes: 76 additions & 3 deletions slo/rules_test.go
@@ -70,6 +70,62 @@ func TestObjective_Burnrates(t *testing.T) {
Labels: map[string]string{"severity": "warning", "job": "thanos-receive-default", "long": "4d", "slo": "monitoring-http-errors", "short": "6h"},
}},
},
}, {
name: "http-ratio-custom-severity",
slo: objectiveHTTPRatioWithCustomSeverity(),
rules: monitoringv1.RuleGroup{
Name: "monitoring-http-errors",
Interval: "30s",
Rules: []monitoringv1.Rule{{
Record: "http_requests:burnrate5m",
Expr: intstr.FromString(`sum(rate(http_requests_total{code=~"5..",job="thanos-receive-default"}[5m])) / sum(rate(http_requests_total{job="thanos-receive-default"}[5m]))`),
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors"},
}, {
Record: "http_requests:burnrate30m",
Expr: intstr.FromString(`sum(rate(http_requests_total{code=~"5..",job="thanos-receive-default"}[30m])) / sum(rate(http_requests_total{job="thanos-receive-default"}[30m]))`),
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors"},
}, {
Record: "http_requests:burnrate1h",
Expr: intstr.FromString(`sum(rate(http_requests_total{code=~"5..",job="thanos-receive-default"}[1h])) / sum(rate(http_requests_total{job="thanos-receive-default"}[1h]))`),
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors"},
}, {
Record: "http_requests:burnrate2h",
Expr: intstr.FromString(`sum(rate(http_requests_total{code=~"5..",job="thanos-receive-default"}[2h])) / sum(rate(http_requests_total{job="thanos-receive-default"}[2h]))`),
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors"},
}, {
Record: "http_requests:burnrate6h",
Expr: intstr.FromString(`sum(rate(http_requests_total{code=~"5..",job="thanos-receive-default"}[6h])) / sum(rate(http_requests_total{job="thanos-receive-default"}[6h]))`),
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors"},
}, {
Record: "http_requests:burnrate1d",
Expr: intstr.FromString(`sum(rate(http_requests_total{code=~"5..",job="thanos-receive-default"}[1d])) / sum(rate(http_requests_total{job="thanos-receive-default"}[1d]))`),
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors"},
}, {
Record: "http_requests:burnrate4d",
Expr: intstr.FromString(`sum(rate(http_requests_total{code=~"5..",job="thanos-receive-default"}[4d])) / sum(rate(http_requests_total{job="thanos-receive-default"}[4d]))`),
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors"},
}, {
Alert: "ErrorBudgetBurn",
For: "2m",
Expr: intstr.FromString(`http_requests:burnrate5m{job="thanos-receive-default",slo="monitoring-http-errors"} > (14 * (1-0.99)) and http_requests:burnrate1h{job="thanos-receive-default",slo="monitoring-http-errors"} > (14 * (1-0.99))`),
Labels: map[string]string{"severity": "high", "job": "thanos-receive-default", "long": "1h", "slo": "monitoring-http-errors", "short": "5m"},
}, {
Alert: "ErrorBudgetBurn",
For: "15m",
Expr: intstr.FromString(`http_requests:burnrate30m{job="thanos-receive-default",slo="monitoring-http-errors"} > (7 * (1-0.99)) and http_requests:burnrate6h{job="thanos-receive-default",slo="monitoring-http-errors"} > (7 * (1-0.99))`),
Labels: map[string]string{"severity": "high", "job": "thanos-receive-default", "long": "6h", "slo": "monitoring-http-errors", "short": "30m"},
}, {
Alert: "ErrorBudgetBurn",
For: "1h",
Expr: intstr.FromString(`http_requests:burnrate2h{job="thanos-receive-default",slo="monitoring-http-errors"} > (2 * (1-0.99)) and http_requests:burnrate1d{job="thanos-receive-default",slo="monitoring-http-errors"} > (2 * (1-0.99))`),
Labels: map[string]string{"severity": "low", "job": "thanos-receive-default", "long": "1d", "slo": "monitoring-http-errors", "short": "2h"},
}, {
Alert: "ErrorBudgetBurn",
For: "3h",
Expr: intstr.FromString(`http_requests:burnrate6h{job="thanos-receive-default",slo="monitoring-http-errors"} > (1 * (1-0.99)) and http_requests:burnrate4d{job="thanos-receive-default",slo="monitoring-http-errors"} > (1 * (1-0.99))`),
Labels: map[string]string{"severity": "low", "job": "thanos-receive-default", "long": "4d", "slo": "monitoring-http-errors", "short": "6h"},
}},
},
}, {
name: "http-ratio-grouping",
slo: objectiveHTTPRatioGrouping(),
@@ -1040,7 +1096,7 @@ func TestObjective_Burnrates(t *testing.T) {
},
}}

require.Len(t, testcases, 19)
require.Len(t, testcases, 20)

for _, tc := range testcases {
t.Run(tc.name, func(t *testing.T) {
@@ -1150,6 +1206,23 @@ func TestObjective_IncreaseRules(t *testing.T) {
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors", "severity": "critical"},
}},
},
}, {
name: "http-ratio-custom-severity",
slo: objectiveHTTPRatioWithCustomSeverity(),
rules: monitoringv1.RuleGroup{
Name: "monitoring-http-errors-increase",
Interval: "2m30s",
Rules: []monitoringv1.Rule{{
Record: "http_requests:increase4w",
Expr: intstr.FromString(`sum by (code) (increase(http_requests_total{job="thanos-receive-default"}[4w]))`),
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors"},
}, {
Alert: "SLOMetricAbsent",
Expr: intstr.FromString(`absent(http_requests_total{job="thanos-receive-default"}) == 1`),
For: "2m",
Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors", "severity": "high"},
}},
},
}, {
name: "http-ratio-grouping",
slo: objectiveHTTPRatioGrouping(),
@@ -1487,7 +1560,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
},
}}

require.Len(t, testcases, 16)
require.Len(t, testcases, 17)

for _, tc := range testcases {
t.Run(tc.name, func(t *testing.T) {
@@ -1499,7 +1572,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
}

func Test_windows(t *testing.T) {
ws := Windows(28 * 24 * time.Hour)
ws := Windows(28*24*time.Hour, critical, warning)

require.Equal(t, Window{
Severity: critical,