背景概述
最近可能是心血来潮,也可能是突发奇想,想着把公司目前的告警平台完善一下,于是就想到了如何在告警平台里添加告警规则的功能,而且还需要展示已有的告警规则,以及配置文件等信息。
告警规则思路
1. 不设置告警规则,仅在平台侧使用 PromQL 进行告警。 2. 依旧使用 Prometheus 原有的告警规则,通过 initContainers 模式上报信息。 3. 修改 Prometheus 代码,使之支持将 rule 存储到数据库中。
关于第一和第二种模式我们就不解释了,比较简单,这里我们说一下第三种模式的大致实现。
实现大致流程
1. 源码下载 → 2. 定位告警规则加载方法 → 3. 修改该方法 → 4. 自定义数据模拟 → 5. 验证
源码下载以及启动
git clone https://github.com/prometheus/prometheus.git
这里我们使用goland
需要注意的是,这里需要使用 Go 1.21 版本,并且别忘记在项目根目录执行一次 go mod tidy。
定位加载告警方法
cmd/prometheus/main.go
// Construct the rule manager by handing rules.NewManager a ManagerOptions
// value assembled from the storage, query engine, notifier and cfg settings.
ruleManager = rules.NewManager(&rules.ManagerOptions{
Appendable: fanoutStorage,
Queryable: localStorage,
// Rule expressions are evaluated through the query engine against fanoutStorage.
QueryFunc: rules.EngineQueryFunc(queryEngine, fanoutStorage),
// Alerts are handed to the notifier manager together with the external URL.
NotifyFunc: rules.SendAlerts(notifierManager, cfg.web.ExternalURL.String()),
Context: ctxRule,
ExternalURL: cfg.web.ExternalURL,
Registerer: prometheus.DefaultRegisterer,
Logger: log.With(logger, "component", "rule manager"),
OutageTolerance: time.Duration(cfg.outageTolerance),
ForGracePeriod: time.Duration(cfg.forGracePeriod),
ResendDelay: time.Duration(cfg.resendDelay),
MaxConcurrentEvals: cfg.maxConcurrentEvals,
ConcurrentEvalsEnabled: cfg.enableConcurrentRuleEval,
// Supplied as a closure, so cfgFile.GlobalConfig.RuleQueryOffset is read
// each time the function is called rather than once at construction.
DefaultRuleQueryOffset: func() time.Duration {
return time.Duration(cfgFile.GlobalConfig.RuleQueryOffset)
},
})
> ruleManager 在 Prometheus 初始化时通过调用 rules.NewManager 方法完成构建,其类型为 *rules.Manager。
// ruleManager is a pointer to the rules.Manager type shown below.
ruleManager *rules.Manager
// Manager is the rule manager type: it carries the manager options, the
// rule groups it knows about, and locking/lifecycle fields.
type Manager struct {
// opts is handed to every Group created by LoadGroups (GroupOptions.Opts).
opts *ManagerOptions
// groups maps a string key to a Group.
// NOTE(review): presumably the key produced by GroupKey — confirm.
groups map[string]*Group
mtx sync.RWMutex
block chan struct{}
// done is handed to every Group created by LoadGroups (GroupOptions.done).
done chan struct{}
// restored is read by LoadGroups: groups are created with
// ShouldRestore set to !restored.
restored bool
logger log.Logger
}
加载告警组规则
// LoadGroups reads rule groups from the given files and builds one Group per
// group definition found.
//
// interval is the evaluation interval used for groups that do not set their
// own; externalLabels and externalURL are passed to every alerting rule;
// groupEvalIterationFunc is stored on each Group as its EvalIterationFunc;
// filenames are the files handed to the group loader.
//
// It returns a map keyed by GroupKey(file, group name). On the first load or
// parse failure it returns nil together with the error(s).
func (m *Manager) LoadGroups(
interval time.Duration, externalLabels labels.Labels, externalURL string, groupEvalIterationFunc GroupEvalIterationFunc, filenames ...string,
) (map[string]*Group, []error) {
groups := make(map[string]*Group)
shouldRestore := !m.restored
for _, fn := range filenames {
rgs, errs := m.opts.GroupLoader.Load(fn)
if errs != nil {
return nil, errs
}
for _, rg := range rgs.Groups {
// A non-zero per-group interval overrides the default passed in.
itv := interval
if rg.Interval != 0 {
itv = time.Duration(rg.Interval)
}
rules := make([]Rule, 0, len(rg.Rules))
for _, r := range rg.Rules {
expr, err := m.opts.GroupLoader.Parse(r.Expr.Value)
if err != nil {
// Prefix the parse error with the file name it came from.
return nil, []error{fmt.Errorf("%s: %w", fn, err)}
}
// A non-empty Alert value marks an alerting rule; anything else
// is treated as a recording rule.
if r.Alert.Value != "" {
rules = append(rules, NewAlertingRule(
r.Alert.Value,
expr,
time.Duration(r.For),
time.Duration(r.KeepFiringFor),
labels.FromMap(r.Labels),
labels.FromMap(r.Annotations),
externalLabels,
externalURL,
m.restored,
log.With(m.logger, "alert", r.Alert),
))
continue
}
rules = append(rules, NewRecordingRule(
r.Record.Value,
expr,
labels.FromMap(r.Labels),
))
}
// Check dependencies between rules and store it on the Rule itself.
m.opts.RuleDependencyController.AnalyseRules(rules)
groups[GroupKey(fn, rg.Name)] = NewGroup(GroupOptions{
Name: rg.Name,
File: fn,
Interval: itv,
Limit: rg.Limit,
Rules: rules,
ShouldRestore: shouldRestore,
Opts: m.opts,
QueryOffset: (*time.Duration)(rg.QueryOffset),
done: m.done,
EvalIterationFunc: groupEvalIterationFunc,
})
}
}
return groups, nil
}
> 这里主要是通过加载规则文件来生成对应的 rule。
修改LoadGroups
// RuleText is a flat, plain-typed description of a single alerting rule,
// used by the modified LoadGroups in place of a rule file entry.
type RuleText struct {
Name string // name of the group the rule belongs to
Fn string // category; LoadGroups uses it as the group's File
Interval int // evaluation interval; LoadGroups interprets it as seconds, 0 keeps the default
Alert string // alert name
Expr string // rule expression, parsed with parser.ParseExpr
For string // hold duration as a string such as "30s", parsed with model.ParseDuration
Labels map[string]string // labels (dimensions) attached to the rule
Annotations map[string]string // descriptive annotations attached to the rule
}
// LoadGroups builds rule groups from in-memory RuleText definitions instead
// of rule files. The definitions are currently a single hard-coded sample.
//
// Rules are bucketed by their (Fn, Name) pair, so every distinct pair becomes
// its own Group; Fn is used as the group's File and Name as its Name.
//
// interval is the evaluation interval for groups whose rules do not set one;
// externalLabels and externalURL are passed to every alerting rule;
// groupEvalIterationFunc is stored on each Group as its EvalIterationFunc;
// filenames is unused while the file-based loader below stays commented out.
//
// It returns a map keyed by GroupKey(file, name). On the first expression or
// duration parse failure it returns nil together with that error.
func (m *Manager) LoadGroups(
	interval time.Duration, externalLabels labels.Labels, externalURL string, groupEvalIterationFunc GroupEvalIterationFunc, filenames ...string,
) (map[string]*Group, []error) {
	groups := make(map[string]*Group)

	// Sample alerting rule standing in for externally stored definitions.
	var rule = RuleText{
		Name:        "test",
		Fn:          "alert",
		Interval:    20,
		Alert:       "test_alert",
		Expr:        "up{job=\"prometheus\"}==0",
		For:         "30s",
		Labels:      map[string]string{"severity": "1"},
		Annotations: map[string]string{"name": "zxl"},
	}
	rulelist := []RuleText{rule}

	shouldRestore := !m.restored

	// pendingGroup collects the rules that share one (Fn, Name) pair until
	// the Group can be built.
	type pendingGroup struct {
		name     string
		file     string
		interval time.Duration
		rules    []Rule
	}
	pending := make(map[string]*pendingGroup)

	for _, r := range rulelist {
		expr, err := parser.ParseExpr(r.Expr)
		if err != nil {
			return nil, []error{err}
		}
		dur, err := model.ParseDuration(r.For)
		if err != nil {
			return nil, []error{err}
		}

		// Key order is (file, name), the same order the file-based loader
		// uses, so a group's map key agrees with its File and Name fields.
		key := GroupKey(r.Fn, r.Name)
		pg, ok := pending[key]
		if !ok {
			pg = &pendingGroup{name: r.Name, file: r.Fn, interval: interval}
			pending[key] = pg
		}
		// RuleText.Interval is a plain integer number of seconds; zero keeps
		// the interval the group already has.
		if r.Interval != 0 {
			pg.interval = time.Duration(r.Interval) * time.Second
		}

		// NOTE(review): RuleText has no keep-firing-for field, so the `for`
		// duration is passed for both hold durations, as before.
		pg.rules = append(pg.rules, NewAlertingRule(
			r.Alert,
			expr,
			time.Duration(dur),
			time.Duration(dur),
			labels.FromMap(r.Labels),
			labels.FromMap(r.Annotations),
			externalLabels,
			externalURL,
			m.restored,
			log.With(m.logger, "alert", r.Alert),
		))
	}

	for key, pg := range pending {
		// Check dependencies between rules and store it on the Rule itself,
		// as the file-based loader does.
		m.opts.RuleDependencyController.AnalyseRules(pg.rules)

		groups[key] = NewGroup(GroupOptions{
			Name:              pg.name,
			File:              pg.file,
			Interval:          pg.interval,
			Rules:             pg.rules,
			ShouldRestore:     shouldRestore,
			Opts:              m.opts,
			done:              m.done,
			EvalIterationFunc: groupEvalIterationFunc,
		})
	}

	// Original file-based loading, kept for reference; uncomment to load
	// rule files in addition to the in-memory rules above.
	//for _, fn := range filenames {
	//	rgs, errs := m.opts.GroupLoader.Load(fn)
	//	if errs != nil {
	//		return nil, errs
	//	}
	//
	//	for _, rg := range rgs.Groups {
	//		itv := interval
	//		if rg.Interval != 0 {
	//			itv = time.Duration(rg.Interval)
	//		}
	//
	//		rules := make([]Rule, 0, len(rg.Rules))
	//		for _, r := range rg.Rules {
	//			expr, err := m.opts.GroupLoader.Parse(r.Expr.Value)
	//			if err != nil {
	//				return nil, []error{fmt.Errorf("%s: %w", fn, err)}
	//			}
	//
	//			if r.Alert.Value != "" {
	//				rules = append(rules, NewAlertingRule(
	//					r.Alert.Value,
	//					expr,
	//					time.Duration(r.For),
	//					time.Duration(r.KeepFiringFor),
	//					labels.FromMap(r.Labels),
	//					labels.FromMap(r.Annotations),
	//					externalLabels,
	//					externalURL,
	//					m.restored,
	//					log.With(m.logger, "alert", r.Alert),
	//				))
	//				continue
	//			}
	//			rules = append(rules, NewRecordingRule(
	//				r.Record.Value,
	//				expr,
	//				labels.FromMap(r.Labels),
	//			))
	//		}
	//
	//		// Check dependencies between rules and store it on the Rule itself.
	//		m.opts.RuleDependencyController.AnalyseRules(rules)
	//
	//		groups[GroupKey(fn, rg.Name)] = NewGroup(GroupOptions{
	//			Name:              rg.Name,
	//			File:              fn,
	//			Interval:          itv,
	//			Limit:             rg.Limit,
	//			Rules:             rules,
	//			ShouldRestore:     shouldRestore,
	//			Opts:              m.opts,
	//			QueryOffset:       (*time.Duration)(rg.QueryOffset),
	//			done:              m.done,
	//			EvalIterationFunc: groupEvalIterationFunc,
	//		})
	//	}
	//}
	return groups, nil
}
> 这里我们将原生的从文件加载规则的逻辑注释掉了,如果需要可以自行开启。
自定义数据模拟
// Sample rule: alert "test_alert" in group "test" (file "alert"), with a
// 20-second evaluation interval and a 30s hold duration.
var rule = RuleText{
Name: "test",
Fn: "alert",
Interval: 20,
Alert: "test_alert",
Expr: "up{job=\"prometheus\"}==0",
For: "30s",
Labels: map[string]string{"severity": "1"},
Annotations: map[string]string{"name": "zxl"},
}
> 这里就是我们自定义的一条告警规则。
验证
这里我们通过postman请求进行验证,就不使用web了,本机没有环境ヾ(≧▽≦*)o
返回内容
{
"status": "success",
"data": {
"groups": [
{
"name": "test",
"file": "alert",
"rules": [
{
"state": "inactive",
"name": "test_alert",
"query": "up{job=\"prometheus\"} == 0",
"duration": 30,
"keepFiringFor": 30,
"labels": {
"severity": "1"
},
"annotations": {
"name": "zxl"
},
"alerts": [],
"health": "ok",
"evaluationTime": 0.0006087,
"lastEvaluation": "2024-07-02T21:57:10.3285261+08:00",
"type": "alerting"
}
],
"interval": 20,
"limit": 0,
"evaluationTime": 0.0006087,
"lastEvaluation": "2024-07-02T21:57:10.3285261+08:00"
}
]
}
}
总结
我们本次使用的是直接定义了一个结构体的方式来验证的,那么我们是不是可以使用数据库的形式将其存储到DB里呢,这样我们就可以实现我们的需求了。关于如何使用数据库存储,我们下次分享再给大家呈现。
添加👇下面微信,拉你进群与大佬一起探讨云原生!