背景概述
当我们进行告警中心系统开发时,遇到了告警唯一检索的问题。按照以前的开发模式,是对告警中的 label 计算 md5 得到唯一值来实现的。alertmanager 具有告警去重的功能,同样会涉及唯一值的问题,为此翻看了一下 alertmanager 源码,整理了告警指纹的实现。
物料准备
goland golang sdk 1.22版本 alertmanager源码
大致实现源码
provider/mem/mem.go
// Put writes the given alerts into the store. An alert whose fingerprint is
// already stored is merged with the stored one when their activity ranges
// overlap. Every alert that is stored successfully is then offered to each
// registered listener. Failures from the pre-store callback or from the store
// are logged and the affected alert is skipped; the returned error is always
// nil.
func (a *Alerts) Put(alerts ...*types.Alert) error {
	a.mtx.Lock()
	defer a.mtx.Unlock()

	for _, alert := range alerts {
		// The fingerprint is the key used to look the alert up in the store.
		fp := alert.Fingerprint()

		// Only attempt a merge when an alert with this fingerprint is
		// already stored.
		prev, getErr := a.alerts.Get(fp)
		existing := getErr == nil
		if existing {
			// The ranges overlap when either end of the new alert falls
			// strictly inside the stored alert's activity range.
			overlaps := alert.EndsAt.After(prev.StartsAt) && alert.EndsAt.Before(prev.EndsAt)
			if !overlaps {
				overlaps = alert.StartsAt.After(prev.StartsAt) && alert.StartsAt.Before(prev.EndsAt)
			}
			if overlaps {
				alert = prev.Merge(alert)
			}
		}

		if err := a.callback.PreStore(alert, existing); err != nil {
			level.Error(a.logger).Log("msg", "pre-store callback returned error on set alert", "err", err)
			continue
		}

		if err := a.alerts.Set(alert); err != nil {
			level.Error(a.logger).Log("msg", "error on set alert", "err", err)
			continue
		}

		a.callback.PostStore(alert, existing)

		// Deliver the alert to each listener, giving up on a listener as
		// soon as its done channel is ready.
		for _, sub := range a.listeners {
			select {
			case sub.alerts <- alert:
			case <-sub.done:
			}
		}
	}

	return nil
}
告警指纹
// labelSetToFingerprint computes the Fingerprint of a label set. An empty set
// yields emptyLabelSignature. Otherwise the label names are sorted, and each
// name and its value are fed into the hash in that order, each followed by
// SeparatorByte.
func labelSetToFingerprint(ls LabelSet) Fingerprint {
	if len(ls) == 0 {
		return Fingerprint(emptyLabelSignature)
	}

	// Collect and sort the names so the result does not depend on map
	// iteration order.
	names := make(LabelNames, 0, len(ls))
	for name := range ls {
		names = append(names, name)
	}
	sort.Sort(names)

	h := hashNew()
	for _, name := range names {
		h = hashAddByte(hashAdd(h, string(name)), SeparatorByte)
		h = hashAddByte(hashAdd(h, string(ls[name])), SeparatorByte)
	}
	return Fingerprint(h)
}
❝其中 Fingerprint 为 uint64 类型的哦。
alertmanager 发送的告警源数据
{
"receiver": "webhook-tpln",
"status": "firing",
"alerts": [
{
"status": "firing",
"labels": {
"alertname": "CPU使用情况",
"env": "aws-prod",
"instance": "ip-10-19-140-140.ap-southeast-1.compute.internal",
"origin_prometheus": "aws-prod",
"prometheus": "monitoring/k8s",
"severity": "一般告警",
"type": "ecs"
},
"annotations": {
"description": "ip-10-19-140-140.ap-southeast-1.compute.internal CPU使用率过高!当前值 2.633%",
"summary": "ip-10-19-140-140.ap-southeast-1.compute.internal CPU使用率过高!"
},
"startsAt": "2024-07-15T06:21:31.701Z",
"endsAt": "0001-01-01T00:00:00Z",
"fingerprint": "bee06cb3ccfa40b0"
}
],
"groupLabels": {
"alertname": "CPU使用情况",
"instance": "ip-10-19-140-140.ap-southeast-1.compute.internal"
},
"commonLabels": {
"alertname": "CPU使用情况",
"env": "aws-prod",
"instance": "ip-10-19-140-140.ap-southeast-1.compute.internal",
"origin_prometheus": "aws-prod",
"prometheus": "monitoring/k8s",
"severity": "一般告警",
"type": "ecs"
},
"commonAnnotations": {
"description": "ip-10-19-140-140.ap-southeast-1.compute.internal CPU使用率过高!当前值 2.633% ",
"summary": "ip-10-19-140-140.ap-southeast-1.compute.internal CPU使用率过高!"
},
"externalURL": "http://alertmanager-main-1:9093",
"version": "4",
"truncatedAlerts": 0
}
❝我们可以发现,其中的告警指纹并不是 uint64 类型,而是 string 类型的,我们接着看一下 webhook 相关的代码。
notify/webhook/webhook.go
// Notify posts the given alerts to the configured webhook as a single JSON
// message. The target URL comes from n.conf.URL when it is set, otherwise from
// the file named by n.conf.URLFile.
//
// It returns (false, err) when the message cannot be encoded or the URL file
// cannot be read, and (true, err) when the POST itself fails. Otherwise it
// returns the verdict of n.retrier.Check for the response.
func (n *Notifier) Notify(ctx context.Context, alerts ...*types.Alert) (bool, error) {
	alerts, numTruncated := truncateAlerts(n.conf.MaxAlerts, alerts)
	data := notify.GetTemplateData(ctx, n.tmpl, alerts, n.logger)

	groupKey, err := notify.ExtractGroupKey(ctx)
	if err != nil {
		// Only logged: the message is still built and sent.
		level.Error(n.logger).Log("err", err)
	}

	payload := &Message{
		Version:         "4",
		Data:            data,
		GroupKey:        groupKey.String(),
		TruncatedAlerts: numTruncated,
	}

	var body bytes.Buffer
	if err := json.NewEncoder(&body).Encode(payload); err != nil {
		return false, err
	}

	var endpoint string
	if n.conf.URL == nil {
		raw, err := os.ReadFile(n.conf.URLFile)
		if err != nil {
			return false, fmt.Errorf("read url_file: %w", err)
		}
		endpoint = strings.TrimSpace(string(raw))
	} else {
		endpoint = n.conf.URL.String()
	}

	resp, err := notify.PostJSON(ctx, n.client, endpoint, &body)
	if err != nil {
		return true, notify.RedactURL(err)
	}
	defer notify.Drain(resp)

	shouldRetry, err := n.retrier.Check(resp.StatusCode, resp.Body)
	if err != nil {
		return shouldRetry, notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(resp.StatusCode), err)
	}
	return shouldRetry, err
}
❝这里我们重点关注一下 data := notify.GetTemplateData(ctx, n.tmpl, alerts, n.logger) 这一行。
template/template.go
// Excerpt: build one template Alert per input alert and append it to
// data.Alerts.
for _, a := range types.Alerts(alerts...) {
alert := Alert{
Status: string(a.Status()),
Labels: make(KV, len(a.Labels)),
Annotations: make(KV, len(a.Annotations)),
StartsAt: a.StartsAt,
EndsAt: a.EndsAt,
GeneratorURL: a.GeneratorURL,
// The fingerprint is stored in its string form, obtained via String().
Fingerprint: a.Fingerprint().String(),
}
// Copy labels and annotations, converting keys and values to plain strings.
for k, v := range a.Labels {
alert.Labels[string(k)] = string(v)
}
for k, v := range a.Annotations {
alert.Annotations[string(k)] = string(v)
}
data.Alerts = append(data.Alerts, alert)
}
❝我们可以发现 template 对 uint64 类型的指纹进行了转换。
// String renders the fingerprint as a 16-digit, zero-padded, lowercase
// hexadecimal string.
func (f Fingerprint) String() string {
	value := uint64(f)
	return fmt.Sprintf("%016x", value)
}
❝上面是 uint64 转换为十六进制字符串的代码实现。
告警指纹完整实现
package main
import (
	"fmt"
	"sort"
)
// main builds the label set of the sample alert and prints its fingerprint as
// a 16-digit, zero-padded hexadecimal number.
func main() {
	labels := LabelSet{
		"alertname":         "CPU使用情况",
		"env":               "aws-prod",
		"instance":          "ip-10-19-140-140.ap-southeast-1.compute.internal",
		"origin_prometheus": "aws-prod",
		"prometheus":        "monitoring/k8s",
		"severity":          "一般告警",
		"type":              "ecs",
	}
	fmt.Printf("%016x\n", getFingerprint(labels))
}
// Alert fingerprint implementation.

// LabelName is the name (key) of a label.
type LabelName string

// LabelValue is the value of a label.
type LabelValue string

// LabelNames is a list of label names. It implements sort.Interface so that
// the names can be put into ascending byte-wise order before hashing.
type LabelNames []LabelName

// Len returns the number of label names.
func (l LabelNames) Len() int { return len(l) }

// Less reports whether the name at index i sorts before the name at index j.
func (l LabelNames) Less(i, j int) bool { return l[i] < l[j] }

// Swap exchanges the names at indexes i and j.
func (l LabelNames) Swap(i, j int) { l[i], l[j] = l[j], l[i] }

// LabelSet maps label names to label values.
type LabelSet map[LabelName]LabelValue

const (
	// offset64 is the initial hash value (the 64-bit FNV offset basis).
	offset64 = 14695981039346656037
	// prime64 is the per-byte multiplier (the 64-bit FNV prime).
	prime64 = 1099511628211
)

// SeparatorByte is mixed into the hash after every label name and every label
// value so that adjacent strings cannot run together (for example "ab"+"c"
// versus "a"+"bc"). 0xFF never occurs in valid UTF-8 text.
const SeparatorByte byte = 255

// Fingerprint is the 64-bit hash that identifies a label set.
type Fingerprint uint64

// getFingerprint returns the fingerprint of ls. Label names are sorted first,
// then each name and its value are hashed in that order, each followed by
// SeparatorByte. An empty or nil label set yields the initial hash value.
func getFingerprint(ls LabelSet) Fingerprint {
	labelNames := make(LabelNames, 0, len(ls))
	for labelName := range ls {
		labelNames = append(labelNames, labelName)
	}
	// Go randomizes map iteration order and the hash depends on the order in
	// which labels are fed in, so the names must be sorted to make the
	// fingerprint of a given label set stable.
	sort.Sort(labelNames)

	sum := hashNew()
	for _, labelName := range labelNames {
		sum = hashAdd(sum, string(labelName))
		sum = hashAddByte(sum, SeparatorByte)
		sum = hashAdd(sum, string(ls[labelName]))
		sum = hashAddByte(sum, SeparatorByte)
	}
	return Fingerprint(sum)
}

// hashNew returns the initial hash value.
func hashNew() uint64 {
	return offset64
}

// hashAdd mixes every byte of s into h (XOR the byte, then multiply by
// prime64) and returns the updated hash.
func hashAdd(h uint64, s string) uint64 {
	for i := 0; i < len(s); i++ {
		h ^= uint64(s[i])
		h *= prime64
	}
	return h
}

// hashAddByte mixes the single byte b into h and returns the updated hash.
func hashAddByte(h uint64, b byte) uint64 {
	h ^= uint64(b)
	h *= prime64
	return h
}
❝运行结果为:
bee06cb3ccfa40b0
总结
alertmanager 对于告警的实现大致为:将告警 label 的 key 和 value 转为告警指纹,然后作为 map 的 key,这样就可以实现告警的去重了,并通过本地缓存的方式进行加载。我们在进行告警中心开发时,也可以利用告警指纹来做唯一标识。
添加👇下面微信,拉你进群与大佬一起探讨云原生!