Files
kubesphere/pkg/models/alerting/rules/ruler.go
junotx a120969be7 optimize alerting rule concurrency
Signed-off-by: junotx <junotx@126.com>
2021-02-01 17:25:30 +08:00

566 lines
17 KiB
Go

package rules
import (
"context"
"fmt"
"net/http"
"sort"
"github.com/docker/docker/pkg/locker"
"github.com/ghodss/yaml"
"github.com/pkg/errors"
promresourcesv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
prominformersv1 "github.com/prometheus-operator/prometheus-operator/pkg/client/informers/externalversions/monitoring/v1"
promresourcesclient "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/util/retry"
"kubesphere.io/kubesphere/pkg/api/alerting/v2alpha1"
)
const (
customAlertingRuleResourcePrefix = "custom-alerting-rule-"
)
var (
maxSecretSize = corev1.MaxSecretSize
maxConfigMapDataSize = int(float64(maxSecretSize) * 0.45)
errOutOfConfigMapSize = errors.New("out of config map size")
)
type Ruler interface {
Namespace() string
RuleResourceNamespaceSelector() (labels.Selector, error)
RuleResourceSelector(extraRuleResourceSelector labels.Selector) (labels.Selector, error)
ExternalLabels() func() map[string]string
ListRuleResources(ruleNamespace *corev1.Namespace, extraRuleResourceSelector labels.Selector) (
[]*promresourcesv1.PrometheusRule, error)
AddAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace, extraRuleResourceSelector labels.Selector,
group string, rule *promresourcesv1.Rule, ruleResourceLabels map[string]string) error
UpdateAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace, extraRuleResourceSelector labels.Selector,
group string, rule *promresourcesv1.Rule, ruleResourceLabels map[string]string) error
DeleteAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace, extraRuleResourceSelector labels.Selector,
group string, name string) error
}
type ruleResource promresourcesv1.PrometheusRule
// deleteAlertingRule deletes the rules with the given name.
// If the rule is deleted, return true to indicate the resource should be updated.
func (r *ruleResource) deleteAlertingRule(name string) (bool, error) {
var (
nGroups []promresourcesv1.RuleGroup
ok bool
)
for _, g := range r.Spec.Groups {
var rules []promresourcesv1.Rule
for _, gr := range g.Rules {
if gr.Alert != "" && gr.Alert == name {
ok = true
continue
}
rules = append(rules, gr)
}
if len(rules) > 0 {
nGroups = append(nGroups, promresourcesv1.RuleGroup{
Name: g.Name,
Interval: g.Interval,
PartialResponseStrategy: g.PartialResponseStrategy,
Rules: rules,
})
}
}
if ok {
r.Spec.Groups = nGroups
}
return ok, nil
}
// updateAlertingRule updates the rule with the given group.
// If the rule is updated, return true to indicate the resource should be updated.
func (r *ruleResource) updateAlertingRule(groupName string, rule *promresourcesv1.Rule) (bool, error) {
var (
ok bool
pr = (promresourcesv1.PrometheusRule)(*r)
npr = pr.DeepCopy()
groupMap = make(map[string]*promresourcesv1.RuleGroup)
)
for _, g := range npr.Spec.Groups {
var rules []promresourcesv1.Rule
for i, gr := range g.Rules {
if gr.Alert != "" && gr.Alert == rule.Alert {
ok = true
continue
}
rules = append(rules, g.Rules[i])
}
if len(rules) > 0 {
groupMap[g.Name] = &promresourcesv1.RuleGroup{
Name: g.Name,
Interval: g.Interval,
PartialResponseStrategy: g.PartialResponseStrategy,
Rules: rules,
}
}
}
if ok {
if g, exist := groupMap[groupName]; exist {
g.Rules = append(g.Rules, *rule)
} else {
groupMap[groupName] = &promresourcesv1.RuleGroup{
Name: groupName,
Rules: []promresourcesv1.Rule{*rule},
}
}
var groups []promresourcesv1.RuleGroup
for _, g := range groupMap {
groups = append(groups, *g)
}
npr.Spec.Groups = groups
content, err := yaml.Marshal(npr.Spec)
if err != nil {
return false, errors.Wrap(err, "failed to unmarshal content")
}
if len(string(content)) < maxConfigMapDataSize { // check size limit
r.Spec.Groups = groups
return true, nil
}
return false, errOutOfConfigMapSize
}
return false, nil
}
func (r *ruleResource) addAlertingRule(group string, rule *promresourcesv1.Rule) (bool, error) {
var (
err error
pr = (promresourcesv1.PrometheusRule)(*r)
npr = pr.DeepCopy()
ok bool
)
for i := 0; i < len(npr.Spec.Groups); i++ {
if npr.Spec.Groups[i].Name == group {
npr.Spec.Groups[i].Rules = append(npr.Spec.Groups[i].Rules, *rule)
ok = true
break
}
}
if !ok { // add a group when there is no group with the specified group name
npr.Spec.Groups = append(npr.Spec.Groups, promresourcesv1.RuleGroup{
Name: group,
Rules: []promresourcesv1.Rule{*rule},
})
}
content, err := yaml.Marshal(npr.Spec)
if err != nil {
return false, errors.Wrap(err, "failed to unmarshal content")
}
if len(string(content)) < maxConfigMapDataSize { // check size limit
r.Spec.Groups = npr.Spec.Groups
return true, nil
} else {
return false, errOutOfConfigMapSize
}
}
func (r *ruleResource) commit(ctx context.Context, prometheusResourceClient promresourcesclient.Interface) error {
var pr = (promresourcesv1.PrometheusRule)(*r)
if len(pr.Spec.Groups) == 0 {
return prometheusResourceClient.MonitoringV1().PrometheusRules(r.Namespace).Delete(ctx, r.Name, metav1.DeleteOptions{})
}
newPr, err := prometheusResourceClient.MonitoringV1().PrometheusRules(r.Namespace).Update(ctx, &pr, metav1.UpdateOptions{})
if err != nil {
return err
}
newPr.DeepCopyInto(&pr)
return nil
}
type PrometheusRuler struct {
resource *promresourcesv1.Prometheus
informer prominformersv1.PrometheusRuleInformer
client promresourcesclient.Interface
}
func NewPrometheusRuler(resource *promresourcesv1.Prometheus, informer prominformersv1.PrometheusRuleInformer,
client promresourcesclient.Interface) Ruler {
return &PrometheusRuler{
resource: resource,
informer: informer,
client: client,
}
}
func (r *PrometheusRuler) Namespace() string {
return r.resource.Namespace
}
func (r *PrometheusRuler) RuleResourceNamespaceSelector() (labels.Selector, error) {
if r.resource.Spec.RuleNamespaceSelector == nil {
return nil, nil
}
return metav1.LabelSelectorAsSelector(r.resource.Spec.RuleNamespaceSelector)
}
func (r *PrometheusRuler) RuleResourceSelector(extraRuleResourceSelector labels.Selector) (labels.Selector, error) {
rSelector, err := metav1.LabelSelectorAsSelector(r.resource.Spec.RuleSelector)
if err != nil {
return nil, err
}
if extraRuleResourceSelector != nil {
if requirements, ok := extraRuleResourceSelector.Requirements(); ok {
rSelector = rSelector.Add(requirements...)
}
}
return rSelector, nil
}
func (r *PrometheusRuler) ExternalLabels() func() map[string]string {
// ignoring the external labels because rules gotten from prometheus endpoint do not include them
return nil
}
func (r *PrometheusRuler) ListRuleResources(ruleNamespace *corev1.Namespace, extraRuleResourceSelector labels.Selector) (
[]*promresourcesv1.PrometheusRule, error) {
selected, err := ruleNamespaceSelected(r, ruleNamespace)
if err != nil {
return nil, err
}
if !selected {
return nil, nil
}
rSelector, err := r.RuleResourceSelector(extraRuleResourceSelector)
if err != nil {
return nil, err
}
return r.informer.Lister().PrometheusRules(ruleNamespace.Name).List(rSelector)
}
func (r *PrometheusRuler) AddAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace,
extraRuleResourceSelector labels.Selector,
group string, rule *promresourcesv1.Rule, ruleResourceLabels map[string]string) error {
return errors.New("not supported to add rules for prometheus")
}
func (r *PrometheusRuler) UpdateAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace,
extraRuleResourceSelector labels.Selector,
group string, rule *promresourcesv1.Rule, ruleResourceLabels map[string]string) error {
return errors.New("not supported to update rules for prometheus")
}
func (r *PrometheusRuler) DeleteAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace,
extraRuleResourceSelector labels.Selector,
group string, name string) error {
return errors.New("not supported to update rules for prometheus")
}
type ThanosRuler struct {
resource *promresourcesv1.ThanosRuler
informer prominformersv1.PrometheusRuleInformer
client promresourcesclient.Interface
locker locker.Locker
}
func NewThanosRuler(resource *promresourcesv1.ThanosRuler, informer prominformersv1.PrometheusRuleInformer,
client promresourcesclient.Interface) Ruler {
return &ThanosRuler{
resource: resource,
informer: informer,
client: client,
}
}
func (r *ThanosRuler) Namespace() string {
return r.resource.Namespace
}
func (r *ThanosRuler) RuleResourceNamespaceSelector() (labels.Selector, error) {
if r.resource.Spec.RuleNamespaceSelector == nil {
return nil, nil
}
return metav1.LabelSelectorAsSelector(r.resource.Spec.RuleNamespaceSelector)
}
func (r *ThanosRuler) RuleResourceSelector(extraRuleSelector labels.Selector) (labels.Selector, error) {
rSelector, err := metav1.LabelSelectorAsSelector(r.resource.Spec.RuleSelector)
if err != nil {
return nil, err
}
if extraRuleSelector != nil {
if requirements, ok := extraRuleSelector.Requirements(); ok {
rSelector = rSelector.Add(requirements...)
}
}
return rSelector, nil
}
func (r *ThanosRuler) ExternalLabels() func() map[string]string {
// rules gotten from thanos ruler endpoint include the labels
lbls := make(map[string]string)
if ls := r.resource.Spec.Labels; ls != nil {
for k, v := range ls {
lbls[k] = v
}
}
return func() map[string]string {
return lbls
}
}
func (r *ThanosRuler) ListRuleResources(ruleNamespace *corev1.Namespace, extraRuleSelector labels.Selector) (
[]*promresourcesv1.PrometheusRule, error) {
selected, err := ruleNamespaceSelected(r, ruleNamespace)
if err != nil {
return nil, err
}
if !selected {
return nil, nil
}
rSelector, err := r.RuleResourceSelector(extraRuleSelector)
if err != nil {
return nil, err
}
return r.informer.Lister().PrometheusRules(ruleNamespace.Name).List(rSelector)
}
func (r *ThanosRuler) AddAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace,
extraRuleResourceSelector labels.Selector,
group string, rule *promresourcesv1.Rule, ruleResourceLabels map[string]string) error {
prometheusRules, err := r.ListRuleResources(ruleNamespace, extraRuleResourceSelector)
if err != nil {
return err
}
return r.addAlertingRule(ctx, ruleNamespace, prometheusRules, nil, group, rule, ruleResourceLabels)
}
func (r *ThanosRuler) addAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace,
prometheusRules []*promresourcesv1.PrometheusRule, excludePrometheusRules map[string]*promresourcesv1.PrometheusRule,
group string, rule *promresourcesv1.Rule, ruleResourceLabels map[string]string) error {
sort.Slice(prometheusRules, func(i, j int) bool {
return len(fmt.Sprint(prometheusRules[i])) <= len(fmt.Sprint(prometheusRules[j]))
})
for _, prometheusRule := range prometheusRules {
if len(excludePrometheusRules) > 0 {
if _, ok := excludePrometheusRules[prometheusRule.Name]; ok {
continue
}
}
if err := r.doRuleResourceOperation(prometheusRule, func(newerPr *promresourcesv1.PrometheusRule) error {
resource := ruleResource(*newerPr)
if ok, err := resource.addAlertingRule(group, rule); err != nil {
return err
} else if ok {
if err = resource.commit(ctx, r.client); err != nil {
return err
}
}
return nil
}); err != nil {
if err == errOutOfConfigMapSize {
break
} else if resourceNotFound(err) {
continue
}
return err
}
return nil
}
// create a new rule resource and add rule into it when all existing rule resources are full.
newPromRule := promresourcesv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{
Namespace: ruleNamespace.Name,
GenerateName: customAlertingRuleResourcePrefix,
Labels: ruleResourceLabels,
},
Spec: promresourcesv1.PrometheusRuleSpec{
Groups: []promresourcesv1.RuleGroup{{
Name: group,
Rules: []promresourcesv1.Rule{*rule},
}},
},
}
if _, err := r.client.MonitoringV1().
PrometheusRules(ruleNamespace.Name).Create(ctx, &newPromRule, metav1.CreateOptions{}); err != nil {
return errors.Wrapf(err, "error creating a prometheusrule resource %s/%s",
newPromRule.Namespace, newPromRule.Name)
}
return nil
}
func (r *ThanosRuler) UpdateAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace,
extraRuleResourceSelector labels.Selector,
group string, rule *promresourcesv1.Rule, ruleResourceLabels map[string]string) error {
prometheusRules, err := r.ListRuleResources(ruleNamespace, extraRuleResourceSelector)
if err != nil {
return err
}
var (
found bool
success bool
prsToDelRule = make(map[string]*promresourcesv1.PrometheusRule)
)
for i, prometheusRule := range prometheusRules {
if success { // If the update has been successful, delete the possible same rule in other resources
if err := r.doRuleResourceOperation(prometheusRule, func(newerPr *promresourcesv1.PrometheusRule) error {
resource := ruleResource(*newerPr)
if ok, err := resource.deleteAlertingRule(rule.Alert); err != nil {
return err
} else if ok {
if err = resource.commit(ctx, r.client); err != nil {
return err
}
}
return nil
}); err != nil && !resourceNotFound(err) {
return err
}
continue
}
if err := r.doRuleResourceOperation(prometheusRule, func(newerPr *promresourcesv1.PrometheusRule) error {
resource := ruleResource(*newerPr)
if ok, err := resource.updateAlertingRule(group, rule); err != nil {
return err
} else if ok {
if err = resource.commit(ctx, r.client); err != nil {
return err
}
}
return nil
}); err != nil {
if resourceNotFound(err) {
continue
} else if err == errOutOfConfigMapSize {
// updating the rule in the resource may oversize the size limit,
// so delete it and then add the new rule to a new resource.
prsToDelRule[prometheusRule.Name] = prometheusRules[i]
found = true
continue
}
return err
}
found = true
success = true
}
if !found {
return v2alpha1.ErrAlertingRuleNotFound
}
if !success {
err := r.addAlertingRule(ctx, ruleNamespace, prometheusRules, prsToDelRule, group, rule, ruleResourceLabels)
if err != nil {
return err
}
}
for _, pr := range prsToDelRule {
if err := r.doRuleResourceOperation(pr, func(newerPr *promresourcesv1.PrometheusRule) error {
resource := ruleResource(*newerPr)
if ok, err := resource.deleteAlertingRule(rule.Alert); err != nil {
return err
} else if ok {
if err = resource.commit(ctx, r.client); err != nil {
return err
}
}
return nil
}); err != nil && !resourceNotFound(err) {
return err
}
}
return nil
}
func (r *ThanosRuler) DeleteAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace,
extraRuleResourceSelector labels.Selector, group string, name string) error {
prometheusRules, err := r.ListRuleResources(ruleNamespace, extraRuleResourceSelector)
if err != nil {
return err
}
var success bool
for _, prometheusRule := range prometheusRules {
if err := r.doRuleResourceOperation(prometheusRule, func(newerPr *promresourcesv1.PrometheusRule) error {
resource := ruleResource(*newerPr)
if ok, err := resource.deleteAlertingRule(name); err != nil {
return err
} else if ok {
if err = resource.commit(ctx, r.client); err != nil {
return err
}
}
return nil
}); err != nil {
if resourceNotFound(err) {
continue
}
return err
}
success = true
}
if !success {
return v2alpha1.ErrAlertingRuleNotFound
}
return nil
}
func (r *ThanosRuler) doRuleResourceOperation(pr *promresourcesv1.PrometheusRule,
operation func(newerPr *promresourcesv1.PrometheusRule) error) error {
key := pr.Namespace + "/" + pr.Name
return retry.RetryOnConflict(retry.DefaultRetry, func() error {
r.locker.Lock(key)
defer r.locker.Unlock(key)
pr, err := r.informer.Lister().PrometheusRules(pr.Namespace).Get(pr.Name)
if err != nil {
return err
}
return operation(pr)
})
}
func ruleNamespaceSelected(r Ruler, ruleNamespace *corev1.Namespace) (bool, error) {
rnSelector, err := r.RuleResourceNamespaceSelector()
if err != nil {
return false, err
}
if rnSelector == nil { // refer to the comment of Prometheus.Spec.RuleResourceNamespaceSelector
if r.Namespace() != ruleNamespace.Name {
return false, nil
}
} else {
if !rnSelector.Matches(labels.Set(ruleNamespace.Labels)) {
return false, nil
}
}
return true, nil
}
func resourceNotFound(err error) bool {
switch e := err.(type) {
case *apierrors.StatusError:
if e.Status().Code == http.StatusNotFound {
return true
}
}
return false
}