diff --git a/pkg/api/alerting/v2alpha1/types.go b/pkg/api/alerting/v2alpha1/types.go index b2a895709..fbd37ff88 100644 --- a/pkg/api/alerting/v2alpha1/types.go +++ b/pkg/api/alerting/v2alpha1/types.go @@ -17,7 +17,7 @@ limitations under the License. package v2alpha1 import ( - "regexp" + "context" "sort" "strconv" "strings" @@ -26,21 +26,27 @@ import ( "github.com/emicklei/go-restful" "github.com/pkg/errors" prommodel "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/pkg/timestamp" "github.com/prometheus/prometheus/promql/parser" + "github.com/prometheus/prometheus/template" utilerrors "k8s.io/apimachinery/pkg/util/errors" ) const ( RuleLevelCluster RuleLevel = "cluster" RuleLevelNamespace RuleLevel = "namespace" + + AnnotationKeyRuleUpdateTime = "rule_update_time" ) var ( ErrThanosRulerNotEnabled = errors.New("The request operation to custom alerting rule could not be done because thanos ruler is not enabled") ErrAlertingRuleNotFound = errors.New("The alerting rule was not found") ErrAlertingRuleAlreadyExists = errors.New("The alerting rule already exists") + ErrAlertingAPIV2NotEnabled = errors.New("The alerting v2 API is not enabled") - ruleLabelNameMatcher = regexp.MustCompile(`[a-zA-Z_][a-zA-Z0-9_]*`) + templateTestData = template.AlertTemplateData(map[string]string{}, map[string]string{}, 0) + templateTestTextPrefix = "{{$labels := .Labels}}{{$externalLabels := .ExternalLabels}}{{$value := .Value}}" ) type RuleLevel string @@ -65,7 +71,10 @@ func (r *PostableAlertingRule) Validate() error { if r.Name == "" { errs = append(errs, errors.New("name can not be empty")) } - if _, err := parser.ParseExpr(r.Query); err != nil { + + if r.Query == "" { + errs = append(errs, errors.New("query can not be empty")) + } else if _, err := parser.ParseExpr(r.Query); err != nil { errs = append(errs, errors.Wrapf(err, "query is invalid: %s", r.Query)) } if r.Duration != "" { @@ -74,12 +83,42 @@ func (r *PostableAlertingRule) Validate() error { } } + parseTest := func(text string) error { + tmpl := template.NewTemplateExpander( + context.TODO(), + templateTestTextPrefix+text, + "__alert_"+r.Name, + templateTestData, + prommodel.Time(timestamp.FromTime(time.Now())), + nil, + nil, + ) + return tmpl.ParseTest() + } + if len(r.Labels) > 0 { - for name, _ := range r.Labels { - if !ruleLabelNameMatcher.MatchString(name) || strings.HasPrefix(name, "__") { + for name, v := range r.Labels { + if !prommodel.LabelName(name).IsValid() || strings.HasPrefix(name, "__") { errs = append(errs, errors.Errorf( "label name (%s) is not valid. The name must match [a-zA-Z_][a-zA-Z0-9_]* and has not the __ prefix (label names with this prefix are for internal use)", name)) } + if !prommodel.LabelValue(v).IsValid() { + errs = append(errs, errors.Errorf("invalid label value: %s", v)) + } + if err := parseTest(v); err != nil { + errs = append(errs, errors.Errorf("invalid label value: %s", v)) + } + } + } + + if len(r.Annotations) > 0 { + for name, v := range r.Annotations { + if !prommodel.LabelName(name).IsValid() { + errs = append(errs, errors.Errorf("invalid annotation name: %s", v)) + } + if err := parseTest(v); err != nil { + errs = append(errs, errors.Errorf("invalid annotation value: %s", v)) + } } } @@ -126,7 +165,7 @@ type AlertingRuleQueryParams struct { LabelEqualFilters map[string]string LabelContainFilters map[string]string - Offset int + PageNum int Limit int SortField string SortType string @@ -180,10 +219,22 @@ func AlertingRuleIdCompare(leftId, rightId string) bool { } func (q *AlertingRuleQueryParams) Sort(rules []*GettableAlertingRule) { - idCompare := func(left, right *GettableAlertingRule) bool { + baseCompare := func(left, right *GettableAlertingRule) bool { + var leftUpdateTime, rightUpdateTime string + if len(left.Annotations) > 0 { + leftUpdateTime = left.Annotations[AnnotationKeyRuleUpdateTime] + } + if len(right.Annotations) > 0 { + rightUpdateTime = right.Annotations[AnnotationKeyRuleUpdateTime] + } + + if leftUpdateTime != rightUpdateTime { + return leftUpdateTime > rightUpdateTime + } + return AlertingRuleIdCompare(left.Id, right.Id) } - var compare = idCompare + var compare = baseCompare if q != nil { reverse := q.SortType == "desc" switch q.SortField { @@ -195,7 +246,7 @@ func (q *AlertingRuleQueryParams) Sort(rules []*GettableAlertingRule) { } return c < 0 } - return idCompare(left, right) + return baseCompare(left, right) } case "lastEvaluation": compare = func(left, right *GettableAlertingRule) bool { @@ -213,7 +264,7 @@ func (q *AlertingRuleQueryParams) Sort(rules []*GettableAlertingRule) { return left.LastEvaluation.Before(*right.LastEvaluation) } } - return idCompare(left, right) + return baseCompare(left, right) } case "evaluationTime": compare = func(left, right *GettableAlertingRule) bool { @@ -223,7 +274,7 @@ func (q *AlertingRuleQueryParams) Sort(rules []*GettableAlertingRule) { } return left.EvaluationDurationSeconds < right.EvaluationDurationSeconds } - return idCompare(left, right) + return baseCompare(left, right) } } } @@ -235,7 +286,7 @@ func (q *AlertingRuleQueryParams) Sort(rules []*GettableAlertingRule) { func (q *AlertingRuleQueryParams) Sub(rules []*GettableAlertingRule) []*GettableAlertingRule { start, stop := 0, 10 if q != nil { - start, stop = q.Offset, q.Offset+q.Limit + start, stop = (q.PageNum-1)*q.Limit, q.PageNum*q.Limit } total := len(rules) if start < total { @@ -252,8 +303,8 @@ type AlertQueryParams struct { LabelEqualFilters map[string]string LabelContainFilters map[string]string - Offset int - Limit int + PageNum int + Limit int } func (q *AlertQueryParams) Filter(alerts []*Alert) []*Alert { @@ -312,7 +363,7 @@ func (q *AlertQueryParams) Sort(alerts []*Alert) { func (q *AlertQueryParams) Sub(alerts []*Alert) []*Alert { start, stop := 0, 10 if q != nil { - start, stop = q.Offset, q.Offset+q.Limit + start, stop = (q.PageNum-1)*q.Limit, q.PageNum*q.Limit } total := len(alerts) if start < total { @@ -333,7 +384,14 @@ func ParseAlertingRuleQueryParams(req *restful.Request) (*AlertingRuleQueryParam q.NameContainFilter = req.QueryParameter("name") q.State = req.QueryParameter("state") q.Health = req.QueryParameter("health") - q.Offset, _ = strconv.Atoi(req.QueryParameter("offset")) + q.PageNum, err = strconv.Atoi(req.QueryParameter("page")) + if err != nil { + q.PageNum = 1 + err = nil + } + if q.PageNum <= 0 { + q.PageNum = 1 + } q.Limit, err = strconv.Atoi(req.QueryParameter("limit")) if err != nil { q.Limit = 10 @@ -352,7 +410,14 @@ func ParseAlertQueryParams(req *restful.Request) (*AlertQueryParams, error) { ) q.State = req.QueryParameter("state") - q.Offset, _ = strconv.Atoi(req.QueryParameter("offset")) + q.PageNum, err = strconv.Atoi(req.QueryParameter("page")) + if err != nil { + q.PageNum = 1 + err = nil + } + if q.PageNum <= 0 { + q.PageNum = 1 + } q.Limit, err = strconv.Atoi(req.QueryParameter("limit")) if err != nil { q.Limit = 10 diff --git a/pkg/kapis/alerting/v2alpha1/register.go b/pkg/kapis/alerting/v2alpha1/register.go index 1e32d3fb6..95ddba912 100644 --- a/pkg/kapis/alerting/v2alpha1/register.go +++ b/pkg/kapis/alerting/v2alpha1/register.go @@ -41,10 +41,24 @@ func AddToContainer(container *restful.Container, informers informers.InformerFa promResourceClient promresourcesclient.Interface, ruleClient alerting.RuleClient, option *alerting.Options) error { - handler := newHandler(informers, promResourceClient, ruleClient, option) - ws := runtime.NewWebService(GroupVersion) + if informers == nil || promResourceClient == nil || ruleClient == nil || option == nil { + h := func(req *restful.Request, resp *restful.Response) { + ksapi.HandleBadRequest(resp, nil, alertingv2alpha1.ErrAlertingAPIV2NotEnabled) + return + } + ws.Route(ws.GET("/{path:*}").To(h).Returns(http.StatusOK, ksapi.StatusOK, nil)) + ws.Route(ws.PUT("/{path:*}").To(h).Returns(http.StatusOK, ksapi.StatusOK, nil)) + ws.Route(ws.POST("/{path:*}").To(h).Returns(http.StatusOK, ksapi.StatusOK, nil)) + ws.Route(ws.DELETE("/{path:*}").To(h).Returns(http.StatusOK, ksapi.StatusOK, nil)) + ws.Route(ws.PATCH("/{path:*}").To(h).Returns(http.StatusOK, ksapi.StatusOK, nil)) + container.Add(ws) + return nil + } + + handler := newHandler(informers, promResourceClient, ruleClient, option) + ws.Route(ws.GET("/rules"). To(handler.handleListCustomAlertingRules). Doc("list the cluster-level custom alerting rules"). @@ -54,7 +68,7 @@ func AddToContainer(container *restful.Container, informers informers.InformerFa Param(ws.QueryParameter("label_filters", "label filters, concatenating multiple filters with commas, equal symbol for exact query, wave symbol for fuzzy query e.g. name~a").DataFormat("key=%s,key~%s")). Param(ws.QueryParameter("sort_field", "sort field, one of `name`, `lastEvaluation`, `evaluationTime`")). Param(ws.QueryParameter("sort_type", "sort type, one of `asc`, `desc`")). - Param(ws.QueryParameter("offset", "offset of the result set").DataType("integer").DefaultValue("0")). + Param(ws.QueryParameter("page", "page of the result set").DataType("integer").DefaultValue("1")). Param(ws.QueryParameter("limit", "limit size of the result set").DataType("integer").DefaultValue("10")). Returns(http.StatusOK, ksapi.StatusOK, alertingv2alpha1.GettableAlertingRuleList{}). Metadata(restfulspec.KeyOpenAPITags, []string{constants.AlertingTag})) @@ -64,7 +78,7 @@ func AddToContainer(container *restful.Container, informers informers.InformerFa Doc("list the alerts of the cluster-level custom alerting rules"). Param(ws.QueryParameter("state", "state, one of `firing`, `pending`, `inactive`")). Param(ws.QueryParameter("label_filters", "label filters, concatenating multiple filters with commas, equal symbol for exact query, wave symbol for fuzzy query e.g. name~a").DataFormat("key=%s,key~%s")). - Param(ws.QueryParameter("offset", "offset of the result set").DataType("integer").DefaultValue("0")). + Param(ws.QueryParameter("page", "page of the result set").DataType("integer").DefaultValue("1")). Param(ws.QueryParameter("limit", "limit size of the result set").DataType("integer").DefaultValue("10")). Returns(http.StatusOK, ksapi.StatusOK, alertingv2alpha1.AlertList{}). Metadata(restfulspec.KeyOpenAPITags, []string{constants.AlertingTag})) @@ -78,7 +92,7 @@ func AddToContainer(container *restful.Container, informers informers.InformerFa ws.Route(ws.GET("/rules/{rule_name}/alerts"). To(handler.handleListCustomRuleAlerts). Doc("list the alerts of the cluster-level custom alerting rule with the specified name"). - Returns(http.StatusOK, ksapi.StatusOK, []alertingv2alpha1.Alert{}). + Returns(http.StatusOK, ksapi.StatusOK, alertingv2alpha1.AlertList{}). Metadata(restfulspec.KeyOpenAPITags, []string{constants.AlertingTag})) ws.Route(ws.POST("/rules"). @@ -110,7 +124,7 @@ func AddToContainer(container *restful.Container, informers informers.InformerFa Param(ws.QueryParameter("label_filters", "label filters, concatenating multiple filters with commas, equal symbol for exact query, wave symbol for fuzzy query e.g. name~a").DataFormat("key=%s,key~%s")). Param(ws.QueryParameter("sort_field", "sort field, one of `name`, `lastEvaluation`, `evaluationTime`")). Param(ws.QueryParameter("sort_type", "sort type, one of `asc`, `desc`")). - Param(ws.QueryParameter("offset", "offset of the result set").DataType("integer").DefaultValue("0")). + Param(ws.QueryParameter("page", "page of the result set").DataType("integer").DefaultValue("1")). Param(ws.QueryParameter("limit", "limit size of the result set").DataType("integer").DefaultValue("10")). Returns(http.StatusOK, ksapi.StatusOK, alertingv2alpha1.GettableAlertingRuleList{}). Metadata(restfulspec.KeyOpenAPITags, []string{constants.AlertingTag})) @@ -120,7 +134,7 @@ func AddToContainer(container *restful.Container, informers informers.InformerFa Doc("list the alerts of the custom alerting rules in the specified namespace."). Param(ws.QueryParameter("state", "state, one of `firing`, `pending`, `inactive`")). Param(ws.QueryParameter("label_filters", "label filters, concatenating multiple filters with commas, equal symbol for exact query, wave symbol for fuzzy query e.g. name~a").DataFormat("key=%s,key~%s")). - Param(ws.QueryParameter("offset", "offset of the result set").DataType("integer").DefaultValue("0")). + Param(ws.QueryParameter("page", "page of the result set").DataType("integer").DefaultValue("1")). Param(ws.QueryParameter("limit", "limit size of the result set").DataType("integer").DefaultValue("10")). Returns(http.StatusOK, ksapi.StatusOK, alertingv2alpha1.AlertList{}). Metadata(restfulspec.KeyOpenAPITags, []string{constants.AlertingTag})) @@ -134,7 +148,7 @@ func AddToContainer(container *restful.Container, informers informers.InformerFa ws.Route(ws.GET("/namespaces/{namespace}/rules/{rule_name}/alerts"). To(handler.handleListCustomRuleAlerts). Doc("get the alerts of the custom alerting rule with the specified name in the specified namespace"). - Returns(http.StatusOK, ksapi.StatusOK, []alertingv2alpha1.Alert{}). + Returns(http.StatusOK, ksapi.StatusOK, alertingv2alpha1.AlertList{}). Metadata(restfulspec.KeyOpenAPITags, []string{constants.AlertingTag})) ws.Route(ws.POST("/namespaces/{namespace}/rules"). @@ -166,7 +180,7 @@ func AddToContainer(container *restful.Container, informers informers.InformerFa Param(ws.QueryParameter("label_filters", "label filters, concatenating multiple filters with commas, equal symbol for exact query, wave symbol for fuzzy query e.g. name~a").DataFormat("key=%s,key~%s")). Param(ws.QueryParameter("sort_field", "sort field, one of `name`, `lastEvaluation`, `evaluationTime`")). Param(ws.QueryParameter("sort_type", "sort type, one of `asc`, `desc`")). - Param(ws.QueryParameter("offset", "offset of the result set").DataType("integer").DefaultValue("0")). + Param(ws.QueryParameter("page", "page of the result set").DataType("integer").DefaultValue("1")). Param(ws.QueryParameter("limit", "limit size of the result set").DataType("integer").DefaultValue("10")). Returns(http.StatusOK, ksapi.StatusOK, alertingv2alpha1.GettableAlertingRuleList{}). Metadata(restfulspec.KeyOpenAPITags, []string{constants.AlertingTag})) @@ -176,7 +190,7 @@ func AddToContainer(container *restful.Container, informers informers.InformerFa Doc("list the alerts of the builtin(non-custom) rules"). Param(ws.QueryParameter("state", "state, one of `firing`, `pending`, `inactive`")). Param(ws.QueryParameter("label_filters", "label filters, concatenating multiple filters with commas, equal symbol for exact query, wave symbol for fuzzy query e.g. name~a").DataFormat("key=%s,key~%s")). - Param(ws.QueryParameter("offset", "offset of the result set").DataType("integer").DefaultValue("0")). + Param(ws.QueryParameter("page", "page of the result set").DataType("integer").DefaultValue("1")). Param(ws.QueryParameter("limit", "limit size of the result set").DataType("integer").DefaultValue("10")). Returns(http.StatusOK, ksapi.StatusOK, alertingv2alpha1.AlertList{}). Metadata(restfulspec.KeyOpenAPITags, []string{constants.AlertingTag})) @@ -190,7 +204,7 @@ func AddToContainer(container *restful.Container, informers informers.InformerFa ws.Route(ws.GET("/builtin/rules/{rule_id}/alerts"). To(handler.handleListBuiltinRuleAlerts). Doc("list the alerts of the builtin(non-custom) alerting rule with the specified id"). - Returns(http.StatusOK, ksapi.StatusOK, []alertingv2alpha1.Alert{}). + Returns(http.StatusOK, ksapi.StatusOK, alertingv2alpha1.AlertList{}). Metadata(restfulspec.KeyOpenAPITags, []string{constants.AlertingTag})) container.Add(ws) diff --git a/pkg/models/alerting/alerting.go b/pkg/models/alerting/alerting.go index 3c40ed01c..3a4b14d4a 100644 --- a/pkg/models/alerting/alerting.go +++ b/pkg/models/alerting/alerting.go @@ -4,6 +4,7 @@ import ( "context" "sort" "strings" + "time" "github.com/pkg/errors" promresourcesv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" @@ -48,7 +49,7 @@ type Operator interface { // GetCustomAlertingRule gets the custom alerting rule with the given name. GetCustomAlertingRule(ctx context.Context, namespace, ruleName string) (*v2alpha1.GettableAlertingRule, error) // ListCustomRuleAlerts lists the alerts of the custom alerting rule with the given name. - ListCustomRuleAlerts(ctx context.Context, namespace, ruleName string) ([]*v2alpha1.Alert, error) + ListCustomRuleAlerts(ctx context.Context, namespace, ruleName string) (*v2alpha1.AlertList, error) // CreateCustomAlertingRule creates a custom alerting rule. CreateCustomAlertingRule(ctx context.Context, namespace string, rule *v2alpha1.PostableAlertingRule) error // UpdateCustomAlertingRule updates the custom alerting rule with the given name. @@ -65,7 +66,7 @@ type Operator interface { // GetBuiltinAlertingRule gets the builtin(non-custom) alerting rule with the given id GetBuiltinAlertingRule(ctx context.Context, ruleId string) (*v2alpha1.GettableAlertingRule, error) // ListBuiltinRuleAlerts lists the alerts of the builtin(non-custom) alerting rule with the given id - ListBuiltinRuleAlerts(ctx context.Context, ruleId string) ([]*v2alpha1.Alert, error) + ListBuiltinRuleAlerts(ctx context.Context, ruleId string) (*v2alpha1.AlertList, error) } func NewOperator(informers informers.InformerFactory, @@ -184,7 +185,7 @@ func (o *operator) GetCustomAlertingRule(ctx context.Context, namespace, ruleNam } func (o *operator) ListCustomRuleAlerts(ctx context.Context, namespace, ruleName string) ( - []*v2alpha1.Alert, error) { + *v2alpha1.AlertList, error) { rule, err := o.GetCustomAlertingRule(ctx, namespace, ruleName) if err != nil { @@ -193,7 +194,10 @@ func (o *operator) ListCustomRuleAlerts(ctx context.Context, namespace, ruleName if rule == nil { return nil, v2alpha1.ErrAlertingRuleNotFound } - return rule.Alerts, nil + return &v2alpha1.AlertList{ + Total: len(rule.Alerts), + Items: rule.Alerts, + }, nil } func (o *operator) ListBuiltinAlertingRules(ctx context.Context, @@ -223,7 +227,7 @@ func (o *operator) GetBuiltinAlertingRule(ctx context.Context, ruleId string) ( return o.getBuiltinAlertingRule(ctx, ruleId) } -func (o *operator) ListBuiltinRuleAlerts(ctx context.Context, ruleId string) ([]*v2alpha1.Alert, error) { +func (o *operator) ListBuiltinRuleAlerts(ctx context.Context, ruleId string) (*v2alpha1.AlertList, error) { rule, err := o.getBuiltinAlertingRule(ctx, ruleId) if err != nil { return nil, err @@ -231,7 +235,10 @@ func (o *operator) ListBuiltinRuleAlerts(ctx context.Context, ruleId string) ([] if rule == nil { return nil, v2alpha1.ErrAlertingRuleNotFound } - return rule.Alerts, nil + return &v2alpha1.AlertList{ + Total: len(rule.Alerts), + Items: rule.Alerts, + }, nil } func (o *operator) ListClusterAlertingRules(ctx context.Context, customFlag string, @@ -464,6 +471,8 @@ func (o *operator) CreateCustomAlertingRule(ctx context.Context, namespace strin return v2alpha1.ErrAlertingRuleAlreadyExists } + setRuleUpdateTime(rule, time.Now()) + return ruler.AddAlertingRule(ctx, ruleNamespace, extraRuleResourceSelector, customRuleGroupDefault, parseToPrometheusRule(rule), ruleResourceLabels) } @@ -515,6 +524,8 @@ func (o *operator) UpdateCustomAlertingRule(ctx context.Context, namespace, name return v2alpha1.ErrAlertingRuleNotFound } + setRuleUpdateTime(rule, time.Now()) + return ruler.UpdateAlertingRule(ctx, ruleNamespace, extraRuleResourceSelector, resourceRule.Group, parseToPrometheusRule(rule), ruleResourceLabels) } @@ -627,3 +638,13 @@ func pageAlerts(alertingRules []*v2alpha1.GettableAlertingRule, Items: queryParams.Sub(alerts), } } + +func setRuleUpdateTime(rule *v2alpha1.PostableAlertingRule, t time.Time) { + if rule.Annotations == nil { + rule.Annotations = make(map[string]string) + } + if t.IsZero() { + t = time.Now() + } + rule.Annotations[v2alpha1.AnnotationKeyRuleUpdateTime] = t.UTC().Format(time.RFC3339) +} diff --git a/pkg/models/alerting/rules/ruler.go b/pkg/models/alerting/rules/ruler.go index 887a6e899..5741ffa28 100644 --- a/pkg/models/alerting/rules/ruler.go +++ b/pkg/models/alerting/rules/ruler.go @@ -3,16 +3,20 @@ package rules import ( "context" "fmt" + "net/http" "sort" + "github.com/docker/docker/pkg/locker" "github.com/ghodss/yaml" "github.com/pkg/errors" promresourcesv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" prominformersv1 "github.com/prometheus-operator/prometheus-operator/pkg/client/informers/externalversions/monitoring/v1" promresourcesclient "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/util/retry" "kubesphere.io/kubesphere/pkg/api/alerting/v2alpha1" ) @@ -268,6 +272,7 @@ type ThanosRuler struct { resource *promresourcesv1.ThanosRuler informer prominformersv1.PrometheusRuleInformer client promresourcesclient.Interface + locker locker.Locker } func NewThanosRuler(resource *promresourcesv1.ThanosRuler, informer prominformersv1.PrometheusRuleInformer, @@ -345,7 +350,7 @@ func (r *ThanosRuler) AddAlertingRule(ctx context.Context, ruleNamespace *corev1 } func (r *ThanosRuler) addAlertingRule(ctx context.Context, ruleNamespace *corev1.Namespace, - prometheusRules []*promresourcesv1.PrometheusRule, excludeRuleResources map[string]*ruleResource, + prometheusRules []*promresourcesv1.PrometheusRule, excludePrometheusRules map[string]*promresourcesv1.PrometheusRule, group string, rule *promresourcesv1.Rule, ruleResourceLabels map[string]string) error { sort.Slice(prometheusRules, func(i, j int) bool { @@ -353,23 +358,30 @@ func (r *ThanosRuler) addAlertingRule(ctx context.Context, ruleNamespace *corev1 }) for _, prometheusRule := range prometheusRules { - if len(excludeRuleResources) > 0 { - if _, ok := excludeRuleResources[prometheusRule.Name]; ok { + if len(excludePrometheusRules) > 0 { + if _, ok := excludePrometheusRules[prometheusRule.Name]; ok { continue } } - resource := ruleResource(*prometheusRule) - if ok, err := resource.addAlertingRule(group, rule); err != nil { - if err == errOutOfConfigMapSize { - break - } - return err - } else if ok { - if err = resource.commit(ctx, r.client); err != nil { + if err := r.doRuleResourceOperation(ctx, prometheusRule, func(pr *promresourcesv1.PrometheusRule) error { + resource := ruleResource(*pr) + if ok, err := resource.addAlertingRule(group, rule); err != nil { return err + } else if ok { + if err = resource.commit(ctx, r.client); err != nil { + return err + } } return nil + }); err != nil { + if err == errOutOfConfigMapSize { + break + } else if resourceNotFound(err) { + continue + } + return err } + return nil } // create a new rule resource and add rule into it when all existing rule resources are full. newPromRule := promresourcesv1.PrometheusRule{ @@ -403,38 +415,52 @@ func (r *ThanosRuler) UpdateAlertingRule(ctx context.Context, ruleNamespace *cor } var ( - found bool - success bool - resourcesToDelRule = make(map[string]*ruleResource) + found bool + success bool + prsToDelRule = make(map[string]*promresourcesv1.PrometheusRule) ) - for _, prometheusRule := range prometheusRules { - resource := ruleResource(*prometheusRule) + for i, prometheusRule := range prometheusRules { if success { // If the update has been successful, delete the possible same rule in other resources - if ok, err := resource.deleteAlertingRule(rule.Alert); err != nil { + if err := r.doRuleResourceOperation(ctx, prometheusRule, func(pr *promresourcesv1.PrometheusRule) error { + resource := ruleResource(*pr) + if ok, err := resource.deleteAlertingRule(rule.Alert); err != nil { + return err + } else if ok { + if err = resource.commit(ctx, r.client); err != nil { + return err + } + } + return nil + }); err != nil && !resourceNotFound(err) { + return err + } + continue + } + + if err := r.doRuleResourceOperation(ctx, prometheusRule, func(pr *promresourcesv1.PrometheusRule) error { + resource := ruleResource(*pr) + if ok, err := resource.updateAlertingRule(group, rule); err != nil { return err } else if ok { if err = resource.commit(ctx, r.client); err != nil { return err } } - continue - } - if ok, err := resource.updateAlertingRule(group, rule); err != nil { - if err == errOutOfConfigMapSize { - // updating the rule in the resource will oversize the size limit, + return nil + }); err != nil { + if resourceNotFound(err) { + continue + } else if err == errOutOfConfigMapSize { + // updating the rule in the resource may oversize the size limit, // so delete it and then add the new rule to a new resource. - resourcesToDelRule[resource.Name] = &resource + prsToDelRule[prometheusRule.Name] = prometheusRules[i] found = true - } else { - return err + continue } - } else if ok { - if err = resource.commit(ctx, r.client); err != nil { - return err - } - found = true - success = true + return err } + found = true + success = true } if !found { @@ -442,18 +468,24 @@ func (r *ThanosRuler) UpdateAlertingRule(ctx context.Context, ruleNamespace *cor } if !success { - err := r.addAlertingRule(ctx, ruleNamespace, prometheusRules, resourcesToDelRule, group, rule, ruleResourceLabels) + err := r.addAlertingRule(ctx, ruleNamespace, prometheusRules, prsToDelRule, group, rule, ruleResourceLabels) if err != nil { return err } } - for _, resource := range resourcesToDelRule { - if ok, err := resource.deleteAlertingRule(rule.Alert); err != nil { - return err - } else if ok { - if err = resource.commit(ctx, r.client); err != nil { + for _, pr := range prsToDelRule { + if err := r.doRuleResourceOperation(ctx, pr, func(pr *promresourcesv1.PrometheusRule) error { + resource := ruleResource(*pr) + if ok, err := resource.deleteAlertingRule(rule.Alert); err != nil { return err + } else if ok { + if err = resource.commit(ctx, r.client); err != nil { + return err + } } + return nil + }); err != nil && !resourceNotFound(err) { + return err } } return nil @@ -467,15 +499,23 @@ func (r *ThanosRuler) DeleteAlertingRule(ctx context.Context, ruleNamespace *cor } var success bool for _, prometheusRule := range prometheusRules { - resource := ruleResource(*prometheusRule) - if ok, err := resource.deleteAlertingRule(name); err != nil { - return err - } else if ok { - if err = resource.commit(ctx, r.client); err != nil { + if err := r.doRuleResourceOperation(ctx, prometheusRule, func(pr *promresourcesv1.PrometheusRule) error { + resource := ruleResource(*pr) + if ok, err := resource.deleteAlertingRule(name); err != nil { return err + } else if ok { + if err = resource.commit(ctx, r.client); err != nil { + return err + } } - success = true + return nil + }); err != nil { + if resourceNotFound(err) { + continue + } + return err } + success = true } if !success { return v2alpha1.ErrAlertingRuleNotFound @@ -483,6 +523,20 @@ func (r *ThanosRuler) DeleteAlertingRule(ctx context.Context, ruleNamespace *cor return nil } +func (r *ThanosRuler) doRuleResourceOperation(ctx context.Context, pr *promresourcesv1.PrometheusRule, + operation func(pr *promresourcesv1.PrometheusRule) error) error { + key := pr.Namespace + "/" + pr.Name + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + r.locker.Lock(key) + defer r.locker.Unlock(key) + pr, err := r.client.MonitoringV1().PrometheusRules(pr.Namespace).Get(ctx, pr.Name, metav1.GetOptions{}) + if err != nil { + return err + } + return operation(pr) + }) +} + func ruleNamespaceSelected(r Ruler, ruleNamespace *corev1.Namespace) (bool, error) { rnSelector, err := r.RuleResourceNamespaceSelector() if err != nil { @@ -499,3 +553,13 @@ func ruleNamespaceSelected(r Ruler, ruleNamespace *corev1.Namespace) (bool, erro } return true, nil } + +func resourceNotFound(err error) bool { + switch e := err.(type) { + case *apierrors.StatusError: + if e.Status().Code == http.StatusNotFound { + return true + } + } + return false +} diff --git a/pkg/models/alerting/rules/utils.go b/pkg/models/alerting/rules/utils.go index b22f1ca9a..5ee59db8b 100644 --- a/pkg/models/alerting/rules/utils.go +++ b/pkg/models/alerting/rules/utils.go @@ -1,7 +1,6 @@ package rules import ( - "kubesphere.io/kubesphere/pkg/simple/client/alerting" "path/filepath" "sort" "strings" @@ -16,6 +15,7 @@ import ( "github.com/prometheus/prometheus/rules" "k8s.io/klog" "kubesphere.io/kubesphere/pkg/api/alerting/v2alpha1" + "kubesphere.io/kubesphere/pkg/simple/client/alerting" ) const ( @@ -25,6 +25,9 @@ const ( LabelKeyInternalRuleName = "__rule_name__" LabelKeyInternalRuleQuery = "__rule_query__" LabelKeyInternalRuleDuration = "__rule_duration__" + + LabelKeyThanosRulerReplica = "thanos_ruler_replica" + LabelKeyPrometheusReplica = "prometheus_replica" ) func FormatExpr(expr string) (string, error) { @@ -100,16 +103,21 @@ func GenEndpointRuleId(group string, epRule *alerting.AlertingRule, } duration := parseDurationSeconds(epRule.Duration) - var labelsMap map[string]string - if externalLabels == nil { - labelsMap = epRule.Labels - } else { - labelsMap = make(map[string]string) - extLabels := externalLabels() - for key, value := range epRule.Labels { - if v, ok := extLabels[key]; !(ok && value == v) { - labelsMap[key] = value - } + var extLabels map[string]string + if externalLabels != nil { + extLabels = externalLabels() + } + labelsMap := make(map[string]string) + for key, value := range epRule.Labels { + if key == LabelKeyPrometheusReplica || key == LabelKeyThanosRulerReplica { + continue + } + if extLabels == nil { + labelsMap[key] = value + continue + } + if v, ok := extLabels[key]; !(ok && value == v) { + labelsMap[key] = value } } @@ -148,11 +156,11 @@ func GetAlertingRulesStatus(ruleNamespace string, ruleChunk *ResourceRuleChunk, continue } - for _, epRule := range group.Rules { + for i, epRule := range group.Rules { if eid, err := GenEndpointRuleId(group.Name, epRule, extLabels); err != nil { return nil, errors.Wrap(err, ErrGenRuleId) } else { - idEpRules[eid] = epRule + idEpRules[eid] = group.Rules[i] nameIds[epRule.Name] = append(nameIds[epRule.Name], eid) } }