refactor monitoring (#1751)
Signed-off-by: huanggze <loganhuang@yunify.com>
This commit is contained in:
@@ -1,32 +1,41 @@
|
||||
package monitoring
|
||||
|
||||
type ClusterQuery struct {
|
||||
import "time"
|
||||
|
||||
const (
|
||||
StatusSuccess = "success"
|
||||
StatusError = "error"
|
||||
MetricTypeMatrix = "matrix"
|
||||
MetricTypeVector = "vector"
|
||||
)
|
||||
|
||||
type Metric struct {
|
||||
MetricName string `json:"metric_name,omitempty" description:"metric name, eg. scheduler_up_sum"`
|
||||
Status string `json:"status" description:"result status, one of error, success"`
|
||||
MetricData `json:"data" description:"actual metric result"`
|
||||
ErrorType string `json:"errorType,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
type ClusterMetrics struct {
|
||||
type MetricData struct {
|
||||
MetricType string `json:"resultType" description:"result type, one of matrix, vector"`
|
||||
MetricValues []MetricValue `json:"result" description:"metric data including labels, time series and values"`
|
||||
}
|
||||
|
||||
type WorkspaceQuery struct {
|
||||
type Point [2]float64
|
||||
|
||||
type MetricValue struct {
|
||||
Metadata map[string]string `json:"metric,omitempty" description:"time series labels"`
|
||||
Sample Point `json:"value,omitempty" description:"time series, values of vector type"`
|
||||
Series []Point `json:"values,omitempty" description:"time series, values of matrix type"`
|
||||
}
|
||||
|
||||
type WorkspaceMetrics struct {
|
||||
}
|
||||
|
||||
type NamespaceQuery struct {
|
||||
}
|
||||
|
||||
type NamespaceMetrics struct {
|
||||
}
|
||||
|
||||
// Interface defines all the abstract behaviors of monitoring
|
||||
type Interface interface {
|
||||
// The `stmts` defines statements, expressions or rules (eg. promql in Prometheus) for querying specific metrics.
|
||||
GetMetrics(stmts []string, time time.Time) ([]Metric, error)
|
||||
GetMetricsOverTime(stmts []string, start, end time.Time, step time.Duration) ([]Metric, error)
|
||||
|
||||
// Get
|
||||
GetClusterMetrics(query ClusterQuery) ClusterMetrics
|
||||
|
||||
//
|
||||
GetWorkspaceMetrics(query WorkspaceQuery) WorkspaceMetrics
|
||||
|
||||
//
|
||||
GetNamespaceMetrics(query NamespaceQuery) NamespaceMetrics
|
||||
// Get named metrics (eg. node_cpu_usage)
|
||||
GetNamedMetrics(time time.Time, opt QueryOption) ([]Metric, error)
|
||||
GetNamedMetricsOverTime(start, end time.Time, step time.Duration, opt QueryOption) ([]Metric, error)
|
||||
}
|
||||
|
||||
252
pkg/simple/client/monitoring/named_metrics.go
Normal file
252
pkg/simple/client/monitoring/named_metrics.go
Normal file
@@ -0,0 +1,252 @@
|
||||
package monitoring
|
||||
|
||||
type MonitoringLevel int
|
||||
|
||||
const (
|
||||
LevelCluster = MonitoringLevel(1) << iota
|
||||
LevelNode
|
||||
LevelWorkspace
|
||||
LevelNamespace
|
||||
LevelWorkload
|
||||
LevelPod
|
||||
LevelContainer
|
||||
LevelPVC
|
||||
LevelComponent
|
||||
)
|
||||
|
||||
var ClusterMetrics = []string{
|
||||
"cluster_cpu_utilisation",
|
||||
"cluster_cpu_usage",
|
||||
"cluster_cpu_total",
|
||||
"cluster_memory_utilisation",
|
||||
"cluster_memory_available",
|
||||
"cluster_memory_total",
|
||||
"cluster_memory_usage_wo_cache",
|
||||
"cluster_net_utilisation",
|
||||
"cluster_net_bytes_transmitted",
|
||||
"cluster_net_bytes_received",
|
||||
"cluster_disk_read_iops",
|
||||
"cluster_disk_write_iops",
|
||||
"cluster_disk_read_throughput",
|
||||
"cluster_disk_write_throughput",
|
||||
"cluster_disk_size_usage",
|
||||
"cluster_disk_size_utilisation",
|
||||
"cluster_disk_size_capacity",
|
||||
"cluster_disk_size_available",
|
||||
"cluster_disk_inode_total",
|
||||
"cluster_disk_inode_usage",
|
||||
"cluster_disk_inode_utilisation",
|
||||
"cluster_namespace_count",
|
||||
"cluster_pod_count",
|
||||
"cluster_pod_quota",
|
||||
"cluster_pod_utilisation",
|
||||
"cluster_pod_running_count",
|
||||
"cluster_pod_succeeded_count",
|
||||
"cluster_pod_abnormal_count",
|
||||
"cluster_node_online",
|
||||
"cluster_node_offline",
|
||||
"cluster_node_total",
|
||||
"cluster_cronjob_count",
|
||||
"cluster_pvc_count",
|
||||
"cluster_daemonset_count",
|
||||
"cluster_deployment_count",
|
||||
"cluster_endpoint_count",
|
||||
"cluster_hpa_count",
|
||||
"cluster_job_count",
|
||||
"cluster_statefulset_count",
|
||||
"cluster_replicaset_count",
|
||||
"cluster_service_count",
|
||||
"cluster_secret_count",
|
||||
"cluster_pv_count",
|
||||
"cluster_ingresses_extensions_count",
|
||||
"cluster_load1",
|
||||
"cluster_load5",
|
||||
"cluster_load15",
|
||||
"cluster_pod_abnormal_ratio",
|
||||
"cluster_node_offline_ratio",
|
||||
}
|
||||
|
||||
var NodeMetrics = []string{
|
||||
"node_cpu_utilisation",
|
||||
"node_cpu_total",
|
||||
"node_cpu_usage",
|
||||
"node_memory_utilisation",
|
||||
"node_memory_usage_wo_cache",
|
||||
"node_memory_available",
|
||||
"node_memory_total",
|
||||
"node_net_utilisation",
|
||||
"node_net_bytes_transmitted",
|
||||
"node_net_bytes_received",
|
||||
"node_disk_read_iops",
|
||||
"node_disk_write_iops",
|
||||
"node_disk_read_throughput",
|
||||
"node_disk_write_throughput",
|
||||
"node_disk_size_capacity",
|
||||
"node_disk_size_available",
|
||||
"node_disk_size_usage",
|
||||
"node_disk_size_utilisation",
|
||||
"node_disk_inode_total",
|
||||
"node_disk_inode_usage",
|
||||
"node_disk_inode_utilisation",
|
||||
"node_pod_count",
|
||||
"node_pod_quota",
|
||||
"node_pod_utilisation",
|
||||
"node_pod_running_count",
|
||||
"node_pod_succeeded_count",
|
||||
"node_pod_abnormal_count",
|
||||
"node_load1",
|
||||
"node_load5",
|
||||
"node_load15",
|
||||
"node_pod_abnormal_ratio",
|
||||
}
|
||||
|
||||
var WorkspaceMetrics = []string{
|
||||
"workspace_cpu_usage",
|
||||
"workspace_memory_usage",
|
||||
"workspace_memory_usage_wo_cache",
|
||||
"workspace_net_bytes_transmitted",
|
||||
"workspace_net_bytes_received",
|
||||
"workspace_pod_count",
|
||||
"workspace_pod_running_count",
|
||||
"workspace_pod_succeeded_count",
|
||||
"workspace_pod_abnormal_count",
|
||||
"workspace_ingresses_extensions_count",
|
||||
"workspace_cronjob_count",
|
||||
"workspace_pvc_count",
|
||||
"workspace_daemonset_count",
|
||||
"workspace_deployment_count",
|
||||
"workspace_endpoint_count",
|
||||
"workspace_hpa_count",
|
||||
"workspace_job_count",
|
||||
"workspace_statefulset_count",
|
||||
"workspace_replicaset_count",
|
||||
"workspace_service_count",
|
||||
"workspace_secret_count",
|
||||
"workspace_pod_abnormal_ratio",
|
||||
}
|
||||
|
||||
var NamespaceMetrics = []string{
|
||||
"namespace_cpu_usage",
|
||||
"namespace_memory_usage",
|
||||
"namespace_memory_usage_wo_cache",
|
||||
"namespace_net_bytes_transmitted",
|
||||
"namespace_net_bytes_received",
|
||||
"namespace_pod_count",
|
||||
"namespace_pod_running_count",
|
||||
"namespace_pod_succeeded_count",
|
||||
"namespace_pod_abnormal_count",
|
||||
"namespace_pod_abnormal_ratio",
|
||||
"namespace_memory_limit_hard",
|
||||
"namespace_cpu_limit_hard",
|
||||
"namespace_pod_count_hard",
|
||||
"namespace_cronjob_count",
|
||||
"namespace_pvc_count",
|
||||
"namespace_daemonset_count",
|
||||
"namespace_deployment_count",
|
||||
"namespace_endpoint_count",
|
||||
"namespace_hpa_count",
|
||||
"namespace_job_count",
|
||||
"namespace_statefulset_count",
|
||||
"namespace_replicaset_count",
|
||||
"namespace_service_count",
|
||||
"namespace_secret_count",
|
||||
"namespace_configmap_count",
|
||||
"namespace_ingresses_extensions_count",
|
||||
"namespace_s2ibuilder_count",
|
||||
}
|
||||
|
||||
var WorkloadMetrics = []string{
|
||||
"workload_cpu_usage",
|
||||
"workload_memory_usage",
|
||||
"workload_memory_usage_wo_cache",
|
||||
"workload_net_bytes_transmitted",
|
||||
"workload_net_bytes_received",
|
||||
|
||||
"workload_deployment_replica",
|
||||
"workload_deployment_replica_available",
|
||||
"workload_statefulset_replica",
|
||||
"workload_statefulset_replica_available",
|
||||
"workload_daemonset_replica",
|
||||
"workload_daemonset_replica_available",
|
||||
"workload_deployment_unavailable_replicas_ratio",
|
||||
"workload_daemonset_unavailable_replicas_ratio",
|
||||
"workload_statefulset_unavailable_replicas_ratio",
|
||||
}
|
||||
|
||||
var PodMetrics = []string{
|
||||
"pod_cpu_usage",
|
||||
"pod_memory_usage",
|
||||
"pod_memory_usage_wo_cache",
|
||||
"pod_net_bytes_transmitted",
|
||||
"pod_net_bytes_received",
|
||||
}
|
||||
|
||||
var ContainerMetrics = []string{
|
||||
"container_cpu_usage",
|
||||
"container_memory_usage",
|
||||
"container_memory_usage_wo_cache",
|
||||
}
|
||||
|
||||
var PVCMetrics = []string{
|
||||
"pvc_inodes_available",
|
||||
"pvc_inodes_used",
|
||||
"pvc_inodes_total",
|
||||
"pvc_inodes_utilisation",
|
||||
"pvc_bytes_available",
|
||||
"pvc_bytes_used",
|
||||
"pvc_bytes_total",
|
||||
"pvc_bytes_utilisation",
|
||||
}
|
||||
|
||||
var ComponentMetrics = []string{
|
||||
"etcd_server_list",
|
||||
"etcd_server_total",
|
||||
"etcd_server_up_total",
|
||||
"etcd_server_has_leader",
|
||||
"etcd_server_leader_changes",
|
||||
"etcd_server_proposals_failed_rate",
|
||||
"etcd_server_proposals_applied_rate",
|
||||
"etcd_server_proposals_committed_rate",
|
||||
"etcd_server_proposals_pending_count",
|
||||
"etcd_mvcc_db_size",
|
||||
"etcd_network_client_grpc_received_bytes",
|
||||
"etcd_network_client_grpc_sent_bytes",
|
||||
"etcd_grpc_call_rate",
|
||||
"etcd_grpc_call_failed_rate",
|
||||
"etcd_grpc_server_msg_received_rate",
|
||||
"etcd_grpc_server_msg_sent_rate",
|
||||
"etcd_disk_wal_fsync_duration",
|
||||
"etcd_disk_wal_fsync_duration_quantile",
|
||||
"etcd_disk_backend_commit_duration",
|
||||
"etcd_disk_backend_commit_duration_quantile",
|
||||
|
||||
"apiserver_up_sum",
|
||||
"apiserver_request_rate",
|
||||
"apiserver_request_by_verb_rate",
|
||||
"apiserver_request_latencies",
|
||||
"apiserver_request_by_verb_latencies",
|
||||
|
||||
"scheduler_up_sum",
|
||||
"scheduler_schedule_attempts",
|
||||
"scheduler_schedule_attempt_rate",
|
||||
"scheduler_e2e_scheduling_latency",
|
||||
"scheduler_e2e_scheduling_latency_quantile",
|
||||
|
||||
"controller_manager_up_sum",
|
||||
|
||||
"coredns_up_sum",
|
||||
"coredns_cache_hits",
|
||||
"coredns_cache_misses",
|
||||
"coredns_dns_request_rate",
|
||||
"coredns_dns_request_duration",
|
||||
"coredns_dns_request_duration_quantile",
|
||||
"coredns_dns_request_by_type_rate",
|
||||
"coredns_dns_request_by_rcode_rate",
|
||||
"coredns_panic_rate",
|
||||
"coredns_proxy_request_rate",
|
||||
"coredns_proxy_request_duration",
|
||||
"coredns_proxy_request_duration_quantile",
|
||||
|
||||
"prometheus_up_sum",
|
||||
"prometheus_tsdb_head_samples_appended_rate",
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
package monitoring
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// prometheus implements monitoring interface backed by Prometheus
|
||||
type prometheus struct {
|
||||
options *Options
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewPrometheus(options *Options) Interface {
|
||||
return &prometheus{
|
||||
options: options,
|
||||
client: &http.Client{Timeout: 10 * time.Second},
|
||||
}
|
||||
}
|
||||
|
||||
func (p prometheus) GetClusterMetrics(query ClusterQuery) ClusterMetrics {
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (p prometheus) GetWorkspaceMetrics(query WorkspaceQuery) WorkspaceMetrics {
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (p prometheus) GetNamespaceMetrics(query NamespaceQuery) NamespaceMetrics {
|
||||
panic("implement me")
|
||||
}
|
||||
178
pkg/simple/client/monitoring/prometheus/prometheus.go
Normal file
178
pkg/simple/client/monitoring/prometheus/prometheus.go
Normal file
@@ -0,0 +1,178 @@
|
||||
package prometheus
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/json-iterator/go"
|
||||
"io/ioutil"
|
||||
"kubesphere.io/kubesphere/pkg/simple/client/monitoring"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
var json = jsoniter.ConfigCompatibleWithStandardLibrary
|
||||
|
||||
// prometheus implements monitoring interface backed by Prometheus
|
||||
type prometheus struct {
|
||||
options *Options
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewPrometheus(options *Options) monitoring.Interface {
|
||||
return &prometheus{
|
||||
options: options,
|
||||
client: &http.Client{Timeout: 10 * time.Second},
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(huanggze): reserve for custom monitoring
|
||||
func (p *prometheus) GetMetrics(stmts []string, time time.Time) ([]monitoring.Metric, error) {
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
// TODO(huanggze): reserve for custom monitoring
|
||||
func (p *prometheus) GetMetricsOverTime(stmts []string, start, end time.Time, step time.Duration) ([]monitoring.Metric, error) {
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (p *prometheus) GetNamedMetrics(ts time.Time, o monitoring.QueryOption) ([]monitoring.Metric, error) {
|
||||
metrics := make([]monitoring.Metric, 0)
|
||||
var mtx sync.Mutex // guard metrics
|
||||
var wg sync.WaitGroup
|
||||
|
||||
opts := monitoring.NewQueryOptions()
|
||||
o.Apply(opts)
|
||||
|
||||
errCh := make(chan error)
|
||||
for _, metric := range opts.NamedMetrics {
|
||||
matched, _ := regexp.MatchString(opts.MetricFilter, metric)
|
||||
if matched {
|
||||
exp := makeExpression(metric, *opts)
|
||||
wg.Add(1)
|
||||
go func(metric, exp string) {
|
||||
res, err := p.query(exp, ts)
|
||||
if err != nil {
|
||||
select {
|
||||
case errCh <- err: // Record error once
|
||||
default:
|
||||
}
|
||||
} else {
|
||||
res.MetricName = metric // Add metric name
|
||||
mtx.Lock()
|
||||
metrics = append(metrics, res)
|
||||
mtx.Unlock()
|
||||
}
|
||||
wg.Done()
|
||||
}(metric, exp)
|
||||
}
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
select {
|
||||
case err := <-errCh:
|
||||
return nil, err
|
||||
default:
|
||||
return metrics, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (p *prometheus) GetNamedMetricsOverTime(start, end time.Time, step time.Duration, o monitoring.QueryOption) ([]monitoring.Metric, error) {
|
||||
metrics := make([]monitoring.Metric, 0)
|
||||
var mtx sync.Mutex // guard metrics
|
||||
var wg sync.WaitGroup
|
||||
|
||||
opts := monitoring.NewQueryOptions()
|
||||
o.Apply(opts)
|
||||
|
||||
errCh := make(chan error)
|
||||
for _, metric := range opts.NamedMetrics {
|
||||
matched, _ := regexp.MatchString(opts.MetricFilter, metric)
|
||||
if matched {
|
||||
exp := makeExpression(metric, *opts)
|
||||
wg.Add(1)
|
||||
go func(metric, exp string) {
|
||||
res, err := p.rangeQuery(exp, start, end, step)
|
||||
if err != nil {
|
||||
select {
|
||||
case errCh <- err: // Record error once
|
||||
default:
|
||||
}
|
||||
} else {
|
||||
res.MetricName = metric // Add metric name
|
||||
mtx.Lock()
|
||||
metrics = append(metrics, res)
|
||||
mtx.Unlock()
|
||||
}
|
||||
wg.Done()
|
||||
}(metric, exp)
|
||||
}
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
select {
|
||||
case err := <-errCh:
|
||||
return nil, err
|
||||
default:
|
||||
return metrics, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (p prometheus) query(exp string, ts time.Time) (monitoring.Metric, error) {
|
||||
params := &url.Values{}
|
||||
params.Set("time", ts.Format(time.RFC3339))
|
||||
params.Set("query", exp)
|
||||
|
||||
u := fmt.Sprintf("%s/api/v1/query?%s", p.options.Endpoint, params.Encode())
|
||||
|
||||
var m monitoring.Metric
|
||||
response, err := p.client.Get(u)
|
||||
if err != nil {
|
||||
return monitoring.Metric{}, err
|
||||
}
|
||||
|
||||
body, err := ioutil.ReadAll(response.Body)
|
||||
if err != nil {
|
||||
return monitoring.Metric{}, err
|
||||
}
|
||||
defer response.Body.Close()
|
||||
|
||||
err = json.Unmarshal(body, m)
|
||||
if err != nil {
|
||||
return monitoring.Metric{}, err
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (p prometheus) rangeQuery(exp string, start, end time.Time, step time.Duration) (monitoring.Metric, error) {
|
||||
params := &url.Values{}
|
||||
params.Set("start", start.Format(time.RFC3339))
|
||||
params.Set("end", end.Format(time.RFC3339))
|
||||
params.Set("step", step.String())
|
||||
params.Set("query", exp)
|
||||
|
||||
u := fmt.Sprintf("%s/api/v1/query?%s", p.options.Endpoint, params.Encode())
|
||||
|
||||
var m monitoring.Metric
|
||||
response, err := p.client.Get(u)
|
||||
if err != nil {
|
||||
return monitoring.Metric{}, err
|
||||
}
|
||||
|
||||
body, err := ioutil.ReadAll(response.Body)
|
||||
if err != nil {
|
||||
return monitoring.Metric{}, err
|
||||
}
|
||||
defer response.Body.Close()
|
||||
|
||||
err = json.Unmarshal(body, m)
|
||||
if err != nil {
|
||||
return monitoring.Metric{}, err
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package monitoring
|
||||
package prometheus
|
||||
|
||||
import (
|
||||
"github.com/spf13/pflag"
|
||||
415
pkg/simple/client/monitoring/prometheus/promql_templates.go
Normal file
415
pkg/simple/client/monitoring/prometheus/promql_templates.go
Normal file
@@ -0,0 +1,415 @@
|
||||
/*
|
||||
Copyright 2019 The KubeSphere Authors.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package prometheus
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"kubesphere.io/kubesphere/pkg/simple/client/monitoring"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
StatefulSet = "StatefulSet"
|
||||
DaemonSet = "DaemonSet"
|
||||
Deployment = "Deployment"
|
||||
)
|
||||
|
||||
//TODO(huanggze): move this part to a ConfigMap
|
||||
var promQLTemplates = map[string]string{
|
||||
//cluster
|
||||
"cluster_cpu_utilisation": ":node_cpu_utilisation:avg1m",
|
||||
"cluster_cpu_usage": `round(:node_cpu_utilisation:avg1m * sum(node:node_num_cpu:sum), 0.001)`,
|
||||
"cluster_cpu_total": "sum(node:node_num_cpu:sum)",
|
||||
"cluster_memory_utilisation": ":node_memory_utilisation:",
|
||||
"cluster_memory_available": "sum(node:node_memory_bytes_available:sum)",
|
||||
"cluster_memory_total": "sum(node:node_memory_bytes_total:sum)",
|
||||
"cluster_memory_usage_wo_cache": "sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)",
|
||||
"cluster_net_utilisation": ":node_net_utilisation:sum_irate",
|
||||
"cluster_net_bytes_transmitted": "sum(node:node_net_bytes_transmitted:sum_irate)",
|
||||
"cluster_net_bytes_received": "sum(node:node_net_bytes_received:sum_irate)",
|
||||
"cluster_disk_read_iops": "sum(node:data_volume_iops_reads:sum)",
|
||||
"cluster_disk_write_iops": "sum(node:data_volume_iops_writes:sum)",
|
||||
"cluster_disk_read_throughput": "sum(node:data_volume_throughput_bytes_read:sum)",
|
||||
"cluster_disk_write_throughput": "sum(node:data_volume_throughput_bytes_written:sum)",
|
||||
"cluster_disk_size_usage": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
|
||||
"cluster_disk_size_utilisation": `cluster:disk_utilization:ratio`,
|
||||
"cluster_disk_size_capacity": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
|
||||
"cluster_disk_size_available": `sum(max(node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
|
||||
"cluster_disk_inode_total": `sum(node:node_inodes_total:)`,
|
||||
"cluster_disk_inode_usage": `sum(node:node_inodes_total:) - sum(node:node_inodes_free:)`,
|
||||
"cluster_disk_inode_utilisation": `cluster:disk_inode_utilization:ratio`,
|
||||
"cluster_namespace_count": `count(kube_namespace_labels)`,
|
||||
"cluster_pod_count": `cluster:pod:sum`,
|
||||
"cluster_pod_quota": `sum(max(kube_node_status_capacity_pods) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`,
|
||||
"cluster_pod_utilisation": `cluster:pod_utilization:ratio`,
|
||||
"cluster_pod_running_count": `cluster:pod_running:count`,
|
||||
"cluster_pod_succeeded_count": `count(kube_pod_info unless on (pod) (kube_pod_status_phase{phase=~"Failed|Pending|Unknown|Running"} > 0) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`,
|
||||
"cluster_pod_abnormal_count": `cluster:pod_abnormal:sum`,
|
||||
"cluster_node_online": `sum(kube_node_status_condition{condition="Ready",status="true"})`,
|
||||
"cluster_node_offline": `cluster:node_offline:sum`,
|
||||
"cluster_node_total": `sum(kube_node_status_condition{condition="Ready"})`,
|
||||
"cluster_cronjob_count": `sum(kube_cronjob_labels)`,
|
||||
"cluster_pvc_count": `sum(kube_persistentvolumeclaim_info)`,
|
||||
"cluster_daemonset_count": `sum(kube_daemonset_labels)`,
|
||||
"cluster_deployment_count": `sum(kube_deployment_labels)`,
|
||||
"cluster_endpoint_count": `sum(kube_endpoint_labels)`,
|
||||
"cluster_hpa_count": `sum(kube_hpa_labels)`,
|
||||
"cluster_job_count": `sum(kube_job_labels)`,
|
||||
"cluster_statefulset_count": `sum(kube_statefulset_labels)`,
|
||||
"cluster_replicaset_count": `count(kube_replicaset_labels)`,
|
||||
"cluster_service_count": `sum(kube_service_info)`,
|
||||
"cluster_secret_count": `sum(kube_secret_info)`,
|
||||
"cluster_pv_count": `sum(kube_persistentvolume_labels)`,
|
||||
"cluster_ingresses_extensions_count": `sum(kube_ingress_labels)`,
|
||||
"cluster_load1": `sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
|
||||
"cluster_load5": `sum(node_load5{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
|
||||
"cluster_load15": `sum(node_load15{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
|
||||
"cluster_pod_abnormal_ratio": `cluster:pod_abnormal:ratio`,
|
||||
"cluster_node_offline_ratio": `cluster:node_offline:ratio`,
|
||||
|
||||
//node
|
||||
"node_cpu_utilisation": "node:node_cpu_utilisation:avg1m{$1}",
|
||||
"node_cpu_total": "node:node_num_cpu:sum{$1}",
|
||||
"node_memory_utilisation": "node:node_memory_utilisation:{$1}",
|
||||
"node_memory_available": "node:node_memory_bytes_available:sum{$1}",
|
||||
"node_memory_total": "node:node_memory_bytes_total:sum{$1}",
|
||||
"node_memory_usage_wo_cache": "node:node_memory_bytes_total:sum{$1} - node:node_memory_bytes_available:sum{$1}",
|
||||
"node_net_utilisation": "node:node_net_utilisation:sum_irate{$1}",
|
||||
"node_net_bytes_transmitted": "node:node_net_bytes_transmitted:sum_irate{$1}",
|
||||
"node_net_bytes_received": "node:node_net_bytes_received:sum_irate{$1}",
|
||||
"node_disk_read_iops": "node:data_volume_iops_reads:sum{$1}",
|
||||
"node_disk_write_iops": "node:data_volume_iops_writes:sum{$1}",
|
||||
"node_disk_read_throughput": "node:data_volume_throughput_bytes_read:sum{$1}",
|
||||
"node_disk_write_throughput": "node:data_volume_throughput_bytes_written:sum{$1}",
|
||||
"node_disk_size_capacity": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{$1}) by (device, node)) by (node)`,
|
||||
"node_disk_size_available": `node:disk_space_available:{$1}`,
|
||||
"node_disk_size_usage": `sum(max((node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{$1}) by (device, node)) by (node)`,
|
||||
"node_disk_size_utilisation": `node:disk_space_utilization:ratio{$1}`,
|
||||
"node_disk_inode_total": `node:node_inodes_total:{$1}`,
|
||||
"node_disk_inode_usage": `node:node_inodes_total:{$1} - node:node_inodes_free:{$1}`,
|
||||
"node_disk_inode_utilisation": `node:disk_inode_utilization:ratio{$1}`,
|
||||
"node_pod_count": `node:pod_count:sum{$1}`,
|
||||
"node_pod_quota": `max(kube_node_status_capacity_pods{$1}) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0)`,
|
||||
"node_pod_utilisation": `node:pod_utilization:ratio{$1}`,
|
||||
"node_pod_running_count": `node:pod_running:count{$1}`,
|
||||
"node_pod_succeeded_count": `node:pod_succeeded:count{$1}`,
|
||||
"node_pod_abnormal_count": `node:pod_abnormal:count{$1}`,
|
||||
"node_cpu_usage": `round(node:node_cpu_utilisation:avg1m{$1} * node:node_num_cpu:sum{$1}, 0.001)`,
|
||||
"node_load1": `node:load1:ratio{$1}`,
|
||||
"node_load5": `node:load5:ratio{$1}`,
|
||||
"node_load15": `node:load15:ratio{$1}`,
|
||||
"node_pod_abnormal_ratio": `node:pod_abnormal:ratio{$1}`,
|
||||
|
||||
// workspace
|
||||
"workspace_cpu_usage": `round(sum by (label_kubesphere_io_workspace) (namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}), 0.001)`,
|
||||
"workspace_memory_usage": `sum by (label_kubesphere_io_workspace) (namespace:container_memory_usage_bytes:sum{namespace!="", $1})`,
|
||||
"workspace_memory_usage_wo_cache": `sum by (label_kubesphere_io_workspace) (namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1})`,
|
||||
"workspace_net_bytes_transmitted": `sum by (label_kubesphere_io_workspace) (sum by (namespace) (irate(container_network_transmit_bytes_total{namespace!="", pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"workspace_net_bytes_received": `sum by (label_kubesphere_io_workspace) (sum by (namespace) (irate(container_network_receive_bytes_total{namespace!="", pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"workspace_pod_count": `sum by (label_kubesphere_io_workspace) (kube_pod_status_phase{phase!~"Failed|Succeeded", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_pod_running_count": `sum by (label_kubesphere_io_workspace) (kube_pod_status_phase{phase="Running", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_pod_succeeded_count": `sum by (label_kubesphere_io_workspace) (kube_pod_status_phase{phase="Succeeded", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_pod_abnormal_count": `count by (label_kubesphere_io_workspace) ((kube_pod_info{node!=""} unless on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Succeeded"}>0) unless on (pod, namespace) ((kube_pod_status_ready{job="kube-state-metrics", condition="true"}>0) and on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Running"}>0)) unless on (pod, namespace) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", reason="ContainerCreating"}>0)) * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_ingresses_extensions_count": `sum by (label_kubesphere_io_workspace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_cronjob_count": `sum by (label_kubesphere_io_workspace) (kube_cronjob_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_pvc_count": `sum by (label_kubesphere_io_workspace) (kube_persistentvolumeclaim_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_daemonset_count": `sum by (label_kubesphere_io_workspace) (kube_daemonset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_deployment_count": `sum by (label_kubesphere_io_workspace) (kube_deployment_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_endpoint_count": `sum by (label_kubesphere_io_workspace) (kube_endpoint_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_hpa_count": `sum by (label_kubesphere_io_workspace) (kube_hpa_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_job_count": `sum by (label_kubesphere_io_workspace) (kube_job_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_statefulset_count": `sum by (label_kubesphere_io_workspace) (kube_statefulset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_replicaset_count": `count by (label_kubesphere_io_workspace) (kube_replicaset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_service_count": `sum by (label_kubesphere_io_workspace) (kube_service_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_secret_count": `sum by (label_kubesphere_io_workspace) (kube_secret_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
"workspace_pod_abnormal_ratio": `count by (label_kubesphere_io_workspace) ((kube_pod_info{node!=""} unless on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Succeeded"}>0) unless on (pod, namespace) ((kube_pod_status_ready{job="kube-state-metrics", condition="true"}>0) and on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Running"}>0)) unless on (pod, namespace) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", reason="ContainerCreating"}>0)) * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1}) / sum by (label_kubesphere_io_workspace) (kube_pod_status_phase{phase!="Succeeded", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
|
||||
|
||||
//namespace
|
||||
"namespace_cpu_usage": `round(namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}, 0.001)`,
|
||||
"namespace_memory_usage": `namespace:container_memory_usage_bytes:sum{namespace!="", $1}`,
|
||||
"namespace_memory_usage_wo_cache": `namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1}`,
|
||||
"namespace_net_bytes_transmitted": `sum by (namespace) (irate(container_network_transmit_bytes_total{namespace!="", pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m]) * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_net_bytes_received": `sum by (namespace) (irate(container_network_receive_bytes_total{namespace!="", pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m]) * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_pod_count": `sum by (namespace) (kube_pod_status_phase{phase!~"Failed|Succeeded", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_pod_running_count": `sum by (namespace) (kube_pod_status_phase{phase="Running", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_pod_succeeded_count": `sum by (namespace) (kube_pod_status_phase{phase="Succeeded", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_pod_abnormal_count": `namespace:pod_abnormal:count{namespace!="", $1}`,
|
||||
"namespace_pod_abnormal_ratio": `namespace:pod_abnormal:ratio{namespace!="", $1}`,
|
||||
"namespace_memory_limit_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="limits.memory"} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_cpu_limit_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="limits.cpu"} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_pod_count_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="count/pods"} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_cronjob_count": `sum by (namespace) (kube_cronjob_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_pvc_count": `sum by (namespace) (kube_persistentvolumeclaim_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_daemonset_count": `sum by (namespace) (kube_daemonset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_deployment_count": `sum by (namespace) (kube_deployment_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_endpoint_count": `sum by (namespace) (kube_endpoint_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_hpa_count": `sum by (namespace) (kube_hpa_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_job_count": `sum by (namespace) (kube_job_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_statefulset_count": `sum by (namespace) (kube_statefulset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_replicaset_count": `count by (namespace) (kube_replicaset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_service_count": `sum by (namespace) (kube_service_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_secret_count": `sum by (namespace) (kube_secret_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_configmap_count": `sum by (namespace) (kube_configmap_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_ingresses_extensions_count": `sum by (namespace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
"namespace_s2ibuilder_count": `sum by (namespace) (s2i_s2ibuilder_created{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
|
||||
|
||||
// workload
|
||||
"workload_cpu_usage": `round(namespace:workload_cpu_usage:sum{$1}, 0.001)`,
|
||||
"workload_memory_usage": `namespace:workload_memory_usage:sum{$1}`,
|
||||
"workload_memory_usage_wo_cache": `namespace:workload_memory_usage_wo_cache:sum{$1}`,
|
||||
"workload_net_bytes_transmitted": `namespace:workload_net_bytes_transmitted:sum_irate{$1}`,
|
||||
"workload_net_bytes_received": `namespace:workload_net_bytes_received:sum_irate{$1}`,
|
||||
|
||||
"workload_deployment_replica": `label_join(sum (label_join(label_replace(kube_deployment_spec_replicas{$2}, "owner_kind", "Deployment", "", ""), "workload", "", "deployment")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
|
||||
"workload_deployment_replica_available": `label_join(sum (label_join(label_replace(kube_deployment_status_replicas_available{$2}, "owner_kind", "Deployment", "", ""), "workload", "", "deployment")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
|
||||
"workload_statefulset_replica": `label_join(sum (label_join(label_replace(kube_statefulset_replicas{$2}, "owner_kind", "StatefulSet", "", ""), "workload", "", "statefulset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
|
||||
"workload_statefulset_replica_available": `label_join(sum (label_join(label_replace(kube_statefulset_status_replicas_current{$2}, "owner_kind", "StatefulSet", "", ""), "workload", "", "statefulset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
|
||||
"workload_daemonset_replica": `label_join(sum (label_join(label_replace(kube_daemonset_status_desired_number_scheduled{$2}, "owner_kind", "DaemonSet", "", ""), "workload", "", "daemonset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
|
||||
"workload_daemonset_replica_available": `label_join(sum (label_join(label_replace(kube_daemonset_status_number_available{$2}, "owner_kind", "DaemonSet", "", ""), "workload", "", "daemonset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
|
||||
"workload_deployment_unavailable_replicas_ratio": `namespace:deployment_unavailable_replicas:ratio{$1}`,
|
||||
"workload_daemonset_unavailable_replicas_ratio": `namespace:daemonset_unavailable_replicas:ratio{$1}`,
|
||||
"workload_statefulset_unavailable_replicas_ratio": `namespace:statefulset_unavailable_replicas:ratio{$1}`,
|
||||
|
||||
// pod
|
||||
"pod_cpu_usage": `round(label_join(sum by (namespace, pod_name) (irate(container_cpu_usage_seconds_total{job="kubelet", pod_name!="", image!=""}[5m])), "pod", "", "pod_name") * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.001)`,
|
||||
"pod_memory_usage": `label_join(sum by (namespace, pod_name) (container_memory_usage_bytes{job="kubelet", pod_name!="", image!=""}), "pod", "", "pod_name") * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
|
||||
"pod_memory_usage_wo_cache": `label_join(sum by (namespace, pod_name) (container_memory_working_set_bytes{job="kubelet", pod_name!="", image!=""}), "pod", "", "pod_name") * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
|
||||
"pod_net_bytes_transmitted": `label_join(sum by (namespace, pod_name) (irate(container_network_transmit_bytes_total{pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])), "pod", "", "pod_name") * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
|
||||
"pod_net_bytes_received": `label_join(sum by (namespace, pod_name) (irate(container_network_receive_bytes_total{pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])), "pod", "", "pod_name") * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
|
||||
|
||||
// container
|
||||
"container_cpu_usage": `round(sum by (namespace, pod_name, container_name) (irate(container_cpu_usage_seconds_total{job="kubelet", container_name!="POD", container_name!="", image!="", $1}[5m])), 0.001)`,
|
||||
"container_memory_usage": `sum by (namespace, pod_name, container_name) (container_memory_usage_bytes{job="kubelet", container_name!="POD", container_name!="", image!="", $1})`,
|
||||
"container_memory_usage_wo_cache": `sum by (namespace, pod_name, container_name) (container_memory_working_set_bytes{job="kubelet", container_name!="POD", container_name!="", image!="", $1})`,
|
||||
|
||||
// pvc
|
||||
"pvc_inodes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_free) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
||||
"pvc_inodes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
||||
"pvc_inodes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
||||
"pvc_inodes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used / kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
||||
"pvc_bytes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_available_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
||||
"pvc_bytes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
||||
"pvc_bytes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
||||
"pvc_bytes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
||||
|
||||
// component
|
||||
"etcd_server_list": `label_replace(up{job="etcd"}, "node_ip", "$1", "instance", "(.*):.*")`,
|
||||
"etcd_server_total": `count(up{job="etcd"})`,
|
||||
"etcd_server_up_total": `etcd:up:sum`,
|
||||
"etcd_server_has_leader": `label_replace(etcd_server_has_leader, "node_ip", "$1", "instance", "(.*):.*")`,
|
||||
"etcd_server_leader_changes": `label_replace(etcd:etcd_server_leader_changes_seen:sum_changes, "node_ip", "$1", "node", "(.*)")`,
|
||||
"etcd_server_proposals_failed_rate": `avg(etcd:etcd_server_proposals_failed:sum_irate)`,
|
||||
"etcd_server_proposals_applied_rate": `avg(etcd:etcd_server_proposals_applied:sum_irate)`,
|
||||
"etcd_server_proposals_committed_rate": `avg(etcd:etcd_server_proposals_committed:sum_irate)`,
|
||||
"etcd_server_proposals_pending_count": `avg(etcd:etcd_server_proposals_pending:sum)`,
|
||||
"etcd_mvcc_db_size": `avg(etcd:etcd_debugging_mvcc_db_total_size:sum)`,
|
||||
"etcd_network_client_grpc_received_bytes": `sum(etcd:etcd_network_client_grpc_received_bytes:sum_irate)`,
|
||||
"etcd_network_client_grpc_sent_bytes": `sum(etcd:etcd_network_client_grpc_sent_bytes:sum_irate)`,
|
||||
"etcd_grpc_call_rate": `sum(etcd:grpc_server_started:sum_irate)`,
|
||||
"etcd_grpc_call_failed_rate": `sum(etcd:grpc_server_handled:sum_irate)`,
|
||||
"etcd_grpc_server_msg_received_rate": `sum(etcd:grpc_server_msg_received:sum_irate)`,
|
||||
"etcd_grpc_server_msg_sent_rate": `sum(etcd:grpc_server_msg_sent:sum_irate)`,
|
||||
"etcd_disk_wal_fsync_duration": `avg(etcd:etcd_disk_wal_fsync_duration:avg)`,
|
||||
"etcd_disk_wal_fsync_duration_quantile": `avg(etcd:etcd_disk_wal_fsync_duration:histogram_quantile) by (quantile)`,
|
||||
"etcd_disk_backend_commit_duration": `avg(etcd:etcd_disk_backend_commit_duration:avg)`,
|
||||
"etcd_disk_backend_commit_duration_quantile": `avg(etcd:etcd_disk_backend_commit_duration:histogram_quantile) by (quantile)`,
|
||||
|
||||
"apiserver_up_sum": `apiserver:up:sum`,
|
||||
"apiserver_request_rate": `apiserver:apiserver_request_count:sum_irate`,
|
||||
"apiserver_request_by_verb_rate": `apiserver:apiserver_request_count:sum_verb_irate`,
|
||||
"apiserver_request_latencies": `apiserver:apiserver_request_latencies:avg`,
|
||||
"apiserver_request_by_verb_latencies": `apiserver:apiserver_request_latencies:avg_by_verb`,
|
||||
|
||||
"scheduler_up_sum": `scheduler:up:sum`,
|
||||
"scheduler_schedule_attempts": `scheduler:scheduler_schedule_attempts:sum`,
|
||||
"scheduler_schedule_attempt_rate": `scheduler:scheduler_schedule_attempts:sum_rate`,
|
||||
"scheduler_e2e_scheduling_latency": `scheduler:scheduler_e2e_scheduling_latency:avg`,
|
||||
"scheduler_e2e_scheduling_latency_quantile": `scheduler:scheduler_e2e_scheduling_latency:histogram_quantile`,
|
||||
|
||||
"controller_manager_up_sum": `controller_manager:up:sum`,
|
||||
|
||||
"coredns_up_sum": `coredns:up:sum`,
|
||||
"coredns_cache_hits": `coredns:coredns_cache_hits_total:sum_irate`,
|
||||
"coredns_cache_misses": `coredns:coredns_cache_misses:sum_irate`,
|
||||
"coredns_dns_request_rate": `coredns:coredns_dns_request_count:sum_irate`,
|
||||
"coredns_dns_request_duration": `coredns:coredns_dns_request_duration:avg`,
|
||||
"coredns_dns_request_duration_quantile": `coredns:coredns_dns_request_duration:histogram_quantile`,
|
||||
"coredns_dns_request_by_type_rate": `coredns:coredns_dns_request_type_count:sum_irate`,
|
||||
"coredns_dns_request_by_rcode_rate": `coredns:coredns_dns_response_rcode_count:sum_irate`,
|
||||
"coredns_panic_rate": `coredns:coredns_panic_count:sum_irate`,
|
||||
"coredns_proxy_request_rate": `coredns:coredns_proxy_request_count:sum_irate`,
|
||||
"coredns_proxy_request_duration": `coredns:coredns_proxy_request_duration:avg`,
|
||||
"coredns_proxy_request_duration_quantile": `coredns:coredns_proxy_request_duration:histogram_quantile`,
|
||||
|
||||
"prometheus_up_sum": `prometheus:up:sum`,
|
||||
"prometheus_tsdb_head_samples_appended_rate": `prometheus:prometheus_tsdb_head_samples_appended:sum_rate`,
|
||||
}
|
||||
|
||||
func makeExpression(metric string, opt monitoring.QueryOptions) string {
|
||||
tmpl := promQLTemplates[metric]
|
||||
switch opt.Level {
|
||||
case monitoring.LevelCluster:
|
||||
case monitoring.LevelNode:
|
||||
makeNodeMetricExpression(tmpl, opt)
|
||||
case monitoring.LevelWorkspace:
|
||||
makeWorkspaceMetricExpression(tmpl, opt)
|
||||
case monitoring.LevelNamespace:
|
||||
makeNamespaceMetricExpression(tmpl, opt)
|
||||
case monitoring.LevelWorkload:
|
||||
makeWorkloadMetricExpression(tmpl, opt)
|
||||
case monitoring.LevelPod:
|
||||
makePodMetricExpression(tmpl, opt)
|
||||
case monitoring.LevelContainer:
|
||||
makeContainerMetricExpression(tmpl, opt)
|
||||
case monitoring.LevelPVC:
|
||||
makePVCMetricExpression(tmpl, opt)
|
||||
case monitoring.LevelComponent:
|
||||
default:
|
||||
}
|
||||
return tmpl
|
||||
}
|
||||
|
||||
func makeNodeMetricExpression(tmpl string, o monitoring.QueryOptions) string {
|
||||
var nodeSelector string
|
||||
if o.NodeName != "" {
|
||||
nodeSelector = fmt.Sprintf(`node="%s"`, o.NodeName)
|
||||
} else {
|
||||
nodeSelector = fmt.Sprintf(`node=~"%s"`, o.ResourceFilter)
|
||||
}
|
||||
return strings.Replace(tmpl, "$1", nodeSelector, -1)
|
||||
}
|
||||
|
||||
func makeWorkspaceMetricExpression(tmpl string, o monitoring.QueryOptions) string {
|
||||
var workspaceSelector string
|
||||
if o.WorkspaceName != "" {
|
||||
workspaceSelector = fmt.Sprintf(`label_kubesphere_io_workspace="%s"`, o.WorkspaceName)
|
||||
} else {
|
||||
workspaceSelector = fmt.Sprintf(`label_kubesphere_io_workspace=~"%s", label_kubesphere_io_workspace!=""`, o.ResourceFilter)
|
||||
}
|
||||
return strings.Replace(tmpl, "$1", workspaceSelector, -1)
|
||||
}
|
||||
|
||||
func makeNamespaceMetricExpression(tmpl string, o monitoring.QueryOptions) string {
|
||||
var namespaceSelector string
|
||||
|
||||
// For monitoring namespaces in the specific workspace
|
||||
// GET /workspaces/{workspace}/namespaces
|
||||
if o.WorkspaceName != "" {
|
||||
namespaceSelector = fmt.Sprintf(`label_kubesphere_io_workspace="%s", namespace=~"%s"`, o.WorkspaceName, o.ResourceFilter)
|
||||
return strings.Replace(tmpl, "$1", namespaceSelector, -1)
|
||||
}
|
||||
|
||||
// For monitoring the specific namespaces
|
||||
// GET /namespaces/{namespace} or
|
||||
// GET /namespaces
|
||||
if o.NamespaceName != "" {
|
||||
namespaceSelector = fmt.Sprintf(`namespace="%s"`, o.NamespaceName)
|
||||
} else {
|
||||
namespaceSelector = fmt.Sprintf(`namespace=~"%s"`, o.ResourceFilter)
|
||||
}
|
||||
return strings.Replace(tmpl, "$1", namespaceSelector, -1)
|
||||
}
|
||||
|
||||
func makeWorkloadMetricExpression(tmpl string, o monitoring.QueryOptions) string {
|
||||
var kindSelector, workloadSelector string
|
||||
switch o.WorkloadKind {
|
||||
case "deployment":
|
||||
o.WorkloadKind = Deployment
|
||||
kindSelector = fmt.Sprintf(`namespace="%s", deployment!="", deployment=~"%s"`, o.NamespaceName, o.ResourceFilter)
|
||||
case "statefulset":
|
||||
o.WorkloadKind = StatefulSet
|
||||
kindSelector = fmt.Sprintf(`namespace="%s", statefulset!="", statefulset=~"%s"`, o.NamespaceName, o.ResourceFilter)
|
||||
case "daemonset":
|
||||
o.WorkloadKind = DaemonSet
|
||||
kindSelector = fmt.Sprintf(`namespace="%s", daemonset!="", daemonset=~"%s"`, o.NamespaceName, o.ResourceFilter)
|
||||
default:
|
||||
o.WorkloadKind = ".*"
|
||||
kindSelector = fmt.Sprintf(`namespace="%s"`, o.NamespaceName)
|
||||
}
|
||||
workloadSelector = fmt.Sprintf(`namespace="%s", workload=~"%s:%s"`, o.NamespaceName, o.WorkloadKind, o.ResourceFilter)
|
||||
return strings.NewReplacer("$1", workloadSelector, "$2", kindSelector).Replace(tmpl)
|
||||
}
|
||||
|
||||
func makePodMetricExpression(tmpl string, o monitoring.QueryOptions) string {
|
||||
var podSelector, workloadSelector string
|
||||
|
||||
// For monitoriong pods of the specific workload
|
||||
// GET /namespaces/{namespace}/workloads/{kind}/{workload}/pods
|
||||
if o.WorkloadName != "" {
|
||||
switch o.WorkloadKind {
|
||||
case "deployment":
|
||||
workloadSelector = fmt.Sprintf(`owner_kind="ReplicaSet", owner_name=~"^%s-[^-]{1,10}$"`, o.WorkloadKind)
|
||||
case "statefulset":
|
||||
workloadSelector = fmt.Sprintf(`owner_kind="StatefulSet", owner_name="%s"`, o.WorkloadKind)
|
||||
case "daemonset":
|
||||
workloadSelector = fmt.Sprintf(`owner_kind="DaemonSet", owner_name="%s"`, o.WorkloadKind)
|
||||
}
|
||||
}
|
||||
|
||||
// For monitoring pods in the specific namespace
|
||||
// GET /namespaces/{namespace}/workloads/{kind}/{workload}/pods or
|
||||
// GET /namespaces/{namespace}/pods/{pod} or
|
||||
// GET /namespaces/{namespace}/pods
|
||||
if o.NamespaceName != "" {
|
||||
if o.PodName != "" {
|
||||
podSelector = fmt.Sprintf(`pod="%s", namespace="%s"`, o.PodName, o.NamespaceName)
|
||||
} else {
|
||||
podSelector = fmt.Sprintf(`pod=~"%s", namespace="%s"`, o.ResourceFilter, o.NamespaceName)
|
||||
}
|
||||
}
|
||||
|
||||
// For monitoring pods on the specific node
|
||||
// GET /nodes/{node}/pods/{pod}
|
||||
if o.PodName != "" {
|
||||
if o.PodName != "" {
|
||||
podSelector = fmt.Sprintf(`pod="%s", node="%s"`, o.PodName, o.NodeName)
|
||||
} else {
|
||||
podSelector = fmt.Sprintf(`pod=~"%s", node="%s"`, o.ResourceFilter, o.NodeName)
|
||||
}
|
||||
}
|
||||
return strings.NewReplacer("$1", workloadSelector, "$2", podSelector).Replace(tmpl)
|
||||
}
|
||||
|
||||
func makeContainerMetricExpression(tmpl string, o monitoring.QueryOptions) string {
|
||||
var containerSelector string
|
||||
if o.ContainerName != "" {
|
||||
containerSelector = fmt.Sprintf(`pod_name="%s", namespace="%s", container_name="%s"`, o.PodName, o.NamespaceName, o.ContainerName)
|
||||
} else {
|
||||
containerSelector = fmt.Sprintf(`pod_name="%s", namespace="%s", container_name=~"%s"`, o.PodName, o.NamespaceName, o.ResourceFilter)
|
||||
}
|
||||
return strings.Replace(tmpl, "$1", containerSelector, -1)
|
||||
}
|
||||
|
||||
func makePVCMetricExpression(tmpl string, o monitoring.QueryOptions) string {
|
||||
var pvcSelector string
|
||||
|
||||
// For monitoring persistentvolumeclaims in the specific namespace
|
||||
// GET /namespaces/{namespace}/persistentvolumeclaims/{persistentvolumeclaim} or
|
||||
// GET /namespaces/{namespace}/persistentvolumeclaims
|
||||
if o.NamespaceName != "" {
|
||||
if o.PersistentVolumeClaimName != "" {
|
||||
pvcSelector = fmt.Sprintf(`namespace="%s", persistentvolumeclaim="%s"`, o.NamespaceName, o.PersistentVolumeClaimName)
|
||||
} else {
|
||||
pvcSelector = fmt.Sprintf(`namespace="%s", persistentvolumeclaim=~"%s"`, o.NamespaceName, o.ResourceFilter)
|
||||
}
|
||||
return strings.Replace(tmpl, "$1", pvcSelector, -1)
|
||||
}
|
||||
|
||||
// For monitoring persistentvolumeclaims of the specific storageclass
|
||||
// GET /storageclasses/{storageclass}/persistentvolumeclaims
|
||||
if o.StorageClassName != "" {
|
||||
pvcSelector = fmt.Sprintf(`storageclass="%s", persistentvolumeclaim=~"%s"`, o.StorageClassName, o.ResourceFilter)
|
||||
}
|
||||
return strings.Replace(tmpl, "$1", pvcSelector, -1)
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
package monitoring
|
||||
162
pkg/simple/client/monitoring/query_options.go
Normal file
162
pkg/simple/client/monitoring/query_options.go
Normal file
@@ -0,0 +1,162 @@
|
||||
package monitoring
|
||||
|
||||
// QueryOption mutates a QueryOptions; concrete implementations
// (ClusterOption, NodeOption, ...) each scope a query to one monitoring level.
type QueryOption interface {
	Apply(*QueryOptions)
}
|
||||
|
||||
// QueryOptions is the accumulated set of query parameters produced by
// applying QueryOption values. Only the fields relevant to the chosen
// Level are expected to be populated.
type QueryOptions struct {
	// Level selects which metric family and selector-building path is used.
	Level        MonitoringLevel
	// NamedMetrics lists the metric names queried at this level.
	NamedMetrics []string

	// MetricFilter narrows NamedMetrics; ResourceFilter is a regex applied
	// to resource names in the generated selectors.
	MetricFilter   string
	ResourceFilter string
	// Scope identifiers; which ones are meaningful depends on Level.
	NodeName                  string
	WorkspaceName             string
	NamespaceName             string
	WorkloadKind              string
	WorkloadName              string
	PodName                   string
	ContainerName             string
	StorageClassName          string
	PersistentVolumeClaimName string
}
|
||||
|
||||
func NewQueryOptions() *QueryOptions {
|
||||
return &QueryOptions{}
|
||||
}
|
||||
|
||||
type ClusterOption struct {
|
||||
MetricFilter string
|
||||
}
|
||||
|
||||
func (co ClusterOption) Apply(o *QueryOptions) {
|
||||
o.Level = LevelCluster
|
||||
o.NamedMetrics = ClusterMetrics
|
||||
}
|
||||
|
||||
type NodeOption struct {
|
||||
MetricFilter string
|
||||
ResourceFilter string
|
||||
NodeName string
|
||||
}
|
||||
|
||||
func (no NodeOption) Apply(o *QueryOptions) {
|
||||
o.Level = LevelNode
|
||||
o.NamedMetrics = NodeMetrics
|
||||
o.ResourceFilter = no.ResourceFilter
|
||||
o.NodeName = no.NodeName
|
||||
}
|
||||
|
||||
type WorkspaceOption struct {
|
||||
MetricFilter string
|
||||
ResourceFilter string
|
||||
WorkspaceName string
|
||||
}
|
||||
|
||||
func (wo WorkspaceOption) Apply(o *QueryOptions) {
|
||||
o.Level = LevelWorkspace
|
||||
o.NamedMetrics = WorkspaceMetrics
|
||||
o.MetricFilter = wo.MetricFilter
|
||||
o.ResourceFilter = wo.ResourceFilter
|
||||
o.WorkspaceName = wo.WorkspaceName
|
||||
}
|
||||
|
||||
type NamespaceOption struct {
|
||||
MetricFilter string
|
||||
ResourceFilter string
|
||||
WorkspaceName string
|
||||
NamespaceName string
|
||||
}
|
||||
|
||||
func (no NamespaceOption) Apply(o *QueryOptions) {
|
||||
o.Level = LevelNamespace
|
||||
o.NamedMetrics = NamespaceMetrics
|
||||
o.MetricFilter = no.MetricFilter
|
||||
o.ResourceFilter = no.ResourceFilter
|
||||
o.WorkspaceName = no.WorkspaceName
|
||||
o.NamespaceName = no.NamespaceName
|
||||
}
|
||||
|
||||
type WorkloadOption struct {
|
||||
MetricFilter string
|
||||
ResourceFilter string
|
||||
NamespaceName string
|
||||
WorkloadKind string
|
||||
WorkloadName string
|
||||
}
|
||||
|
||||
func (wo WorkloadOption) Apply(o *QueryOptions) {
|
||||
o.Level = LevelWorkload
|
||||
o.NamedMetrics = WorkspaceMetrics
|
||||
o.MetricFilter = wo.MetricFilter
|
||||
o.ResourceFilter = wo.ResourceFilter
|
||||
o.NamespaceName = wo.NamespaceName
|
||||
o.WorkloadKind = wo.WorkloadKind
|
||||
o.WorkloadName = wo.WorkloadName
|
||||
}
|
||||
|
||||
type PodOption struct {
|
||||
MetricFilter string
|
||||
ResourceFilter string
|
||||
NodeName string
|
||||
NamespaceName string
|
||||
WorkloadKind string
|
||||
WorkloadName string
|
||||
PodName string
|
||||
}
|
||||
|
||||
func (po PodOption) Apply(o *QueryOptions) {
|
||||
o.Level = LevelPod
|
||||
o.NamedMetrics = PodMetrics
|
||||
o.MetricFilter = po.MetricFilter
|
||||
o.ResourceFilter = po.ResourceFilter
|
||||
o.NamespaceName = po.NamespaceName
|
||||
o.WorkloadKind = po.WorkloadKind
|
||||
o.WorkloadName = po.WorkloadName
|
||||
}
|
||||
|
||||
type ContainerOption struct {
|
||||
MetricFilter string
|
||||
ResourceFilter string
|
||||
NamespaceName string
|
||||
PodName string
|
||||
ContainerName string
|
||||
}
|
||||
|
||||
func (co ContainerOption) Apply(o *QueryOptions) {
|
||||
o.Level = LevelContainer
|
||||
o.NamedMetrics = ContainerMetrics
|
||||
o.MetricFilter = co.MetricFilter
|
||||
o.ResourceFilter = co.ResourceFilter
|
||||
o.NamespaceName = co.NamespaceName
|
||||
o.PodName = co.PodName
|
||||
o.ContainerName = co.ContainerName
|
||||
}
|
||||
|
||||
type PVCOption struct {
|
||||
MetricFilter string
|
||||
ResourceFilter string
|
||||
NamespaceName string
|
||||
StorageClassName string
|
||||
PersistentVolumeClaimName string
|
||||
}
|
||||
|
||||
func (po PVCOption) Apply(o *QueryOptions) {
|
||||
o.Level = LevelPVC
|
||||
o.NamedMetrics = PVCMetrics
|
||||
o.MetricFilter = po.MetricFilter
|
||||
o.ResourceFilter = po.ResourceFilter
|
||||
o.NamespaceName = po.NamespaceName
|
||||
o.StorageClassName = po.StorageClassName
|
||||
o.PersistentVolumeClaimName = po.PersistentVolumeClaimName
|
||||
}
|
||||
|
||||
type ComponentOption struct {
|
||||
MetricFilter string
|
||||
}
|
||||
|
||||
func (co ComponentOption) Apply(o *QueryOptions) {
|
||||
o.Level = LevelComponent
|
||||
o.NamedMetrics = ComponentMetrics
|
||||
o.MetricFilter = co.MetricFilter
|
||||
}
|
||||
Reference in New Issue
Block a user