refactor monitoring (#1751)

Signed-off-by: huanggze <loganhuang@yunify.com>
This commit is contained in:
Guangzhe Huang
2020-03-07 12:34:52 +08:00
committed by GitHub
parent 6c6bfb2677
commit 148a804726
30 changed files with 1606 additions and 2073 deletions

View File

@@ -1,32 +1,41 @@
package monitoring
type ClusterQuery struct {
import (
	"encoding/json"
	"fmt"
	"strconv"
	"time"
)
// Values carried by Metric.Status.
const (
	StatusSuccess = "success"
	StatusError   = "error"
)

// Values carried by MetricData.MetricType.
const (
	MetricTypeMatrix = "matrix"
	MetricTypeVector = "vector"
)
// Metric is the result of one metric query: the metric's name, a status, the
// returned data and, on failure, an error type and message.
type Metric struct {
// MetricName is omitted from JSON when empty.
MetricName string `json:"metric_name,omitempty" description:"metric name, eg. scheduler_up_sum"`
// Status is one of StatusSuccess or StatusError per the field description.
Status string `json:"status" description:"result status, one of error, success"`
// MetricData is embedded but tagged, so it is encoded under the "data" key
// rather than being flattened into the parent object.
MetricData `json:"data" description:"actual metric result"`
// ErrorType and Error are omitted from JSON when empty.
ErrorType string `json:"errorType,omitempty"`
Error string `json:"error,omitempty"`
}
type ClusterMetrics struct {
// MetricData holds the payload of a query result: the kind of result and the
// list of returned time series.
type MetricData struct {
// MetricType is encoded as "resultType"; per the description it is one of
// MetricTypeMatrix or MetricTypeVector.
MetricType string `json:"resultType" description:"result type, one of matrix, vector"`
// MetricValues is encoded as "result".
MetricValues []MetricValue `json:"result" description:"metric data including labels, time series and values"`
}
type WorkspaceQuery struct {
// Point is a single sample stored as a two-element array. Index 0 is the
// timestamp and index 1 is the sample value.
type Point [2]float64

// UnmarshalJSON decodes a two-element JSON array into the Point.
//
// The Prometheus HTTP API encodes a sample as [<unix_time>, "<value>"], where
// the value is a JSON string (which may also be "NaN", "+Inf" or "-Inf").
// A plain [2]float64 cannot decode that string, so this method accepts the
// value element either as a quoted number or as a bare JSON number. A JSON
// null leaves the Point unchanged. Any other shape is reported as an error.
func (p *Point) UnmarshalJSON(b []byte) error {
	var raw []json.RawMessage
	if err := json.Unmarshal(b, &raw); err != nil {
		return err
	}
	if raw == nil {
		// JSON null: by convention a no-op for Unmarshalers.
		return nil
	}
	if len(raw) != 2 {
		return fmt.Errorf("monitoring: point must have exactly 2 elements, got %d", len(raw))
	}

	var ts float64
	if err := json.Unmarshal(raw[0], &ts); err != nil {
		return fmt.Errorf("monitoring: decoding point timestamp: %v", err)
	}
	val, err := decodePointValue(raw[1])
	if err != nil {
		return fmt.Errorf("monitoring: decoding point value: %v", err)
	}

	*p = Point{ts, val}
	return nil
}

// decodePointValue parses a sample value given either as a JSON string
// holding a float (the Prometheus wire format) or as a bare JSON number.
func decodePointValue(raw json.RawMessage) (float64, error) {
	var s string
	if err := json.Unmarshal(raw, &s); err == nil {
		return strconv.ParseFloat(s, 64)
	}
	var f float64
	if err := json.Unmarshal(raw, &f); err != nil {
		return 0, err
	}
	return f, nil
}
// MetricValue is one time series of a query result: its labels plus either a
// single sample or a list of samples.
type MetricValue struct {
// Metadata is encoded as "metric" and omitted when empty.
Metadata map[string]string `json:"metric,omitempty" description:"time series labels"`
// Sample is encoded as "value"; per the description it is used for vector results.
Sample Point `json:"value,omitempty" description:"time series, values of vector type"`
// Series is encoded as "values"; per the description it is used for matrix results.
Series []Point `json:"values,omitempty" description:"time series, values of matrix type"`
}
// WorkspaceMetrics is an empty placeholder struct; it is the declared return
// type of Interface.GetWorkspaceMetrics.
type WorkspaceMetrics struct {
}
// NamespaceQuery is an empty placeholder struct; it is the declared parameter
// type of Interface.GetNamespaceMetrics.
type NamespaceQuery struct {
}
// NamespaceMetrics is an empty placeholder struct; it is the declared return
// type of Interface.GetNamespaceMetrics.
type NamespaceMetrics struct {
}
// Interface defines all the abstract behaviors of a monitoring backend.
type Interface interface {
// GetMetrics and GetMetricsOverTime query by raw statement. `stmts` holds the
// statements, expressions or rules (e.g. PromQL in Prometheus) that select
// specific metrics. GetMetrics takes a single time; GetMetricsOverTime takes a
// start/end window and a step.
GetMetrics(stmts []string, time time.Time) ([]Metric, error)
GetMetricsOverTime(stmts []string, start, end time.Time, step time.Duration) ([]Metric, error)
// GetClusterMetrics takes a ClusterQuery and returns ClusterMetrics.
GetClusterMetrics(query ClusterQuery) ClusterMetrics
// GetWorkspaceMetrics takes a WorkspaceQuery and returns WorkspaceMetrics.
GetWorkspaceMetrics(query WorkspaceQuery) WorkspaceMetrics
// GetNamespaceMetrics takes a NamespaceQuery and returns NamespaceMetrics.
GetNamespaceMetrics(query NamespaceQuery) NamespaceMetrics
// GetNamedMetrics and GetNamedMetricsOverTime query metrics by name
// (e.g. node_cpu_usage). Which names are queried is carried by `opt`
// (QueryOption is declared elsewhere; see its definition for the details).
GetNamedMetrics(time time.Time, opt QueryOption) ([]Metric, error)
GetNamedMetricsOverTime(start, end time.Time, step time.Duration, opt QueryOption) ([]Metric, error)
}

View File

@@ -0,0 +1,252 @@
package monitoring
// MonitoringLevel identifies the kind of resource a metric belongs to. Each
// level occupies its own bit, so the values are distinct powers of two.
type MonitoringLevel int

// Monitoring levels, assigned consecutive bits starting at 1.
const (
	LevelCluster MonitoringLevel = 1 << iota
	LevelNode
	LevelWorkspace
	LevelNamespace
	LevelWorkload
	LevelPod
	LevelContainer
	LevelPVC
	LevelComponent
)
// ClusterMetrics lists the names of the cluster-scoped metrics; every entry
// carries the "cluster_" prefix.
var ClusterMetrics = []string{
"cluster_cpu_utilisation",
"cluster_cpu_usage",
"cluster_cpu_total",
"cluster_memory_utilisation",
"cluster_memory_available",
"cluster_memory_total",
"cluster_memory_usage_wo_cache",
"cluster_net_utilisation",
"cluster_net_bytes_transmitted",
"cluster_net_bytes_received",
"cluster_disk_read_iops",
"cluster_disk_write_iops",
"cluster_disk_read_throughput",
"cluster_disk_write_throughput",
"cluster_disk_size_usage",
"cluster_disk_size_utilisation",
"cluster_disk_size_capacity",
"cluster_disk_size_available",
"cluster_disk_inode_total",
"cluster_disk_inode_usage",
"cluster_disk_inode_utilisation",
"cluster_namespace_count",
"cluster_pod_count",
"cluster_pod_quota",
"cluster_pod_utilisation",
"cluster_pod_running_count",
"cluster_pod_succeeded_count",
"cluster_pod_abnormal_count",
"cluster_node_online",
"cluster_node_offline",
"cluster_node_total",
"cluster_cronjob_count",
"cluster_pvc_count",
"cluster_daemonset_count",
"cluster_deployment_count",
"cluster_endpoint_count",
"cluster_hpa_count",
"cluster_job_count",
"cluster_statefulset_count",
"cluster_replicaset_count",
"cluster_service_count",
"cluster_secret_count",
"cluster_pv_count",
"cluster_ingresses_extensions_count",
"cluster_load1",
"cluster_load5",
"cluster_load15",
"cluster_pod_abnormal_ratio",
"cluster_node_offline_ratio",
}
// NodeMetrics lists the names of the node-scoped metrics; every entry carries
// the "node_" prefix.
var NodeMetrics = []string{
"node_cpu_utilisation",
"node_cpu_total",
"node_cpu_usage",
"node_memory_utilisation",
"node_memory_usage_wo_cache",
"node_memory_available",
"node_memory_total",
"node_net_utilisation",
"node_net_bytes_transmitted",
"node_net_bytes_received",
"node_disk_read_iops",
"node_disk_write_iops",
"node_disk_read_throughput",
"node_disk_write_throughput",
"node_disk_size_capacity",
"node_disk_size_available",
"node_disk_size_usage",
"node_disk_size_utilisation",
"node_disk_inode_total",
"node_disk_inode_usage",
"node_disk_inode_utilisation",
"node_pod_count",
"node_pod_quota",
"node_pod_utilisation",
"node_pod_running_count",
"node_pod_succeeded_count",
"node_pod_abnormal_count",
"node_load1",
"node_load5",
"node_load15",
"node_pod_abnormal_ratio",
}
// WorkspaceMetrics lists the names of the workspace-scoped metrics; every
// entry carries the "workspace_" prefix.
var WorkspaceMetrics = []string{
"workspace_cpu_usage",
"workspace_memory_usage",
"workspace_memory_usage_wo_cache",
"workspace_net_bytes_transmitted",
"workspace_net_bytes_received",
"workspace_pod_count",
"workspace_pod_running_count",
"workspace_pod_succeeded_count",
"workspace_pod_abnormal_count",
"workspace_ingresses_extensions_count",
"workspace_cronjob_count",
"workspace_pvc_count",
"workspace_daemonset_count",
"workspace_deployment_count",
"workspace_endpoint_count",
"workspace_hpa_count",
"workspace_job_count",
"workspace_statefulset_count",
"workspace_replicaset_count",
"workspace_service_count",
"workspace_secret_count",
"workspace_pod_abnormal_ratio",
}
// NamespaceMetrics lists the names of the namespace-scoped metrics; every
// entry carries the "namespace_" prefix.
var NamespaceMetrics = []string{
"namespace_cpu_usage",
"namespace_memory_usage",
"namespace_memory_usage_wo_cache",
"namespace_net_bytes_transmitted",
"namespace_net_bytes_received",
"namespace_pod_count",
"namespace_pod_running_count",
"namespace_pod_succeeded_count",
"namespace_pod_abnormal_count",
"namespace_pod_abnormal_ratio",
"namespace_memory_limit_hard",
"namespace_cpu_limit_hard",
"namespace_pod_count_hard",
"namespace_cronjob_count",
"namespace_pvc_count",
"namespace_daemonset_count",
"namespace_deployment_count",
"namespace_endpoint_count",
"namespace_hpa_count",
"namespace_job_count",
"namespace_statefulset_count",
"namespace_replicaset_count",
"namespace_service_count",
"namespace_secret_count",
"namespace_configmap_count",
"namespace_ingresses_extensions_count",
"namespace_s2ibuilder_count",
}
// WorkloadMetrics lists the names of the workload-scoped metrics; every entry
// carries the "workload_" prefix. Replica metrics are listed separately for
// deployments, statefulsets and daemonsets.
var WorkloadMetrics = []string{
"workload_cpu_usage",
"workload_memory_usage",
"workload_memory_usage_wo_cache",
"workload_net_bytes_transmitted",
"workload_net_bytes_received",
"workload_deployment_replica",
"workload_deployment_replica_available",
"workload_statefulset_replica",
"workload_statefulset_replica_available",
"workload_daemonset_replica",
"workload_daemonset_replica_available",
"workload_deployment_unavailable_replicas_ratio",
"workload_daemonset_unavailable_replicas_ratio",
"workload_statefulset_unavailable_replicas_ratio",
}
// PodMetrics lists the names of the pod-scoped metrics; every entry carries
// the "pod_" prefix.
var PodMetrics = []string{
"pod_cpu_usage",
"pod_memory_usage",
"pod_memory_usage_wo_cache",
"pod_net_bytes_transmitted",
"pod_net_bytes_received",
}
// ContainerMetrics lists the names of the container-scoped metrics; every
// entry carries the "container_" prefix.
var ContainerMetrics = []string{
"container_cpu_usage",
"container_memory_usage",
"container_memory_usage_wo_cache",
}
// PVCMetrics lists the names of the PVC-scoped metrics; every entry carries
// the "pvc_" prefix and covers either inodes or bytes.
var PVCMetrics = []string{
"pvc_inodes_available",
"pvc_inodes_used",
"pvc_inodes_total",
"pvc_inodes_utilisation",
"pvc_bytes_available",
"pvc_bytes_used",
"pvc_bytes_total",
"pvc_bytes_utilisation",
}
// ComponentMetrics lists the names of the component-scoped metrics. Entries
// are grouped by component prefix: "etcd_", "apiserver_", "scheduler_",
// "controller_manager_", "coredns_" and "prometheus_".
var ComponentMetrics = []string{
"etcd_server_list",
"etcd_server_total",
"etcd_server_up_total",
"etcd_server_has_leader",
"etcd_server_leader_changes",
"etcd_server_proposals_failed_rate",
"etcd_server_proposals_applied_rate",
"etcd_server_proposals_committed_rate",
"etcd_server_proposals_pending_count",
"etcd_mvcc_db_size",
"etcd_network_client_grpc_received_bytes",
"etcd_network_client_grpc_sent_bytes",
"etcd_grpc_call_rate",
"etcd_grpc_call_failed_rate",
"etcd_grpc_server_msg_received_rate",
"etcd_grpc_server_msg_sent_rate",
"etcd_disk_wal_fsync_duration",
"etcd_disk_wal_fsync_duration_quantile",
"etcd_disk_backend_commit_duration",
"etcd_disk_backend_commit_duration_quantile",
"apiserver_up_sum",
"apiserver_request_rate",
"apiserver_request_by_verb_rate",
"apiserver_request_latencies",
"apiserver_request_by_verb_latencies",
"scheduler_up_sum",
"scheduler_schedule_attempts",
"scheduler_schedule_attempt_rate",
"scheduler_e2e_scheduling_latency",
"scheduler_e2e_scheduling_latency_quantile",
"controller_manager_up_sum",
"coredns_up_sum",
"coredns_cache_hits",
"coredns_cache_misses",
"coredns_dns_request_rate",
"coredns_dns_request_duration",
"coredns_dns_request_duration_quantile",
"coredns_dns_request_by_type_rate",
"coredns_dns_request_by_rcode_rate",
"coredns_panic_rate",
"coredns_proxy_request_rate",
"coredns_proxy_request_duration",
"coredns_proxy_request_duration_quantile",
"prometheus_up_sum",
"prometheus_tsdb_head_samples_appended_rate",
}

View File

@@ -1,31 +0,0 @@
package monitoring
import (
"net/http"
"time"
)
// prometheus implements the monitoring Interface backed by Prometheus.
type prometheus struct {
// options is the configuration handed to NewPrometheus.
options *Options
// client is the HTTP client created by NewPrometheus.
client *http.Client
}
// NewPrometheus builds a Prometheus-backed Interface from options. The HTTP
// client it creates gives up on any request after 10 seconds.
func NewPrometheus(options *Options) Interface {
	httpClient := &http.Client{Timeout: 10 * time.Second}
	return &prometheus{options: options, client: httpClient}
}
// GetClusterMetrics is not implemented; calling it panics with "implement me".
func (p prometheus) GetClusterMetrics(query ClusterQuery) ClusterMetrics {
panic("implement me")
}
// GetWorkspaceMetrics is not implemented; calling it panics with "implement me".
func (p prometheus) GetWorkspaceMetrics(query WorkspaceQuery) WorkspaceMetrics {
panic("implement me")
}
// GetNamespaceMetrics is not implemented; calling it panics with "implement me".
func (p prometheus) GetNamespaceMetrics(query NamespaceQuery) NamespaceMetrics {
panic("implement me")
}

View File

@@ -0,0 +1,178 @@
package prometheus
import (
"fmt"
"github.com/json-iterator/go"
"io/ioutil"
"kubesphere.io/kubesphere/pkg/simple/client/monitoring"
"net/http"
"net/url"
"regexp"
"sync"
"time"
)
// json is the jsoniter configuration that is compatible with the standard
// library; the json.Unmarshal calls in this file go through it.
var json = jsoniter.ConfigCompatibleWithStandardLibrary
// prometheus implements monitoring.Interface backed by Prometheus.
type prometheus struct {
// options is the configuration handed to NewPrometheus; its Endpoint field is
// used as the base URL of every query.
options *Options
// client is the HTTP client used to send the queries.
client *http.Client
}
// NewPrometheus builds a Prometheus-backed monitoring.Interface from options.
// The HTTP client it creates gives up on any request after 10 seconds.
func NewPrometheus(options *Options) monitoring.Interface {
	httpClient := &http.Client{Timeout: 10 * time.Second}
	return &prometheus{options: options, client: httpClient}
}
// GetMetrics is reserved for custom monitoring and is not implemented yet.
// It always returns a "not implemented" error rather than panicking, so a
// caller reaching it through monitoring.Interface gets an ordinary error.
//
// TODO(huanggze): reserve for custom monitoring
func (p *prometheus) GetMetrics(stmts []string, ts time.Time) ([]monitoring.Metric, error) {
	return nil, fmt.Errorf("prometheus: GetMetrics is not implemented")
}
// GetMetricsOverTime is reserved for custom monitoring and is not implemented
// yet. It always returns a "not implemented" error rather than panicking, so
// a caller reaching it through monitoring.Interface gets an ordinary error.
//
// TODO(huanggze): reserve for custom monitoring
func (p *prometheus) GetMetricsOverTime(stmts []string, start, end time.Time, step time.Duration) ([]monitoring.Metric, error) {
	return nil, fmt.Errorf("prometheus: GetMetricsOverTime is not implemented")
}
// GetNamedMetrics runs one instant query at ts for every metric name in the
// query options that matches the options' MetricFilter regular expression.
// The queries run concurrently, so the order of the returned metrics is not
// defined.
//
// It returns an error if MetricFilter is not a valid regular expression, or
// if any query fails; in the latter case the first recorded error is returned
// and the partial results are discarded.
func (p *prometheus) GetNamedMetrics(ts time.Time, o monitoring.QueryOption) ([]monitoring.Metric, error) {
	opts := monitoring.NewQueryOptions()
	o.Apply(opts)

	// Compile the filter once instead of once per metric, and report a bad
	// pattern instead of silently matching nothing.
	filter, err := regexp.Compile(opts.MetricFilter)
	if err != nil {
		return nil, fmt.Errorf("invalid metric filter %q: %v", opts.MetricFilter, err)
	}

	metrics := make([]monitoring.Metric, 0)
	var mtx sync.Mutex // guard metrics
	var wg sync.WaitGroup

	// The buffer of one is essential: nobody receives until wg.Wait returns,
	// so a non-blocking send on an unbuffered channel would always fall
	// through to default and lose the error.
	errCh := make(chan error, 1)

	for _, metric := range opts.NamedMetrics {
		if !filter.MatchString(metric) {
			continue
		}

		exp := makeExpression(metric, *opts)
		wg.Add(1)
		go func(metric, exp string) {
			defer wg.Done()

			res, err := p.query(exp, ts)
			if err != nil {
				select {
				case errCh <- err: // Record the first error only
				default:
				}
				return
			}

			res.MetricName = metric // Add metric name
			mtx.Lock()
			metrics = append(metrics, res)
			mtx.Unlock()
		}(metric, exp)
	}

	wg.Wait()

	select {
	case err := <-errCh:
		return nil, err
	default:
		return metrics, nil
	}
}
// GetNamedMetricsOverTime runs one range query over [start, end] with the
// given step for every metric name in the query options that matches the
// options' MetricFilter regular expression. The queries run concurrently, so
// the order of the returned metrics is not defined.
//
// It returns an error if MetricFilter is not a valid regular expression, or
// if any query fails; in the latter case the first recorded error is returned
// and the partial results are discarded.
func (p *prometheus) GetNamedMetricsOverTime(start, end time.Time, step time.Duration, o monitoring.QueryOption) ([]monitoring.Metric, error) {
	opts := monitoring.NewQueryOptions()
	o.Apply(opts)

	// Compile the filter once instead of once per metric, and report a bad
	// pattern instead of silently matching nothing.
	filter, err := regexp.Compile(opts.MetricFilter)
	if err != nil {
		return nil, fmt.Errorf("invalid metric filter %q: %v", opts.MetricFilter, err)
	}

	metrics := make([]monitoring.Metric, 0)
	var mtx sync.Mutex // guard metrics
	var wg sync.WaitGroup

	// The buffer of one is essential: nobody receives until wg.Wait returns,
	// so a non-blocking send on an unbuffered channel would always fall
	// through to default and lose the error.
	errCh := make(chan error, 1)

	for _, metric := range opts.NamedMetrics {
		if !filter.MatchString(metric) {
			continue
		}

		exp := makeExpression(metric, *opts)
		wg.Add(1)
		go func(metric, exp string) {
			defer wg.Done()

			res, err := p.rangeQuery(exp, start, end, step)
			if err != nil {
				select {
				case errCh <- err: // Record the first error only
				default:
				}
				return
			}

			res.MetricName = metric // Add metric name
			mtx.Lock()
			metrics = append(metrics, res)
			mtx.Unlock()
		}(metric, exp)
	}

	wg.Wait()

	select {
	case err := <-errCh:
		return nil, err
	default:
		return metrics, nil
	}
}
// query evaluates exp as an instant query at ts against the configured
// endpoint's /api/v1/query API and decodes the JSON response into a
// monitoring.Metric.
//
// The HTTP status code is deliberately not inspected: the response body is
// decoded as-is, so an error payload ends up in the Metric's Status,
// ErrorType and Error fields. An error is returned only when the request,
// reading the body, or decoding fails.
func (p prometheus) query(exp string, ts time.Time) (monitoring.Metric, error) {
	params := url.Values{}
	params.Set("time", ts.Format(time.RFC3339))
	params.Set("query", exp)
	u := fmt.Sprintf("%s/api/v1/query?%s", p.options.Endpoint, params.Encode())

	response, err := p.client.Get(u)
	if err != nil {
		return monitoring.Metric{}, err
	}
	// Close right after the error check so the body is released even when
	// reading it fails.
	defer response.Body.Close()

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return monitoring.Metric{}, err
	}

	var m monitoring.Metric
	// Unmarshal needs a pointer; passing the value always fails.
	if err := json.Unmarshal(body, &m); err != nil {
		return monitoring.Metric{}, err
	}
	return m, nil
}
// rangeQuery evaluates exp over [start, end] at the given step against the
// configured endpoint's /api/v1/query_range API and decodes the JSON
// response into a monitoring.Metric.
//
// The HTTP status code is deliberately not inspected: the response body is
// decoded as-is, so an error payload ends up in the Metric's Status,
// ErrorType and Error fields. An error is returned only when the request,
// reading the body, or decoding fails.
func (p prometheus) rangeQuery(exp string, start, end time.Time, step time.Duration) (monitoring.Metric, error) {
	params := url.Values{}
	params.Set("start", start.Format(time.RFC3339))
	params.Set("end", end.Format(time.RFC3339))
	// Send the step as a plain number of seconds. Go's Duration.String()
	// produces compound forms such as "1m0s", which Prometheus versions that
	// only accept a single-unit duration reject; a float is always accepted.
	params.Set("step", fmt.Sprintf("%.3f", step.Seconds()))
	params.Set("query", exp)
	// Range queries are served by query_range; /api/v1/query only evaluates
	// a single instant and ignores start/end/step.
	u := fmt.Sprintf("%s/api/v1/query_range?%s", p.options.Endpoint, params.Encode())

	response, err := p.client.Get(u)
	if err != nil {
		return monitoring.Metric{}, err
	}
	// Close right after the error check so the body is released even when
	// reading it fails.
	defer response.Body.Close()

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return monitoring.Metric{}, err
	}

	var m monitoring.Metric
	// Unmarshal needs a pointer; passing the value always fails.
	if err := json.Unmarshal(body, &m); err != nil {
		return monitoring.Metric{}, err
	}
	return m, nil
}

View File

@@ -1,4 +1,4 @@
package monitoring
package prometheus
import (
"github.com/spf13/pflag"

View File

@@ -0,0 +1,415 @@
/*
Copyright 2019 The KubeSphere Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package prometheus
import (
"fmt"
"kubesphere.io/kubesphere/pkg/simple/client/monitoring"
"strings"
)
// StatefulSet is the kind name used for StatefulSet workloads.
const StatefulSet = "StatefulSet"

// DaemonSet is the kind name used for DaemonSet workloads.
const DaemonSet = "DaemonSet"

// Deployment is the kind name used for Deployment workloads.
const Deployment = "Deployment"
//TODO(huanggze): move this part to a ConfigMap
var promQLTemplates = map[string]string{
//cluster
"cluster_cpu_utilisation": ":node_cpu_utilisation:avg1m",
"cluster_cpu_usage": `round(:node_cpu_utilisation:avg1m * sum(node:node_num_cpu:sum), 0.001)`,
"cluster_cpu_total": "sum(node:node_num_cpu:sum)",
"cluster_memory_utilisation": ":node_memory_utilisation:",
"cluster_memory_available": "sum(node:node_memory_bytes_available:sum)",
"cluster_memory_total": "sum(node:node_memory_bytes_total:sum)",
"cluster_memory_usage_wo_cache": "sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)",
"cluster_net_utilisation": ":node_net_utilisation:sum_irate",
"cluster_net_bytes_transmitted": "sum(node:node_net_bytes_transmitted:sum_irate)",
"cluster_net_bytes_received": "sum(node:node_net_bytes_received:sum_irate)",
"cluster_disk_read_iops": "sum(node:data_volume_iops_reads:sum)",
"cluster_disk_write_iops": "sum(node:data_volume_iops_writes:sum)",
"cluster_disk_read_throughput": "sum(node:data_volume_throughput_bytes_read:sum)",
"cluster_disk_write_throughput": "sum(node:data_volume_throughput_bytes_written:sum)",
"cluster_disk_size_usage": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
"cluster_disk_size_utilisation": `cluster:disk_utilization:ratio`,
"cluster_disk_size_capacity": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
"cluster_disk_size_available": `sum(max(node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
"cluster_disk_inode_total": `sum(node:node_inodes_total:)`,
"cluster_disk_inode_usage": `sum(node:node_inodes_total:) - sum(node:node_inodes_free:)`,
"cluster_disk_inode_utilisation": `cluster:disk_inode_utilization:ratio`,
"cluster_namespace_count": `count(kube_namespace_labels)`,
"cluster_pod_count": `cluster:pod:sum`,
"cluster_pod_quota": `sum(max(kube_node_status_capacity_pods) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`,
"cluster_pod_utilisation": `cluster:pod_utilization:ratio`,
"cluster_pod_running_count": `cluster:pod_running:count`,
"cluster_pod_succeeded_count": `count(kube_pod_info unless on (pod) (kube_pod_status_phase{phase=~"Failed|Pending|Unknown|Running"} > 0) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`,
"cluster_pod_abnormal_count": `cluster:pod_abnormal:sum`,
"cluster_node_online": `sum(kube_node_status_condition{condition="Ready",status="true"})`,
"cluster_node_offline": `cluster:node_offline:sum`,
"cluster_node_total": `sum(kube_node_status_condition{condition="Ready"})`,
"cluster_cronjob_count": `sum(kube_cronjob_labels)`,
"cluster_pvc_count": `sum(kube_persistentvolumeclaim_info)`,
"cluster_daemonset_count": `sum(kube_daemonset_labels)`,
"cluster_deployment_count": `sum(kube_deployment_labels)`,
"cluster_endpoint_count": `sum(kube_endpoint_labels)`,
"cluster_hpa_count": `sum(kube_hpa_labels)`,
"cluster_job_count": `sum(kube_job_labels)`,
"cluster_statefulset_count": `sum(kube_statefulset_labels)`,
"cluster_replicaset_count": `count(kube_replicaset_labels)`,
"cluster_service_count": `sum(kube_service_info)`,
"cluster_secret_count": `sum(kube_secret_info)`,
"cluster_pv_count": `sum(kube_persistentvolume_labels)`,
"cluster_ingresses_extensions_count": `sum(kube_ingress_labels)`,
"cluster_load1": `sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
"cluster_load5": `sum(node_load5{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
"cluster_load15": `sum(node_load15{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
"cluster_pod_abnormal_ratio": `cluster:pod_abnormal:ratio`,
"cluster_node_offline_ratio": `cluster:node_offline:ratio`,
//node
"node_cpu_utilisation": "node:node_cpu_utilisation:avg1m{$1}",
"node_cpu_total": "node:node_num_cpu:sum{$1}",
"node_memory_utilisation": "node:node_memory_utilisation:{$1}",
"node_memory_available": "node:node_memory_bytes_available:sum{$1}",
"node_memory_total": "node:node_memory_bytes_total:sum{$1}",
"node_memory_usage_wo_cache": "node:node_memory_bytes_total:sum{$1} - node:node_memory_bytes_available:sum{$1}",
"node_net_utilisation": "node:node_net_utilisation:sum_irate{$1}",
"node_net_bytes_transmitted": "node:node_net_bytes_transmitted:sum_irate{$1}",
"node_net_bytes_received": "node:node_net_bytes_received:sum_irate{$1}",
"node_disk_read_iops": "node:data_volume_iops_reads:sum{$1}",
"node_disk_write_iops": "node:data_volume_iops_writes:sum{$1}",
"node_disk_read_throughput": "node:data_volume_throughput_bytes_read:sum{$1}",
"node_disk_write_throughput": "node:data_volume_throughput_bytes_written:sum{$1}",
"node_disk_size_capacity": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{$1}) by (device, node)) by (node)`,
"node_disk_size_available": `node:disk_space_available:{$1}`,
"node_disk_size_usage": `sum(max((node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{$1}) by (device, node)) by (node)`,
"node_disk_size_utilisation": `node:disk_space_utilization:ratio{$1}`,
"node_disk_inode_total": `node:node_inodes_total:{$1}`,
"node_disk_inode_usage": `node:node_inodes_total:{$1} - node:node_inodes_free:{$1}`,
"node_disk_inode_utilisation": `node:disk_inode_utilization:ratio{$1}`,
"node_pod_count": `node:pod_count:sum{$1}`,
"node_pod_quota": `max(kube_node_status_capacity_pods{$1}) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0)`,
"node_pod_utilisation": `node:pod_utilization:ratio{$1}`,
"node_pod_running_count": `node:pod_running:count{$1}`,
"node_pod_succeeded_count": `node:pod_succeeded:count{$1}`,
"node_pod_abnormal_count": `node:pod_abnormal:count{$1}`,
"node_cpu_usage": `round(node:node_cpu_utilisation:avg1m{$1} * node:node_num_cpu:sum{$1}, 0.001)`,
"node_load1": `node:load1:ratio{$1}`,
"node_load5": `node:load5:ratio{$1}`,
"node_load15": `node:load15:ratio{$1}`,
"node_pod_abnormal_ratio": `node:pod_abnormal:ratio{$1}`,
// workspace
"workspace_cpu_usage": `round(sum by (label_kubesphere_io_workspace) (namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}), 0.001)`,
"workspace_memory_usage": `sum by (label_kubesphere_io_workspace) (namespace:container_memory_usage_bytes:sum{namespace!="", $1})`,
"workspace_memory_usage_wo_cache": `sum by (label_kubesphere_io_workspace) (namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1})`,
"workspace_net_bytes_transmitted": `sum by (label_kubesphere_io_workspace) (sum by (namespace) (irate(container_network_transmit_bytes_total{namespace!="", pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"workspace_net_bytes_received": `sum by (label_kubesphere_io_workspace) (sum by (namespace) (irate(container_network_receive_bytes_total{namespace!="", pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"workspace_pod_count": `sum by (label_kubesphere_io_workspace) (kube_pod_status_phase{phase!~"Failed|Succeeded", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_pod_running_count": `sum by (label_kubesphere_io_workspace) (kube_pod_status_phase{phase="Running", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_pod_succeeded_count": `sum by (label_kubesphere_io_workspace) (kube_pod_status_phase{phase="Succeeded", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_pod_abnormal_count": `count by (label_kubesphere_io_workspace) ((kube_pod_info{node!=""} unless on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Succeeded"}>0) unless on (pod, namespace) ((kube_pod_status_ready{job="kube-state-metrics", condition="true"}>0) and on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Running"}>0)) unless on (pod, namespace) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", reason="ContainerCreating"}>0)) * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_ingresses_extensions_count": `sum by (label_kubesphere_io_workspace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_cronjob_count": `sum by (label_kubesphere_io_workspace) (kube_cronjob_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_pvc_count": `sum by (label_kubesphere_io_workspace) (kube_persistentvolumeclaim_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_daemonset_count": `sum by (label_kubesphere_io_workspace) (kube_daemonset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_deployment_count": `sum by (label_kubesphere_io_workspace) (kube_deployment_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_endpoint_count": `sum by (label_kubesphere_io_workspace) (kube_endpoint_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_hpa_count": `sum by (label_kubesphere_io_workspace) (kube_hpa_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_job_count": `sum by (label_kubesphere_io_workspace) (kube_job_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_statefulset_count": `sum by (label_kubesphere_io_workspace) (kube_statefulset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_replicaset_count": `count by (label_kubesphere_io_workspace) (kube_replicaset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_service_count": `sum by (label_kubesphere_io_workspace) (kube_service_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_secret_count": `sum by (label_kubesphere_io_workspace) (kube_secret_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
"workspace_pod_abnormal_ratio": `count by (label_kubesphere_io_workspace) ((kube_pod_info{node!=""} unless on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Succeeded"}>0) unless on (pod, namespace) ((kube_pod_status_ready{job="kube-state-metrics", condition="true"}>0) and on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Running"}>0)) unless on (pod, namespace) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", reason="ContainerCreating"}>0)) * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1}) / sum by (label_kubesphere_io_workspace) (kube_pod_status_phase{phase!="Succeeded", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace)(kube_namespace_labels{$1}))`,
//namespace
"namespace_cpu_usage": `round(namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}, 0.001)`,
"namespace_memory_usage": `namespace:container_memory_usage_bytes:sum{namespace!="", $1}`,
"namespace_memory_usage_wo_cache": `namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1}`,
"namespace_net_bytes_transmitted": `sum by (namespace) (irate(container_network_transmit_bytes_total{namespace!="", pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m]) * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_net_bytes_received": `sum by (namespace) (irate(container_network_receive_bytes_total{namespace!="", pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m]) * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_pod_count": `sum by (namespace) (kube_pod_status_phase{phase!~"Failed|Succeeded", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_pod_running_count": `sum by (namespace) (kube_pod_status_phase{phase="Running", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_pod_succeeded_count": `sum by (namespace) (kube_pod_status_phase{phase="Succeeded", namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_pod_abnormal_count": `namespace:pod_abnormal:count{namespace!="", $1}`,
"namespace_pod_abnormal_ratio": `namespace:pod_abnormal:ratio{namespace!="", $1}`,
"namespace_memory_limit_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="limits.memory"} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_cpu_limit_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="limits.cpu"} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_pod_count_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="count/pods"} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_cronjob_count": `sum by (namespace) (kube_cronjob_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_pvc_count": `sum by (namespace) (kube_persistentvolumeclaim_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_daemonset_count": `sum by (namespace) (kube_daemonset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_deployment_count": `sum by (namespace) (kube_deployment_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_endpoint_count": `sum by (namespace) (kube_endpoint_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_hpa_count": `sum by (namespace) (kube_hpa_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_job_count": `sum by (namespace) (kube_job_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_statefulset_count": `sum by (namespace) (kube_statefulset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_replicaset_count": `count by (namespace) (kube_replicaset_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_service_count": `sum by (namespace) (kube_service_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_secret_count": `sum by (namespace) (kube_secret_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_configmap_count": `sum by (namespace) (kube_configmap_info{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_ingresses_extensions_count": `sum by (namespace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
"namespace_s2ibuilder_count": `sum by (namespace) (s2i_s2ibuilder_created{namespace!=""} * on (namespace) group_left(label_kubesphere_io_workspace) kube_namespace_labels{$1})`,
// workload
"workload_cpu_usage": `round(namespace:workload_cpu_usage:sum{$1}, 0.001)`,
"workload_memory_usage": `namespace:workload_memory_usage:sum{$1}`,
"workload_memory_usage_wo_cache": `namespace:workload_memory_usage_wo_cache:sum{$1}`,
"workload_net_bytes_transmitted": `namespace:workload_net_bytes_transmitted:sum_irate{$1}`,
"workload_net_bytes_received": `namespace:workload_net_bytes_received:sum_irate{$1}`,
"workload_deployment_replica": `label_join(sum (label_join(label_replace(kube_deployment_spec_replicas{$2}, "owner_kind", "Deployment", "", ""), "workload", "", "deployment")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_deployment_replica_available": `label_join(sum (label_join(label_replace(kube_deployment_status_replicas_available{$2}, "owner_kind", "Deployment", "", ""), "workload", "", "deployment")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_statefulset_replica": `label_join(sum (label_join(label_replace(kube_statefulset_replicas{$2}, "owner_kind", "StatefulSet", "", ""), "workload", "", "statefulset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_statefulset_replica_available": `label_join(sum (label_join(label_replace(kube_statefulset_status_replicas_current{$2}, "owner_kind", "StatefulSet", "", ""), "workload", "", "statefulset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_daemonset_replica": `label_join(sum (label_join(label_replace(kube_daemonset_status_desired_number_scheduled{$2}, "owner_kind", "DaemonSet", "", ""), "workload", "", "daemonset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_daemonset_replica_available": `label_join(sum (label_join(label_replace(kube_daemonset_status_number_available{$2}, "owner_kind", "DaemonSet", "", ""), "workload", "", "daemonset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_deployment_unavailable_replicas_ratio": `namespace:deployment_unavailable_replicas:ratio{$1}`,
"workload_daemonset_unavailable_replicas_ratio": `namespace:daemonset_unavailable_replicas:ratio{$1}`,
"workload_statefulset_unavailable_replicas_ratio": `namespace:statefulset_unavailable_replicas:ratio{$1}`,
// pod
"pod_cpu_usage": `round(label_join(sum by (namespace, pod_name) (irate(container_cpu_usage_seconds_total{job="kubelet", pod_name!="", image!=""}[5m])), "pod", "", "pod_name") * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.001)`,
"pod_memory_usage": `label_join(sum by (namespace, pod_name) (container_memory_usage_bytes{job="kubelet", pod_name!="", image!=""}), "pod", "", "pod_name") * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
"pod_memory_usage_wo_cache": `label_join(sum by (namespace, pod_name) (container_memory_working_set_bytes{job="kubelet", pod_name!="", image!=""}), "pod", "", "pod_name") * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
"pod_net_bytes_transmitted": `label_join(sum by (namespace, pod_name) (irate(container_network_transmit_bytes_total{pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])), "pod", "", "pod_name") * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
"pod_net_bytes_received": `label_join(sum by (namespace, pod_name) (irate(container_network_receive_bytes_total{pod_name!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])), "pod", "", "pod_name") * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
// container
"container_cpu_usage": `round(sum by (namespace, pod_name, container_name) (irate(container_cpu_usage_seconds_total{job="kubelet", container_name!="POD", container_name!="", image!="", $1}[5m])), 0.001)`,
"container_memory_usage": `sum by (namespace, pod_name, container_name) (container_memory_usage_bytes{job="kubelet", container_name!="POD", container_name!="", image!="", $1})`,
"container_memory_usage_wo_cache": `sum by (namespace, pod_name, container_name) (container_memory_working_set_bytes{job="kubelet", container_name!="POD", container_name!="", image!="", $1})`,
// pvc
"pvc_inodes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_free) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_inodes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_inodes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_inodes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used / kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_bytes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_available_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_bytes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_bytes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_bytes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
// component
"etcd_server_list": `label_replace(up{job="etcd"}, "node_ip", "$1", "instance", "(.*):.*")`,
"etcd_server_total": `count(up{job="etcd"})`,
"etcd_server_up_total": `etcd:up:sum`,
"etcd_server_has_leader": `label_replace(etcd_server_has_leader, "node_ip", "$1", "instance", "(.*):.*")`,
"etcd_server_leader_changes": `label_replace(etcd:etcd_server_leader_changes_seen:sum_changes, "node_ip", "$1", "node", "(.*)")`,
"etcd_server_proposals_failed_rate": `avg(etcd:etcd_server_proposals_failed:sum_irate)`,
"etcd_server_proposals_applied_rate": `avg(etcd:etcd_server_proposals_applied:sum_irate)`,
"etcd_server_proposals_committed_rate": `avg(etcd:etcd_server_proposals_committed:sum_irate)`,
"etcd_server_proposals_pending_count": `avg(etcd:etcd_server_proposals_pending:sum)`,
"etcd_mvcc_db_size": `avg(etcd:etcd_debugging_mvcc_db_total_size:sum)`,
"etcd_network_client_grpc_received_bytes": `sum(etcd:etcd_network_client_grpc_received_bytes:sum_irate)`,
"etcd_network_client_grpc_sent_bytes": `sum(etcd:etcd_network_client_grpc_sent_bytes:sum_irate)`,
"etcd_grpc_call_rate": `sum(etcd:grpc_server_started:sum_irate)`,
"etcd_grpc_call_failed_rate": `sum(etcd:grpc_server_handled:sum_irate)`,
"etcd_grpc_server_msg_received_rate": `sum(etcd:grpc_server_msg_received:sum_irate)`,
"etcd_grpc_server_msg_sent_rate": `sum(etcd:grpc_server_msg_sent:sum_irate)`,
"etcd_disk_wal_fsync_duration": `avg(etcd:etcd_disk_wal_fsync_duration:avg)`,
"etcd_disk_wal_fsync_duration_quantile": `avg(etcd:etcd_disk_wal_fsync_duration:histogram_quantile) by (quantile)`,
"etcd_disk_backend_commit_duration": `avg(etcd:etcd_disk_backend_commit_duration:avg)`,
"etcd_disk_backend_commit_duration_quantile": `avg(etcd:etcd_disk_backend_commit_duration:histogram_quantile) by (quantile)`,
"apiserver_up_sum": `apiserver:up:sum`,
"apiserver_request_rate": `apiserver:apiserver_request_count:sum_irate`,
"apiserver_request_by_verb_rate": `apiserver:apiserver_request_count:sum_verb_irate`,
"apiserver_request_latencies": `apiserver:apiserver_request_latencies:avg`,
"apiserver_request_by_verb_latencies": `apiserver:apiserver_request_latencies:avg_by_verb`,
"scheduler_up_sum": `scheduler:up:sum`,
"scheduler_schedule_attempts": `scheduler:scheduler_schedule_attempts:sum`,
"scheduler_schedule_attempt_rate": `scheduler:scheduler_schedule_attempts:sum_rate`,
"scheduler_e2e_scheduling_latency": `scheduler:scheduler_e2e_scheduling_latency:avg`,
"scheduler_e2e_scheduling_latency_quantile": `scheduler:scheduler_e2e_scheduling_latency:histogram_quantile`,
"controller_manager_up_sum": `controller_manager:up:sum`,
"coredns_up_sum": `coredns:up:sum`,
"coredns_cache_hits": `coredns:coredns_cache_hits_total:sum_irate`,
"coredns_cache_misses": `coredns:coredns_cache_misses:sum_irate`,
"coredns_dns_request_rate": `coredns:coredns_dns_request_count:sum_irate`,
"coredns_dns_request_duration": `coredns:coredns_dns_request_duration:avg`,
"coredns_dns_request_duration_quantile": `coredns:coredns_dns_request_duration:histogram_quantile`,
"coredns_dns_request_by_type_rate": `coredns:coredns_dns_request_type_count:sum_irate`,
"coredns_dns_request_by_rcode_rate": `coredns:coredns_dns_response_rcode_count:sum_irate`,
"coredns_panic_rate": `coredns:coredns_panic_count:sum_irate`,
"coredns_proxy_request_rate": `coredns:coredns_proxy_request_count:sum_irate`,
"coredns_proxy_request_duration": `coredns:coredns_proxy_request_duration:avg`,
"coredns_proxy_request_duration_quantile": `coredns:coredns_proxy_request_duration:histogram_quantile`,
"prometheus_up_sum": `prometheus:up:sum`,
"prometheus_tsdb_head_samples_appended_rate": `prometheus:prometheus_tsdb_head_samples_appended:sum_rate`,
}
// makeExpression looks up the PromQL template registered for metric and
// renders it for the monitoring level requested in opt.
//
// For node, workspace, namespace, workload, pod, container and PVC levels the
// template's selector placeholders are filled in by the matching helper and
// the rendered expression is returned. Cluster-level, component-level and
// unrecognized levels perform no substitution and return the template as-is;
// note that component templates such as etcd_server_list contain a literal
// "$1" inside label_replace, so they must stay untouched.
//
// A metric name that is not present in promQLTemplates yields an empty
// expression.
func makeExpression(metric string, opt monitoring.QueryOptions) string {
	tmpl := promQLTemplates[metric]
	switch opt.Level {
	case monitoring.LevelNode:
		// Strings are immutable: the helper's result must be returned,
		// calling it for its side effect leaves tmpl unchanged.
		return makeNodeMetricExpression(tmpl, opt)
	case monitoring.LevelWorkspace:
		return makeWorkspaceMetricExpression(tmpl, opt)
	case monitoring.LevelNamespace:
		return makeNamespaceMetricExpression(tmpl, opt)
	case monitoring.LevelWorkload:
		return makeWorkloadMetricExpression(tmpl, opt)
	case monitoring.LevelPod:
		return makePodMetricExpression(tmpl, opt)
	case monitoring.LevelContainer:
		return makeContainerMetricExpression(tmpl, opt)
	case monitoring.LevelPVC:
		return makePVCMetricExpression(tmpl, opt)
	}
	// monitoring.LevelCluster, monitoring.LevelComponent and anything else.
	return tmpl
}
// makeNodeMetricExpression replaces every "$1" in tmpl with a node label
// matcher: an exact match on o.NodeName when it is set, otherwise a regex
// match against o.ResourceFilter.
func makeNodeMetricExpression(tmpl string, o monitoring.QueryOptions) string {
	// Default to the regex form and narrow it when a single node is named.
	selector := fmt.Sprintf(`node=~"%s"`, o.ResourceFilter)
	if o.NodeName != "" {
		selector = fmt.Sprintf(`node="%s"`, o.NodeName)
	}
	return strings.Replace(tmpl, "$1", selector, -1)
}
// makeWorkspaceMetricExpression replaces every "$1" in tmpl with a matcher on
// the label_kubesphere_io_workspace label. A named workspace is matched
// exactly; otherwise o.ResourceFilter is used as a regex and series with an
// empty workspace label are excluded.
func makeWorkspaceMetricExpression(tmpl string, o monitoring.QueryOptions) string {
	selector := fmt.Sprintf(`label_kubesphere_io_workspace=~"%s", label_kubesphere_io_workspace!=""`, o.ResourceFilter)
	if o.WorkspaceName != "" {
		selector = fmt.Sprintf(`label_kubesphere_io_workspace="%s"`, o.WorkspaceName)
	}
	return strings.Replace(tmpl, "$1", selector, -1)
}
// makeNamespaceMetricExpression replaces every "$1" in tmpl with a namespace
// selector. The workspace scope takes precedence over a single namespace,
// which in turn takes precedence over the plain regex filter.
func makeNamespaceMetricExpression(tmpl string, o monitoring.QueryOptions) string {
	var selector string
	switch {
	case o.WorkspaceName != "":
		// Namespaces inside one workspace:
		// GET /workspaces/{workspace}/namespaces
		selector = fmt.Sprintf(`label_kubesphere_io_workspace="%s", namespace=~"%s"`, o.WorkspaceName, o.ResourceFilter)
	case o.NamespaceName != "":
		// A single namespace:
		// GET /namespaces/{namespace}
		selector = fmt.Sprintf(`namespace="%s"`, o.NamespaceName)
	default:
		// All namespaces matching the filter:
		// GET /namespaces
		selector = fmt.Sprintf(`namespace=~"%s"`, o.ResourceFilter)
	}
	return strings.Replace(tmpl, "$1", selector, -1)
}
// makeWorkloadMetricExpression renders a workload-level template.
//
// "$1" receives a selector on the combined workload label, formatted as
// "<kind>:<name regex>", and "$2" receives a selector on the kind-specific
// label (deployment, statefulset or daemonset). For any other kind the
// workload kind is matched with ".*" and "$2" only restricts the namespace.
func makeWorkloadMetricExpression(tmpl string, o monitoring.QueryOptions) string {
	var (
		kind         string // kind part of the workload label matcher
		kindSelector string // substituted for "$2"
	)
	switch o.WorkloadKind {
	case "deployment":
		kind = Deployment
		kindSelector = fmt.Sprintf(`namespace="%s", deployment!="", deployment=~"%s"`, o.NamespaceName, o.ResourceFilter)
	case "statefulset":
		kind = StatefulSet
		kindSelector = fmt.Sprintf(`namespace="%s", statefulset!="", statefulset=~"%s"`, o.NamespaceName, o.ResourceFilter)
	case "daemonset":
		kind = DaemonSet
		kindSelector = fmt.Sprintf(`namespace="%s", daemonset!="", daemonset=~"%s"`, o.NamespaceName, o.ResourceFilter)
	default:
		kind = ".*"
		kindSelector = fmt.Sprintf(`namespace="%s"`, o.NamespaceName)
	}

	workloadSelector := fmt.Sprintf(`namespace="%s", workload=~"%s:%s"`, o.NamespaceName, kind, o.ResourceFilter)
	replacer := strings.NewReplacer("$1", workloadSelector, "$2", kindSelector)
	return replacer.Replace(tmpl)
}
// makePodMetricExpression renders a pod-level template.
//
// "$1" receives a selector on the pod's owner (used with kube_pod_owner) and
// "$2" receives a selector on the pod itself (used with kube_pod_info). Either
// selector is left empty when the corresponding options are not set. When both
// a namespace and a node are given, the node-scoped pod selector wins.
func makePodMetricExpression(tmpl string, o monitoring.QueryOptions) string {
	var podSelector, workloadSelector string

	// For monitoring pods of the specific workload
	// GET /namespaces/{namespace}/workloads/{kind}/{workload}/pods
	if o.WorkloadName != "" {
		// owner_name must be matched against the workload's name; the kind is
		// already encoded in owner_kind.
		switch o.WorkloadKind {
		case "deployment":
			// Pods of a deployment are owned by its ReplicaSets, whose names
			// are the deployment name followed by a generated suffix.
			workloadSelector = fmt.Sprintf(`owner_kind="ReplicaSet", owner_name=~"^%s-[^-]{1,10}$"`, o.WorkloadName)
		case "statefulset":
			workloadSelector = fmt.Sprintf(`owner_kind="StatefulSet", owner_name="%s"`, o.WorkloadName)
		case "daemonset":
			workloadSelector = fmt.Sprintf(`owner_kind="DaemonSet", owner_name="%s"`, o.WorkloadName)
		}
	}

	// For monitoring pods in the specific namespace
	// GET /namespaces/{namespace}/workloads/{kind}/{workload}/pods or
	// GET /namespaces/{namespace}/pods/{pod} or
	// GET /namespaces/{namespace}/pods
	if o.NamespaceName != "" {
		if o.PodName != "" {
			podSelector = fmt.Sprintf(`pod="%s", namespace="%s"`, o.PodName, o.NamespaceName)
		} else {
			podSelector = fmt.Sprintf(`pod=~"%s", namespace="%s"`, o.ResourceFilter, o.NamespaceName)
		}
	}

	// For monitoring pods on the specific node
	// GET /nodes/{node}/pods/{pod}
	// Guard on the node name: guarding on the pod name made the regex branch
	// unreachable and clobbered namespace-scoped selectors with node="".
	if o.NodeName != "" {
		if o.PodName != "" {
			podSelector = fmt.Sprintf(`pod="%s", node="%s"`, o.PodName, o.NodeName)
		} else {
			podSelector = fmt.Sprintf(`pod=~"%s", node="%s"`, o.ResourceFilter, o.NodeName)
		}
	}

	return strings.NewReplacer("$1", workloadSelector, "$2", podSelector).Replace(tmpl)
}
// makeContainerMetricExpression replaces every "$1" in tmpl with a selector
// pinning the pod and namespace, plus either an exact container name or, when
// no container is named, a regex match against o.ResourceFilter.
func makeContainerMetricExpression(tmpl string, o monitoring.QueryOptions) string {
	if o.ContainerName == "" {
		byFilter := fmt.Sprintf(`pod_name="%s", namespace="%s", container_name=~"%s"`, o.PodName, o.NamespaceName, o.ResourceFilter)
		return strings.Replace(tmpl, "$1", byFilter, -1)
	}
	byName := fmt.Sprintf(`pod_name="%s", namespace="%s", container_name="%s"`, o.PodName, o.NamespaceName, o.ContainerName)
	return strings.Replace(tmpl, "$1", byName, -1)
}
// makePVCMetricExpression replaces every "$1" in tmpl with a
// persistentvolumeclaim selector. A namespace scope takes precedence over a
// storage class scope; when neither is set the placeholder is replaced with
// an empty selector.
func makePVCMetricExpression(tmpl string, o monitoring.QueryOptions) string {
	var selector string
	switch {
	case o.NamespaceName != "" && o.PersistentVolumeClaimName != "":
		// GET /namespaces/{namespace}/persistentvolumeclaims/{persistentvolumeclaim}
		selector = fmt.Sprintf(`namespace="%s", persistentvolumeclaim="%s"`, o.NamespaceName, o.PersistentVolumeClaimName)
	case o.NamespaceName != "":
		// GET /namespaces/{namespace}/persistentvolumeclaims
		selector = fmt.Sprintf(`namespace="%s", persistentvolumeclaim=~"%s"`, o.NamespaceName, o.ResourceFilter)
	case o.StorageClassName != "":
		// GET /storageclasses/{storageclass}/persistentvolumeclaims
		selector = fmt.Sprintf(`storageclass="%s", persistentvolumeclaim=~"%s"`, o.StorageClassName, o.ResourceFilter)
	}
	return strings.Replace(tmpl, "$1", selector, -1)
}

View File

@@ -1 +0,0 @@
package monitoring

View File

@@ -0,0 +1,162 @@
package monitoring
// QueryOption is implemented by the per-level option types in this file
// (ClusterOption, NodeOption, ...). Each implementation writes its own
// settings into the shared QueryOptions value.
type QueryOption interface {
// Apply copies the option's settings into the given QueryOptions.
Apply(*QueryOptions)
}
// QueryOptions is the flattened set of query parameters that the QueryOption
// implementations populate through Apply. Fields that a given level does not
// set keep their zero value.
type QueryOptions struct {
// Level is the monitoring level set by the applied option.
Level MonitoringLevel
// NamedMetrics is the per-level metric name list assigned by the applied option.
NamedMetrics []string
// MetricFilter is copied verbatim from the option.
// NOTE(review): presumably a pattern narrowing NamedMetrics; its consumer is not visible here.
MetricFilter string
// ResourceFilter is copied verbatim from the option.
// NOTE(review): presumably a regex over resource names; confirm against the query builder.
ResourceFilter string
NodeName string
WorkspaceName string
NamespaceName string
WorkloadKind string
WorkloadName string
PodName string
ContainerName string
StorageClassName string
PersistentVolumeClaimName string
}
// NewQueryOptions returns a pointer to a zero-valued QueryOptions, ready to
// be filled in by a QueryOption's Apply method.
func NewQueryOptions() *QueryOptions {
	return new(QueryOptions)
}
// ClusterOption is the QueryOption for cluster-level monitoring.
type ClusterOption struct {
MetricFilter string
}
// Apply configures o for a cluster-level query: it sets the level, selects
// the cluster metric list and copies the metric filter.
func (co ClusterOption) Apply(o *QueryOptions) {
	o.Level = LevelCluster
	o.NamedMetrics = ClusterMetrics
	// The filter was previously dropped here, unlike in the other options
	// (e.g. ComponentOption), so a caller-supplied MetricFilter had no effect.
	o.MetricFilter = co.MetricFilter
}
// NodeOption is the QueryOption for node-level monitoring.
type NodeOption struct {
MetricFilter string
ResourceFilter string
NodeName string
}
// Apply configures o for a node-level query: it sets the level, selects the
// node metric list and copies the metric filter, resource filter and node
// name.
func (no NodeOption) Apply(o *QueryOptions) {
	o.Level = LevelNode
	o.NamedMetrics = NodeMetrics
	// The filter was previously dropped here, unlike in the other options,
	// so a caller-supplied MetricFilter had no effect.
	o.MetricFilter = no.MetricFilter
	o.ResourceFilter = no.ResourceFilter
	o.NodeName = no.NodeName
}
// WorkspaceOption is the QueryOption for workspace-level monitoring.
type WorkspaceOption struct {
MetricFilter string
ResourceFilter string
WorkspaceName string
}
// Apply configures o for a workspace-level query, copying every field of the
// option and selecting the workspace metric list.
func (opt WorkspaceOption) Apply(o *QueryOptions) {
	// What to query.
	o.Level = LevelWorkspace
	o.NamedMetrics = WorkspaceMetrics

	// Which resources to query it for.
	o.WorkspaceName = opt.WorkspaceName
	o.ResourceFilter = opt.ResourceFilter
	o.MetricFilter = opt.MetricFilter
}
// NamespaceOption is the QueryOption for namespace-level monitoring.
type NamespaceOption struct {
MetricFilter string
ResourceFilter string
WorkspaceName string
NamespaceName string
}
// Apply configures o for a namespace-level query, copying every field of the
// option and selecting the namespace metric list.
func (opt NamespaceOption) Apply(o *QueryOptions) {
	// What to query.
	o.Level = LevelNamespace
	o.NamedMetrics = NamespaceMetrics

	// Which resources to query it for.
	o.NamespaceName = opt.NamespaceName
	o.WorkspaceName = opt.WorkspaceName
	o.ResourceFilter = opt.ResourceFilter
	o.MetricFilter = opt.MetricFilter
}
// WorkloadOption is the QueryOption for workload-level monitoring.
type WorkloadOption struct {
MetricFilter string
ResourceFilter string
NamespaceName string
WorkloadKind string
WorkloadName string
}
// Apply configures o for a workload-level query, copying every field of the
// option.
//
// NOTE(review): NamedMetrics is set to WorkspaceMetrics although Level is
// LevelWorkload; every other option in this file uses the list matching its
// own level. This looks like a copy-paste slip — confirm whether a
// workload-level metric list should be assigned here instead.
func (opt WorkloadOption) Apply(o *QueryOptions) {
	// What to query.
	o.Level = LevelWorkload
	o.NamedMetrics = WorkspaceMetrics

	// Which resources to query it for.
	o.NamespaceName = opt.NamespaceName
	o.WorkloadKind = opt.WorkloadKind
	o.WorkloadName = opt.WorkloadName
	o.ResourceFilter = opt.ResourceFilter
	o.MetricFilter = opt.MetricFilter
}
// PodOption is the QueryOption for pod-level monitoring.
type PodOption struct {
MetricFilter string
ResourceFilter string
NodeName string
NamespaceName string
WorkloadKind string
WorkloadName string
PodName string
}
// Apply configures o for a pod-level query: it sets the level, selects the
// pod metric list and copies every field of the option.
func (po PodOption) Apply(o *QueryOptions) {
	o.Level = LevelPod
	o.NamedMetrics = PodMetrics
	o.MetricFilter = po.MetricFilter
	o.ResourceFilter = po.ResourceFilter
	// NodeName and PodName were previously not propagated, so queries for a
	// single pod or for the pods on a node lost their scope.
	o.NodeName = po.NodeName
	o.NamespaceName = po.NamespaceName
	o.WorkloadKind = po.WorkloadKind
	o.WorkloadName = po.WorkloadName
	o.PodName = po.PodName
}
// ContainerOption is the QueryOption for container-level monitoring.
type ContainerOption struct {
MetricFilter string
ResourceFilter string
NamespaceName string
PodName string
ContainerName string
}
// Apply configures o for a container-level query, copying every field of the
// option and selecting the container metric list.
func (opt ContainerOption) Apply(o *QueryOptions) {
	// What to query.
	o.Level = LevelContainer
	o.NamedMetrics = ContainerMetrics

	// Which resources to query it for.
	o.NamespaceName = opt.NamespaceName
	o.PodName = opt.PodName
	o.ContainerName = opt.ContainerName
	o.ResourceFilter = opt.ResourceFilter
	o.MetricFilter = opt.MetricFilter
}
// PVCOption is the QueryOption for persistent-volume-claim-level monitoring.
type PVCOption struct {
MetricFilter string
ResourceFilter string
NamespaceName string
StorageClassName string
PersistentVolumeClaimName string
}
// Apply configures o for a PVC-level query, copying every field of the option
// and selecting the PVC metric list.
func (opt PVCOption) Apply(o *QueryOptions) {
	// What to query.
	o.Level = LevelPVC
	o.NamedMetrics = PVCMetrics

	// Which resources to query it for.
	o.NamespaceName = opt.NamespaceName
	o.StorageClassName = opt.StorageClassName
	o.PersistentVolumeClaimName = opt.PersistentVolumeClaimName
	o.ResourceFilter = opt.ResourceFilter
	o.MetricFilter = opt.MetricFilter
}
// ComponentOption is the QueryOption for component-level monitoring.
type ComponentOption struct {
MetricFilter string
}
// Apply configures o for a component-level query: it copies the metric
// filter, selects the component metric list and sets the level.
func (opt ComponentOption) Apply(o *QueryOptions) {
	o.MetricFilter = opt.MetricFilter
	o.NamedMetrics = ComponentMetrics
	o.Level = LevelComponent
}