Upgrade k8s package version (#5358)

* upgrade k8s package version

Signed-off-by: hongzhouzi <hongzhouzi@kubesphere.io>

* Script upgrade and code formatting.

Signed-off-by: hongzhouzi <hongzhouzi@kubesphere.io>

Signed-off-by: hongzhouzi <hongzhouzi@kubesphere.io>
hongzhouzi
2022-11-15 14:56:38 +08:00
committed by GitHub
parent 5f91c1663a
commit 44167aa47a
3106 changed files with 321340 additions and 172080 deletions


@@ -1,15 +1,15 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- lavalamp
- deads2k
- yue9944882
- MikeSpreitzer
- lavalamp
- deads2k
- yue9944882
- MikeSpreitzer
reviewers:
- lavalamp
- deads2k
- yue9944882
- MikeSpreitzer
- lavalamp
- deads2k
- yue9944882
- MikeSpreitzer
labels:
- sig/api-machinery
- area/apiserver
- sig/api-machinery
- area/apiserver


@@ -0,0 +1,93 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package flowcontrol
import (
"context"
"sync"
)
type priorityAndFairnessKeyType int
const (
// priorityAndFairnessInitializationSignalKey is a key under which
// initialization signal function for watch requests is stored
// in the context.
priorityAndFairnessInitializationSignalKey priorityAndFairnessKeyType = iota
)
// WithInitializationSignal creates a copy of parent context with
// priority and fairness initialization signal value.
func WithInitializationSignal(ctx context.Context, signal InitializationSignal) context.Context {
return context.WithValue(ctx, priorityAndFairnessInitializationSignalKey, signal)
}
// initializationSignalFrom returns an initialization signal function
// which when called signals that watch initialization has already finished
// to priority and fairness dispatcher.
func initializationSignalFrom(ctx context.Context) (InitializationSignal, bool) {
signal, ok := ctx.Value(priorityAndFairnessInitializationSignalKey).(InitializationSignal)
return signal, ok && signal != nil
}
// WatchInitialized sends a signal to priority and fairness dispatcher
// that a given watch request has already been initialized.
func WatchInitialized(ctx context.Context) {
if signal, ok := initializationSignalFrom(ctx); ok {
signal.Signal()
}
}
// RequestDelegated informs the priority and fairness dispatcher that
// a given request has been delegated to an aggregated API
// server. No-op when priority and fairness is disabled.
func RequestDelegated(ctx context.Context) {
// The watch initialization signal doesn't traverse request
// boundaries, so we generously fire it as soon as we know
// that the request won't be serviced locally. Safe to call
// for non-watch requests.
WatchInitialized(ctx)
}
// InitializationSignal is an interface that allows sending and handling
// initialization signals.
type InitializationSignal interface {
// Signal notifies the dispatcher about finished initialization.
Signal()
// Wait waits for the initialization signal.
Wait()
}
type initializationSignal struct {
once sync.Once
done chan struct{}
}
func NewInitializationSignal() InitializationSignal {
return &initializationSignal{
once: sync.Once{},
done: make(chan struct{}),
}
}
func (i *initializationSignal) Signal() {
i.once.Do(func() { close(i.done) })
}
func (i *initializationSignal) Wait() {
<-i.done
}
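
For orientation, a minimal usage sketch of the signal above (the goroutine split is illustrative, not taken from this commit): the dispatcher creates a signal, stores it in the request context, and blocks on Wait until the watch handler calls WatchInitialized.

package main

import (
	"context"
	"fmt"

	utilflowcontrol "k8s.io/apiserver/pkg/util/flowcontrol"
)

func main() {
	signal := utilflowcontrol.NewInitializationSignal()
	ctx := utilflowcontrol.WithInitializationSignal(context.Background(), signal)

	// Watch handler: signal once watch setup is done. Signal is
	// idempotent thanks to sync.Once, so double-firing is safe.
	go utilflowcontrol.WatchInitialized(ctx)

	// Dispatcher side: block until the watch reports readiness.
	signal.Wait()
	fmt.Println("watch initialized")
}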


@@ -21,6 +21,7 @@ import (
"crypto/sha256"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"math"
"math/rand"
@@ -28,14 +29,12 @@ import (
"sync"
"time"
"github.com/pkg/errors"
"github.com/google/go-cmp/cmp"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
apitypes "k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/clock"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
@@ -47,13 +46,15 @@ import (
fq "k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing"
fcfmt "k8s.io/apiserver/pkg/util/flowcontrol/format"
"k8s.io/apiserver/pkg/util/flowcontrol/metrics"
fcrequest "k8s.io/apiserver/pkg/util/flowcontrol/request"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
flowcontrol "k8s.io/api/flowcontrol/v1beta1"
flowcontrolclient "k8s.io/client-go/kubernetes/typed/flowcontrol/v1beta1"
flowcontrollister "k8s.io/client-go/listers/flowcontrol/v1beta1"
flowcontrol "k8s.io/api/flowcontrol/v1beta2"
flowcontrolclient "k8s.io/client-go/kubernetes/typed/flowcontrol/v1beta2"
flowcontrollister "k8s.io/client-go/listers/flowcontrol/v1beta2"
)
const timeFmt = "2006-01-02T15:04:05.999"
@@ -67,6 +68,14 @@ const timeFmt = "2006-01-02T15:04:05.999"
// undesired becomes completely unused, all the config objects are
// read and processed as a whole.
// The funcs in this package follow the naming convention that the suffix
// "Locked" means the relevant mutex must be locked at the start of each
// call and will be locked upon return. For a configController, the
// suffix "ReadLocked" stipulates a read lock while just "Locked"
// stipulates a full lock. Absence of either suffix means that either
// (a) the lock must NOT be held at call time and will not be held
// upon return or (b) locking is irrelevant.
// StartFunction begins the process of handling a request. If the
// request gets queued then this function uses the given hashValue as
// the source of entropy as it shuffle-shards the request into a
@@ -90,10 +99,11 @@ type RequestDigest struct {
// this type and cfgMeal follow the convention that the suffix
// "Locked" means that the caller must hold the configController lock.
type configController struct {
name string // varies in tests of fighting controllers
clock clock.PassiveClock
queueSetFactory fq.QueueSetFactory
obsPairGenerator metrics.TimedObserverPairGenerator
name string // varies in tests of fighting controllers
clock clock.PassiveClock
queueSetFactory fq.QueueSetFactory
reqsGaugeVec metrics.RatioedGaugeVec
execSeatsGaugeVec metrics.RatioedGaugeVec
// How this controller appears in an ObjectMeta ManagedFieldsEntry.Manager
asFieldManager string
@@ -113,7 +123,7 @@ type configController struct {
fsLister flowcontrollister.FlowSchemaLister
fsInformerSynced cache.InformerSynced
flowcontrolClient flowcontrolclient.FlowcontrolV1beta1Interface
flowcontrolClient flowcontrolclient.FlowcontrolV1beta2Interface
// serverConcurrencyLimit is the limit on the server's total
// number of non-exempt requests being served at once. This comes
@@ -123,10 +133,25 @@ type configController struct {
// requestWaitLimit comes from server configuration.
requestWaitLimit time.Duration
// This must be locked while accessing flowSchemas or
// priorityLevelStates. It is the lock involved in
// LockingWriteMultiple.
lock sync.Mutex
// watchTracker implements the necessary WatchTracker interface.
WatchTracker
// the most recent update attempts, ordered by increasing age.
// Consumer trims to keep only the last minute's worth of entries.
// The controller uses this to limit itself to at most six updates
// to a given FlowSchema in any minute.
// This may only be accessed from the one and only worker goroutine.
mostRecentUpdates []updateAttempt
// This must be locked while accessing the later fields.
// A lock for writing is needed
// for writing to any of the following:
// - the flowSchemas field
// - the slice held in the flowSchemas field
// - the priorityLevelStates field
// - the map held in the priorityLevelStates field
// - any field of a priorityLevelState held in that map
lock sync.RWMutex
// flowSchemas holds the flow schema objects, sorted by increasing
// numerical (decreasing logical) matching precedence. Every
@@ -137,13 +162,6 @@ type configController struct {
// name to the state for that level. Every name referenced from a
// member of `flowSchemas` has an entry here.
priorityLevelStates map[string]*priorityLevelState
// the most recent update attempts, ordered by increasing age.
// Consumer trims to keep only the last minute's worth of entries.
// The controller uses this to limit itself to at most six updates
// to a given FlowSchema in any minute.
// This may only be accessed from the one and only worker goroutine.
mostRecentUpdates []updateAttempt
}
type updateAttempt struct {
@@ -174,8 +192,11 @@ type priorityLevelState struct {
// returned StartFunction
numPending int
// Observers tracking number waiting, executing
obsPair metrics.TimedObserverPair
// Observers tracking number of requests waiting, executing
reqsGaugePair metrics.RatioedGaugePair
// Observer of number of seats occupied throughout execution
execSeatsObs metrics.RatioedGauge
}
// NewTestableController is extra flexible to facilitate testing
@@ -184,13 +205,15 @@ func newTestableController(config TestableConfig) *configController {
name: config.Name,
clock: config.Clock,
queueSetFactory: config.QueueSetFactory,
obsPairGenerator: config.ObsPairGenerator,
reqsGaugeVec: config.ReqsGaugeVec,
execSeatsGaugeVec: config.ExecSeatsGaugeVec,
asFieldManager: config.AsFieldManager,
foundToDangling: config.FoundToDangling,
serverConcurrencyLimit: config.ServerConcurrencyLimit,
requestWaitLimit: config.RequestWaitLimit,
flowcontrolClient: config.FlowcontrolClient,
priorityLevelStates: make(map[string]*priorityLevelState),
WatchTracker: NewWatchTracker(),
}
klog.V(2).Infof("NewTestableController %q with serverConcurrencyLimit=%d, requestWaitLimit=%s, name=%s, asFieldManager=%q", cfgCtlr.name, cfgCtlr.serverConcurrencyLimit, cfgCtlr.requestWaitLimit, cfgCtlr.name, cfgCtlr.asFieldManager)
// Start with longish delay because conflicts will be between
@@ -198,7 +221,7 @@ func newTestableController(config TestableConfig) *configController {
cfgCtlr.configQueue = workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(200*time.Millisecond, 8*time.Hour), "priority_and_fairness_config_queue")
// ensure the data structure reflects the mandatory config
cfgCtlr.lockAndDigestConfigObjects(nil, nil)
fci := config.InformerFactory.Flowcontrol().V1beta1()
fci := config.InformerFactory.Flowcontrol().V1beta2()
pli := fci.PriorityLevelConfigurations()
fsi := fci.FlowSchemas()
cfgCtlr.plLister = pli.Lister()
@@ -268,23 +291,6 @@ func newTestableController(config TestableConfig) *configController {
return cfgCtlr
}
// MaintainObservations keeps the observers from
// metrics.PriorityLevelConcurrencyObserverPairGenerator from falling
// too far behind
func (cfgCtlr *configController) MaintainObservations(stopCh <-chan struct{}) {
wait.Until(cfgCtlr.updateObservations, 10*time.Second, stopCh)
}
func (cfgCtlr *configController) updateObservations() {
cfgCtlr.lock.Lock()
defer cfgCtlr.lock.Unlock()
for _, plc := range cfgCtlr.priorityLevelStates {
if plc.queues != nil {
plc.queues.UpdateObservations()
}
}
}
func (cfgCtlr *configController) Run(stopCh <-chan struct{}) error {
defer utilruntime.HandleCrash()
@@ -322,7 +328,7 @@ func (cfgCtlr *configController) processNextWorkItem() bool {
func(obj interface{}) {
defer cfgCtlr.configQueue.Done(obj)
specificDelay, err := cfgCtlr.syncOne(map[string]string{})
specificDelay, err := cfgCtlr.syncOne()
switch {
case err != nil:
klog.Error(err)
@@ -341,7 +347,7 @@ func (cfgCtlr *configController) processNextWorkItem() bool {
// objects that configure API Priority and Fairness and updates the
// local configController accordingly.
// Only invoke this in the one and only worker goroutine
func (cfgCtlr *configController) syncOne(flowSchemaRVs map[string]string) (specificDelay time.Duration, err error) {
func (cfgCtlr *configController) syncOne() (specificDelay time.Duration, err error) {
klog.V(5).Infof("%s syncOne at %s", cfgCtlr.name, cfgCtlr.clock.Now().Format(timeFmt))
all := labels.Everything()
newPLs, err := cfgCtlr.plLister.List(all)
@@ -352,7 +358,7 @@ func (cfgCtlr *configController) syncOne(flowSchemaRVs map[string]string) (speci
if err != nil {
return 0, fmt.Errorf("unable to list FlowSchema objects: %w", err)
}
return cfgCtlr.digestConfigObjects(newPLs, newFSs, flowSchemaRVs)
return cfgCtlr.digestConfigObjects(newPLs, newFSs)
}
// cfgMeal is the data involved in the process of digesting the API
@@ -381,6 +387,8 @@ type cfgMeal struct {
// provoking a call into this controller while the lock held
// waiting on that request to complete.
fsStatusUpdates []fsStatusUpdate
maxWaitingRequests, maxExecutingRequests int
}
// A buffered set of status updates for FlowSchemas
@@ -393,7 +401,7 @@ type fsStatusUpdate struct {
// digestConfigObjects is given all the API objects that configure
// cfgCtlr and writes its consequent new configState.
// Only invoke this in the one and only worker goroutine
func (cfgCtlr *configController) digestConfigObjects(newPLs []*flowcontrol.PriorityLevelConfiguration, newFSs []*flowcontrol.FlowSchema, flowSchemaRVs map[string]string) (time.Duration, error) {
func (cfgCtlr *configController) digestConfigObjects(newPLs []*flowcontrol.PriorityLevelConfiguration, newFSs []*flowcontrol.FlowSchema) (time.Duration, error) {
fsStatusUpdates := cfgCtlr.lockAndDigestConfigObjects(newPLs, newFSs)
var errs []error
currResult := updateAttempt{
@@ -412,26 +420,26 @@ func (cfgCtlr *configController) digestConfigObjects(newPLs []*flowcontrol.Prior
// if we are going to issue an update, be sure we track every name we update so we know if we update it too often.
currResult.updatedItems.Insert(fsu.flowSchema.Name)
enc, err := json.Marshal(fsu.condition)
patchBytes, err := makeFlowSchemaConditionPatch(fsu.condition)
if err != nil {
// should never happen because these conditions are created here and well formed
panic(fmt.Sprintf("Failed to json.Marshall(%#+v): %s", fsu.condition, err.Error()))
}
klog.V(4).Infof("%s writing Condition %s to FlowSchema %s, which had ResourceVersion=%s, because its previous value was %s", cfgCtlr.name, string(enc), fsu.flowSchema.Name, fsu.flowSchema.ResourceVersion, fcfmt.Fmt(fsu.oldValue))
if klogV := klog.V(4); klogV.Enabled() {
klogV.Infof("%s writing Condition %s to FlowSchema %s, which had ResourceVersion=%s, because its previous value was %s, diff: %s",
cfgCtlr.name, fsu.condition, fsu.flowSchema.Name, fsu.flowSchema.ResourceVersion, fcfmt.Fmt(fsu.oldValue), cmp.Diff(fsu.oldValue, fsu.condition))
}
fsIfc := cfgCtlr.flowcontrolClient.FlowSchemas()
patchBytes := []byte(fmt.Sprintf(`{"status": {"conditions": [ %s ] } }`, string(enc)))
patchOptions := metav1.PatchOptions{FieldManager: cfgCtlr.asFieldManager}
patchedFlowSchema, err := fsIfc.Patch(context.TODO(), fsu.flowSchema.Name, apitypes.StrategicMergePatchType, patchBytes, patchOptions, "status")
if err == nil {
key, _ := cache.MetaNamespaceKeyFunc(patchedFlowSchema)
flowSchemaRVs[key] = patchedFlowSchema.ResourceVersion
} else if apierrors.IsNotFound(err) {
// This object has been deleted. A notification is coming
// and nothing more needs to be done here.
klog.V(5).Infof("%s at %s: attempted update of concurrently deleted FlowSchema %s; nothing more needs to be done", cfgCtlr.name, cfgCtlr.clock.Now().Format(timeFmt), fsu.flowSchema.Name)
} else {
errs = append(errs, errors.Wrap(err, fmt.Sprintf("failed to set a status.condition for FlowSchema %s", fsu.flowSchema.Name)))
_, err = fsIfc.Patch(context.TODO(), fsu.flowSchema.Name, apitypes.StrategicMergePatchType, patchBytes, patchOptions, "status")
if err != nil {
if apierrors.IsNotFound(err) {
// This object has been deleted. A notification is coming
// and nothing more needs to be done here.
klog.V(5).Infof("%s at %s: attempted update of concurrently deleted FlowSchema %s; nothing more needs to be done", cfgCtlr.name, cfgCtlr.clock.Now().Format(timeFmt), fsu.flowSchema.Name)
} else {
errs = append(errs, fmt.Errorf("failed to set a status.condition for FlowSchema %s: %w", fsu.flowSchema.Name, err))
}
}
}
cfgCtlr.addUpdateResult(currResult)
@@ -439,6 +447,20 @@ func (cfgCtlr *configController) digestConfigObjects(newPLs []*flowcontrol.Prior
return suggestedDelay, utilerrors.NewAggregate(errs)
}
// makeFlowSchemaConditionPatch takes in a condition and returns the patch status as a json.
func makeFlowSchemaConditionPatch(condition flowcontrol.FlowSchemaCondition) ([]byte, error) {
o := struct {
Status flowcontrol.FlowSchemaStatus `json:"status"`
}{
Status: flowcontrol.FlowSchemaStatus{
Conditions: []flowcontrol.FlowSchemaCondition{
condition,
},
},
}
return json.Marshal(o)
}
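To see the shape of the patch this produces, here is a standalone sketch using the v1beta2 types imported above; the condition values are invented for illustration:

package main

import (
	"encoding/json"
	"fmt"

	flowcontrol "k8s.io/api/flowcontrol/v1beta2"
)

func main() {
	cond := flowcontrol.FlowSchemaCondition{
		Type:   flowcontrol.FlowSchemaConditionDangling,
		Status: flowcontrol.ConditionTrue,
	}
	o := struct {
		Status flowcontrol.FlowSchemaStatus `json:"status"`
	}{Status: flowcontrol.FlowSchemaStatus{Conditions: []flowcontrol.FlowSchemaCondition{cond}}}
	patch, err := json.Marshal(o)
	if err != nil {
		panic(err)
	}
	// Prints roughly:
	// {"status":{"conditions":[{"type":"Dangling","status":"True","lastTransitionTime":null}]}}
	fmt.Println(string(patch))
}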
// shouldDelayUpdate checks to see if a flowschema has been updated too often and returns true if a delay is needed.
// Only invoke this in the one and only worker goroutine
func (cfgCtlr *configController) shouldDelayUpdate(flowSchemaName string) bool {
@@ -491,7 +513,13 @@ func (cfgCtlr *configController) lockAndDigestConfigObjects(newPLs []*flowcontro
// The new config has been constructed
cfgCtlr.priorityLevelStates = meal.newPLStates
klog.V(5).Infof("Switched to new API Priority and Fairness configuration")
klog.V(5).InfoS("Switched to new API Priority and Fairness configuration", "maxWaitingRequests", meal.maxWaitingRequests, "maxExecutinRequests", meal.maxExecutingRequests)
metrics.GetWaitingReadonlyConcurrency().SetDenominator(float64(meal.maxWaitingRequests))
metrics.GetWaitingMutatingConcurrency().SetDenominator(float64(meal.maxWaitingRequests))
metrics.GetExecutingReadonlyConcurrency().SetDenominator(float64(meal.maxExecutingRequests))
metrics.GetExecutingMutatingConcurrency().SetDenominator(float64(meal.maxExecutingRequests))
return meal.fsStatusUpdates
}
@@ -501,9 +529,10 @@ func (meal *cfgMeal) digestNewPLsLocked(newPLs []*flowcontrol.PriorityLevelConfi
for _, pl := range newPLs {
state := meal.cfgCtlr.priorityLevelStates[pl.Name]
if state == nil {
state = &priorityLevelState{obsPair: meal.cfgCtlr.obsPairGenerator.Generate(1, 1, []string{pl.Name})}
labelValues := []string{pl.Name}
state = &priorityLevelState{reqsGaugePair: metrics.RatioedGaugeVecPhasedElementPair(meal.cfgCtlr.reqsGaugeVec, 1, 1, labelValues), execSeatsObs: meal.cfgCtlr.execSeatsGaugeVec.NewForLabelValuesSafe(0, 1, labelValues)}
}
qsCompleter, err := queueSetCompleterForPL(meal.cfgCtlr.queueSetFactory, state.queues, pl, meal.cfgCtlr.requestWaitLimit, state.obsPair)
qsCompleter, err := queueSetCompleterForPL(meal.cfgCtlr.queueSetFactory, state.queues, pl, meal.cfgCtlr.requestWaitLimit, state.reqsGaugePair, state.execSeatsObs)
if err != nil {
klog.Warningf("Ignoring PriorityLevelConfiguration object %s because its spec (%s) is broken: %s", pl.Name, fcfmt.Fmt(pl.Spec), err)
continue
@@ -570,9 +599,10 @@ func (meal *cfgMeal) digestFlowSchemasLocked(newFSs []*flowcontrol.FlowSchema) {
}
meal.cfgCtlr.flowSchemas = fsSeq
if klog.V(5).Enabled() {
klogV := klog.V(5)
if klogV.Enabled() {
for _, fs := range fsSeq {
klog.Infof("Using FlowSchema %s", fcfmt.Fmt(fs))
klogV.Infof("Using FlowSchema %s", fcfmt.Fmt(fs))
}
}
}
@@ -606,7 +636,7 @@ func (meal *cfgMeal) processOldPLsLocked() {
}
}
var err error
plState.qsCompleter, err = queueSetCompleterForPL(meal.cfgCtlr.queueSetFactory, plState.queues, plState.pl, meal.cfgCtlr.requestWaitLimit, plState.obsPair)
plState.qsCompleter, err = queueSetCompleterForPL(meal.cfgCtlr.queueSetFactory, plState.queues, plState.pl, meal.cfgCtlr.requestWaitLimit, plState.reqsGaugePair, plState.execSeatsObs)
if err != nil {
// This can not happen because queueSetCompleterForPL already approved this config
panic(fmt.Sprintf("%s from name=%q spec=%s", err, plName, fcfmt.Fmt(plState.pl.Spec)))
@@ -641,6 +671,12 @@ func (meal *cfgMeal) finishQueueSetReconfigsLocked() {
// difference will be negligible.
concurrencyLimit := int(math.Ceil(float64(meal.cfgCtlr.serverConcurrencyLimit) * float64(plState.pl.Spec.Limited.AssuredConcurrencyShares) / meal.shareSum))
metrics.UpdateSharedConcurrencyLimit(plName, concurrencyLimit)
meal.maxExecutingRequests += concurrencyLimit
var waitLimit int
if qCfg := plState.pl.Spec.Limited.LimitResponse.Queuing; qCfg != nil {
waitLimit = int(qCfg.Queues * qCfg.QueueLengthLimit)
}
meal.maxWaitingRequests += waitLimit
if plState.queues == nil {
klog.V(5).Infof("Introducing queues for priority level %q: config=%s, concurrencyLimit=%d, quiescing=%v (shares=%v, shareSum=%v)", plName, fcfmt.Fmt(plState.pl.Spec), concurrencyLimit, plState.quiescing, plState.pl.Spec.Limited.AssuredConcurrencyShares, meal.shareSum)
@@ -655,7 +691,7 @@ func (meal *cfgMeal) finishQueueSetReconfigsLocked() {
// given priority level configuration. Returns nil if that config
// does not call for limiting. Returns nil and an error if the given
// object is malformed in a way that is a problem for this package.
func queueSetCompleterForPL(qsf fq.QueueSetFactory, queues fq.QueueSet, pl *flowcontrol.PriorityLevelConfiguration, requestWaitLimit time.Duration, intPair metrics.TimedObserverPair) (fq.QueueSetCompleter, error) {
func queueSetCompleterForPL(qsf fq.QueueSetFactory, queues fq.QueueSet, pl *flowcontrol.PriorityLevelConfiguration, requestWaitLimit time.Duration, reqsIntPair metrics.RatioedGaugePair, execSeatsObs metrics.RatioedGauge) (fq.QueueSetCompleter, error) {
if (pl.Spec.Type == flowcontrol.PriorityLevelEnablementExempt) != (pl.Spec.Limited == nil) {
return nil, errors.New("broken union structure at the top")
}
@@ -684,10 +720,10 @@ func queueSetCompleterForPL(qsf fq.QueueSetFactory, queues fq.QueueSet, pl *flow
if queues != nil {
qsc, err = queues.BeginConfigChange(qcQS)
} else {
qsc, err = qsf.BeginConstruction(qcQS, intPair)
qsc, err = qsf.BeginConstruction(qcQS, reqsIntPair, execSeatsObs)
}
if err != nil {
err = errors.Wrap(err, fmt.Sprintf("priority level %q has QueuingConfiguration %#+v, which is invalid", pl.Name, qcAPI))
err = fmt.Errorf("priority level %q has QueuingConfiguration %#+v, which is invalid: %w", pl.Name, qcAPI, err)
}
return qsc, err
}
@@ -729,17 +765,20 @@ func (meal *cfgMeal) presyncFlowSchemaStatus(fs *flowcontrol.FlowSchema, isDangl
// that does not actually exist (right now) as a real API object.
func (meal *cfgMeal) imaginePL(proto *flowcontrol.PriorityLevelConfiguration, requestWaitLimit time.Duration) {
klog.V(3).Infof("No %s PriorityLevelConfiguration found, imagining one", proto.Name)
obsPair := meal.cfgCtlr.obsPairGenerator.Generate(1, 1, []string{proto.Name})
qsCompleter, err := queueSetCompleterForPL(meal.cfgCtlr.queueSetFactory, nil, proto, requestWaitLimit, obsPair)
labelValues := []string{proto.Name}
reqsGaugePair := metrics.RatioedGaugeVecPhasedElementPair(meal.cfgCtlr.reqsGaugeVec, 1, 1, labelValues)
execSeatsObs := meal.cfgCtlr.execSeatsGaugeVec.NewForLabelValuesSafe(0, 1, labelValues)
qsCompleter, err := queueSetCompleterForPL(meal.cfgCtlr.queueSetFactory, nil, proto, requestWaitLimit, reqsGaugePair, execSeatsObs)
if err != nil {
// This can not happen because proto is one of the mandatory
// objects and these are not erroneous
panic(err)
}
meal.newPLStates[proto.Name] = &priorityLevelState{
pl: proto,
qsCompleter: qsCompleter,
obsPair: obsPair,
pl: proto,
qsCompleter: qsCompleter,
reqsGaugePair: reqsGaugePair,
execSeatsObs: execSeatsObs,
}
if proto.Spec.Limited != nil {
meal.shareSum += float64(proto.Spec.Limited.AssuredConcurrencyShares)
@@ -758,10 +797,13 @@ func (immediateRequest) Finish(execute func()) bool {
// The returned bool indicates whether the request is exempt from
// limitation. The startWaitingTime is when the request started
// waiting in its queue, or `Time{}` if this did not happen.
func (cfgCtlr *configController) startRequest(ctx context.Context, rd RequestDigest, queueNoteFn fq.QueueNoteFn) (fs *flowcontrol.FlowSchema, pl *flowcontrol.PriorityLevelConfiguration, isExempt bool, req fq.Request, startWaitingTime time.Time) {
func (cfgCtlr *configController) startRequest(ctx context.Context, rd RequestDigest,
noteFn func(fs *flowcontrol.FlowSchema, pl *flowcontrol.PriorityLevelConfiguration, flowDistinguisher string),
workEstimator func() fcrequest.WorkEstimate,
queueNoteFn fq.QueueNoteFn) (fs *flowcontrol.FlowSchema, pl *flowcontrol.PriorityLevelConfiguration, isExempt bool, req fq.Request, startWaitingTime time.Time) {
klog.V(7).Infof("startRequest(%#+v)", rd)
cfgCtlr.lock.Lock()
defer cfgCtlr.lock.Unlock()
cfgCtlr.lock.RLock()
defer cfgCtlr.lock.RUnlock()
var selectedFlowSchema, catchAllFlowSchema *flowcontrol.FlowSchema
for _, fs := range cfgCtlr.flowSchemas {
if matchesFlowSchema(rd, fs) {
@@ -789,6 +831,7 @@ func (cfgCtlr *configController) startRequest(ctx context.Context, rd RequestDig
plName := selectedFlowSchema.Spec.PriorityLevelConfiguration.Name
plState := cfgCtlr.priorityLevelStates[plName]
if plState.pl.Spec.Type == flowcontrol.PriorityLevelEnablementExempt {
noteFn(selectedFlowSchema, plState.pl, "")
klog.V(7).Infof("startRequest(%#+v) => fsName=%q, distMethod=%#+v, plName=%q, immediate", rd, selectedFlowSchema.Name, selectedFlowSchema.Spec.DistinguisherMethod, plName)
return selectedFlowSchema, plState.pl, true, immediateRequest{}, time.Time{}
}
@@ -802,11 +845,15 @@ func (cfgCtlr *configController) startRequest(ctx context.Context, rd RequestDig
flowDistinguisher = computeFlowDistinguisher(rd, selectedFlowSchema.Spec.DistinguisherMethod)
hashValue = hashFlowID(selectedFlowSchema.Name, flowDistinguisher)
}
noteFn(selectedFlowSchema, plState.pl, flowDistinguisher)
workEstimate := workEstimator()
startWaitingTime = time.Now()
klog.V(7).Infof("startRequest(%#+v) => fsName=%q, distMethod=%#+v, plName=%q, numQueues=%d", rd, selectedFlowSchema.Name, selectedFlowSchema.Spec.DistinguisherMethod, plName, numQueues)
req, idle := plState.queues.StartRequest(ctx, hashValue, flowDistinguisher, selectedFlowSchema.Name, rd.RequestInfo, rd.User, queueNoteFn)
req, idle := plState.queues.StartRequest(ctx, &workEstimate, hashValue, flowDistinguisher, selectedFlowSchema.Name, rd.RequestInfo, rd.User, queueNoteFn)
if idle {
cfgCtlr.maybeReapLocked(plName, plState)
cfgCtlr.maybeReapReadLocked(plName, plState)
}
return selectedFlowSchema, plState.pl, false, req, startWaitingTime
}
@@ -815,8 +862,8 @@ func (cfgCtlr *configController) startRequest(ctx context.Context, rd RequestDig
// priority level if it has no more use. Call this after getting a
// clue that the given priority level is undesired and idle.
func (cfgCtlr *configController) maybeReap(plName string) {
cfgCtlr.lock.Lock()
defer cfgCtlr.lock.Unlock()
cfgCtlr.lock.RLock()
defer cfgCtlr.lock.RUnlock()
plState := cfgCtlr.priorityLevelStates[plName]
if plState == nil {
klog.V(7).Infof("plName=%s, plState==nil", plName)
@@ -838,7 +885,7 @@ func (cfgCtlr *configController) maybeReap(plName string) {
// it has no more use. Call this if both (1) plState.queues is
// non-nil and reported being idle, and (2) cfgCtlr's lock has not
// been released since then.
func (cfgCtlr *configController) maybeReapLocked(plName string, plState *priorityLevelState) {
func (cfgCtlr *configController) maybeReapReadLocked(plName string, plState *priorityLevelState) {
if !(plState.quiescing && plState.numPending == 0) {
return
}


@@ -102,7 +102,11 @@ func (cfgCtlr *configController) dumpQueues(w http.ResponseWriter, r *http.Reque
"Index", // 2
"PendingRequests", // 3
"ExecutingRequests", // 4
"VirtualStart", // 5
"SeatsInUse", // 5
"NextDispatchR", // 6
"InitialSeatsSum", // 7
"MaxSeatsSum", // 8
"TotalWorkSum", // 9
}
tabPrint(tabWriter, rowForHeaders(columnHeaders))
endLine(tabWriter)
@@ -114,18 +118,26 @@ func (cfgCtlr *configController) dumpQueues(w http.ResponseWriter, r *http.Reque
"<none>", // 3
"<none>", // 4
"<none>", // 5
"<none>", // 6
"<none>", // 7
"<none>", // 8
"<none>", // 9
))
endLine(tabWriter)
continue
}
queueSetDigest := plState.queues.Dump(false)
for i, q := range queueSetDigest.Queues {
tabPrint(tabWriter, rowForQueue(
plState.pl.Name, // 1
i, // 2
len(q.Requests), // 3
q.ExecutingRequests, // 4
q.VirtualStart, // 5
tabPrint(tabWriter, row(
plState.pl.Name, // 1 - "PriorityLevelName"
strconv.Itoa(i), // 2 - "Index"
strconv.Itoa(len(q.Requests)), // 3 - "PendingRequests"
strconv.Itoa(q.ExecutingRequests), // 4 - "ExecutingRequests"
strconv.Itoa(q.SeatsInUse), // 5 - "SeatsInUse"
q.NextDispatchR, // 6 - "NextDispatchR"
strconv.Itoa(q.QueueSum.InitialSeatsSum), // 7 - "InitialSeatsSum"
strconv.Itoa(q.QueueSum.MaxSeatsSum), // 8 - "MaxSeatsSum"
q.QueueSum.TotalWorkSum, // 9 - "TotalWorkSum"
))
endLine(tabWriter)
}
@@ -147,18 +159,21 @@ func (cfgCtlr *configController) dumpRequests(w http.ResponseWriter, r *http.Req
"RequestIndexInQueue", // 4
"FlowDistingsher", // 5
"ArriveTime", // 6
"InitialSeats", // 7
"FinalSeats", // 8
"AdditionalLatency", // 9
}))
if includeRequestDetails {
continueLine(tabWriter)
tabPrint(tabWriter, rowForHeaders([]string{
"UserName", // 7
"Verb", // 8
"APIPath", // 9
"Namespace", // 10
"Name", // 11
"APIVersion", // 12
"Resource", // 13
"SubResource", // 14
"UserName", // 10
"Verb", // 11
"APIPath", // 12
"Namespace", // 13
"Name", // 14
"APIVersion", // 15
"Resource", // 16
"SubResource", // 17
}))
}
endLine(tabWriter)
@@ -169,28 +184,31 @@ func (cfgCtlr *configController) dumpRequests(w http.ResponseWriter, r *http.Req
queueSetDigest := plState.queues.Dump(includeRequestDetails)
for iq, q := range queueSetDigest.Queues {
for ir, r := range q.Requests {
tabPrint(tabWriter, rowForRequest(
tabPrint(tabWriter, row(
plState.pl.Name, // 1
r.MatchedFlowSchema, // 2
iq, // 3
ir, // 4
strconv.Itoa(iq), // 3
strconv.Itoa(ir), // 4
r.FlowDistinguisher, // 5
r.ArriveTime, // 6
r.ArriveTime.UTC().Format(time.RFC3339Nano), // 6
strconv.Itoa(int(r.WorkEstimate.InitialSeats)), // 7
strconv.Itoa(int(r.WorkEstimate.FinalSeats)), // 8
r.WorkEstimate.AdditionalLatency.String(), // 9
))
if includeRequestDetails {
continueLine(tabWriter)
tabPrint(tabWriter, rowForRequestDetails(
r.UserName, // 7
r.RequestInfo.Verb, // 8
r.RequestInfo.Path, // 9
r.RequestInfo.Namespace, // 10
r.RequestInfo.Name, // 11
r.UserName, // 10
r.RequestInfo.Verb, // 11
r.RequestInfo.Path, // 12
r.RequestInfo.Namespace, // 13
r.RequestInfo.Name, // 14
schema.GroupVersion{
Group: r.RequestInfo.APIGroup,
Version: r.RequestInfo.APIVersion,
}.String(), // 12
r.RequestInfo.Resource, // 13
r.RequestInfo.Subresource, // 14
}.String(), // 15
r.RequestInfo.Resource, // 16
r.RequestInfo.Subresource, // 17
))
}
endLine(tabWriter)
@@ -229,27 +247,6 @@ func rowForPriorityLevel(plName string, activeQueues int, isIdle, isQuiescing bo
)
}
func rowForQueue(plName string, index, waitingRequests, executingRequests int, virtualStart float64) string {
return row(
plName,
strconv.Itoa(index),
strconv.Itoa(waitingRequests),
strconv.Itoa(executingRequests),
fmt.Sprintf("%.4f", virtualStart),
)
}
func rowForRequest(plName, fsName string, queueIndex, requestIndex int, flowDistinguisher string, arriveTime time.Time) string {
return row(
plName,
fsName,
strconv.Itoa(queueIndex),
strconv.Itoa(requestIndex),
flowDistinguisher,
arriveTime.UTC().Format(time.RFC3339Nano),
)
}
func rowForRequestDetails(username, verb, path, namespace, name, apiVersion, resource, subResource string) string {
return row(
username,


@@ -21,17 +21,19 @@ import (
"strconv"
"time"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/apiserver/pkg/server/httplog"
"k8s.io/apiserver/pkg/server/mux"
"k8s.io/apiserver/pkg/util/flowcontrol/counter"
fq "k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing"
"k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing/eventclock"
fqs "k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing/queueset"
"k8s.io/apiserver/pkg/util/flowcontrol/metrics"
fcrequest "k8s.io/apiserver/pkg/util/flowcontrol/request"
kubeinformers "k8s.io/client-go/informers"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
flowcontrol "k8s.io/api/flowcontrol/v1beta1"
flowcontrolclient "k8s.io/client-go/kubernetes/typed/flowcontrol/v1beta1"
flowcontrol "k8s.io/api/flowcontrol/v1beta2"
flowcontrolclient "k8s.io/client-go/kubernetes/typed/flowcontrol/v1beta2"
)
// ConfigConsumerAsFieldManager is how the config consuming
@@ -42,23 +44,28 @@ const ConfigConsumerAsFieldManager = "api-priority-and-fairness-config-consumer-
type Interface interface {
// Handle takes care of queuing and dispatching a request
// characterized by the given digest. The given `noteFn` will be
// invoked with the results of request classification. If the
// request is queued then `queueNoteFn` will be called twice,
// invoked with the results of request classification.
// The given `workEstimator` is called, if at all, after noteFn.
// `workEstimator` will be invoked only when the request
// is classified as non 'exempt'.
// 'workEstimator', when invoked, must return the
// work parameters for the request.
// If the request is queued then `queueNoteFn` will be called twice,
// first with `true` and then with `false`; otherwise
// `queueNoteFn` will not be called at all. If Handle decides
// that the request should be executed then `execute()` will be
// invoked once to execute the request; otherwise `execute()` will
// not be invoked.
// Handle() should never return while execute() is running, even if
// ctx is cancelled or times out.
Handle(ctx context.Context,
requestDigest RequestDigest,
noteFn func(fs *flowcontrol.FlowSchema, pl *flowcontrol.PriorityLevelConfiguration),
noteFn func(fs *flowcontrol.FlowSchema, pl *flowcontrol.PriorityLevelConfiguration, flowDistinguisher string),
workEstimator func() fcrequest.WorkEstimate,
queueNoteFn fq.QueueNoteFn,
execFn func(),
)
// MaintainObservations is a helper for maintaining statistics.
MaintainObservations(stopCh <-chan struct{})
// Run monitors config objects from the main apiservers and causes
// any needed changes to local behavior. This method ceases
// activity and returns after the given channel is closed.
@@ -66,6 +73,9 @@ type Interface interface {
// Install installs debugging endpoints to the web-server.
Install(c *mux.PathRecorderMux)
// WatchTracker provides the WatchTracker interface.
WatchTracker
}
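
A caller-side sketch of the widened Handle signature; the wrapper function, digest, and estimate below are hypothetical, and only the parameter order and callback semantics come from the interface comment above.

package apfexample

import (
	"context"

	flowcontrolv1beta2 "k8s.io/api/flowcontrol/v1beta2"
	utilflowcontrol "k8s.io/apiserver/pkg/util/flowcontrol"
	fcrequest "k8s.io/apiserver/pkg/util/flowcontrol/request"
)

// serveWithAPF shows the call shape only.
func serveWithAPF(ctx context.Context, apf utilflowcontrol.Interface, digest utilflowcontrol.RequestDigest, serve func()) {
	apf.Handle(ctx, digest,
		func(fs *flowcontrolv1beta2.FlowSchema, pl *flowcontrolv1beta2.PriorityLevelConfiguration, flowDistinguisher string) {
			// noteFn: receives the classification result first.
		},
		func() fcrequest.WorkEstimate {
			// workEstimator: called only for non-exempt requests, after noteFn.
			return fcrequest.WorkEstimate{InitialSeats: 1}
		},
		func(inQueue bool) {
			// queueNoteFn: true on entering a queue, false on leaving it.
		},
		serve, // execFn: invoked at most once to execute the request.
	)
}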
// This request filter implements https://github.com/kubernetes/enhancements/blob/master/keps/sig-api-machinery/1040-priority-and-fairness/README.md
@@ -73,12 +83,11 @@ type Interface interface {
// New creates a new instance to implement API priority and fairness
func New(
informerFactory kubeinformers.SharedInformerFactory,
flowcontrolClient flowcontrolclient.FlowcontrolV1beta1Interface,
flowcontrolClient flowcontrolclient.FlowcontrolV1beta2Interface,
serverConcurrencyLimit int,
requestWaitLimit time.Duration,
) Interface {
grc := counter.NoOp{}
clk := clock.RealClock{}
clk := eventclock.Real{}
return NewTestable(TestableConfig{
Name: "Controller",
Clock: clk,
@@ -88,8 +97,9 @@ func New(
FlowcontrolClient: flowcontrolClient,
ServerConcurrencyLimit: serverConcurrencyLimit,
RequestWaitLimit: requestWaitLimit,
ObsPairGenerator: metrics.PriorityLevelConcurrencyObserverPairGenerator,
QueueSetFactory: fqs.NewQueueSetFactory(clk, grc),
ReqsGaugeVec: metrics.PriorityLevelConcurrencyGaugeVec,
ExecSeatsGaugeVec: metrics.PriorityLevelExecutionSeatsGaugeVec,
QueueSetFactory: fqs.NewQueueSetFactory(clk),
})
}
@@ -119,7 +129,7 @@ type TestableConfig struct {
InformerFactory kubeinformers.SharedInformerFactory
// FlowcontrolClient to use for manipulating config objects
FlowcontrolClient flowcontrolclient.FlowcontrolV1beta1Interface
FlowcontrolClient flowcontrolclient.FlowcontrolV1beta2Interface
// ServerConcurrencyLimit for the controller to enforce
ServerConcurrencyLimit int
@@ -127,8 +137,11 @@ type TestableConfig struct {
// RequestWaitLimit configured on the server
RequestWaitLimit time.Duration
// ObsPairGenerator for metrics
ObsPairGenerator metrics.TimedObserverPairGenerator
// GaugeVec for metrics about requests, broken down by phase and priority_level
ReqsGaugeVec metrics.RatioedGaugeVec
// RatioedGaugePairVec for metrics about seats occupied by all phases of execution
ExecSeatsGaugeVec metrics.RatioedGaugeVec
// QueueSetFactory for the queuing implementation
QueueSetFactory fq.QueueSetFactory
@@ -140,12 +153,12 @@ func NewTestable(config TestableConfig) Interface {
}
func (cfgCtlr *configController) Handle(ctx context.Context, requestDigest RequestDigest,
noteFn func(fs *flowcontrol.FlowSchema, pl *flowcontrol.PriorityLevelConfiguration),
noteFn func(fs *flowcontrol.FlowSchema, pl *flowcontrol.PriorityLevelConfiguration, flowDistinguisher string),
workEstimator func() fcrequest.WorkEstimate,
queueNoteFn fq.QueueNoteFn,
execFn func()) {
fs, pl, isExempt, req, startWaitingTime := cfgCtlr.startRequest(ctx, requestDigest, queueNoteFn)
fs, pl, isExempt, req, startWaitingTime := cfgCtlr.startRequest(ctx, requestDigest, noteFn, workEstimator, queueNoteFn)
queued := startWaitingTime != time.Time{}
noteFn(fs, pl)
if req == nil {
if queued {
metrics.ObserveWaitingDuration(ctx, pl.Name, fs.Name, strconv.FormatBool(req != nil), time.Since(startWaitingTime))
@@ -171,7 +184,9 @@ func (cfgCtlr *configController) Handle(ctx context.Context, requestDigest Reque
executed = true
startExecutionTime := time.Now()
defer func() {
metrics.ObserveExecutionDuration(ctx, pl.Name, fs.Name, time.Since(startExecutionTime))
executionTime := time.Since(startExecutionTime)
httplog.AddKeyValue(ctx, "apf_execution_time", executionTime)
metrics.ObserveExecutionDuration(ctx, pl.Name, fs.Name, executionTime)
}()
execFn()
})


@@ -1,33 +0,0 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package counter
// GoRoutineCounter keeps track of the number of active goroutines
// working on/for something. This is a utility that makes such code more
// testable. The code uses this utility to report the number of active
// goroutines to the test code, so that the test code can advance a fake
// clock when and only when the code being tested has finished all
// the work that is ready to do at the present time.
type GoRoutineCounter interface {
// Add adds the given delta to the count of active goroutines.
// Call Add(1) before forking a goroutine, Add(-1) at the end of that goroutine.
// Call Add(-1) just before waiting on something from another goroutine (e.g.,
// just before a `select`).
// Call Add(1) just before doing something that unblocks a goroutine that is
// waiting on that something.
Add(delta int)
}


@@ -20,20 +20,30 @@ import (
"time"
"k8s.io/apiserver/pkg/endpoints/request"
flowcontrolrequest "k8s.io/apiserver/pkg/util/flowcontrol/request"
)
// QueueSetDump is an instant dump of a queue-set.
type QueueSetDump struct {
Queues []QueueDump
Waiting int
Executing int
Queues []QueueDump
Waiting int
Executing int
SeatsInUse int
}
// QueueDump is an instant dump of one queue in a queue-set.
type QueueDump struct {
QueueSum QueueSum
Requests []RequestDump
VirtualStart float64
NextDispatchR string
ExecutingRequests int
SeatsInUse int
}
type QueueSum struct {
InitialSeatsSum int
MaxSeatsSum int
TotalWorkSum string
}
// RequestDump is an instant dump of one request pending in the queue.
@@ -42,6 +52,7 @@ type RequestDump struct {
FlowDistinguisher string
ArriveTime time.Time
StartTime time.Time
WorkEstimate flowcontrolrequest.WorkEstimate
// request details
UserName string
RequestInfo request.RequestInfo


@@ -0,0 +1,47 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package eventclock
import (
"time"
baseclock "k8s.io/utils/clock"
)
// EventFunc does some work that needs to be done at or after the
// given time.
type EventFunc func(time.Time)
// EventClock is an active clock abstraction for use in code that is
// testable with a fake clock that itself determines how time may be
// advanced. The timing paradigm is invoking EventFuncs rather than
// synchronizing through channels, so that the fake clock has a handle
// on when associated activity is done.
type Interface interface {
baseclock.PassiveClock
// Sleep returns after the given duration (or more).
Sleep(d time.Duration)
// EventAfterDuration invokes the given EventFunc after the given duration (or more),
// passing the time when the invocation was launched.
EventAfterDuration(f EventFunc, d time.Duration)
// EventAfterTime invokes the given EventFunc at the given time or later,
// passing the time when the invocation was launched.
EventAfterTime(f EventFunc, t time.Time)
}


@@ -1,5 +1,5 @@
/*
Copyright 2019 The Kubernetes Authors.
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -14,12 +14,31 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package counter
package eventclock
// NoOp is a GoRoutineCounter that does not actually count
type NoOp struct{}
import (
"time"
var _ GoRoutineCounter = NoOp{}
"k8s.io/utils/clock"
)
// Add would adjust the count, if a count were being kept
func (NoOp) Add(int) {}
// Real is an EventClock that fires events on real-world time
type Real struct {
clock.RealClock
}
var _ Interface = Real{}
// EventAfterDuration schedules an EventFunc
func (Real) EventAfterDuration(f EventFunc, d time.Duration) {
ch := time.After(d)
go func() {
t := <-ch
f(t)
}()
}
// EventAfterTime schedules an EventFunc
func (r Real) EventAfterTime(f EventFunc, t time.Time) {
r.EventAfterDuration(f, time.Until(t))
}
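A minimal usage sketch of the real event clock (the duration is invented):

package main

import (
	"fmt"
	"time"

	"k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing/eventclock"
)

func main() {
	var clk eventclock.Interface = eventclock.Real{}
	done := make(chan struct{})

	// The EventFunc receives the time at which its invocation was launched.
	clk.EventAfterDuration(func(t time.Time) {
		fmt.Println("fired at", t)
		close(done)
	}, 10*time.Millisecond)

	<-done
}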


@@ -21,17 +21,16 @@ import (
"sync"
"time"
"k8s.io/apimachinery/pkg/util/clock"
"k8s.io/apiserver/pkg/util/flowcontrol/metrics"
"k8s.io/utils/clock"
)
// Integrator computes the moments of some variable X over time as
// read from a particular clock. The integrals start when the
// Integrator is created, and ends at the latest operation on the
// Integrator. As a `metrics.TimedObserver` this fixes X1=1 and
// ignores attempts to change X1.
// Integrator.
type Integrator interface {
metrics.TimedObserver
Set(float64)
Add(float64)
GetResults() IntegratorResults
@@ -70,9 +69,6 @@ func NewIntegrator(clock clock.PassiveClock) Integrator {
}
}
func (igr *integrator) SetX1(x1 float64) {
}
func (igr *integrator) Set(x float64) {
igr.Lock()
igr.setLocked(x)


@@ -22,6 +22,7 @@ import (
"k8s.io/apiserver/pkg/util/flowcontrol/debug"
"k8s.io/apiserver/pkg/util/flowcontrol/metrics"
"k8s.io/apiserver/pkg/util/flowcontrol/request"
)
// QueueSetFactory is used to create QueueSet objects. Creation, like
@@ -30,8 +31,11 @@ import (
// are separated so that errors from the first phase can be found
// before committing to a concurrency allotment for the second.
type QueueSetFactory interface {
// BeginConstruction does the first phase of creating a QueueSet
BeginConstruction(QueuingConfig, metrics.TimedObserverPair) (QueueSetCompleter, error)
// BeginConstruction does the first phase of creating a QueueSet.
// The RatioedGaugePair observes the number of requests waiting and executing,
// with execution covering just the regular phase.
// The RatioedGauge observes number of seats occupied through all phases of execution.
BeginConstruction(QueuingConfig, metrics.RatioedGaugePair, metrics.RatioedGauge) (QueueSetCompleter, error)
}
// QueueSetCompleter finishes the two-step process of creating or
@@ -80,11 +84,7 @@ type QueueSet interface {
// was idle at the moment of the return. Otherwise idle==false
// and the client must call the Finish method of the Request
// exactly once.
StartRequest(ctx context.Context, hashValue uint64, flowDistinguisher, fsName string, descr1, descr2 interface{}, queueNoteFn QueueNoteFn) (req Request, idle bool)
// UpdateObservations makes sure any time-based statistics have
// caught up with the current clock reading
UpdateObservations()
StartRequest(ctx context.Context, width *request.WorkEstimate, hashValue uint64, flowDistinguisher, fsName string, descr1, descr2 interface{}, queueNoteFn QueueNoteFn) (req Request, idle bool)
// Dump saves and returns the instant internal state of the queue-set.
// Note that dumping process will stop the queue-set from proceeding


@@ -16,47 +16,15 @@ limitations under the License.
package promise
// This file defines interfaces for promises and futures and related
// things. These are about coordination among multiple goroutines and
// so are safe for concurrent calls --- although moderated in some
// cases by a requirement that the caller hold a certain lock.
// Readable represents a variable that is initially not set and later
// becomes set. Some instances may be set to multiple values in
// series. A Readable for a variable that can only get one value is
// commonly known as a "future".
type Readable interface {
// Get reads the current value of this variable. If this variable
// is not set yet then this call blocks until this variable gets a
// value.
// WriteOnce represents a variable that is initially not set and can
// be set once and is readable. This is the common meaning for
// "promise".
type WriteOnce interface {
// Get reads the current value of this variable. If this
// variable is not set yet then this call blocks until this
// variable gets a value.
Get() interface{}
// IsSet returns immediately with an indication of whether this
// variable has been set.
IsSet() bool
}
// LockingReadable is a Readable whose implementation is protected by
// a lock
type LockingReadable interface {
Readable
// GetLocked is like Get but the caller must already hold the
// lock. GetLocked may release, and later re-acquire, the lock
// any number of times. Get may acquire, and later release, the
// lock any number of times.
GetLocked() interface{}
// IsSetLocked is like IsSet but the caller must already hold the
// lock. IsSetLocked may release, and later re-acquire, the lock
// any number of times. IsSet may acquire, and later release, the
// lock any number of times.
IsSetLocked() bool
}
// WriteOnceOnly represents a variable that is initially not set and
// can be set once.
type WriteOnceOnly interface {
// Set normally writes a value into this variable, unblocks every
// goroutine waiting for this variable to have a value, and
// returns true. In the unhappy case that this variable is
@@ -64,66 +32,3 @@ type WriteOnceOnly interface {
// variable's value.
Set(interface{}) bool
}
// WriteOnce represents a variable that is initially not set and can
// be set once and is readable. This is the common meaning for
// "promise".
type WriteOnce interface {
Readable
WriteOnceOnly
}
// LockingWriteOnceOnly is a WriteOnceOnly whose implementation is
// protected by a lock.
type LockingWriteOnceOnly interface {
WriteOnceOnly
// SetLocked is like Set but the caller must already hold the
// lock. SetLocked may release, and later re-acquire, the lock
// any number of times. Set may acquire, and later release, the
// lock any number of times
SetLocked(interface{}) bool
}
// LockingWriteOnce is a WriteOnce whose implementation is protected
// by a lock.
type LockingWriteOnce interface {
LockingReadable
LockingWriteOnceOnly
}
// WriteMultipleOnly represents a variable that is initially not set
// and can be set one or more times (unlike a traditional "promise",
// which can be written only once).
type WriteMultipleOnly interface {
// Set writes a value into this variable and unblocks every
// goroutine waiting for this variable to have a value
Set(interface{})
}
// WriteMultiple represents a variable that is initially not set and
// can be set one or more times (unlike a traditional "promise", which
// can be written only once) and is readable.
type WriteMultiple interface {
Readable
WriteMultipleOnly
}
// LockingWriteMultipleOnly is a WriteMultipleOnly whose
// implementation is protected by a lock.
type LockingWriteMultipleOnly interface {
WriteMultipleOnly
// SetLocked is like Set but the caller must already hold the
// lock. SetLocked may release, and later re-acquire, the lock
// any number of times. Set may acquire, and later release, the
// lock any number of times
SetLocked(interface{})
}
// LockingWriteMultiple is a WriteMultiple whose implementation is
// protected by a lock.
type LockingWriteMultiple interface {
LockingReadable
LockingWriteMultipleOnly
}


@@ -1,124 +0,0 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package lockingpromise
import (
"sync"
"k8s.io/apiserver/pkg/util/flowcontrol/counter"
"k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing/promise"
)
// promisoid is the data and behavior common to all the promise-like
// abstractions implemented here. This implementation is based on a
// condition variable. This implementation tracks active goroutines:
// the given counter is decremented for a goroutine waiting for this
// variable to be set and incremented when such a goroutine is
// unblocked.
type promisoid struct {
lock sync.Locker
cond sync.Cond
activeCounter counter.GoRoutineCounter // counter of active goroutines
waitingCount int // number of goroutines idle due to this being unset
isSet bool
value interface{}
}
func (pr *promisoid) Get() interface{} {
pr.lock.Lock()
defer pr.lock.Unlock()
return pr.GetLocked()
}
func (pr *promisoid) GetLocked() interface{} {
if !pr.isSet {
pr.waitingCount++
pr.activeCounter.Add(-1)
pr.cond.Wait()
}
return pr.value
}
func (pr *promisoid) IsSet() bool {
pr.lock.Lock()
defer pr.lock.Unlock()
return pr.IsSetLocked()
}
func (pr *promisoid) IsSetLocked() bool {
return pr.isSet
}
func (pr *promisoid) SetLocked(value interface{}) {
pr.isSet = true
pr.value = value
if pr.waitingCount > 0 {
pr.activeCounter.Add(pr.waitingCount)
pr.waitingCount = 0
pr.cond.Broadcast()
}
}
type writeOnce struct {
promisoid
}
var _ promise.LockingWriteOnce = &writeOnce{}
// NewWriteOnce makes a new promise.LockingWriteOnce
func NewWriteOnce(lock sync.Locker, activeCounter counter.GoRoutineCounter) promise.LockingWriteOnce {
return &writeOnce{promisoid{
lock: lock,
cond: *sync.NewCond(lock),
activeCounter: activeCounter,
}}
}
func (wr *writeOnce) Set(value interface{}) bool {
wr.lock.Lock()
defer wr.lock.Unlock()
return wr.SetLocked(value)
}
func (wr *writeOnce) SetLocked(value interface{}) bool {
if wr.isSet {
return false
}
wr.promisoid.SetLocked(value)
return true
}
type writeMultiple struct {
promisoid
}
var _ promise.LockingWriteMultiple = &writeMultiple{}
// NewWriteMultiple makes a new promise.LockingWriteMultiple
func NewWriteMultiple(lock sync.Locker, activeCounter counter.GoRoutineCounter) promise.LockingWriteMultiple {
return &writeMultiple{promisoid{
lock: lock,
cond: *sync.NewCond(lock),
activeCounter: activeCounter,
}}
}
func (wr *writeMultiple) Set(value interface{}) {
wr.lock.Lock()
defer wr.lock.Unlock()
wr.SetLocked(value)
}


@@ -0,0 +1,70 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package promise
import (
"sync"
)
// promise implements the WriteOnce interface.
type promise struct {
doneCh <-chan struct{}
doneVal interface{}
setCh chan struct{}
onceler sync.Once
value interface{}
}
var _ WriteOnce = &promise{}
// NewWriteOnce makes a new thread-safe WriteOnce.
//
// If `initial` is non-nil then that value is Set at creation time.
//
// If a `Get` is waiting soon after `doneCh` becomes selectable (which
// never happens for the nil channel) then `Set(doneVal)` effectively
// happens at that time.
func NewWriteOnce(initial interface{}, doneCh <-chan struct{}, doneVal interface{}) WriteOnce {
p := &promise{
doneCh: doneCh,
doneVal: doneVal,
setCh: make(chan struct{}),
}
if initial != nil {
p.Set(initial)
}
return p
}
func (p *promise) Get() interface{} {
select {
case <-p.setCh:
case <-p.doneCh:
p.Set(p.doneVal)
}
return p.value
}
func (p *promise) Set(value interface{}) bool {
var ans bool
p.onceler.Do(func() {
p.value = value
close(p.setCh)
ans = true
})
return ans
}
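
A usage sketch of the channel-based WriteOnce (the context wiring is illustrative): Get blocks until the promise is set, and once doneCh becomes selectable an unset promise resolves to doneVal.

package main

import (
	"context"
	"fmt"

	"k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing/promise"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// If ctx is cancelled before Set, Get yields "cancelled".
	p := promise.NewWriteOnce(nil, ctx.Done(), "cancelled")

	go p.Set("executed") // only the first Set wins; later Sets return false

	fmt.Println(p.Get()) // blocks until Set or ctx.Done()
}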


@@ -52,12 +52,12 @@ limitations under the License.
// limit. In the original paper, the partial derivative of R(t) with
// respect to t is
//
// 1 / NEQ(t) .
// 1 / NEQ(t) .
//
// To generalize from transmitting one packet at a time to executing C
// requests at a time, that derivative becomes
//
// C / NEQ(t) .
// C / NEQ(t) .
//
// However, sometimes there are fewer than C requests available to
// execute. For a given queue "q", let us also write "reqs(q, t)" for
@@ -70,7 +70,7 @@ limitations under the License.
// for server requests: at a particular time t, the partial derivative
// of R(t) with respect to t is
//
// min( C, sum[over q] reqs(q, t) ) / NEQ(t) .
// min( C, sum[over q] reqs(q, t) ) / NEQ(t) .
//
// In terms of the implementation outline, this is the rate at which
// virtual time is advancing at time t (in virtual nanoseconds per
@@ -116,5 +116,4 @@ limitations under the License.
// queues virtual start time is advanced by G. When a request
// finishes being served, and the actual service time was S, the
// queues virtual start time is decremented by G - S.
//
package queueset
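
A worked instance of that rate formula with invented numbers: with concurrency C = 10, NEQ(t) = 4 non-empty queues, and 6 requests spread across them, R(t) advances at min(10, 6)/4 = 1.5 virtual units per real unit of time.

package main

import "fmt"

func main() {
	c := 10.0      // concurrency limit C
	neq := 4.0     // NEQ(t): number of non-empty queues
	sumReqs := 6.0 // sum over q of reqs(q, t)

	// dR/dt = min(C, sum reqs) / NEQ(t)
	num := sumReqs
	if c < sumReqs {
		num = c
	}
	fmt.Println(num / neq) // 1.5
}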


@@ -0,0 +1,156 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queueset
import (
"container/list"
)
// removeFromFIFOFunc removes a designated element from the list
// if that element is in the list.
// Removal has O(1) runtime cost.
// The returned value is the element removed, if indeed one was removed,
// otherwise `nil`.
type removeFromFIFOFunc func() *request
// walkFunc is called for each request in the list in the
// oldest -> newest order.
// ok: if walkFunc returns false then the iteration stops immediately.
// walkFunc may remove the given request from the fifo,
// but may not mutate the fifo in any other way.
type walkFunc func(*request) (ok bool)
// Internal interface to abstract out the implementation details
// of the underlying list used to maintain the requests.
//
// Note that a fifo, including the removeFromFIFOFuncs returned from Enqueue,
// is not safe for concurrent use by multiple goroutines.
type fifo interface {
// Enqueue enqueues the specified request into the list and
// returns a removeFromFIFOFunc function that can be used to remove the
// request from the list
Enqueue(*request) removeFromFIFOFunc
// Dequeue pulls out the oldest request from the list.
Dequeue() (*request, bool)
// Peek returns the oldest request without removing it.
Peek() (*request, bool)
// Length returns the number of requests in the list.
Length() int
// QueueSum returns the sum of initial seats, final seats, and
// additional latency aggregated from all requests in this queue.
QueueSum() queueSum
// Walk iterates through the list in order of oldest -> newest
// and executes the specified walkFunc for each request in that order.
//
// if the specified walkFunc returns false the Walk function
// stops the walk and returns immediately.
Walk(walkFunc)
}
// the FIFO list implementation is not safe for concurrent use by multiple
// goroutines.
type requestFIFO struct {
*list.List
sum queueSum
}
func newRequestFIFO() fifo {
return &requestFIFO{
List: list.New(),
}
}
func (l *requestFIFO) Length() int {
return l.Len()
}
func (l *requestFIFO) QueueSum() queueSum {
return l.sum
}
func (l *requestFIFO) Enqueue(req *request) removeFromFIFOFunc {
e := l.PushBack(req)
addToQueueSum(&l.sum, req)
return func() *request {
if e.Value == nil {
return nil
}
l.Remove(e)
e.Value = nil
deductFromQueueSum(&l.sum, req)
return req
}
}
func (l *requestFIFO) Dequeue() (*request, bool) {
return l.getFirst(true)
}
func (l *requestFIFO) Peek() (*request, bool) {
return l.getFirst(false)
}
func (l *requestFIFO) getFirst(remove bool) (*request, bool) {
e := l.Front()
if e == nil {
return nil, false
}
if remove {
defer func() {
l.Remove(e)
e.Value = nil
}()
}
request, ok := e.Value.(*request)
if remove && ok {
deductFromQueueSum(&l.sum, request)
}
return request, ok
}
func (l *requestFIFO) Walk(f walkFunc) {
var next *list.Element
for current := l.Front(); current != nil; current = next {
next = current.Next() // f is allowed to remove current
if r, ok := current.Value.(*request); ok {
if !f(r) {
return
}
}
}
}
func addToQueueSum(sum *queueSum, req *request) {
sum.InitialSeatsSum += req.InitialSeats()
sum.MaxSeatsSum += req.MaxSeats()
sum.TotalWorkSum += req.totalWork()
}
func deductFromQueueSum(sum *queueSum, req *request) {
sum.InitialSeatsSum -= req.InitialSeats()
sum.MaxSeatsSum -= req.MaxSeats()
sum.TotalWorkSum -= req.totalWork()
}
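
A short sketch of the fifo contract above (illustrative, assuming this queueset package; constructing a real *request is elided because it has many fields):

func fifoSketch(reqA, reqB *request) {
	q := newRequestFIFO()
	removeA := q.Enqueue(reqA) // O(1) remover for reqA
	q.Enqueue(reqB)

	q.Walk(func(r *request) bool {
		return r != reqA // oldest -> newest; stop once reqA is seen
	})

	if removed := removeA(); removed != nil {
		// reqA is out of the list; a second removeA() call returns nil
	}
	oldest, ok := q.Dequeue() // now reqB, true
	_, _ = oldest, ok
}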

File diff suppressed because it is too large Load Diff

View File

@@ -24,10 +24,11 @@ import (
"k8s.io/apiserver/pkg/util/flowcontrol/debug"
fq "k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing"
"k8s.io/apiserver/pkg/util/flowcontrol/fairqueuing/promise"
fcrequest "k8s.io/apiserver/pkg/util/flowcontrol/request"
)
// request is a temporary container for "requests" with additional
// tracking fields required for the functionality FQScheduler
// tracking fields required for QueueSet functionality.
type request struct {
ctx context.Context
@@ -40,15 +41,17 @@ type request struct {
// a queue.
queue *queue
// startTime is the real time when the request began executing
startTime time.Time
// estimated amount of work of the request
workEstimate completedWorkEstimate
// decision gets set to a `requestDecision` indicating what to do
// with this request. It gets set exactly once, when the request
// is removed from its queue. The value will be decisionReject,
// decisionCancel, or decisionExecute; decisionTryAnother never
// appears here.
decision promise.LockingWriteOnce
// decisionCancel, or decisionExecute.
//
// decision.Set is called with the queueSet locked.
// decision.Get is called without the queueSet locked.
decision promise.WriteOnce
// arrivalTime is the real time when the request entered this system
arrivalTime time.Time
@@ -57,60 +60,100 @@ type request struct {
// log messages
descr1, descr2 interface{}
queueNoteFn fq.QueueNoteFn
// The preceding fields are filled in at creation and not modified since;
// the following fields may be modified later and must only be accessed while
// holding the queueSet's lock.
// Removes this request from its queue. If the request is not put into
// a queue it will be nil.
removeFromQueueLocked removeFromFIFOFunc
// arrivalR is R(arrivalTime). R is, confusingly, also called "virtual time".
// This field is meaningful only while the request is waiting in the virtual world.
arrivalR fcrequest.SeatSeconds
// startTime is the real time when the request began executing
startTime time.Time
// Indicates whether client has called Request::Wait()
waitStarted bool
queueNoteFn fq.QueueNoteFn
}
// queue is an array of requests with additional metadata required for
// the FQScheduler
type completedWorkEstimate struct {
fcrequest.WorkEstimate
totalWork fcrequest.SeatSeconds // initial plus final work
finalWork fcrequest.SeatSeconds // only final work
}
// queue is a sequence of requests that have arrived but not yet finished
// execution in both the real and virtual worlds.
type queue struct {
requests []*request
// The requests not yet executing in the real world are stored in a FIFO list.
requests fifo
// virtualStart is the virtual time (virtual seconds since process
// startup) when the oldest request in the queue (if there is any)
// started virtually executing
virtualStart float64
// nextDispatchR is the R progress meter reading at
// which the next request will be dispatched in the virtual world.
nextDispatchR fcrequest.SeatSeconds
// requestsExecuting is the count in the real world.
requestsExecuting int
index int
// index is the position of this queue among those in its queueSet.
index int
// seatsInUse is the total number of "seats" currently occupied
// by all the requests that are currently executing in this queue.
seatsInUse int
}
// Enqueue enqueues a request into the queue
func (q *queue) Enqueue(request *request) {
q.requests = append(q.requests, request)
// queueSum tracks the sum of initial seats, max seats, and
// totalWork from all requests in a given queue
type queueSum struct {
// InitialSeatsSum is the sum of InitialSeats
// associated with all requests in a given queue.
InitialSeatsSum int
// MaxSeatsSum is the sum of MaxSeats
// associated with all requests in a given queue.
MaxSeatsSum int
// TotalWorkSum is the sum of totalWork of the waiting requests
TotalWorkSum fcrequest.SeatSeconds
}
// Dequeue dequeues a request from the queue
func (q *queue) Dequeue() (*request, bool) {
if len(q.requests) == 0 {
return nil, false
func (req *request) totalWork() fcrequest.SeatSeconds {
return req.workEstimate.totalWork
}
func (qs *queueSet) completeWorkEstimate(we *fcrequest.WorkEstimate) completedWorkEstimate {
finalWork := qs.computeFinalWork(we)
return completedWorkEstimate{
WorkEstimate: *we,
totalWork: qs.computeInitialWork(we) + finalWork,
finalWork: finalWork,
}
request := q.requests[0]
q.requests = q.requests[1:]
return request, true
}
// GetVirtualFinish returns the expected virtual finish time of the request at
// index J in the queue with estimated finish time G
func (q *queue) GetVirtualFinish(J int, G float64) float64 {
// The virtual finish time of request number J in the queue
// (counting from J=1 for the head) is J * G + (virtual start time).
// counting from J=1 for the head (eg: queue.requests[0] -> J=1) - J+1
jg := float64(J+1) * float64(G)
return jg + q.virtualStart
func (qs *queueSet) computeInitialWork(we *fcrequest.WorkEstimate) fcrequest.SeatSeconds {
return fcrequest.SeatsTimesDuration(float64(we.InitialSeats), qs.estimatedServiceDuration)
}
func (q *queue) dump(includeDetails bool) debug.QueueDump {
digest := make([]debug.RequestDump, len(q.requests))
for i, r := range q.requests {
func (qs *queueSet) computeFinalWork(we *fcrequest.WorkEstimate) fcrequest.SeatSeconds {
return fcrequest.SeatsTimesDuration(float64(we.FinalSeats), we.AdditionalLatency)
}
func (q *queue) dumpLocked(includeDetails bool) debug.QueueDump {
digest := make([]debug.RequestDump, q.requests.Length())
i := 0
q.requests.Walk(func(r *request) bool {
// dump requests.
digest[i].MatchedFlowSchema = r.fsName
digest[i].FlowDistinguisher = r.flowDistinguisher
digest[i].ArriveTime = r.arrivalTime
digest[i].StartTime = r.startTime
digest[i].WorkEstimate = r.workEstimate.WorkEstimate
if includeDetails {
userInfo, _ := genericrequest.UserFrom(r.ctx)
digest[i].UserName = userInfo.GetName()
@@ -119,10 +162,22 @@ func (q *queue) dump(includeDetails bool) debug.QueueDump {
digest[i].RequestInfo = *requestInfo
}
}
i++
return true
})
sum := q.requests.QueueSum()
queueSum := debug.QueueSum{
InitialSeatsSum: sum.InitialSeatsSum,
MaxSeatsSum: sum.MaxSeatsSum,
TotalWorkSum: sum.TotalWorkSum.String(),
}
return debug.QueueDump{
VirtualStart: q.virtualStart,
NextDispatchR: q.nextDispatchR.String(),
Requests: digest,
ExecutingRequests: q.requestsExecuting,
SeatsInUse: q.seatsInUse,
QueueSum: queueSum,
}
}
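
To make the work-estimate arithmetic above concrete, a worked example with illustrative numbers (not taken from the diff):

	initial work = InitialSeats * estimatedServiceDuration = 2 seats * 1s    = 2.0 seat-seconds
	final work   = FinalSeats   * AdditionalLatency        = 5 seats * 100ms = 0.5 seat-seconds
	total work   = initial + final                                           = 2.5 seat-seconds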

View File

@@ -21,7 +21,7 @@ import (
"encoding/json"
"fmt"
flowcontrol "k8s.io/api/flowcontrol/v1beta1"
flowcontrol "k8s.io/api/flowcontrol/v1beta2"
"k8s.io/apiserver/pkg/authentication/user"
"k8s.io/apiserver/pkg/endpoints/request"
)

View File

@@ -0,0 +1,67 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
// Gauge is the methods of a gauge that are used by instrumented code.
type Gauge interface {
Set(float64)
Inc()
Dec()
Add(float64)
SetToCurrentTime()
}
// RatioedGauge tracks ratios.
// The numerator is set/changed through the Gauge methods,
// and the denominator can be updated through the SetDenominator method.
// A ratio is tracked whenever the numerator or denominator is set/changed.
type RatioedGauge interface {
Gauge
// SetDenominator sets the denominator to use until it is changed again
SetDenominator(float64)
}
// RatioedGaugeVec creates related observers that are
// differentiated by a series of label values
type RatioedGaugeVec interface {
// NewForLabelValuesSafe makes a new vector member for the given tuple of label values,
// initialized with the given numerator and denominator.
// Unlike the usual Vec WithLabelValues method, this is intended to be called only
// once per vector member (at the start of its lifecycle).
// The "Safe" part is saying that the returned object will function properly after metric registration
// even if this method is called before registration.
NewForLabelValuesSafe(initialNumerator, initialDenominator float64, labelValues []string) RatioedGauge
}
//////////////////////////////// Pairs ////////////////////////////////
//
// API Priority and Fairness tends to use RatioedGaugeVec members in pairs,
// one for requests waiting in a queue and one for requests being executed.
// The following definitions are a convenience layer that adds support for that
// particular pattern of usage.
// RatioedGaugePair is a corresponding pair of gauges, one for the
// number of requests waiting in queue(s) and one for the number of
// requests being executed.
type RatioedGaugePair struct {
// RequestsWaiting is given observations of the number of currently queued requests
RequestsWaiting RatioedGauge
// RequestsExecuting is given observations of the number of requests currently executing
RequestsExecuting RatioedGauge
}
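
For orientation, a minimal in-memory RatioedGauge sketch (illustrative only; assumes import "time"; the real implementations in this package are histogram-backed and safe for concurrent use, which this one is not):

type memRatioedGauge struct {
	numerator, denominator, lastRatio float64
}

func (g *memRatioedGauge) observe()          { g.lastRatio = g.numerator / g.denominator }
func (g *memRatioedGauge) Set(x float64)     { g.numerator = x; g.observe() }
func (g *memRatioedGauge) Add(dx float64)    { g.numerator += dx; g.observe() }
func (g *memRatioedGauge) Inc()              { g.Add(1) }
func (g *memRatioedGauge) Dec()              { g.Add(-1) }
func (g *memRatioedGauge) SetToCurrentTime() { g.Set(float64(time.Now().UnixNano()) / 1e9) }
func (g *memRatioedGauge) SetDenominator(d float64) { g.denominator = d; g.observe() }

var _ RatioedGauge = &memRatioedGauge{}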

View File

@@ -18,11 +18,13 @@ package metrics
import (
"context"
"strconv"
"strings"
"sync"
"time"
"k8s.io/apimachinery/pkg/util/clock"
epmetrics "k8s.io/apiserver/pkg/endpoints/metrics"
apirequest "k8s.io/apiserver/pkg/endpoints/request"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
basemetricstestutil "k8s.io/component-base/metrics/testutil"
@@ -34,11 +36,13 @@ const (
)
const (
requestKind = "request_kind"
priorityLevel = "priority_level"
flowSchema = "flow_schema"
phase = "phase"
mark = "mark"
requestKind = "request_kind"
priorityLevel = "priority_level"
flowSchema = "flow_schema"
phase = "phase"
LabelNamePhase = "phase"
LabelValueWaiting = "waiting"
LabelValueExecuting = "executing"
)
var (
@@ -88,7 +92,7 @@ var (
Namespace: namespace,
Subsystem: subsystem,
Name: "rejected_requests_total",
Help: "Number of requests rejected by API Priority and Fairness system",
Help: "Number of requests rejected by API Priority and Fairness subsystem",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, flowSchema, "reason"},
@@ -98,58 +102,109 @@ var (
Namespace: namespace,
Subsystem: subsystem,
Name: "dispatched_requests_total",
Help: "Number of requests released by API Priority and Fairness system for service",
Help: "Number of requests executed by API Priority and Fairness subsystem",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, flowSchema},
)
// PriorityLevelConcurrencyObserverPairGenerator creates pairs that observe concurrency for priority levels
PriorityLevelConcurrencyObserverPairGenerator = NewSampleAndWaterMarkHistogramsPairGenerator(clock.RealClock{}, time.Millisecond,
&compbasemetrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "priority_level_request_count_samples",
Help: "Periodic observations of the number of requests",
Buckets: []float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1},
// PriorityLevelExecutionSeatsGaugeVec creates observers of seats occupied throughout execution for priority levels
PriorityLevelExecutionSeatsGaugeVec = NewTimingRatioHistogramVec(
&compbasemetrics.TimingHistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "priority_level_seat_utilization",
Help: "Observations, at the end of every nanosecond, of utilization of seats for any stage of execution (but only initial stage for WATCHes)",
// Buckets for both 0.99 and 1.0 mean PromQL's histogram_quantile will reveal saturation
Buckets: []float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1},
ConstLabels: map[string]string{phase: "executing"},
StabilityLevel: compbasemetrics.ALPHA,
},
&compbasemetrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "priority_level_request_count_watermarks",
Help: "Watermarks of the number of requests",
Buckets: []float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1},
priorityLevel,
)
// PriorityLevelConcurrencyGaugeVec creates gauges of concurrency broken down by phase, priority level
PriorityLevelConcurrencyGaugeVec = NewTimingRatioHistogramVec(
&compbasemetrics.TimingHistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "priority_level_request_utilization",
Help: "Observations, at the end of every nanosecond, of number of requests (as a fraction of the relevant limit) waiting or in any stage of execution (but only initial stage for WATCHes)",
// For executing: the denominator will be seats, so this metric will skew low.
// For waiting: total queue capacity is generally quite generous, so this metric will skew low.
Buckets: []float64{0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.25, 0.5, 0.75, 1},
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel})
// ReadWriteConcurrencyObserverPairGenerator creates pairs that observe concurrency broken down by mutating vs readonly
ReadWriteConcurrencyObserverPairGenerator = NewSampleAndWaterMarkHistogramsPairGenerator(clock.RealClock{}, time.Millisecond,
&compbasemetrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "read_vs_write_request_count_samples",
Help: "Periodic observations of the number of requests",
Buckets: []float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1},
LabelNamePhase, priorityLevel,
)
// readWriteConcurrencyGaugeVec creates ratioed gauges of requests/limit broken down by phase and mutating vs readonly
readWriteConcurrencyGaugeVec = NewTimingRatioHistogramVec(
&compbasemetrics.TimingHistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "read_vs_write_current_requests",
Help: "Observations, at the end of every nanosecond, of the number of requests (as a fraction of the relevant limit) waiting or in regular stage of execution",
// This metric will skew low for the same reason as the priority level metrics
// and also because APF has a combined limit for mutating and readonly.
Buckets: []float64{0, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1},
StabilityLevel: compbasemetrics.ALPHA,
},
&compbasemetrics.HistogramOpts{
LabelNamePhase, requestKind,
)
apiserverCurrentR = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "read_vs_write_request_count_watermarks",
Help: "Watermarks of the number of requests",
Buckets: []float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1},
Name: "current_r",
Help: "R(time of last change)",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{requestKind})
[]string{priorityLevel},
)
apiserverDispatchR = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "dispatch_r",
Help: "R(time of last dispatch)",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel},
)
apiserverLatestS = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "latest_s",
Help: "S(most recently dispatched request)",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel},
)
apiserverNextSBounds = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "next_s_bounds",
Help: "min and max, over queues, of S(oldest waiting request in queue)",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, "bound"},
)
apiserverNextDiscountedSBounds = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "next_discounted_s_bounds",
Help: "min and max, over queues, of S(oldest waiting request in queue) - estimated work in progress",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, "bound"},
)
apiserverCurrentInqueueRequests = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "current_inqueue_requests",
Help: "Number of requests currently pending in queues of the API Priority and Fairness system",
Help: "Number of requests currently pending in queues of the API Priority and Fairness subsystem",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, flowSchema},
@@ -159,7 +214,7 @@ var (
Namespace: namespace,
Subsystem: subsystem,
Name: "request_queue_length_after_enqueue",
Help: "Length of queue in the API Priority and Fairness system, as seen by each request after it is enqueued",
Help: "Length of queue in the API Priority and Fairness subsystem, as seen by each request after it is enqueued",
Buckets: queueLengthBuckets,
StabilityLevel: compbasemetrics.ALPHA,
},
@@ -170,7 +225,7 @@ var (
Namespace: namespace,
Subsystem: subsystem,
Name: "request_concurrency_limit",
Help: "Shared concurrency limit in the API Priority and Fairness system",
Help: "Shared concurrency limit in the API Priority and Fairness subsystem",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel},
@@ -180,7 +235,17 @@ var (
Namespace: namespace,
Subsystem: subsystem,
Name: "current_executing_requests",
Help: "Number of requests currently executing in the API Priority and Fairness system",
Help: "Number of requests in initial (for a WATCH) or any (for a non-WATCH) execution stage in the API Priority and Fairness subsystem",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, flowSchema},
)
apiserverRequestConcurrencyInUse = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "request_concurrency_in_use",
Help: "Concurrency (number of seats) occupied by the currently executing (initial stage for a WATCH, any stage otherwise) requests in the API Priority and Fairness subsystem",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, flowSchema},
@@ -201,26 +266,112 @@ var (
Namespace: namespace,
Subsystem: subsystem,
Name: "request_execution_seconds",
Help: "Duration of request execution in the API Priority and Fairness system",
Help: "Duration of initial stage (for a WATCH) or any (for a non-WATCH) stage of request execution in the API Priority and Fairness subsystem",
Buckets: requestDurationSecondsBuckets,
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, flowSchema, "type"},
)
watchCountSamples = compbasemetrics.NewHistogramVec(
&compbasemetrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "watch_count_samples",
Help:           "Count of watchers for mutating requests in API Priority and Fairness",
Buckets: []float64{0, 1, 10, 100, 1000, 10000},
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, flowSchema},
)
apiserverEpochAdvances = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "epoch_advance_total",
Help: "Number of times the queueset's progress meter jumped backward",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, "success"},
)
apiserverWorkEstimatedSeats = compbasemetrics.NewHistogramVec(
&compbasemetrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "work_estimated_seats",
Help: "Number of estimated seats (maximum of initial and final seats) associated with requests in API Priority and Fairness",
// the upper bound comes from the maximum number of seats a request
// can occupy which is currently set at 10.
Buckets: []float64{1, 2, 4, 10},
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, flowSchema},
)
apiserverDispatchWithNoAccommodation = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "request_dispatch_no_accommodation_total",
Help:           "Number of times a dispatch attempt resulted in a non-accommodation due to lack of available seats",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{priorityLevel, flowSchema},
)
metrics = Registerables{
apiserverRejectedRequestsTotal,
apiserverDispatchedRequestsTotal,
apiserverCurrentR,
apiserverDispatchR,
apiserverLatestS,
apiserverNextSBounds,
apiserverNextDiscountedSBounds,
apiserverCurrentInqueueRequests,
apiserverRequestQueueLength,
apiserverRequestConcurrencyLimit,
apiserverRequestConcurrencyInUse,
apiserverCurrentExecutingRequests,
apiserverRequestWaitingSeconds,
apiserverRequestExecutionSeconds,
watchCountSamples,
apiserverEpochAdvances,
apiserverWorkEstimatedSeats,
apiserverDispatchWithNoAccommodation,
}.
Append(PriorityLevelConcurrencyObserverPairGenerator.metrics()...).
Append(ReadWriteConcurrencyObserverPairGenerator.metrics()...)
Append(PriorityLevelExecutionSeatsGaugeVec.metrics()...).
Append(PriorityLevelConcurrencyGaugeVec.metrics()...).
Append(readWriteConcurrencyGaugeVec.metrics()...)
)
type indexOnce struct {
labelValues []string
once sync.Once
gauge RatioedGauge
}
func (io *indexOnce) getGauge() RatioedGauge {
io.once.Do(func() {
io.gauge = readWriteConcurrencyGaugeVec.NewForLabelValuesSafe(0, 1, io.labelValues)
})
return io.gauge
}
var waitingReadonly = indexOnce{labelValues: []string{LabelValueWaiting, epmetrics.ReadOnlyKind}}
var executingReadonly = indexOnce{labelValues: []string{LabelValueExecuting, epmetrics.ReadOnlyKind}}
var waitingMutating = indexOnce{labelValues: []string{LabelValueWaiting, epmetrics.MutatingKind}}
var executingMutating = indexOnce{labelValues: []string{LabelValueExecuting, epmetrics.MutatingKind}}
// GetWaitingReadonlyConcurrency returns the gauge of number of readonly requests waiting / limit on those.
var GetWaitingReadonlyConcurrency = waitingReadonly.getGauge
// GetExecutingReadonlyConcurrency returns the gauge of number of executing readonly requests / limit on those.
var GetExecutingReadonlyConcurrency = executingReadonly.getGauge
// GetWaitingMutatingConcurrency returns the gauge of number of mutating requests waiting / limit on those.
var GetWaitingMutatingConcurrency = waitingMutating.getGauge
// GetExecutingMutatingConcurrency returns the gauge of number of executing mutating requests / limit on those.
var GetExecutingMutatingConcurrency = executingMutating.getGauge
// AddRequestsInQueues adds the given delta to the gauge of the # of requests in the queues of the specified flowSchema and priorityLevel
func AddRequestsInQueues(ctx context.Context, priorityLevel, flowSchema string, delta int) {
apiserverCurrentInqueueRequests.WithLabelValues(priorityLevel, flowSchema).Add(float64(delta))
@@ -231,6 +382,27 @@ func AddRequestsExecuting(ctx context.Context, priorityLevel, flowSchema string,
apiserverCurrentExecutingRequests.WithLabelValues(priorityLevel, flowSchema).Add(float64(delta))
}
// SetCurrentR sets the current-R (virtualTime) gauge for the given priority level
func SetCurrentR(priorityLevel string, r float64) {
apiserverCurrentR.WithLabelValues(priorityLevel).Set(r)
}
// SetDispatchMetrics sets the dispatch-progress gauges for the given priority level: R at the last dispatch, S of the most recently dispatched request, and the min/max S bounds
func SetDispatchMetrics(priorityLevel string, r, s, sMin, sMax, discountedSMin, discountedSMax float64) {
apiserverDispatchR.WithLabelValues(priorityLevel).Set(r)
apiserverLatestS.WithLabelValues(priorityLevel).Set(s)
apiserverNextSBounds.WithLabelValues(priorityLevel, "min").Set(sMin)
apiserverNextSBounds.WithLabelValues(priorityLevel, "max").Set(sMax)
apiserverNextDiscountedSBounds.WithLabelValues(priorityLevel, "min").Set(discountedSMin)
apiserverNextDiscountedSBounds.WithLabelValues(priorityLevel, "max").Set(discountedSMax)
}
// AddRequestConcurrencyInUse adds the given delta to the gauge of concurrency in use by
// the currently executing requests of the given flowSchema and priorityLevel
func AddRequestConcurrencyInUse(priorityLevel, flowSchema string, delta int) {
apiserverRequestConcurrencyInUse.WithLabelValues(priorityLevel, flowSchema).Add(float64(delta))
}
// UpdateSharedConcurrencyLimit updates the value for the concurrency limit in flow control
func UpdateSharedConcurrencyLimit(priorityLevel string, limit int) {
apiserverRequestConcurrencyLimit.WithLabelValues(priorityLevel).Set(float64(limit))
@@ -258,5 +430,30 @@ func ObserveWaitingDuration(ctx context.Context, priorityLevel, flowSchema, exec
// ObserveExecutionDuration observes the execution duration for flow control
func ObserveExecutionDuration(ctx context.Context, priorityLevel, flowSchema string, executionTime time.Duration) {
apiserverRequestExecutionSeconds.WithContext(ctx).WithLabelValues(priorityLevel, flowSchema).Observe(executionTime.Seconds())
reqType := "regular"
if requestInfo, ok := apirequest.RequestInfoFrom(ctx); ok && requestInfo.Verb == "watch" {
reqType = requestInfo.Verb
}
apiserverRequestExecutionSeconds.WithContext(ctx).WithLabelValues(priorityLevel, flowSchema, reqType).Observe(executionTime.Seconds())
}
// ObserveWatchCount notes a sampling of a watch count
func ObserveWatchCount(ctx context.Context, priorityLevel, flowSchema string, count int) {
watchCountSamples.WithLabelValues(priorityLevel, flowSchema).Observe(float64(count))
}
// AddEpochAdvance notes an advance of the progress meter baseline for a given priority level
func AddEpochAdvance(ctx context.Context, priorityLevel string, success bool) {
apiserverEpochAdvances.WithContext(ctx).WithLabelValues(priorityLevel, strconv.FormatBool(success)).Inc()
}
// ObserveWorkEstimatedSeats notes a sampling of estimated seats associated with a request
func ObserveWorkEstimatedSeats(priorityLevel, flowSchema string, seats int) {
apiserverWorkEstimatedSeats.WithLabelValues(priorityLevel, flowSchema).Observe(float64(seats))
}
// AddDispatchWithNoAccommodation keeps track of the number of times a dispatch attempt results
// in a non-accommodation due to lack of available seats.
func AddDispatchWithNoAccommodation(priorityLevel, flowSchema string) {
apiserverDispatchWithNoAccommodation.WithLabelValues(priorityLevel, flowSchema).Inc()
}
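
A sketch of how the bookkeeping helpers above fit together over one request's lifetime (the priority level and flow schema names are made up, and the real dispatcher places these calls differently):

func trackOneRequest(ctx context.Context) {
	plName, fsName := "workload-low", "service-accounts"

	AddRequestsInQueues(ctx, plName, fsName, 1) // request enters a queue
	// ... request waits, then is dispatched ...
	AddRequestsInQueues(ctx, plName, fsName, -1)

	AddRequestsExecuting(ctx, plName, fsName, 1)
	AddRequestConcurrencyInUse(plName, fsName, 2) // occupies 2 seats
	// ... request executes ...
	ObserveExecutionDuration(ctx, plName, fsName, 40*time.Millisecond)
	AddRequestConcurrencyInUse(plName, fsName, -2)
	AddRequestsExecuting(ctx, plName, fsName, -1)
}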

View File

@@ -1,209 +0,0 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
"time"
"k8s.io/apimachinery/pkg/util/clock"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/klog/v2"
)
const (
labelNameMark = "mark"
labelValueLo = "low"
labelValueHi = "high"
labelNamePhase = "phase"
labelValueWaiting = "waiting"
labelValueExecuting = "executing"
)
// SampleAndWaterMarkPairGenerator makes pairs of TimedObservers that
// track samples and watermarks.
type SampleAndWaterMarkPairGenerator struct {
urGenerator SampleAndWaterMarkObserverGenerator
}
var _ TimedObserverPairGenerator = SampleAndWaterMarkPairGenerator{}
// NewSampleAndWaterMarkHistogramsPairGenerator makes a new pair generator
func NewSampleAndWaterMarkHistogramsPairGenerator(clock clock.PassiveClock, samplePeriod time.Duration, sampleOpts, waterMarkOpts *compbasemetrics.HistogramOpts, labelNames []string) SampleAndWaterMarkPairGenerator {
return SampleAndWaterMarkPairGenerator{
urGenerator: NewSampleAndWaterMarkHistogramsGenerator(clock, samplePeriod, sampleOpts, waterMarkOpts, append([]string{labelNamePhase}, labelNames...)),
}
}
// Generate makes a new pair
func (spg SampleAndWaterMarkPairGenerator) Generate(waiting1, executing1 float64, labelValues []string) TimedObserverPair {
return TimedObserverPair{
RequestsWaiting: spg.urGenerator.Generate(0, waiting1, append([]string{labelValueWaiting}, labelValues...)),
RequestsExecuting: spg.urGenerator.Generate(0, executing1, append([]string{labelValueExecuting}, labelValues...)),
}
}
func (spg SampleAndWaterMarkPairGenerator) metrics() Registerables {
return spg.urGenerator.metrics()
}
// SampleAndWaterMarkObserverGenerator creates TimedObservers that
// populate histograms of samples and low- and high-water-marks. The
// generator has a samplePeriod, and the histograms get an observation
// every samplePeriod. The sampling windows are quantized based on
// the monotonic rather than wall-clock times. The `t0` field is
// there so to provide a baseline for monotonic clock differences.
type SampleAndWaterMarkObserverGenerator struct {
*sampleAndWaterMarkObserverGenerator
}
type sampleAndWaterMarkObserverGenerator struct {
clock clock.PassiveClock
t0 time.Time
samplePeriod time.Duration
samples *compbasemetrics.HistogramVec
waterMarks *compbasemetrics.HistogramVec
}
var _ TimedObserverGenerator = (*sampleAndWaterMarkObserverGenerator)(nil)
// NewSampleAndWaterMarkHistogramsGenerator makes a new one
func NewSampleAndWaterMarkHistogramsGenerator(clock clock.PassiveClock, samplePeriod time.Duration, sampleOpts, waterMarkOpts *compbasemetrics.HistogramOpts, labelNames []string) SampleAndWaterMarkObserverGenerator {
return SampleAndWaterMarkObserverGenerator{
&sampleAndWaterMarkObserverGenerator{
clock: clock,
t0: clock.Now(),
samplePeriod: samplePeriod,
samples: compbasemetrics.NewHistogramVec(sampleOpts, labelNames),
waterMarks: compbasemetrics.NewHistogramVec(waterMarkOpts, append([]string{labelNameMark}, labelNames...)),
}}
}
func (swg *sampleAndWaterMarkObserverGenerator) quantize(when time.Time) int64 {
return int64(when.Sub(swg.t0) / swg.samplePeriod)
}
// Generate makes a new TimedObserver
func (swg *sampleAndWaterMarkObserverGenerator) Generate(x, x1 float64, labelValues []string) TimedObserver {
relX := x / x1
when := swg.clock.Now()
return &sampleAndWaterMarkHistograms{
sampleAndWaterMarkObserverGenerator: swg,
labelValues: labelValues,
loLabelValues: append([]string{labelValueLo}, labelValues...),
hiLabelValues: append([]string{labelValueHi}, labelValues...),
x1: x1,
sampleAndWaterMarkAccumulator: sampleAndWaterMarkAccumulator{
lastSet: when,
lastSetInt: swg.quantize(when),
x: x,
relX: relX,
loRelX: relX,
hiRelX: relX,
}}
}
func (swg *sampleAndWaterMarkObserverGenerator) metrics() Registerables {
return Registerables{swg.samples, swg.waterMarks}
}
type sampleAndWaterMarkHistograms struct {
*sampleAndWaterMarkObserverGenerator
labelValues []string
loLabelValues, hiLabelValues []string
sync.Mutex
x1 float64
sampleAndWaterMarkAccumulator
}
type sampleAndWaterMarkAccumulator struct {
lastSet time.Time
lastSetInt int64 // lastSet / samplePeriod
x float64
relX float64 // x / x1
loRelX, hiRelX float64
}
var _ TimedObserver = (*sampleAndWaterMarkHistograms)(nil)
func (saw *sampleAndWaterMarkHistograms) Add(deltaX float64) {
saw.innerSet(func() {
saw.x += deltaX
})
}
func (saw *sampleAndWaterMarkHistograms) Set(x float64) {
saw.innerSet(func() {
saw.x = x
})
}
func (saw *sampleAndWaterMarkHistograms) SetX1(x1 float64) {
saw.innerSet(func() {
saw.x1 = x1
})
}
func (saw *sampleAndWaterMarkHistograms) innerSet(updateXOrX1 func()) {
when, whenInt, acc, wellOrdered := func() (time.Time, int64, sampleAndWaterMarkAccumulator, bool) {
saw.Lock()
defer saw.Unlock()
// Moved these variables here to tiptoe around https://github.com/golang/go/issues/43570 for #97685
when := saw.clock.Now()
whenInt := saw.quantize(when)
acc := saw.sampleAndWaterMarkAccumulator
wellOrdered := !when.Before(acc.lastSet)
updateXOrX1()
saw.relX = saw.x / saw.x1
if wellOrdered {
if acc.lastSetInt < whenInt {
saw.loRelX, saw.hiRelX = acc.relX, acc.relX
saw.lastSetInt = whenInt
}
saw.lastSet = when
}
// `wellOrdered` should always be true because we are using
// monotonic clock readings and they never go backwards. Yet
// very small backwards steps (under 1 microsecond) have been
// observed
// (https://github.com/kubernetes/kubernetes/issues/96459).
// In the backwards case, treat the current reading as if it
// had occurred at time `saw.lastSet` and log an error. It
// would be wrong to update `saw.lastSet` in this case because
// that plants a time bomb for future updates to
// `saw.lastSetInt`.
if saw.relX < saw.loRelX {
saw.loRelX = saw.relX
} else if saw.relX > saw.hiRelX {
saw.hiRelX = saw.relX
}
return when, whenInt, acc, wellOrdered
}()
if !wellOrdered {
lastSetS := acc.lastSet.String()
whenS := when.String()
klog.Errorf("Time went backwards from %s to %s for labelValues=%#+v", lastSetS, whenS, saw.labelValues)
}
for acc.lastSetInt < whenInt {
saw.samples.WithLabelValues(saw.labelValues...).Observe(acc.relX)
saw.waterMarks.WithLabelValues(saw.loLabelValues...).Observe(acc.loRelX)
saw.waterMarks.WithLabelValues(saw.hiLabelValues...).Observe(acc.hiRelX)
acc.lastSetInt++
acc.loRelX, acc.hiRelX = acc.relX, acc.relX
}
}

View File

@@ -1,52 +0,0 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
// TimedObserver gets informed about the values assigned to a variable
// `X float64` over time, and reports on the ratio `X/X1`.
type TimedObserver interface {
// Add notes a change to the variable
Add(deltaX float64)
// Set notes a setting of the variable
Set(x float64)
// SetX1 changes the value to use for X1
SetX1(x1 float64)
}
// TimedObserverGenerator creates related observers that are
// differentiated by a series of label values
type TimedObserverGenerator interface {
Generate(x, x1 float64, labelValues []string) TimedObserver
}
// TimedObserverPair is a corresponding pair of observers, one for the
// number of requests waiting in queue(s) and one for the number of
// requests being executed
type TimedObserverPair struct {
// RequestsWaiting is given observations of the number of currently queued requests
RequestsWaiting TimedObserver
// RequestsExecuting is given observations of the number of requests currently executing
RequestsExecuting TimedObserver
}
// TimedObserverPairGenerator generates pairs
type TimedObserverPairGenerator interface {
Generate(waiting1, executing1 float64, labelValues []string) TimedObserverPair
}

View File

@@ -0,0 +1,225 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"context"
"sync"
"time"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/klog/v2"
)
// TimingRatioHistogram is essentially a gauge for a ratio where the client
// independently controls the numerator and denominator.
// When scraped it produces a histogram of samples of the ratio
// taken at the end of every nanosecond.
// `*TimingRatioHistogram` implements both Registerable and RatioedGauge.
type TimingRatioHistogram struct {
// The implementation is layered on TimingHistogram,
// adding the division by an occasionally adjusted denominator.
// Registerable is the registerable aspect.
// That is the registerable aspect of the underlying TimingHistogram.
compbasemetrics.Registerable
// timingRatioHistogramInner implements the RatioedGauge aspect.
timingRatioHistogramInner
}
// TimingRatioHistogramOpts is the constructor parameters of a TimingRatioHistogram.
// The `TimingHistogramOpts.InitialValue` is the initial numerator.
type TimingRatioHistogramOpts struct {
compbasemetrics.TimingHistogramOpts
InitialDenominator float64
}
// timingRatioHistogramInner implements the instrumentation aspect
type timingRatioHistogramInner struct {
nowFunc func() time.Time
getGaugeOfRatio func() Gauge
sync.Mutex
// access only with mutex locked
numerator, denominator float64
}
var _ RatioedGauge = &timingRatioHistogramInner{}
var _ RatioedGauge = &TimingRatioHistogram{}
var _ compbasemetrics.Registerable = &TimingRatioHistogram{}
// NewTimingRatioHistogram returns an object which is TimingRatioHistogram-like. However, nothing
// will be measured until the histogram is registered in at least one registry.
func NewTimingRatioHistogram(opts *TimingRatioHistogramOpts) *TimingRatioHistogram {
return NewTestableTimingRatioHistogram(time.Now, opts)
}
// NewTestableTimingRatioHistogram adds injection of the clock
func NewTestableTimingRatioHistogram(nowFunc func() time.Time, opts *TimingRatioHistogramOpts) *TimingRatioHistogram {
ratioedOpts := opts.TimingHistogramOpts
ratioedOpts.InitialValue /= opts.InitialDenominator
th := compbasemetrics.NewTestableTimingHistogram(nowFunc, &ratioedOpts)
return &TimingRatioHistogram{
Registerable: th,
timingRatioHistogramInner: timingRatioHistogramInner{
nowFunc: nowFunc,
getGaugeOfRatio: func() Gauge { return th },
numerator: opts.InitialValue,
denominator: opts.InitialDenominator,
}}
}
func (trh *timingRatioHistogramInner) Set(numerator float64) {
trh.Lock()
defer trh.Unlock()
trh.numerator = numerator
ratio := numerator / trh.denominator
trh.getGaugeOfRatio().Set(ratio)
}
func (trh *timingRatioHistogramInner) Add(deltaNumerator float64) {
trh.Lock()
defer trh.Unlock()
numerator := trh.numerator + deltaNumerator
trh.numerator = numerator
ratio := numerator / trh.denominator
trh.getGaugeOfRatio().Set(ratio)
}
func (trh *timingRatioHistogramInner) Sub(deltaNumerator float64) {
trh.Add(-deltaNumerator)
}
func (trh *timingRatioHistogramInner) Inc() {
trh.Add(1)
}
func (trh *timingRatioHistogramInner) Dec() {
trh.Add(-1)
}
func (trh *timingRatioHistogramInner) SetToCurrentTime() {
trh.Set(float64(trh.nowFunc().Sub(time.Unix(0, 0))))
}
func (trh *timingRatioHistogramInner) SetDenominator(denominator float64) {
trh.Lock()
defer trh.Unlock()
trh.denominator = denominator
ratio := trh.numerator / denominator
trh.getGaugeOfRatio().Set(ratio)
}
// WithContext allows the normal TimingHistogram metric to pass in context.
// The context is a no-op at the current level of development.
func (trh *timingRatioHistogramInner) WithContext(ctx context.Context) RatioedGauge {
return trh
}
// TimingRatioHistogramVec is a collection of TimingRatioHistograms that differ
// only in label values.
// `*TimingRatioHistogramVec` implements both Registerable and RatioedGaugeVec.
type TimingRatioHistogramVec struct {
// promote only the Registerable methods
compbasemetrics.Registerable
// delegate is TimingHistograms of the ratio
delegate compbasemetrics.GaugeVecMetric
}
var _ RatioedGaugeVec = &TimingRatioHistogramVec{}
var _ compbasemetrics.Registerable = &TimingRatioHistogramVec{}
// NewTimingRatioHistogramVec constructs a new vector.
// `opts.InitialValue` is the initial ratio, but this applies
// only for the tiny period of time until NewForLabelValuesSafe sets
// the ratio based on the given initial numerator and denominator.
// Thus there is a tiny splinter of time during member construction when
// its underlying TimingHistogram is given the initial numerator rather than
// the initial ratio (which is obviously a non-issue when both are zero).
// Note the difficulties associated with extracting a member
// before registering the vector.
func NewTimingRatioHistogramVec(opts *compbasemetrics.TimingHistogramOpts, labelNames ...string) *TimingRatioHistogramVec {
return NewTestableTimingRatioHistogramVec(time.Now, opts, labelNames...)
}
// NewTestableTimingRatioHistogramVec adds injection of the clock.
func NewTestableTimingRatioHistogramVec(nowFunc func() time.Time, opts *compbasemetrics.TimingHistogramOpts, labelNames ...string) *TimingRatioHistogramVec {
delegate := compbasemetrics.NewTestableTimingHistogramVec(nowFunc, opts, labelNames)
return &TimingRatioHistogramVec{
Registerable: delegate,
delegate: delegate,
}
}
func (v *TimingRatioHistogramVec) metrics() Registerables {
return Registerables{v}
}
// NewForLabelValuesChecked will return an error if this vec is not hidden and not yet registered
// or there is a syntactic problem with the labelValues.
func (v *TimingRatioHistogramVec) NewForLabelValuesChecked(initialNumerator, initialDenominator float64, labelValues []string) (RatioedGauge, error) {
underMember, err := v.delegate.WithLabelValuesChecked(labelValues...)
if err != nil {
return noopRatioed{}, err
}
underMember.Set(initialNumerator / initialDenominator)
return &timingRatioHistogramInner{
getGaugeOfRatio: func() Gauge { return underMember },
numerator: initialNumerator,
denominator: initialDenominator,
}, nil
}
// NewForLabelValuesSafe is the same as NewForLabelValuesChecked in cases where that does not
// return an error. When the checked version returns an error due to the vector not being
// registered yet, the safe version returns an object that implements its methods
// by looking up the relevant vector member in each call (thus getting a non-noop after registration).
// In the other error cases the object returned here is a noop.
func (v *TimingRatioHistogramVec) NewForLabelValuesSafe(initialNumerator, initialDenominator float64, labelValues []string) RatioedGauge {
tro, err := v.NewForLabelValuesChecked(initialNumerator, initialDenominator, labelValues)
if err == nil {
klog.V(3).InfoS("TimingRatioHistogramVec.NewForLabelValuesSafe hit the efficient case", "fqName", v.FQName(), "labelValues", labelValues)
return tro
}
if !compbasemetrics.ErrIsNotRegistered(err) {
klog.ErrorS(err, "Failed to extract TimingRatioHistogramVec member, using noop instead", "vectorname", v.FQName(), "labelValues", labelValues)
return tro
}
klog.V(3).InfoS("TimingRatioHistogramVec.NewForLabelValuesSafe hit the inefficient case", "fqName", v.FQName(), "labelValues", labelValues)
// At this point we know v.NewForLabelValuesChecked(..) returns a permanent noop,
// which we precisely want to avoid using. Instead, make our own gauge that
// fetches the element on every Set.
return &timingRatioHistogramInner{
getGaugeOfRatio: func() Gauge { return v.delegate.WithLabelValues(labelValues...) },
numerator: initialNumerator,
denominator: initialDenominator,
}
}
type noopRatioed struct{}
func (noopRatioed) Set(float64) {}
func (noopRatioed) Add(float64) {}
func (noopRatioed) Sub(float64) {}
func (noopRatioed) Inc() {}
func (noopRatioed) Dec() {}
func (noopRatioed) SetToCurrentTime() {}
func (noopRatioed) SetDenominator(float64) {}
func (v *TimingRatioHistogramVec) Reset() {
v.delegate.Reset()
}
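
A usage sketch for the vector type above (the option values are made up; registering the vector with a registry is assumed but not shown):

func demoVecUsage() {
	vec := NewTimingRatioHistogramVec(
		&compbasemetrics.TimingHistogramOpts{
			Namespace: "apiserver",
			Subsystem: "flowcontrol",
			Name:      "demo_utilization",
			Help:      "sketch only",
			Buckets:   []float64{0, 0.5, 1},
		},
		"priority_level",
	)
	g := vec.NewForLabelValuesSafe(0, 10, []string{"workload-low"}) // numerator 0, denominator 10
	g.Inc()              // ratio observed as 0.1
	g.SetDenominator(20) // ratio observed as 0.05
}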

View File

@@ -0,0 +1,25 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
// RatioedGaugeVecPhasedElementPair extracts a pair of elements that differ in handling phase
func RatioedGaugeVecPhasedElementPair(vec RatioedGaugeVec, initialWaitingDenominator, initialExecutingDenominator float64, labelValues []string) RatioedGaugePair {
return RatioedGaugePair{
RequestsWaiting: vec.NewForLabelValuesSafe(0, initialWaitingDenominator, append([]string{LabelValueWaiting}, labelValues...)),
RequestsExecuting: vec.NewForLabelValuesSafe(0, initialExecutingDenominator, append([]string{LabelValueExecuting}, labelValues...)),
}
}
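
For example (a sketch; the denominators are illustrative, and PriorityLevelConcurrencyGaugeVec comes from this package's metric definitions):

func demoPair() {
	pair := RatioedGaugeVecPhasedElementPair(
		PriorityLevelConcurrencyGaugeVec,
		100, // initial denominator for the waiting gauge (total queue capacity)
		10,  // initial denominator for the executing gauge (concurrency limit)
		[]string{"workload-low"},
	)
	pair.RequestsWaiting.Inc()
	pair.RequestsExecuting.SetDenominator(12)
}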

View File

@@ -0,0 +1,92 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package request
import (
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
const (
minimumSeats = 1
maximumSeats = 10
objectsPerSeat = 100.0
watchesPerSeat = 10.0
enableMutatingWorkEstimator = true
)
var eventAdditionalDuration = 5 * time.Millisecond
// WorkEstimatorConfig holds work estimator parameters.
type WorkEstimatorConfig struct {
*ListWorkEstimatorConfig `json:"listWorkEstimatorConfig,omitempty"`
*MutatingWorkEstimatorConfig `json:"mutatingWorkEstimatorConfig,omitempty"`
// MinimumSeats is the minimum number of seats a request must occupy.
MinimumSeats uint64 `json:"minimumSeats,omitempty"`
// MaximumSeats is the maximum number of seats a request can occupy
//
// NOTE: work_estimate_seats_samples metric uses the value of maximumSeats
// as the upper bound, so when we change maximumSeats we should also
// update the buckets of the metric.
MaximumSeats uint64 `json:"maximumSeats,omitempty"`
}
// ListWorkEstimatorConfig holds work estimator parameters related to list requests.
type ListWorkEstimatorConfig struct {
ObjectsPerSeat float64 `json:"objectsPerSeat,omitempty"`
}
// MutatingWorkEstimatorConfig holds work estimator
// parameters related to watches of mutating objects.
type MutatingWorkEstimatorConfig struct {
// TODO(wojtekt): Remove it once we tune the algorithm to not fail
// scalability tests.
Enabled bool `json:"enable,omitempty"`
EventAdditionalDuration metav1.Duration `json:"eventAdditionalDurationMs,omitempty"`
WatchesPerSeat float64 `json:"watchesPerSeat,omitempty"`
}
// DefaultWorkEstimatorConfig creates a new WorkEstimatorConfig with default values.
func DefaultWorkEstimatorConfig() *WorkEstimatorConfig {
return &WorkEstimatorConfig{
MinimumSeats: minimumSeats,
MaximumSeats: maximumSeats,
ListWorkEstimatorConfig: defaultListWorkEstimatorConfig(),
MutatingWorkEstimatorConfig: defaultMutatingWorkEstimatorConfig(),
}
}
// defaultListWorkEstimatorConfig creates a new ListWorkEstimatorConfig with default values.
func defaultListWorkEstimatorConfig() *ListWorkEstimatorConfig {
return &ListWorkEstimatorConfig{ObjectsPerSeat: objectsPerSeat}
}
// defaultMutatingWorkEstimatorConfig creates a new MutatingWorkEstimatorConfig with default values.
func defaultMutatingWorkEstimatorConfig() *MutatingWorkEstimatorConfig {
return &MutatingWorkEstimatorConfig{
Enabled: enableMutatingWorkEstimator,
EventAdditionalDuration: metav1.Duration{Duration: eventAdditionalDuration},
WatchesPerSeat: watchesPerSeat,
}
}
// eventAdditionalDuration converts eventAdditionalDurationMs to a time.Duration type.
func (c *MutatingWorkEstimatorConfig) eventAdditionalDuration() time.Duration {
return c.EventAdditionalDuration.Duration
}
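
A sketch showing the defaults and the JSON shape implied by the struct tags above (assumes imports "encoding/json" and "fmt"):

func demoConfig() {
	cfg := DefaultWorkEstimatorConfig()
	// Defaults: MinimumSeats=1, MaximumSeats=10, ObjectsPerSeat=100,
	// Enabled=true, EventAdditionalDuration=5ms, WatchesPerSeat=10.
	b, _ := json.Marshal(cfg)
	// Field names follow the json tags, e.g. "eventAdditionalDurationMs",
	// and metav1.Duration marshals as a duration string such as "5ms".
	fmt.Println(string(b))
}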

View File

@@ -0,0 +1,154 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package request
import (
"math"
"net/http"
"net/url"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
apirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/features"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
)
func newListWorkEstimator(countFn objectCountGetterFunc, config *WorkEstimatorConfig) WorkEstimatorFunc {
estimator := &listWorkEstimator{
config: config,
countGetterFn: countFn,
}
return estimator.estimate
}
type listWorkEstimator struct {
config *WorkEstimatorConfig
countGetterFn objectCountGetterFunc
}
func (e *listWorkEstimator) estimate(r *http.Request, flowSchemaName, priorityLevelName string) WorkEstimate {
requestInfo, ok := apirequest.RequestInfoFrom(r.Context())
if !ok {
// a missing RequestInfo should never happen, but to be on the safe side
// let's return maximumSeats
return WorkEstimate{InitialSeats: e.config.MaximumSeats}
}
if requestInfo.Name != "" {
// Requests with metadata.name specified are usually executed as get
// requests in storage layer so their width should be 1.
// Example of such list requests:
// /apis/certificates.k8s.io/v1/certificatesigningrequests?fieldSelector=metadata.name%3Dcsr-xxs4m
// /api/v1/namespaces/test/configmaps?fieldSelector=metadata.name%3Dbig-deployment-1&limit=500&resourceVersion=0
return WorkEstimate{InitialSeats: e.config.MinimumSeats}
}
query := r.URL.Query()
listOptions := metav1.ListOptions{}
if err := metav1.Convert_url_Values_To_v1_ListOptions(&query, &listOptions, nil); err != nil {
klog.ErrorS(err, "Failed to convert options while estimating work for the list request")
// This request is destined to fail in the validation layer,
// return maximumSeats for this request to be consistent.
return WorkEstimate{InitialSeats: e.config.MaximumSeats}
}
isListFromCache := !shouldListFromStorage(query, &listOptions)
numStored, err := e.countGetterFn(key(requestInfo))
switch {
case err == ObjectCountStaleErr:
// object count going stale is indicative of degradation, so we should
// be conservative here and allocate maximum seats to this list request.
// NOTE: if a CRD is removed, its count will go stale first and then the
// pruner will eventually remove the CRD from the cache.
return WorkEstimate{InitialSeats: e.config.MaximumSeats}
case err == ObjectCountNotFoundErr:
// there are multiple scenarios in which we can see this error:
// a. the type is truly unknown, a typo on the caller's part.
// b. the count has gone stale for too long and the pruner
// has removed the type from the cache.
// c. the type is an aggregated resource that is served by a
// different apiserver (thus its object count is not updated)
// we don't have a way to distinguish between those situations.
// However, in case c, the request is delegated to a different apiserver,
// and thus its cost for our server is minimal. To avoid the situation
// when aggregated API calls are overestimated, we allocate the minimum
// possible seats (see #109106 as an example when being more conservative
// led to problems).
return WorkEstimate{InitialSeats: e.config.MinimumSeats}
case err != nil:
// we should never be here since Get returns either ObjectCountStaleErr or
// ObjectCountNotFoundErr, return maximumSeats to be on the safe side.
klog.ErrorS(err, "Unexpected error from object count tracker")
return WorkEstimate{InitialSeats: e.config.MaximumSeats}
}
limit := numStored
if utilfeature.DefaultFeatureGate.Enabled(features.APIListChunking) && listOptions.Limit > 0 &&
listOptions.Limit < numStored {
limit = listOptions.Limit
}
var estimatedObjectsToBeProcessed int64
switch {
case isListFromCache:
// TODO: For resources that implement indexes at the watchcache level,
// we need to adjust the cost accordingly
estimatedObjectsToBeProcessed = numStored
case listOptions.FieldSelector != "" || listOptions.LabelSelector != "":
estimatedObjectsToBeProcessed = numStored + limit
default:
estimatedObjectsToBeProcessed = 2 * limit
}
// for now, our rough estimate is to allocate one seat for each 100 objects that
// will be processed by the list request.
// we will come up with a different formula for the transformation function and/or
// fine tune this number in future iterations.
seats := uint64(math.Ceil(float64(estimatedObjectsToBeProcessed) / e.config.ObjectsPerSeat))
// make sure we never return a seat of zero
if seats < e.config.MinimumSeats {
seats = e.config.MinimumSeats
}
if seats > e.config.MaximumSeats {
seats = e.config.MaximumSeats
}
return WorkEstimate{InitialSeats: seats}
}
func key(requestInfo *apirequest.RequestInfo) string {
groupResource := &schema.GroupResource{
Group: requestInfo.APIGroup,
Resource: requestInfo.Resource,
}
return groupResource.String()
}
// NOTICE: Keep in sync with shouldDelegateList function in
//
// staging/src/k8s.io/apiserver/pkg/storage/cacher/cacher.go
func shouldListFromStorage(query url.Values, opts *metav1.ListOptions) bool {
resourceVersion := opts.ResourceVersion
pagingEnabled := utilfeature.DefaultFeatureGate.Enabled(features.APIListChunking)
hasContinuation := pagingEnabled && len(opts.Continue) > 0
hasLimit := pagingEnabled && opts.Limit > 0 && resourceVersion != "0"
return resourceVersion == "" || hasContinuation || hasLimit || opts.ResourceVersionMatch == metav1.ResourceVersionMatchExact
}
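
A worked example of the seat arithmetic above, with the default objectsPerSeat = 100 and seat bounds [1, 10] (the counts are illustrative): for a list with a label selector over a resource holding numStored = 750 objects and an effective limit = 500,

	estimatedObjectsToBeProcessed = numStored + limit = 1250
	seats = ceil(1250 / 100) = 13, clamped to maximumSeats = 10

whereas the same list served from the watch cache would use numStored alone: ceil(750 / 100) = 8 seats.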

View File

@@ -0,0 +1,131 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package request
import (
"math"
"net/http"
"time"
apirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/util/flowcontrol/metrics"
)
func newMutatingWorkEstimator(countFn watchCountGetterFunc, config *WorkEstimatorConfig) WorkEstimatorFunc {
estimator := &mutatingWorkEstimator{
config: config,
countFn: countFn,
}
return estimator.estimate
}
type mutatingWorkEstimator struct {
config *WorkEstimatorConfig
countFn watchCountGetterFunc
}
func (e *mutatingWorkEstimator) estimate(r *http.Request, flowSchemaName, priorityLevelName string) WorkEstimate {
// TODO(wojtekt): Remove once we tune the algorithm to not fail
// scalability tests.
if !e.config.Enabled {
return WorkEstimate{
InitialSeats: 1,
}
}
requestInfo, ok := apirequest.RequestInfoFrom(r.Context())
if !ok {
// a missing RequestInfo should never happen, but to be on the safe side
// let's return a large value.
return WorkEstimate{
InitialSeats: 1,
FinalSeats: e.config.MaximumSeats,
AdditionalLatency: e.config.eventAdditionalDuration(),
}
}
watchCount := e.countFn(requestInfo)
metrics.ObserveWatchCount(r.Context(), priorityLevelName, flowSchemaName, watchCount)
// The cost of the request associated with the watchers of that event
// consists of three parts:
// - cost of going through the event change logic
// - cost of serialization of the event
// - cost of processing an event object for each watcher (e.g. filtering,
// sending data over network)
// We're starting simple to get some operational experience with it and
// we will work on tuning the algorithm later. Given that the actual work
// associated with processing watch events is happening in multiple
// goroutines (proportional to the number of watchers) that are all
// resumed at once, as a starting point we assume that each such goroutine
// is taking 1/Nth of a seat for M milliseconds.
// We allow the accounting of that work in P&F to be reshaped into another
// rectangle of equal area for practical reasons.
var finalSeats uint64
var additionalLatency time.Duration
// TODO: Make this unconditional after we tune the algorithm better.
// Technically, there is an overhead connected to processing an event after
// the request finishes even if there is a small number of watches.
// However, until we tune the estimation we want to stay on the safe side
// and avoid introducing additional latency for almost every single request.
if watchCount >= int(e.config.WatchesPerSeat) {
// TODO: As described in the KEP, we should take into account that not all
// events are equal and try to estimate the cost of a single event based on
// some historical data about size of events.
finalSeats = uint64(math.Ceil(float64(watchCount) / e.config.WatchesPerSeat))
finalWork := SeatsTimesDuration(float64(finalSeats), e.config.eventAdditionalDuration())
// While processing individual events is highly parallel,
// the design/implementation of P&F has a couple limitations that
// make using this assumption in the P&F implementation very
// inefficient because:
// - we reserve max(initialSeats, finalSeats) for time of executing
// both phases of the request
// - even more importantly, when a given `wide` request is the one to
// be dispatched, we are not dispatching any other request until
// we accumulate enough seats to dispatch the nominated one, even
// if currently unoccupied seats would allow for dispatching some
// other requests in the meantime
// As a consequence of these, the wider the request, the more capacity
// will effectively be blocked and unused during dispatching and
// executing this request.
//
// To mitigate the impact of it, we're capping the maximum number of
// seats that can be assigned to a given request. Thanks to it:
// 1) we reduce the amount of seat-seconds that are "wasted" during
// dispatching and executing initial phase of the request
// 2) we are not changing the finalWork estimate - just potentially
// reshaping it to be narrower and longer. As long as the maximum
// seats setting prevents dispatching too many requests at once,
// and thus protects kube-apiserver (and/or etcd or the VM or
// a physical machine it is running on) from overload, we believe the
// relaxed version should be good enough to achieve the P&F goals.
//
// TODO: Confirm that the current cap of maximumSeats allows us to
// achieve the above.
if finalSeats > e.config.MaximumSeats {
finalSeats = e.config.MaximumSeats
}
additionalLatency = finalWork.DurationPerSeat(float64(finalSeats))
}
return WorkEstimate{
InitialSeats: 1,
FinalSeats: finalSeats,
AdditionalLatency: additionalLatency,
}
}
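// A worked numeric sketch of the cost computation above (illustrative,
// not upstream code; config values assumed: WatchesPerSeat=10,
// eventAdditionalDuration=5ms, MaximumSeats=10):
func exampleMutatingCost() (uint64, time.Duration) {
	const watchesPerSeat = 10.0           // assumed config value
	const maximumSeats = uint64(10)       // assumed config value
	eventDuration := 5 * time.Millisecond // assumed additional duration per event
	watchCount := 500
	// 500 watchers at 10 watches per seat -> 50 seats of post-request work
	finalSeats := uint64(math.Ceil(float64(watchCount) / watchesPerSeat))
	// 50 seats * 5ms = 0.25 seat-seconds of final-phase work
	finalWork := SeatsTimesDuration(float64(finalSeats), eventDuration)
	if finalSeats > maximumSeats {
		finalSeats = maximumSeats // reshape: narrower...
	}
	// ...and longer: 0.25 seat-seconds over 10 seats = 25ms of extra latency
	return finalSeats, finalWork.DurationPerSeat(float64(finalSeats))
}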

View File

@@ -0,0 +1,169 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package request
import (
"errors"
"sync"
"time"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
)
const (
// type deletion (it applies mostly to CRDs) is not a very frequent
// operation, so we can afford to prune the cache at a large interval.
// at the same time, we also want to make sure that the scalability
// tests hit this code path.
pruneInterval = 1 * time.Hour
// the storage layer polls for the object count at a 1m interval; we allow
// up to 2-3 transient failures to get the latest count for a given resource.
staleTolerationThreshold = 3 * time.Minute
)
var (
// ObjectCountNotFoundErr is returned when the object count for
// a given resource is not being tracked.
ObjectCountNotFoundErr = errors.New("object count not found for the given resource")
// ObjectCountStaleErr is returned when the object count for a
// given resource has gone stale due to transient failures.
ObjectCountStaleErr = errors.New("object count has gone stale for the given resource")
)
// StorageObjectCountTracker is an interface that is used to keep track
// of the total number of objects for each resource.
// {resource}.{group} (the GroupResource string) is used as the key name
// to update and retrieve the total number of objects for a given resource.
type StorageObjectCountTracker interface {
// Set is invoked to update the current number of total
// objects for the given resource
Set(string, int64)
// Get returns the total number of objects for the given resource.
// The following errors are returned:
// - if the count has gone stale for a given resource due to transient
// failures ObjectCountStaleErr is returned.
// - if the given resource is not being tracked then
// ObjectCountNotFoundErr is returned.
Get(string) (int64, error)
// RunUntil starts all the necessary maintenance.
RunUntil(stopCh <-chan struct{})
}
// NewStorageObjectCountTracker returns an instance of
// StorageObjectCountTracker interface that can be used to
// keep track of the total number of objects for each resource.
func NewStorageObjectCountTracker() StorageObjectCountTracker {
return &objectCountTracker{
clock: &clock.RealClock{},
counts: map[string]*timestampedCount{},
}
}
// timestampedCount stores the count of a given resource with a last updated
// timestamp so we can prune it after it has been stale for a certain threshold.
type timestampedCount struct {
count int64
lastUpdatedAt time.Time
}
// objectCountTracker implements StorageObjectCountTracker with
// a reader/writer mutual exclusion lock.
type objectCountTracker struct {
clock clock.PassiveClock
lock sync.RWMutex
counts map[string]*timestampedCount
}
func (t *objectCountTracker) Set(groupResource string, count int64) {
if count <= -1 {
// a value of -1 indicates that the 'Count' call failed to contact
// the storage layer, in most cases this error can be transient.
// we will continue to work with the count that is in the cache
// up to a certain threshold defined by staleTolerationThreshold.
// in case this becomes a non-transient error then the count for
// the given resource will eventually be removed from
// the cache by the pruner.
return
}
now := t.clock.Now()
// lock for writing
t.lock.Lock()
defer t.lock.Unlock()
if item, ok := t.counts[groupResource]; ok {
item.count = count
item.lastUpdatedAt = now
return
}
t.counts[groupResource] = &timestampedCount{
count: count,
lastUpdatedAt: now,
}
}
func (t *objectCountTracker) Get(groupResource string) (int64, error) {
staleThreshold := t.clock.Now().Add(-staleTolerationThreshold)
t.lock.RLock()
defer t.lock.RUnlock()
if item, ok := t.counts[groupResource]; ok {
if item.lastUpdatedAt.Before(staleThreshold) {
return item.count, ObjectCountStaleErr
}
return item.count, nil
}
return 0, ObjectCountNotFoundErr
}
// RunUntil runs all the necessary maintenance.
func (t *objectCountTracker) RunUntil(stopCh <-chan struct{}) {
wait.PollUntil(
pruneInterval,
func() (bool, error) {
// always prune at every pruneInterval
return false, t.prune(pruneInterval)
}, stopCh)
klog.InfoS("StorageObjectCountTracker pruner is exiting")
}
func (t *objectCountTracker) prune(threshold time.Duration) error {
oldestLastUpdatedAtAllowed := t.clock.Now().Add(-threshold)
// lock for writing
t.lock.Lock()
defer t.lock.Unlock()
for groupResource, count := range t.counts {
if count.lastUpdatedAt.After(oldestLastUpdatedAtAllowed) {
continue
}
delete(t.counts, groupResource)
}
return nil
}
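// A minimal usage sketch of the tracker (illustrative, not upstream code):
func exampleTrackerUsage() {
	tracker := NewStorageObjectCountTracker()
	stopCh := make(chan struct{})
	defer close(stopCh)
	go tracker.RunUntil(stopCh) // starts the hourly pruner
	// keys follow GroupResource.String(), e.g. "pods" or "deployments.apps"
	tracker.Set("pods", 5000)
	tracker.Set("pods", -1) // transient storage failure: ignored, cached value kept
	if count, err := tracker.Get("pods"); err == nil {
		_ = count // 5000, until the entry goes stale after staleTolerationThreshold
	}
}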

View File

@@ -0,0 +1,65 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package request
import (
"fmt"
"math"
"time"
)
// SeatSeconds is a measure of work, in units of seat-seconds, using a fixed-point representation.
// `SeatSeconds(n)` represents `n/ssScale` seat-seconds.
// The `ssScale` constant is private to the implementation here,
// no other code should use it.
type SeatSeconds uint64
// MaxSeatSeconds is the maximum representable value of SeatSeconds
const MaxSeatSeconds = SeatSeconds(math.MaxUint64)
// MinSeatSeconds is the lowest representable value of SeatSeconds
const MinSeatSeconds = SeatSeconds(0)
// SeatsTimesDuration produces the SeatSeconds value for the given factors.
// This is intended only to produce small values, increments in work
// rather than amount of work done since process start.
func SeatsTimesDuration(seats float64, duration time.Duration) SeatSeconds {
return SeatSeconds(math.Round(seats * float64(duration/time.Nanosecond) / (1e9 / ssScale)))
}
// ToFloat converts to a floating-point representation.
// This conversion may lose precision.
func (ss SeatSeconds) ToFloat() float64 {
return float64(ss) / ssScale
}
// DurationPerSeat returns duration per seat.
// This division may lose precision.
func (ss SeatSeconds) DurationPerSeat(seats float64) time.Duration {
return time.Duration(float64(ss) / seats * (float64(time.Second) / ssScale))
}
// String converts to a string.
// This is suitable for large as well as small values.
func (ss SeatSeconds) String() string {
const div = SeatSeconds(ssScale)
quo := ss / div
rem := ss - quo*div
return fmt.Sprintf("%d.%08dss", quo, rem)
}
const ssScale = 1e8
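// A worked example of the fixed-point representation (a sketch, not
// upstream code): 2.5 seats held for 3 seconds is 7.5 seat-seconds,
// stored as SeatSeconds(750000000) given ssScale = 1e8.
func exampleSeatSeconds() {
	ss := SeatsTimesDuration(2.5, 3*time.Second)
	_ = ss.ToFloat()            // 7.5
	_ = ss.DurationPerSeat(2.5) // 3s
	_ = ss.String()             // "7.50000000ss"
}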

View File

@@ -0,0 +1,113 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package request
import (
"fmt"
"net/http"
"time"
apirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/klog/v2"
)
// WorkEstimate carries three of the four parameters that determine the work in a request.
// The fourth parameter is the duration of the initial phase of execution.
type WorkEstimate struct {
// InitialSeats is the number of seats occupied while the server is
// executing this request.
InitialSeats uint64
// FinalSeats is the number of seats occupied at the end,
// during the AdditionalLatency.
FinalSeats uint64
// AdditionalLatency specifies the additional duration the seats allocated
// to this request must be reserved after the given request has finished.
// AdditionalLatency should not have any impact on the user experience; the
// caller must not experience this additional latency.
AdditionalLatency time.Duration
}
// MaxSeats returns the maximum number of seats the request occupies over the
// phases of being served.
func (we *WorkEstimate) MaxSeats() int {
if we.InitialSeats >= we.FinalSeats {
return int(we.InitialSeats)
}
return int(we.FinalSeats)
}
// objectCountGetterFunc represents a function that gets the total
// number of objects for a given resource.
type objectCountGetterFunc func(string) (int64, error)
// watchCountGetterFunc represents a function that gets the total
// number of watchers potentially interested in a given request.
type watchCountGetterFunc func(*apirequest.RequestInfo) int
// NewWorkEstimator estimates the work that will be done by a given request.
// If no WorkEstimatorFunc matches the given request, then the default
// work estimate of 1 seat is allocated to the request.
func NewWorkEstimator(objectCountFn objectCountGetterFunc, watchCountFn watchCountGetterFunc, config *WorkEstimatorConfig) WorkEstimatorFunc {
estimator := &workEstimator{
minimumSeats: config.MinimumSeats,
maximumSeats: config.MaximumSeats,
listWorkEstimator: newListWorkEstimator(objectCountFn, config),
mutatingWorkEstimator: newMutatingWorkEstimator(watchCountFn, config),
}
return estimator.estimate
}
// WorkEstimatorFunc returns the estimated work of a given request.
// This function will be used by the Priority & Fairness filter to
// estimate the work of incoming requests.
type WorkEstimatorFunc func(request *http.Request, flowSchemaName, priorityLevelName string) WorkEstimate
func (e WorkEstimatorFunc) EstimateWork(r *http.Request, flowSchemaName, priorityLevelName string) WorkEstimate {
return e(r, flowSchemaName, priorityLevelName)
}
type workEstimator struct {
// the minimum number of seats a request must occupy
minimumSeats uint64
// the maximum number of seats a request can occupy
maximumSeats uint64
// listWorkEstimator estimates work for list request(s)
listWorkEstimator WorkEstimatorFunc
// mutatingWorkEstimator calculates the width of mutating request(s)
mutatingWorkEstimator WorkEstimatorFunc
}
func (e *workEstimator) estimate(r *http.Request, flowSchemaName, priorityLevelName string) WorkEstimate {
requestInfo, ok := apirequest.RequestInfoFrom(r.Context())
if !ok {
klog.ErrorS(fmt.Errorf("no RequestInfo found in context"), "Failed to estimate work for the request", "URI", r.RequestURI)
// a missing RequestInfo should never happen, but to be on the safe side let's return maximumSeats
return WorkEstimate{InitialSeats: e.maximumSeats}
}
switch requestInfo.Verb {
case "list":
return e.listWorkEstimator.EstimateWork(r, flowSchemaName, priorityLevelName)
case "create", "update", "patch", "delete":
return e.mutatingWorkEstimator.EstimateWork(r, flowSchemaName, priorityLevelName)
}
return WorkEstimate{InitialSeats: e.minimumSeats}
}
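// A minimal wiring sketch (illustrative, not upstream code): the
// object-count tracker's Get feeds the list estimator, and a watch count
// getter (normally a WatchTracker's GetInterestedWatchCount) feeds the
// mutating estimator.
func exampleWiring(cfg *WorkEstimatorConfig) WorkEstimatorFunc {
	counts := NewStorageObjectCountTracker()
	watchCountFn := func(*apirequest.RequestInfo) int { return 0 } // stand-in watch counter
	return NewWorkEstimator(counts.Get, watchCountFn, cfg)
}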

View File

@@ -19,7 +19,7 @@ package flowcontrol
import (
"strings"
flowcontrol "k8s.io/api/flowcontrol/v1beta1"
flowcontrol "k8s.io/api/flowcontrol/v1beta2"
"k8s.io/apiserver/pkg/authentication/serviceaccount"
"k8s.io/apiserver/pkg/authentication/user"
"k8s.io/apiserver/pkg/endpoints/request"

View File

@@ -0,0 +1,234 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package flowcontrol
import (
"net/http"
"sync"
metainternalversion "k8s.io/apimachinery/pkg/apis/meta/internalversion"
"k8s.io/apimachinery/pkg/apis/meta/internalversion/scheme"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/klog/v2"
)
// readOnlyVerbs contains verbs for read-only requests.
var readOnlyVerbs = sets.NewString("get", "list", "watch", "proxy")
// watchIdentifier identifies a group of watches that are similar.
// As described in the "Priority and Fairness" KEP, we consider
// watches similar if they have the same resourceType, namespace
// and name. We ignore selectors as they have to be evaluated
// when processing an event anyway.
//
// TODO: For now we only track the number of watches registered
// in our kube-apiserver. Eventually we should consider sharing
// this information with other kube-apiservers as described in the
// KEP, but this isn't part of the first version.
type watchIdentifier struct {
apiGroup string
resource string
namespace string
name string
}
// ForgetWatchFunc is a function that should be called to forget
// the previously registered watch from the watch tracker.
type ForgetWatchFunc func()
// WatchTracker is an interface that allows tracking the number
// of watches in the system for the purpose of estimating the
// cost of incoming mutating requests.
type WatchTracker interface {
// RegisterWatch registers a watch based on the provided http.Request
// in the tracker. It returns the function that should be called
// to forget the watcher once it is finished.
RegisterWatch(r *http.Request) ForgetWatchFunc
// GetInterestedWatchCount returns the number of watches that are
// potentially interested in a request with a given RequestInfo
// for the purpose of estimating cost of that request.
GetInterestedWatchCount(requestInfo *request.RequestInfo) int
}
// builtinIndexes represents a set of indexes registered in the
// watchcache that index watches and increase the speed of
// their processing.
// We define the indexes as a map from a resource to the path
// to the field in the object on which the index is built.
type builtinIndexes map[string]string
func getBuiltinIndexes() builtinIndexes {
// The only existing indexes as of now are:
// - spec.nodeName for pods
// - metadata.Name for nodes, secrets and configmaps
// However, we can ignore the latter, because the requestInfo.Name
// is set for them (i.e. we already catch them correctly).
return map[string]string{
"pods": "spec.nodeName",
}
}
// watchTracker tracks the number of watches in the system for
// the purpose of estimating the cost of incoming mutating requests.
type watchTracker struct {
// indexes represents a set of registered indexes.
// It can't change after creation.
indexes builtinIndexes
lock sync.Mutex
watchCount map[watchIdentifier]int
}
func NewWatchTracker() WatchTracker {
return &watchTracker{
indexes: getBuiltinIndexes(),
watchCount: make(map[watchIdentifier]int),
}
}
const (
unsetValue = "<unset>"
)
func getIndexValue(r *http.Request, field string) string {
opts := metainternalversion.ListOptions{}
if err := scheme.ParameterCodec.DecodeParameters(r.URL.Query(), metav1.SchemeGroupVersion, &opts); err != nil {
klog.Warningf("Couldn't parse list options for %v: %v", r.URL.Query(), err)
return unsetValue
}
if opts.FieldSelector == nil {
return unsetValue
}
if value, ok := opts.FieldSelector.RequiresExactMatch(field); ok {
return value
}
return unsetValue
}
type indexValue struct {
resource string
value string
}
// RegisterWatch implements WatchTracker interface.
func (w *watchTracker) RegisterWatch(r *http.Request) ForgetWatchFunc {
requestInfo, ok := request.RequestInfoFrom(r.Context())
if !ok || requestInfo == nil || requestInfo.Verb != "watch" {
return nil
}
var index *indexValue
if indexField, ok := w.indexes[requestInfo.Resource]; ok {
index = &indexValue{
resource: requestInfo.Resource,
value: getIndexValue(r, indexField),
}
}
identifier := &watchIdentifier{
apiGroup: requestInfo.APIGroup,
resource: requestInfo.Resource,
namespace: requestInfo.Namespace,
name: requestInfo.Name,
}
w.lock.Lock()
defer w.lock.Unlock()
w.updateIndexLocked(identifier, index, 1)
return w.forgetWatch(identifier, index)
}
func (w *watchTracker) updateIndexLocked(identifier *watchIdentifier, index *indexValue, incr int) {
if index == nil {
w.watchCount[*identifier] += incr
} else {
// For resources with defined index, for a given watch event we are
// only processing the watchers that:
// (a) do not specify field selector for an index field
// (b) do specify field selector with the value equal to the value
// coming from the processed object
//
// TODO(wojtek-t): For the sake of making progress and initially
// simplifying the implementation, we approximate (b) for all values
// as the value for an empty string. The assumption we're making here
// is that the difference between the actual number of watchers that
// will be processed, i.e. (a)+(b) above and the one from our
// approximation i.e. (a)+[(b) for field value of ""] will be small.
// This seems to be true in almost all production clusters, which makes
// it a reasonable first-step simplification to unblock progress on it.
if index.value == unsetValue || index.value == "" {
w.watchCount[*identifier] += incr
}
}
}
func (w *watchTracker) forgetWatch(identifier *watchIdentifier, index *indexValue) ForgetWatchFunc {
return func() {
w.lock.Lock()
defer w.lock.Unlock()
w.updateIndexLocked(identifier, index, -1)
if w.watchCount[*identifier] == 0 {
delete(w.watchCount, *identifier)
}
}
}
// GetInterestedWatchCount implements WatchTracker interface.
//
// TODO(wojtek-t): As of now, requestInfo for object creation (POST) doesn't
// contain the Name field set. Figure out if we can somehow get it for the
// more accurate cost estimation.
//
// TODO(wojtek-t): Figure out how to approach DELETECOLLECTION calls.
func (w *watchTracker) GetInterestedWatchCount(requestInfo *request.RequestInfo) int {
if requestInfo == nil || readOnlyVerbs.Has(requestInfo.Verb) {
return 0
}
result := 0
// The watches that we're interested in include:
// - watches for all objects of a resource type (no namespace and name specified)
// - watches for all objects of a resource type in the same namespace (no name specified)
// - watches interested in this particular object
identifier := &watchIdentifier{
apiGroup: requestInfo.APIGroup,
resource: requestInfo.Resource,
}
w.lock.Lock()
defer w.lock.Unlock()
result += w.watchCount[*identifier]
if requestInfo.Namespace != "" {
identifier.namespace = requestInfo.Namespace
result += w.watchCount[*identifier]
}
if requestInfo.Name != "" {
identifier.name = requestInfo.Name
result += w.watchCount[*identifier]
}
return result
}
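// Illustrative (a sketch, not upstream code): with one watch on all pods
// cluster-wide, one on all pods in namespace "ns1", and one on pod
// "ns1/p1", an update to pod "ns1/p1" is charged for all three buckets,
// because the identifier above is progressively narrowed:
func exampleInterestedWatchCount(w WatchTracker) int {
	ri := &request.RequestInfo{
		Verb:      "update",
		Resource:  "pods",
		Namespace: "ns1",
		Name:      "p1",
	}
	return w.GetInterestedWatchCount(ri) // 3 in the scenario above
}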