update vendor

Signed-off-by: Roland.Ma <rolandma@yunify.com>
This commit is contained in:
Roland.Ma
2021-08-11 07:10:14 +00:00
parent a18f72b565
commit ea8f47c73a
2901 changed files with 269317 additions and 43103 deletions

View File

@@ -15,7 +15,6 @@ reviewers:
- mikedanese
- liggitt
- ncdc
- tallclair
- timothysc
- hongchaodeng
- krousey
@@ -23,7 +22,4 @@ reviewers:
- mml
- ingvagabund
- resouer
- mbohlool
- mqliang
- rrati
- enj

View File

@@ -39,29 +39,11 @@ import (
"k8s.io/apiserver/pkg/storage"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/tools/cache"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog"
"k8s.io/klog/v2"
utiltrace "k8s.io/utils/trace"
)
/*
* By default, all the following metrics are defined as falling under
* ALPHA stability level https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/20190404-kubernetes-control-plane-metrics-stability.md#stability-classes)
*
* Promoting the stability level of the metric is a responsibility of the component owner, since it
* involves explicitly acknowledging support for the metric across multiple releases, in accordance with
* the metric stability policy.
*/
var (
initCounter = metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "apiserver_init_events_total",
Help: "Counter of init events processed in watchcache broken by resource type",
StabilityLevel: metrics.ALPHA,
},
[]string{"resource"},
)
emptyFunc = func() {}
)
@@ -69,17 +51,15 @@ const (
// storageWatchListPageSize is the cacher's request chunk size of
// initial and resync watch lists to storage.
storageWatchListPageSize = int64(10000)
// defaultBookmarkFrequency defines how frequently watch bookmarks should be send
// in addition to sending a bookmark right before watch deadline.
//
// NOTE: Update `eventFreshDuration` when changing this value.
defaultBookmarkFrequency = time.Minute
)
func init() {
legacyregistry.MustRegister(initCounter)
}
// Config contains the configuration for a given Cache.
type Config struct {
// Maximum size of the history cached in memory.
CacheCapacity int
// An underlying storage.Interface.
Storage storage.Interface
@@ -112,6 +92,8 @@ type Config struct {
NewListFunc func() runtime.Object
Codec runtime.Codec
Clock clock.Clock
}
type watchersMap map[int]*cacheWatcher
@@ -176,28 +158,33 @@ func (i *indexedWatchers) terminateAll(objectType reflect.Type, done func(*cache
// second in a bucket, and pop up them once at the timeout. To be more specific,
// if you set fire time at X, you can get the bookmark within (X-1,X+1) period.
type watcherBookmarkTimeBuckets struct {
lock sync.Mutex
watchersBuckets map[int64][]*cacheWatcher
startBucketID int64
clock clock.Clock
lock sync.Mutex
// the key of watcherBuckets is the number of seconds since createTime
watchersBuckets map[int64][]*cacheWatcher
createTime time.Time
startBucketID int64
clock clock.Clock
bookmarkFrequency time.Duration
}
func newTimeBucketWatchers(clock clock.Clock) *watcherBookmarkTimeBuckets {
func newTimeBucketWatchers(clock clock.Clock, bookmarkFrequency time.Duration) *watcherBookmarkTimeBuckets {
return &watcherBookmarkTimeBuckets{
watchersBuckets: make(map[int64][]*cacheWatcher),
startBucketID: clock.Now().Unix(),
clock: clock,
watchersBuckets: make(map[int64][]*cacheWatcher),
createTime: clock.Now(),
startBucketID: 0,
clock: clock,
bookmarkFrequency: bookmarkFrequency,
}
}
// adds a watcher to the bucket, if the deadline is before the start, it will be
// added to the first one.
func (t *watcherBookmarkTimeBuckets) addWatcher(w *cacheWatcher) bool {
nextTime, ok := w.nextBookmarkTime(t.clock.Now())
nextTime, ok := w.nextBookmarkTime(t.clock.Now(), t.bookmarkFrequency)
if !ok {
return false
}
bucketID := nextTime.Unix()
bucketID := int64(nextTime.Sub(t.createTime) / time.Second)
t.lock.Lock()
defer t.lock.Unlock()
if bucketID < t.startBucketID {
@@ -209,7 +196,7 @@ func (t *watcherBookmarkTimeBuckets) addWatcher(w *cacheWatcher) bool {
}
func (t *watcherBookmarkTimeBuckets) popExpiredWatchers() [][]*cacheWatcher {
currentBucketID := t.clock.Now().Unix()
currentBucketID := int64(t.clock.Since(t.createTime) / time.Second)
// There should be one or two elements in almost all cases
expiredWatchers := make([][]*cacheWatcher, 0, 2)
t.lock.Lock()
@@ -279,7 +266,7 @@ type Cacher struct {
// Defines a time budget that can be spend on waiting for not-ready watchers
// while dispatching event before shutting them down.
dispatchTimeoutBudget *timeBudget
dispatchTimeoutBudget timeBudget
// Handling graceful termination.
stopLock sync.RWMutex
@@ -336,11 +323,14 @@ func NewCacherFromConfig(config Config) (*Cacher, error) {
}
}
clock := clock.RealClock{}
if config.Clock == nil {
config.Clock = clock.RealClock{}
}
objType := reflect.TypeOf(obj)
cacher := &Cacher{
ready: newReady(),
storage: config.Storage,
objectType: reflect.TypeOf(obj),
objectType: objType,
versioner: config.Versioner,
newFunc: config.NewFunc,
indexedTrigger: indexedTrigger,
@@ -358,9 +348,9 @@ func NewCacherFromConfig(config Config) (*Cacher, error) {
// and there are no guarantees on the order that they will stop.
// So we will be simply closing the channel, and synchronizing on the WaitGroup.
stopCh: stopCh,
clock: clock,
clock: config.Clock,
timer: time.NewTimer(time.Duration(0)),
bookmarkWatchers: newTimeBucketWatchers(clock),
bookmarkWatchers: newTimeBucketWatchers(config.Clock, defaultBookmarkFrequency),
}
// Ensure that timer is stopped.
@@ -371,7 +361,7 @@ func NewCacherFromConfig(config Config) (*Cacher, error) {
}
watchCache := newWatchCache(
config.CacheCapacity, config.KeyFunc, cacher.processEvent, config.GetAttrsFunc, config.Versioner, config.Indexers)
config.KeyFunc, cacher.processEvent, config.GetAttrsFunc, config.Versioner, config.Indexers, config.Clock, objType)
listerWatcher := NewCacherListerWatcher(config.Storage, config.ResourcePrefix, config.NewListFunc)
reflectorName := "storage/cacher.go:" + config.ResourcePrefix
@@ -412,6 +402,7 @@ func (c *Cacher) startCaching(stopChannel <-chan struct{}) {
c.watchCache.SetOnReplace(func() {
successfulList = true
c.ready.set(true)
klog.V(1).Infof("cacher (%v): initialized", c.objectType.String())
})
defer func() {
if successfulList {
@@ -425,7 +416,7 @@ func (c *Cacher) startCaching(stopChannel <-chan struct{}) {
// Also note that startCaching is called in a loop, so there's no need
// to have another loop here.
if err := c.reflector.ListAndWatch(stopChannel); err != nil {
klog.Errorf("unexpected ListAndWatch error: %v", err)
klog.Errorf("cacher (%v): unexpected ListAndWatch error: %v; reinitializing...", c.objectType.String(), err)
}
}
@@ -440,13 +431,27 @@ func (c *Cacher) Create(ctx context.Context, key string, obj, out runtime.Object
}
// Delete implements storage.Interface.
func (c *Cacher) Delete(ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions, validateDeletion storage.ValidateObjectFunc) error {
return c.storage.Delete(ctx, key, out, preconditions, validateDeletion)
func (c *Cacher) Delete(
ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions,
validateDeletion storage.ValidateObjectFunc, _ runtime.Object) error {
// Ignore the suggestion and try to pass down the current version of the object
// read from cache.
if elem, exists, err := c.watchCache.GetByKey(key); err != nil {
klog.Errorf("GetByKey returned error: %v", err)
} else if exists {
// DeepCopy the object since we modify resource version when serializing the
// current object.
currObj := elem.(*storeElement).Object.DeepCopyObject()
return c.storage.Delete(ctx, key, out, preconditions, validateDeletion, currObj)
}
// If we couldn't get the object, fallback to no-suggestion.
return c.storage.Delete(ctx, key, out, preconditions, validateDeletion, nil)
}
// Watch implements storage.Interface.
func (c *Cacher) Watch(ctx context.Context, key string, resourceVersion string, pred storage.SelectionPredicate) (watch.Interface, error) {
watchRV, err := c.versioner.ParseResourceVersion(resourceVersion)
func (c *Cacher) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) {
pred := opts.Predicate
watchRV, err := c.versioner.ParseResourceVersion(opts.ResourceVersion)
if err != nil {
return nil, err
}
@@ -482,11 +487,14 @@ func (c *Cacher) Watch(ctx context.Context, key string, resourceVersion string,
// Determine watch timeout('0' means deadline is not set, ignore checking)
deadline, _ := ctx.Deadline()
identifier := fmt.Sprintf("key: %q, labels: %q, fields: %q", key, pred.Label, pred.Field)
// Create a watcher here to reduce memory allocations under lock,
// given that memory allocation may trigger GC and block the thread.
// Also note that emptyFunc is a placeholder, until we will be able
// to compute watcher.forget function (which has to happen under lock).
watcher := newCacheWatcher(chanSize, filterWithAttrsFunction(key, pred), emptyFunc, c.versioner, deadline, pred.AllowWatchBookmarks, c.objectType)
watcher := newCacheWatcher(chanSize, filterWithAttrsFunction(key, pred), emptyFunc, c.versioner, deadline, pred.AllowWatchBookmarks, c.objectType, identifier)
// We explicitly use thread unsafe version and do locking ourself to ensure that
// no new events will be processed in the meantime. The watchCache will be unlocked
@@ -529,22 +537,22 @@ func (c *Cacher) Watch(ctx context.Context, key string, resourceVersion string,
}
// WatchList implements storage.Interface.
func (c *Cacher) WatchList(ctx context.Context, key string, resourceVersion string, pred storage.SelectionPredicate) (watch.Interface, error) {
return c.Watch(ctx, key, resourceVersion, pred)
func (c *Cacher) WatchList(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) {
return c.Watch(ctx, key, opts)
}
// Get implements storage.Interface.
func (c *Cacher) Get(ctx context.Context, key string, resourceVersion string, objPtr runtime.Object, ignoreNotFound bool) error {
if resourceVersion == "" {
func (c *Cacher) Get(ctx context.Context, key string, opts storage.GetOptions, objPtr runtime.Object) error {
if opts.ResourceVersion == "" {
// If resourceVersion is not specified, serve it from underlying
// storage (for backward compatibility).
return c.storage.Get(ctx, key, resourceVersion, objPtr, ignoreNotFound)
return c.storage.Get(ctx, key, opts, objPtr)
}
// If resourceVersion is specified, serve it from cache.
// It's guaranteed that the returned value is at least that
// fresh as the given resourceVersion.
getRV, err := c.versioner.ParseResourceVersion(resourceVersion)
getRV, err := c.versioner.ParseResourceVersion(opts.ResourceVersion)
if err != nil {
return err
}
@@ -552,7 +560,7 @@ func (c *Cacher) Get(ctx context.Context, key string, resourceVersion string, ob
if getRV == 0 && !c.ready.check() {
// If Cacher is not yet initialized and we don't require any specific
// minimal resource version, simply forward the request to storage.
return c.storage.Get(ctx, key, resourceVersion, objPtr, ignoreNotFound)
return c.storage.Get(ctx, key, opts, objPtr)
}
// Do not create a trace - it's not for free and there are tons
@@ -577,7 +585,7 @@ func (c *Cacher) Get(ctx context.Context, key string, resourceVersion string, ob
objVal.Set(reflect.ValueOf(elem.Object).Elem())
} else {
objVal.Set(reflect.Zero(objVal.Type()))
if !ignoreNotFound {
if !opts.IgnoreNotFound {
return storage.NewKeyNotFoundError(key, int64(readResourceVersion))
}
}
@@ -585,18 +593,20 @@ func (c *Cacher) Get(ctx context.Context, key string, resourceVersion string, ob
}
// GetToList implements storage.Interface.
func (c *Cacher) GetToList(ctx context.Context, key string, resourceVersion string, pred storage.SelectionPredicate, listObj runtime.Object) error {
func (c *Cacher) GetToList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error {
resourceVersion := opts.ResourceVersion
pred := opts.Predicate
pagingEnabled := utilfeature.DefaultFeatureGate.Enabled(features.APIListChunking)
hasContinuation := pagingEnabled && len(pred.Continue) > 0
hasLimit := pagingEnabled && pred.Limit > 0 && resourceVersion != "0"
if resourceVersion == "" || hasContinuation || hasLimit {
if resourceVersion == "" || hasContinuation || hasLimit || opts.ResourceVersionMatch == metav1.ResourceVersionMatchExact {
// If resourceVersion is not specified, serve it from underlying
// storage (for backward compatibility). If a continuation is
// requested, serve it from the underlying storage as well.
// Limits are only sent to storage when resourceVersion is non-zero
// since the watch cache isn't able to perform continuations, and
// limits are ignored when resource version is zero
return c.storage.GetToList(ctx, key, resourceVersion, pred, listObj)
return c.storage.GetToList(ctx, key, opts, listObj)
}
// If resourceVersion is specified, serve it from cache.
@@ -610,7 +620,7 @@ func (c *Cacher) GetToList(ctx context.Context, key string, resourceVersion stri
if listRV == 0 && !c.ready.check() {
// If Cacher is not yet initialized and we don't require any specific
// minimal resource version, simply forward the request to storage.
return c.storage.GetToList(ctx, key, resourceVersion, pred, listObj)
return c.storage.GetToList(ctx, key, opts, listObj)
}
trace := utiltrace.New("cacher list", utiltrace.Field{"type", c.objectType.String()})
@@ -657,18 +667,20 @@ func (c *Cacher) GetToList(ctx context.Context, key string, resourceVersion stri
}
// List implements storage.Interface.
func (c *Cacher) List(ctx context.Context, key string, resourceVersion string, pred storage.SelectionPredicate, listObj runtime.Object) error {
func (c *Cacher) List(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error {
resourceVersion := opts.ResourceVersion
pred := opts.Predicate
pagingEnabled := utilfeature.DefaultFeatureGate.Enabled(features.APIListChunking)
hasContinuation := pagingEnabled && len(pred.Continue) > 0
hasLimit := pagingEnabled && pred.Limit > 0 && resourceVersion != "0"
if resourceVersion == "" || hasContinuation || hasLimit {
if resourceVersion == "" || hasContinuation || hasLimit || opts.ResourceVersionMatch == metav1.ResourceVersionMatchExact {
// If resourceVersion is not specified, serve it from underlying
// storage (for backward compatibility). If a continuation is
// requested, serve it from the underlying storage as well.
// Limits are only sent to storage when resourceVersion is non-zero
// since the watch cache isn't able to perform continuations, and
// limits are ignored when resource version is zero.
return c.storage.List(ctx, key, resourceVersion, pred, listObj)
return c.storage.List(ctx, key, opts, listObj)
}
// If resourceVersion is specified, serve it from cache.
@@ -682,7 +694,7 @@ func (c *Cacher) List(ctx context.Context, key string, resourceVersion string, p
if listRV == 0 && !c.ready.check() {
// If Cacher is not yet initialized and we don't require any specific
// minimal resource version, simply forward the request to storage.
return c.storage.List(ctx, key, resourceVersion, pred, listObj)
return c.storage.List(ctx, key, opts, listObj)
}
trace := utiltrace.New("cacher list", utiltrace.Field{"type", c.objectType.String()})
@@ -737,17 +749,19 @@ func (c *Cacher) List(ctx context.Context, key string, resourceVersion string, p
// GuaranteedUpdate implements storage.Interface.
func (c *Cacher) GuaranteedUpdate(
ctx context.Context, key string, ptrToType runtime.Object, ignoreNotFound bool,
preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, _ ...runtime.Object) error {
preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, _ runtime.Object) error {
// Ignore the suggestion and try to pass down the current version of the object
// read from cache.
if elem, exists, err := c.watchCache.GetByKey(key); err != nil {
klog.Errorf("GetByKey returned error: %v", err)
} else if exists {
// DeepCopy the object since we modify resource version when serializing the
// current object.
currObj := elem.(*storeElement).Object.DeepCopyObject()
return c.storage.GuaranteedUpdate(ctx, key, ptrToType, ignoreNotFound, preconditions, tryUpdate, currObj)
}
// If we couldn't get the object, fallback to no-suggestion.
return c.storage.GuaranteedUpdate(ctx, key, ptrToType, ignoreNotFound, preconditions, tryUpdate)
return c.storage.GuaranteedUpdate(ctx, key, ptrToType, ignoreNotFound, preconditions, tryUpdate, nil)
}
// Count implements storage.Interface.
@@ -800,7 +814,19 @@ func (c *Cacher) dispatchEvents() {
if !ok {
return
}
c.dispatchEvent(&event)
// Don't dispatch bookmarks coming from the storage layer.
// They can be very frequent (even to the level of subseconds)
// to allow efficient watch resumption on kube-apiserver restarts,
// and propagating them down may overload the whole system.
//
// TODO: If at some point we decide the performance and scalability
// footprint is acceptable, this is the place to hook them in.
// However, we then need to check if this was called as a result
// of a bookmark event or regular Add/Update/Delete operation by
// checking if resourceVersion here has changed.
if event.Type != watch.Bookmark {
c.dispatchEvent(&event)
}
lastProcessedResourceVersion = event.ResourceVersion
case <-bookmarkTimer.C():
bookmarkTimer.Reset(wait.Jitter(time.Second, 0.25))
@@ -935,9 +961,8 @@ func (c *Cacher) startDispatchingBookmarkEvents() {
continue
}
c.watchersBuffer = append(c.watchersBuffer, watcher)
// Given that we send bookmark event once at deadline-2s, never push again
// after the watcher pops up from the buckets. Once we decide to change the
// strategy to more sophisticated, we may need it here.
// Requeue the watcher for the next bookmark if needed.
c.bookmarkWatchers.addWatcher(watcher)
}
}
}
@@ -1098,7 +1123,7 @@ func (lw *cacherListerWatcher) List(options metav1.ListOptions) (runtime.Object,
Continue: options.Continue,
}
if err := lw.storage.List(context.TODO(), lw.resourcePrefix, "", pred, list); err != nil {
if err := lw.storage.List(context.TODO(), lw.resourcePrefix, storage.ListOptions{ResourceVersionMatch: options.ResourceVersionMatch, Predicate: pred}, list); err != nil {
return nil, err
}
return list, nil
@@ -1106,7 +1131,14 @@ func (lw *cacherListerWatcher) List(options metav1.ListOptions) (runtime.Object,
// Implements cache.ListerWatcher interface.
func (lw *cacherListerWatcher) Watch(options metav1.ListOptions) (watch.Interface, error) {
return lw.storage.WatchList(context.TODO(), lw.resourcePrefix, options.ResourceVersion, storage.Everything)
opts := storage.ListOptions{
ResourceVersion: options.ResourceVersion,
Predicate: storage.Everything,
}
if utilfeature.DefaultFeatureGate.Enabled(features.EfficientWatchResumption) {
opts.ProgressNotify = true
}
return lw.storage.WatchList(context.TODO(), lw.resourcePrefix, opts)
}
// errWatcher implements watch.Interface to return a single error
@@ -1165,9 +1197,13 @@ type cacheWatcher struct {
allowWatchBookmarks bool
// Object type of the cache watcher interests
objectType reflect.Type
// human readable identifier that helps assigning cacheWatcher
// instance with request
identifier string
}
func newCacheWatcher(chanSize int, filter filterWithAttrsFunc, forget func(), versioner storage.Versioner, deadline time.Time, allowWatchBookmarks bool, objectType reflect.Type) *cacheWatcher {
func newCacheWatcher(chanSize int, filter filterWithAttrsFunc, forget func(), versioner storage.Versioner, deadline time.Time, allowWatchBookmarks bool, objectType reflect.Type, identifier string) *cacheWatcher {
return &cacheWatcher{
input: make(chan *watchCacheEvent, chanSize),
result: make(chan watch.Event, chanSize),
@@ -1179,6 +1215,7 @@ func newCacheWatcher(chanSize int, filter filterWithAttrsFunc, forget func(), ve
deadline: deadline,
allowWatchBookmarks: allowWatchBookmarks,
objectType: objectType,
identifier: identifier,
}
}
@@ -1221,7 +1258,8 @@ func (c *cacheWatcher) add(event *watchCacheEvent, timer *time.Timer) bool {
// This means that we couldn't send event to that watcher.
// Since we don't want to block on it infinitely,
// we simply terminate it.
klog.V(1).Infof("Forcing watcher close due to unresponsiveness: %v", c.objectType.String())
klog.V(1).Infof("Forcing %v watcher close due to unresponsiveness: %v. len(c.input) = %v, len(c.result) = %v", c.objectType.String(), c.identifier, len(c.input), len(c.result))
terminatedWatchersCounter.WithLabelValues(c.objectType.String()).Inc()
c.forget()
}
@@ -1240,13 +1278,28 @@ func (c *cacheWatcher) add(event *watchCacheEvent, timer *time.Timer) bool {
}
}
func (c *cacheWatcher) nextBookmarkTime(now time.Time) (time.Time, bool) {
// For now we return 2s before deadline (and maybe +infinity is now already passed this time)
// but it gives us extensibility for the future(false when deadline is not set).
func (c *cacheWatcher) nextBookmarkTime(now time.Time, bookmarkFrequency time.Duration) (time.Time, bool) {
// We try to send bookmarks:
// (a) roughly every minute
// (b) right before the watcher timeout - for now we simply set it 2s before
// the deadline
// The former gives us periodicity if the watch breaks due to unexpected
// conditions, the later ensures that on timeout the watcher is as close to
// now as possible - this covers 99% of cases.
heartbeatTime := now.Add(bookmarkFrequency)
if c.deadline.IsZero() {
return c.deadline, false
// Timeout is set by our client libraries (e.g. reflector) as well as defaulted by
// apiserver if properly configured. So this shoudln't happen in practice.
return heartbeatTime, true
}
return c.deadline.Add(-2 * time.Second), true
if pretimeoutTime := c.deadline.Add(-2 * time.Second); pretimeoutTime.Before(heartbeatTime) {
heartbeatTime = pretimeoutTime
}
if heartbeatTime.Before(now) {
return time.Time{}, false
}
return heartbeatTime, true
}
func getEventObject(object runtime.Object) runtime.Object {
@@ -1357,7 +1410,7 @@ func (c *cacheWatcher) process(ctx context.Context, initEvents []*watchCacheEven
}
processingTime := time.Since(startTime)
if processingTime > initProcessThreshold {
klog.V(2).Infof("processing %d initEvents of %s took %v", len(initEvents), objType, processingTime)
klog.V(2).Infof("processing %d initEvents of %s (%s) took %v", len(initEvents), objType, c.identifier, processingTime)
}
defer close(c.result)

View File

@@ -30,7 +30,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog"
"k8s.io/klog/v2"
)
var _ runtime.CacheableObject = &cachingObject{}

95
vendor/k8s.io/apiserver/pkg/storage/cacher/metrics.go generated vendored Normal file
View File

@@ -0,0 +1,95 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
/*
* By default, all the following metrics are defined as falling under
* ALPHA stability level https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/20190404-kubernetes-control-plane-metrics-stability.md#stability-classes)
*
* Promoting the stability level of the metric is a responsibility of the component owner, since it
* involves explicitly acknowledging support for the metric across multiple releases, in accordance with
* the metric stability policy.
*/
var (
initCounter = metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "apiserver_init_events_total",
Help: "Counter of init events processed in watchcache broken by resource type.",
StabilityLevel: metrics.ALPHA,
},
[]string{"resource"},
)
terminatedWatchersCounter = metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "apiserver_terminated_watchers_total",
Help: "Counter of watchers closed due to unresponsiveness broken by resource type.",
StabilityLevel: metrics.ALPHA,
},
[]string{"resource"},
)
watchCacheCapacityIncreaseTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "watch_cache_capacity_increase_total",
Help: "Total number of watch cache capacity increase events broken by resource type.",
StabilityLevel: metrics.ALPHA,
},
[]string{"resource"},
)
watchCacheCapacityDecreaseTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "watch_cache_capacity_decrease_total",
Help: "Total number of watch cache capacity decrease events broken by resource type.",
StabilityLevel: metrics.ALPHA,
},
[]string{"resource"},
)
watchCacheCapacity = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Name: "watch_cache_capacity",
Help: "Total capacity of watch cache broken by resource type.",
StabilityLevel: metrics.ALPHA,
},
[]string{"resource"},
)
)
func init() {
legacyregistry.MustRegister(initCounter)
legacyregistry.MustRegister(terminatedWatchersCounter)
legacyregistry.MustRegister(watchCacheCapacityIncreaseTotal)
legacyregistry.MustRegister(watchCacheCapacityDecreaseTotal)
legacyregistry.MustRegister(watchCacheCapacity)
}
// recordsWatchCacheCapacityChange record watchCache capacity resize(increase or decrease) operations.
func recordsWatchCacheCapacityChange(objType string, old, new int) {
if old < new {
watchCacheCapacityIncreaseTotal.WithLabelValues(objType).Inc()
return
}
watchCacheCapacityDecreaseTotal.WithLabelValues(objType).Inc()
watchCacheCapacity.WithLabelValues(objType).Set(float64(new))
}

View File

@@ -39,7 +39,12 @@ const (
// NOTE: It's not recommended to be used concurrently from multiple threads -
// if first user takes the whole timeout, the second one will get 0 timeout
// even though the first one may return something later.
type timeBudget struct {
type timeBudget interface {
takeAvailable() time.Duration
returnUnused(unused time.Duration)
}
type timeBudgetImpl struct {
sync.Mutex
budget time.Duration
@@ -47,8 +52,8 @@ type timeBudget struct {
maxBudget time.Duration
}
func newTimeBudget(stopCh <-chan struct{}) *timeBudget {
result := &timeBudget{
func newTimeBudget(stopCh <-chan struct{}) timeBudget {
result := &timeBudgetImpl{
budget: time.Duration(0),
refresh: refreshPerSecond,
maxBudget: maxBudget,
@@ -57,7 +62,7 @@ func newTimeBudget(stopCh <-chan struct{}) *timeBudget {
return result
}
func (t *timeBudget) periodicallyRefresh(stopCh <-chan struct{}) {
func (t *timeBudgetImpl) periodicallyRefresh(stopCh <-chan struct{}) {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
@@ -74,7 +79,7 @@ func (t *timeBudget) periodicallyRefresh(stopCh <-chan struct{}) {
}
}
func (t *timeBudget) takeAvailable() time.Duration {
func (t *timeBudgetImpl) takeAvailable() time.Duration {
t.Lock()
defer t.Unlock()
result := t.budget
@@ -82,7 +87,7 @@ func (t *timeBudget) takeAvailable() time.Duration {
return result
}
func (t *timeBudget) returnUnused(unused time.Duration) {
func (t *timeBudgetImpl) returnUnused(unused time.Duration) {
t.Lock()
defer t.Unlock()
if unused < 0 {

View File

@@ -44,3 +44,17 @@ func hasPathPrefix(s, pathPrefix string) bool {
}
return false
}
func max(a, b int) int {
if a > b {
return a
}
return b
}
func min(a, b int) int {
if a < b {
return a
}
return b
}

View File

@@ -18,6 +18,7 @@ package cacher
import (
"fmt"
"reflect"
"sort"
"sync"
"time"
@@ -30,7 +31,7 @@ import (
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/storage"
"k8s.io/client-go/tools/cache"
"k8s.io/klog"
"k8s.io/klog/v2"
utiltrace "k8s.io/utils/trace"
)
@@ -44,6 +45,19 @@ const (
// resourceVersionTooHighRetrySeconds is the seconds before a operation should be retried by the client
// after receiving a 'too high resource version' error.
resourceVersionTooHighRetrySeconds = 1
// eventFreshDuration is time duration of events we want to keep.
// We set it to `defaultBookmarkFrequency` plus epsilon to maximize
// chances that last bookmark was sent within kept history, at the
// same time, minimizing the needed memory usage.
eventFreshDuration = 75 * time.Second
// defaultLowerBoundCapacity is a default value for event cache capacity's lower bound.
// TODO: Figure out, to what value we can decreased it.
defaultLowerBoundCapacity = 100
// defaultUpperBoundCapacity should be able to keep eventFreshDuration of history.
defaultUpperBoundCapacity = 100 * 1024
)
// watchCacheEvent is a single "watch event" that is send to users of
@@ -60,6 +74,7 @@ type watchCacheEvent struct {
PrevObjFields fields.Set
Key string
ResourceVersion uint64
RecordTime time.Time
}
// Computing a key of an object is generally non-trivial (it performs
@@ -126,6 +141,12 @@ type watchCache struct {
// Maximum size of history window.
capacity int
// upper bound of capacity since event cache has a dynamic size.
upperBoundCapacity int
// lower bound of capacity since event cache has a dynamic size.
lowerBoundCapacity int
// keyFunc is used to get a key in the underlying storage for a given object.
keyFunc func(runtime.Object) (string, error)
@@ -165,29 +186,38 @@ type watchCache struct {
// An underlying storage.Versioner.
versioner storage.Versioner
// cacher's objectType.
objectType reflect.Type
}
func newWatchCache(
capacity int,
keyFunc func(runtime.Object) (string, error),
eventHandler func(*watchCacheEvent),
getAttrsFunc func(runtime.Object) (labels.Set, fields.Set, error),
versioner storage.Versioner,
indexers *cache.Indexers) *watchCache {
indexers *cache.Indexers,
clock clock.Clock,
objectType reflect.Type) *watchCache {
wc := &watchCache{
capacity: capacity,
capacity: defaultLowerBoundCapacity,
keyFunc: keyFunc,
getAttrsFunc: getAttrsFunc,
cache: make([]*watchCacheEvent, capacity),
cache: make([]*watchCacheEvent, defaultLowerBoundCapacity),
lowerBoundCapacity: defaultLowerBoundCapacity,
upperBoundCapacity: defaultUpperBoundCapacity,
startIndex: 0,
endIndex: 0,
store: cache.NewIndexer(storeElementKey, storeElementIndexers(indexers)),
resourceVersion: 0,
listResourceVersion: 0,
eventHandler: eventHandler,
clock: clock.RealClock{},
clock: clock,
versioner: versioner,
objectType: objectType,
}
objType := objectType.String()
watchCacheCapacity.WithLabelValues(objType).Set(float64(wc.capacity))
wc.cond = sync.NewCond(wc.RLocker())
return wc
}
@@ -260,6 +290,7 @@ func (w *watchCache) processEvent(event watch.Event, resourceVersion uint64, upd
ObjFields: elem.Fields,
Key: key,
ResourceVersion: resourceVersion,
RecordTime: w.clock.Now(),
}
if err := func() error {
@@ -291,8 +322,9 @@ func (w *watchCache) processEvent(event watch.Event, resourceVersion uint64, upd
}
// Avoid calling event handler under lock.
// This is safe as long as there is at most one call to processEvent in flight
// at any point in time.
// This is safe as long as there is at most one call to Add/Update/Delete and
// UpdateResourceVersion in flight at any point in time, which is true now,
// because reflector calls them synchronously from its main thread.
if w.eventHandler != nil {
w.eventHandler(wcEvent)
}
@@ -301,7 +333,8 @@ func (w *watchCache) processEvent(event watch.Event, resourceVersion uint64, upd
// Assumes that lock is already held for write.
func (w *watchCache) updateCache(event *watchCacheEvent) {
if w.endIndex == w.startIndex+w.capacity {
w.resizeCacheLocked(event.RecordTime)
if w.isCacheFullLocked() {
// Cache is full - remove the oldest element.
w.startIndex++
}
@@ -309,6 +342,74 @@ func (w *watchCache) updateCache(event *watchCacheEvent) {
w.endIndex++
}
// resizeCacheLocked resizes the cache if necessary:
// - increases capacity by 2x if cache is full and all cached events occurred within last eventFreshDuration.
// - decreases capacity by 2x when recent quarter of events occurred outside of eventFreshDuration(protect watchCache from flapping).
func (w *watchCache) resizeCacheLocked(eventTime time.Time) {
if w.isCacheFullLocked() && eventTime.Sub(w.cache[w.startIndex%w.capacity].RecordTime) < eventFreshDuration {
capacity := min(w.capacity*2, w.upperBoundCapacity)
if capacity > w.capacity {
w.doCacheResizeLocked(capacity)
}
return
}
if w.isCacheFullLocked() && eventTime.Sub(w.cache[(w.endIndex-w.capacity/4)%w.capacity].RecordTime) > eventFreshDuration {
capacity := max(w.capacity/2, w.lowerBoundCapacity)
if capacity < w.capacity {
w.doCacheResizeLocked(capacity)
}
return
}
}
// isCacheFullLocked used to judge whether watchCacheEvent is full.
// Assumes that lock is already held for write.
func (w *watchCache) isCacheFullLocked() bool {
return w.endIndex == w.startIndex+w.capacity
}
// doCacheResizeLocked resize watchCache's event array with different capacity.
// Assumes that lock is already held for write.
func (w *watchCache) doCacheResizeLocked(capacity int) {
newCache := make([]*watchCacheEvent, capacity)
if capacity < w.capacity {
// adjust startIndex if cache capacity shrink.
w.startIndex = w.endIndex - capacity
}
for i := w.startIndex; i < w.endIndex; i++ {
newCache[i%capacity] = w.cache[i%w.capacity]
}
w.cache = newCache
recordsWatchCacheCapacityChange(w.objectType.String(), w.capacity, capacity)
w.capacity = capacity
}
func (w *watchCache) UpdateResourceVersion(resourceVersion string) {
rv, err := w.versioner.ParseResourceVersion(resourceVersion)
if err != nil {
klog.Errorf("Couldn't parse resourceVersion: %v", err)
return
}
func() {
w.Lock()
defer w.Unlock()
w.resourceVersion = rv
}()
// Avoid calling event handler under lock.
// This is safe as long as there is at most one call to Add/Update/Delete and
// UpdateResourceVersion in flight at any point in time, which is true now,
// because reflector calls them synchronously from its main thread.
if w.eventHandler != nil {
wcEvent := &watchCacheEvent{
Type: watch.Bookmark,
ResourceVersion: rv,
}
w.eventHandler(wcEvent)
}
}
// List returns list of pointers to <storeElement> objects.
func (w *watchCache) List() []interface{} {
return w.store.List()
@@ -460,19 +561,16 @@ func (w *watchCache) GetAllEventsSinceThreadUnsafe(resourceVersion uint64) ([]*w
size := w.endIndex - w.startIndex
var oldest uint64
switch {
case size >= w.capacity:
// Once the watch event buffer is full, the oldest watch event we can deliver
// is the first one in the buffer.
oldest = w.cache[w.startIndex%w.capacity].ResourceVersion
case w.listResourceVersion > 0:
// If the watch event buffer isn't full, the oldest watch event we can deliver
// is one greater than the resource version of the last full list.
case w.listResourceVersion > 0 && w.startIndex == 0:
// If no event was removed from the buffer since last relist, the oldest watch
// event we can deliver is one greater than the resource version of the list.
oldest = w.listResourceVersion + 1
case size > 0:
// If we've never completed a list, use the resourceVersion of the oldest event
// in the buffer.
// This should only happen in unit tests that populate the buffer without
// performing list/replace operations.
// If the previous condition is not satisfied: either some event was already
// removed from the buffer or we've never completed a list (the latter can
// only happen in unit tests that populate the buffer without performing
// list/replace operations), the oldest watch event we can deliver is the first
// one in the buffer.
oldest = w.cache[w.startIndex%w.capacity].ResourceVersion
default:
return nil, fmt.Errorf("watch cache isn't correctly initialized")

View File

@@ -3,5 +3,4 @@
reviewers:
- wojtek-t
- timothysc
- madhusudancs
- hongchaodeng

View File

@@ -17,6 +17,7 @@ limitations under the License.
package etcd3
import (
"fmt"
"strconv"
"k8s.io/apimachinery/pkg/api/meta"
@@ -45,14 +46,14 @@ func (a APIObjectVersioner) UpdateObject(obj runtime.Object, resourceVersion uin
// UpdateList implements Versioner
func (a APIObjectVersioner) UpdateList(obj runtime.Object, resourceVersion uint64, nextKey string, count *int64) error {
if resourceVersion == 0 {
return fmt.Errorf("illegal resource version from storage: %d", resourceVersion)
}
listAccessor, err := meta.ListAccessor(obj)
if err != nil || listAccessor == nil {
return err
}
versionString := ""
if resourceVersion != 0 {
versionString = strconv.FormatUint(resourceVersion, 10)
}
versionString := strconv.FormatUint(resourceVersion, 10)
listAccessor.SetResourceVersion(versionString)
listAccessor.SetContinue(nextKey)
listAccessor.SetRemainingItemCount(count)

View File

@@ -23,7 +23,7 @@ import (
"time"
"go.etcd.io/etcd/clientv3"
"k8s.io/klog"
"k8s.io/klog/v2"
)
const (

View File

@@ -23,12 +23,13 @@ import (
)
type event struct {
key string
value []byte
prevValue []byte
rev int64
isDeleted bool
isCreated bool
key string
value []byte
prevValue []byte
rev int64
isDeleted bool
isCreated bool
isProgressNotify bool
}
// parseKV converts a KeyValue retrieved from an initial sync() listing to a synthetic isCreated event.
@@ -61,3 +62,10 @@ func parseEvent(e *clientv3.Event) (*event, error) {
}
return ret, nil
}
func progressNotifyEvent(rev int64) *event {
return &event{
rev: rev,
isProgressNotify: true,
}
}

View File

@@ -22,8 +22,30 @@ import (
"time"
"go.etcd.io/etcd/clientv3"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
)
const (
defaultLeaseReuseDurationSeconds = 60
defaultLeaseMaxObjectCount = 1000
)
// LeaseManagerConfig is configuration for creating a lease manager.
type LeaseManagerConfig struct {
// ReuseDurationSeconds specifies time in seconds that each lease is reused
ReuseDurationSeconds int64
// MaxObjectCount specifies how many objects that a lease can attach
MaxObjectCount int64
}
// NewDefaultLeaseManagerConfig creates a LeaseManagerConfig with default values
func NewDefaultLeaseManagerConfig() LeaseManagerConfig {
return LeaseManagerConfig{
ReuseDurationSeconds: defaultLeaseReuseDurationSeconds,
MaxObjectCount: defaultLeaseMaxObjectCount,
}
}
// leaseManager is used to manage leases requested from etcd. If a new write
// needs a lease that has similar expiration time to the previous one, the old
// lease will be reused to reduce the overhead of etcd, since lease operations
@@ -36,35 +58,33 @@ type leaseManager struct {
prevLeaseExpirationTime time.Time
// The period of time in seconds and percent of TTL that each lease is
// reused. The minimum of them is used to avoid unreasonably large
// numbers. We use var instead of const for testing purposes.
leaseReuseDurationSeconds int64
leaseReuseDurationPercent float64
// numbers.
leaseReuseDurationSeconds int64
leaseReuseDurationPercent float64
leaseMaxAttachedObjectCount int64
leaseAttachedObjectCount int64
}
// newDefaultLeaseManager creates a new lease manager using default setting.
func newDefaultLeaseManager(client *clientv3.Client) *leaseManager {
return newLeaseManager(client, 60, 0.05)
func newDefaultLeaseManager(client *clientv3.Client, config LeaseManagerConfig) *leaseManager {
if config.MaxObjectCount <= 0 {
config.MaxObjectCount = defaultLeaseMaxObjectCount
}
return newLeaseManager(client, config.ReuseDurationSeconds, 0.05, config.MaxObjectCount)
}
// newLeaseManager creates a new lease manager with the number of buffered
// leases, lease reuse duration in seconds and percentage. The percentage
// value x means x*100%.
func newLeaseManager(client *clientv3.Client, leaseReuseDurationSeconds int64, leaseReuseDurationPercent float64) *leaseManager {
func newLeaseManager(client *clientv3.Client, leaseReuseDurationSeconds int64, leaseReuseDurationPercent float64, maxObjectCount int64) *leaseManager {
return &leaseManager{
client: client,
leaseReuseDurationSeconds: leaseReuseDurationSeconds,
leaseReuseDurationPercent: leaseReuseDurationPercent,
client: client,
leaseReuseDurationSeconds: leaseReuseDurationSeconds,
leaseReuseDurationPercent: leaseReuseDurationPercent,
leaseMaxAttachedObjectCount: maxObjectCount,
}
}
// setLeaseReuseDurationSeconds is used for testing purpose. It is used to
// reduce the extra lease duration to avoid unnecessary timeout in testing.
func (l *leaseManager) setLeaseReuseDurationSeconds(duration int64) {
l.leaseMu.Lock()
defer l.leaseMu.Unlock()
l.leaseReuseDurationSeconds = duration
}
// GetLease returns a lease based on requested ttl: if the cached previous
// lease can be reused, reuse it; otherwise request a new one from etcd.
func (l *leaseManager) GetLease(ctx context.Context, ttl int64) (clientv3.LeaseID, error) {
@@ -75,9 +95,15 @@ func (l *leaseManager) GetLease(ctx context.Context, ttl int64) (clientv3.LeaseI
reuseDurationSeconds := l.getReuseDurationSecondsLocked(ttl)
valid := now.Add(time.Duration(ttl) * time.Second).Before(l.prevLeaseExpirationTime)
sufficient := now.Add(time.Duration(ttl+reuseDurationSeconds) * time.Second).After(l.prevLeaseExpirationTime)
if valid && sufficient {
// We count all operations that happened in the same lease, regardless of success or failure.
// Currently each GetLease call only attach 1 object
l.leaseAttachedObjectCount++
if valid && sufficient && l.leaseAttachedObjectCount <= l.leaseMaxAttachedObjectCount {
return l.prevLeaseID, nil
}
// request a lease with a little extra ttl from etcd
ttl += reuseDurationSeconds
lcr, err := l.client.Lease.Grant(ctx, ttl)
@@ -87,6 +113,9 @@ func (l *leaseManager) GetLease(ctx context.Context, ttl int64) (clientv3.LeaseI
// cache the new lease id
l.prevLeaseID = lcr.ID
l.prevLeaseExpirationTime = now.Add(time.Duration(ttl) * time.Second)
// refresh count
metrics.UpdateLeaseObjectCount(l.leaseAttachedObjectCount)
l.leaseAttachedObjectCount = 1
return lcr.ID, nil
}

View File

@@ -20,7 +20,7 @@ import (
"fmt"
"go.etcd.io/etcd/clientv3"
"k8s.io/klog"
"k8s.io/klog/v2"
)
func init() {
@@ -80,5 +80,5 @@ func (klogWrapper) Fatalf(format string, args ...interface{}) {
}
func (klogWrapper) V(l int) bool {
return bool(klog.V(klog.Level(l)))
return bool(klog.V(klog.Level(l)).Enabled())
}

View File

@@ -0,0 +1,4 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- logicalhan

View File

@@ -26,7 +26,7 @@ import (
/*
* By default, all the following metrics are defined as falling under
* ALPHA stability level https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/20190404-kubernetes-control-plane-metrics-stability.md#stability-classes)
* ALPHA stability level https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/20190404-kubernetes-control-plane-metrics-stability.md#stability-classes)
*
* Promoting the stability level of the metric is a responsibility of the component owner, since it
* involves explicitly acknowledging support for the metric across multiple releases, in accordance with
@@ -35,20 +35,56 @@ import (
var (
etcdRequestLatency = compbasemetrics.NewHistogramVec(
&compbasemetrics.HistogramOpts{
Name: "etcd_request_duration_seconds",
Help: "Etcd request latency in seconds for each operation and object type.",
Name: "etcd_request_duration_seconds",
Help: "Etcd request latency in seconds for each operation and object type.",
// Etcd request latency in seconds for each operation and object type.
Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 15.0, 30.0, 60.0},
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"operation", "type"},
)
etcdObjectCounts = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "etcd_object_counts",
DeprecatedVersion: "1.22.0",
Help: "Number of stored objects at the time of last check split by kind. This metric is replaced by apiserver_storage_object_counts.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
objectCounts = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "etcd_object_counts",
Name: "apiserver_storage_objects",
Help: "Number of stored objects at the time of last check split by kind.",
StabilityLevel: compbasemetrics.STABLE,
},
[]string{"resource"},
)
dbTotalSize = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "etcd_db_total_size_in_bytes",
Help: "Total size of the etcd database file physically allocated in bytes.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"endpoint"},
)
etcdBookmarkCounts = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "etcd_bookmark_counts",
Help: "Number of etcd bookmarks (progress notify events) split by kind.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
etcdLeaseObjectCounts = compbasemetrics.NewHistogramVec(
&compbasemetrics.HistogramOpts{
Name: "etcd_lease_object_counts",
Help: "Number of objects attached to a single etcd lease.",
Buckets: []float64{10, 50, 100, 500, 1000, 2500, 5000},
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{},
)
)
var registerMetrics sync.Once
@@ -59,12 +95,17 @@ func Register() {
registerMetrics.Do(func() {
legacyregistry.MustRegister(etcdRequestLatency)
legacyregistry.MustRegister(objectCounts)
legacyregistry.MustRegister(etcdObjectCounts)
legacyregistry.MustRegister(dbTotalSize)
legacyregistry.MustRegister(etcdBookmarkCounts)
legacyregistry.MustRegister(etcdLeaseObjectCounts)
})
}
// UpdateObjectCount sets the etcd_object_counts metric.
// UpdateObjectCount sets the apiserver_storage_object_counts and etcd_object_counts (deprecated) metric.
func UpdateObjectCount(resourcePrefix string, count int64) {
objectCounts.WithLabelValues(resourcePrefix).Set(float64(count))
etcdObjectCounts.WithLabelValues(resourcePrefix).Set(float64(count))
}
// RecordEtcdRequestLatency sets the etcd_request_duration_seconds metrics.
@@ -72,6 +113,11 @@ func RecordEtcdRequestLatency(verb, resource string, startTime time.Time) {
etcdRequestLatency.WithLabelValues(verb, resource).Observe(sinceInSeconds(startTime))
}
// RecordEtcdBookmark updates the etcd_bookmark_counts metric.
func RecordEtcdBookmark(resource string) {
etcdBookmarkCounts.WithLabelValues(resource).Inc()
}
// Reset resets the etcd_request_duration_seconds metric.
func Reset() {
etcdRequestLatency.Reset()
@@ -81,3 +127,15 @@ func Reset() {
func sinceInSeconds(start time.Time) float64 {
return time.Since(start).Seconds()
}
// UpdateEtcdDbSize sets the etcd_db_total_size_in_bytes metric.
func UpdateEtcdDbSize(ep string, size int64) {
dbTotalSize.WithLabelValues(ep).Set(float64(size))
}
// UpdateLeaseObjectCount sets the etcd_lease_object_counts metric.
func UpdateLeaseObjectCount(count int64) {
// Currently we only store one previous lease, since all the events have the same ttl.
// See pkg/storage/etcd3/lease_manager.go
etcdLeaseObjectCounts.WithLabelValues().Observe(float64(count))
}

View File

@@ -32,6 +32,8 @@ import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/conversion"
"k8s.io/apimachinery/pkg/runtime"
@@ -41,7 +43,7 @@ import (
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
"k8s.io/apiserver/pkg/storage/value"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog"
"k8s.io/klog/v2"
utiltrace "k8s.io/utils/trace"
)
@@ -62,10 +64,7 @@ func (d authenticatedDataString) AuthenticatedData() []byte {
var _ value.Context = authenticatedDataString("")
type store struct {
client *clientv3.Client
// getOpts contains additional options that should be passed
// to all Get() calls.
getOps []clientv3.OpOption
client *clientv3.Client
codec runtime.Codec
versioner storage.Versioner
transformer value.Transformer
@@ -84,11 +83,11 @@ type objState struct {
}
// New returns an etcd3 implementation of storage.Interface.
func New(c *clientv3.Client, codec runtime.Codec, prefix string, transformer value.Transformer, pagingEnabled bool) storage.Interface {
return newStore(c, pagingEnabled, codec, prefix, transformer)
func New(c *clientv3.Client, codec runtime.Codec, newFunc func() runtime.Object, prefix string, transformer value.Transformer, pagingEnabled bool, leaseManagerConfig LeaseManagerConfig) storage.Interface {
return newStore(c, codec, newFunc, prefix, transformer, pagingEnabled, leaseManagerConfig)
}
func newStore(c *clientv3.Client, pagingEnabled bool, codec runtime.Codec, prefix string, transformer value.Transformer) *store {
func newStore(c *clientv3.Client, codec runtime.Codec, newFunc func() runtime.Object, prefix string, transformer value.Transformer, pagingEnabled bool, leaseManagerConfig LeaseManagerConfig) *store {
versioner := APIObjectVersioner{}
result := &store{
client: c,
@@ -100,8 +99,8 @@ func newStore(c *clientv3.Client, pagingEnabled bool, codec runtime.Codec, prefi
// no-op for default prefix of '/registry'.
// keeps compatibility with etcd2 impl for custom prefixes that don't start with '/'
pathPrefix: path.Join("/", prefix),
watcher: newWatcher(c, codec, versioner, transformer),
leaseManager: newDefaultLeaseManager(c),
watcher: newWatcher(c, codec, newFunc, versioner, transformer),
leaseManager: newDefaultLeaseManager(c, leaseManagerConfig),
}
return result
}
@@ -112,20 +111,20 @@ func (s *store) Versioner() storage.Versioner {
}
// Get implements storage.Interface.Get.
func (s *store) Get(ctx context.Context, key string, resourceVersion string, out runtime.Object, ignoreNotFound bool) error {
func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, out runtime.Object) error {
key = path.Join(s.pathPrefix, key)
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key, s.getOps...)
getResp, err := s.client.KV.Get(ctx, key)
metrics.RecordEtcdRequestLatency("get", getTypeName(out), startTime)
if err != nil {
return err
}
if err = s.ensureMinimumResourceVersion(resourceVersion, uint64(getResp.Header.Revision)); err != nil {
if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil {
return err
}
if len(getResp.Kvs) == 0 {
if ignoreNotFound {
if opts.IgnoreNotFound {
return runtime.SetZeroValue(out)
}
return storage.NewKeyNotFoundError(key, 0)
@@ -186,35 +185,77 @@ func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object,
}
// Delete implements storage.Interface.Delete.
func (s *store) Delete(ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions, validateDeletion storage.ValidateObjectFunc) error {
func (s *store) Delete(
ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions,
validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error {
v, err := conversion.EnforcePtr(out)
if err != nil {
return fmt.Errorf("unable to convert output object to pointer: %v", err)
}
key = path.Join(s.pathPrefix, key)
return s.conditionalDelete(ctx, key, out, v, preconditions, validateDeletion)
return s.conditionalDelete(ctx, key, out, v, preconditions, validateDeletion, cachedExistingObject)
}
func (s *store) conditionalDelete(ctx context.Context, key string, out runtime.Object, v reflect.Value, preconditions *storage.Preconditions, validateDeletion storage.ValidateObjectFunc) error {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key)
metrics.RecordEtcdRequestLatency("get", getTypeName(out), startTime)
func (s *store) conditionalDelete(
ctx context.Context, key string, out runtime.Object, v reflect.Value, preconditions *storage.Preconditions,
validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error {
getCurrentState := func() (*objState, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key)
metrics.RecordEtcdRequestLatency("get", getTypeName(out), startTime)
if err != nil {
return nil, err
}
return s.getState(getResp, key, v, false)
}
var origState *objState
var err error
var origStateIsCurrent bool
if cachedExistingObject != nil {
origState, err = s.getStateFromObject(cachedExistingObject)
} else {
origState, err = getCurrentState()
origStateIsCurrent = true
}
if err != nil {
return err
}
for {
origState, err := s.getState(getResp, key, v, false)
if err != nil {
return err
}
if preconditions != nil {
if err := preconditions.Check(key, origState.obj); err != nil {
return err
if origStateIsCurrent {
return err
}
// It's possible we're working with stale data.
// Actually fetch
origState, err = getCurrentState()
if err != nil {
return err
}
origStateIsCurrent = true
// Retry
continue
}
}
if err := validateDeletion(ctx, origState.obj); err != nil {
return err
if origStateIsCurrent {
return err
}
// It's possible we're working with stale data.
// Actually fetch
origState, err = getCurrentState()
if err != nil {
return err
}
origStateIsCurrent = true
// Retry
continue
}
startTime := time.Now()
txnResp, err := s.client.KV.Txn(ctx).If(
clientv3.Compare(clientv3.ModRevision(key), "=", origState.rev),
@@ -228,8 +269,13 @@ func (s *store) conditionalDelete(ctx context.Context, key string, out runtime.O
return err
}
if !txnResp.Succeeded {
getResp = (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange())
getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange())
klog.V(4).Infof("deletion of %s failed because of a conflict, going to retry", key)
origState, err = s.getState(getResp, key, v, false)
if err != nil {
return err
}
origStateIsCurrent = true
continue
}
return decode(s.codec, s.versioner, origState.data, out, origState.rev)
@@ -239,7 +285,7 @@ func (s *store) conditionalDelete(ctx context.Context, key string, out runtime.O
// GuaranteedUpdate implements storage.Interface.GuaranteedUpdate.
func (s *store) GuaranteedUpdate(
ctx context.Context, key string, out runtime.Object, ignoreNotFound bool,
preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, suggestion ...runtime.Object) error {
preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, cachedExistingObject runtime.Object) error {
trace := utiltrace.New("GuaranteedUpdate etcd3", utiltrace.Field{"type", getTypeName(out)})
defer trace.LogIfLong(500 * time.Millisecond)
@@ -251,7 +297,7 @@ func (s *store) GuaranteedUpdate(
getCurrentState := func() (*objState, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key, s.getOps...)
getResp, err := s.client.KV.Get(ctx, key)
metrics.RecordEtcdRequestLatency("get", getTypeName(out), startTime)
if err != nil {
return nil, err
@@ -260,18 +306,15 @@ func (s *store) GuaranteedUpdate(
}
var origState *objState
var mustCheckData bool
if len(suggestion) == 1 && suggestion[0] != nil {
origState, err = s.getStateFromObject(suggestion[0])
if err != nil {
return err
}
mustCheckData = true
var origStateIsCurrent bool
if cachedExistingObject != nil {
origState, err = s.getStateFromObject(cachedExistingObject)
} else {
origState, err = getCurrentState()
if err != nil {
return err
}
origStateIsCurrent = true
}
if err != nil {
return err
}
trace.Step("initial value restored")
@@ -279,7 +322,7 @@ func (s *store) GuaranteedUpdate(
for {
if err := preconditions.Check(key, origState.obj); err != nil {
// If our data is already up to date, return the error
if !mustCheckData {
if origStateIsCurrent {
return err
}
@@ -289,7 +332,7 @@ func (s *store) GuaranteedUpdate(
if err != nil {
return err
}
mustCheckData = false
origStateIsCurrent = true
// Retry
continue
}
@@ -297,7 +340,7 @@ func (s *store) GuaranteedUpdate(
ret, ttl, err := s.updateState(origState, tryUpdate)
if err != nil {
// If our data is already up to date, return the error
if !mustCheckData {
if origStateIsCurrent {
return err
}
@@ -307,7 +350,7 @@ func (s *store) GuaranteedUpdate(
if err != nil {
return err
}
mustCheckData = false
origStateIsCurrent = true
// Retry
continue
}
@@ -320,12 +363,12 @@ func (s *store) GuaranteedUpdate(
// if we skipped the original Get in this loop, we must refresh from
// etcd in order to be sure the data in the store is equivalent to
// our desired serialization
if mustCheckData {
if !origStateIsCurrent {
origState, err = getCurrentState()
if err != nil {
return err
}
mustCheckData = false
origStateIsCurrent = true
if !bytes.Equal(data, origState.data) {
// original data changed, restart loop
continue
@@ -369,7 +412,7 @@ func (s *store) GuaranteedUpdate(
return err
}
trace.Step("Retry value restored")
mustCheckData = false
origStateIsCurrent = true
continue
}
putResp := txnResp.Responses[0].GetResponsePut()
@@ -379,10 +422,14 @@ func (s *store) GuaranteedUpdate(
}
// GetToList implements storage.Interface.GetToList.
func (s *store) GetToList(ctx context.Context, key string, resourceVersion string, pred storage.SelectionPredicate, listObj runtime.Object) error {
func (s *store) GetToList(ctx context.Context, key string, listOpts storage.ListOptions, listObj runtime.Object) error {
resourceVersion := listOpts.ResourceVersion
match := listOpts.ResourceVersionMatch
pred := listOpts.Predicate
trace := utiltrace.New("GetToList etcd3",
utiltrace.Field{"key", key},
utiltrace.Field{"resourceVersion", resourceVersion},
utiltrace.Field{"resourceVersionMatch", match},
utiltrace.Field{"limit", pred.Limit},
utiltrace.Field{"continue", pred.Continue})
defer trace.LogIfLong(500 * time.Millisecond)
@@ -399,12 +446,21 @@ func (s *store) GetToList(ctx context.Context, key string, resourceVersion strin
key = path.Join(s.pathPrefix, key)
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key, s.getOps...)
var opts []clientv3.OpOption
if len(resourceVersion) > 0 && match == metav1.ResourceVersionMatchExact {
rv, err := s.versioner.ParseResourceVersion(resourceVersion)
if err != nil {
return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
}
opts = append(opts, clientv3.WithRev(int64(rv)))
}
getResp, err := s.client.KV.Get(ctx, key, opts...)
metrics.RecordEtcdRequestLatency("get", getTypeName(listPtr), startTime)
if err != nil {
return err
}
if err = s.ensureMinimumResourceVersion(resourceVersion, uint64(getResp.Header.Revision)); err != nil {
if err = s.validateMinimumResourceVersion(resourceVersion, uint64(getResp.Header.Revision)); err != nil {
return err
}
@@ -440,6 +496,14 @@ func getNewItemFunc(listObj runtime.Object, v reflect.Value) func() runtime.Obje
func (s *store) Count(key string) (int64, error) {
key = path.Join(s.pathPrefix, key)
// We need to make sure the key ended with "/" so that we only get children "directories".
// e.g. if we have key "/a", "/a/b", "/ab", getting keys with prefix "/a" will return all three,
// while with prefix "/a/" will return only "/a/b" which is the correct answer.
if !strings.HasSuffix(key, "/") {
key += "/"
}
startTime := time.Now()
getResp, err := s.client.KV.Get(context.Background(), key, clientv3.WithRange(clientv3.GetPrefixRangeEnd(key)), clientv3.WithCountOnly())
metrics.RecordEtcdRequestLatency("listWithCount", key, startTime)
@@ -510,10 +574,14 @@ func encodeContinue(key, keyPrefix string, resourceVersion int64) (string, error
}
// List implements storage.Interface.List.
func (s *store) List(ctx context.Context, key, resourceVersion string, pred storage.SelectionPredicate, listObj runtime.Object) error {
func (s *store) List(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error {
resourceVersion := opts.ResourceVersion
match := opts.ResourceVersionMatch
pred := opts.Predicate
trace := utiltrace.New("List etcd3",
utiltrace.Field{"key", key},
utiltrace.Field{"resourceVersion", resourceVersion},
utiltrace.Field{"resourceVersionMatch", match},
utiltrace.Field{"limit", pred.Limit},
utiltrace.Field{"continue", pred.Continue})
defer trace.LogIfLong(500 * time.Millisecond)
@@ -547,7 +615,16 @@ func (s *store) List(ctx context.Context, key, resourceVersion string, pred stor
newItemFunc := getNewItemFunc(listObj, v)
var returnedRV, continueRV int64
var fromRV *uint64
if len(resourceVersion) > 0 {
parsedRV, err := s.versioner.ParseResourceVersion(resourceVersion)
if err != nil {
return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
}
fromRV = &parsedRV
}
var returnedRV, continueRV, withRev int64
var continueKey string
switch {
case s.pagingEnabled && len(pred.Continue) > 0:
@@ -568,27 +645,50 @@ func (s *store) List(ctx context.Context, key, resourceVersion string, pred stor
// continueRV==0 is invalid.
// If continueRV < 0, the request is for the latest resource version.
if continueRV > 0 {
options = append(options, clientv3.WithRev(continueRV))
withRev = continueRV
returnedRV = continueRV
}
case s.pagingEnabled && pred.Limit > 0:
if len(resourceVersion) > 0 {
fromRV, err := s.versioner.ParseResourceVersion(resourceVersion)
if err != nil {
return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
if fromRV != nil {
switch match {
case metav1.ResourceVersionMatchNotOlderThan:
// The not older than constraint is checked after we get a response from etcd,
// and returnedRV is then set to the revision we get from the etcd response.
case metav1.ResourceVersionMatchExact:
returnedRV = int64(*fromRV)
withRev = returnedRV
case "": // legacy case
if *fromRV > 0 {
returnedRV = int64(*fromRV)
withRev = returnedRV
}
default:
return fmt.Errorf("unknown ResourceVersionMatch value: %v", match)
}
if fromRV > 0 {
options = append(options, clientv3.WithRev(int64(fromRV)))
}
returnedRV = int64(fromRV)
}
rangeEnd := clientv3.GetPrefixRangeEnd(keyPrefix)
options = append(options, clientv3.WithRange(rangeEnd))
default:
if fromRV != nil {
switch match {
case metav1.ResourceVersionMatchNotOlderThan:
// The not older than constraint is checked after we get a response from etcd,
// and returnedRV is then set to the revision we get from the etcd response.
case metav1.ResourceVersionMatchExact:
returnedRV = int64(*fromRV)
withRev = returnedRV
case "": // legacy case
default:
return fmt.Errorf("unknown ResourceVersionMatch value: %v", match)
}
}
options = append(options, clientv3.WithPrefix())
}
if withRev != 0 {
options = append(options, clientv3.WithRev(withRev))
}
// loop until we have filled the requested limit from etcd or there are no more results
var lastKey []byte
@@ -601,7 +701,7 @@ func (s *store) List(ctx context.Context, key, resourceVersion string, pred stor
if err != nil {
return interpretListError(err, len(pred.Continue) > 0, continueKey, keyPrefix)
}
if err = s.ensureMinimumResourceVersion(resourceVersion, uint64(getResp.Header.Revision)); err != nil {
if err = s.validateMinimumResourceVersion(resourceVersion, uint64(getResp.Header.Revision)); err != nil {
return err
}
hasMore = getResp.More
@@ -650,6 +750,10 @@ func (s *store) List(ctx context.Context, key, resourceVersion string, pred stor
break
}
key = string(lastKey) + "\x00"
if withRev == 0 {
withRev = returnedRV
options = append(options, clientv3.WithRev(withRev))
}
}
// instruct the client to begin querying from immediately after the last key we returned
@@ -709,22 +813,22 @@ func growSlice(v reflect.Value, maxCapacity int, sizes ...int) {
}
// Watch implements storage.Interface.Watch.
func (s *store) Watch(ctx context.Context, key string, resourceVersion string, pred storage.SelectionPredicate) (watch.Interface, error) {
return s.watch(ctx, key, resourceVersion, pred, false)
func (s *store) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) {
return s.watch(ctx, key, opts, false)
}
// WatchList implements storage.Interface.WatchList.
func (s *store) WatchList(ctx context.Context, key string, resourceVersion string, pred storage.SelectionPredicate) (watch.Interface, error) {
return s.watch(ctx, key, resourceVersion, pred, true)
func (s *store) WatchList(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) {
return s.watch(ctx, key, opts, true)
}
func (s *store) watch(ctx context.Context, key string, rv string, pred storage.SelectionPredicate, recursive bool) (watch.Interface, error) {
rev, err := s.versioner.ParseResourceVersion(rv)
func (s *store) watch(ctx context.Context, key string, opts storage.ListOptions, recursive bool) (watch.Interface, error) {
rev, err := s.versioner.ParseResourceVersion(opts.ResourceVersion)
if err != nil {
return nil, err
}
key = path.Join(s.pathPrefix, key)
return s.watcher.Watch(ctx, key, int64(rev), recursive, pred)
return s.watcher.Watch(ctx, key, int64(rev), recursive, opts.ProgressNotify, opts.Predicate)
}
func (s *store) getState(getResp *clientv3.GetResponse, key string, v reflect.Value, ignoreNotFound bool) (*objState, error) {
@@ -818,9 +922,9 @@ func (s *store) ttlOpts(ctx context.Context, ttl int64) ([]clientv3.OpOption, er
return []clientv3.OpOption{clientv3.WithLease(id)}, nil
}
// ensureMinimumResourceVersion returns a 'too large resource' version error when the provided minimumResourceVersion is
// validateMinimumResourceVersion returns a 'too large resource' version error when the provided minimumResourceVersion is
// greater than the most recent actualRevision available from storage.
func (s *store) ensureMinimumResourceVersion(minimumResourceVersion string, actualRevision uint64) error {
func (s *store) validateMinimumResourceVersion(minimumResourceVersion string, actualRevision uint64) error {
if minimumResourceVersion == "" {
return nil
}

View File

@@ -21,6 +21,7 @@ import (
"errors"
"fmt"
"os"
"reflect"
"strconv"
"strings"
"sync"
@@ -29,10 +30,11 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
"k8s.io/apiserver/pkg/storage/value"
"go.etcd.io/etcd/clientv3"
"k8s.io/klog"
"k8s.io/klog/v2"
)
const (
@@ -68,6 +70,8 @@ func TestOnlySetFatalOnDecodeError(b bool) {
type watcher struct {
client *clientv3.Client
codec runtime.Codec
newFunc func() runtime.Object
objectType string
versioner storage.Versioner
transformer value.Transformer
}
@@ -78,6 +82,7 @@ type watchChan struct {
key string
initialRev int64
recursive bool
progressNotify bool
internalPred storage.SelectionPredicate
ctx context.Context
cancel context.CancelFunc
@@ -86,13 +91,20 @@ type watchChan struct {
errChan chan error
}
func newWatcher(client *clientv3.Client, codec runtime.Codec, versioner storage.Versioner, transformer value.Transformer) *watcher {
return &watcher{
func newWatcher(client *clientv3.Client, codec runtime.Codec, newFunc func() runtime.Object, versioner storage.Versioner, transformer value.Transformer) *watcher {
res := &watcher{
client: client,
codec: codec,
newFunc: newFunc,
versioner: versioner,
transformer: transformer,
}
if newFunc == nil {
res.objectType = "<unknown>"
} else {
res.objectType = reflect.TypeOf(newFunc()).String()
}
return res
}
// Watch watches on a key and returns a watch.Interface that transfers relevant notifications.
@@ -102,21 +114,22 @@ func newWatcher(client *clientv3.Client, codec runtime.Codec, versioner storage.
// If recursive is false, it watches on given key.
// If recursive is true, it watches any children and directories under the key, excluding the root key itself.
// pred must be non-nil. Only if pred matches the change, it will be returned.
func (w *watcher) Watch(ctx context.Context, key string, rev int64, recursive bool, pred storage.SelectionPredicate) (watch.Interface, error) {
func (w *watcher) Watch(ctx context.Context, key string, rev int64, recursive, progressNotify bool, pred storage.SelectionPredicate) (watch.Interface, error) {
if recursive && !strings.HasSuffix(key, "/") {
key += "/"
}
wc := w.createWatchChan(ctx, key, rev, recursive, pred)
wc := w.createWatchChan(ctx, key, rev, recursive, progressNotify, pred)
go wc.run()
return wc, nil
}
func (w *watcher) createWatchChan(ctx context.Context, key string, rev int64, recursive bool, pred storage.SelectionPredicate) *watchChan {
func (w *watcher) createWatchChan(ctx context.Context, key string, rev int64, recursive, progressNotify bool, pred storage.SelectionPredicate) *watchChan {
wc := &watchChan{
watcher: w,
key: key,
initialRev: rev,
recursive: recursive,
progressNotify: progressNotify,
internalPred: pred,
incomingEventChan: make(chan *event, incomingBufSize),
resultChan: make(chan watch.Event, outgoingBufSize),
@@ -126,7 +139,15 @@ func (w *watcher) createWatchChan(ctx context.Context, key string, rev int64, re
// The filter doesn't filter out any object.
wc.internalPred = storage.Everything
}
wc.ctx, wc.cancel = context.WithCancel(ctx)
// The etcd server waits until it cannot find a leader for 3 election
// timeouts to cancel existing streams. 3 is currently a hard coded
// constant. The election timeout defaults to 1000ms. If the cluster is
// healthy, when the leader is stopped, the leadership transfer should be
// smooth. (leader transfers its leadership before stopping). If leader is
// hard killed, other servers will take an election timeout to realize
// leader lost and start campaign.
wc.ctx, wc.cancel = context.WithCancel(clientv3.WithRequireLeader(ctx))
return wc
}
@@ -215,6 +236,9 @@ func (wc *watchChan) startWatching(watchClosedCh chan struct{}) {
if wc.recursive {
opts = append(opts, clientv3.WithPrefix())
}
if wc.progressNotify {
opts = append(opts, clientv3.WithProgressNotify())
}
wch := wc.watcher.client.Watch(wc.ctx, wc.key, opts...)
for wres := range wch {
if wres.Err() != nil {
@@ -224,6 +248,12 @@ func (wc *watchChan) startWatching(watchClosedCh chan struct{}) {
wc.sendError(err)
return
}
if wres.IsProgressNotify() {
wc.sendEvent(progressNotifyEvent(wres.Header.GetRevision()))
metrics.RecordEtcdBookmark(wc.watcher.objectType)
continue
}
for _, e := range wres.Events {
parsedEvent, err := parseEvent(e)
if err != nil {
@@ -253,8 +283,7 @@ func (wc *watchChan) processEvent(wg *sync.WaitGroup) {
continue
}
if len(wc.resultChan) == outgoingBufSize {
klog.V(3).Infof("Fast watcher, slow processing. Number of buffered events: %d."+
"Probably caused by slow dispatching events to watchers", outgoingBufSize)
klog.V(3).InfoS("Fast watcher, slow processing. Probably caused by slow dispatching events to watchers", "outgoingEvents", outgoingBufSize)
}
// If user couldn't receive results fast enough, we also block incoming events from watcher.
// Because storing events in local will cause more memory usage.
@@ -292,6 +321,19 @@ func (wc *watchChan) transform(e *event) (res *watch.Event) {
}
switch {
case e.isProgressNotify:
if wc.watcher.newFunc == nil {
return nil
}
object := wc.watcher.newFunc()
if err := wc.watcher.versioner.UpdateObject(object, uint64(e.rev)); err != nil {
klog.Errorf("failed to propagate object version: %v", err)
return nil
}
res = &watch.Event{
Type: watch.Bookmark,
Object: object,
}
case e.isDeleted:
if !wc.filter(oldObj) {
return nil
@@ -360,9 +402,7 @@ func (wc *watchChan) sendError(err error) {
func (wc *watchChan) sendEvent(e *event) {
if len(wc.incomingEventChan) == incomingBufSize {
klog.V(3).Infof("Fast watcher, slow processing. Number of buffered events: %d."+
"Probably caused by slow decoding, user not receiving fast, or other processing logic",
incomingBufSize)
klog.V(3).InfoS("Fast watcher, slow processing. Probably caused by slow decoding, user not receiving fast, or other processing logic", "incomingEvents", incomingBufSize)
}
select {
case wc.incomingEventChan <- e:
@@ -371,6 +411,11 @@ func (wc *watchChan) sendEvent(e *event) {
}
func (wc *watchChan) prepareObjs(e *event) (curObj runtime.Object, oldObj runtime.Object, err error) {
if e.isProgressNotify {
// progressNotify events doesn't contain neither current nor previous object version,
return nil, nil, nil
}
if !e.isDeleted {
data, _, err := wc.watcher.transformer.TransformFromStorage(e.value, authenticatedDataString(e.key))
if err != nil {

View File

@@ -21,6 +21,7 @@ import (
"fmt"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
@@ -166,7 +167,12 @@ type Interface interface {
// Delete removes the specified key and returns the value that existed at that spot.
// If key didn't exist, it will return NotFound storage error.
Delete(ctx context.Context, key string, out runtime.Object, preconditions *Preconditions, validateDeletion ValidateObjectFunc) error
// If 'cachedExistingObject' is non-nil, it can be used as a suggestion about the
// current version of the object to avoid read operation from storage to get it.
// However, the implementations have to retry in case suggestion is stale.
Delete(
ctx context.Context, key string, out runtime.Object, preconditions *Preconditions,
validateDeletion ValidateObjectFunc, cachedExistingObject runtime.Object) error
// Watch begins watching the specified key. Events are decoded into API objects,
// and any items selected by 'p' are sent down to returned watch.Interface.
@@ -175,7 +181,7 @@ type Interface interface {
// (e.g. reconnecting without missing any updates).
// If resource version is "0", this interface will get current object at given key
// and send it in an "ADDED" event, before watch starts.
Watch(ctx context.Context, key string, resourceVersion string, p SelectionPredicate) (watch.Interface, error)
Watch(ctx context.Context, key string, opts ListOptions) (watch.Interface, error)
// WatchList begins watching the specified key's items. Items are decoded into API
// objects and any item selected by 'p' are sent down to returned watch.Interface.
@@ -184,26 +190,26 @@ type Interface interface {
// (e.g. reconnecting without missing any updates).
// If resource version is "0", this interface will list current objects directory defined by key
// and send them in "ADDED" events, before watch starts.
WatchList(ctx context.Context, key string, resourceVersion string, p SelectionPredicate) (watch.Interface, error)
WatchList(ctx context.Context, key string, opts ListOptions) (watch.Interface, error)
// Get unmarshals json found at key into objPtr. On a not found error, will either
// return a zero object of the requested type, or an error, depending on ignoreNotFound.
// return a zero object of the requested type, or an error, depending on 'opts.ignoreNotFound'.
// Treats empty responses and nil response nodes exactly like a not found error.
// The returned contents may be delayed, but it is guaranteed that they will
// be have at least 'resourceVersion'.
Get(ctx context.Context, key string, resourceVersion string, objPtr runtime.Object, ignoreNotFound bool) error
// match 'opts.ResourceVersion' according 'opts.ResourceVersionMatch'.
Get(ctx context.Context, key string, opts GetOptions, objPtr runtime.Object) error
// GetToList unmarshals json found at key and opaque it into *List api object
// (an object that satisfies the runtime.IsList definition).
// The returned contents may be delayed, but it is guaranteed that they will
// be have at least 'resourceVersion'.
GetToList(ctx context.Context, key string, resourceVersion string, p SelectionPredicate, listObj runtime.Object) error
// match 'opts.ResourceVersion' according 'opts.ResourceVersionMatch'.
GetToList(ctx context.Context, key string, opts ListOptions, listObj runtime.Object) error
// List unmarshalls jsons found at directory defined by key and opaque them
// into *List api object (an object that satisfies runtime.IsList definition).
// The returned contents may be delayed, but it is guaranteed that they will
// be have at least 'resourceVersion'.
List(ctx context.Context, key string, resourceVersion string, p SelectionPredicate, listObj runtime.Object) error
// match 'opts.ResourceVersion' according 'opts.ResourceVersionMatch'.
List(ctx context.Context, key string, opts ListOptions, listObj runtime.Object) error
// GuaranteedUpdate keeps calling 'tryUpdate()' to update key 'key' (of type 'ptrToType')
// retrying the update until success if there is index conflict.
@@ -214,9 +220,9 @@ type Interface interface {
// or zero value in 'ptrToType' parameter otherwise.
// If the object to update has the same value as previous, it won't do any update
// but will return the object in 'ptrToType' parameter.
// If 'suggestion' can contain zero or one element - in such case this can be used as
// a suggestion about the current version of the object to avoid read operation from
// storage to get it.
// If 'cachedExistingObject' is non-nil, it can be used as a suggestion about the
// current version of the object to avoid read operation from storage to get it.
// However, the implementations have to retry in case suggestion is stale.
//
// Example:
//
@@ -238,8 +244,37 @@ type Interface interface {
// )
GuaranteedUpdate(
ctx context.Context, key string, ptrToType runtime.Object, ignoreNotFound bool,
precondtions *Preconditions, tryUpdate UpdateFunc, suggestion ...runtime.Object) error
precondtions *Preconditions, tryUpdate UpdateFunc, cachedExistingObject runtime.Object) error
// Count returns number of different entries under the key (generally being path prefix).
Count(key string) (int64, error)
}
// GetOptions provides the options that may be provided for storage get operations.
type GetOptions struct {
// IgnoreNotFound determines what is returned if the requested object is not found. If
// true, a zero object is returned. If false, an error is returned.
IgnoreNotFound bool
// ResourceVersion provides a resource version constraint to apply to the get operation
// as a "not older than" constraint: the result contains data at least as new as the provided
// ResourceVersion. The newest available data is preferred, but any data not older than this
// ResourceVersion may be served.
ResourceVersion string
}
// ListOptions provides the options that may be provided for storage list operations.
type ListOptions struct {
// ResourceVersion provides a resource version constraint to apply to the list operation
// as a "not older than" constraint: the result contains data at least as new as the provided
// ResourceVersion. The newest available data is preferred, but any data not older than this
// ResourceVersion may be served.
ResourceVersion string
// ResourceVersionMatch provides the rule for how the resource version constraint applies. If set
// to the default value "" the legacy resource version semantic apply.
ResourceVersionMatch metav1.ResourceVersionMatch
// Predicate provides the selection rules for the list operation.
Predicate SelectionPredicate
// ProgressNotify determines whether storage-originated bookmark (progress notify) events should
// be delivered to the users. The option is ignored for non-watch requests.
ProgressNotify bool
}

View File

@@ -21,6 +21,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apiserver/pkg/server/egressselector"
"k8s.io/apiserver/pkg/storage/etcd3"
"k8s.io/apiserver/pkg/storage/value"
)
@@ -28,7 +29,9 @@ const (
StorageTypeUnset = ""
StorageTypeETCD3 = "etcd3"
DefaultCompactInterval = 5 * time.Minute
DefaultCompactInterval = 5 * time.Minute
DefaultDBMetricPollInterval = 30 * time.Second
DefaultHealthcheckTimeout = 2 * time.Second
)
// TransportConfig holds all connection related info, i.e. equal TransportConfig means equal servers we talk to.
@@ -71,13 +74,22 @@ type Config struct {
CompactionInterval time.Duration
// CountMetricPollPeriod specifies how often should count metric be updated
CountMetricPollPeriod time.Duration
// DBMetricPollInterval specifies how often should storage backend metric be updated.
DBMetricPollInterval time.Duration
// HealthcheckTimeout specifies the timeout used when checking health
HealthcheckTimeout time.Duration
LeaseManagerConfig etcd3.LeaseManagerConfig
}
func NewDefaultConfig(prefix string, codec runtime.Codec) *Config {
return &Config{
Paging: true,
Prefix: prefix,
Codec: codec,
CompactionInterval: DefaultCompactInterval,
Paging: true,
Prefix: prefix,
Codec: codec,
CompactionInterval: DefaultCompactInterval,
DBMetricPollInterval: DefaultDBMetricPollInterval,
HealthcheckTimeout: DefaultHealthcheckTimeout,
LeaseManagerConfig: etcd3.NewDefaultLeaseManagerConfig(),
}
}

View File

@@ -31,25 +31,32 @@ import (
"go.etcd.io/etcd/pkg/transport"
"google.golang.org/grpc"
"k8s.io/apimachinery/pkg/runtime"
utilnet "k8s.io/apimachinery/pkg/util/net"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/server/egressselector"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/etcd3"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
"k8s.io/apiserver/pkg/storage/storagebackend"
"k8s.io/apiserver/pkg/storage/value"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog/v2"
)
// The short keepalive timeout and interval have been chosen to aggressively
// detect a failed etcd server without introducing much overhead.
const keepaliveTime = 30 * time.Second
const keepaliveTimeout = 10 * time.Second
const (
// The short keepalive timeout and interval have been chosen to aggressively
// detect a failed etcd server without introducing much overhead.
keepaliveTime = 30 * time.Second
keepaliveTimeout = 10 * time.Second
// dialTimeout is the timeout for failing to establish a connection.
// It is set to 20 seconds as times shorter than that will cause TLS connections to fail
// on heavily loaded arm64 CPUs (issue #64649)
const dialTimeout = 20 * time.Second
// dialTimeout is the timeout for failing to establish a connection.
// It is set to 20 seconds as times shorter than that will cause TLS connections to fail
// on heavily loaded arm64 CPUs (issue #64649)
dialTimeout = 20 * time.Second
dbMetricsMonitorJitter = 0.5
)
func init() {
// grpcprom auto-registers (via an init function) their client metrics, since we are opting out of
@@ -57,6 +64,7 @@ func init() {
// we need to explicitly register these metrics to our global registry here.
// For reference: https://github.com/kubernetes/kubernetes/pull/81387
legacyregistry.RawMustRegister(grpcprom.DefaultClientMetrics)
dbMetricsMonitors = make(map[string]struct{})
}
func newETCD3HealthCheck(c storagebackend.Config) (func() error, error) {
@@ -84,7 +92,11 @@ func newETCD3HealthCheck(c storagebackend.Config) (func() error, error) {
return fmt.Errorf(errMsg)
}
client := clientValue.Load().(*clientv3.Client)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
healthcheckTimeout := storagebackend.DefaultHealthcheckTimeout
if c.HealthcheckTimeout != time.Duration(0) {
healthcheckTimeout = c.HealthcheckTimeout
}
ctx, cancel := context.WithTimeout(context.Background(), healthcheckTimeout)
defer cancel()
// See https://github.com/etcd-io/etcd/blob/c57f8b3af865d1b531b979889c602ba14377420e/etcdctl/ctlv3/command/ep_command.go#L118
_, err := client.Get(ctx, path.Join("/", c.Prefix, "health"))
@@ -153,16 +165,20 @@ type runningCompactor struct {
}
var (
lock sync.Mutex
compactors = map[string]*runningCompactor{}
// compactorsMu guards access to compactors map
compactorsMu sync.Mutex
compactors = map[string]*runningCompactor{}
// dbMetricsMonitorsMu guards access to dbMetricsMonitors map
dbMetricsMonitorsMu sync.Mutex
dbMetricsMonitors map[string]struct{}
)
// startCompactorOnce start one compactor per transport. If the interval get smaller on repeated calls, the
// compactor is replaced. A destroy func is returned. If all destroy funcs with the same transport are called,
// the compactor is stopped.
func startCompactorOnce(c storagebackend.TransportConfig, interval time.Duration) (func(), error) {
lock.Lock()
defer lock.Unlock()
compactorsMu.Lock()
defer compactorsMu.Unlock()
key := fmt.Sprintf("%v", c) // gives: {[server1 server2] keyFile certFile caFile}
if compactor, foundBefore := compactors[key]; !foundBefore || compactor.interval > interval {
@@ -193,8 +209,8 @@ func startCompactorOnce(c storagebackend.TransportConfig, interval time.Duration
compactors[key].refs++
return func() {
lock.Lock()
defer lock.Unlock()
compactorsMu.Lock()
defer compactorsMu.Unlock()
compactor := compactors[key]
compactor.refs--
@@ -206,7 +222,7 @@ func startCompactorOnce(c storagebackend.TransportConfig, interval time.Duration
}, nil
}
func newETCD3Storage(c storagebackend.Config) (storage.Interface, DestroyFunc, error) {
func newETCD3Storage(c storagebackend.Config, newFunc func() runtime.Object) (storage.Interface, DestroyFunc, error) {
stopCompactor, err := startCompactorOnce(c.Transport, c.CompactionInterval)
if err != nil {
return nil, nil, err
@@ -218,6 +234,11 @@ func newETCD3Storage(c storagebackend.Config) (storage.Interface, DestroyFunc, e
return nil, nil, err
}
stopDBSizeMonitor, err := startDBSizeMonitorPerEndpoint(client, c.DBMetricPollInterval)
if err != nil {
return nil, nil, err
}
var once sync.Once
destroyFunc := func() {
// we know that storage destroy funcs are called multiple times (due to reuse in subresources).
@@ -225,6 +246,7 @@ func newETCD3Storage(c storagebackend.Config) (storage.Interface, DestroyFunc, e
// TODO: fix duplicated storage destroy calls higher level
once.Do(func() {
stopCompactor()
stopDBSizeMonitor()
client.Close()
})
}
@@ -232,5 +254,38 @@ func newETCD3Storage(c storagebackend.Config) (storage.Interface, DestroyFunc, e
if transformer == nil {
transformer = value.IdentityTransformer
}
return etcd3.New(client, c.Codec, c.Prefix, transformer, c.Paging), destroyFunc, nil
return etcd3.New(client, c.Codec, newFunc, c.Prefix, transformer, c.Paging, c.LeaseManagerConfig), destroyFunc, nil
}
// startDBSizeMonitorPerEndpoint starts a loop to monitor etcd database size and update the
// corresponding metric etcd_db_total_size_in_bytes for each etcd server endpoint.
func startDBSizeMonitorPerEndpoint(client *clientv3.Client, interval time.Duration) (func(), error) {
if interval == 0 {
return func() {}, nil
}
dbMetricsMonitorsMu.Lock()
defer dbMetricsMonitorsMu.Unlock()
ctx, cancel := context.WithCancel(context.Background())
for _, ep := range client.Endpoints() {
if _, found := dbMetricsMonitors[ep]; found {
continue
}
dbMetricsMonitors[ep] = struct{}{}
endpoint := ep
klog.V(4).Infof("Start monitoring storage db size metric for endpoint %s with polling interval %v", endpoint, interval)
go wait.JitterUntilWithContext(ctx, func(context.Context) {
epStatus, err := client.Maintenance.Status(ctx, endpoint)
if err != nil {
klog.V(4).Infof("Failed to get storage db size for ep %s: %v", endpoint, err)
metrics.UpdateEtcdDbSize(endpoint, -1)
} else {
metrics.UpdateEtcdDbSize(endpoint, epStatus.DbSize)
}
}, interval, dbMetricsMonitorJitter, true)
}
return func() {
cancel()
}, nil
}

View File

@@ -19,6 +19,7 @@ package factory
import (
"fmt"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/storagebackend"
)
@@ -27,12 +28,12 @@ import (
type DestroyFunc func()
// Create creates a storage backend based on given config.
func Create(c storagebackend.Config) (storage.Interface, DestroyFunc, error) {
func Create(c storagebackend.Config, newFunc func() runtime.Object) (storage.Interface, DestroyFunc, error) {
switch c.Type {
case "etcd2":
return nil, nil, fmt.Errorf("%v is no longer a supported storage backend", c.Type)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3Storage(c)
return newETCD3Storage(c, newFunc)
default:
return nil, nil, fmt.Errorf("unknown storage type: %s", c.Type)
}

View File

@@ -26,7 +26,7 @@ import (
"sync"
"time"
"k8s.io/klog"
"k8s.io/klog/v2"
"google.golang.org/grpc"

View File

@@ -33,7 +33,7 @@ const (
/*
* By default, all the following metrics are defined as falling under
* ALPHA stability level https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/20190404-kubernetes-control-plane-metrics-stability.md#stability-classes)
* ALPHA stability level https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/20190404-kubernetes-control-plane-metrics-stability.md#stability-classes)
*
* Promoting the stability level of the metric is a responsibility of the component owner, since it
* involves explicitly acknowledging support for the metric across multiple releases, in accordance with

View File

@@ -33,7 +33,7 @@ const (
/*
* By default, all the following metrics are defined as falling under
* ALPHA stability level https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/20190404-kubernetes-control-plane-metrics-stability.md#stability-classes)
* ALPHA stability level https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/20190404-kubernetes-control-plane-metrics-stability.md#stability-classes)
*
* Promoting the stability level of the metric is a responsibility of the component owner, since it
* involves explicitly acknowledging support for the metric across multiple releases, in accordance with