feat: kubesphere 4.0 (#6115)

* feat: kubesphere 4.0

Signed-off-by: ci-bot <ci-bot@kubesphere.io>

* feat: kubesphere 4.0

Signed-off-by: ci-bot <ci-bot@kubesphere.io>

---------

Signed-off-by: ci-bot <ci-bot@kubesphere.io>
Co-authored-by: ks-ci-bot <ks-ci-bot@example.com>
Co-authored-by: joyceliu <joyceliu@yunify.com>
KubeSphere CI Bot committed via GitHub on 2024-09-06 11:05:52 +08:00
parent b5015ec7b9
commit 447a51f08b
8557 changed files with 546695 additions and 1146174 deletions


@@ -1,11 +1,9 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- lavalamp
- liggitt
- wojtek-t
reviewers:
- lavalamp
- smarterclayton
- wojtek-t
- deads2k
@@ -16,6 +14,8 @@ reviewers:
- ingvagabund
- enj
- stevekuznetsov
- MadhavJivrajani
emeritus_approvers:
- xiang90
- timothysc
- lavalamp


@@ -0,0 +1,534 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"context"
"fmt"
"sync"
"time"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/cacher/metrics"
utilflowcontrol "k8s.io/apiserver/pkg/util/flowcontrol"
"k8s.io/klog/v2"
)
// possible states of the cache watcher
const (
// cacheWatcherWaitingForBookmark indicates the cacher
// is waiting for a bookmark event with a specific RV set
cacheWatcherWaitingForBookmark = iota
// cacheWatcherBookmarkReceived indicates that the cacher
// has received a bookmark event with required RV
cacheWatcherBookmarkReceived
// cacheWatcherBookmarkSent indicates that the cacher
// has already sent a bookmark event to a client
cacheWatcherBookmarkSent
)
// cacheWatcher implements watch.Interface
// this is not thread-safe
type cacheWatcher struct {
input chan *watchCacheEvent
result chan watch.Event
done chan struct{}
filter filterWithAttrsFunc
stopped bool
forget func(bool)
versioner storage.Versioner
// The watcher will be closed by server after the deadline,
// save it here to send bookmark events before that.
deadline time.Time
allowWatchBookmarks bool
groupResource schema.GroupResource
// human-readable identifier that helps associate a cacheWatcher
// instance with a request
identifier string
// drainInputBuffer indicates whether we should delay closing this watcher
// and send all events in the input buffer.
drainInputBuffer bool
// bookmarkAfterResourceVersion holds an RV that indicates
// when we should start delivering bookmark events.
// If this field holds the value of 0 that means
// we don't have any special preferences toward delivering bookmark events.
// Note that this field is used in conjunction with the state field.
// It should not be changed once the watcher has been started.
bookmarkAfterResourceVersion uint64
// stateMutex protects state
stateMutex sync.Mutex
// state holds a numeric value indicating the current state of the watcher
state int
}
func newCacheWatcher(
chanSize int,
filter filterWithAttrsFunc,
forget func(bool),
versioner storage.Versioner,
deadline time.Time,
allowWatchBookmarks bool,
groupResource schema.GroupResource,
identifier string,
) *cacheWatcher {
return &cacheWatcher{
input: make(chan *watchCacheEvent, chanSize),
result: make(chan watch.Event, chanSize),
done: make(chan struct{}),
filter: filter,
stopped: false,
forget: forget,
versioner: versioner,
deadline: deadline,
allowWatchBookmarks: allowWatchBookmarks,
groupResource: groupResource,
identifier: identifier,
}
}
// Implements watch.Interface.
func (c *cacheWatcher) ResultChan() <-chan watch.Event {
return c.result
}
// Implements watch.Interface.
func (c *cacheWatcher) Stop() {
c.forget(false)
}
// we rely on the fact that stopLocked is actually protected by Cacher.Lock()
func (c *cacheWatcher) stopLocked() {
if !c.stopped {
c.stopped = true
// stop without draining the input channel was requested.
if !c.drainInputBuffer {
close(c.done)
}
close(c.input)
}
// Even if the watcher was already stopped, if it previously was
// using draining mode and it's not using it now, we need to
// close the done channel now. Otherwise we could leak the
// processing goroutine: it would keep trying to put more objects
// into the full result channel with no one left processing
// events on the receiving end.
if !c.drainInputBuffer && !c.isDoneChannelClosedLocked() {
close(c.done)
}
}
func (c *cacheWatcher) nonblockingAdd(event *watchCacheEvent) bool {
// if the bookmarkAfterResourceVersion hasn't been seen
// we will try to deliver a bookmark event every second.
// the following check will discard a bookmark event
// if its RV is lower than the bookmarkAfterResourceVersion,
// so that we don't pollute the input channel
if event.Type == watch.Bookmark && event.ResourceVersion < c.bookmarkAfterResourceVersion {
return false
}
select {
case c.input <- event:
c.markBookmarkAfterRvAsReceived(event)
return true
default:
return false
}
}
// A nil timer means that add will not block (if it can't send the event immediately, it will break the watcher)
//
// Note that bookmark events are never added via the add method, only via nonblockingAdd.
// Changing this behaviour will require moving the markBookmarkAfterRvAsReceived method.
func (c *cacheWatcher) add(event *watchCacheEvent, timer *time.Timer) bool {
// Try to send the event immediately, without blocking.
if c.nonblockingAdd(event) {
return true
}
closeFunc := func() {
// This means that we couldn't send event to that watcher.
// Since we don't want to block on it infinitely,
// we simply terminate it.
metrics.TerminatedWatchersCounter.WithLabelValues(c.groupResource.String()).Inc()
// we are graceful = false, when:
//
// (a) The bookmarkAfterResourceVersion hasn't been received yet;
// we can safely terminate the watcher, because the client is waiting
// for this specific bookmark and we haven't even received one.
// (b) We have seen the bookmarkAfterResourceVersion, and it was sent already to the client.
// We can simply terminate the watcher.
// we are graceful = true, when:
//
// (a) We have seen a bookmark, but it hasn't been sent to the client yet.
// That means we should drain the input buffer which contains
// the bookmarkAfterResourceVersion we want. We do that to make progress
// as clients can re-establish a new watch with the given RV and receive
// further notifications.
graceful := func() bool {
c.stateMutex.Lock()
defer c.stateMutex.Unlock()
return c.state == cacheWatcherBookmarkReceived
}()
klog.V(1).Infof("Forcing %v watcher close due to unresponsiveness: %v. len(c.input) = %v, len(c.result) = %v, graceful = %v", c.groupResource.String(), c.identifier, len(c.input), len(c.result), graceful)
c.forget(graceful)
}
if timer == nil {
closeFunc()
return false
}
// OK, block sending, but only until timer fires.
select {
case c.input <- event:
return true
case <-timer.C:
closeFunc()
return false
}
}
func (c *cacheWatcher) nextBookmarkTime(now time.Time, bookmarkFrequency time.Duration) (time.Time, bool) {
// We try to send bookmarks:
//
// (a) right before the watcher timeout - for now we simply set it 2s before
// the deadline
//
// (b) roughly every minute
//
// (c) immediately when the bookmarkAfterResourceVersion wasn't confirmed;
// in this scenario the client has already seen (or is in the process of receiving)
// all initial data and is interested in seeing
// a specific RV value (aka. the bookmarkAfterResourceVersion);
// since we don't know when the cacher will see that RV, we increase the frequency
//
// (b) gives us periodicity if the watch breaks due to unexpected
// conditions, (a) ensures that on timeout the watcher is as close to
// now as possible - this covers 99% of cases.
if !c.wasBookmarkAfterRvReceived() {
return time.Time{}, true // schedule immediately
}
heartbeatTime := now.Add(bookmarkFrequency)
if c.deadline.IsZero() {
// Timeout is set by our client libraries (e.g. reflector) as well as defaulted by
// apiserver if properly configured. So this shouldn't happen in practice.
return heartbeatTime, true
}
if pretimeoutTime := c.deadline.Add(-2 * time.Second); pretimeoutTime.Before(heartbeatTime) {
heartbeatTime = pretimeoutTime
}
if heartbeatTime.Before(now) {
return time.Time{}, false
}
return heartbeatTime, true
}
// wasBookmarkAfterRvReceived is the same as wasBookmarkAfterRvReceivedLocked, but acquires the lock first
func (c *cacheWatcher) wasBookmarkAfterRvReceived() bool {
c.stateMutex.Lock()
defer c.stateMutex.Unlock()
return c.wasBookmarkAfterRvReceivedLocked()
}
// wasBookmarkAfterRvReceivedLocked checks if the given cacheWatcher
// has seen a bookmark event >= bookmarkAfterResourceVersion
func (c *cacheWatcher) wasBookmarkAfterRvReceivedLocked() bool {
return c.state != cacheWatcherWaitingForBookmark
}
// markBookmarkAfterRvAsReceived indicates that the given cacheWatcher
// has seen a bookmark event >= bookmarkAfterResourceVersion
func (c *cacheWatcher) markBookmarkAfterRvAsReceived(event *watchCacheEvent) {
if event.Type == watch.Bookmark {
c.stateMutex.Lock()
defer c.stateMutex.Unlock()
if c.wasBookmarkAfterRvReceivedLocked() {
return
}
// bookmark events are scheduled by the startDispatchingBookmarkEvents method;
// since we received a bookmark event, we have converged
// towards the expected RV and it is okay to update the state so that
// this watcher can be scheduled for regular bookmark events
c.state = cacheWatcherBookmarkReceived
}
}
// wasBookmarkAfterRvSentLocked checks if a bookmark event
// with an RV >= the bookmarkAfterResourceVersion has been sent by this watcher
func (c *cacheWatcher) wasBookmarkAfterRvSentLocked() bool {
return c.state == cacheWatcherBookmarkSent
}
// wasBookmarkAfterRvSent is the same as wasBookmarkAfterRvSentLocked, but acquires the lock first
func (c *cacheWatcher) wasBookmarkAfterRvSent() bool {
c.stateMutex.Lock()
defer c.stateMutex.Unlock()
return c.wasBookmarkAfterRvSentLocked()
}
// markBookmarkAfterRvSent indicates that the given cacheWatcher
// has sent a bookmark event with an RV >= the bookmarkAfterResourceVersion
//
// this function relies on the fact that the nonblockingAdd method
// won't admit a bookmark event with an RV < the bookmarkAfterResourceVersion
// so the first received bookmark event is considered to match the bookmarkAfterResourceVersion
func (c *cacheWatcher) markBookmarkAfterRvSent(event *watchCacheEvent) {
// note that bookmark events are not common, so we will acquire the lock only every ~60 seconds or so
if event.Type == watch.Bookmark {
c.stateMutex.Lock()
defer c.stateMutex.Unlock()
if !c.wasBookmarkAfterRvSentLocked() {
c.state = cacheWatcherBookmarkSent
}
}
}
// setBookmarkAfterResourceVersion sets the bookmarkAfterResourceVersion and the state associated with it
func (c *cacheWatcher) setBookmarkAfterResourceVersion(bookmarkAfterResourceVersion uint64) {
state := cacheWatcherWaitingForBookmark
if bookmarkAfterResourceVersion == 0 {
state = cacheWatcherBookmarkSent // if no specific RV was requested we assume no-op
}
c.state = state
c.bookmarkAfterResourceVersion = bookmarkAfterResourceVersion
}
// setDrainInputBufferLocked, if called with true, indicates that we should delay closing this watcher
// until we send all events residing in the input buffer.
func (c *cacheWatcher) setDrainInputBufferLocked(drain bool) {
c.drainInputBuffer = drain
}
// isDoneChannelClosedLocked checks if the c.done channel is closed
func (c *cacheWatcher) isDoneChannelClosedLocked() bool {
select {
case <-c.done:
return true
default:
}
return false
}
func getMutableObject(object runtime.Object) runtime.Object {
if _, ok := object.(*cachingObject); ok {
// It is safe to return without deep-copy, because the underlying
// object will lazily perform deep-copy on the first try to change
// any of its fields.
return object
}
return object.DeepCopyObject()
}
func updateResourceVersion(object runtime.Object, versioner storage.Versioner, resourceVersion uint64) {
if err := versioner.UpdateObject(object, resourceVersion); err != nil {
utilruntime.HandleError(fmt.Errorf("failure to version api object (%d) %#v: %v", resourceVersion, object, err))
}
}
func (c *cacheWatcher) convertToWatchEvent(event *watchCacheEvent) *watch.Event {
if event.Type == watch.Bookmark {
e := &watch.Event{Type: watch.Bookmark, Object: event.Object.DeepCopyObject()}
if !c.wasBookmarkAfterRvSent() {
if err := storage.AnnotateInitialEventsEndBookmark(e.Object); err != nil {
utilruntime.HandleError(fmt.Errorf("error while accessing object's metadata gr: %v, identifier: %v, obj: %#v, err: %v", c.groupResource, c.identifier, e.Object, err))
return nil
}
}
return e
}
curObjPasses := event.Type != watch.Deleted && c.filter(event.Key, event.ObjLabels, event.ObjFields)
oldObjPasses := false
if event.PrevObject != nil {
oldObjPasses = c.filter(event.Key, event.PrevObjLabels, event.PrevObjFields)
}
if !curObjPasses && !oldObjPasses {
// Watcher is not interested in that object.
return nil
}
switch {
case curObjPasses && !oldObjPasses:
return &watch.Event{Type: watch.Added, Object: getMutableObject(event.Object)}
case curObjPasses && oldObjPasses:
return &watch.Event{Type: watch.Modified, Object: getMutableObject(event.Object)}
case !curObjPasses && oldObjPasses:
// return a delete event with the previous object content, but with the event's resource version
oldObj := getMutableObject(event.PrevObject)
// We know that if oldObj is cachingObject (which can only be set via
// setCachingObjects), its resourceVersion is already set correctly and
// we don't need to update it. However, since cachingObject efficiently
// handles noop updates, we avoid this microoptimization here.
updateResourceVersion(oldObj, c.versioner, event.ResourceVersion)
return &watch.Event{Type: watch.Deleted, Object: oldObj}
}
return nil
}
// NOTE: sendWatchCacheEvent is assumed to not modify <event> !!!
func (c *cacheWatcher) sendWatchCacheEvent(event *watchCacheEvent) {
watchEvent := c.convertToWatchEvent(event)
if watchEvent == nil {
// Watcher is not interested in that object.
return
}
// We need to ensure that if we put event X to the c.result, all
// previous events were already put into it before, no matter whether
// c.done is closed or not.
// Thus we cannot simply select from c.done and c.result, as this
// would give us non-determinism.
// At the same time, we don't want to block infinitely on putting
// to c.result, when c.done is already closed.
//
// This ensures that with c.done already closed, we go at most once
// into the next select after this. With that, no matter which
// statement we choose there, we will deliver only consecutive
// events.
select {
case <-c.done:
return
default:
}
select {
case c.result <- *watchEvent:
c.markBookmarkAfterRvSent(event)
case <-c.done:
}
}
func (c *cacheWatcher) processInterval(ctx context.Context, cacheInterval *watchCacheInterval, resourceVersion uint64) {
defer utilruntime.HandleCrash()
defer close(c.result)
defer c.Stop()
// Check how long we are processing initEvents.
// As long as these are not processed, we are not processing
// any incoming events, so if it takes long, we may actually
// block all watchers for some time.
// TODO: From the logs it seems that processing times of up to 1s
// can occur, which is very long. However, this doesn't depend
// that much on the number of initEvents. E.g. from a
// 2000-node Kubemark run we have logs like these:
// ... processing 13862 initEvents took 66.808689ms
// ... processing 14040 initEvents took 993.532539ms
// We should understand what is blocking us in those cases (e.g.
// is it lack of CPU, network, or something else) and potentially
// consider increasing the size of the result buffer in those cases.
const initProcessThreshold = 500 * time.Millisecond
startTime := time.Now()
initEventCount := 0
for {
event, err := cacheInterval.Next()
if err != nil {
// An error indicates that the cache interval
// has been invalidated and can no longer serve
// events.
//
// Initially we considered sending an "out-of-history"
// Error event in this case, but because historically
// such events weren't sent out of the watchCache, we
// decided not to. This is still ok, because on watch
// closure, the watcher will try to re-instantiate the
// watch and then will get an explicit "out-of-history"
// window. There is potential for optimization, but for
// now, in order to be on the safe side and not break
// custom clients, the cost of it is something that we
// are fully accepting.
klog.Warningf("couldn't retrieve watch event to serve: %#v", err)
return
}
if event == nil {
break
}
c.sendWatchCacheEvent(event)
// With some events already sent, update resourceVersion so that
// events that were buffered and not yet processed won't be delivered
// to this watcher a second time, which would cause it to go back in time.
//
// There is one case where events are not necessarily ordered by
// resourceVersion: watching from resourceVersion=0, which at the
// beginning returns the state of each object.
// To handle it, we need to max the resource version with the one
// that we have seen so far.
if event.ResourceVersion > resourceVersion {
resourceVersion = event.ResourceVersion
}
initEventCount++
}
if initEventCount > 0 {
metrics.InitCounter.WithLabelValues(c.groupResource.String()).Add(float64(initEventCount))
}
processingTime := time.Since(startTime)
if processingTime > initProcessThreshold {
klog.V(2).Infof("processing %d initEvents of %s (%s) took %v", initEventCount, c.groupResource, c.identifier, processingTime)
}
c.process(ctx, resourceVersion)
}
func (c *cacheWatcher) process(ctx context.Context, resourceVersion uint64) {
// At this point we already start processing incoming watch events.
// However, the init events can still be being processed, because their
// serialization and sending to the client happen asynchronously.
// TODO: As described in the KEP, we would like to estimate that by delaying
// the initialization signal proportionally to the number of events to
// process, but we're leaving this to the tuning phase.
utilflowcontrol.WatchInitialized(ctx)
for {
select {
case event, ok := <-c.input:
if !ok {
return
}
// only send events newer than resourceVersion
// or a bookmark event with an RV equal to resourceVersion
// if we haven't sent one to the client
if event.ResourceVersion > resourceVersion || (event.Type == watch.Bookmark && event.ResourceVersion == resourceVersion && !c.wasBookmarkAfterRvSent()) {
c.sendWatchCacheEvent(event)
}
case <-ctx.Done():
return
}
}
}
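
The bookmark bookkeeping above is a small three-state machine (waiting -> received -> sent) guarded by stateMutex. A minimal, self-contained sketch of that pattern, with illustrative names rather than the upstream API:

package example

import "sync"

const (
    waitingForBookmark = iota // still waiting for a bookmark with the requested RV
    bookmarkReceived          // a qualifying bookmark entered the input buffer
    bookmarkSent              // the bookmark reached the client
)

// bookmarkTracker mirrors the state/stateMutex pair on cacheWatcher.
type bookmarkTracker struct {
    mu    sync.Mutex
    state int
}

// markReceived performs the waiting -> received transition; later states win.
func (b *bookmarkTracker) markReceived() {
    b.mu.Lock()
    defer b.mu.Unlock()
    if b.state == waitingForBookmark {
        b.state = bookmarkReceived
    }
}

// markSent performs the received -> sent transition once delivery succeeds.
func (b *bookmarkTracker) markSent() {
    b.mu.Lock()
    defer b.mu.Unlock()
    if b.state == bookmarkReceived {
        b.state = bookmarkSent
    }
}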

File diff suppressed because it is too large.


@@ -148,6 +148,10 @@ func (o *cachingObject) CacheEncode(id runtime.Identifier, encode func(runtime.O
if result.err != nil {
return result.err
}
if b, support := w.(runtime.Splice); support {
b.Splice(result.raw)
return nil
}
_, err := w.Write(result.raw)
return err
}
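
The hunk above adds a fast path: when the destination writer can splice, the cached encoding is handed over without a byte-by-byte copy. A sketch of the idea, with a local interface standing in for runtime.Splice (an assumed shape, not the exact upstream definition):

package example

import "io"

// splicer stands in for the runtime.Splice capability: writers that can
// adopt an already-encoded buffer instead of copying it.
type splicer interface {
    io.Writer
    Splice([]byte)
}

// spliceBuffer is a writer that supports the zero-copy path.
type spliceBuffer struct{ buf []byte }

func (s *spliceBuffer) Write(p []byte) (int, error) {
    s.buf = append(s.buf, p...)
    return len(p), nil
}

// Splice adopts raw wholesale, avoiding the copy Write would do.
func (s *spliceBuffer) Splice(raw []byte) { s.buf = raw }

// writeCached mirrors the CacheEncode hunk: splice when supported, else copy.
func writeCached(w io.Writer, raw []byte) error {
    if b, ok := w.(splicer); ok {
        b.Splice(raw)
        return nil
    }
    _, err := w.Write(raw)
    return err
}

The type assertion keeps the fallback intact, so writers that don't implement the interface keep working unchanged.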


@@ -0,0 +1,77 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"context"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/storage"
"k8s.io/client-go/tools/cache"
)
// listerWatcher wraps storage.Interface to expose it as a cache.ListerWatcher.
type listerWatcher struct {
storage storage.Interface
resourcePrefix string
newListFunc func() runtime.Object
}
// NewListerWatcher returns a storage.Interface backed ListerWatcher.
func NewListerWatcher(storage storage.Interface, resourcePrefix string, newListFunc func() runtime.Object) cache.ListerWatcher {
return &listerWatcher{
storage: storage,
resourcePrefix: resourcePrefix,
newListFunc: newListFunc,
}
}
// Implements cache.ListerWatcher interface.
func (lw *listerWatcher) List(options metav1.ListOptions) (runtime.Object, error) {
list := lw.newListFunc()
pred := storage.SelectionPredicate{
Label: labels.Everything(),
Field: fields.Everything(),
Limit: options.Limit,
Continue: options.Continue,
}
storageOpts := storage.ListOptions{
ResourceVersionMatch: options.ResourceVersionMatch,
Predicate: pred,
Recursive: true,
}
if err := lw.storage.GetList(context.TODO(), lw.resourcePrefix, storageOpts, list); err != nil {
return nil, err
}
return list, nil
}
// Implements cache.ListerWatcher interface.
func (lw *listerWatcher) Watch(options metav1.ListOptions) (watch.Interface, error) {
opts := storage.ListOptions{
ResourceVersion: options.ResourceVersion,
Predicate: storage.Everything,
Recursive: true,
ProgressNotify: true,
}
return lw.storage.Watch(context.TODO(), lw.resourcePrefix, opts)
}
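
A cache.ListerWatcher built this way is typically consumed list-first: the list establishes a resourceVersion, and the watch resumes from it. A hypothetical call site (runListWatch is illustrative and not part of the diff):

package example

import (
    "fmt"

    "k8s.io/apimachinery/pkg/api/meta"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/tools/cache"
)

// runListWatch lists once, extracts the list's resourceVersion, and then
// watches from that point - the handshake a Reflector performs internally.
func runListWatch(lw cache.ListerWatcher) error {
    list, err := lw.List(metav1.ListOptions{})
    if err != nil {
        return err
    }
    accessor, err := meta.ListAccessor(list)
    if err != nil {
        return err
    }
    w, err := lw.Watch(metav1.ListOptions{ResourceVersion: accessor.GetResourceVersion()})
    if err != nil {
        return err
    }
    defer w.Stop()
    for ev := range w.ResultChan() {
        fmt.Println("event:", ev.Type)
    }
    return nil
}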


@@ -74,6 +74,17 @@ var (
[]string{"resource"},
)
EventsReceivedCounter = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "events_received_total",
Help: "Counter of events received in watch cache broken by resource type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
EventsCounter = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: namespace,
@@ -147,6 +158,7 @@ func Register() {
legacyregistry.MustRegister(listCacheNumFetched)
legacyregistry.MustRegister(listCacheNumReturned)
legacyregistry.MustRegister(InitCounter)
legacyregistry.MustRegister(EventsReceivedCounter)
legacyregistry.MustRegister(EventsCounter)
legacyregistry.MustRegister(TerminatedWatchersCounter)
legacyregistry.MustRegister(watchCacheCapacityIncreaseTotal)
@@ -167,7 +179,7 @@ func RecordListCacheMetrics(resourcePrefix, indexName string, numFetched, numRet
func RecordsWatchCacheCapacityChange(objType string, old, new int) {
WatchCacheCapacity.WithLabelValues(objType).Set(float64(new))
if old < new {
WatchCacheCapacity.WithLabelValues(objType).Inc()
watchCacheCapacityIncreaseTotal.WithLabelValues(objType).Inc()
return
}
watchCacheCapacityDecreaseTotal.WithLabelValues(objType).Inc()
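
This hunk fixes a mixup: capacity increases were bumping the gauge a second time instead of the dedicated increase counter. The intended shape, sketched with plain fields in place of the component-base metric types:

package example

// capacityRecorder pairs a gauge (current capacity) with monotonic
// counters for grow/shrink events, mirroring the corrected logic above.
type capacityRecorder struct {
    capacity  float64 // gauge: WatchCacheCapacity.Set(new)
    increases int     // counter: watchCacheCapacityIncreaseTotal
    decreases int     // counter: watchCacheCapacityDecreaseTotal
}

func (r *capacityRecorder) recordChange(old, new int) {
    r.capacity = float64(new)
    if old < new {
        r.increases++
        return
    }
    r.decreases++
}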


@@ -17,6 +17,7 @@ limitations under the License.
package cacher
import (
"context"
"fmt"
"sync"
)
@@ -30,67 +31,127 @@ const (
)
// ready is a three-state condition variable that blocks until it is Ready, if it is not Stopped.
// Its initial state is Pending and its state machine diagram is as follows.
//
// Pending <------> Ready -----> Stopped
//
//    |                            ^
//    └----------------------------┘
type ready struct {
state status
c *sync.Cond
state status // represents the state of the variable
generation int // represents the number of times we have transitioned to Ready
lock sync.RWMutex // protects the state and generation variables
restartLock sync.Mutex // protects the transition from Ready to Pending where the channel is recreated
waitCh chan struct{} // blocks until the state is Ready or Stopped
}
func newReady() *ready {
return &ready{
c: sync.NewCond(&sync.RWMutex{}),
state: Pending,
waitCh: make(chan struct{}),
state: Pending,
}
}
// done returns a channel that is closed once the state is Ready or Stopped
func (r *ready) done() chan struct{} {
r.restartLock.Lock()
defer r.restartLock.Unlock()
return r.waitCh
}
// wait blocks until it is Ready or Stopped, it returns an error if is Stopped.
func (r *ready) wait() error {
r.c.L.Lock()
defer r.c.L.Unlock()
for r.state == Pending {
r.c.Wait()
}
switch r.state {
case Ready:
return nil
case Stopped:
return fmt.Errorf("apiserver cacher is stopped")
default:
return fmt.Errorf("unexpected apiserver cache state: %v", r.state)
func (r *ready) wait(ctx context.Context) error {
_, err := r.waitAndReadGeneration(ctx)
return err
}
// waitAndReadGeneration blocks until it is Ready or Stopped and returns the number
// of times we entered the Ready state if Ready, and an error otherwise.
func (r *ready) waitAndReadGeneration(ctx context.Context) (int, error) {
for {
// r.done() only blocks if state is Pending
select {
case <-ctx.Done():
return 0, ctx.Err()
case <-r.done():
}
r.lock.RLock()
switch r.state {
case Pending:
// since we allow switching between the states Pending and Ready,
// if there is a quick transition from Pending -> Ready -> Pending
// a process that was waiting can get unblocked and see a Pending
// state again. If the state is Pending we have to wait again to
// avoid an inconsistent state in the system, with some processes not
// waiting despite the state having moved back to Pending.
r.lock.RUnlock()
case Ready:
generation := r.generation
r.lock.RUnlock()
return generation, nil
case Stopped:
r.lock.RUnlock()
return 0, fmt.Errorf("apiserver cacher is stopped")
default:
r.lock.RUnlock()
return 0, fmt.Errorf("unexpected apiserver cache state: %v", r.state)
}
}
}
// check returns true only if it is Ready.
func (r *ready) check() bool {
// TODO: Make check() function more sophisticated, in particular
// allow it to behave as "waitWithTimeout".
rwMutex := r.c.L.(*sync.RWMutex)
rwMutex.RLock()
defer rwMutex.RUnlock()
return r.state == Ready
_, ok := r.checkAndReadGeneration()
return ok
}
// checkAndReadGeneration returns the current generation and whether it is Ready.
func (r *ready) checkAndReadGeneration() (int, bool) {
r.lock.RLock()
defer r.lock.RUnlock()
return r.generation, r.state == Ready
}
// set sets the state to Pending (false) or Ready (true); it has no effect if the state is Stopped.
func (r *ready) set(ok bool) {
r.c.L.Lock()
defer r.c.L.Unlock()
r.lock.Lock()
defer r.lock.Unlock()
if r.state == Stopped {
return
}
if ok {
if ok && r.state == Pending {
r.state = Ready
} else {
r.generation++
select {
case <-r.waitCh:
default:
close(r.waitCh)
}
} else if !ok && r.state == Ready {
// creating the waitCh can be racy if
// something enters the wait() method
select {
case <-r.waitCh:
r.restartLock.Lock()
r.waitCh = make(chan struct{})
r.restartLock.Unlock()
default:
}
r.state = Pending
}
r.c.Broadcast()
}
// stop the condition variable and set it as Stopped. This state is irreversible.
func (r *ready) stop() {
r.c.L.Lock()
defer r.c.L.Unlock()
r.lock.Lock()
defer r.lock.Unlock()
if r.state != Stopped {
r.state = Stopped
r.c.Broadcast()
}
select {
case <-r.waitCh:
default:
close(r.waitCh)
}
}
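
The rewrite above replaces a sync.Cond with a wait channel that is closed on Ready and recreated on the Ready -> Pending transition. A compact sketch of that channel discipline (a simplification of the real type, without generations or the Stopped state):

package example

import (
    "context"
    "sync"
)

// gate is a two-state simplification of ready: waiters block on a channel
// that is closed when the gate opens and replaced when it closes again.
type gate struct {
    mu     sync.RWMutex
    ready  bool
    waitCh chan struct{}
}

func newGate() *gate { return &gate{waitCh: make(chan struct{})} }

func (g *gate) set(ok bool) {
    g.mu.Lock()
    defer g.mu.Unlock()
    switch {
    case ok && !g.ready:
        g.ready = true
        close(g.waitCh) // release everyone currently waiting
    case !ok && g.ready:
        g.ready = false
        g.waitCh = make(chan struct{}) // future waiters block on a fresh channel
    }
}

func (g *gate) wait(ctx context.Context) error {
    for {
        g.mu.RLock()
        ch, ready := g.waitCh, g.ready
        g.mu.RUnlock()
        if ready {
            return nil
        }
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-ch:
            // Re-check: a quick Ready -> Pending flip may have put us back
            // into the blocking state - exactly the race described in
            // waitAndReadGeneration above.
        }
    }
}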


@@ -30,8 +30,10 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/features"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/cacher/metrics"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/tools/cache"
"k8s.io/component-base/tracing"
"k8s.io/klog/v2"
@@ -156,14 +158,15 @@ type watchCache struct {
// getAttrsFunc is used to get labels and fields of an object.
getAttrsFunc func(runtime.Object) (labels.Set, fields.Set, error)
// cache is used as a cyclic buffer - its first element (with the smallest
// resourceVersion) is defined by startIndex, its last element is defined
// by endIndex (if cache is full it will be startIndex + capacity).
// Both startIndex and endIndex can be greater than buffer capacity -
// you should always apply modulo capacity to get an index in cache array.
// cache is used as a cyclic buffer - the "current" contents of it are
// stored in [start_index%capacity, end_index%capacity) - so the
// "current" contents have exactly end_index-start_index items.
cache []*watchCacheEvent
startIndex int
endIndex int
// removedEventSinceRelist holds information on whether any of the events
// were already removed from the `cache` cyclic buffer since the last relist
removedEventSinceRelist bool
// store will effectively support LIST operation from the "end of cache
// history" i.e. from the moment just after the newest cached watched event.
@@ -195,6 +198,10 @@ type watchCache struct {
// For testing cache interval invalidation.
indexValidator indexValidator
// Requests progress notification if there are requests waiting for watch
// to be fresh
waitingUntilFresh *conditionalProgressRequester
}
func newWatchCache(
@@ -203,8 +210,9 @@ func newWatchCache(
getAttrsFunc func(runtime.Object) (labels.Set, fields.Set, error),
versioner storage.Versioner,
indexers *cache.Indexers,
clock clock.Clock,
groupResource schema.GroupResource) *watchCache {
clock clock.WithTicker,
groupResource schema.GroupResource,
progressRequester *conditionalProgressRequester) *watchCache {
wc := &watchCache{
capacity: defaultLowerBoundCapacity,
keyFunc: keyFunc,
@@ -221,6 +229,7 @@ func newWatchCache(
clock: clock,
versioner: versioner,
groupResource: groupResource,
waitingUntilFresh: progressRequester,
}
metrics.WatchCacheCapacity.WithLabelValues(groupResource.String()).Set(float64(wc.capacity))
wc.cond = sync.NewCond(wc.RLocker())
@@ -280,6 +289,8 @@ func (w *watchCache) objectToVersionedRuntimeObject(obj interface{}) (runtime.Ob
// processEvent is safe as long as there is at most one call to it in flight
// at any point in time.
func (w *watchCache) processEvent(event watch.Event, resourceVersion uint64, updateFunc func(*storeElement) error) error {
metrics.EventsReceivedCounter.WithLabelValues(w.groupResource.String()).Inc()
key, err := w.keyFunc(event.Object)
if err != nil {
return fmt.Errorf("couldn't compute key: %v", err)
@@ -302,7 +313,7 @@ func (w *watchCache) processEvent(event watch.Event, resourceVersion uint64, upd
if err := func() error {
// TODO: We should consider moving this lock below after the watchCacheEvent
// is created. In such situation, the only problematic scenario is Replace(
// is created. In such situation, the only problematic scenario is Replace()
// happening after getting object from store and before acquiring a lock.
// Maybe introduce another lock for this purpose.
w.Lock()
@@ -344,6 +355,7 @@ func (w *watchCache) updateCache(event *watchCacheEvent) {
if w.isCacheFullLocked() {
// Cache is full - remove the oldest element.
w.startIndex++
w.removedEventSinceRelist = true
}
w.cache[w.endIndex%w.capacity] = event
w.endIndex++
@@ -402,6 +414,7 @@ func (w *watchCache) UpdateResourceVersion(resourceVersion string) {
w.Lock()
defer w.Unlock()
w.resourceVersion = rv
w.cond.Broadcast()
}()
// Avoid calling event handler under lock.
@@ -463,25 +476,57 @@ func (w *watchCache) waitUntilFreshAndBlock(ctx context.Context, resourceVersion
return nil
}
type sortableStoreElements []interface{}
func (s sortableStoreElements) Len() int {
return len(s)
}
func (s sortableStoreElements) Less(i, j int) bool {
return s[i].(*storeElement).Key < s[j].(*storeElement).Key
}
func (s sortableStoreElements) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
// WaitUntilFreshAndList returns list of pointers to `storeElement` objects along
// with their ResourceVersion and the name of the index, if any, that was used.
func (w *watchCache) WaitUntilFreshAndList(ctx context.Context, resourceVersion uint64, matchValues []storage.MatchValue) ([]interface{}, uint64, string, error) {
err := w.waitUntilFreshAndBlock(ctx, resourceVersion)
var err error
if utilfeature.DefaultFeatureGate.Enabled(features.ConsistentListFromCache) && w.notFresh(resourceVersion) {
w.waitingUntilFresh.Add()
err = w.waitUntilFreshAndBlock(ctx, resourceVersion)
w.waitingUntilFresh.Remove()
} else {
err = w.waitUntilFreshAndBlock(ctx, resourceVersion)
}
defer w.RUnlock()
if err != nil {
return nil, 0, "", err
}
// This isn't the place where we do "final filtering" - only some "prefiltering" is happening here. So the only
// requirement here is to NOT miss anything that should be returned. We can return as many non-matching items as we
// want - they will be filtered out later. The fact that we return fewer items is only a further performance improvement.
// TODO: if multiple indexes match, return the one with the fewest items, so as to do as much filtering as possible.
for _, matchValue := range matchValues {
if result, err := w.store.ByIndex(matchValue.IndexName, matchValue.Value); err == nil {
return result, w.resourceVersion, matchValue.IndexName, nil
result, rv, index, err := func() ([]interface{}, uint64, string, error) {
// This isn't the place where we do "final filtering" - only some "prefiltering" is happening here. So the only
// requirement here is to NOT miss anything that should be returned. We can return as many non-matching items as we
// want - they will be filtered out later. The fact that we return fewer items is only a further performance improvement.
// TODO: if multiple indexes match, return the one with the fewest items, so as to do as much filtering as possible.
for _, matchValue := range matchValues {
if result, err := w.store.ByIndex(matchValue.IndexName, matchValue.Value); err == nil {
return result, w.resourceVersion, matchValue.IndexName, nil
}
}
}
return w.store.List(), w.resourceVersion, "", nil
return w.store.List(), w.resourceVersion, "", nil
}()
sort.Sort(sortableStoreElements(result))
return result, rv, index, err
}
func (w *watchCache) notFresh(resourceVersion uint64) bool {
w.RLock()
defer w.RUnlock()
return resourceVersion > w.resourceVersion
}
// WaitUntilFreshAndGet returns a pointer to a <storeElement> object.
@@ -551,8 +596,15 @@ func (w *watchCache) Replace(objs []interface{}, resourceVersion string) error {
w.Lock()
defer w.Unlock()
w.startIndex = 0
w.endIndex = 0
// Ensure startIndex never decreases, so that existing watchCacheInterval
// instances get "invalid" errors if the try to download from the buffer
// using their own start/end indexes calculated from previous buffer
// content.
// Empty the cyclic buffer, ensuring startIndex doesn't decrease.
w.startIndex = w.endIndex
w.removedEventSinceRelist = false
if err := w.store.Replace(toReplace, resourceVersion); err != nil {
return err
}
@@ -578,8 +630,8 @@ func (w *watchCache) Resync() error {
}
func (w *watchCache) currentCapacity() int {
w.Lock()
defer w.Unlock()
w.RLock()
defer w.RUnlock()
return w.capacity
}
@@ -643,7 +695,7 @@ func (w *watchCache) getAllEventsSinceLocked(resourceVersion uint64) (*watchCach
size := w.endIndex - w.startIndex
var oldest uint64
switch {
case w.listResourceVersion > 0 && w.startIndex == 0:
case w.listResourceVersion > 0 && !w.removedEventSinceRelist:
// If no event was removed from the buffer since last relist, the oldest watch
// event we can deliver is one greater than the resource version of the list.
oldest = w.listResourceVersion + 1
@@ -665,11 +717,7 @@ func (w *watchCache) getAllEventsSinceLocked(resourceVersion uint64) (*watchCach
// current state and only then start watching from that point.
//
// TODO: In v2 api, we should stop returning the current state - #13969.
ci, err := newCacheIntervalFromStore(w.resourceVersion, w.store, w.getAttrsFunc)
if err != nil {
return nil, err
}
return ci, nil
return w.getIntervalFromStoreLocked()
}
if resourceVersion < oldest-1 {
return nil, errors.NewResourceExpired(fmt.Sprintf("too old resource version: %d (%d)", resourceVersion, oldest-1))
@@ -686,3 +734,14 @@ func (w *watchCache) getAllEventsSinceLocked(resourceVersion uint64) (*watchCach
ci := newCacheInterval(w.startIndex+first, w.endIndex, indexerFunc, w.indexValidator, &w.RWMutex)
return ci, nil
}
// getIntervalFromStoreLocked returns a watchCacheInterval
// that covers the entire storage state.
// This function assumes to be called under the watchCache lock.
func (w *watchCache) getIntervalFromStoreLocked() (*watchCacheInterval, error) {
ci, err := newCacheIntervalFromStore(w.resourceVersion, w.store, w.getAttrsFunc)
if err != nil {
return nil, err
}
return ci, nil
}
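
The cyclic buffer bookkeeping above relies on monotonically growing indexes: the live window is [startIndex%capacity, endIndex%capacity), and Replace now empties the buffer by advancing startIndex instead of zeroing both indexes. A toy version of that arithmetic:

package example

// ring is a toy version of the watchCache event buffer. start and end only
// ever grow; slot positions are taken modulo the capacity.
type ring struct {
    buf                     []interface{}
    start, end              int
    removedEventSinceRelist bool
}

func newRing(capacity int) *ring { return &ring{buf: make([]interface{}, capacity)} }

func (r *ring) full() bool { return r.end-r.start == len(r.buf) }

func (r *ring) add(e interface{}) {
    if r.full() {
        r.start++ // drop the oldest event
        r.removedEventSinceRelist = true
    }
    r.buf[r.end%len(r.buf)] = e
    r.end++
}

// replace empties the buffer without decreasing start, so intervals computed
// from earlier indexes are detected as invalid instead of reading stale slots.
func (r *ring) replace() {
    r.start = r.end
    r.removedEventSinceRelist = false
}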


@@ -0,0 +1,121 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cacher
import (
"context"
"sync"
"time"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
)
const (
// progressRequestPeriod determines the period of requesting progress
// from etcd when there is a request waiting for the watch cache to be fresh.
progressRequestPeriod = 100 * time.Millisecond
)
func newConditionalProgressRequester(requestWatchProgress WatchProgressRequester, clock TickerFactory) *conditionalProgressRequester {
pr := &conditionalProgressRequester{
clock: clock,
requestWatchProgress: requestWatchProgress,
}
pr.cond = sync.NewCond(pr.mux.RLocker())
return pr
}
type WatchProgressRequester func(ctx context.Context) error
type TickerFactory interface {
NewTicker(time.Duration) clock.Ticker
}
// conditionalProgressRequester will request progress notification if there
// is a request waiting for the watch cache to be fresh.
type conditionalProgressRequester struct {
clock TickerFactory
requestWatchProgress WatchProgressRequester
mux sync.RWMutex
cond *sync.Cond
waiting int
stopped bool
}
func (pr *conditionalProgressRequester) Run(stopCh <-chan struct{}) {
ctx := wait.ContextForChannel(stopCh)
go func() {
defer utilruntime.HandleCrash()
<-stopCh
pr.mux.Lock()
defer pr.mux.Unlock()
pr.stopped = true
pr.cond.Signal()
}()
ticker := pr.clock.NewTicker(progressRequestPeriod)
defer ticker.Stop()
for {
stopped := func() bool {
pr.mux.RLock()
defer pr.mux.RUnlock()
for pr.waiting == 0 && !pr.stopped {
pr.cond.Wait()
}
return pr.stopped
}()
if stopped {
return
}
select {
case <-ticker.C():
shouldRequest := func() bool {
pr.mux.RLock()
defer pr.mux.RUnlock()
return pr.waiting > 0 && !pr.stopped
}()
if !shouldRequest {
continue
}
err := pr.requestWatchProgress(ctx)
if err != nil {
klog.V(4).InfoS("Error requesting bookmark", "err", err)
}
case <-stopCh:
return
}
}
}
func (pr *conditionalProgressRequester) Add() {
pr.mux.Lock()
defer pr.mux.Unlock()
pr.waiting += 1
pr.cond.Signal()
}
func (pr *conditionalProgressRequester) Remove() {
pr.mux.Lock()
defer pr.mux.Unlock()
pr.waiting -= 1
pr.cond.Signal()
}
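
Waiters bracket their blocking call with Add/Remove; while the count is non-zero, the loop above asks etcd for a progress notification every 100ms so the cache's resourceVersion can catch up. A hypothetical call site (waitUntilFresh is illustrative, not part of the diff):

package example

import "context"

// progressWaiter is the minimal surface of conditionalProgressRequester used here.
type progressWaiter interface {
    Add()
    Remove()
}

// waitUntilFresh registers interest in progress notifications for the
// duration of a blocking wait, the way WaitUntilFreshAndList does above.
func waitUntilFresh(ctx context.Context, pr progressWaiter, block func(context.Context) error) error {
    pr.Add()
    defer pr.Remove()
    return block(ctx)
}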


@@ -17,13 +17,16 @@ limitations under the License.
package storage
import (
"errors"
"fmt"
"k8s.io/apimachinery/pkg/api/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/validation/field"
)
var ErrResourceVersionSetOnCreate = errors.New("resourceVersion should not be set on objects to be created")
const (
ErrCodeKeyNotFound int = iota + 1
ErrCodeKeyExists
@@ -176,7 +179,7 @@ var tooLargeResourceVersionCauseMsg = "Too large resource version"
// NewTooLargeResourceVersionError returns a timeout error with the given retrySeconds for a request for
// a minimum resource version that is larger than the largest currently available resource version for a requested resource.
func NewTooLargeResourceVersionError(minimumResourceVersion, currentRevision uint64, retrySeconds int) error {
err := errors.NewTimeoutError(fmt.Sprintf("Too large resource version: %d, current: %d", minimumResourceVersion, currentRevision), retrySeconds)
err := apierrors.NewTimeoutError(fmt.Sprintf("Too large resource version: %d, current: %d", minimumResourceVersion, currentRevision), retrySeconds)
err.ErrStatus.Details.Causes = []metav1.StatusCause{
{
Type: metav1.CauseTypeResourceVersionTooLarge,
@@ -188,8 +191,8 @@ func NewTooLargeResourceVersionError(minimumResourceVersion, currentRevision uin
// IsTooLargeResourceVersion returns true if the error is a TooLargeResourceVersion error.
func IsTooLargeResourceVersion(err error) bool {
if !errors.IsTimeout(err) {
if !apierrors.IsTimeout(err) {
return false
}
return errors.HasStatusCause(err, metav1.CauseTypeResourceVersionTooLarge)
return apierrors.HasStatusCause(err, metav1.CauseTypeResourceVersionTooLarge)
}
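
The import rename frees the errors identifier for the standard library, which the new ErrResourceVersionSetOnCreate sentinel needs. The resulting idiom, in a reduced form:

package example

import (
    "errors"

    apierrors "k8s.io/apimachinery/pkg/api/errors"
)

// A stdlib sentinel and the aliased apimachinery helpers now coexist in one file.
var errRVSetOnCreate = errors.New("resourceVersion should not be set on objects to be created")

// isTimeout defers to the aliased import without shadowing "errors".
func isTimeout(err error) bool {
    return apierrors.IsTimeout(err)
}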


@@ -30,6 +30,17 @@ type event struct {
isDeleted bool
isCreated bool
isProgressNotify bool
// isInitialEventsEndBookmark helps us keep track
// of whether we have sent an annotated bookmark event.
//
// when this variable is set to true,
// a special annotation will be added
// to the bookmark event.
//
// note that we decided to extend the event
// struct field to eliminate contention
// between startWatching and processEvent
isInitialEventsEndBookmark bool
}
// parseKV converts a KeyValue retrieved from an initial sync() listing to a synthetic isCreated event.


@@ -28,6 +28,7 @@ type etcdHealth struct {
}
// EtcdHealthCheck decodes data returned from etcd /healthz handler.
// Deprecated: Validate health by passing storagebackend.Config directly to storagefactory.CreateProber.
func EtcdHealthCheck(data []byte) error {
obj := etcdHealth{}
if err := json.Unmarshal(data, &obj); err != nil {


@@ -47,8 +47,7 @@ func NewETCDLatencyTracker(delegate clientv3.KV) clientv3.KV {
// tracking function TrackStorageLatency is thread safe.
//
// NOTE: Compact is an asynchronous process and is not associated with
// any request, so we will not be tracking its latency.
type clientV3KVLatencyTracker struct {
clientv3.KV
}


@@ -17,11 +17,14 @@ limitations under the License.
package metrics
import (
"context"
"fmt"
"sync"
"time"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog/v2"
)
/*
@@ -47,23 +50,51 @@ var (
},
[]string{"operation", "type"},
)
etcdRequestCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "etcd_requests_total",
Help: "Etcd request counts for each operation and object type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"operation", "type"},
)
etcdRequestErrorCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "etcd_request_errors_total",
Help: "Etcd failed request counts for each operation and object type.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"operation", "type"},
)
objectCounts = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "apiserver_storage_objects",
Help: "Number of stored objects at the time of last check split by kind.",
Help: "Number of stored objects at the time of last check split by kind. In case of a fetching error, the value will be -1.",
StabilityLevel: compbasemetrics.STABLE,
},
[]string{"resource"},
)
dbTotalSize = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Subsystem: "apiserver",
Name: "storage_db_total_size_in_bytes",
Help: "Total size of the storage database file physically allocated in bytes.",
StabilityLevel: compbasemetrics.ALPHA,
Subsystem: "apiserver",
Name: "storage_db_total_size_in_bytes",
Help: "Total size of the storage database file physically allocated in bytes.",
StabilityLevel: compbasemetrics.ALPHA,
DeprecatedVersion: "1.28.0",
},
[]string{"endpoint"},
)
storageSizeDescription = compbasemetrics.NewDesc("apiserver_storage_size_bytes", "Size of the storage database file physically allocated in bytes.", []string{"cluster"}, nil, compbasemetrics.ALPHA, "")
storageMonitor = &monitorCollector{monitorGetter: func() ([]Monitor, error) { return nil, nil }}
etcdEventsReceivedCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Subsystem: "apiserver",
Name: "storage_events_received_total",
Help: "Number of etcd events received split by kind.",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
etcdBookmarkCounts = compbasemetrics.NewGaugeVec(
&compbasemetrics.GaugeOpts{
Name: "etcd_bookmark_counts",
@@ -113,6 +144,15 @@ var (
},
[]string{"resource"},
)
decodeErrorCounts = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Namespace: "apiserver",
Name: "storage_decode_errors_total",
Help: "Number of stored object decode errors split by object type",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"resource"},
)
)
var registerMetrics sync.Once
@@ -122,14 +162,18 @@ func Register() {
// Register the metrics.
registerMetrics.Do(func() {
legacyregistry.MustRegister(etcdRequestLatency)
legacyregistry.MustRegister(etcdRequestCounts)
legacyregistry.MustRegister(etcdRequestErrorCounts)
legacyregistry.MustRegister(objectCounts)
legacyregistry.MustRegister(dbTotalSize)
legacyregistry.CustomMustRegister(storageMonitor)
legacyregistry.MustRegister(etcdBookmarkCounts)
legacyregistry.MustRegister(etcdLeaseObjectCounts)
legacyregistry.MustRegister(listStorageCount)
legacyregistry.MustRegister(listStorageNumFetched)
legacyregistry.MustRegister(listStorageNumSelectorEvals)
legacyregistry.MustRegister(listStorageNumReturned)
legacyregistry.MustRegister(decodeErrorCounts)
})
}
@@ -138,9 +182,20 @@ func UpdateObjectCount(resourcePrefix string, count int64) {
objectCounts.WithLabelValues(resourcePrefix).Set(float64(count))
}
// RecordEtcdRequestLatency sets the etcd_request_duration_seconds metrics.
func RecordEtcdRequestLatency(verb, resource string, startTime time.Time) {
etcdRequestLatency.WithLabelValues(verb, resource).Observe(sinceInSeconds(startTime))
// RecordEtcdRequest updates and sets the etcd_request_duration_seconds,
// etcd_requests_total, and etcd_request_errors_total metrics.
func RecordEtcdRequest(verb, resource string, err error, startTime time.Time) {
v := []string{verb, resource}
etcdRequestLatency.WithLabelValues(v...).Observe(sinceInSeconds(startTime))
etcdRequestCounts.WithLabelValues(v...).Inc()
if err != nil {
etcdRequestErrorCounts.WithLabelValues(v...).Inc()
}
}
// RecordEtcdEvent updates the apiserver_storage_events_received_total metric.
func RecordEtcdEvent(resource string) {
etcdEventsReceivedCounts.WithLabelValues(resource).Inc()
}
// RecordEtcdBookmark updates the etcd_bookmark_counts metric.
@@ -148,21 +203,34 @@ func RecordEtcdBookmark(resource string) {
etcdBookmarkCounts.WithLabelValues(resource).Inc()
}
// RecordDecodeError increments the storage_decode_errors_total metric.
func RecordDecodeError(resource string) {
decodeErrorCounts.WithLabelValues(resource).Inc()
}
// Reset resets the etcd_request_duration_seconds metric.
func Reset() {
etcdRequestLatency.Reset()
}
// sinceInSeconds gets the time since the specified start in seconds.
func sinceInSeconds(start time.Time) float64 {
//
// This is a variable to facilitate testing.
var sinceInSeconds = func(start time.Time) float64 {
return time.Since(start).Seconds()
}
// UpdateEtcdDbSize sets the etcd_db_total_size_in_bytes metric.
// Deprecated: Metric etcd_db_total_size_in_bytes will be replaced with apiserver_storage_size_bytes
func UpdateEtcdDbSize(ep string, size int64) {
dbTotalSize.WithLabelValues(ep).Set(float64(size))
}
// SetStorageMonitorGetter sets monitor getter to allow monitoring etcd stats.
func SetStorageMonitorGetter(getter func() ([]Monitor, error)) {
storageMonitor.setGetter(getter)
}
// UpdateLeaseObjectCount sets the etcd_lease_object_counts metric.
func UpdateLeaseObjectCount(count int64) {
// Currently we only store one previous lease, since all the events have the same ttl.
@@ -177,3 +245,64 @@ func RecordStorageListMetrics(resource string, numFetched, numEvald, numReturned
listStorageNumSelectorEvals.WithLabelValues(resource).Add(float64(numEvald))
listStorageNumReturned.WithLabelValues(resource).Add(float64(numReturned))
}
type Monitor interface {
Monitor(ctx context.Context) (StorageMetrics, error)
Close() error
}
type StorageMetrics struct {
Size int64
}
type monitorCollector struct {
compbasemetrics.BaseStableCollector
mutex sync.Mutex
monitorGetter func() ([]Monitor, error)
}
func (m *monitorCollector) setGetter(monitorGetter func() ([]Monitor, error)) {
m.mutex.Lock()
defer m.mutex.Unlock()
m.monitorGetter = monitorGetter
}
func (m *monitorCollector) getGetter() func() ([]Monitor, error) {
m.mutex.Lock()
defer m.mutex.Unlock()
return m.monitorGetter
}
// DescribeWithStability implements compbasemetrics.StableCollector
func (c *monitorCollector) DescribeWithStability(ch chan<- *compbasemetrics.Desc) {
ch <- storageSizeDescription
}
// CollectWithStability implements compbasemetrics.StableCollector
func (c *monitorCollector) CollectWithStability(ch chan<- compbasemetrics.Metric) {
monitors, err := c.getGetter()()
if err != nil {
return
}
for i, m := range monitors {
cluster := fmt.Sprintf("etcd-%d", i)
klog.V(4).InfoS("Start collecting storage metrics", "cluster", cluster)
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
metrics, err := m.Monitor(ctx)
cancel()
m.Close()
if err != nil {
klog.InfoS("Failed to get storage metrics", "cluster", cluster, "err", err)
continue
}
metric, err := compbasemetrics.NewConstMetric(storageSizeDescription, compbasemetrics.GaugeValue, float64(metrics.Size), cluster)
if err != nil {
klog.ErrorS(err, "Failed to create metric", "cluster", cluster)
}
ch <- metric
}
}
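
RecordEtcdRequest folds what used to be a latency-only observation into one call that also feeds the request and error counters. A call site wraps any KV operation like this (getWithMetrics is illustrative; the same shape appears in the store.go hunks below):

package example

import (
    "context"
    "time"

    clientv3 "go.etcd.io/etcd/client/v3"
    "k8s.io/apiserver/pkg/storage/etcd3/metrics"
)

// getWithMetrics mirrors the new call-site shape: one RecordEtcdRequest call
// records latency, a request count, and, on failure, an error count for the
// verb/resource pair.
func getWithMetrics(ctx context.Context, kv clientv3.KV, key, resource string) (*clientv3.GetResponse, error) {
    startTime := time.Now()
    resp, err := kv.Get(ctx, key)
    metrics.RecordEtcdRequest("get", resource, err, startTime)
    return resp, err
}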


@@ -32,18 +32,15 @@ import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/conversion"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/audit"
"k8s.io/apiserver/pkg/features"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
"k8s.io/apiserver/pkg/storage/value"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-base/tracing"
"k8s.io/klog/v2"
)
@@ -80,10 +77,15 @@ type store struct {
groupResource schema.GroupResource
groupResourceString string
watcher *watcher
pagingEnabled bool
leaseManager *leaseManager
}
func (s *store) RequestWatchProgress(ctx context.Context) error {
// Use watchContext to match ctx metadata provided when creating the watch.
// In the best-case scenario we would use the same context that the watch was created with, but there is no way to access it from watchCache.
return s.client.RequestProgress(s.watchContext(ctx))
}
type objState struct {
obj runtime.Object
meta *storage.ResponseMeta
@@ -93,11 +95,11 @@ type objState struct {
}
// New returns an etcd3 implementation of storage.Interface.
func New(c *clientv3.Client, codec runtime.Codec, newFunc func() runtime.Object, prefix string, groupResource schema.GroupResource, transformer value.Transformer, pagingEnabled bool, leaseManagerConfig LeaseManagerConfig) storage.Interface {
return newStore(c, codec, newFunc, prefix, groupResource, transformer, pagingEnabled, leaseManagerConfig)
func New(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) storage.Interface {
return newStore(c, codec, newFunc, newListFunc, prefix, resourcePrefix, groupResource, transformer, leaseManagerConfig)
}
func newStore(c *clientv3.Client, codec runtime.Codec, newFunc func() runtime.Object, prefix string, groupResource schema.GroupResource, transformer value.Transformer, pagingEnabled bool, leaseManagerConfig LeaseManagerConfig) *store {
func newStore(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) *store {
versioner := storage.APIObjectVersioner{}
// for compatibility with etcd2 impl.
// no-op for default prefix of '/registry'.
@@ -107,19 +109,36 @@ func newStore(c *clientv3.Client, codec runtime.Codec, newFunc func() runtime.Ob
// Ensure the pathPrefix ends in "/" here to simplify key concatenation later.
pathPrefix += "/"
}
result := &store{
w := &watcher{
client: c,
codec: codec,
newFunc: newFunc,
groupResource: groupResource,
versioner: versioner,
transformer: transformer,
}
if newFunc == nil {
w.objectType = "<unknown>"
} else {
w.objectType = reflect.TypeOf(newFunc()).String()
}
s := &store{
client: c,
codec: codec,
versioner: versioner,
transformer: transformer,
pagingEnabled: pagingEnabled,
pathPrefix: pathPrefix,
groupResource: groupResource,
groupResourceString: groupResource.String(),
watcher: newWatcher(c, codec, groupResource, newFunc, versioner),
watcher: w,
leaseManager: newDefaultLeaseManager(c, leaseManagerConfig),
}
return result
w.getCurrentStorageRV = func(ctx context.Context) (uint64, error) {
return storage.GetCurrentResourceVersionFromStorage(ctx, s, newListFunc, resourcePrefix, w.objectType)
}
return s
}
// Versioner implements storage.Interface.Versioner.
@@ -135,7 +154,7 @@ func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, ou
}
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, preparedKey)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil {
return err
}
@@ -156,7 +175,12 @@ func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, ou
return storage.NewInternalError(err.Error())
}
return decode(s.codec, s.versioner, data, out, kv.ModRevision)
err = decode(s.codec, s.versioner, data, out, kv.ModRevision)
if err != nil {
recordDecodeError(s.groupResourceString, preparedKey)
return err
}
return nil
}
// Create implements storage.Interface.Create.
@@ -173,7 +197,7 @@ func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object,
)
defer span.End(500 * time.Millisecond)
if version, err := s.versioner.ObjectResourceVersion(obj); err == nil && version != 0 {
return errors.New("resourceVersion should not be set on objects to be created")
return storage.ErrResourceVersionSetOnCreate
}
if err := s.versioner.PrepareObjectForStorage(obj); err != nil {
return fmt.Errorf("PrepareObjectForStorage failed: %v", err)
@@ -204,7 +228,7 @@ func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object,
).Then(
clientv3.OpPut(preparedKey, string(newData), opts...),
).Commit()
metrics.RecordEtcdRequestLatency("create", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("create", s.groupResourceString, err, startTime)
if err != nil {
span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
return err
@@ -220,6 +244,7 @@ func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object,
err = decode(s.codec, s.versioner, data, out, putResp.Header.Revision)
if err != nil {
span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
recordDecodeError(s.groupResourceString, preparedKey)
return err
}
span.AddEvent("decode succeeded", attribute.Int("len", len(data)))
@@ -245,15 +270,7 @@ func (s *store) Delete(
func (s *store) conditionalDelete(
ctx context.Context, key string, out runtime.Object, v reflect.Value, preconditions *storage.Preconditions,
validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error {
getCurrentState := func() (*objState, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
if err != nil {
return nil, err
}
return s.getState(ctx, getResp, key, v, false)
}
getCurrentState := s.getCurrentState(ctx, key, v, false)
var origState *objState
var err error
@@ -330,7 +347,7 @@ func (s *store) conditionalDelete(
).Else(
clientv3.OpGet(key),
).Commit()
metrics.RecordEtcdRequestLatency("delete", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("delete", s.groupResourceString, err, startTime)
if err != nil {
return err
}
@@ -352,7 +369,12 @@ func (s *store) conditionalDelete(
if deleteResp.Header == nil {
return errors.New("invalid DeleteRange response - nil header")
}
return decode(s.codec, s.versioner, origState.data, out, deleteResp.Header.Revision)
err = decode(s.codec, s.versioner, origState.data, out, deleteResp.Header.Revision)
if err != nil {
recordDecodeError(s.groupResourceString, key)
return err
}
return nil
}
}
@@ -376,15 +398,7 @@ func (s *store) GuaranteedUpdate(
return fmt.Errorf("unable to convert output object to pointer: %v", err)
}
getCurrentState := func() (*objState, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, preparedKey)
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
if err != nil {
return nil, err
}
return s.getState(ctx, getResp, preparedKey, v, ignoreNotFound)
}
getCurrentState := s.getCurrentState(ctx, preparedKey, v, ignoreNotFound)
var origState *objState
var origStateIsCurrent bool
@@ -470,7 +484,12 @@ func (s *store) GuaranteedUpdate(
}
// recheck that the data from etcd is not stale before short-circuiting a write
if !origState.stale {
return decode(s.codec, s.versioner, origState.data, destination, origState.rev)
err = decode(s.codec, s.versioner, origState.data, destination, origState.rev)
if err != nil {
recordDecodeError(s.groupResourceString, preparedKey)
return err
}
return nil
}
}
@@ -495,7 +514,7 @@ func (s *store) GuaranteedUpdate(
).Else(
clientv3.OpGet(preparedKey),
).Commit()
metrics.RecordEtcdRequestLatency("update", s.groupResourceString, startTime)
metrics.RecordEtcdRequest("update", s.groupResourceString, err, startTime)
if err != nil {
span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
return err
@@ -518,6 +537,7 @@ func (s *store) GuaranteedUpdate(
err = decode(s.codec, s.versioner, data, destination, putResp.Header.Revision)
if err != nil {
span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
recordDecodeError(s.groupResourceString, preparedKey)
return err
}
span.AddEvent("decode succeeded", attribute.Int("len", len(data)))
@@ -557,7 +577,7 @@ func (s *store) Count(key string) (int64, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly())
metrics.RecordEtcdRequestLatency("listWithCount", preparedKey, startTime)
metrics.RecordEtcdRequest("listWithCount", preparedKey, err, startTime)
if err != nil {
return 0, err
}
@@ -570,17 +590,13 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
if err != nil {
return err
}
recursive := opts.Recursive
resourceVersion := opts.ResourceVersion
match := opts.ResourceVersionMatch
pred := opts.Predicate
ctx, span := tracing.Start(ctx, fmt.Sprintf("List(recursive=%v) etcd3", recursive),
ctx, span := tracing.Start(ctx, fmt.Sprintf("List(recursive=%v) etcd3", opts.Recursive),
attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
attribute.String("key", key),
attribute.String("resourceVersion", resourceVersion),
attribute.String("resourceVersionMatch", string(match)),
attribute.Int("limit", int(pred.Limit)),
attribute.String("continue", pred.Continue))
attribute.String("resourceVersion", opts.ResourceVersion),
attribute.String("resourceVersionMatch", string(opts.ResourceVersionMatch)),
attribute.Int("limit", int(opts.Predicate.Limit)),
attribute.String("continue", opts.Predicate.Continue))
defer span.End(500 * time.Millisecond)
listPtr, err := meta.GetItemsPtr(listObj)
if err != nil {
@@ -595,97 +611,68 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
// get children "directories". e.g. if we have key "/a", "/a/b", "/ab", getting keys
// with prefix "/a" will return all three, while with prefix "/a/" will return only
// "/a/b" which is the correct answer.
if recursive && !strings.HasSuffix(preparedKey, "/") {
if opts.Recursive && !strings.HasSuffix(preparedKey, "/") {
preparedKey += "/"
}
keyPrefix := preparedKey
// set the appropriate clientv3 options to filter the returned data set
var limitOption *clientv3.OpOption
limit := pred.Limit
limit := opts.Predicate.Limit
var paging bool
options := make([]clientv3.OpOption, 0, 4)
if s.pagingEnabled && pred.Limit > 0 {
if opts.Predicate.Limit > 0 {
paging = true
options = append(options, clientv3.WithLimit(limit))
limitOption = &options[len(options)-1]
}
newItemFunc := getNewItemFunc(listObj, v)
var fromRV *uint64
if len(resourceVersion) > 0 {
parsedRV, err := s.versioner.ParseResourceVersion(resourceVersion)
if err != nil {
return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
}
fromRV = &parsedRV
if opts.Recursive {
rangeEnd := clientv3.GetPrefixRangeEnd(keyPrefix)
options = append(options, clientv3.WithRange(rangeEnd))
}
var returnedRV, continueRV, withRev int64
newItemFunc := getNewItemFunc(listObj, v)
var continueRV, withRev int64
var continueKey string
switch {
case recursive && s.pagingEnabled && len(pred.Continue) > 0:
continueKey, continueRV, err = storage.DecodeContinue(pred.Continue, keyPrefix)
case opts.Recursive && len(opts.Predicate.Continue) > 0:
continueKey, continueRV, err = storage.DecodeContinue(opts.Predicate.Continue, keyPrefix)
if err != nil {
return apierrors.NewBadRequest(fmt.Sprintf("invalid continue token: %v", err))
}
if len(resourceVersion) > 0 && resourceVersion != "0" {
if len(opts.ResourceVersion) > 0 && opts.ResourceVersion != "0" {
return apierrors.NewBadRequest("specifying resource version is not allowed when using continue")
}
rangeEnd := clientv3.GetPrefixRangeEnd(keyPrefix)
options = append(options, clientv3.WithRange(rangeEnd))
preparedKey = continueKey
// If continueRV > 0, the LIST request needs a specific resource version.
// continueRV==0 is invalid.
// If continueRV < 0, the request is for the latest resource version.
if continueRV > 0 {
withRev = continueRV
returnedRV = continueRV
}
case recursive && s.pagingEnabled && pred.Limit > 0:
if fromRV != nil {
switch match {
case metav1.ResourceVersionMatchNotOlderThan:
// The not older than constraint is checked after we get a response from etcd,
// and returnedRV is then set to the revision we get from the etcd response.
case metav1.ResourceVersionMatchExact:
returnedRV = int64(*fromRV)
withRev = returnedRV
case "": // legacy case
if *fromRV > 0 {
returnedRV = int64(*fromRV)
withRev = returnedRV
}
default:
return fmt.Errorf("unknown ResourceVersionMatch value: %v", match)
case len(opts.ResourceVersion) > 0:
parsedRV, err := s.versioner.ParseResourceVersion(opts.ResourceVersion)
if err != nil {
return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
}
switch opts.ResourceVersionMatch {
case metav1.ResourceVersionMatchNotOlderThan:
// The not older than constraint is checked after we get a response from etcd,
// and returnedRV is then set to the revision we get from the etcd response.
case metav1.ResourceVersionMatchExact:
withRev = int64(parsedRV)
case "": // legacy case
if opts.Recursive && opts.Predicate.Limit > 0 && parsedRV > 0 {
withRev = int64(parsedRV)
}
}
rangeEnd := clientv3.GetPrefixRangeEnd(keyPrefix)
options = append(options, clientv3.WithRange(rangeEnd))
default:
if fromRV != nil {
switch match {
case metav1.ResourceVersionMatchNotOlderThan:
// The not older than constraint is checked after we get a response from etcd,
// and returnedRV is then set to the revision we get from the etcd response.
case metav1.ResourceVersionMatchExact:
returnedRV = int64(*fromRV)
withRev = returnedRV
case "": // legacy case
default:
return fmt.Errorf("unknown ResourceVersionMatch value: %v", match)
}
}
if recursive {
options = append(options, clientv3.WithPrefix())
default:
return fmt.Errorf("unknown ResourceVersionMatch value: %v", opts.ResourceVersionMatch)
}
}
if withRev != 0 {
options = append(options, clientv3.WithRev(withRev))
}
@@ -702,19 +689,21 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
numReturn := v.Len()
metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn)
}()
metricsOp := "get"
if opts.Recursive {
metricsOp = "list"
}
for {
startTime := time.Now()
getResp, err = s.client.KV.Get(ctx, preparedKey, options...)
if recursive {
metrics.RecordEtcdRequestLatency("list", s.groupResourceString, startTime)
} else {
metrics.RecordEtcdRequestLatency("get", s.groupResourceString, startTime)
}
metrics.RecordEtcdRequest(metricsOp, s.groupResourceString, err, startTime)
if err != nil {
return interpretListError(err, len(pred.Continue) > 0, continueKey, keyPrefix)
return interpretListError(err, len(opts.Predicate.Continue) > 0, continueKey, keyPrefix)
}
numFetched += len(getResp.Kvs)
if err = s.validateMinimumResourceVersion(resourceVersion, uint64(getResp.Header.Revision)); err != nil {
if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil {
return err
}
hasMore = getResp.More
@@ -722,10 +711,15 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
if len(getResp.Kvs) == 0 && getResp.More {
return fmt.Errorf("no results were found, but etcd indicated there were more values remaining")
}
// indicate to the client which resource version was returned, and use the same resource version for subsequent requests.
if withRev == 0 {
withRev = getResp.Header.Revision
options = append(options, clientv3.WithRev(withRev))
}
// avoid small allocations for the result slice, since this can be called in many
// different contexts and we don't know how significantly the result will be filtered
if pred.Empty() {
if opts.Predicate.Empty() {
growSlice(v, len(getResp.Kvs))
} else {
growSlice(v, 2048, len(getResp.Kvs))
@@ -733,7 +727,7 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
// take items from the response until the bucket is full, filtering as we go
for i, kv := range getResp.Kvs {
if paging && int64(v.Len()) >= pred.Limit {
if paging && int64(v.Len()) >= opts.Predicate.Limit {
hasMore = true
break
}
@@ -744,7 +738,8 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
return storage.NewInternalErrorf("unable to transform key %q: %v", kv.Key, err)
}
if err := appendListItem(v, data, uint64(kv.ModRevision), pred, s.codec, s.versioner, newItemFunc); err != nil {
if err := appendListItem(v, data, uint64(kv.ModRevision), opts.Predicate, s.codec, s.versioner, newItemFunc); err != nil {
recordDecodeError(s.groupResourceString, string(kv.Key))
return err
}
numEvald++
@@ -753,17 +748,12 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
getResp.Kvs[i] = nil
}
// indicate to the client which resource version was returned
if returnedRV == 0 {
returnedRV = getResp.Header.Revision
}
// no more results remain or we didn't request paging
if !hasMore || !paging {
break
}
// we're paging but we have filled our bucket
if int64(v.Len()) >= pred.Limit {
if int64(v.Len()) >= opts.Predicate.Limit {
break
}
@@ -777,17 +767,18 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
*limitOption = clientv3.WithLimit(limit)
}
preparedKey = string(lastKey) + "\x00"
if withRev == 0 {
withRev = returnedRV
options = append(options, clientv3.WithRev(withRev))
}
}
if v.IsNil() {
// Ensure that we never return a nil Items pointer in the result for consistency.
v.Set(reflect.MakeSlice(v.Type(), 0, 0))
}
// instruct the client to begin querying from immediately after the last key we returned
// we never return a key that the client wouldn't be allowed to see
if hasMore {
// we want to start immediately after the last key
next, err := storage.EncodeContinue(string(lastKey)+"\x00", keyPrefix, returnedRV)
next, err := storage.EncodeContinue(string(lastKey)+"\x00", keyPrefix, withRev)
if err != nil {
return err
}
@@ -795,17 +786,15 @@ func (s *store) GetList(ctx context.Context, key string, opts storage.ListOption
// getResp.Count includes objects that do not match pred.
// Instead of returning an inaccurate count for non-empty selectors, we return nil.
// Only set remainingItemCount if the predicate is empty.
if utilfeature.DefaultFeatureGate.Enabled(features.RemainingItemCount) {
if pred.Empty() {
c := int64(getResp.Count - pred.Limit)
remainingItemCount = &c
}
if opts.Predicate.Empty() {
c := int64(getResp.Count - opts.Predicate.Limit)
remainingItemCount = &c
}
return s.versioner.UpdateList(listObj, uint64(returnedRV), next, remainingItemCount)
return s.versioner.UpdateList(listObj, uint64(withRev), next, remainingItemCount)
}
// no continuation
return s.versioner.UpdateList(listObj, uint64(returnedRV), "", nil)
return s.versioner.UpdateList(listObj, uint64(withRev), "", nil)
}
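// An illustrative sketch, not part of this change: the continue-token
// round-trip that GetList relies on above. storage.EncodeContinue packs the
// next start key and the snapshot revision into an opaque token, and
// storage.DecodeContinue recovers them on the follow-up request. The key
// names and revision below are hypothetical.
func exampleContinueTokenRoundTrip() error {
	keyPrefix := "/registry/pods/"
	lastKey := "/registry/pods/default/pod-42"
	withRev := int64(1234)
	// start the next page immediately after the last returned key
	token, err := storage.EncodeContinue(lastKey+"\x00", keyPrefix, withRev)
	if err != nil {
		return err
	}
	// the follow-up request recovers the start key and the pinned revision
	startKey, rv, err := storage.DecodeContinue(token, keyPrefix)
	if err != nil {
		return err
	}
	_, _ = startKey, rv // "/registry/pods/default/pod-42\x00", 1234
	return nil
}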
// growSlice takes a slice value and grows its capacity up
@@ -849,7 +838,30 @@ func (s *store) Watch(ctx context.Context, key string, opts storage.ListOptions)
if err != nil {
return nil, err
}
return s.watcher.Watch(ctx, preparedKey, int64(rev), opts.Recursive, opts.ProgressNotify, s.transformer, opts.Predicate)
return s.watcher.Watch(s.watchContext(ctx), preparedKey, int64(rev), opts)
}
func (s *store) watchContext(ctx context.Context) context.Context {
// The etcd server waits until it cannot find a leader for 3 election
// timeouts to cancel existing streams. 3 is currently a hard-coded
// constant. The election timeout defaults to 1000ms. If the cluster is
// healthy, the leadership transfer should be smooth when the leader is
// stopped (the leader hands off its leadership before stopping). If the
// leader is hard killed, the other servers will take an election timeout
// to realize the leader is lost and start a campaign.
return clientv3.WithRequireLeader(ctx)
}
func (s *store) getCurrentState(ctx context.Context, key string, v reflect.Value, ignoreNotFound bool) func() (*objState, error) {
return func() (*objState, error) {
startTime := time.Now()
getResp, err := s.client.KV.Get(ctx, key)
metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
if err != nil {
return nil, err
}
return s.getState(ctx, getResp, key, v, ignoreNotFound)
}
}
func (s *store) getState(ctx context.Context, getResp *clientv3.GetResponse, key string, v reflect.Value, ignoreNotFound bool) (*objState, error) {
@@ -880,6 +892,7 @@ func (s *store) getState(ctx context.Context, getResp *clientv3.GetResponse, key
state.data = data
state.stale = stale
if err := decode(s.codec, s.versioner, state.data, state.obj, state.rev); err != nil {
recordDecodeError(s.groupResourceString, key)
return nil, err
}
}
@@ -1018,6 +1031,12 @@ func appendListItem(v reflect.Value, data []byte, rev uint64, pred storage.Selec
return nil
}
// recordDecodeError records a decode error, labeled by resource type.
func recordDecodeError(resource string, key string) {
metrics.RecordDecodeError(resource)
klog.V(4).Infof("Decoding %s \"%s\" failed", resource, key)
}
func notFound(key string) clientv3.Cmp {
return clientv3.Compare(clientv3.ModRevision(key), "=", 0)
}

View File

@@ -18,23 +18,29 @@ package etcd3
import (
"context"
"errors"
"fmt"
"os"
"reflect"
"strconv"
"strings"
"sync"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
grpccodes "google.golang.org/grpc/codes"
grpcstatus "google.golang.org/grpc/status"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/features"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
"k8s.io/apiserver/pkg/storage/value"
utilfeature "k8s.io/apiserver/pkg/util/feature"
utilflowcontrol "k8s.io/apiserver/pkg/util/flowcontrol"
clientv3 "go.etcd.io/etcd/client/v3"
"k8s.io/klog/v2"
)
@@ -44,6 +50,9 @@ const (
outgoingBufSize = 100
)
// defaultWatcherMaxLimit is used to facilitate construction tests
var defaultWatcherMaxLimit int64 = maxLimit
// fatalOnDecodeError is used during testing to panic the server if watcher encounters a decoding error
var fatalOnDecodeError = false
@@ -59,18 +68,19 @@ func TestOnlySetFatalOnDecodeError(b bool) {
}
type watcher struct {
client *clientv3.Client
codec runtime.Codec
newFunc func() runtime.Object
objectType string
groupResource schema.GroupResource
versioner storage.Versioner
client *clientv3.Client
codec runtime.Codec
newFunc func() runtime.Object
objectType string
groupResource schema.GroupResource
versioner storage.Versioner
transformer value.Transformer
getCurrentStorageRV func(context.Context) (uint64, error)
}
// watchChan implements watch.Interface.
type watchChan struct {
watcher *watcher
transformer value.Transformer
key string
initialRev int64
recursive bool
@@ -83,35 +93,26 @@ type watchChan struct {
errChan chan error
}
func newWatcher(client *clientv3.Client, codec runtime.Codec, groupResource schema.GroupResource, newFunc func() runtime.Object, versioner storage.Versioner) *watcher {
res := &watcher{
client: client,
codec: codec,
groupResource: groupResource,
newFunc: newFunc,
versioner: versioner,
}
if newFunc == nil {
res.objectType = "<unknown>"
} else {
res.objectType = reflect.TypeOf(newFunc()).String()
}
return res
}
// Watch watches on a key and returns a watch.Interface that transfers relevant notifications.
// If rev is zero, it will return the existing object(s) and then start watching from
// the maximum revision+1 of the returned objects.
// If rev is non-zero, it will watch events that happened after the given revision.
// If recursive is false, it watches on given key.
// If recursive is true, it watches any children and directories under the key, excluding the root key itself.
// pred must be non-nil. Only if pred matches the change, it will be returned.
func (w *watcher) Watch(ctx context.Context, key string, rev int64, recursive, progressNotify bool, transformer value.Transformer, pred storage.SelectionPredicate) (watch.Interface, error) {
if recursive && !strings.HasSuffix(key, "/") {
// If opts.Recursive is false, it watches on given key.
// If opts.Recursive is true, it watches any children and directories under the key, excluding the root key itself.
// opts.Predicate must be non-nil. Only changes matching opts.Predicate will be returned.
func (w *watcher) Watch(ctx context.Context, key string, rev int64, opts storage.ListOptions) (watch.Interface, error) {
if opts.Recursive && !strings.HasSuffix(key, "/") {
key += "/"
}
wc := w.createWatchChan(ctx, key, rev, recursive, progressNotify, transformer, pred)
go wc.run()
if opts.ProgressNotify && w.newFunc == nil {
return nil, apierrors.NewInternalError(errors.New("progressNotify for watch is unsupported by the etcd storage because no newFunc was provided"))
}
startWatchRV, err := w.getStartWatchResourceVersion(ctx, rev, opts)
if err != nil {
return nil, err
}
wc := w.createWatchChan(ctx, key, startWatchRV, opts.Recursive, opts.ProgressNotify, opts.Predicate)
go wc.run(isInitialEventsEndBookmarkRequired(opts), areInitialEventsRequired(rev, opts))
// For etcd watch we don't have an easy way to answer whether the watch
// has already caught up. So in the initial version (given that watchcache
@@ -123,10 +124,9 @@ func (w *watcher) Watch(ctx context.Context, key string, rev int64, recursive, p
return wc, nil
}
func (w *watcher) createWatchChan(ctx context.Context, key string, rev int64, recursive, progressNotify bool, transformer value.Transformer, pred storage.SelectionPredicate) *watchChan {
func (w *watcher) createWatchChan(ctx context.Context, key string, rev int64, recursive, progressNotify bool, pred storage.SelectionPredicate) *watchChan {
wc := &watchChan{
watcher: w,
transformer: transformer,
key: key,
initialRev: rev,
recursive: recursive,
@@ -140,21 +140,94 @@ func (w *watcher) createWatchChan(ctx context.Context, key string, rev int64, re
// The filter doesn't filter out any object.
wc.internalPred = storage.Everything
}
// The etcd server waits until it cannot find a leader for 3 election
// timeouts to cancel existing streams. 3 is currently a hard coded
// constant. The election timeout defaults to 1000ms. If the cluster is
// healthy, when the leader is stopped, the leadership transfer should be
// smooth. (leader transfers its leadership before stopping). If leader is
// hard killed, other servers will take an election timeout to realize
// leader lost and start campaign.
wc.ctx, wc.cancel = context.WithCancel(clientv3.WithRequireLeader(ctx))
wc.ctx, wc.cancel = context.WithCancel(ctx)
return wc
}
func (wc *watchChan) run() {
// getStartWatchResourceVersion returns a ResourceVersion
// the watch will be started from.
// Depending on the input parameters the semantics of the returned ResourceVersion are:
// - start at Exact (return resourceVersion)
// - start at Most Recent (return an RV from etcd)
func (w *watcher) getStartWatchResourceVersion(ctx context.Context, resourceVersion int64, opts storage.ListOptions) (int64, error) {
if resourceVersion > 0 {
return resourceVersion, nil
}
if !utilfeature.DefaultFeatureGate.Enabled(features.WatchList) {
return 0, nil
}
if opts.SendInitialEvents == nil || *opts.SendInitialEvents {
// note that when opts.SendInitialEvents=true
// we will be issuing a consistent LIST request
// against etcd followed by the special bookmark event
return 0, nil
}
// at this point the client is interested
// only in getting a stream of events
// starting at the MostRecent point in time (RV)
currentStorageRV, err := w.getCurrentStorageRV(ctx)
if err != nil {
return 0, err
}
// currentStorageRV is taken from resp.Header.Revision (int64)
// and cast to uint64, so the reverse cast is safe.
// At some point we should unify the interfaces, but that
// would require changing Versioner.UpdateList.
return int64(currentStorageRV), nil
}
// isInitialEventsEndBookmarkRequired reports whether a bookmark marking the end of
// the initial events must be sent. Since there is no way to directly set
// opts.ProgressNotify from the API, and the etcd3 implementation doesn't support
// progress notifications for external clients, we only send the bookmark event
// right after the initial list call.
//
// see: https://github.com/kubernetes/kubernetes/issues/120348
func isInitialEventsEndBookmarkRequired(opts storage.ListOptions) bool {
if !utilfeature.DefaultFeatureGate.Enabled(features.WatchList) {
return false
}
return opts.SendInitialEvents != nil && *opts.SendInitialEvents && opts.Predicate.AllowWatchBookmarks
}
// areInitialEventsRequired returns true if all initial events from etcd should be returned.
func areInitialEventsRequired(resourceVersion int64, opts storage.ListOptions) bool {
if opts.SendInitialEvents == nil && resourceVersion == 0 {
return true // legacy case
}
if !utilfeature.DefaultFeatureGate.Enabled(features.WatchList) {
return false
}
return opts.SendInitialEvents != nil && *opts.SendInitialEvents
}
type etcdError interface {
Code() grpccodes.Code
Error() string
}
type grpcError interface {
GRPCStatus() *grpcstatus.Status
}
func isCancelError(err error) bool {
if err == nil {
return false
}
if err == context.Canceled {
return true
}
if etcdErr, ok := err.(etcdError); ok && etcdErr.Code() == grpccodes.Canceled {
return true
}
if grpcErr, ok := err.(grpcError); ok && grpcErr.GRPCStatus().Code() == grpccodes.Canceled {
return true
}
return false
}
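// A quick illustration, hypothetical and not part of this change, of the
// error shapes isCancelError accepts: a bare context.Canceled and a gRPC
// status error carrying codes.Canceled both count as cancellation.
func exampleIsCancelError() {
	_ = isCancelError(nil)                                              // false
	_ = isCancelError(context.Canceled)                                 // true
	_ = isCancelError(grpcstatus.Error(grpccodes.Canceled, "canceled")) // true
	_ = isCancelError(errors.New("boom"))                               // false
}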
func (wc *watchChan) run(initialEventsEndBookmarkRequired, forceInitialEvents bool) {
watchClosedCh := make(chan struct{})
go wc.startWatching(watchClosedCh)
go wc.startWatching(watchClosedCh, initialEventsEndBookmarkRequired, forceInitialEvents)
var resultChanWG sync.WaitGroup
resultChanWG.Add(1)
@@ -162,7 +235,7 @@ func (wc *watchChan) run() {
select {
case err := <-wc.errChan:
if err == context.Canceled {
if isCancelError(err) {
break
}
errResult := transformErrorToEvent(err)
@@ -194,45 +267,123 @@ func (wc *watchChan) ResultChan() <-chan watch.Event {
return wc.resultChan
}
func (wc *watchChan) RequestWatchProgress() error {
return wc.watcher.client.RequestProgress(wc.ctx)
}
// sync tries to retrieve the existing data and send it to the event processing loop.
// The revision to watch will be set to the revision in response.
// All events sent will have isCreated=true
func (wc *watchChan) sync() error {
opts := []clientv3.OpOption{}
if wc.recursive {
opts = append(opts, clientv3.WithPrefix())
opts = append(opts, clientv3.WithLimit(defaultWatcherMaxLimit))
rangeEnd := clientv3.GetPrefixRangeEnd(wc.key)
opts = append(opts, clientv3.WithRange(rangeEnd))
}
getResp, err := wc.watcher.client.Get(wc.ctx, wc.key, opts...)
if err != nil {
return err
var err error
var lastKey []byte
var withRev int64
var getResp *clientv3.GetResponse
metricsOp := "get"
if wc.recursive {
metricsOp = "list"
}
wc.initialRev = getResp.Header.Revision
for _, kv := range getResp.Kvs {
wc.sendEvent(parseKV(kv))
preparedKey := wc.key
for {
startTime := time.Now()
getResp, err = wc.watcher.client.KV.Get(wc.ctx, preparedKey, opts...)
metrics.RecordEtcdRequest(metricsOp, wc.watcher.groupResource.String(), err, startTime)
if err != nil {
return interpretListError(err, true, preparedKey, wc.key)
}
if len(getResp.Kvs) == 0 && getResp.More {
return fmt.Errorf("no results were found, but etcd indicated there were more values remaining")
}
// send items from the response until no more results
for i, kv := range getResp.Kvs {
lastKey = kv.Key
wc.sendEvent(parseKV(kv))
// free kv early. Long lists can take O(seconds) to decode.
getResp.Kvs[i] = nil
}
if withRev == 0 {
wc.initialRev = getResp.Header.Revision
}
// no more results remain
if !getResp.More {
return nil
}
preparedKey = string(lastKey) + "\x00"
if withRev == 0 {
withRev = getResp.Header.Revision
opts = append(opts, clientv3.WithRev(withRev))
}
}
return nil
}
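// A minimal sketch, assuming a connected clientv3.KV, of the paging
// invariant sync uses above: after the first page, every subsequent Get is
// pinned to the first response's revision so all pages form one consistent
// snapshot, and each page starts at lastKey+"\x00", the smallest key
// strictly greater than lastKey. This helper is illustrative only.
func examplePagedSnapshot(ctx context.Context, kv clientv3.KV, prefix string, pageSize int64) (int, error) {
	opts := []clientv3.OpOption{
		clientv3.WithRange(clientv3.GetPrefixRangeEnd(prefix)),
		clientv3.WithLimit(pageSize),
	}
	key := prefix
	var rev int64
	count := 0
	for {
		resp, err := kv.Get(ctx, key, opts...)
		if err != nil {
			return 0, err
		}
		count += len(resp.Kvs)
		if rev == 0 {
			rev = resp.Header.Revision
			opts = append(opts, clientv3.WithRev(rev)) // pin the snapshot revision
		}
		if !resp.More || len(resp.Kvs) == 0 {
			return count, nil
		}
		key = string(resp.Kvs[len(resp.Kvs)-1].Key) + "\x00"
	}
}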
// logWatchChannelErr logs a watch channel error at the appropriate level: mvcc
// revision compaction is regarded as a warning, cancellations are expected and
// not logged, and everything else is an error.
func logWatchChannelErr(err error) {
if !strings.Contains(err.Error(), "mvcc: required revision has been compacted") {
klog.Errorf("watch chan error: %v", err)
} else {
switch {
case strings.Contains(err.Error(), "mvcc: required revision has been compacted"):
// mvcc revision compaction which is regarded as warning, not error
klog.Warningf("watch chan error: %v", err)
case isCancelError(err):
// expected when watches close, no need to log
default:
klog.Errorf("watch chan error: %v", err)
}
}
// startWatching does:
// - get current objects if initialRev=0; set initialRev to current rev
// - watch on given key and send events to process.
func (wc *watchChan) startWatching(watchClosedCh chan struct{}) {
if wc.initialRev == 0 {
//
// initialEventsEndBookmarkSent helps us keep track
// of whether we have sent an annotated bookmark event.
//
// it's important to note that we don't
// need to track the actual RV because
// we only send the bookmark event
// after the initial list call.
//
// when this variable is set to false,
// it means we don't have any specific
// preferences for delivering bookmark events.
func (wc *watchChan) startWatching(watchClosedCh chan struct{}, initialEventsEndBookmarkRequired, forceInitialEvents bool) {
if wc.initialRev > 0 && forceInitialEvents {
currentStorageRV, err := wc.watcher.getCurrentStorageRV(wc.ctx)
if err != nil {
wc.sendError(err)
return
}
if uint64(wc.initialRev) > currentStorageRV {
wc.sendError(storage.NewTooLargeResourceVersionError(uint64(wc.initialRev), currentStorageRV, int(wait.Jitter(1*time.Second, 3).Seconds())))
return
}
}
if forceInitialEvents {
if err := wc.sync(); err != nil {
klog.Errorf("failed to sync with latest state: %v", err)
wc.sendError(err)
return
}
}
if initialEventsEndBookmarkRequired {
wc.sendEvent(func() *event {
e := progressNotifyEvent(wc.initialRev)
e.isInitialEventsEndBookmark = true
return e
}())
}
opts := []clientv3.OpOption{clientv3.WithRev(wc.initialRev + 1), clientv3.WithPrevKV()}
if wc.recursive {
opts = append(opts, clientv3.WithPrefix())
@@ -256,6 +407,7 @@ func (wc *watchChan) startWatching(watchClosedCh chan struct{}) {
}
for _, e := range wres.Events {
metrics.RecordEtcdEvent(wc.watcher.groupResource.String())
parsedEvent, err := parseEvent(e)
if err != nil {
logWatchChannelErr(err)
@@ -323,14 +475,17 @@ func (wc *watchChan) transform(e *event) (res *watch.Event) {
switch {
case e.isProgressNotify:
if wc.watcher.newFunc == nil {
return nil
}
object := wc.watcher.newFunc()
if err := wc.watcher.versioner.UpdateObject(object, uint64(e.rev)); err != nil {
klog.Errorf("failed to propagate object version: %v", err)
return nil
}
if e.isInitialEventsEndBookmark {
if err := storage.AnnotateInitialEventsEndBookmark(object); err != nil {
wc.sendError(fmt.Errorf("error while accessing object's metadata gr: %v, type: %v, obj: %#v, err: %v", wc.watcher.groupResource, wc.watcher.objectType, object, err))
return nil
}
}
res = &watch.Event{
Type: watch.Bookmark,
Object: object,
@@ -418,7 +573,7 @@ func (wc *watchChan) prepareObjs(e *event) (curObj runtime.Object, oldObj runtim
}
if !e.isDeleted {
data, _, err := wc.transformer.TransformFromStorage(wc.ctx, e.value, authenticatedDataString(e.key))
data, _, err := wc.watcher.transformer.TransformFromStorage(wc.ctx, e.value, authenticatedDataString(e.key))
if err != nil {
return nil, nil, err
}
@@ -433,7 +588,7 @@ func (wc *watchChan) prepareObjs(e *event) (curObj runtime.Object, oldObj runtim
// we need the object only to compute whether it was filtered out
// before).
if len(e.prevValue) > 0 && (e.isDeleted || !wc.acceptAll()) {
data, _, err := wc.transformer.TransformFromStorage(wc.ctx, e.prevValue, authenticatedDataString(e.key))
data, _, err := wc.watcher.transformer.TransformFromStorage(wc.ctx, e.prevValue, authenticatedDataString(e.key))
if err != nil {
return nil, nil, err
}

View File

@@ -236,6 +236,21 @@ type Interface interface {
// Count returns number of different entries under the key (generally being path prefix).
Count(key string) (int64, error)
// RequestWatchProgress requests that a watch stream progress status be sent in the
// watch response stream as soon as possible.
// Used to monitor watch progress even when the watched resources see no changes.
//
// If the watch is lagging, the progress status might:
// * point to a stale resource version; use an etcd KV request to get a linearizable resource version.
// * not be delivered at all; it's recommended to request progress periodically.
//
// Note: Only watches with matching context grpc metadata will be notified.
// https://github.com/kubernetes/kubernetes/blob/9325a57125e8502941d1b0c7379c4bb80a678d5c/vendor/go.etcd.io/etcd/client/v3/watch.go#L1037-L1042
//
// TODO: Remove when storage.Interface is separated from etcd3.store.
// Deprecated: Added temporarily to simplify exposing RequestProgress for watch cache.
RequestWatchProgress(ctx context.Context) error
}
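// A caller-side sketch, assuming a "time" import and an Interface
// implementation s: because progress events may be dropped, a consumer
// requests progress periodically rather than once, as the note above
// recommends.
func examplePollWatchProgress(ctx context.Context, s Interface, interval time.Duration) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			// best effort: a failed request is simply retried on the next tick
			_ = s.RequestWatchProgress(ctx)
		}
	}
}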
// GetOptions provides the options that may be provided for storage get operations.
@@ -267,5 +282,24 @@ type ListOptions struct {
Recursive bool
// ProgressNotify determines whether storage-originated bookmark (progress notify) events should
// be delivered to the users. The option is ignored for non-watch requests.
//
// Firstly, note that this field is different from the Predicate.AllowWatchBookmarks field.
// Secondly, this field is intended for internal clients only such as the watch cache.
//
// This means that external clients do not have the ability to set this field directly.
// For example by setting the allowWatchBookmarks query parameter.
//
// The motivation for this approach is the fact that the frequency
// of bookmark events from a storage like etcd might be very high.
// As the number of watch requests increases, the server load would also increase.
//
// Furthermore, the server is not obligated to provide bookmark events at all,
// as described in https://github.com/kubernetes/enhancements/tree/master/keps/sig-api-machinery/956-watch-bookmark#proposal
ProgressNotify bool
// SendInitialEvents, when set together with the Watch option,
// begins the watch stream with synthetic init events that build the
// whole state of all resources, followed by a synthetic "Bookmark"
// event containing a ResourceVersion after which the server
// continues streaming events.
SendInitialEvents *bool
}
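// An illustrative construction, not part of this change, of a "watch-list"
// style request: stream the initial state, then the initial-events-end
// bookmark, then live events. AllowWatchBookmarks must be set on the
// predicate for the bookmark to be delivered.
func exampleWatchListOptions() ListOptions {
	sendInitialEvents := true
	pred := Everything
	pred.AllowWatchBookmarks = true
	return ListOptions{
		ResourceVersion:   "",
		Recursive:         true,
		SendInitialEvents: &sendInitialEvents,
		Predicate:         pred,
	}
}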

View File

@@ -112,6 +112,18 @@ func (s *SelectionPredicate) MatchesObjectAttributes(l labels.Set, f fields.Set)
return matched
}
// MatchesSingleNamespace will return (namespace, true) if and only if s.Field matches on the object's
// namespace.
func (s *SelectionPredicate) MatchesSingleNamespace() (string, bool) {
if len(s.Continue) > 0 {
return "", false
}
if namespace, ok := s.Field.RequiresExactMatch("metadata.namespace"); ok {
return namespace, true
}
return "", false
}
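// A usage sketch with hypothetical values: a field selector that pins
// metadata.namespace exactly, with no continue token, lets a caller scope
// the read to a single namespace.
func exampleMatchesSingleNamespace() {
	pred := SelectionPredicate{
		Label: labels.Everything(),
		Field: fields.OneTermEqualSelector("metadata.namespace", "kube-system"),
	}
	if ns, ok := pred.MatchesSingleNamespace(); ok {
		_ = ns // "kube-system"
	}
}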
// MatchesSingle will return (name, true) if and only if s.Field matches on the object's
// name.
func (s *SelectionPredicate) MatchesSingle() (string, bool) {

View File

@@ -1,6 +1,5 @@
# See the OWNERS docs at https://go.k8s.io/owners
reviewers:
- lavalamp
- smarterclayton
- wojtek-t

View File

@@ -62,11 +62,6 @@ type Config struct {
Prefix string
// Transport holds all connection related info, i.e. equal TransportConfig means equal servers we talk to.
Transport TransportConfig
// Paging indicates whether the server implementation should allow paging (if it is
// supported). This is generally configured by feature gating, or by a specific
// resource type not wishing to allow paging, and is not intended for end users to
// set.
Paging bool
Codec runtime.Codec
// EncodeVersioner is the same groupVersioner used to build the
@@ -115,7 +110,6 @@ func (config *Config) ForResource(resource schema.GroupResource) *ConfigForResou
func NewDefaultConfig(prefix string, codec runtime.Codec) *Config {
return &Config{
Paging: true,
Prefix: prefix,
Codec: codec,
CompactionInterval: DefaultCompactInterval,

View File

@@ -20,6 +20,7 @@ import (
"context"
"fmt"
"log"
"math/rand"
"net"
"net/url"
"os"
@@ -37,6 +38,7 @@ import (
"go.uber.org/zap/zapcore"
"golang.org/x/time/rate"
"google.golang.org/grpc"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/runtime"
utilnet "k8s.io/apimachinery/pkg/util/net"
@@ -52,7 +54,6 @@ import (
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-base/metrics/legacyregistry"
tracing "k8s.io/component-base/tracing"
"k8s.io/klog/v2"
)
const (
@@ -153,18 +154,18 @@ func newETCD3Check(c storagebackend.Config, timeout time.Duration, stopCh <-chan
// retry in a loop in the background until we successfully create the client, storing the client or error encountered
lock := sync.RWMutex{}
var client *clientv3.Client
var prober *etcd3ProberMonitor
clientErr := fmt.Errorf("etcd client connection not yet established")
go wait.PollUntil(time.Second, func() (bool, error) {
newClient, err := newETCD3Client(c.Transport)
go wait.PollImmediateUntil(time.Second, func() (bool, error) {
lock.Lock()
defer lock.Unlock()
newProber, err := newETCD3ProberMonitor(c)
// Ensure that the server is not already shutting down.
select {
case <-stopCh:
if err == nil {
newClient.Close()
newProber.Close()
}
return true, nil
default:
@@ -173,7 +174,7 @@ func newETCD3Check(c storagebackend.Config, timeout time.Duration, stopCh <-chan
clientErr = err
return false, nil
}
client = newClient
prober = newProber
clientErr = nil
return true, nil
}, stopCh)
@@ -185,8 +186,8 @@ func newETCD3Check(c storagebackend.Config, timeout time.Duration, stopCh <-chan
lock.Lock()
defer lock.Unlock()
if client != nil {
client.Close()
if prober != nil {
prober.Close()
clientErr = fmt.Errorf("server is shutting down")
}
}()
@@ -214,17 +215,73 @@ func newETCD3Check(c storagebackend.Config, timeout time.Duration, stopCh <-chan
}
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
// See https://github.com/etcd-io/etcd/blob/c57f8b3af865d1b531b979889c602ba14377420e/etcdctl/ctlv3/command/ep_command.go#L118
now := time.Now()
_, err := client.Get(ctx, path.Join("/", c.Prefix, "health"))
if err != nil {
err = fmt.Errorf("error getting data from etcd: %w", err)
}
err := prober.Probe(ctx)
lastError.Store(err, now)
return err
}, nil
}
func newETCD3ProberMonitor(c storagebackend.Config) (*etcd3ProberMonitor, error) {
client, err := newETCD3Client(c.Transport)
if err != nil {
return nil, err
}
return &etcd3ProberMonitor{
client: client,
prefix: c.Prefix,
endpoints: c.Transport.ServerList,
}, nil
}
type etcd3ProberMonitor struct {
prefix string
endpoints []string
mux sync.RWMutex
client *clientv3.Client
closed bool
}
func (t *etcd3ProberMonitor) Close() error {
t.mux.Lock()
defer t.mux.Unlock()
if !t.closed {
t.closed = true
return t.client.Close()
}
return fmt.Errorf("closed")
}
func (t *etcd3ProberMonitor) Probe(ctx context.Context) error {
t.mux.RLock()
defer t.mux.RUnlock()
if t.closed {
return fmt.Errorf("closed")
}
// See https://github.com/etcd-io/etcd/blob/c57f8b3af865d1b531b979889c602ba14377420e/etcdctl/ctlv3/command/ep_command.go#L118
_, err := t.client.Get(ctx, path.Join("/", t.prefix, "health"))
if err != nil {
return fmt.Errorf("error getting data from etcd: %w", err)
}
return nil
}
func (t *etcd3ProberMonitor) Monitor(ctx context.Context) (metrics.StorageMetrics, error) {
t.mux.RLock()
defer t.mux.RUnlock()
if t.closed {
return metrics.StorageMetrics{}, fmt.Errorf("closed")
}
status, err := t.client.Status(ctx, t.endpoints[rand.Int()%len(t.endpoints)])
if err != nil {
return metrics.StorageMetrics{}, err
}
return metrics.StorageMetrics{
Size: status.DbSize,
}, nil
}
var newETCD3Client = func(c storagebackend.TransportConfig) (*clientv3.Client, error) {
tlsInfo := transport.TLSInfo{
CertFile: c.CertFile,
@@ -362,7 +419,7 @@ func startCompactorOnce(c storagebackend.TransportConfig, interval time.Duration
}, nil
}
func newETCD3Storage(c storagebackend.ConfigForResource, newFunc func() runtime.Object) (storage.Interface, DestroyFunc, error) {
func newETCD3Storage(c storagebackend.ConfigForResource, newFunc, newListFunc func() runtime.Object, resourcePrefix string) (storage.Interface, DestroyFunc, error) {
stopCompactor, err := startCompactorOnce(c.Transport, c.CompactionInterval)
if err != nil {
return nil, nil, err
@@ -397,11 +454,12 @@ func newETCD3Storage(c storagebackend.ConfigForResource, newFunc func() runtime.
if transformer == nil {
transformer = identity.NewEncryptCheckTransformer()
}
return etcd3.New(client, c.Codec, newFunc, c.Prefix, c.GroupResource, transformer, c.Paging, c.LeaseManagerConfig), destroyFunc, nil
return etcd3.New(client, c.Codec, newFunc, newListFunc, c.Prefix, resourcePrefix, c.GroupResource, transformer, c.LeaseManagerConfig), destroyFunc, nil
}
// startDBSizeMonitorPerEndpoint starts a loop to monitor etcd database size and update the
// corresponding metric etcd_db_total_size_in_bytes for each etcd server endpoint.
// Deprecated: Will be replaced with newETCD3ProberMonitor
func startDBSizeMonitorPerEndpoint(client *clientv3.Client, interval time.Duration) (func(), error) {
if interval == 0 {
return func() {}, nil

View File

@@ -17,10 +17,12 @@ limitations under the License.
package factory
import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apiserver/pkg/storage"
"k8s.io/apiserver/pkg/storage/etcd3/metrics"
"k8s.io/apiserver/pkg/storage/storagebackend"
)
@@ -28,12 +30,12 @@ import (
type DestroyFunc func()
// Create creates a storage backend based on given config.
func Create(c storagebackend.ConfigForResource, newFunc func() runtime.Object) (storage.Interface, DestroyFunc, error) {
func Create(c storagebackend.ConfigForResource, newFunc, newListFunc func() runtime.Object, resourcePrefix string) (storage.Interface, DestroyFunc, error) {
switch c.Type {
case storagebackend.StorageTypeETCD2:
return nil, nil, fmt.Errorf("%s is no longer a supported storage backend", c.Type)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3Storage(c, newFunc)
return newETCD3Storage(c, newFunc, newListFunc, resourcePrefix)
default:
return nil, nil, fmt.Errorf("unknown storage type: %s", c.Type)
}
@@ -61,3 +63,31 @@ func CreateReadyCheck(c storagebackend.Config, stopCh <-chan struct{}) (func() e
return nil, fmt.Errorf("unknown storage type: %s", c.Type)
}
}
func CreateProber(c storagebackend.Config) (Prober, error) {
switch c.Type {
case storagebackend.StorageTypeETCD2:
return nil, fmt.Errorf("%s is no longer a supported storage backend", c.Type)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3ProberMonitor(c)
default:
return nil, fmt.Errorf("unknown storage type: %s", c.Type)
}
}
func CreateMonitor(c storagebackend.Config) (metrics.Monitor, error) {
switch c.Type {
case storagebackend.StorageTypeETCD2:
return nil, fmt.Errorf("%s is no longer a supported storage backend", c.Type)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3ProberMonitor(c)
default:
return nil, fmt.Errorf("unknown storage type: %s", c.Type)
}
}
// Prober is an interface that defines the Probe function for doing etcd readiness/liveness checks.
type Prober interface {
Probe(ctx context.Context) error
Close() error
}
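// A hedged usage sketch, assuming a "time" import and a populated
// storagebackend.Config: build the prober once, probe with a bounded
// context on every health check, and close it on shutdown.
func exampleProbeOnce(cfg storagebackend.Config) error {
	prober, err := CreateProber(cfg)
	if err != nil {
		return err
	}
	defer prober.Close()
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	if err := prober.Probe(ctx); err != nil {
		return fmt.Errorf("etcd probe failed: %w", err)
	}
	return nil
}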

View File

@@ -17,14 +17,25 @@ limitations under the License.
package storage
import (
"context"
"fmt"
"strconv"
"sync/atomic"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/validation/path"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
)
const (
// initialEventsAnnotationKey the name of the key
// under which an annotation marking the end of list stream
// is kept.
initialEventsAnnotationKey = "k8s.io/initial-events-end"
)
type SimpleUpdateFunc func(runtime.Object) (runtime.Object, error)
// SimpleUpdateFunc converts SimpleUpdateFunc into UpdateFunc
@@ -79,3 +90,72 @@ func (hwm *HighWaterMark) Update(current int64) bool {
}
}
}
// GetCurrentResourceVersionFromStorage gets the current resource version from the underlying storage engine.
// This method issues an empty list request and reads only the ResourceVersion from the object metadata
func GetCurrentResourceVersionFromStorage(ctx context.Context, storage Interface, newListFunc func() runtime.Object, resourcePrefix, objectType string) (uint64, error) {
if storage == nil {
return 0, fmt.Errorf("storage wasn't provided for %s", objectType)
}
if newListFunc == nil {
return 0, fmt.Errorf("newListFunction wasn't provided for %s", objectType)
}
emptyList := newListFunc()
pred := SelectionPredicate{
Label: labels.Everything(),
Field: fields.Everything(),
Limit: 1, // just in case we actually hit something
}
err := storage.GetList(ctx, resourcePrefix, ListOptions{Predicate: pred}, emptyList)
if err != nil {
return 0, err
}
emptyListAccessor, err := meta.ListAccessor(emptyList)
if err != nil {
return 0, err
}
if emptyListAccessor == nil {
return 0, fmt.Errorf("unable to extract a list accessor from %T", emptyList)
}
currentResourceVersion, err := strconv.Atoi(emptyListAccessor.GetResourceVersion())
if err != nil {
return 0, err
}
if currentResourceVersion == 0 {
return 0, fmt.Errorf("the current resource version must be greater than 0")
}
return uint64(currentResourceVersion), nil
}
// AnnotateInitialEventsEndBookmark adds a special annotation to the given object
// which indicates that the initial events have been sent.
//
// Note that this function assumes that the obj's annotation
// field is a reference type (i.e. a map).
func AnnotateInitialEventsEndBookmark(obj runtime.Object) error {
objMeta, err := meta.Accessor(obj)
if err != nil {
return err
}
objAnnotations := objMeta.GetAnnotations()
if objAnnotations == nil {
objAnnotations = map[string]string{}
}
objAnnotations[initialEventsAnnotationKey] = "true"
objMeta.SetAnnotations(objAnnotations)
return nil
}
// HasInitialEventsEndBookmarkAnnotation checks the presence of the
// special annotation which marks that the initial events have been sent.
func HasInitialEventsEndBookmarkAnnotation(obj runtime.Object) (bool, error) {
objMeta, err := meta.Accessor(obj)
if err != nil {
return false, err
}
objAnnotations := objMeta.GetAnnotations()
return objAnnotations[initialEventsAnnotationKey] == "true", nil
}
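// A hypothetical consumer sketch, assuming a "k8s.io/apimachinery/pkg/watch"
// import: a client streaming initial events treats the first Bookmark that
// carries this annotation as "the initial state is complete".
func exampleWaitForInitialEventsEnd(w watch.Interface) {
	for ev := range w.ResultChan() {
		if ev.Type != watch.Bookmark {
			continue // initial ADDED events, then live events
		}
		if done, err := HasInitialEventsEndBookmarkAnnotation(ev.Object); err == nil && done {
			return // everything after this point is a live change
		}
	}
}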

View File

@@ -23,14 +23,129 @@ import (
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"encoding/binary"
"errors"
"fmt"
"io"
"sync/atomic"
"time"
"k8s.io/apiserver/pkg/storage/value"
"k8s.io/klog/v2"
)
// gcm implements AEAD encryption of the provided values given a cipher.Block algorithm.
// commonSize is the length of various security sensitive byte slices such as encryption keys.
// Do not change this value. It would be a backward incompatible change.
const commonSize = 32
const keySizeCounterNonceGCM = commonSize
// NewGCMTransformerWithUniqueKeyUnsafe is the same as NewGCMTransformer but is unsafe for general
// use because it makes assumptions about the key underlying the block cipher. Specifically,
// it uses a 96-bit nonce where the first 32 bits are random data and the remaining 64 bits are
// a monotonically incrementing atomic counter. This means that the key must be randomly generated
// on process startup and must never be used for encryption outside the lifetime of the process.
// Unlike NewGCMTransformer, this function is immune to the birthday attack and thus the key can
// be used for 2^64-1 writes without rotation. Furthermore, cryptographic wear out of AES-GCM with
// a sequential nonce occurs after 2^64 encryptions, which is not a concern for our use cases.
// Even if that occurs, the nonce counter would overflow and crash the process. We have no concerns
// around plaintext length because all stored items are small (less than 2 MB). To prevent the
// chance of the block cipher being accidentally re-used, it is not taken in as input. Instead,
// a new random key is generated and returned on every invocation of this function. This key is
// used as the input to the block cipher. If the key is stored and retrieved at a later point,
// it can be passed to NewGCMTransformer(aes.NewCipher(key)) to construct a transformer capable
// of decrypting values encrypted by this transformer (that transformer must not be used for encryption).
func NewGCMTransformerWithUniqueKeyUnsafe() (value.Transformer, []byte, error) {
key, err := GenerateKey(keySizeCounterNonceGCM)
if err != nil {
return nil, nil, err
}
block, err := aes.NewCipher(key)
if err != nil {
return nil, nil, err
}
nonceGen := &nonceGenerator{
// we start the nonce counter at one billion so that we are
// guaranteed to detect rollover across different go routines
zero: 1_000_000_000,
fatal: die,
}
nonceGen.nonce.Add(nonceGen.zero)
transformer, err := newGCMTransformerWithUniqueKeyUnsafe(block, nonceGen)
if err != nil {
return nil, nil, err
}
return transformer, key, nil
}
func newGCMTransformerWithUniqueKeyUnsafe(block cipher.Block, nonceGen *nonceGenerator) (value.Transformer, error) {
aead, err := newGCM(block)
if err != nil {
return nil, err
}
nonceFunc := func(b []byte) error {
// we only need 8 bytes to store our 64 bit incrementing nonce
// instead of leaving the unused bytes as zeros, set those to random bits
// this mostly protects us from weird edge cases like a VM restore that rewinds our atomic counter
randNonceSize := len(b) - 8
if err := randomNonce(b[:randNonceSize]); err != nil {
return err
}
nonceGen.next(b[randNonceSize:])
return nil
}
return &gcm{aead: aead, nonceFunc: nonceFunc}, nil
}
func randomNonce(b []byte) error {
_, err := rand.Read(b)
return err
}
type nonceGenerator struct {
// even at one million encryptions per second, this counter is enough for half a million years
// using this struct avoids alignment bugs: https://pkg.go.dev/sync/atomic#pkg-note-BUG
nonce atomic.Uint64
zero uint64
fatal func(msg string)
}
func (n *nonceGenerator) next(b []byte) {
incrementingNonce := n.nonce.Add(1)
if incrementingNonce <= n.zero {
// this should never happen, and is unrecoverable if it does
n.fatal("aes-gcm detected nonce overflow - cryptographic wear out has occurred")
}
binary.LittleEndian.PutUint64(b, incrementingNonce)
}
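// A layout sketch, for illustration only: for a 12-byte AES-GCM nonce, the
// nonceFunc in newGCMTransformerWithUniqueKeyUnsafe fills the first 4 bytes
// with random data and the trailing 8 bytes with the little-endian counter.
func exampleNonceLayout(n *nonceGenerator) ([]byte, error) {
	nonce := make([]byte, 12)
	if err := randomNonce(nonce[:4]); err != nil { // 12 - 8 = 4 random bytes
		return nil, err
	}
	n.next(nonce[4:]) // 8-byte monotonically increasing counter
	return nonce, nil
}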
func die(msg string) {
// nolint:logcheck // we want the stack traces, log flushing, and process exiting logic from FatalDepth
klog.FatalDepth(1, msg)
}
// GenerateKey generates a random key using system randomness.
func GenerateKey(length int) (key []byte, err error) {
defer func(start time.Time) {
value.RecordDataKeyGeneration(start, err)
}(time.Now())
key = make([]byte, length)
if _, err = rand.Read(key); err != nil {
return nil, err
}
return key, nil
}
// NewGCMTransformer takes the given block cipher and performs encryption and decryption on the given data.
// It implements AEAD encryption of the provided values given a cipher.Block algorithm.
// The authenticated data provided as part of the value.Context method must match when the same
// value is set to and loaded from storage. In order to ensure that values cannot be copied by
// an attacker from a location under their control, use characteristics of the storage location
@@ -43,44 +158,49 @@ import (
// therefore transformers using this implementation *must* ensure they allow for frequent key
// rotation. Future work should include investigation of AES-GCM-SIV as an alternative to
// random nonces.
type gcm struct {
block cipher.Block
func NewGCMTransformer(block cipher.Block) (value.Transformer, error) {
aead, err := newGCM(block)
if err != nil {
return nil, err
}
return &gcm{aead: aead, nonceFunc: randomNonce}, nil
}
// NewGCMTransformer takes the given block cipher and performs encryption and decryption on the given
// data.
func NewGCMTransformer(block cipher.Block) value.Transformer {
return &gcm{block: block}
func newGCM(block cipher.Block) (cipher.AEAD, error) {
aead, err := cipher.NewGCM(block)
if err != nil {
return nil, err
}
if nonceSize := aead.NonceSize(); nonceSize != 12 { // all data in etcd will be broken if this ever changes
return nil, fmt.Errorf("crypto/cipher.NewGCM returned unexpected nonce size: %d", nonceSize)
}
return aead, nil
}
type gcm struct {
aead cipher.AEAD
nonceFunc func([]byte) error
}
func (t *gcm) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
aead, err := cipher.NewGCM(t.block)
if err != nil {
return nil, false, err
}
nonceSize := aead.NonceSize()
nonceSize := t.aead.NonceSize()
if len(data) < nonceSize {
return nil, false, fmt.Errorf("the stored data was shorter than the required size")
return nil, false, errors.New("the stored data was shorter than the required size")
}
result, err := aead.Open(nil, data[:nonceSize], data[nonceSize:], dataCtx.AuthenticatedData())
result, err := t.aead.Open(nil, data[:nonceSize], data[nonceSize:], dataCtx.AuthenticatedData())
return result, false, err
}
func (t *gcm) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
aead, err := cipher.NewGCM(t.block)
if err != nil {
return nil, err
nonceSize := t.aead.NonceSize()
result := make([]byte, nonceSize+t.aead.Overhead()+len(data))
if err := t.nonceFunc(result[:nonceSize]); err != nil {
return nil, fmt.Errorf("failed to write nonce for AES-GCM: %w", err)
}
nonceSize := aead.NonceSize()
result := make([]byte, nonceSize+aead.Overhead()+len(data))
n, err := rand.Read(result[:nonceSize])
if err != nil {
return nil, err
}
if n != nonceSize {
return nil, fmt.Errorf("unable to read sufficient random bytes")
}
cipherText := aead.Seal(result[nonceSize:nonceSize], result[:nonceSize], data, dataCtx.AuthenticatedData())
cipherText := t.aead.Seal(result[nonceSize:nonceSize], result[:nonceSize], data, dataCtx.AuthenticatedData())
return result[:nonceSize+len(cipherText)], nil
}
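// A round-trip sketch, for illustration only; the storage path used as
// authenticated data is a made-up example and key handling is elided.
func exampleGCMRoundTrip() error {
	key, err := GenerateKey(commonSize)
	if err != nil {
		return err
	}
	block, err := aes.NewCipher(key)
	if err != nil {
		return err
	}
	t, err := NewGCMTransformer(block)
	if err != nil {
		return err
	}
	dataCtx := value.DefaultContext("/registry/secrets/default/demo")
	ct, err := t.TransformToStorage(context.Background(), []byte("plaintext"), dataCtx)
	if err != nil {
		return err
	}
	pt, _, err := t.TransformFromStorage(context.Background(), ct, dataCtx)
	if err != nil {
		return err
	}
	_ = pt // round-trips back to "plaintext"
	return nil
}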
@@ -96,7 +216,7 @@ func NewCBCTransformer(block cipher.Block) value.Transformer {
}
var (
ErrInvalidBlockSize = fmt.Errorf("the stored data is not a multiple of the block size")
errInvalidBlockSize = errors.New("the stored data is not a multiple of the block size")
errInvalidPKCS7Data = errors.New("invalid PKCS7 data (empty or not padded)")
errInvalidPKCS7Padding = errors.New("invalid padding on input")
)
@@ -104,13 +224,13 @@ var (
func (t *cbc) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
blockSize := aes.BlockSize
if len(data) < blockSize {
return nil, false, fmt.Errorf("the stored data was shorter than the required size")
return nil, false, errors.New("the stored data was shorter than the required size")
}
iv := data[:blockSize]
data = data[blockSize:]
if len(data)%blockSize != 0 {
return nil, false, ErrInvalidBlockSize
return nil, false, errInvalidBlockSize
}
result := make([]byte, len(data))
@@ -140,7 +260,7 @@ func (t *cbc) TransformToStorage(ctx context.Context, data []byte, dataCtx value
result := make([]byte, blockSize+len(data)+paddingSize)
iv := result[:blockSize]
if _, err := io.ReadFull(rand.Reader, iv); err != nil {
return nil, fmt.Errorf("unable to read sufficient random bytes")
return nil, errors.New("unable to read sufficient random bytes")
}
copy(result[blockSize:], data)

View File

@@ -0,0 +1,186 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package aes
import (
"bytes"
"context"
"crypto/aes"
"crypto/sha256"
"errors"
"fmt"
"io"
"time"
"golang.org/x/crypto/hkdf"
"k8s.io/apiserver/pkg/storage/value"
"k8s.io/utils/clock"
)
const (
// cacheTTL is the TTL of KDF cache entries. We assume that the value.Context.AuthenticatedData
// for every call is the etcd storage path of the associated resource, and use that as the primary
// cache key (with a secondary check that confirms that the info matches). Thus if a client
// is constantly creating resources with new names (and thus new paths), they will keep adding new
// entries to the cache for up to this TTL before the GC logic starts deleting old entries. Each
// entry is ~300 bytes in size, so even a malicious client will be bounded in the overall memory
// it can consume.
cacheTTL = 10 * time.Minute
derivedKeySizeExtendedNonceGCM = commonSize
infoSizeExtendedNonceGCM
MinSeedSizeExtendedNonceGCM
)
// NewHKDFExtendedNonceGCMTransformer is the same as NewGCMTransformer but trades storage,
// memory and CPU to work around the limitations of AES-GCM's 12 byte nonce size. The input seed
// is assumed to be a cryptographically strong slice of MinSeedSizeExtendedNonceGCM+ random bytes.
// Unlike NewGCMTransformer, this function is immune to the birthday attack because a new key is generated
// per encryption via a key derivation function: KDF(seed, random_bytes) -> key. The derived key is
// only used once as an AES-GCM key with a random 12 byte nonce. This avoids any concerns around
// cryptographic wear out (by either number of encryptions or the amount of data being encrypted).
// In terms of cryptographic safety, the limit on the number of operations that can be performed
// with a single seed, using derived keys and randomly generated nonces, is not practically reachable.
// Thus, the scheme does not impose any specific requirements on the seed rotation schedule.
// Reusing the same seed is safe to do over time and across process restarts. Whenever a new
// seed is needed, the caller should generate it via GenerateKey(MinSeedSizeExtendedNonceGCM).
// In regard to KMSv2, organization standards or compliance policies around rotation may require
// that the seed be rotated at some interval. This can be implemented externally by rotating
// the key encryption key via a key ID change.
func NewHKDFExtendedNonceGCMTransformer(seed []byte) (value.Transformer, error) {
if seedLen := len(seed); seedLen < MinSeedSizeExtendedNonceGCM {
return nil, fmt.Errorf("invalid seed length %d used for key generation", seedLen)
}
return &extendedNonceGCM{
seed: seed,
cache: newSimpleCache(clock.RealClock{}, cacheTTL),
}, nil
}
type extendedNonceGCM struct {
seed []byte
cache *simpleCache
}
func (e *extendedNonceGCM) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
if len(data) < infoSizeExtendedNonceGCM {
return nil, false, errors.New("the stored data was shorter than the required size")
}
info := data[:infoSizeExtendedNonceGCM]
transformer, err := e.derivedKeyTransformer(info, dataCtx, false)
if err != nil {
return nil, false, fmt.Errorf("failed to derive read key from KDF: %w", err)
}
return transformer.TransformFromStorage(ctx, data, dataCtx)
}
func (e *extendedNonceGCM) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
info := make([]byte, infoSizeExtendedNonceGCM)
if err := randomNonce(info); err != nil {
return nil, fmt.Errorf("failed to generate info for KDF: %w", err)
}
transformer, err := e.derivedKeyTransformer(info, dataCtx, true)
if err != nil {
return nil, fmt.Errorf("failed to derive write key from KDF: %w", err)
}
return transformer.TransformToStorage(ctx, data, dataCtx)
}
func (e *extendedNonceGCM) derivedKeyTransformer(info []byte, dataCtx value.Context, write bool) (value.Transformer, error) {
if !write { // no need to check cache on write since we always generate a new transformer
if transformer := e.cache.get(info, dataCtx); transformer != nil {
return transformer, nil
}
// on read, this is a subslice of a much larger slice and we do not want to hold onto that larger slice
info = bytes.Clone(info)
}
key, err := e.sha256KDFExpandOnly(info)
if err != nil {
return nil, fmt.Errorf("failed to KDF expand seed with info: %w", err)
}
transformer, err := newGCMTransformerWithInfo(key, info)
if err != nil {
return nil, fmt.Errorf("failed to build transformer with KDF derived key: %w", err)
}
e.cache.set(dataCtx, transformer)
return transformer, nil
}
func (e *extendedNonceGCM) sha256KDFExpandOnly(info []byte) ([]byte, error) {
kdf := hkdf.Expand(sha256.New, e.seed, info)
derivedKey := make([]byte, derivedKeySizeExtendedNonceGCM)
if _, err := io.ReadFull(kdf, derivedKey); err != nil {
return nil, fmt.Errorf("failed to read a derived key from KDF: %w", err)
}
return derivedKey, nil
}
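// Note (illustrative commentary, not from the upstream file): the extract step
// of HKDF is deliberately skipped here because the seed is already required to
// be uniformly random (MinSeedSizeExtendedNonceGCM+ bytes from a CSPRNG), so it
// can serve directly as the pseudo-random key for hkdf.Expand, with the
// per-object info bytes acting as the expansion context.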
func newGCMTransformerWithInfo(key, info []byte) (*transformerWithInfo, error) {
block, err := aes.NewCipher(key)
if err != nil {
return nil, err
}
transformer, err := NewGCMTransformer(block)
if err != nil {
return nil, err
}
return &transformerWithInfo{transformer: transformer, info: info}, nil
}
type transformerWithInfo struct {
transformer value.Transformer
// info are extra opaque bytes prepended to the writes from transformer and stripped from reads.
// currently info is used to generate a key via KDF(seed, info) -> key
// and transformer is the output of NewGCMTransformer(aes.NewCipher(key))
info []byte
}
func (t *transformerWithInfo) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
if !bytes.HasPrefix(data, t.info) {
return nil, false, errors.New("the stored data is missing the required info prefix")
}
return t.transformer.TransformFromStorage(ctx, data[len(t.info):], dataCtx)
}
func (t *transformerWithInfo) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
out, err := t.transformer.TransformToStorage(ctx, data, dataCtx)
if err != nil {
return nil, err
}
outWithInfo := make([]byte, 0, len(out)+len(t.info))
outWithInfo = append(outWithInfo, t.info...)
outWithInfo = append(outWithInfo, out...)
return outWithInfo, nil
}
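// splitStoredValue is an illustrative helper, not part of the upstream file:
// values written by transformerWithInfo are laid out as info || gcmOutput, and
// in the extended nonce scheme the info prefix is infoSizeExtendedNonceGCM bytes.
func splitStoredValue(stored []byte) (info, gcmOutput []byte, err error) {
	if len(stored) < infoSizeExtendedNonceGCM {
		return nil, nil, errors.New("stored value shorter than info prefix")
	}
	return stored[:infoSizeExtendedNonceGCM], stored[infoSizeExtendedNonceGCM:], nil
}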

View File

@@ -0,0 +1,91 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package aes
import (
"bytes"
"time"
"unsafe"
utilcache "k8s.io/apimachinery/pkg/util/cache"
"k8s.io/apiserver/pkg/storage/value"
"k8s.io/utils/clock"
)
type simpleCache struct {
cache *utilcache.Expiring
ttl time.Duration
}
func newSimpleCache(clock clock.Clock, ttl time.Duration) *simpleCache {
cache := utilcache.NewExpiringWithClock(clock)
// "Stale" entries are always valid for us because the TTL is just used to prevent
// unbounded growth on the cache - for a given info the transformer is always the same.
// The key always corresponds to the exact same value, with the caveat that
// since we use the value.Context.AuthenticatedData to overwrite old keys,
// we always have to check that the info matches (to validate the transformer is correct).
cache.AllowExpiredGet = true
return &simpleCache{
cache: cache,
ttl: ttl,
}
}
// given a key, return the transformer, or nil if it does not exist in the cache
func (c *simpleCache) get(info []byte, dataCtx value.Context) *transformerWithInfo {
val, ok := c.cache.Get(keyFunc(dataCtx))
if !ok {
return nil
}
transformer := val.(*transformerWithInfo)
if !bytes.Equal(transformer.info, info) {
return nil
}
return transformer
}
// set caches the record for the key
func (c *simpleCache) set(dataCtx value.Context, transformer *transformerWithInfo) {
if dataCtx == nil || len(dataCtx.AuthenticatedData()) == 0 {
panic("authenticated data must not be empty")
}
if transformer == nil {
panic("transformer must not be nil")
}
if len(transformer.info) == 0 {
panic("info must not be empty")
}
c.cache.Set(keyFunc(dataCtx), transformer, c.ttl)
}
func keyFunc(dataCtx value.Context) string {
return toString(dataCtx.AuthenticatedData())
}
// toString performs unholy acts to avoid allocations
func toString(b []byte) string {
// unsafe.SliceData relies on cap whereas we want to rely on len
if len(b) == 0 {
return ""
}
// Copied from go 1.20.1 strings.Builder.String
// https://github.com/golang/go/blob/202a1a57064127c3f19d96df57b9f9586145e21c/src/strings/builder.go#L48
return unsafe.String(unsafe.SliceData(b), len(b))
}
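// Caution (illustrative note, not upstream code): the string returned by
// toString aliases b's backing array, so b must not be mutated afterwards.
// Where aliasing is not acceptable, the plain conversion below is the safe,
// allocating alternative.
func toStringCopy(b []byte) string {
	return string(b) // copies; the result stays valid even if b changes
}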

View File

@@ -53,7 +53,7 @@ type envelopeTransformer struct {
transformers *lru.Cache
// baseTransformerFunc creates a new transformer for encrypting the data with the DEK.
baseTransformerFunc func(cipher.Block) value.Transformer
baseTransformerFunc func(cipher.Block) (value.Transformer, error)
cacheSize int
cacheEnabled bool
@@ -63,7 +63,7 @@ type envelopeTransformer struct {
// It uses envelopeService to encrypt and decrypt DEKs. Respective DEKs (in encrypted form) are prepended to
// the data items they encrypt. A cache (of size cacheSize) is maintained to store the most recently
// used decrypted DEKs in memory.
func NewEnvelopeTransformer(envelopeService Service, cacheSize int, baseTransformerFunc func(cipher.Block) value.Transformer) value.Transformer {
func NewEnvelopeTransformer(envelopeService Service, cacheSize int, baseTransformerFunc func(cipher.Block) (value.Transformer, error)) value.Transformer {
var (
cache *lru.Cache
)
@@ -161,7 +161,11 @@ func (t *envelopeTransformer) addTransformer(encKey []byte, key []byte) (value.T
if err != nil {
return nil, err
}
transformer := t.baseTransformerFunc(block)
transformer, err := t.baseTransformerFunc(block)
if err != nil {
return nil, err
}
// Use base64 of encKey as the key into the cache because hashicorp/golang-lru
// cannot hash []uint8.
if t.cacheEnabled {

View File

@@ -28,9 +28,9 @@ import (
"google.golang.org/grpc/credentials/insecure"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/util"
"k8s.io/klog/v2"
kmsapi "k8s.io/kms/apis/v1beta1"
"k8s.io/kms/pkg/util"
)
const (
@@ -53,7 +53,7 @@ type gRPCService struct {
// NewGRPCService returns an envelope.Service which uses gRPC to communicate with the remote KMS provider.
func NewGRPCService(ctx context.Context, endpoint string, callTimeout time.Duration) (Service, error) {
klog.V(4).Infof("Configure KMS provider with endpoint: %s", endpoint)
klog.V(4).InfoS("Configure KMS provider", "endpoint", endpoint)
addr, err := util.ParseEndpoint(endpoint)
if err != nil {
@@ -72,9 +72,9 @@ func NewGRPCService(ctx context.Context, endpoint string, callTimeout time.Durat
// addr - comes from the closure
c, err := net.DialUnix(unixProtocol, nil, &net.UnixAddr{Name: addr})
if err != nil {
klog.Errorf("failed to create connection to unix socket: %s, error: %v", addr, err)
klog.ErrorS(err, "failed to create connection to unix socket", "addr", addr)
} else {
klog.V(4).Infof("Successfully dialed Unix socket %v", addr)
klog.V(4).InfoS("Successfully dialed Unix socket", "addr", addr)
}
return c, err
}))
@@ -113,7 +113,7 @@ func (g *gRPCService) checkAPIVersion(ctx context.Context) error {
}
g.versionChecked = true
klog.V(4).Infof("Version of KMS provider is %s", response.Version)
klog.V(4).InfoS("KMS provider api version verified", "version", response.Version)
return nil
}

View File

@@ -0,0 +1,112 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package kmsv2 transforms values for storage at rest using an Envelope v2 provider
package kmsv2
import (
"crypto/sha256"
"hash"
"sync"
"time"
"unsafe"
utilcache "k8s.io/apimachinery/pkg/util/cache"
"k8s.io/apiserver/pkg/storage/value"
"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/metrics"
"k8s.io/utils/clock"
)
// simpleCache stores the decryption subset of value.Transformer (value.Read).
// this statically enforces that transformers placed in the cache are not used for encryption.
// this is relevant in the context of nonce collision since transformers that are created
// from encrypted DEKs retrieved from etcd cannot maintain their nonce counter state.
type simpleCache struct {
cache *utilcache.Expiring
ttl time.Duration
// hashPool is a per cache pool of hash.Hash (to avoid allocations from building the Hash)
// SHA-256 is used to prevent collisions
hashPool *sync.Pool
providerName string
mu sync.Mutex // guards call to set
recordCacheSize func(providerName string, size int) // for unit tests
}
func newSimpleCache(clock clock.Clock, ttl time.Duration, providerName string) *simpleCache {
cache := utilcache.NewExpiringWithClock(clock)
cache.AllowExpiredGet = true // for a given key, the value (the decryptTransformer) is always the same
return &simpleCache{
cache: cache,
ttl: ttl,
hashPool: &sync.Pool{
New: func() interface{} {
return sha256.New()
},
},
providerName: providerName,
recordCacheSize: metrics.RecordDekSourceCacheSize,
}
}
// given a key, return the transformer, or nil if it does not exist in the cache
func (c *simpleCache) get(key []byte) value.Read {
record, ok := c.cache.Get(c.keyFunc(key))
if !ok {
return nil
}
return record.(value.Read)
}
// set caches the record for the key
func (c *simpleCache) set(key []byte, transformer value.Read) {
c.mu.Lock()
defer c.mu.Unlock()
if len(key) == 0 {
panic("key must not be empty")
}
if transformer == nil {
panic("transformer must not be nil")
}
c.cache.Set(c.keyFunc(key), transformer, c.ttl)
// Add metrics for cache size
c.recordCacheSize(c.providerName, c.cache.Len())
}
// keyFunc generates a string key by hashing the inputs.
// This lowers the memory requirement of the cache.
func (c *simpleCache) keyFunc(s []byte) string {
h := c.hashPool.Get().(hash.Hash)
h.Reset()
if _, err := h.Write(s); err != nil {
panic(err) // Write() on hash never fails
}
key := toString(h.Sum(nil)) // skip base64 encoding to save an allocation
c.hashPool.Put(h)
return key
}
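// keyFuncSimple is an illustrative, allocation-heavier equivalent of keyFunc,
// not part of the upstream file: it builds a fresh SHA-256 state per call,
// whereas the pooled version above reuses hash states across lookups.
func keyFuncSimple(s []byte) string {
	sum := sha256.Sum256(s)
	return toString(sum[:])
}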
// toString performs unholy acts to avoid allocations
func toString(b []byte) string {
// unsafe.SliceData relies on cap whereas we want to rely on len
if len(b) == 0 {
return ""
}
// Copied from go 1.20.1 strings.Builder.String
// https://github.com/golang/go/blob/202a1a57064127c3f19d96df57b9f9586145e21c/src/strings/builder.go#L48
return unsafe.String(unsafe.SliceData(b), len(b))
}

View File

@@ -21,204 +21,296 @@ import (
"context"
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"encoding/base64"
"crypto/sha256"
"fmt"
"sort"
"time"
"unsafe"
"github.com/gogo/protobuf/proto"
"go.opentelemetry.io/otel/attribute"
"golang.org/x/crypto/cryptobyte"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/apimachinery/pkg/util/validation"
"k8s.io/apimachinery/pkg/util/validation/field"
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/storage/value"
kmstypes "k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2/v2alpha1"
aestransformer "k8s.io/apiserver/pkg/storage/value/encrypt/aes"
kmstypes "k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2/v2"
"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/metrics"
"k8s.io/component-base/tracing"
"k8s.io/klog/v2"
"k8s.io/utils/lru"
kmsservice "k8s.io/kms/pkg/service"
"k8s.io/utils/clock"
)
func init() {
value.RegisterMetrics()
metrics.RegisterMetrics()
}
const (
// KMSAPIVersion is the version of the KMS API.
KMSAPIVersion = "v2alpha1"
// KMSAPIVersionv2 is a version of the KMS API.
KMSAPIVersionv2 = "v2"
// KMSAPIVersionv2beta1 is a version of the KMS API.
KMSAPIVersionv2beta1 = "v2beta1"
// annotationsMaxSize is the maximum size of the annotations.
annotationsMaxSize = 32 * 1024 // 32 kB
// keyIDMaxSize is the maximum size of the keyID.
keyIDMaxSize = 1 * 1024 // 1 kB
// encryptedDEKMaxSize is the maximum size of the encrypted DEK.
encryptedDEKMaxSize = 1 * 1024 // 1 kB
// KeyIDMaxSize is the maximum size of the keyID.
KeyIDMaxSize = 1 * 1024 // 1 kB
// encryptedDEKSourceMaxSize is the maximum size of the encrypted DEK source.
encryptedDEKSourceMaxSize = 1 * 1024 // 1 kB
// cacheTTL is the default time-to-live for the cache entry.
// this allows the cache to grow to an infinite size for up to a day.
// there is unlikely to be any meaningful memory impact on the server
// because the cache will likely never have more than a few thousand entries.
// each entry can be large due to an internal cache that maps the DEK seed to individual
// DEK entries, but that cache has an aggressive TTL to keep the size under control.
// with DEK/seed reuse and no storage migration, the number of entries in this cache
// would be approximated by unique key IDs used by the KMS plugin
// combined with the number of server restarts. If storage migration
// is performed after key ID changes, and the number of restarts
// is limited, this cache size may be as small as the number of API
// servers in use (once old entries expire out from the TTL).
cacheTTL = 24 * time.Hour
// key ID related error codes for metrics
errKeyIDOKCode ErrCodeKeyID = "ok"
errKeyIDEmptyCode ErrCodeKeyID = "empty"
errKeyIDTooLongCode ErrCodeKeyID = "too_long"
)
// Service allows encrypting and decrypting data using an external Key Management Service.
type Service interface {
// Decrypt a given byte array to obtain the original data as bytes.
Decrypt(ctx context.Context, uid string, req *DecryptRequest) ([]byte, error)
// Encrypt bytes to a ciphertext.
Encrypt(ctx context.Context, uid string, data []byte) (*EncryptResponse, error)
// Status returns the status of the KMS.
Status(ctx context.Context) (*StatusResponse, error)
// NowFunc is exported so tests can override it.
var NowFunc = time.Now
type StateFunc func() (State, error)
type ErrCodeKeyID string
type State struct {
Transformer value.Transformer
EncryptedObject kmstypes.EncryptedObject
UID string
ExpirationTimestamp time.Time
// CacheKey is the key used to cache the DEK/seed in envelopeTransformer.cache.
CacheKey []byte
}
func (s *State) ValidateEncryptCapability() error {
if now := NowFunc(); now.After(s.ExpirationTimestamp) {
return fmt.Errorf("encryptedDEKSource with keyID hash %q expired at %s (current time is %s)",
GetHashIfNotEmpty(s.EncryptedObject.KeyID), s.ExpirationTimestamp.Format(time.RFC3339), now.Format(time.RFC3339))
}
return nil
}
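// staticStateFunc is an illustrative sketch, not part of the upstream file: a
// minimal StateFunc that always serves the same State. The transformer itself
// calls ValidateEncryptCapability on writes, so writes begin failing once the
// fixed ExpirationTimestamp passes.
func staticStateFunc(s State) StateFunc {
	return func() (State, error) {
		return s, nil
	}
}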
type envelopeTransformer struct {
envelopeService Service
envelopeService kmsservice.Service
providerName string
stateFunc StateFunc
// transformers is a thread-safe LRU cache which caches decrypted DEKs indexed by their encrypted form.
transformers *lru.Cache
// baseTransformerFunc creates a new transformer for encrypting the data with the DEK.
baseTransformerFunc func(cipher.Block) value.Transformer
cacheSize int
cacheEnabled bool
}
// EncryptResponse is the response from the Envelope service when encrypting data.
type EncryptResponse struct {
Ciphertext []byte
KeyID string
Annotations map[string][]byte
}
// DecryptRequest is the request to the Envelope service when decrypting data.
type DecryptRequest struct {
Ciphertext []byte
KeyID string
Annotations map[string][]byte
}
// StatusResponse is the response from the Envelope service when getting the status of the service.
type StatusResponse struct {
Version string
Healthz string
KeyID string
// cache is a thread-safe expiring lru cache which caches decrypted DEKs indexed by their encrypted form.
cache *simpleCache
apiServerID string
}
// NewEnvelopeTransformer returns a transformer which implements a KEK-DEK based envelope encryption scheme.
// It uses envelopeService to encrypt and decrypt DEKs. Respective DEKs (in encrypted form) are prepended to
// the data items they encrypt. A cache (of size cacheSize) is maintained to store the most recently
// used decrypted DEKs in memory.
func NewEnvelopeTransformer(envelopeService Service, cacheSize int, baseTransformerFunc func(cipher.Block) value.Transformer) value.Transformer {
var cache *lru.Cache
if cacheSize > 0 {
// TODO(aramase): Switch to using expiring cache: kubernetes/kubernetes/staging/src/k8s.io/apimachinery/pkg/util/cache/expiring.go.
// It handles scans a lot better, doesn't have to be right-sized, and doesn't have a global lock on reads.
cache = lru.New(cacheSize)
}
// the data items they encrypt.
func NewEnvelopeTransformer(envelopeService kmsservice.Service, providerName string, stateFunc StateFunc, apiServerID string) value.Transformer {
return newEnvelopeTransformerWithClock(envelopeService, providerName, stateFunc, apiServerID, cacheTTL, clock.RealClock{})
}
func newEnvelopeTransformerWithClock(envelopeService kmsservice.Service, providerName string, stateFunc StateFunc, apiServerID string, cacheTTL time.Duration, clock clock.Clock) value.Transformer {
return &envelopeTransformer{
envelopeService: envelopeService,
transformers: cache,
baseTransformerFunc: baseTransformerFunc,
cacheEnabled: cacheSize > 0,
cacheSize: cacheSize,
envelopeService: envelopeService,
providerName: providerName,
stateFunc: stateFunc,
cache: newSimpleCache(clock, cacheTTL, providerName),
apiServerID: apiServerID,
}
}
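// exampleNewTransformer is an illustrative wiring sketch, not part of the
// upstream file: the provider name and apiserver ID are hypothetical, and in
// practice all four arguments come from the encryption configuration plumbing.
func exampleNewTransformer(envelopeService kmsservice.Service, stateFunc StateFunc) value.Transformer {
	return NewEnvelopeTransformer(envelopeService, "example-provider", stateFunc, "example-apiserver-id")
}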
// TransformFromStorage decrypts data encrypted by this transformer using envelope encryption.
func (t *envelopeTransformer) TransformFromStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, bool, error) {
metrics.RecordArrival(metrics.FromStorageLabel, time.Now())
ctx, span := tracing.Start(ctx, "TransformFromStorage with envelopeTransformer",
attribute.String("transformer.provider.name", t.providerName),
// The service.instance_id of the apiserver is already available in the trace
/*
{
"key": "service.instance.id",
"type": "string",
"value": "apiserver-zsteyir5lyrtdcmqqmd5kzze6m"
}
*/
)
defer span.End(500 * time.Millisecond)
span.AddEvent("About to decode encrypted object")
// Deserialize the EncryptedObject from the data.
encryptedObject, err := t.doDecode(data)
if err != nil {
span.AddEvent("Decoding encrypted object failed")
span.RecordError(err)
return nil, false, err
}
span.AddEvent("Decoded encrypted object")
useSeed := encryptedObject.EncryptedDEKSourceType == kmstypes.EncryptedDEKSourceType_HKDF_SHA256_XNONCE_AES_GCM_SEED
// TODO: consider marking state.EncryptedDEK != encryptedObject.EncryptedDEK as a stale read to support DEK defragmentation
// at a minimum we should have a metric that helps the user understand if DEK fragmentation is high
state, err := t.stateFunc() // no need to call state.ValidateEncryptCapability on reads
if err != nil {
return nil, false, err
}
// Look up the decrypted DEK from cache or Envelope.
transformer := t.getTransformer(encryptedObject.EncryptedDEK)
encryptedObjectCacheKey, err := generateCacheKey(encryptedObject.EncryptedDEKSourceType, encryptedObject.EncryptedDEKSource, encryptedObject.KeyID, encryptedObject.Annotations)
if err != nil {
return nil, false, err
}
// Look up the decrypted DEK from cache first
transformer := t.cache.get(encryptedObjectCacheKey)
// fallback to the envelope service if we do not have the transformer locally
if transformer == nil {
if t.cacheEnabled {
value.RecordCacheMiss()
}
span.AddEvent("About to decrypt DEK using remote service")
value.RecordCacheMiss()
requestInfo := getRequestInfoFromContext(ctx)
uid := string(uuid.NewUUID())
klog.V(6).InfoS("Decrypting content using envelope service", "uid", uid, "key", string(dataCtx.AuthenticatedData()))
key, err := t.envelopeService.Decrypt(ctx, uid, &DecryptRequest{
Ciphertext: encryptedObject.EncryptedDEK,
klog.V(6).InfoS("decrypting content using envelope service", "uid", uid, "key", string(dataCtx.AuthenticatedData()),
"group", requestInfo.APIGroup, "version", requestInfo.APIVersion, "resource", requestInfo.Resource, "subresource", requestInfo.Subresource,
"verb", requestInfo.Verb, "namespace", requestInfo.Namespace, "name", requestInfo.Name)
key, err := t.envelopeService.Decrypt(ctx, uid, &kmsservice.DecryptRequest{
Ciphertext: encryptedObject.EncryptedDEKSource,
KeyID: encryptedObject.KeyID,
Annotations: encryptedObject.Annotations,
})
if err != nil {
span.AddEvent("DEK decryption failed")
span.RecordError(err)
return nil, false, fmt.Errorf("failed to decrypt DEK, error: %w", err)
}
span.AddEvent("DEK decryption succeeded")
transformer, err = t.addTransformer(encryptedObject.EncryptedDEK, key)
transformer, err = t.addTransformerForDecryption(encryptedObjectCacheKey, key, useSeed)
if err != nil {
return nil, false, err
}
}
metrics.RecordKeyID(metrics.FromStorageLabel, t.providerName, encryptedObject.KeyID, t.apiServerID)
return transformer.TransformFromStorage(ctx, encryptedObject.EncryptedData, dataCtx)
span.AddEvent("About to decrypt data using DEK")
out, stale, err := transformer.TransformFromStorage(ctx, encryptedObject.EncryptedData, dataCtx)
if err != nil {
span.AddEvent("Data decryption failed")
span.RecordError(err)
return nil, false, err
}
span.AddEvent("Data decryption succeeded")
// data is considered stale if the key ID does not match our current write transformer
return out,
stale ||
encryptedObject.KeyID != state.EncryptedObject.KeyID ||
encryptedObject.EncryptedDEKSourceType != state.EncryptedObject.EncryptedDEKSourceType,
nil
}
// TransformToStorage encrypts data to be written to disk using envelope encryption.
func (t *envelopeTransformer) TransformToStorage(ctx context.Context, data []byte, dataCtx value.Context) ([]byte, error) {
metrics.RecordArrival(metrics.ToStorageLabel, time.Now())
newKey, err := generateKey(32)
ctx, span := tracing.Start(ctx, "TransformToStorage with envelopeTransformer",
attribute.String("transformer.provider.name", t.providerName),
// The service.instance_id of the apiserver is already available in the trace
/*
{
"key": "service.instance.id",
"type": "string",
"value": "apiserver-zsteyir5lyrtdcmqqmd5kzze6m"
}
*/
)
defer span.End(500 * time.Millisecond)
state, err := t.stateFunc()
if err != nil {
return nil, err
}
uid := string(uuid.NewUUID())
klog.V(6).InfoS("Encrypting content using envelope service", "uid", uid, "key", string(dataCtx.AuthenticatedData()))
resp, err := t.envelopeService.Encrypt(ctx, uid, newKey)
if err != nil {
return nil, fmt.Errorf("failed to encrypt DEK, error: %w", err)
}
transformer, err := t.addTransformer(resp.Ciphertext, newKey)
if err != nil {
if err := state.ValidateEncryptCapability(); err != nil {
return nil, err
}
result, err := transformer.TransformToStorage(ctx, data, dataCtx)
// this prevents a cache miss every time the DEK rotates
// this has the side benefit of causing the cache to perform a GC
// TODO see if we can do this inside the stateFunc control loop
t.cache.set(state.CacheKey, state.Transformer)
requestInfo := getRequestInfoFromContext(ctx)
klog.V(6).InfoS("encrypting content using DEK", "uid", state.UID, "key", string(dataCtx.AuthenticatedData()),
"group", requestInfo.APIGroup, "version", requestInfo.APIVersion, "resource", requestInfo.Resource, "subresource", requestInfo.Subresource,
"verb", requestInfo.Verb, "namespace", requestInfo.Namespace, "name", requestInfo.Name)
span.AddEvent("About to encrypt data using DEK")
result, err := state.Transformer.TransformToStorage(ctx, data, dataCtx)
if err != nil {
span.AddEvent("Data encryption failed")
span.RecordError(err)
return nil, err
}
span.AddEvent("Data encryption succeeded")
encObject := &kmstypes.EncryptedObject{
KeyID: resp.KeyID,
EncryptedDEK: resp.Ciphertext,
EncryptedData: result,
Annotations: resp.Annotations,
}
metrics.RecordKeyID(metrics.ToStorageLabel, t.providerName, state.EncryptedObject.KeyID, t.apiServerID)
encObjectCopy := state.EncryptedObject
encObjectCopy.EncryptedData = result
span.AddEvent("About to encode encrypted object")
// Serialize the EncryptedObject to a byte array.
return t.doEncode(encObject)
out, err := t.doEncode(&encObjectCopy)
if err != nil {
span.AddEvent("Encoding encrypted object failed")
span.RecordError(err)
return nil, err
}
span.AddEvent("Encoded encrypted object")
return out, nil
}
// addTransformer inserts a new transformer into the Envelope cache of DEKs for future reads.
func (t *envelopeTransformer) addTransformer(encKey []byte, key []byte) (value.Transformer, error) {
block, err := aes.NewCipher(key)
// addTransformerForDecryption inserts a new transformer into the Envelope cache of DEKs for future reads.
func (t *envelopeTransformer) addTransformerForDecryption(cacheKey []byte, key []byte, useSeed bool) (value.Read, error) {
var transformer value.Read
var err error
if useSeed {
// the input key is considered safe to use here because it is coming from the KMS plugin / etcd
transformer, err = aestransformer.NewHKDFExtendedNonceGCMTransformer(key)
} else {
var block cipher.Block
block, err = aes.NewCipher(key)
if err != nil {
return nil, err
}
// this is compatible with NewGCMTransformerWithUniqueKeyUnsafe for decryption
// it would use random nonces for encryption but we never do that
transformer, err = aestransformer.NewGCMTransformer(block)
}
if err != nil {
return nil, err
}
transformer := t.baseTransformerFunc(block)
// Use base64 of encKey as the key into the cache because hashicorp/golang-lru
// cannot hash []uint8.
if t.cacheEnabled {
t.transformers.Add(base64.StdEncoding.EncodeToString(encKey), transformer)
metrics.RecordDekCacheFillPercent(float64(t.transformers.Len()) / float64(t.cacheSize))
}
t.cache.set(cacheKey, transformer)
return transformer, nil
}
// getTransformer fetches the transformer corresponding to encKey from cache, if it exists.
func (t *envelopeTransformer) getTransformer(encKey []byte) value.Transformer {
if !t.cacheEnabled {
return nil
}
_transformer, found := t.transformers.Get(base64.StdEncoding.EncodeToString(encKey))
if found {
return _transformer.(value.Transformer)
}
return nil
}
// doEncode encodes the EncryptedObject to a byte array.
func (t *envelopeTransformer) doEncode(request *kmstypes.EncryptedObject) ([]byte, error) {
if err := validateEncryptedObject(request); err != nil {
if err := ValidateEncryptedObject(request); err != nil {
return nil, err
}
return proto.Marshal(request)
@@ -230,38 +322,86 @@ func (t *envelopeTransformer) doDecode(originalData []byte) (*kmstypes.Encrypted
if err := proto.Unmarshal(originalData, o); err != nil {
return nil, err
}
// validate the EncryptedObject
if err := validateEncryptedObject(o); err != nil {
if err := ValidateEncryptedObject(o); err != nil {
return nil, err
}
return o, nil
}
// generateKey generates a random key using system randomness.
func generateKey(length int) (key []byte, err error) {
defer func(start time.Time) {
value.RecordDataKeyGeneration(start, err)
}(time.Now())
key = make([]byte, length)
if _, err = rand.Read(key); err != nil {
return nil, err
// GenerateTransformer generates a new transformer and encrypts the DEK/seed using the envelope service.
// It returns the transformer, the encrypted DEK/seed, cache key and error.
func GenerateTransformer(ctx context.Context, uid string, envelopeService kmsservice.Service, useSeed bool) (value.Transformer, *kmstypes.EncryptedObject, []byte, error) {
newTransformerFunc := func() (value.Transformer, []byte, error) {
seed, err := aestransformer.GenerateKey(aestransformer.MinSeedSizeExtendedNonceGCM)
if err != nil {
return nil, nil, err
}
transformer, err := aestransformer.NewHKDFExtendedNonceGCMTransformer(seed)
if err != nil {
return nil, nil, err
}
return transformer, seed, nil
}
if !useSeed {
newTransformerFunc = aestransformer.NewGCMTransformerWithUniqueKeyUnsafe
}
transformer, newKey, err := newTransformerFunc()
if err != nil {
return nil, nil, nil, err
}
return key, nil
klog.V(6).InfoS("encrypting content using envelope service", "uid", uid)
resp, err := envelopeService.Encrypt(ctx, uid, newKey)
if err != nil {
return nil, nil, nil, fmt.Errorf("failed to encrypt DEK, error: %w", err)
}
o := &kmstypes.EncryptedObject{
KeyID: resp.KeyID,
EncryptedDEKSource: resp.Ciphertext,
EncryptedData: []byte{0}, // any non-empty value to pass validation
Annotations: resp.Annotations,
}
if useSeed {
o.EncryptedDEKSourceType = kmstypes.EncryptedDEKSourceType_HKDF_SHA256_XNONCE_AES_GCM_SEED
} else {
o.EncryptedDEKSourceType = kmstypes.EncryptedDEKSourceType_AES_GCM_KEY
}
if err := ValidateEncryptedObject(o); err != nil {
return nil, nil, nil, err
}
cacheKey, err := generateCacheKey(o.EncryptedDEKSourceType, resp.Ciphertext, resp.KeyID, resp.Annotations)
if err != nil {
return nil, nil, nil, err
}
o.EncryptedData = nil // make sure that later code that uses this encrypted object sets this field
return transformer, o, cacheKey, nil
}
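// exampleMintState is an illustrative sketch, not part of the upstream file:
// it shows how a caller might mint a fresh write State from GenerateTransformer,
// keeping the returned cache key and EncryptedObject template for later writes.
// The function name and the 3 minute TTL are hypothetical.
func exampleMintState(ctx context.Context, svc kmsservice.Service) (State, error) {
	uid := string(uuid.NewUUID())
	transformer, encObject, cacheKey, err := GenerateTransformer(ctx, uid, svc, true /* useSeed */)
	if err != nil {
		return State{}, err
	}
	return State{
		Transformer:         transformer,
		EncryptedObject:     *encObject,
		UID:                 uid,
		ExpirationTimestamp: NowFunc().Add(3 * time.Minute), // hypothetical refresh interval
		CacheKey:            cacheKey,
	}, nil
}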
func validateEncryptedObject(o *kmstypes.EncryptedObject) error {
func ValidateEncryptedObject(o *kmstypes.EncryptedObject) error {
if o == nil {
return fmt.Errorf("encrypted object is nil")
}
switch t := o.EncryptedDEKSourceType; t {
case kmstypes.EncryptedDEKSourceType_AES_GCM_KEY:
case kmstypes.EncryptedDEKSourceType_HKDF_SHA256_XNONCE_AES_GCM_SEED:
default:
return fmt.Errorf("unknown encryptedDEKSourceType: %d", t)
}
if len(o.EncryptedData) == 0 {
return fmt.Errorf("encrypted data is empty")
}
if err := validateEncryptedDEK(o.EncryptedDEK); err != nil {
return fmt.Errorf("failed to validate encrypted DEK: %w", err)
if err := validateEncryptedDEKSource(o.EncryptedDEKSource); err != nil {
return fmt.Errorf("failed to validate encrypted DEK source: %w", err)
}
if err := validateKeyID(o.KeyID); err != nil {
if _, err := ValidateKeyID(o.KeyID); err != nil {
return fmt.Errorf("failed to validate key id: %w", err)
}
if err := validateAnnotations(o.Annotations); err != nil {
@@ -270,15 +410,15 @@ func validateEncryptedObject(o *kmstypes.EncryptedObject) error {
return nil
}
// validateEncryptedDEK tests the following:
// 1. The encrypted DEK is not empty.
// 2. The size of encrypted DEK is less than 1 kB.
func validateEncryptedDEK(encryptedDEK []byte) error {
if len(encryptedDEK) == 0 {
return fmt.Errorf("encrypted DEK is empty")
// validateEncryptedDEKSource tests the following:
// 1. The encrypted DEK source is not empty.
// 2. The size of encrypted DEK source is less than 1 kB.
func validateEncryptedDEKSource(encryptedDEKSource []byte) error {
if len(encryptedDEKSource) == 0 {
return fmt.Errorf("encrypted DEK source is empty")
}
if len(encryptedDEK) > encryptedDEKMaxSize {
return fmt.Errorf("encrypted DEK is %d bytes, which exceeds the max size of %d", len(encryptedDEK), encryptedDEKMaxSize)
if len(encryptedDEKSource) > encryptedDEKSourceMaxSize {
return fmt.Errorf("encrypted DEK source is %d bytes, which exceeds the max size of %d", len(encryptedDEKSource), encryptedDEKSourceMaxSize)
}
return nil
}
@@ -301,15 +441,88 @@ func validateAnnotations(annotations map[string][]byte) error {
return utilerrors.NewAggregate(errs)
}
// validateKeyID tests the following:
// ValidateKeyID tests the following:
// 1. The keyID is not empty.
// 2. The size of keyID is less than 1 kB.
func validateKeyID(keyID string) error {
func ValidateKeyID(keyID string) (ErrCodeKeyID, error) {
if len(keyID) == 0 {
return fmt.Errorf("keyID is empty")
return errKeyIDEmptyCode, fmt.Errorf("keyID is empty")
}
if len(keyID) > keyIDMaxSize {
return fmt.Errorf("keyID is %d bytes, which exceeds the max size of %d", len(keyID), keyIDMaxSize)
if len(keyID) > KeyIDMaxSize {
return errKeyIDTooLongCode, fmt.Errorf("keyID is %d bytes, which exceeds the max size of %d", len(keyID), KeyIDMaxSize)
}
return nil
return errKeyIDOKCode, nil
}
func getRequestInfoFromContext(ctx context.Context) *genericapirequest.RequestInfo {
if reqInfo, found := genericapirequest.RequestInfoFrom(ctx); found {
return reqInfo
}
return &genericapirequest.RequestInfo{}
}
// generateCacheKey returns a key for the cache.
// The key is a concatenation of:
// 0. encryptedDEKSourceType
// 1. encryptedDEKSource
// 2. keyID
// 3. length of annotations
// 4. annotations (sorted by key) - each annotation is a concatenation of:
// a. annotation key
// b. annotation value
func generateCacheKey(encryptedDEKSourceType kmstypes.EncryptedDEKSourceType, encryptedDEKSource []byte, keyID string, annotations map[string][]byte) ([]byte, error) {
// TODO(aramase): use sync pool buffer to avoid allocations
b := cryptobyte.NewBuilder(nil)
b.AddUint32(uint32(encryptedDEKSourceType))
b.AddUint16LengthPrefixed(func(b *cryptobyte.Builder) {
b.AddBytes(encryptedDEKSource)
})
b.AddUint16LengthPrefixed(func(b *cryptobyte.Builder) {
b.AddBytes(toBytes(keyID))
})
if len(annotations) == 0 {
return b.Bytes()
}
// add the length of annotations to the cache key
b.AddUint32(uint32(len(annotations)))
// Sort the annotations by key.
keys := make([]string, 0, len(annotations))
for k := range annotations {
k := k
keys = append(keys, k)
}
sort.Strings(keys)
for _, k := range keys {
// The maximum size of annotations is annotationsMaxSize (32 kB) so we can safely
// assume that the length of the key and value will fit in a uint16.
b.AddUint16LengthPrefixed(func(b *cryptobyte.Builder) {
b.AddBytes(toBytes(k))
})
b.AddUint16LengthPrefixed(func(b *cryptobyte.Builder) {
b.AddBytes(annotations[k])
})
}
return b.Bytes()
}
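// Layout note (illustrative, not upstream documentation): for a key with no
// annotations the bytes produced above are
//   uint32(encryptedDEKSourceType) || uint16(len(encryptedDEKSource)) || encryptedDEKSource || uint16(len(keyID)) || keyID
// which mirrors the cryptobyte builder calls in generateCacheKey.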
// toBytes performs unholy acts to avoid allocations
func toBytes(s string) []byte {
// unsafe.StringData is unspecified for the empty string, so we provide a strict interpretation
if len(s) == 0 {
return nil
}
// Copied from go 1.20.1 os.File.WriteString
// https://github.com/golang/go/blob/202a1a57064127c3f19d96df57b9f9586145e21c/src/os/file.go#L246
return unsafe.Slice(unsafe.StringData(s), len(s))
}
// GetHashIfNotEmpty returns the sha256 hash of the data if it is not empty.
func GetHashIfNotEmpty(data string) string {
if len(data) > 0 {
return fmt.Sprintf("sha256:%x", sha256.Sum256([]byte(data)))
}
return ""
}

View File

@@ -27,9 +27,11 @@ import (
"google.golang.org/grpc/credentials/insecure"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/util"
"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/metrics"
"k8s.io/klog/v2"
kmsapi "k8s.io/kms/apis/v2alpha1"
kmsapi "k8s.io/kms/apis/v2"
kmsservice "k8s.io/kms/pkg/service"
"k8s.io/kms/pkg/util"
)
const (
@@ -45,8 +47,8 @@ type gRPCService struct {
}
// NewGRPCService returns an envelope.Service which uses gRPC to communicate with the remote KMS provider.
func NewGRPCService(ctx context.Context, endpoint string, callTimeout time.Duration) (Service, error) {
klog.V(4).Infof("Configure KMS provider with endpoint: %s", endpoint)
func NewGRPCService(ctx context.Context, endpoint, providerName string, callTimeout time.Duration) (kmsservice.Service, error) {
klog.V(4).InfoS("Configure KMS provider", "endpoint", endpoint)
addr, err := util.ParseEndpoint(endpoint)
if err != nil {
@@ -64,12 +66,14 @@ func NewGRPCService(ctx context.Context, endpoint string, callTimeout time.Durat
// addr - comes from the closure
c, err := net.DialUnix(unixProtocol, nil, &net.UnixAddr{Name: addr})
if err != nil {
klog.Errorf("failed to create connection to unix socket: %s, error: %v", addr, err)
klog.ErrorS(err, "failed to create connection to unix socket", "addr", addr)
} else {
klog.V(4).Infof("Successfully dialed Unix socket %v", addr)
klog.V(4).InfoS("Successfully dialed Unix socket", "addr", addr)
}
return c, err
}))
}),
grpc.WithChainUnaryInterceptor(recordMetricsInterceptor(providerName)),
)
if err != nil {
return nil, fmt.Errorf("failed to create connection to %s, error: %v", endpoint, err)
@@ -88,7 +92,7 @@ func NewGRPCService(ctx context.Context, endpoint string, callTimeout time.Durat
}
// Decrypt a given data string to obtain the original byte data.
func (g *gRPCService) Decrypt(ctx context.Context, uid string, req *DecryptRequest) ([]byte, error) {
func (g *gRPCService) Decrypt(ctx context.Context, uid string, req *kmsservice.DecryptRequest) ([]byte, error) {
ctx, cancel := context.WithTimeout(ctx, g.callTimeout)
defer cancel()
@@ -106,7 +110,7 @@ func (g *gRPCService) Decrypt(ctx context.Context, uid string, req *DecryptReque
}
// Encrypt bytes to a ciphertext.
func (g *gRPCService) Encrypt(ctx context.Context, uid string, plaintext []byte) (*EncryptResponse, error) {
func (g *gRPCService) Encrypt(ctx context.Context, uid string, plaintext []byte) (*kmsservice.EncryptResponse, error) {
ctx, cancel := context.WithTimeout(ctx, g.callTimeout)
defer cancel()
@@ -118,7 +122,7 @@ func (g *gRPCService) Encrypt(ctx context.Context, uid string, plaintext []byte)
if err != nil {
return nil, err
}
return &EncryptResponse{
return &kmsservice.EncryptResponse{
Ciphertext: response.Ciphertext,
KeyID: response.KeyId,
Annotations: response.Annotations,
@@ -126,7 +130,7 @@ func (g *gRPCService) Encrypt(ctx context.Context, uid string, plaintext []byte)
}
// Status returns the status of the KMSv2 provider.
func (g *gRPCService) Status(ctx context.Context) (*StatusResponse, error) {
func (g *gRPCService) Status(ctx context.Context) (*kmsservice.StatusResponse, error) {
ctx, cancel := context.WithTimeout(ctx, g.callTimeout)
defer cancel()
@@ -135,5 +139,15 @@ func (g *gRPCService) Status(ctx context.Context) (*StatusResponse, error) {
if err != nil {
return nil, err
}
return &StatusResponse{Version: response.Version, Healthz: response.Healthz, KeyID: response.KeyId}, nil
return &kmsservice.StatusResponse{Version: response.Version, Healthz: response.Healthz, KeyID: response.KeyId}, nil
}
func recordMetricsInterceptor(providerName string) grpc.UnaryClientInterceptor {
return func(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
start := NowFunc()
respErr := invoker(ctx, method, req, reply, cc, opts...)
elapsed := NowFunc().Sub(start)
metrics.RecordKMSOperationLatency(providerName, method, elapsed, respErr)
return respErr
}
}

View File

@@ -0,0 +1,186 @@
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: api.proto
package v2
import (
fmt "fmt"
proto "github.com/gogo/protobuf/proto"
math "math"
)
// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf
// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
type EncryptedDEKSourceType int32
const (
// AES_GCM_KEY means that the plaintext of encryptedDEKSource is the DEK itself, with AES-GCM as the encryption algorithm.
EncryptedDEKSourceType_AES_GCM_KEY EncryptedDEKSourceType = 0
// HKDF_SHA256_XNONCE_AES_GCM_SEED means that the plaintext of encryptedDEKSource is the pseudo random key
// (referred to as the seed throughout the code) that is fed into HKDF expand. SHA256 is the hash algorithm
// and the first 32 bytes of encryptedData are the info param. The first 32 bytes from the HKDF stream are used
// as the DEK with AES-GCM as the encryption algorithm.
EncryptedDEKSourceType_HKDF_SHA256_XNONCE_AES_GCM_SEED EncryptedDEKSourceType = 1
)
var EncryptedDEKSourceType_name = map[int32]string{
0: "AES_GCM_KEY",
1: "HKDF_SHA256_XNONCE_AES_GCM_SEED",
}
var EncryptedDEKSourceType_value = map[string]int32{
"AES_GCM_KEY": 0,
"HKDF_SHA256_XNONCE_AES_GCM_SEED": 1,
}
func (x EncryptedDEKSourceType) String() string {
return proto.EnumName(EncryptedDEKSourceType_name, int32(x))
}
func (EncryptedDEKSourceType) EnumDescriptor() ([]byte, []int) {
return fileDescriptor_00212fb1f9d3bf1c, []int{0}
}
// EncryptedObject is the representation of data stored in etcd after envelope encryption.
type EncryptedObject struct {
// EncryptedData is the encrypted data.
EncryptedData []byte `protobuf:"bytes,1,opt,name=encryptedData,proto3" json:"encryptedData,omitempty"`
// KeyID is the KMS key ID used for encryption operations.
// keyID must satisfy the following constraints:
// 1. The keyID is not empty.
// 2. The size of keyID is less than 1 kB.
KeyID string `protobuf:"bytes,2,opt,name=keyID,proto3" json:"keyID,omitempty"`
// EncryptedDEKSource is the ciphertext of the source of the DEK used to encrypt the data stored in encryptedData.
// encryptedDEKSourceType defines the process of using the plaintext of this field to determine the aforementioned DEK.
// encryptedDEKSource must satisfy the following constraints:
// 1. The encrypted DEK source is not empty.
// 2. The size of encrypted DEK source is less than 1 kB.
EncryptedDEKSource []byte `protobuf:"bytes,3,opt,name=encryptedDEKSource,proto3" json:"encryptedDEKSource,omitempty"`
// Annotations is additional metadata that was provided by the KMS plugin.
// Annotations must satisfy the following constraints:
// 1. Annotation key must be a fully qualified domain name that conforms to the definition in DNS (RFC 1123).
// 2. The size of annotations keys + values is less than 32 kB.
Annotations map[string][]byte `protobuf:"bytes,4,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"`
// encryptedDEKSourceType defines the process of using the plaintext of encryptedDEKSource to determine the DEK.
EncryptedDEKSourceType EncryptedDEKSourceType `protobuf:"varint,5,opt,name=encryptedDEKSourceType,proto3,enum=v2.EncryptedDEKSourceType" json:"encryptedDEKSourceType,omitempty"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
}
func (m *EncryptedObject) Reset() { *m = EncryptedObject{} }
func (m *EncryptedObject) String() string { return proto.CompactTextString(m) }
func (*EncryptedObject) ProtoMessage() {}
func (*EncryptedObject) Descriptor() ([]byte, []int) {
return fileDescriptor_00212fb1f9d3bf1c, []int{0}
}
func (m *EncryptedObject) XXX_Unmarshal(b []byte) error {
return xxx_messageInfo_EncryptedObject.Unmarshal(m, b)
}
func (m *EncryptedObject) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
return xxx_messageInfo_EncryptedObject.Marshal(b, m, deterministic)
}
func (m *EncryptedObject) XXX_Merge(src proto.Message) {
xxx_messageInfo_EncryptedObject.Merge(m, src)
}
func (m *EncryptedObject) XXX_Size() int {
return xxx_messageInfo_EncryptedObject.Size(m)
}
func (m *EncryptedObject) XXX_DiscardUnknown() {
xxx_messageInfo_EncryptedObject.DiscardUnknown(m)
}
var xxx_messageInfo_EncryptedObject proto.InternalMessageInfo
func (m *EncryptedObject) GetEncryptedData() []byte {
if m != nil {
return m.EncryptedData
}
return nil
}
func (m *EncryptedObject) GetKeyID() string {
if m != nil {
return m.KeyID
}
return ""
}
func (m *EncryptedObject) GetEncryptedDEKSource() []byte {
if m != nil {
return m.EncryptedDEKSource
}
return nil
}
func (m *EncryptedObject) GetAnnotations() map[string][]byte {
if m != nil {
return m.Annotations
}
return nil
}
func (m *EncryptedObject) GetEncryptedDEKSourceType() EncryptedDEKSourceType {
if m != nil {
return m.EncryptedDEKSourceType
}
return EncryptedDEKSourceType_AES_GCM_KEY
}
func init() {
proto.RegisterEnum("v2.EncryptedDEKSourceType", EncryptedDEKSourceType_name, EncryptedDEKSourceType_value)
proto.RegisterType((*EncryptedObject)(nil), "v2.EncryptedObject")
proto.RegisterMapType((map[string][]byte)(nil), "v2.EncryptedObject.AnnotationsEntry")
}
func init() { proto.RegisterFile("api.proto", fileDescriptor_00212fb1f9d3bf1c) }
var fileDescriptor_00212fb1f9d3bf1c = []byte{
// 329 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x74, 0x91, 0xe1, 0x4b, 0xc2, 0x40,
0x18, 0xc6, 0xdb, 0xcc, 0xc0, 0xd3, 0x72, 0x1c, 0x21, 0xc3, 0x2f, 0x8d, 0xf2, 0xc3, 0xe8, 0xc3,
0x0e, 0x16, 0x85, 0x44, 0x08, 0xe6, 0xce, 0x0c, 0x49, 0x61, 0xeb, 0x43, 0xf5, 0x65, 0x9c, 0xf6,
0x22, 0x6b, 0xb6, 0x1b, 0xb7, 0xf3, 0x60, 0x7f, 0x6a, 0xff, 0x4d, 0x38, 0x13, 0xd3, 0xec, 0xdb,
0xbd, 0xef, 0xfd, 0xde, 0xe7, 0xb9, 0x7b, 0x5e, 0x54, 0x61, 0x69, 0xe4, 0xa4, 0x82, 0x4b, 0x8e,
0x75, 0xe5, 0x9e, 0x7f, 0xe9, 0xa8, 0x4e, 0x93, 0xa9, 0xc8, 0x53, 0x09, 0xef, 0xe3, 0xc9, 0x07,
0x4c, 0x25, 0x6e, 0xa1, 0x63, 0x58, 0xb7, 0x3c, 0x26, 0x99, 0xa9, 0x59, 0x9a, 0x5d, 0xf3, 0xb7,
0x9b, 0xf8, 0x14, 0x95, 0x63, 0xc8, 0x1f, 0x3d, 0x53, 0xb7, 0x34, 0xbb, 0xe2, 0xaf, 0x0a, 0xec,
0x20, 0xbc, 0xc1, 0xe8, 0x30, 0xe0, 0x0b, 0x31, 0x05, 0xb3, 0x54, 0x08, 0xec, 0xb9, 0xc1, 0x7d,
0x54, 0x65, 0x49, 0xc2, 0x25, 0x93, 0x11, 0x4f, 0x32, 0xf3, 0xd0, 0x2a, 0xd9, 0x55, 0xb7, 0xe5,
0x28, 0xd7, 0xd9, 0x79, 0x95, 0xd3, 0xdd, 0x60, 0x34, 0x91, 0x22, 0xf7, 0x7f, 0x0f, 0x62, 0x1f,
0x35, 0xfe, 0xaa, 0x3f, 0xe7, 0x29, 0x98, 0x65, 0x4b, 0xb3, 0x4f, 0xdc, 0xe6, 0x96, 0xe4, 0x16,
0xe1, 0xff, 0x33, 0xd9, 0xec, 0x20, 0x63, 0xd7, 0x14, 0x1b, 0xa8, 0x14, 0x43, 0x5e, 0x24, 0x52,
0xf1, 0x97, 0xc7, 0x65, 0x0e, 0x8a, 0xcd, 0x17, 0x50, 0xe4, 0x50, 0xf3, 0x57, 0xc5, 0xad, 0xde,
0xd6, 0x2e, 0x47, 0xa8, 0xb1, 0xdf, 0x11, 0xd7, 0x51, 0xb5, 0x4b, 0x83, 0xf0, 0xa1, 0xf7, 0x14,
0x0e, 0xe9, 0xab, 0x71, 0x80, 0x2f, 0xd0, 0xd9, 0x60, 0xe8, 0xf5, 0xc3, 0x60, 0xd0, 0x75, 0xaf,
0x6f, 0xc2, 0x97, 0xd1, 0x78, 0xd4, 0xa3, 0xe1, 0x9a, 0x09, 0x28, 0xf5, 0x0c, 0xed, 0xbe, 0xf3,
0x76, 0x17, 0xb7, 0x33, 0x27, 0xe2, 0x84, 0xa5, 0x51, 0x06, 0x42, 0x81, 0x20, 0x69, 0x3c, 0x23,
0x99, 0xe4, 0x82, 0xcd, 0x80, 0x14, 0xce, 0xe4, 0xe7, 0x33, 0x04, 0x12, 0x05, 0x73, 0x9e, 0x02,
0x89, 0x3f, 0x33, 0xe5, 0x12, 0xe5, 0x4e, 0x8e, 0x8a, 0xb5, 0x5f, 0x7d, 0x07, 0x00, 0x00, 0xff,
0xff, 0xcc, 0x0f, 0x2b, 0x2e, 0x03, 0x02, 0x00, 0x00,
}

View File

@@ -0,0 +1,60 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// To regenerate api.pb.go run `hack/update-codegen.sh protobindings`
syntax = "proto3";
package v2;
option go_package = "k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2/v2";
// EncryptedObject is the representation of data stored in etcd after envelope encryption.
message EncryptedObject {
// EncryptedData is the encrypted data.
bytes encryptedData = 1;
// KeyID is the KMS key ID used for encryption operations.
// keyID must satisfy the following constraints:
// 1. The keyID is not empty.
// 2. The size of keyID is less than 1 kB.
string keyID = 2;
// EncryptedDEKSource is the ciphertext of the source of the DEK used to encrypt the data stored in encryptedData.
// encryptedDEKSourceType defines the process of using the plaintext of this field to determine the aforementioned DEK.
// encryptedDEKSource must satisfy the following constraints:
// 1. The encrypted DEK source is not empty.
// 2. The size of encrypted DEK source is less than 1 kB.
bytes encryptedDEKSource = 3;
// Annotations is additional metadata that was provided by the KMS plugin.
// Annotations must satisfy the following constraints:
// 1. Annotation key must be a fully qualified domain name that conforms to the definition in DNS (RFC 1123).
// 2. The size of annotations keys + values is less than 32 kB.
map<string, bytes> annotations = 4;
// encryptedDEKSourceType defines the process of using the plaintext of encryptedDEKSource to determine the DEK.
EncryptedDEKSourceType encryptedDEKSourceType = 5;
}
enum EncryptedDEKSourceType {
// AES_GCM_KEY means that the plaintext of encryptedDEKSource is the DEK itself, with AES-GCM as the encryption algorithm.
AES_GCM_KEY = 0;
// HKDF_SHA256_XNONCE_AES_GCM_SEED means that the plaintext of encryptedDEKSource is the pseudo random key
// (referred to as the seed throughout the code) that is fed into HKDF expand. SHA256 is the hash algorithm
// and the first 32 bytes of encryptedData are the info param. The first 32 bytes from the HKDF stream are used
// as the DEK with AES-GCM as the encryption algorithm.
HKDF_SHA256_XNONCE_AES_GCM_SEED = 1;
}

View File

@@ -14,5 +14,5 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
// Package v2alpha1 contains definition of kms-plugin's serialized types.
package v2alpha1
// Package v2 contains definition of kms-plugin's serialized types.
package v2

View File

@@ -1,128 +0,0 @@
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: api.proto
package v2alpha1
import (
fmt "fmt"
proto "github.com/gogo/protobuf/proto"
math "math"
)
// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf
// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
// EncryptedObject is the representation of data stored in etcd after envelope encryption.
type EncryptedObject struct {
// EncryptedData is the encrypted data.
EncryptedData []byte `protobuf:"bytes,1,opt,name=encryptedData,proto3" json:"encryptedData,omitempty"`
// KeyID is the KMS key ID used for encryption operations.
KeyID string `protobuf:"bytes,2,opt,name=keyID,proto3" json:"keyID,omitempty"`
// EncryptedDEK is the encrypted DEK.
EncryptedDEK []byte `protobuf:"bytes,3,opt,name=encryptedDEK,proto3" json:"encryptedDEK,omitempty"`
// Annotations is additional metadata that was provided by the KMS plugin.
Annotations map[string][]byte `protobuf:"bytes,4,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
}
func (m *EncryptedObject) Reset() { *m = EncryptedObject{} }
func (m *EncryptedObject) String() string { return proto.CompactTextString(m) }
func (*EncryptedObject) ProtoMessage() {}
func (*EncryptedObject) Descriptor() ([]byte, []int) {
return fileDescriptor_00212fb1f9d3bf1c, []int{0}
}
func (m *EncryptedObject) XXX_Unmarshal(b []byte) error {
return xxx_messageInfo_EncryptedObject.Unmarshal(m, b)
}
func (m *EncryptedObject) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
return xxx_messageInfo_EncryptedObject.Marshal(b, m, deterministic)
}
func (m *EncryptedObject) XXX_Merge(src proto.Message) {
xxx_messageInfo_EncryptedObject.Merge(m, src)
}
func (m *EncryptedObject) XXX_Size() int {
return xxx_messageInfo_EncryptedObject.Size(m)
}
func (m *EncryptedObject) XXX_DiscardUnknown() {
xxx_messageInfo_EncryptedObject.DiscardUnknown(m)
}
var xxx_messageInfo_EncryptedObject proto.InternalMessageInfo
func (m *EncryptedObject) GetEncryptedData() []byte {
if m != nil {
return m.EncryptedData
}
return nil
}
func (m *EncryptedObject) GetKeyID() string {
if m != nil {
return m.KeyID
}
return ""
}
func (m *EncryptedObject) GetEncryptedDEK() []byte {
if m != nil {
return m.EncryptedDEK
}
return nil
}
func (m *EncryptedObject) GetAnnotations() map[string][]byte {
if m != nil {
return m.Annotations
}
return nil
}
func init() {
proto.RegisterType((*EncryptedObject)(nil), "v2alpha1.EncryptedObject")
proto.RegisterMapType((map[string][]byte)(nil), "v2alpha1.EncryptedObject.AnnotationsEntry")
}
func init() { proto.RegisterFile("api.proto", fileDescriptor_00212fb1f9d3bf1c) }
var fileDescriptor_00212fb1f9d3bf1c = []byte{
// 200 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0x4c, 0x2c, 0xc8, 0xd4,
0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0xe2, 0x28, 0x33, 0x4a, 0xcc, 0x29, 0xc8, 0x48, 0x34, 0x54,
0xfa, 0xcf, 0xc8, 0xc5, 0xef, 0x9a, 0x97, 0x5c, 0x54, 0x59, 0x50, 0x92, 0x9a, 0xe2, 0x9f, 0x94,
0x95, 0x9a, 0x5c, 0x22, 0xa4, 0xc2, 0xc5, 0x9b, 0x0a, 0x13, 0x72, 0x49, 0x2c, 0x49, 0x94, 0x60,
0x54, 0x60, 0xd4, 0xe0, 0x09, 0x42, 0x15, 0x14, 0x12, 0xe1, 0x62, 0xcd, 0x4e, 0xad, 0xf4, 0x74,
0x91, 0x60, 0x52, 0x60, 0xd4, 0xe0, 0x0c, 0x82, 0x70, 0x84, 0x94, 0xb8, 0x78, 0x10, 0xca, 0x5c,
0xbd, 0x25, 0x98, 0xc1, 0x5a, 0x51, 0xc4, 0x84, 0x7c, 0xb8, 0xb8, 0x13, 0xf3, 0xf2, 0xf2, 0x4b,
0x12, 0x4b, 0x32, 0xf3, 0xf3, 0x8a, 0x25, 0x58, 0x14, 0x98, 0x35, 0xb8, 0x8d, 0xb4, 0xf4, 0x60,
0x6e, 0xd2, 0x43, 0x73, 0x8f, 0x9e, 0x23, 0x42, 0xb1, 0x6b, 0x5e, 0x49, 0x51, 0x65, 0x10, 0xb2,
0x76, 0x29, 0x3b, 0x2e, 0x01, 0x74, 0x05, 0x42, 0x02, 0x5c, 0xcc, 0xd9, 0xa9, 0x95, 0x60, 0x77,
0x73, 0x06, 0x81, 0x98, 0x20, 0xd7, 0x96, 0x25, 0xe6, 0x94, 0xa6, 0x82, 0x5d, 0xcb, 0x13, 0x04,
0xe1, 0x58, 0x31, 0x59, 0x30, 0x26, 0xb1, 0x81, 0x83, 0xc4, 0x18, 0x10, 0x00, 0x00, 0xff, 0xff,
0x88, 0x8c, 0xbb, 0x4e, 0x1f, 0x01, 0x00, 0x00,
}

View File

@@ -1,35 +0,0 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// To regenerate api.pb.go run hack/update-generated-kms.sh
syntax = "proto3";
package v2alpha1;
// EncryptedObject is the representation of data stored in etcd after envelope encryption.
message EncryptedObject {
// EncryptedData is the encrypted data.
bytes encryptedData = 1;
// KeyID is the KMS key ID used for encryption operations.
string keyID = 2;
// EncryptedDEK is the encrypted DEK.
bytes encryptedDEK = 3;
// Annotations is additional metadata that was provided by the KMS plugin.
map<string, bytes> annotations = 4;
}

View File

@@ -17,11 +17,20 @@ limitations under the License.
package metrics
import (
"crypto/sha256"
"errors"
"fmt"
"hash"
"sync"
"time"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog/v2"
"k8s.io/utils/lru"
)
const (
@@ -31,6 +40,13 @@ const (
ToStorageLabel = "to_storage"
)
type metricLabels struct {
transformationType string
providerName string
keyIDHash string
apiServerIDHash string
}
/*
* By default, all the following metrics are defined as falling under
* ALPHA stability level (https://github.com/kubernetes/enhancements/blob/master/keps/sig-instrumentation/1209-metrics-stability/kubernetes-control-plane-metrics-stability.md#stability-classes)
@@ -40,12 +56,18 @@ const (
* the metric stability policy.
*/
var (
lockLastFromStorage sync.Mutex
lockLastToStorage sync.Mutex
lockLastFromStorage sync.Mutex
lockLastToStorage sync.Mutex
lockRecordKeyID sync.Mutex
lockRecordKeyIDStatus sync.Mutex
lastFromStorage time.Time
lastToStorage time.Time
keyIDHashTotalMetricLabels *lru.Cache
keyIDHashStatusLastTimestampSecondsMetricLabels *lru.Cache
cacheSize = 100
// This metric is only used for the KMS v1 API.
dekCacheFillPercent = metrics.NewGauge(
&metrics.GaugeOpts{
Namespace: namespace,
@@ -56,6 +78,7 @@ var (
},
)
// This metric is only used for the KMS v1 API.
dekCacheInterArrivals = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
@@ -67,17 +90,157 @@ var (
},
[]string{"transformation_type"},
)
// These metrics are made public to be used by unit tests.
KMSOperationsLatencyMetric = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "kms_operations_latency_seconds",
Help: "KMS operation duration in seconds, labeled by provider, method, and gRPC status code.",
StabilityLevel: metrics.ALPHA,
// Use custom buckets to avoid the default buckets, which are too small for KMS operations.
// ExponentialBuckets(0.0001, 2, 20) starts at 0.1ms and doubles 19 times, so the last boundary is 0.0001*2^19 ≈ 52.4s and the final bucket is [~52s, +Inf).
Buckets: metrics.ExponentialBuckets(0.0001, 2, 20),
},
[]string{"provider_name", "method_name", "grpc_status_code"},
)
// KeyIDHashTotal is the number of times a keyID is used
// e.g. apiserver_envelope_encryption_key_id_hash_total counter
// apiserver_envelope_encryption_key_id_hash_total{apiserver_id_hash="sha256",key_id_hash="sha256",
// provider_name="providerName",transformation_type="from_storage"} 1
KeyIDHashTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "key_id_hash_total",
Help: "Number of times a keyID is used split by transformation type, provider, and apiserver identity.",
StabilityLevel: metrics.ALPHA,
},
[]string{"transformation_type", "provider_name", "key_id_hash", "apiserver_id_hash"},
)
// KeyIDHashLastTimestampSeconds is the last time in seconds when a keyID was used
// e.g. apiserver_envelope_encryption_key_id_hash_last_timestamp_seconds{apiserver_id_hash="sha256",key_id_hash="sha256", provider_name="providerName",transformation_type="from_storage"} 1.674865558833728e+09
KeyIDHashLastTimestampSeconds = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "key_id_hash_last_timestamp_seconds",
Help: "The last time in seconds when a keyID was used.",
StabilityLevel: metrics.ALPHA,
},
[]string{"transformation_type", "provider_name", "key_id_hash", "apiserver_id_hash"},
)
// KeyIDHashStatusLastTimestampSeconds is the last time in seconds when a keyID was returned by the Status RPC call.
// e.g. apiserver_envelope_encryption_key_id_hash_status_last_timestamp_seconds{apiserver_id_hash="sha256",key_id_hash="sha256", provider_name="providerName"} 1.674865558833728e+09
KeyIDHashStatusLastTimestampSeconds = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "key_id_hash_status_last_timestamp_seconds",
Help: "The last time in seconds when a keyID was returned by the Status RPC call.",
StabilityLevel: metrics.ALPHA,
},
[]string{"provider_name", "key_id_hash", "apiserver_id_hash"},
)
InvalidKeyIDFromStatusTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "invalid_key_id_from_status_total",
Help: "Number of times an invalid keyID is returned by the Status RPC call split by error.",
StabilityLevel: metrics.ALPHA,
},
[]string{"provider_name", "error"},
)
DekSourceCacheSize = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "dek_source_cache_size",
Help: "Number of records in data encryption key (DEK) source cache. On a restart, this value is an approximation of the number of decrypt RPC calls the server will make to the KMS plugin.",
StabilityLevel: metrics.ALPHA,
},
[]string{"provider_name"},
)
)
var registerMetricsFunc sync.Once
var hashPool *sync.Pool
func registerLRUMetrics() {
if keyIDHashTotalMetricLabels != nil {
keyIDHashTotalMetricLabels.Clear()
}
if keyIDHashStatusLastTimestampSecondsMetricLabels != nil {
keyIDHashStatusLastTimestampSecondsMetricLabels.Clear()
}
keyIDHashTotalMetricLabels = lru.NewWithEvictionFunc(cacheSize, func(key lru.Key, _ interface{}) {
item := key.(metricLabels)
if deleted := KeyIDHashTotal.DeleteLabelValues(item.transformationType, item.providerName, item.keyIDHash, item.apiServerIDHash); deleted {
klog.InfoS("Deleted keyIDHashTotalMetricLabels", "transformationType", item.transformationType,
"providerName", item.providerName, "keyIDHash", item.keyIDHash, "apiServerIDHash", item.apiServerIDHash)
}
if deleted := KeyIDHashLastTimestampSeconds.DeleteLabelValues(item.transformationType, item.providerName, item.keyIDHash, item.apiServerIDHash); deleted {
klog.InfoS("Deleted keyIDHashLastTimestampSecondsMetricLabels", "transformationType", item.transformationType,
"providerName", item.providerName, "keyIDHash", item.keyIDHash, "apiServerIDHash", item.apiServerIDHash)
}
})
keyIDHashStatusLastTimestampSecondsMetricLabels = lru.NewWithEvictionFunc(cacheSize, func(key lru.Key, _ interface{}) {
item := key.(metricLabels)
if deleted := KeyIDHashStatusLastTimestampSeconds.DeleteLabelValues(item.providerName, item.keyIDHash, item.apiServerIDHash); deleted {
klog.InfoS("Deleted keyIDHashStatusLastTimestampSecondsMetricLabels", "providerName", item.providerName, "keyIDHash", item.keyIDHash, "apiServerIDHash", item.apiServerIDHash)
}
})
}
func RegisterMetrics() {
registerMetricsFunc.Do(func() {
registerLRUMetrics()
hashPool = &sync.Pool{
New: func() interface{} {
return sha256.New()
},
}
legacyregistry.MustRegister(dekCacheFillPercent)
legacyregistry.MustRegister(dekCacheInterArrivals)
legacyregistry.MustRegister(DekSourceCacheSize)
legacyregistry.MustRegister(KeyIDHashTotal)
legacyregistry.MustRegister(KeyIDHashLastTimestampSeconds)
legacyregistry.MustRegister(KeyIDHashStatusLastTimestampSeconds)
legacyregistry.MustRegister(InvalidKeyIDFromStatusTotal)
legacyregistry.MustRegister(KMSOperationsLatencyMetric)
})
}
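RegisterMetrics is safe to call from multiple initialization paths because sync.Once collapses all the calls into a single execution. A tiny standalone illustration of that idiom (names are hypothetical):

package main

import (
	"fmt"
	"sync"
)

var registerOnce sync.Once

// registerMetrics stands in for RegisterMetrics above: however many callers
// race to register, the body runs exactly once.
func registerMetrics() {
	registerOnce.Do(func() { fmt.Println("metrics registered") })
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func() { defer wg.Done(); registerMetrics() }()
	}
	wg.Wait() // prints "metrics registered" exactly once
}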
// RecordKeyID records the total count and the last time in seconds when a KeyID was used for TransformFromStorage and TransformToStorage operations.
func RecordKeyID(transformationType, providerName, keyID, apiServerID string) {
lockRecordKeyID.Lock()
defer lockRecordKeyID.Unlock()
keyIDHash, apiServerIDHash := addLabelToCache(keyIDHashTotalMetricLabels, transformationType, providerName, keyID, apiServerID)
KeyIDHashTotal.WithLabelValues(transformationType, providerName, keyIDHash, apiServerIDHash).Inc()
KeyIDHashLastTimestampSeconds.WithLabelValues(transformationType, providerName, keyIDHash, apiServerIDHash).SetToCurrentTime()
}
// RecordKeyIDFromStatus records the last time in seconds when a KeyID was returned by the Status RPC call.
func RecordKeyIDFromStatus(providerName, keyID, apiServerID string) {
lockRecordKeyIDStatus.Lock()
defer lockRecordKeyIDStatus.Unlock()
keyIDHash, apiServerIDHash := addLabelToCache(keyIDHashStatusLastTimestampSecondsMetricLabels, "", providerName, keyID, apiServerID)
KeyIDHashStatusLastTimestampSeconds.WithLabelValues(providerName, keyIDHash, apiServerIDHash).SetToCurrentTime()
}
func RecordInvalidKeyIDFromStatus(providerName, errCode string) {
InvalidKeyIDFromStatusTotal.WithLabelValues(providerName, errCode).Inc()
}
func RecordArrival(transformationType string, start time.Time) {
switch transformationType {
case FromStorageLabel:
@@ -104,3 +267,56 @@ func RecordArrival(transformationType string, start time.Time) {
func RecordDekCacheFillPercent(percent float64) {
dekCacheFillPercent.Set(percent)
}
func RecordDekSourceCacheSize(providerName string, size int) {
DekSourceCacheSize.WithLabelValues(providerName).Set(float64(size))
}
// RecordKMSOperationLatency records the latency of a KMS operation.
func RecordKMSOperationLatency(providerName, methodName string, duration time.Duration, err error) {
KMSOperationsLatencyMetric.WithLabelValues(providerName, methodName, getErrorCode(err)).Observe(duration.Seconds())
}
type gRPCError interface {
GRPCStatus() *status.Status
}
func getErrorCode(err error) string {
if err == nil {
return codes.OK.String()
}
// handle errors wrapped with fmt.Errorf and similar
var s gRPCError
if errors.As(err, &s) {
return s.GRPCStatus().Code().String()
}
// This is not a gRPC error. The operation must have failed before the gRPC
// method was called; otherwise we would have received a gRPC error.
return "unknown-non-grpc"
}
func getHash(data string) string {
if len(data) == 0 {
return ""
}
h := hashPool.Get().(hash.Hash)
h.Reset()
h.Write([]byte(data))
dataHash := fmt.Sprintf("sha256:%x", h.Sum(nil))
hashPool.Put(h)
return dataHash
}
func addLabelToCache(c *lru.Cache, transformationType, providerName, keyID, apiServerID string) (string, string) {
keyIDHash := getHash(keyID)
apiServerIDHash := getHash(apiServerID)
c.Add(metricLabels{
transformationType: transformationType,
providerName: providerName,
keyIDHash: keyIDHash,
apiServerIDHash: apiServerIDHash,
}, nil) // value is irrelevant, this is a set and not a map
return keyIDHash, apiServerIDHash
}
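The LRU caches above are what keep metric cardinality bounded: every label set passes through a fixed-size cache, and the eviction callbacks registered in registerLRUMetrics delete the corresponding metric series. A self-contained sketch of that pattern against the same k8s.io/utils/lru API, with a plain map standing in for the metric vector:

package main

import (
	"fmt"

	"k8s.io/utils/lru"
)

func main() {
	liveSeries := map[string]bool{} // stands in for a metric vector's label sets

	// Evicting a label set from the cache also deletes its series, so the
	// number of live series can never exceed the cache size (2 here).
	cache := lru.NewWithEvictionFunc(2, func(key lru.Key, _ interface{}) {
		delete(liveSeries, key.(string)) // analogous to DeleteLabelValues
		fmt.Println("evicted series for:", key)
	})

	for _, keyID := range []string{"key-a", "key-b", "key-c"} {
		cache.Add(keyID, nil) // value is irrelevant; the cache is used as a set
		liveSeries[keyID] = true
	}
	fmt.Println("live series:", len(liveSeries)) // 2, bounded by the cache size
}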

View File

@@ -1,54 +0,0 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
"fmt"
"net/url"
"strings"
)
const (
// unixProtocol is the only supported protocol for remote KMS provider.
unixProtocol = "unix"
)
// ParseEndpoint parses the endpoint to extract the scheme, host, or path.
func ParseEndpoint(endpoint string) (string, error) {
if len(endpoint) == 0 {
return "", fmt.Errorf("remote KMS provider can't use empty string as endpoint")
}
u, err := url.Parse(endpoint)
if err != nil {
return "", fmt.Errorf("invalid endpoint %q for remote KMS provider, error: %v", endpoint, err)
}
if u.Scheme != unixProtocol {
return "", fmt.Errorf("unsupported scheme %q for remote KMS provider", u.Scheme)
}
// Linux abstract namespace socket - no physical file required
// Warning: Linux abstract sockets have no concept of ACLs (unlike traditional file-based sockets).
// However, Linux abstract sockets are scoped to the Linux network namespace, so they will only be
// accessible to containers within the same pod (unless host networking is used).
if strings.HasPrefix(u.Path, "/@") {
return strings.TrimPrefix(u.Path, "/"), nil
}
return u.Path, nil
}
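For example, under the rules above (the import path is assumed and the socket paths are hypothetical):

package main

import (
	"fmt"

	// assumed import path for the package above
	"k8s.io/apiserver/pkg/storage/value/encrypt/envelope/util"
)

func main() {
	for _, ep := range []string{
		"unix:///@kms.sock",        // abstract socket: leading "/" stripped, "@" kept
		"unix:///var/run/kms.sock", // regular socket file: path returned unchanged
		"https://kms.example.com",  // rejected: only the unix scheme is supported
	} {
		addr, err := util.ParseEndpoint(ep)
		fmt.Printf("%-28s -> addr=%q err=%v\n", ep, addr, err)
	}
}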

View File

@@ -17,9 +17,11 @@ limitations under the License.
package value
import (
"errors"
"sync"
"time"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"k8s.io/component-base/metrics"
@@ -51,7 +53,7 @@ var (
Buckets: metrics.ExponentialBuckets(5e-6, 2, 25),
StabilityLevel: metrics.ALPHA,
},
[]string{"transformation_type", "transformer_prefix"},
)
transformerOperationsTotal = metrics.NewCounterVec(
@@ -59,7 +61,7 @@ var (
Namespace: namespace,
Subsystem: subsystem,
Name: "transformation_operations_total",
Help: "Total number of transformations. Successful transformations have status 'OK'; failed transformations carry the corresponding error status. The status and transformation_type labels can be used to alert on encryption/decryption failures, with transformation_type from_storage for decryption and to_storage for encryption.",
StabilityLevel: metrics.ALPHA,
},
[]string{"transformation_type", "transformer_prefix", "status"},
@@ -111,12 +113,11 @@ func RegisterMetrics() {
// RecordTransformation records latencies and count of TransformFromStorage and TransformToStorage operations.
// Note that transformation_failures_total metric is deprecated, use transformation_operations_total instead.
func RecordTransformation(transformationType, transformerPrefix string, elapsed time.Duration, err error) {
transformerOperationsTotal.WithLabelValues(transformationType, transformerPrefix, getErrorCode(err)).Inc()
if err == nil {
transformerLatencies.WithLabelValues(transformationType, transformerPrefix).Observe(elapsed.Seconds())
}
}
@@ -139,3 +140,23 @@ func RecordDataKeyGeneration(start time.Time, err error) {
func sinceInSeconds(start time.Time) float64 {
return time.Since(start).Seconds()
}
type gRPCError interface {
GRPCStatus() *status.Status
}
func getErrorCode(err error) string {
if err == nil {
return codes.OK.String()
}
// handle errors wrapped with fmt.Errorf and similar
var s gRPCError
if errors.As(err, &s) {
return s.GRPCStatus().Code().String()
}
// This is not a gRPC error. The operation must have failed before the gRPC
// method was called; otherwise we would have received a gRPC error.
return "unknown-non-grpc"
}
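The errors.As indirection matters because callers often wrap RPC errors (for example with fmt.Errorf and %w) before they reach the metrics layer. A test-style demonstration that could sit next to this file, assuming the package's existing codes/status imports plus the usual testing, errors, and fmt imports:

func TestGetErrorCode(t *testing.T) {
	rpcErr := status.Error(codes.DeadlineExceeded, "kms plugin timed out")
	wrapped := fmt.Errorf("encrypt call failed: %w", rpcErr)

	// errors.As unwraps through the fmt.Errorf layer to find the gRPC status.
	if got := getErrorCode(wrapped); got != "DeadlineExceeded" {
		t.Errorf("wrapped gRPC error: got %q", got)
	}
	if got := getErrorCode(nil); got != "OK" {
		t.Errorf("nil error: got %q", got)
	}
	if got := getErrorCode(errors.New("dial tcp: connection refused")); got != "unknown-non-grpc" {
		t.Errorf("plain error: got %q", got)
	}
}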

View File

@@ -23,7 +23,10 @@ import (
"fmt"
"time"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/errors"
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/klog/v2"
)
func init() {
@@ -39,17 +42,30 @@ type Context interface {
AuthenticatedData() []byte
}
type Read interface {
// TransformFromStorage may transform the provided data from its underlying storage representation or return an error.
// Stale is true if the object on disk is stale and a write to etcd should be issued, even if the contents of the object
// have not changed.
TransformFromStorage(ctx context.Context, data []byte, dataCtx Context) (out []byte, stale bool, err error)
}
type Write interface {
// TransformToStorage may transform the provided data into the appropriate form in storage or return an error.
TransformToStorage(ctx context.Context, data []byte, dataCtx Context) (out []byte, err error)
}
// Transformer allows a value to be transformed before being read from or written to the underlying store. The methods
// must be able to undo the transformation caused by the other.
type Transformer interface {
Read
Write
}
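// Splitting Transformer into Read and Write lets call sites declare the
// narrower capability they actually need. For illustration only (not part of
// this package's API), a hypothetical decrypt-only consumer can accept any
// Read without also being handed encryption authority:
func decryptOnly(ctx context.Context, r Read, raw []byte, dataCtx Context) ([]byte, error) {
	out, stale, err := r.TransformFromStorage(ctx, raw, dataCtx)
	if err != nil {
		return nil, err
	}
	if stale {
		// the caller may want to rewrite the object so it is re-encrypted
		// with the current configuration
	}
	return out, nil
}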
// ResourceTransformers returns a transformer for the provided resource.
type ResourceTransformers interface {
TransformerForResource(resource schema.GroupResource) Transformer
}
// DefaultContext is a simple implementation of Context for a slice of bytes.
type DefaultContext []byte
@@ -100,9 +116,9 @@ func (t *prefixTransformers) TransformFromStorage(ctx context.Context, data []by
continue
}
if len(transformer.Prefix) == 0 {
RecordTransformation("from_storage", "identity", time.Since(start), err)
} else {
RecordTransformation("from_storage", string(transformer.Prefix), time.Since(start), err)
}
// It is valid to have overlapping prefixes when the same encryption provider
@@ -144,9 +160,10 @@ func (t *prefixTransformers) TransformFromStorage(ctx context.Context, data []by
}
}
if err := errors.Reduce(errors.NewAggregate(errs)); err != nil {
logTransformErr(ctx, err, "failed to decrypt data")
return nil, false, err
}
RecordTransformation("from_storage", "unknown", time.Since(start), t.err)
return nil, false, t.err
}
@@ -155,8 +172,9 @@ func (t *prefixTransformers) TransformToStorage(ctx context.Context, data []byte
start := time.Now()
transformer := t.transformers[0]
result, err := transformer.Transformer.TransformToStorage(ctx, data, dataCtx)
RecordTransformation("to_storage", string(transformer.Prefix), time.Since(start), err)
if err != nil {
logTransformErr(ctx, err, "failed to encrypt data")
return nil, err
}
prefixedData := make([]byte, len(transformer.Prefix), len(result)+len(transformer.Prefix))
@@ -164,3 +182,32 @@ func (t *prefixTransformers) TransformToStorage(ctx context.Context, data []byte
prefixedData = append(prefixedData, result...)
return prefixedData, nil
}
func logTransformErr(ctx context.Context, err error, message string) {
requestInfo := getRequestInfoFromContext(ctx)
if klogLevel6 := klog.V(6); klogLevel6.Enabled() {
klogLevel6.InfoSDepth(
1,
message,
"err", err,
"group", requestInfo.APIGroup,
"version", requestInfo.APIVersion,
"resource", requestInfo.Resource,
"subresource", requestInfo.Subresource,
"verb", requestInfo.Verb,
"namespace", requestInfo.Namespace,
"name", requestInfo.Name,
)
return
}
klog.ErrorSDepth(1, err, message)
}
func getRequestInfoFromContext(ctx context.Context) *genericapirequest.RequestInfo {
if reqInfo, found := genericapirequest.RequestInfoFrom(ctx); found {
return reqInfo
}
return &genericapirequest.RequestInfo{}
}
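logTransformErr trades detail for verbosity: at klog level 6 and above it logs the full request coordinates, otherwise only a terse error line. A minimal standalone sketch of that gating (the verbosity value and messages are illustrative):

package main

import (
	"errors"
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)
	_ = flag.Set("v", "6") // illustrative: raise verbosity to get the detailed variant
	flag.Parse()

	err := errors.New("cipher: message authentication failed")

	// Same shape as logTransformErr: rich structured detail only when the
	// verbosity level asks for it, a terse error line otherwise.
	if l := klog.V(6); l.Enabled() {
		l.InfoS("failed to decrypt data", "err", err, "resource", "secrets", "verb", "get")
	} else {
		klog.ErrorS(err, "failed to decrypt data")
	}
	klog.Flush()
}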