From dd78c1a036440b33bc5c224b1af817ba4fd6d643 Mon Sep 17 00:00:00 2001 From: huanggze Date: Mon, 13 Apr 2020 20:20:21 +0800 Subject: [PATCH] feat: custom monitoring Signed-off-by: huanggze --- go.mod | 15 +- go.sum | 27 +- pkg/constants/constants.go | 1 + pkg/kapis/monitoring/v1alpha3/handler.go | 33 + pkg/kapis/monitoring/v1alpha3/helper.go | 2 + pkg/kapis/monitoring/v1alpha3/register.go | 23 + .../expressions/prometheus/label_replace.go | 99 + .../prometheus/label_replace_test.go | 51 + pkg/models/monitoring/expressions/registry.go | 9 + pkg/models/monitoring/monitoring.go | 37 +- pkg/models/monitoring/types.go | 4 + pkg/simple/client/monitoring/interface.go | 5 +- .../monitoring/prometheus/prometheus.go | 58 +- .../monitoring/prometheus/prometheus_test.go | 56 +- .../testdata/metadata-notfound-prom.json | 5 + .../testdata/metadata-notfound-res.json | 1 + .../prometheus/testdata/metadata-prom.json | 25 + .../prometheus/testdata/metadata-res.json | 12 + .../testdata/metrics-error-res.json | 2 +- pkg/simple/client/monitoring/types.go | 6 + vendor/github.com/golang/snappy/.gitignore | 16 + vendor/github.com/golang/snappy/AUTHORS | 15 + vendor/github.com/golang/snappy/CONTRIBUTORS | 37 + vendor/github.com/golang/snappy/LICENSE | 27 + vendor/github.com/golang/snappy/README | 107 + vendor/github.com/golang/snappy/decode.go | 237 ++ .../github.com/golang/snappy/decode_amd64.go | 14 + .../github.com/golang/snappy/decode_amd64.s | 490 ++++ .../github.com/golang/snappy/decode_other.go | 101 + vendor/github.com/golang/snappy/encode.go | 285 +++ .../github.com/golang/snappy/encode_amd64.go | 29 + .../github.com/golang/snappy/encode_amd64.s | 730 ++++++ .../github.com/golang/snappy/encode_other.go | 238 ++ vendor/github.com/golang/snappy/snappy.go | 98 + .../opentracing/opentracing-go/.gitignore | 1 + .../opentracing/opentracing-go/.travis.yml | 20 + .../opentracing/opentracing-go/CHANGELOG.md | 46 + .../opentracing/opentracing-go/LICENSE | 201 ++ 
.../opentracing/opentracing-go/Makefile | 20 + .../opentracing/opentracing-go/README.md | 171 ++ .../opentracing-go/globaltracer.go | 42 + .../opentracing/opentracing-go/gocontext.go | 60 + .../opentracing/opentracing-go/log/field.go | 269 +++ .../opentracing/opentracing-go/log/util.go | 54 + .../opentracing/opentracing-go/noop.go | 64 + .../opentracing/opentracing-go/propagation.go | 176 ++ .../opentracing/opentracing-go/span.go | 189 ++ .../opentracing/opentracing-go/tracer.go | 304 +++ .../prometheus/client_golang/api/client.go | 55 +- .../client_golang/api/prometheus/v1/api.go | 364 ++- .../client_golang/prometheus/build_info.go | 29 + .../prometheus/build_info_pre_1.12.go | 22 + .../client_golang/prometheus/go_collector.go | 32 +- .../prometheus/process_collector.go | 6 +- .../client_golang/prometheus/promhttp/http.go | 47 +- .../client_golang/prometheus/summary.go | 8 +- vendor/github.com/prometheus/procfs/Makefile | 1 + .../prometheus/procfs/Makefile.common | 2 +- vendor/github.com/prometheus/procfs/README.md | 44 +- .../github.com/prometheus/procfs/buddyinfo.go | 12 +- .../prometheus/procfs/fixtures.ttar | 144 +- vendor/github.com/prometheus/procfs/fs.go | 8 +- vendor/github.com/prometheus/procfs/ipvs.go | 28 +- vendor/github.com/prometheus/procfs/mdstat.go | 74 +- .../prometheus/procfs/mountstats.go | 41 +- .../github.com/prometheus/procfs/net_dev.go | 36 +- .../github.com/prometheus/procfs/net_unix.go | 275 +++ vendor/github.com/prometheus/procfs/proc.go | 11 +- .../github.com/prometheus/procfs/proc_io.go | 4 +- .../prometheus/procfs/proc_limits.go | 7 + .../github.com/prometheus/procfs/proc_ns.go | 4 +- .../github.com/prometheus/procfs/proc_psi.go | 17 +- .../github.com/prometheus/procfs/proc_stat.go | 9 +- .../prometheus/procfs/proc_status.go | 162 ++ vendor/github.com/prometheus/procfs/stat.go | 36 +- vendor/github.com/prometheus/procfs/ttar | 42 +- .../github.com/prometheus/prometheus/LICENSE | 201 ++ .../github.com/prometheus/prometheus/NOTICE | 87 + 
.../prometheus/prometheus/promql/ast.go | 317 +++ .../prometheus/prometheus/promql/engine.go | 1436 ++++++++++++ .../prometheus/prometheus/promql/functions.go | 1338 +++++++++++ .../prometheus/prometheus/promql/fuzz.go | 87 + .../prometheus/prometheus/promql/lex.go | 908 ++++++++ .../prometheus/prometheus/promql/parse.go | 1146 ++++++++++ .../prometheus/prometheus/promql/printer.go | 236 ++ .../prometheus/prometheus/promql/quantile.go | 185 ++ .../prometheus/prometheus/promql/test.go | 525 +++++ .../prometheus/storage/local/chunk/chunk.go | 494 ++++ .../prometheus/storage/local/chunk/delta.go | 379 +++ .../storage/local/chunk/delta_helpers.go | 84 + .../storage/local/chunk/doubledelta.go | 525 +++++ .../storage/local/chunk/instrumentation.go | 90 + .../prometheus/storage/local/chunk/varbit.go | 1210 ++++++++++ .../storage/local/chunk/varbit_helpers.go | 75 + .../storage/local/codable/codable.go | 467 ++++ .../prometheus/storage/local/crashrecovery.go | 559 +++++ .../prometheus/storage/local/heads.go | 261 +++ .../prometheus/storage/local/index/index.go | 303 +++ .../storage/local/index/interface.go | 61 + .../prometheus/storage/local/index/leveldb.go | 210 ++ .../storage/local/instrumentation.go | 46 + .../prometheus/storage/local/interface.go | 106 + .../prometheus/storage/local/locker.go | 79 + .../prometheus/storage/local/mapper.go | 218 ++ .../prometheus/storage/local/noop_storage.go | 100 + .../prometheus/storage/local/persistence.go | 1722 ++++++++++++++ .../prometheus/storage/local/series.go | 728 ++++++ .../prometheus/storage/local/storage.go | 2029 +++++++++++++++++ .../prometheus/storage/local/test_helpers.go | 72 + .../prometheus/storage/metric/matcher.go | 209 ++ .../prometheus/storage/metric/metric.go | 63 + .../prometheus/storage/metric/sample.go | 22 + .../prometheus/prometheus/storage/storage.go | 76 + .../prometheus/prometheus/util/flock/flock.go | 46 + .../prometheus/util/flock/flock_plan9.go | 32 + .../prometheus/util/flock/flock_solaris.go | 59 
+ .../prometheus/util/flock/flock_unix.go | 54 + .../prometheus/util/flock/flock_windows.go | 36 + .../prometheus/util/stats/query_stats.go | 48 + .../prometheus/prometheus/util/stats/timer.go | 108 + .../prometheus/util/strutil/quote.go | 223 ++ .../prometheus/util/strutil/strconv.go | 44 + .../prometheus/util/testutil/directory.go | 129 ++ .../prometheus/util/testutil/error.go | 31 + .../prometheus/util/testutil/roundtrip.go | 47 + vendor/github.com/syndtr/goleveldb/LICENSE | 24 + .../syndtr/goleveldb/leveldb/batch.go | 349 +++ .../syndtr/goleveldb/leveldb/cache/cache.go | 704 ++++++ .../syndtr/goleveldb/leveldb/cache/lru.go | 195 ++ .../syndtr/goleveldb/leveldb/comparer.go | 67 + .../leveldb/comparer/bytes_comparer.go | 51 + .../goleveldb/leveldb/comparer/comparer.go | 57 + .../github.com/syndtr/goleveldb/leveldb/db.go | 1179 ++++++++++ .../syndtr/goleveldb/leveldb/db_compaction.go | 854 +++++++ .../syndtr/goleveldb/leveldb/db_iter.go | 360 +++ .../syndtr/goleveldb/leveldb/db_snapshot.go | 187 ++ .../syndtr/goleveldb/leveldb/db_state.go | 239 ++ .../goleveldb/leveldb/db_transaction.go | 329 +++ .../syndtr/goleveldb/leveldb/db_util.go | 102 + .../syndtr/goleveldb/leveldb/db_write.go | 464 ++++ .../syndtr/goleveldb/leveldb/doc.go | 92 + .../syndtr/goleveldb/leveldb/errors.go | 20 + .../syndtr/goleveldb/leveldb/errors/errors.go | 78 + .../syndtr/goleveldb/leveldb/filter.go | 31 + .../syndtr/goleveldb/leveldb/filter/bloom.go | 116 + .../syndtr/goleveldb/leveldb/filter/filter.go | 60 + .../goleveldb/leveldb/iterator/array_iter.go | 184 ++ .../leveldb/iterator/indexed_iter.go | 242 ++ .../syndtr/goleveldb/leveldb/iterator/iter.go | 132 ++ .../goleveldb/leveldb/iterator/merged_iter.go | 304 +++ .../goleveldb/leveldb/journal/journal.go | 524 +++++ .../syndtr/goleveldb/leveldb/key.go | 143 ++ .../syndtr/goleveldb/leveldb/memdb/memdb.go | 479 ++++ .../syndtr/goleveldb/leveldb/opt/options.go | 697 ++++++ .../syndtr/goleveldb/leveldb/options.go | 107 + 
.../syndtr/goleveldb/leveldb/session.go | 210 ++ .../goleveldb/leveldb/session_compaction.go | 302 +++ .../goleveldb/leveldb/session_record.go | 323 +++ .../syndtr/goleveldb/leveldb/session_util.go | 271 +++ .../syndtr/goleveldb/leveldb/storage.go | 63 + .../goleveldb/leveldb/storage/file_storage.go | 671 ++++++ .../leveldb/storage/file_storage_nacl.go | 34 + .../leveldb/storage/file_storage_plan9.go | 63 + .../leveldb/storage/file_storage_solaris.go | 81 + .../leveldb/storage/file_storage_unix.go | 98 + .../leveldb/storage/file_storage_windows.go | 78 + .../goleveldb/leveldb/storage/mem_storage.go | 222 ++ .../goleveldb/leveldb/storage/storage.go | 187 ++ .../syndtr/goleveldb/leveldb/table.go | 531 +++++ .../syndtr/goleveldb/leveldb/table/reader.go | 1139 +++++++++ .../syndtr/goleveldb/leveldb/table/table.go | 177 ++ .../syndtr/goleveldb/leveldb/table/writer.go | 375 +++ .../syndtr/goleveldb/leveldb/util.go | 98 + .../syndtr/goleveldb/leveldb/util/buffer.go | 293 +++ .../goleveldb/leveldb/util/buffer_pool.go | 239 ++ .../syndtr/goleveldb/leveldb/util/crc32.go | 30 + .../syndtr/goleveldb/leveldb/util/hash.go | 48 + .../syndtr/goleveldb/leveldb/util/range.go | 32 + .../syndtr/goleveldb/leveldb/util/util.go | 73 + .../syndtr/goleveldb/leveldb/version.go | 528 +++++ vendor/modules.txt | 36 +- 181 files changed, 37758 insertions(+), 357 deletions(-) create mode 100644 pkg/models/monitoring/expressions/prometheus/label_replace.go create mode 100644 pkg/models/monitoring/expressions/prometheus/label_replace_test.go create mode 100644 pkg/models/monitoring/expressions/registry.go create mode 100644 pkg/simple/client/monitoring/prometheus/testdata/metadata-notfound-prom.json create mode 100644 pkg/simple/client/monitoring/prometheus/testdata/metadata-notfound-res.json create mode 100644 pkg/simple/client/monitoring/prometheus/testdata/metadata-prom.json create mode 100644 pkg/simple/client/monitoring/prometheus/testdata/metadata-res.json create mode 100644 
vendor/github.com/golang/snappy/.gitignore create mode 100644 vendor/github.com/golang/snappy/AUTHORS create mode 100644 vendor/github.com/golang/snappy/CONTRIBUTORS create mode 100644 vendor/github.com/golang/snappy/LICENSE create mode 100644 vendor/github.com/golang/snappy/README create mode 100644 vendor/github.com/golang/snappy/decode.go create mode 100644 vendor/github.com/golang/snappy/decode_amd64.go create mode 100644 vendor/github.com/golang/snappy/decode_amd64.s create mode 100644 vendor/github.com/golang/snappy/decode_other.go create mode 100644 vendor/github.com/golang/snappy/encode.go create mode 100644 vendor/github.com/golang/snappy/encode_amd64.go create mode 100644 vendor/github.com/golang/snappy/encode_amd64.s create mode 100644 vendor/github.com/golang/snappy/encode_other.go create mode 100644 vendor/github.com/golang/snappy/snappy.go create mode 100644 vendor/github.com/opentracing/opentracing-go/.gitignore create mode 100644 vendor/github.com/opentracing/opentracing-go/.travis.yml create mode 100644 vendor/github.com/opentracing/opentracing-go/CHANGELOG.md create mode 100644 vendor/github.com/opentracing/opentracing-go/LICENSE create mode 100644 vendor/github.com/opentracing/opentracing-go/Makefile create mode 100644 vendor/github.com/opentracing/opentracing-go/README.md create mode 100644 vendor/github.com/opentracing/opentracing-go/globaltracer.go create mode 100644 vendor/github.com/opentracing/opentracing-go/gocontext.go create mode 100644 vendor/github.com/opentracing/opentracing-go/log/field.go create mode 100644 vendor/github.com/opentracing/opentracing-go/log/util.go create mode 100644 vendor/github.com/opentracing/opentracing-go/noop.go create mode 100644 vendor/github.com/opentracing/opentracing-go/propagation.go create mode 100644 vendor/github.com/opentracing/opentracing-go/span.go create mode 100644 vendor/github.com/opentracing/opentracing-go/tracer.go create mode 100644 
vendor/github.com/prometheus/client_golang/prometheus/build_info.go create mode 100644 vendor/github.com/prometheus/client_golang/prometheus/build_info_pre_1.12.go create mode 100644 vendor/github.com/prometheus/procfs/net_unix.go create mode 100644 vendor/github.com/prometheus/procfs/proc_status.go create mode 100644 vendor/github.com/prometheus/prometheus/LICENSE create mode 100644 vendor/github.com/prometheus/prometheus/NOTICE create mode 100644 vendor/github.com/prometheus/prometheus/promql/ast.go create mode 100644 vendor/github.com/prometheus/prometheus/promql/engine.go create mode 100644 vendor/github.com/prometheus/prometheus/promql/functions.go create mode 100644 vendor/github.com/prometheus/prometheus/promql/fuzz.go create mode 100644 vendor/github.com/prometheus/prometheus/promql/lex.go create mode 100644 vendor/github.com/prometheus/prometheus/promql/parse.go create mode 100644 vendor/github.com/prometheus/prometheus/promql/printer.go create mode 100644 vendor/github.com/prometheus/prometheus/promql/quantile.go create mode 100644 vendor/github.com/prometheus/prometheus/promql/test.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/chunk/chunk.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/chunk/delta.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/chunk/delta_helpers.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/chunk/doubledelta.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/chunk/instrumentation.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/chunk/varbit.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/chunk/varbit_helpers.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/codable/codable.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/crashrecovery.go create mode 100644 
vendor/github.com/prometheus/prometheus/storage/local/heads.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/index/index.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/index/interface.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/index/leveldb.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/instrumentation.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/interface.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/locker.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/mapper.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/noop_storage.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/persistence.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/series.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/storage.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/local/test_helpers.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/metric/matcher.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/metric/metric.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/metric/sample.go create mode 100644 vendor/github.com/prometheus/prometheus/storage/storage.go create mode 100644 vendor/github.com/prometheus/prometheus/util/flock/flock.go create mode 100644 vendor/github.com/prometheus/prometheus/util/flock/flock_plan9.go create mode 100644 vendor/github.com/prometheus/prometheus/util/flock/flock_solaris.go create mode 100644 vendor/github.com/prometheus/prometheus/util/flock/flock_unix.go create mode 100644 vendor/github.com/prometheus/prometheus/util/flock/flock_windows.go create mode 100644 vendor/github.com/prometheus/prometheus/util/stats/query_stats.go create mode 100644 
vendor/github.com/prometheus/prometheus/util/stats/timer.go create mode 100644 vendor/github.com/prometheus/prometheus/util/strutil/quote.go create mode 100644 vendor/github.com/prometheus/prometheus/util/strutil/strconv.go create mode 100644 vendor/github.com/prometheus/prometheus/util/testutil/directory.go create mode 100644 vendor/github.com/prometheus/prometheus/util/testutil/error.go create mode 100644 vendor/github.com/prometheus/prometheus/util/testutil/roundtrip.go create mode 100644 vendor/github.com/syndtr/goleveldb/LICENSE create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/batch.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/cache/cache.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/cache/lru.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/comparer.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/comparer/bytes_comparer.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/comparer/comparer.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_iter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_snapshot.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_state.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_transaction.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_util.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/db_write.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/doc.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/errors.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/errors/errors.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/filter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/filter/bloom.go create mode 100644 
vendor/github.com/syndtr/goleveldb/leveldb/filter/filter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/iterator/iter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/journal/journal.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/key.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/opt/options.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/options.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/session.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/session_compaction.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/session_record.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/session_util.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_nacl.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_plan9.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_windows.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/mem_storage.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/storage/storage.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/table.go create mode 100644 
vendor/github.com/syndtr/goleveldb/leveldb/table/reader.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/table/table.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/table/writer.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/buffer.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/crc32.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/hash.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/range.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/util/util.go create mode 100644 vendor/github.com/syndtr/goleveldb/leveldb/version.go diff --git a/go.mod b/go.mod index 4a41dfc01..2a1c7564b 100644 --- a/go.mod +++ b/go.mod @@ -66,16 +66,19 @@ require ( github.com/opencontainers/go-digest v1.0.0-rc1 github.com/opencontainers/image-spec v1.0.1 // indirect github.com/openshift/api v0.0.0-20180801171038-322a19404e37 // indirect + github.com/opentracing/opentracing-go v1.1.0 // indirect github.com/pkg/errors v0.8.1 github.com/projectcalico/libcalico-go v1.7.2-0.20191104213956-8f81e1e344ce - github.com/prometheus/client_golang v0.9.3 - github.com/prometheus/common v0.4.0 + github.com/prometheus/client_golang v0.9.4 + github.com/prometheus/common v0.4.1 + github.com/prometheus/prometheus v1.8.2 github.com/sony/sonyflake v0.0.0-20181109022403-6d5bd6181009 github.com/speps/go-hashids v2.0.0+incompatible github.com/spf13/cobra v0.0.5 github.com/spf13/pflag v1.0.5 github.com/spf13/viper v1.4.0 github.com/stretchr/testify v1.4.0 + github.com/syndtr/goleveldb v1.0.0 // indirect github.com/xanzy/ssh-agent v0.2.1 // indirect golang.org/x/crypto v0.0.0-20190923035154-9ee001bba392 golang.org/x/net v0.0.0-20191004110552-13f9640d40b9 @@ -210,6 +213,7 @@ replace ( github.com/golang/groupcache => 
github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6 github.com/golang/mock => github.com/golang/mock v1.2.0 github.com/golang/protobuf => github.com/golang/protobuf v1.3.2 + github.com/golang/snappy => github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db github.com/google/btree => github.com/google/btree v1.0.0 github.com/google/go-cmp => github.com/google/go-cmp v0.3.0 github.com/google/go-querystring => github.com/google/go-querystring v1.0.0 @@ -297,6 +301,7 @@ replace ( github.com/opencontainers/image-spec => github.com/opencontainers/image-spec v1.0.1 github.com/openshift/api => github.com/openshift/api v0.0.0-20180801171038-322a19404e37 github.com/openshift/build-machinery-go => github.com/openshift/build-machinery-go v0.0.0-20200211121458-5e3d6e570160 + github.com/opentracing/opentracing-go => github.com/opentracing/opentracing-go v1.1.0 github.com/pborman/uuid => github.com/pborman/uuid v1.2.0 github.com/pelletier/go-buffruneio => github.com/pelletier/go-buffruneio v0.2.0 github.com/pelletier/go-toml => github.com/pelletier/go-toml v1.2.0 @@ -311,10 +316,11 @@ replace ( github.com/projectcalico/go-yaml => github.com/projectcalico/go-yaml v0.0.0-20161201183616-955bc3e451ef github.com/projectcalico/go-yaml-wrapper => github.com/projectcalico/go-yaml-wrapper v0.0.0-20161127220527-598e54215bee github.com/projectcalico/libcalico-go => github.com/projectcalico/libcalico-go v1.7.2-0.20191104213956-8f81e1e344ce - github.com/prometheus/client_golang => github.com/prometheus/client_golang v0.9.3 + github.com/prometheus/client_golang => github.com/prometheus/client_golang v0.9.4 github.com/prometheus/client_model => github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90 github.com/prometheus/common => github.com/prometheus/common v0.4.0 - github.com/prometheus/procfs => github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084 + github.com/prometheus/procfs => github.com/prometheus/procfs v0.0.2 + 
github.com/prometheus/prometheus => github.com/prometheus/prometheus v1.8.2 github.com/prometheus/tsdb => github.com/prometheus/tsdb v0.7.1 github.com/rcrowley/go-metrics => github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a github.com/remyoudompheng/bigfft => github.com/remyoudompheng/bigfft v0.0.0-20170806203942-52369c62f446 @@ -339,6 +345,7 @@ replace ( github.com/src-d/gcfg => github.com/src-d/gcfg v1.4.0 github.com/stretchr/objx => github.com/stretchr/objx v0.2.0 github.com/stretchr/testify => github.com/stretchr/testify v1.4.0 + github.com/syndtr/goleveldb => github.com/syndtr/goleveldb v1.0.0 github.com/tinylib/msgp => github.com/tinylib/msgp v1.1.0 github.com/tmc/grpc-websocket-proxy => github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5 github.com/ugorji/go => github.com/ugorji/go v1.1.4 diff --git a/go.sum b/go.sum index e5ab35c79..3e9a3bfd0 100644 --- a/go.sum +++ b/go.sum @@ -80,7 +80,6 @@ github.com/deckarep/golang-set v1.7.1/go.mod h1:93vsz/8Wt4joVM7c2AVqh+YRMiUSc14y github.com/denisenkom/go-mssqldb v0.0.0-20190204142019-df6d76eb9289/go.mod h1:xN/JuLBIz4bjkxNmByTiV1IbhfnYb6oo99phBn4Eqhc= github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= -github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/docker/distribution v2.7.1+incompatible h1:a5mlkVzth6W5A4fOsS3D2EO5BUmsJpcB+cRlLU7cSug= github.com/docker/distribution v2.7.1+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= github.com/docker/engine v1.4.2-0.20190822205725-ed20165a37b4 h1:+VAGRKyn9Ca+ckzV/PJsaRO7UXO9KQjFmSffcSDrWdE= @@ -187,6 +186,8 @@ github.com/golang/mock v1.2.0 h1:28o5sBqPkBsMGnC6b4MvE2TzSr5/AT4c/1fLqVGIwlk= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf 
v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w= +github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= @@ -301,7 +302,6 @@ github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8m github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/olekukonko/tablewriter v0.0.1/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo= github.com/onsi/ginkgo v1.8.0 h1:VkHVNpR4iVnU8XQR6DBm8BqYjN7CRzw+xKUbVVbbW9w= github.com/onsi/ginkgo v1.8.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= @@ -315,13 +315,14 @@ github.com/opencontainers/image-spec v1.0.1 h1:JMemWkRwHx4Zj+fVxWoMCFm/8sYGGrUVo github.com/opencontainers/image-spec v1.0.1/go.mod h1:BtxoFyWECRxE4U/7sNtV5W15zMzWCbyJoFRP3s7yZA0= github.com/openshift/api v0.0.0-20180801171038-322a19404e37 h1:05irGU4HK4IauGGDbsk+ZHrm1wOzMLYjMlfaiqMrBYc= github.com/openshift/api v0.0.0-20180801171038-322a19404e37/go.mod h1:dh9o4Fs58gpFXGSYfnVxGR9PnV53I8TW84pQaJDdGiY= +github.com/opentracing/opentracing-go v1.1.0 h1:pWlfV3Bxv7k65HYwkikxat0+s3pV4bsqf19k25Ur8rU= +github.com/opentracing/opentracing-go v1.1.0/go.mod 
h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/pborman/uuid v1.2.0 h1:J7Q5mO4ysT1dv8hyrUGHb9+ooztCXu1D8MY8DZYsu3g= github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k= github.com/pelletier/go-buffruneio v0.2.0 h1:U4t4R6YkofJ5xHm3dJzuRpPZ0mr5MMCoAWooScCR7aA= github.com/pelletier/go-buffruneio v0.2.0/go.mod h1:JkE26KsDizTr40EUHkXVtNPvgGtbSNq5BcowyYOWdKo= github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= -github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= github.com/peterh/liner v0.0.0-20170211195444-bf27d3ba8e1d/go.mod h1:xIteQHvHuaLYG9IFj6mSxM0fCKrs34IrEQUhOYuGPHc= github.com/philhofer/fwd v1.0.0/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU= @@ -339,22 +340,21 @@ github.com/projectcalico/go-yaml-wrapper v0.0.0-20161127220527-598e54215bee h1:y github.com/projectcalico/go-yaml-wrapper v0.0.0-20161127220527-598e54215bee/go.mod h1:UgC0aTQ2KMDxlX3lU/stndk7DMUBJqzN40yFiILHgxc= github.com/projectcalico/libcalico-go v1.7.2-0.20191104213956-8f81e1e344ce h1:O/R67iwUe8TvZwgKbDB2cvF2/8L8PR4zVOcBtYEHD5Y= github.com/projectcalico/libcalico-go v1.7.2-0.20191104213956-8f81e1e344ce/go.mod h1:z4tuFqrAg/423AMSaDamY5LgqeOZ5ETui6iOxDwJ/ag= -github.com/prometheus/client_golang v0.9.3 h1:9iH4JKXLzFbOAdtqv/a+j8aewx2Y8lAjAydhbaScPF8= -github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= +github.com/prometheus/client_golang v0.9.4 h1:Y8E/JaaPbmFSW2V81Ab/d8yZFYQQGbni1b1jPcG9Y6A= +github.com/prometheus/client_golang v0.9.4/go.mod h1:oCXIBxdI62A4cR6aTRJCgetEjecSIYzOEaeAn4iYEpM= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90 h1:S/YWwWx/RA8rT8tKFRuGUZhuA90OyIBpPCXkcbwU8DE= 
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/common v0.4.0 h1:7etb9YClo3a6HjLzfl6rIQaU+FDfi0VSX39io3aQ+DM= github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= -github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084 h1:sofwID9zm4tzrgykg80hfFph1mryUeLRsUfoocVVmRY= -github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= -github.com/prometheus/tsdb v0.7.1 h1:YZcsG11NqnK4czYLrWd9mpEuAJIHVQLwdrleYfszMAA= -github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= +github.com/prometheus/procfs v0.0.2 h1:6LJUbpNm42llc4HRCuvApCSWB/WfhuNo9K98Q9sNGfs= +github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= +github.com/prometheus/prometheus v1.8.2 h1:PAL466mnJw1VolZPm1OarpdUpqukUy/eX4tagia17DM= +github.com/prometheus/prometheus v1.8.2/go.mod h1:oAIUtOny2rjMX0OWN5vPR5/q/twIROJvdqnQKDdil/s= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a h1:9ZKAASQSHhDYGoxY8uLVpewe1GDZ2vu2Tr/vTdVAkFQ= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= github.com/remyoudompheng/bigfft v0.0.0-20170806203942-52369c62f446/go.mod h1:uYEyJGbgTkfkS4+E/PavXkNJcbFIpEtjt2B0KDQ5+9M= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc= -github.com/russross/blackfriday v1.5.2 h1:HyvC0ARfnZBqnXwABFeSZHpKvJHJJfPz81GNueLj0oo= github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= github.com/satori/go.uuid v1.2.0/go.mod 
h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= @@ -390,6 +390,8 @@ github.com/stretchr/objx v0.2.0 h1:Hbg2NidpLE8veEBkEZTL3CvlkUIVzuU9jDplZO54c48= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= +github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/tinylib/msgp v1.1.0/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5 h1:LnC5Kc/wtumK+WB441p7ynQJzVuNRJiqddSIE3IlSEQ= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= @@ -414,15 +416,12 @@ go.uber.org/zap v1.10.0 h1:ORx85nbTijNz8ljznvCMR1ZBIPKFn3jQrag10X2AsuM= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/exp v0.0.0-20190121172915-509febef88a4 h1:c2HOrn5iMezYjSlGPncknSEr/8x5LELb/ilJbXi9DEA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f h1:hX65Cu3JDlGH3uEdK7I99Ii+9kjD6mvnnpfLdEAH0x4= golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/oauth2 v0.0.0-20190402181905-9f3314589c9a 
h1:tImsplftrFpALCYumobsd0K86vlAs/eXGFms2txfJfA= golang.org/x/oauth2 v0.0.0-20190402181905-9f3314589c9a/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/sync v0.0.0-20190423024810-112230192c58 h1:8gQV6CLnAEikrhgkHFbMAEhagSSnXWGV915qUMm9mrU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190228124157-a34e9553db1e h1:ZytStCyV048ZqDsWHiYDdoI2Vd4msMcrDECFxS+tL9c= golang.org/x/sys v0.0.0-20190228124157-a34e9553db1e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -436,7 +435,6 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7 h1:9zdDQZ7Thm29KFXgAX/+y golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.0.1 h1:xyiBuvkD2g5n7cYzx6u2sxQvsAy4QJsZFCzGVdzOXZ0= gomodules.xyz/jsonpatch/v2 v2.0.1/go.mod h1:IhYNNY4jnS53ZnfE4PAmpKtDpTCj1JFXc+3mwe7XcUU= -gonum.org/v1/gonum v0.0.0-20190331200053-3d26580ed485 h1:OB/uP/Puiu5vS5QMRPrXCDWUPb+kt8f1KW8oQzFejQw= gonum.org/v1/gonum v0.0.0-20190331200053-3d26580ed485/go.mod h1:2ltnJ7xHfj0zHS40VVPYEAAMTa3ZGguvHGBSJeRWqE0= gonum.org/v1/netlib v0.0.0-20190331212654-76723241ea4e/go.mod h1:kS+toOQn6AQKjmKJ7gzohV1XkqsFehRA2FbsbkopSuQ= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= @@ -465,7 +463,6 @@ gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/natefinch/lumberjack.v2 v2.0.0 h1:1Lc07Kr7qY4U2YPouBjpCLxpiyxIVoxqXgkXLknAOE8= gopkg.in/natefinch/lumberjack.v2 v2.0.0/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k= gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= -gopkg.in/square/go-jose.v2 v2.3.1 h1:SK5KegNXmKmqE342YYN2qPHEnUYeoMiXXl1poUlI+o4= gopkg.in/square/go-jose.v2 v2.3.1/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI= gopkg.in/src-d/go-billy.v4 v4.3.0 h1:KtlZ4c1OWbIs4jCv5ZXrTqG8EQocr0g/d4DjNg70aek= 
gopkg.in/src-d/go-billy.v4 v4.3.0/go.mod h1:tm33zBoOwxjYHZIE+OV8bxTWFMJLrconzFMd38aARFk= diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index a72338ba7..8104b4dcf 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -75,6 +75,7 @@ const ( WorkloadMetricsTag = "Workload Metrics" WorkspaceMetricsTag = "Workspace Metrics" ComponentMetricsTag = "Component Metrics" + CustomMetricsTag = "Custom Metrics" LogQueryTag = "Log Query" TerminalTag = "Terminal" ) diff --git a/pkg/kapis/monitoring/v1alpha3/handler.go b/pkg/kapis/monitoring/v1alpha3/handler.go index 9f7220876..8bf4916cc 100644 --- a/pkg/kapis/monitoring/v1alpha3/handler.go +++ b/pkg/kapis/monitoring/v1alpha3/handler.go @@ -192,3 +192,36 @@ func (h handler) handleNamedMetricsQuery(resp *restful.Response, q queryOptions) } resp.WriteAsJson(res) } + +func (h handler) handleMetadataQuery(req *restful.Request, resp *restful.Response) { + res := h.mo.GetMetadata(req.PathParameter("namespace")) + resp.WriteAsJson(res) +} + +func (h handler) handleAdhocQuery(req *restful.Request, resp *restful.Response) { + var res monitoring.Metric + + params := parseRequestParams(req) + opt, err := h.makeQueryOptions(params, 0) + if err != nil { + if err.Error() == ErrNoHit { + resp.WriteAsJson(res) + return + } + + api.HandleBadRequest(resp, nil, err) + return + } + + if opt.isRangeQuery() { + res, err = h.mo.GetMetricOverTime(params.expression, params.namespaceName, opt.start, opt.end, opt.step) + } else { + res, err = h.mo.GetMetric(params.expression, params.namespaceName, opt.time) + } + + if err != nil { + api.HandleBadRequest(resp, nil, err) + } else { + resp.WriteAsJson(res) + } +} diff --git a/pkg/kapis/monitoring/v1alpha3/helper.go b/pkg/kapis/monitoring/v1alpha3/helper.go index d5442b7e0..f4a3ad27e 100644 --- a/pkg/kapis/monitoring/v1alpha3/helper.go +++ b/pkg/kapis/monitoring/v1alpha3/helper.go @@ -49,6 +49,7 @@ type reqParams struct { pvcName string storageClassName string 
componentType string + expression string } type queryOptions struct { @@ -99,6 +100,7 @@ func parseRequestParams(req *restful.Request) reqParams { r.pvcName = req.PathParameter("pvc") r.storageClassName = req.PathParameter("storageclass") r.componentType = req.PathParameter("component") + r.expression = req.QueryParameter("expr") return r } diff --git a/pkg/kapis/monitoring/v1alpha3/register.go b/pkg/kapis/monitoring/v1alpha3/register.go index 92110eeba..14e6239c8 100644 --- a/pkg/kapis/monitoring/v1alpha3/register.go +++ b/pkg/kapis/monitoring/v1alpha3/register.go @@ -400,6 +400,29 @@ func AddToContainer(c *restful.Container, k8sClient kubernetes.Interface, monito Returns(http.StatusOK, RespOK, model.Metrics{})). Produces(restful.MIME_JSON) + ws.Route(ws.GET("/namespaces/{namespace}/targets/metadata"). + To(h.handleMetadataQuery). + Doc("Get metadata of metrics for the specific namespace."). + Param(ws.PathParameter("namespace", "The name of the namespace.").DataType("string").Required(true)). + Metadata(restfulspec.KeyOpenAPITags, []string{constants.CustomMetricsTag}). + Writes(model.Metadata{}). + Returns(http.StatusOK, RespOK, model.Metadata{})). + Produces(restful.MIME_JSON) + + ws.Route(ws.GET("/namespaces/{namespace}/targets/query"). + To(h.handleAdhocQuery). + Doc("Make an ad-hoc query in the specific namespace."). + Param(ws.PathParameter("namespace", "The name of the namespace.").DataType("string").Required(true)). + Param(ws.QueryParameter("expr", "The expression to be evaluated.").DataType("string").Required(false)). + Param(ws.QueryParameter("start", "Start time of query. Use **start** and **end** to retrieve metric data over a time span. It is a string with Unix time format, eg. 1559347200. ").DataType("string").Required(true)). + Param(ws.QueryParameter("end", "End time of query. Use **start** and **end** to retrieve metric data over a time span. It is a string with Unix time format, eg. 1561939200. ").DataType("string").Required(false)). 
+ Param(ws.QueryParameter("step", "Time interval. Retrieve metric data at a fixed interval within the time range of start and end. It requires both **start** and **end** are provided. The format is [0-9]+[smhdwy]. Defaults to 10m (i.e. 10 min).").DataType("string").DefaultValue("10m").Required(false)). + Param(ws.QueryParameter("time", "A timestamp in Unix time format. Retrieve metric data at a single point in time. Defaults to now. Time and the combination of start, end, step are mutually exclusive.").DataType("string").Required(false)). + Metadata(restfulspec.KeyOpenAPITags, []string{constants.CustomMetricsTag}). + Writes(monitoring.Metric{}). + Returns(http.StatusOK, RespOK, monitoring.Metric{})). + Produces(restful.MIME_JSON) + c.Add(ws) return nil } diff --git a/pkg/models/monitoring/expressions/prometheus/label_replace.go b/pkg/models/monitoring/expressions/prometheus/label_replace.go new file mode 100644 index 000000000..6e541598f --- /dev/null +++ b/pkg/models/monitoring/expressions/prometheus/label_replace.go @@ -0,0 +1,99 @@ +package prometheus + +import ( + "fmt" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/promql" + "github.com/prometheus/prometheus/storage/metric" + "kubesphere.io/kubesphere/pkg/models/monitoring/expressions" +) + +func init() { + expressions.Register("prometheus", labelReplace) +} + +func labelReplace(input, ns string) (string, error) { + root, err := promql.ParseExpr(input) + if err != nil { + return "", err + } + + SetRecursive(root, ns) + if err != nil { + return "", err + } + + return root.String(), nil +} + +// Inspired by https://github.com/openshift/prom-label-proxy +func SetRecursive(node promql.Node, namespace string) (err error) { + switch n := node.(type) { + case *promql.EvalStmt: + if err := SetRecursive(n.Expr, namespace); err != nil { + return err + } + case promql.Expressions: + for _, e := range n { + if err := SetRecursive(e, namespace); err != nil { + return err + } + } + case 
*promql.AggregateExpr: + if err := SetRecursive(n.Expr, namespace); err != nil { + return err + } + case *promql.BinaryExpr: + if err := SetRecursive(n.LHS, namespace); err != nil { + return err + } + if err := SetRecursive(n.RHS, namespace); err != nil { + return err + } + case *promql.Call: + if err := SetRecursive(n.Args, namespace); err != nil { + return err + } + case *promql.ParenExpr: + if err := SetRecursive(n.Expr, namespace); err != nil { + return err + } + case *promql.UnaryExpr: + if err := SetRecursive(n.Expr, namespace); err != nil { + return err + } + case *promql.NumberLiteral, *promql.StringLiteral: + // nothing to do + case *promql.MatrixSelector: + n.LabelMatchers = enforceLabelMatchers(n.LabelMatchers, namespace) + case *promql.VectorSelector: + n.LabelMatchers = enforceLabelMatchers(n.LabelMatchers, namespace) + default: + return fmt.Errorf("promql.Walk: unhandled node type %T", node) + } + return err +} + +func enforceLabelMatchers(matchers metric.LabelMatchers, namespace string) metric.LabelMatchers { + var found bool + for i, m := range matchers { + if m.Name == "namespace" { + matchers[i] = &metric.LabelMatcher{ + Name: "namespace", + Type: metric.Equal, + Value: model.LabelValue(namespace), + } + found = true + break + } + } + + if !found { + matchers = append(matchers, &metric.LabelMatcher{ + Name: "namespace", + Type: metric.Equal, + Value: model.LabelValue(namespace), + }) + } + return matchers +} diff --git a/pkg/models/monitoring/expressions/prometheus/label_replace_test.go b/pkg/models/monitoring/expressions/prometheus/label_replace_test.go new file mode 100644 index 000000000..d265dc093 --- /dev/null +++ b/pkg/models/monitoring/expressions/prometheus/label_replace_test.go @@ -0,0 +1,51 @@ +package prometheus + +import ( + "fmt" + "github.com/google/go-cmp/cmp" + "testing" +) + +func TestLabelReplace(t *testing.T) { + tests := []struct { + expr string + expected string + expectedErr bool + }{ + { + expr: "up", + expected: 
`up{namespace="default"}`, + expectedErr: false, + }, + { + expr: `up{namespace="random"}`, + expected: `up{namespace="default"}`, + expectedErr: false, + }, + { + expr: `up{namespace="random"} + up{job="test"}`, + expected: `up{namespace="default"} + up{job="test",namespace="default"}`, + expectedErr: false, + }, + { + expr: `@@@@`, + expectedErr: true, + }, + } + + for i, tt := range tests { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + result, err := labelReplace(tt.expr, "default") + if err != nil { + if !tt.expectedErr { + t.Fatal(err) + } + return + } + + if diff := cmp.Diff(result, tt.expected); diff != "" { + t.Fatalf("%T differ (-got, +want): %s", tt.expected, diff) + } + }) + } +} diff --git a/pkg/models/monitoring/expressions/registry.go b/pkg/models/monitoring/expressions/registry.go new file mode 100644 index 000000000..12a606ad4 --- /dev/null +++ b/pkg/models/monitoring/expressions/registry.go @@ -0,0 +1,9 @@ +package expressions + +type labelReplaceFn func(expr, ns string) (string, error) + +var ReplaceNamespaceFns = make(map[string]labelReplaceFn) + +func Register(name string, fn labelReplaceFn) { + ReplaceNamespaceFns[name] = fn +} diff --git a/pkg/models/monitoring/monitoring.go b/pkg/models/monitoring/monitoring.go index b7443f1bd..795678932 100644 --- a/pkg/models/monitoring/monitoring.go +++ b/pkg/models/monitoring/monitoring.go @@ -19,15 +19,17 @@ package monitoring import ( + "kubesphere.io/kubesphere/pkg/models/monitoring/expressions" "kubesphere.io/kubesphere/pkg/simple/client/monitoring" "time" ) type MonitoringOperator interface { - GetMetrics(stmts []string, time time.Time) Metrics - GetMetricsOverTime(stmts []string, start, end time.Time, step time.Duration) Metrics + GetMetric(expr, namespace string, time time.Time) (monitoring.Metric, error) + GetMetricOverTime(expr, namespace string, start, end time.Time, step time.Duration) (monitoring.Metric, error) GetNamedMetrics(metrics []string, time time.Time, opt 
monitoring.QueryOption) Metrics GetNamedMetricsOverTime(metrics []string, start, end time.Time, step time.Duration, opt monitoring.QueryOption) Metrics + GetMetadata(namespace string) Metadata } type monitoringOperator struct { @@ -38,14 +40,28 @@ func NewMonitoringOperator(client monitoring.Interface) MonitoringOperator { return &monitoringOperator{client} } -// TODO(huanggze): reserve for custom monitoring -func (mo monitoringOperator) GetMetrics(stmts []string, time time.Time) Metrics { - panic("implement me") +func (mo monitoringOperator) GetMetric(expr, namespace string, time time.Time) (monitoring.Metric, error) { + // Different monitoring backend implementations have different ways to enforce namespace isolation. + // Each implementation should register itself to `ReplaceNamespaceFns` during init(). + // We hard code "prometheus" here because we only support this datasource so far. + // In the future, maybe the value should be returned from a method like `mo.c.GetMonitoringServiceName()`. + expr, err := expressions.ReplaceNamespaceFns["prometheus"](expr, namespace) + if err != nil { + return monitoring.Metric{}, err + } + return mo.c.GetMetric(expr, time), nil } -// TODO(huanggze): reserve for custom monitoring -func (mo monitoringOperator) GetMetricsOverTime(stmts []string, start, end time.Time, step time.Duration) Metrics { - panic("implement me") +func (mo monitoringOperator) GetMetricOverTime(expr, namespace string, start, end time.Time, step time.Duration) (monitoring.Metric, error) { + // Different monitoring backend implementations have different ways to enforce namespace isolation. + // Each implementation should register itself to `ReplaceNamespaceFns` during init(). + // We hard code "prometheus" here because we only support this datasource so far. + // In the future, maybe the value should be returned from a method like `mo.c.GetMonitoringServiceName()`. 
+ expr, err := expressions.ReplaceNamespaceFns["prometheus"](expr, namespace) + if err != nil { + return monitoring.Metric{}, err + } + return mo.c.GetMetricOverTime(expr, start, end, step), nil } func (mo monitoringOperator) GetNamedMetrics(metrics []string, time time.Time, opt monitoring.QueryOption) Metrics { @@ -57,3 +73,8 @@ func (mo monitoringOperator) GetNamedMetricsOverTime(metrics []string, start, en ress := mo.c.GetNamedMetricsOverTime(metrics, start, end, step, opt) return Metrics{Results: ress} } + +func (mo monitoringOperator) GetMetadata(namespace string) Metadata { + data := mo.c.GetMetadata(namespace) + return Metadata{Data: data} +} diff --git a/pkg/models/monitoring/types.go b/pkg/models/monitoring/types.go index 22364cfc1..7e5c2c6c5 100644 --- a/pkg/models/monitoring/types.go +++ b/pkg/models/monitoring/types.go @@ -8,3 +8,7 @@ type Metrics struct { TotalPages int `json:"total_page,omitempty" description:"total number of pages"` TotalItems int `json:"total_item,omitempty" description:"page size"` } + +type Metadata struct { + Data []monitoring.Metadata `json:"data" description:"actual array of results"` +} diff --git a/pkg/simple/client/monitoring/interface.go b/pkg/simple/client/monitoring/interface.go index 488161dfd..68a471317 100644 --- a/pkg/simple/client/monitoring/interface.go +++ b/pkg/simple/client/monitoring/interface.go @@ -3,8 +3,9 @@ package monitoring import "time" type Interface interface { - GetMetrics(exprs []string, time time.Time) []Metric - GetMetricsOverTime(exprs []string, start, end time.Time, step time.Duration) []Metric + GetMetric(expr string, time time.Time) Metric + GetMetricOverTime(expr string, start, end time.Time, step time.Duration) Metric GetNamedMetrics(metrics []string, time time.Time, opt QueryOption) []Metric GetNamedMetricsOverTime(metrics []string, start, end time.Time, step time.Duration, opt QueryOption) []Metric + GetMetadata(namespace string) []Metadata } diff --git 
a/pkg/simple/client/monitoring/prometheus/prometheus.go b/pkg/simple/client/monitoring/prometheus/prometheus.go index 0cb3bc8b3..a40571c13 100644 --- a/pkg/simple/client/monitoring/prometheus/prometheus.go +++ b/pkg/simple/client/monitoring/prometheus/prometheus.go @@ -2,6 +2,7 @@ package prometheus import ( "context" + "fmt" "github.com/prometheus/client_golang/api" apiv1 "github.com/prometheus/client_golang/api/prometheus/v1" "github.com/prometheus/common/model" @@ -24,14 +25,35 @@ func NewPrometheus(options *Options) (monitoring.Interface, error) { return prometheus{client: apiv1.NewAPI(client)}, err } -// TODO(huanggze): reserve for custom monitoring -func (p prometheus) GetMetrics(stmts []string, time time.Time) []monitoring.Metric { - panic("implement me") +func (p prometheus) GetMetric(expr string, ts time.Time) monitoring.Metric { + var parsedResp monitoring.Metric + + value, err := p.client.Query(context.Background(), expr, ts) + if err != nil { + parsedResp.Error = err.Error() + } else { + parsedResp.MetricData = parseQueryResp(value) + } + + return parsedResp } -// TODO(huanggze): reserve for custom monitoring -func (p prometheus) GetMetricsOverTime(stmts []string, start, end time.Time, step time.Duration) []monitoring.Metric { - panic("implement me") +func (p prometheus) GetMetricOverTime(expr string, start, end time.Time, step time.Duration) monitoring.Metric { + timeRange := apiv1.Range{ + Start: start, + End: end, + Step: step, + } + + value, err := p.client.QueryRange(context.Background(), expr, timeRange) + + var parsedResp monitoring.Metric + if err != nil { + parsedResp.Error = err.Error() + } else { + parsedResp.MetricData = parseQueryRangeResp(value) + } + return parsedResp } func (p prometheus) GetNamedMetrics(metrics []string, ts time.Time, o monitoring.QueryOption) []monitoring.Metric { @@ -49,7 +71,7 @@ func (p prometheus) GetNamedMetrics(metrics []string, ts time.Time, o monitoring value, err := p.client.Query(context.Background(), 
makeExpr(metric, *opts), ts) if err != nil { - parsedResp.Error = err.(*apiv1.Error).Msg + parsedResp.Error = err.Error() } else { parsedResp.MetricData = parseQueryResp(value) } @@ -88,7 +110,7 @@ func (p prometheus) GetNamedMetricsOverTime(metrics []string, start, end time.Ti value, err := p.client.QueryRange(context.Background(), makeExpr(metric, *opts), timeRange) if err != nil { - parsedResp.Error = err.(*apiv1.Error).Msg + parsedResp.Error = err.Error() } else { parsedResp.MetricData = parseQueryRangeResp(value) } @@ -106,6 +128,26 @@ func (p prometheus) GetNamedMetricsOverTime(metrics []string, start, end time.Ti return res } +func (p prometheus) GetMetadata(namespace string) []monitoring.Metadata { + var meta []monitoring.Metadata + + // Filter metrics available to members of this namespace + matchTarget := fmt.Sprintf("{namespace=\"%s\"}", namespace) + items, err := p.client.TargetsMetadata(context.Background(), matchTarget, "", "") + if err != nil { + return meta + } + + for _, item := range items { + meta = append(meta, monitoring.Metadata{ + Metric: item.Metric, + Type: string(item.Type), + Help: item.Help, + }) + } + return meta +} + func parseQueryRangeResp(value model.Value) monitoring.MetricData { res := monitoring.MetricData{MetricType: monitoring.MetricTypeMatrix} diff --git a/pkg/simple/client/monitoring/prometheus/prometheus_test.go b/pkg/simple/client/monitoring/prometheus/prometheus_test.go index 043175b3d..16c60ee0e 100644 --- a/pkg/simple/client/monitoring/prometheus/prometheus_test.go +++ b/pkg/simple/client/monitoring/prometheus/prometheus_test.go @@ -24,7 +24,8 @@ func TestGetNamedMetrics(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - expected, err := jsonFromFile(tt.expected) + expected := make([]monitoring.Metric, 0) + err := jsonFromFile(tt.expected, &expected) if err != nil { t.Fatal(err) } @@ -53,7 +54,8 @@ func TestGetNamedMetricsOverTime(t *testing.T) { for _, tt := range tests { t.Run(tt.name, 
func(t *testing.T) { - expected, err := jsonFromFile(tt.expected) + expected := make([]monitoring.Metric, 0) + err := jsonFromFile(tt.expected, &expected) if err != nil { t.Fatal(err) } @@ -70,6 +72,44 @@ func TestGetNamedMetricsOverTime(t *testing.T) { } } +func TestGetMetadata(t *testing.T) { + tests := []struct { + fakeResp string + expected string + }{ + { + fakeResp: "metadata-prom.json", + expected: "metadata-res.json", + }, + { + fakeResp: "metadata-notfound-prom.json", + expected: "metadata-notfound-res.json", + }, + } + + for i, tt := range tests { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + expected := make([]monitoring.Metadata, 0) + err := jsonFromFile(tt.expected, &expected) + if err != nil { + t.Fatal(err) + } + if len(expected) == 0 { + expected = nil + } + + srv := mockPrometheusService("/api/v1/targets/metadata", tt.fakeResp) + defer srv.Close() + + client, _ := NewPrometheus(&Options{Endpoint: srv.URL}) + result := client.GetMetadata("default") + if diff := cmp.Diff(result, expected); diff != "" { + t.Fatalf("%T differ (-got, +want): %s", expected, diff) + } + }) + } +} + func mockPrometheusService(pattern, fakeResp string) *httptest.Server { mux := http.NewServeMux() mux.HandleFunc(pattern, func(res http.ResponseWriter, req *http.Request) { @@ -79,17 +119,15 @@ func mockPrometheusService(pattern, fakeResp string) *httptest.Server { return httptest.NewServer(mux) } -func jsonFromFile(expectedFile string) ([]monitoring.Metric, error) { - expectedJson := []monitoring.Metric{} - +func jsonFromFile(expectedFile string, expectedJsonPtr interface{}) error { json, err := ioutil.ReadFile(fmt.Sprintf("./testdata/%s", expectedFile)) if err != nil { - return expectedJson, err + return err } - err = jsoniter.Unmarshal(json, &expectedJson) + err = jsoniter.Unmarshal(json, expectedJsonPtr) if err != nil { - return expectedJson, err + return err } - return expectedJson, nil + return nil } diff --git 
a/pkg/simple/client/monitoring/prometheus/testdata/metadata-notfound-prom.json b/pkg/simple/client/monitoring/prometheus/testdata/metadata-notfound-prom.json new file mode 100644 index 000000000..1ffb57de7 --- /dev/null +++ b/pkg/simple/client/monitoring/prometheus/testdata/metadata-notfound-prom.json @@ -0,0 +1,5 @@ +{ + "status":"error", + "errorType":"not_found", + "error":"specified metadata not found" +} \ No newline at end of file diff --git a/pkg/simple/client/monitoring/prometheus/testdata/metadata-notfound-res.json b/pkg/simple/client/monitoring/prometheus/testdata/metadata-notfound-res.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/pkg/simple/client/monitoring/prometheus/testdata/metadata-notfound-res.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/pkg/simple/client/monitoring/prometheus/testdata/metadata-prom.json b/pkg/simple/client/monitoring/prometheus/testdata/metadata-prom.json new file mode 100644 index 000000000..e9fca8153 --- /dev/null +++ b/pkg/simple/client/monitoring/prometheus/testdata/metadata-prom.json @@ -0,0 +1,25 @@ +{ + "status": "success", + "data": [ + { + "target": { + "instance": "127.0.0.1:9090", + "job": "prometheus" + }, + "metric": "prometheus_treecache_zookeeper_failures_total", + "type": "counter", + "help": "The total number of ZooKeeper failures.", + "unit": "" + }, + { + "target": { + "instance": "127.0.0.1:9090", + "job": "prometheus" + }, + "metric": "prometheus_tsdb_reloads_total", + "type": "counter", + "help": "Number of times the database reloaded block data from disk.", + "unit": "" + } + ] +} \ No newline at end of file diff --git a/pkg/simple/client/monitoring/prometheus/testdata/metadata-res.json b/pkg/simple/client/monitoring/prometheus/testdata/metadata-res.json new file mode 100644 index 000000000..915a0646b --- /dev/null +++ b/pkg/simple/client/monitoring/prometheus/testdata/metadata-res.json @@ -0,0 +1,12 @@ +[ + { + "metric": 
"prometheus_treecache_zookeeper_failures_total", + "type": "counter", + "help": "The total number of ZooKeeper failures." + }, + { + "metric": "prometheus_tsdb_reloads_total", + "type": "counter", + "help": "Number of times the database reloaded block data from disk." + } +] \ No newline at end of file diff --git a/pkg/simple/client/monitoring/prometheus/testdata/metrics-error-res.json b/pkg/simple/client/monitoring/prometheus/testdata/metrics-error-res.json index b580a35f2..5bd92c3a5 100644 --- a/pkg/simple/client/monitoring/prometheus/testdata/metrics-error-res.json +++ b/pkg/simple/client/monitoring/prometheus/testdata/metrics-error-res.json @@ -1,6 +1,6 @@ [ { "metric_name": "cluster_cpu_utilisation", - "error": "inconsistent body for response code" + "error": "bad_response: inconsistent body for response code" } ] \ No newline at end of file diff --git a/pkg/simple/client/monitoring/types.go b/pkg/simple/client/monitoring/types.go index a9bc915de..baf732434 100644 --- a/pkg/simple/client/monitoring/types.go +++ b/pkg/simple/client/monitoring/types.go @@ -5,6 +5,12 @@ const ( MetricTypeVector = "vector" ) +type Metadata struct { + Metric string `json:"metric,omitempty" description:"metric name"` + Type string `json:"type,omitempty" description:"metric type"` + Help string `json:"help,omitempty" description:"metric description"` +} + type Metric struct { MetricName string `json:"metric_name,omitempty" description:"metric name, eg. scheduler_up_sum"` MetricData `json:"data,omitempty" description:"actual metric result"` diff --git a/vendor/github.com/golang/snappy/.gitignore b/vendor/github.com/golang/snappy/.gitignore new file mode 100644 index 000000000..042091d9b --- /dev/null +++ b/vendor/github.com/golang/snappy/.gitignore @@ -0,0 +1,16 @@ +cmd/snappytool/snappytool +testdata/bench + +# These explicitly listed benchmark data files are for an obsolete version of +# snappy_test.go. 
+testdata/alice29.txt +testdata/asyoulik.txt +testdata/fireworks.jpeg +testdata/geo.protodata +testdata/html +testdata/html_x_4 +testdata/kppkn.gtb +testdata/lcet10.txt +testdata/paper-100k.pdf +testdata/plrabn12.txt +testdata/urls.10K diff --git a/vendor/github.com/golang/snappy/AUTHORS b/vendor/github.com/golang/snappy/AUTHORS new file mode 100644 index 000000000..bcfa19520 --- /dev/null +++ b/vendor/github.com/golang/snappy/AUTHORS @@ -0,0 +1,15 @@ +# This is the official list of Snappy-Go authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS files. +# See the latter for an explanation. + +# Names should be added to this file as +# Name or Organization +# The email address is not required for organizations. + +# Please keep the list sorted. + +Damian Gryski +Google Inc. +Jan Mercl <0xjnml@gmail.com> +Rodolfo Carvalho +Sebastien Binet diff --git a/vendor/github.com/golang/snappy/CONTRIBUTORS b/vendor/github.com/golang/snappy/CONTRIBUTORS new file mode 100644 index 000000000..931ae3160 --- /dev/null +++ b/vendor/github.com/golang/snappy/CONTRIBUTORS @@ -0,0 +1,37 @@ +# This is the official list of people who can contribute +# (and typically have contributed) code to the Snappy-Go repository. +# The AUTHORS file lists the copyright holders; this file +# lists people. For example, Google employees are listed here +# but not in AUTHORS, because Google holds the copyright. +# +# The submission process automatically checks to make sure +# that people submitting code are listed in this file (by email address). +# +# Names should be added to this file only after verifying that +# the individual or the individual's organization has agreed to +# the appropriate Contributor License Agreement, found here: +# +# http://code.google.com/legal/individual-cla-v1.0.html +# http://code.google.com/legal/corporate-cla-v1.0.html +# +# The agreement for individuals can be filled out on the web. 
+# +# When adding J Random Contributor's name to this file, +# either J's name or J's organization's name should be +# added to the AUTHORS file, depending on whether the +# individual or corporate CLA was used. + +# Names should be added to this file like so: +# Name + +# Please keep the list sorted. + +Damian Gryski +Jan Mercl <0xjnml@gmail.com> +Kai Backman +Marc-Antoine Ruel +Nigel Tao +Rob Pike +Rodolfo Carvalho +Russ Cox +Sebastien Binet diff --git a/vendor/github.com/golang/snappy/LICENSE b/vendor/github.com/golang/snappy/LICENSE new file mode 100644 index 000000000..6050c10f4 --- /dev/null +++ b/vendor/github.com/golang/snappy/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/golang/snappy/README b/vendor/github.com/golang/snappy/README new file mode 100644 index 000000000..cea12879a --- /dev/null +++ b/vendor/github.com/golang/snappy/README @@ -0,0 +1,107 @@ +The Snappy compression format in the Go programming language. + +To download and install from source: +$ go get github.com/golang/snappy + +Unless otherwise noted, the Snappy-Go source files are distributed +under the BSD-style license found in the LICENSE file. + + + +Benchmarks. + +The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten +or so files, the same set used by the C++ Snappy code (github.com/google/snappy +and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @ +3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29: + +"go test -test.bench=." 
+ +_UFlat0-8 2.19GB/s ± 0% html +_UFlat1-8 1.41GB/s ± 0% urls +_UFlat2-8 23.5GB/s ± 2% jpg +_UFlat3-8 1.91GB/s ± 0% jpg_200 +_UFlat4-8 14.0GB/s ± 1% pdf +_UFlat5-8 1.97GB/s ± 0% html4 +_UFlat6-8 814MB/s ± 0% txt1 +_UFlat7-8 785MB/s ± 0% txt2 +_UFlat8-8 857MB/s ± 0% txt3 +_UFlat9-8 719MB/s ± 1% txt4 +_UFlat10-8 2.84GB/s ± 0% pb +_UFlat11-8 1.05GB/s ± 0% gaviota + +_ZFlat0-8 1.04GB/s ± 0% html +_ZFlat1-8 534MB/s ± 0% urls +_ZFlat2-8 15.7GB/s ± 1% jpg +_ZFlat3-8 740MB/s ± 3% jpg_200 +_ZFlat4-8 9.20GB/s ± 1% pdf +_ZFlat5-8 991MB/s ± 0% html4 +_ZFlat6-8 379MB/s ± 0% txt1 +_ZFlat7-8 352MB/s ± 0% txt2 +_ZFlat8-8 396MB/s ± 1% txt3 +_ZFlat9-8 327MB/s ± 1% txt4 +_ZFlat10-8 1.33GB/s ± 1% pb +_ZFlat11-8 605MB/s ± 1% gaviota + + + +"go test -test.bench=. -tags=noasm" + +_UFlat0-8 621MB/s ± 2% html +_UFlat1-8 494MB/s ± 1% urls +_UFlat2-8 23.2GB/s ± 1% jpg +_UFlat3-8 1.12GB/s ± 1% jpg_200 +_UFlat4-8 4.35GB/s ± 1% pdf +_UFlat5-8 609MB/s ± 0% html4 +_UFlat6-8 296MB/s ± 0% txt1 +_UFlat7-8 288MB/s ± 0% txt2 +_UFlat8-8 309MB/s ± 1% txt3 +_UFlat9-8 280MB/s ± 1% txt4 +_UFlat10-8 753MB/s ± 0% pb +_UFlat11-8 400MB/s ± 0% gaviota + +_ZFlat0-8 409MB/s ± 1% html +_ZFlat1-8 250MB/s ± 1% urls +_ZFlat2-8 12.3GB/s ± 1% jpg +_ZFlat3-8 132MB/s ± 0% jpg_200 +_ZFlat4-8 2.92GB/s ± 0% pdf +_ZFlat5-8 405MB/s ± 1% html4 +_ZFlat6-8 179MB/s ± 1% txt1 +_ZFlat7-8 170MB/s ± 1% txt2 +_ZFlat8-8 189MB/s ± 1% txt3 +_ZFlat9-8 164MB/s ± 1% txt4 +_ZFlat10-8 479MB/s ± 1% pb +_ZFlat11-8 270MB/s ± 1% gaviota + + + +For comparison (Go's encoded output is byte-for-byte identical to C++'s), here +are the numbers from C++ Snappy's + +make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log + +BM_UFlat/0 2.4GB/s html +BM_UFlat/1 1.4GB/s urls +BM_UFlat/2 21.8GB/s jpg +BM_UFlat/3 1.5GB/s jpg_200 +BM_UFlat/4 13.3GB/s pdf +BM_UFlat/5 2.1GB/s html4 +BM_UFlat/6 1.0GB/s txt1 +BM_UFlat/7 959.4MB/s txt2 +BM_UFlat/8 1.0GB/s txt3 +BM_UFlat/9 864.5MB/s txt4 +BM_UFlat/10 2.9GB/s pb +BM_UFlat/11 1.2GB/s 
gaviota + +BM_ZFlat/0 944.3MB/s html (22.31 %) +BM_ZFlat/1 501.6MB/s urls (47.78 %) +BM_ZFlat/2 14.3GB/s jpg (99.95 %) +BM_ZFlat/3 538.3MB/s jpg_200 (73.00 %) +BM_ZFlat/4 8.3GB/s pdf (83.30 %) +BM_ZFlat/5 903.5MB/s html4 (22.52 %) +BM_ZFlat/6 336.0MB/s txt1 (57.88 %) +BM_ZFlat/7 312.3MB/s txt2 (61.91 %) +BM_ZFlat/8 353.1MB/s txt3 (54.99 %) +BM_ZFlat/9 289.9MB/s txt4 (66.26 %) +BM_ZFlat/10 1.2GB/s pb (19.68 %) +BM_ZFlat/11 527.4MB/s gaviota (37.72 %) diff --git a/vendor/github.com/golang/snappy/decode.go b/vendor/github.com/golang/snappy/decode.go new file mode 100644 index 000000000..72efb0353 --- /dev/null +++ b/vendor/github.com/golang/snappy/decode.go @@ -0,0 +1,237 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package snappy + +import ( + "encoding/binary" + "errors" + "io" +) + +var ( + // ErrCorrupt reports that the input is invalid. + ErrCorrupt = errors.New("snappy: corrupt input") + // ErrTooLarge reports that the uncompressed length is too large. + ErrTooLarge = errors.New("snappy: decoded block is too large") + // ErrUnsupported reports that the input isn't supported. + ErrUnsupported = errors.New("snappy: unsupported input") + + errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length") +) + +// DecodedLen returns the length of the decoded block. +func DecodedLen(src []byte) (int, error) { + v, _, err := decodedLen(src) + return v, err +} + +// decodedLen returns the length of the decoded block and the number of bytes +// that the length header occupied. 
+func decodedLen(src []byte) (blockLen, headerLen int, err error) { + v, n := binary.Uvarint(src) + if n <= 0 || v > 0xffffffff { + return 0, 0, ErrCorrupt + } + + const wordSize = 32 << (^uint(0) >> 32 & 1) + if wordSize == 32 && v > 0x7fffffff { + return 0, 0, ErrTooLarge + } + return int(v), n, nil +} + +const ( + decodeErrCodeCorrupt = 1 + decodeErrCodeUnsupportedLiteralLength = 2 +) + +// Decode returns the decoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire decoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +func Decode(dst, src []byte) ([]byte, error) { + dLen, s, err := decodedLen(src) + if err != nil { + return nil, err + } + if dLen <= len(dst) { + dst = dst[:dLen] + } else { + dst = make([]byte, dLen) + } + switch decode(dst, src[s:]) { + case 0: + return dst, nil + case decodeErrCodeUnsupportedLiteralLength: + return nil, errUnsupportedLiteralLength + } + return nil, ErrCorrupt +} + +// NewReader returns a new Reader that decompresses from r, using the framing +// format described at +// https://github.com/google/snappy/blob/master/framing_format.txt +func NewReader(r io.Reader) *Reader { + return &Reader{ + r: r, + decoded: make([]byte, maxBlockSize), + buf: make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize), + } +} + +// Reader is an io.Reader that can read Snappy-compressed bytes. +type Reader struct { + r io.Reader + err error + decoded []byte + buf []byte + // decoded[i:j] contains decoded bytes that have not yet been passed on. + i, j int + readHeader bool +} + +// Reset discards any buffered data, resets all state, and switches the Snappy +// reader to read from r. This permits reusing a Reader rather than allocating +// a new one. 
+func (r *Reader) Reset(reader io.Reader) { + r.r = reader + r.err = nil + r.i = 0 + r.j = 0 + r.readHeader = false +} + +func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) { + if _, r.err = io.ReadFull(r.r, p); r.err != nil { + if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { + r.err = ErrCorrupt + } + return false + } + return true +} + +// Read satisfies the io.Reader interface. +func (r *Reader) Read(p []byte) (int, error) { + if r.err != nil { + return 0, r.err + } + for { + if r.i < r.j { + n := copy(p, r.decoded[r.i:r.j]) + r.i += n + return n, nil + } + if !r.readFull(r.buf[:4], true) { + return 0, r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return 0, r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + if chunkLen > len(r.buf) { + r.err = ErrUnsupported + return 0, r.err + } + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + // Section 4.2. Compressed data (chunk type 0x00). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + buf := r.buf[:chunkLen] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if n > len(r.decoded) { + r.err = ErrCorrupt + return 0, r.err + } + if _, err := Decode(r.decoded, buf); err != nil { + r.err = err + return 0, r.err + } + if crc(r.decoded[:n]) != checksum { + r.err = ErrCorrupt + return 0, r.err + } + r.i, r.j = 0, n + continue + + case chunkTypeUncompressedData: + // Section 4.3. Uncompressed data (chunk type 0x01). 
+ if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + buf := r.buf[:checksumSize] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read directly into r.decoded instead of via r.buf. + n := chunkLen - checksumSize + if n > len(r.decoded) { + r.err = ErrCorrupt + return 0, r.err + } + if !r.readFull(r.decoded[:n], false) { + return 0, r.err + } + if crc(r.decoded[:n]) != checksum { + r.err = ErrCorrupt + return 0, r.err + } + r.i, r.j = 0, n + continue + + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return 0, r.err + } + if !r.readFull(r.buf[:len(magicBody)], false) { + return 0, r.err + } + for i := 0; i < len(magicBody); i++ { + if r.buf[i] != magicBody[i] { + r.err = ErrCorrupt + return 0, r.err + } + } + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + r.err = ErrUnsupported + return 0, r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if !r.readFull(r.buf[:chunkLen], false) { + return 0, r.err + } + } +} diff --git a/vendor/github.com/golang/snappy/decode_amd64.go b/vendor/github.com/golang/snappy/decode_amd64.go new file mode 100644 index 000000000..fcd192b84 --- /dev/null +++ b/vendor/github.com/golang/snappy/decode_amd64.go @@ -0,0 +1,14 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build !noasm + +package snappy + +// decode has the same semantics as in decode_other.go. 
+// +//go:noescape +func decode(dst, src []byte) int diff --git a/vendor/github.com/golang/snappy/decode_amd64.s b/vendor/github.com/golang/snappy/decode_amd64.s new file mode 100644 index 000000000..e6179f65e --- /dev/null +++ b/vendor/github.com/golang/snappy/decode_amd64.s @@ -0,0 +1,490 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" + +// The asm code generally follows the pure Go code in decode_other.go, except +// where marked with a "!!!". + +// func decode(dst, src []byte) int +// +// All local variables fit into registers. The non-zero stack size is only to +// spill registers and push args when issuing a CALL. The register allocation: +// - AX scratch +// - BX scratch +// - CX length or x +// - DX offset +// - SI &src[s] +// - DI &dst[d] +// + R8 dst_base +// + R9 dst_len +// + R10 dst_base + dst_len +// + R11 src_base +// + R12 src_len +// + R13 src_base + src_len +// - R14 used by doCopy +// - R15 used by doCopy +// +// The registers R8-R13 (marked with a "+") are set at the start of the +// function, and after a CALL returns, and are not otherwise modified. +// +// The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI. +// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI. +TEXT ·decode(SB), NOSPLIT, $48-56 + // Initialize SI, DI and R8-R13. + MOVQ dst_base+0(FP), R8 + MOVQ dst_len+8(FP), R9 + MOVQ R8, DI + MOVQ R8, R10 + ADDQ R9, R10 + MOVQ src_base+24(FP), R11 + MOVQ src_len+32(FP), R12 + MOVQ R11, SI + MOVQ R11, R13 + ADDQ R12, R13 + +loop: + // for s < len(src) + CMPQ SI, R13 + JEQ end + + // CX = uint32(src[s]) + // + // switch src[s] & 0x03 + MOVBLZX (SI), CX + MOVL CX, BX + ANDL $3, BX + CMPL BX, $1 + JAE tagCopy + + // ---------------------------------------- + // The code below handles literal tags. 
+ + // case tagLiteral: + // x := uint32(src[s] >> 2) + // switch + SHRL $2, CX + CMPL CX, $60 + JAE tagLit60Plus + + // case x < 60: + // s++ + INCQ SI + +doLit: + // This is the end of the inner "switch", when we have a literal tag. + // + // We assume that CX == x and x fits in a uint32, where x is the variable + // used in the pure Go decode_other.go code. + + // length = int(x) + 1 + // + // Unlike the pure Go code, we don't need to check if length <= 0 because + // CX can hold 64 bits, so the increment cannot overflow. + INCQ CX + + // Prepare to check if copying length bytes will run past the end of dst or + // src. + // + // AX = len(dst) - d + // BX = len(src) - s + MOVQ R10, AX + SUBQ DI, AX + MOVQ R13, BX + SUBQ SI, BX + + // !!! Try a faster technique for short (16 or fewer bytes) copies. + // + // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { + // goto callMemmove // Fall back on calling runtime·memmove. + // } + // + // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s + // against 21 instead of 16, because it cannot assume that all of its input + // is contiguous in memory and so it needs to leave enough source bytes to + // read the next tag without refilling buffers, but Go's Decode assumes + // contiguousness (the src argument is a []byte). + CMPQ CX, $16 + JGT callMemmove + CMPQ AX, $16 + JLT callMemmove + CMPQ BX, $16 + JLT callMemmove + + // !!! Implement the copy from src to dst as a 16-byte load and store. + // (Decode's documentation says that dst and src must not overlap.) + // + // This always copies 16 bytes, instead of only length bytes, but that's + // OK. If the input is a valid Snappy encoding then subsequent iterations + // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a + // non-nil error), so the overrun will be ignored. + // + // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or + // 16-byte loads and stores. 
This technique probably wouldn't be as + // effective on architectures that are fussier about alignment. + MOVOU 0(SI), X0 + MOVOU X0, 0(DI) + + // d += length + // s += length + ADDQ CX, DI + ADDQ CX, SI + JMP loop + +callMemmove: + // if length > len(dst)-d || length > len(src)-s { etc } + CMPQ CX, AX + JGT errCorrupt + CMPQ CX, BX + JGT errCorrupt + + // copy(dst[d:], src[s:s+length]) + // + // This means calling runtime·memmove(&dst[d], &src[s], length), so we push + // DI, SI and CX as arguments. Coincidentally, we also need to spill those + // three registers to the stack, to save local variables across the CALL. + MOVQ DI, 0(SP) + MOVQ SI, 8(SP) + MOVQ CX, 16(SP) + MOVQ DI, 24(SP) + MOVQ SI, 32(SP) + MOVQ CX, 40(SP) + CALL runtime·memmove(SB) + + // Restore local variables: unspill registers from the stack and + // re-calculate R8-R13. + MOVQ 24(SP), DI + MOVQ 32(SP), SI + MOVQ 40(SP), CX + MOVQ dst_base+0(FP), R8 + MOVQ dst_len+8(FP), R9 + MOVQ R8, R10 + ADDQ R9, R10 + MOVQ src_base+24(FP), R11 + MOVQ src_len+32(FP), R12 + MOVQ R11, R13 + ADDQ R12, R13 + + // d += length + // s += length + ADDQ CX, DI + ADDQ CX, SI + JMP loop + +tagLit60Plus: + // !!! This fragment does the + // + // s += x - 58; if uint(s) > uint(len(src)) { etc } + // + // checks. In the asm version, we code it once instead of once per switch case. 
+ ADDQ CX, SI + SUBQ $58, SI + MOVQ SI, BX + SUBQ R11, BX + CMPQ BX, R12 + JA errCorrupt + + // case x == 60: + CMPL CX, $61 + JEQ tagLit61 + JA tagLit62Plus + + // x = uint32(src[s-1]) + MOVBLZX -1(SI), CX + JMP doLit + +tagLit61: + // case x == 61: + // x = uint32(src[s-2]) | uint32(src[s-1])<<8 + MOVWLZX -2(SI), CX + JMP doLit + +tagLit62Plus: + CMPL CX, $62 + JA tagLit63 + + // case x == 62: + // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + MOVWLZX -3(SI), CX + MOVBLZX -1(SI), BX + SHLL $16, BX + ORL BX, CX + JMP doLit + +tagLit63: + // case x == 63: + // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + MOVL -4(SI), CX + JMP doLit + +// The code above handles literal tags. +// ---------------------------------------- +// The code below handles copy tags. + +tagCopy4: + // case tagCopy4: + // s += 5 + ADDQ $5, SI + + // if uint(s) > uint(len(src)) { etc } + MOVQ SI, BX + SUBQ R11, BX + CMPQ BX, R12 + JA errCorrupt + + // length = 1 + int(src[s-5])>>2 + SHRQ $2, CX + INCQ CX + + // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + MOVLQZX -4(SI), DX + JMP doCopy + +tagCopy2: + // case tagCopy2: + // s += 3 + ADDQ $3, SI + + // if uint(s) > uint(len(src)) { etc } + MOVQ SI, BX + SUBQ R11, BX + CMPQ BX, R12 + JA errCorrupt + + // length = 1 + int(src[s-3])>>2 + SHRQ $2, CX + INCQ CX + + // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + MOVWQZX -2(SI), DX + JMP doCopy + +tagCopy: + // We have a copy tag. 
We assume that: + // - BX == src[s] & 0x03 + // - CX == src[s] + CMPQ BX, $2 + JEQ tagCopy2 + JA tagCopy4 + + // case tagCopy1: + // s += 2 + ADDQ $2, SI + + // if uint(s) > uint(len(src)) { etc } + MOVQ SI, BX + SUBQ R11, BX + CMPQ BX, R12 + JA errCorrupt + + // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + MOVQ CX, DX + ANDQ $0xe0, DX + SHLQ $3, DX + MOVBQZX -1(SI), BX + ORQ BX, DX + + // length = 4 + int(src[s-2])>>2&0x7 + SHRQ $2, CX + ANDQ $7, CX + ADDQ $4, CX + +doCopy: + // This is the end of the outer "switch", when we have a copy tag. + // + // We assume that: + // - CX == length && CX > 0 + // - DX == offset + + // if offset <= 0 { etc } + CMPQ DX, $0 + JLE errCorrupt + + // if d < offset { etc } + MOVQ DI, BX + SUBQ R8, BX + CMPQ BX, DX + JLT errCorrupt + + // if length > len(dst)-d { etc } + MOVQ R10, BX + SUBQ DI, BX + CMPQ CX, BX + JGT errCorrupt + + // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length + // + // Set: + // - R14 = len(dst)-d + // - R15 = &dst[d-offset] + MOVQ R10, R14 + SUBQ DI, R14 + MOVQ DI, R15 + SUBQ DX, R15 + + // !!! Try a faster technique for short (16 or fewer bytes) forward copies. + // + // First, try using two 8-byte load/stores, similar to the doLit technique + // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is + // still OK if offset >= 8. Note that this has to be two 8-byte load/stores + // and not one 16-byte load/store, and the first store has to be before the + // second load, due to the overlap if offset is in the range [8, 16). + // + // if length > 16 || offset < 8 || len(dst)-d < 16 { + // goto slowForwardCopy + // } + // copy 16 bytes + // d += length + CMPQ CX, $16 + JGT slowForwardCopy + CMPQ DX, $8 + JLT slowForwardCopy + CMPQ R14, $16 + JLT slowForwardCopy + MOVQ 0(R15), AX + MOVQ AX, 0(DI) + MOVQ 8(R15), BX + MOVQ BX, 8(DI) + ADDQ CX, DI + JMP loop + +slowForwardCopy: + // !!! 
If the forward copy is longer than 16 bytes, or if offset < 8, we + // can still try 8-byte load stores, provided we can overrun up to 10 extra + // bytes. As above, the overrun will be fixed up by subsequent iterations + // of the outermost loop. + // + // The C++ snappy code calls this technique IncrementalCopyFastPath. Its + // commentary says: + // + // ---- + // + // The main part of this loop is a simple copy of eight bytes at a time + // until we've copied (at least) the requested amount of bytes. However, + // if d and d-offset are less than eight bytes apart (indicating a + // repeating pattern of length < 8), we first need to expand the pattern in + // order to get the correct results. For instance, if the buffer looks like + // this, with the eight-byte and patterns marked as + // intervals: + // + // abxxxxxxxxxxxx + // [------] d-offset + // [------] d + // + // a single eight-byte copy from to will repeat the pattern + // once, after which we can move two bytes without moving : + // + // ababxxxxxxxxxx + // [------] d-offset + // [------] d + // + // and repeat the exercise until the two no longer overlap. + // + // This allows us to do very well in the special case of one single byte + // repeated many times, without taking a big hit for more general cases. + // + // The worst case of extra writing past the end of the match occurs when + // offset == 1 and length == 1; the last copy will read from byte positions + // [0..7] and write to [4..11], whereas it was only supposed to write to + // position 1. Thus, ten excess bytes. + // + // ---- + // + // That "10 byte overrun" worst case is confirmed by Go's + // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy + // and finishSlowForwardCopy algorithm. + // + // if length > len(dst)-d-10 { + // goto verySlowForwardCopy + // } + SUBQ $10, R14 + CMPQ CX, R14 + JGT verySlowForwardCopy + +makeOffsetAtLeast8: + // !!! 
As above, expand the pattern so that offset >= 8 and we can use + // 8-byte load/stores. + // + // for offset < 8 { + // copy 8 bytes from dst[d-offset:] to dst[d:] + // length -= offset + // d += offset + // offset += offset + // // The two previous lines together means that d-offset, and therefore + // // R15, is unchanged. + // } + CMPQ DX, $8 + JGE fixUpSlowForwardCopy + MOVQ (R15), BX + MOVQ BX, (DI) + SUBQ DX, CX + ADDQ DX, DI + ADDQ DX, DX + JMP makeOffsetAtLeast8 + +fixUpSlowForwardCopy: + // !!! Add length (which might be negative now) to d (implied by DI being + // &dst[d]) so that d ends up at the right place when we jump back to the + // top of the loop. Before we do that, though, we save DI to AX so that, if + // length is positive, copying the remaining length bytes will write to the + // right place. + MOVQ DI, AX + ADDQ CX, DI + +finishSlowForwardCopy: + // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative + // length means that we overrun, but as above, that will be fixed up by + // subsequent iterations of the outermost loop. + CMPQ CX, $0 + JLE loop + MOVQ (R15), BX + MOVQ BX, (AX) + ADDQ $8, R15 + ADDQ $8, AX + SUBQ $8, CX + JMP finishSlowForwardCopy + +verySlowForwardCopy: + // verySlowForwardCopy is a simple implementation of forward copy. In C + // parlance, this is a do/while loop instead of a while loop, since we know + // that length > 0. In Go syntax: + // + // for { + // dst[d] = dst[d - offset] + // d++ + // length-- + // if length == 0 { + // break + // } + // } + MOVB (R15), BX + MOVB BX, (DI) + INCQ R15 + INCQ DI + DECQ CX + JNZ verySlowForwardCopy + JMP loop + +// The code above handles copy tags. +// ---------------------------------------- + +end: + // This is the end of the "for s < len(src)". 
+ // + // if d != len(dst) { etc } + CMPQ DI, R10 + JNE errCorrupt + + // return 0 + MOVQ $0, ret+48(FP) + RET + +errCorrupt: + // return decodeErrCodeCorrupt + MOVQ $1, ret+48(FP) + RET diff --git a/vendor/github.com/golang/snappy/decode_other.go b/vendor/github.com/golang/snappy/decode_other.go new file mode 100644 index 000000000..8c9f2049b --- /dev/null +++ b/vendor/github.com/golang/snappy/decode_other.go @@ -0,0 +1,101 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 appengine !gc noasm + +package snappy + +// decode writes the decoding of src to dst. It assumes that the varint-encoded +// length of the decompressed bytes has already been read, and that len(dst) +// equals that length. +// +// It returns 0 on success or a decodeErrCodeXxx error code on failure. +func decode(dst, src []byte) int { + var d, s, offset, length int + for s < len(src) { + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-1]) + case x == 61: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-2]) | uint32(src[s-1])<<8 + case x == 62: + s += 4 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + case x == 63: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. 
+ return decodeErrCodeCorrupt + } + x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + } + length = int(x) + 1 + if length <= 0 { + return decodeErrCodeUnsupportedLiteralLength + } + if length > len(dst)-d || length > len(src)-s { + return decodeErrCodeCorrupt + } + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 4 + int(src[s-2])>>2&0x7 + offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + + case tagCopy2: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-3])>>2 + offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + + case tagCopy4: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-5])>>2 + offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + } + + if offset <= 0 || d < offset || length > len(dst)-d { + return decodeErrCodeCorrupt + } + // Copy from an earlier sub-slice of dst to a later sub-slice. Unlike + // the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + for end := d + length; d != end; d++ { + dst[d] = dst[d-offset] + } + } + if d != len(dst) { + return decodeErrCodeCorrupt + } + return 0 +} diff --git a/vendor/github.com/golang/snappy/encode.go b/vendor/github.com/golang/snappy/encode.go new file mode 100644 index 000000000..8d393e904 --- /dev/null +++ b/vendor/github.com/golang/snappy/encode.go @@ -0,0 +1,285 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package snappy + +import ( + "encoding/binary" + "errors" + "io" +) + +// Encode returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +func Encode(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if len(dst) < n { + dst = make([]byte, n) + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + for len(src) > 0 { + p := src + src = nil + if len(p) > maxBlockSize { + p, src = p[:maxBlockSize], p[maxBlockSize:] + } + if len(p) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], p) + } else { + d += encodeBlock(dst[d:], p) + } + } + return dst[:d] +} + +// inputMargin is the minimum number of extra input bytes to keep, inside +// encodeBlock's inner loop. On some architectures, this margin lets us +// implement a fast path for emitLiteral, where the copy of short (<= 16 byte) +// literals can be implemented as a single load to and store from a 16-byte +// register. That literal's actual length can be as short as 1 byte, so this +// can copy up to 15 bytes too much, but that's OK as subsequent iterations of +// the encoding loop will fix up the copy overrun, and this inputMargin ensures +// that we don't overrun the dst and src buffers. +const inputMargin = 16 - 1 + +// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that +// could be encoded with a copy tag. This is the minimum with respect to the +// algorithm used by encodeBlock, not a minimum enforced by the file format. +// +// The encoded output must start with at least a 1 byte literal, as there are +// no previous bytes to copy. 
A minimal (1 byte) copy after that, generated +// from an emitCopy call in encodeBlock's main loop, would require at least +// another inputMargin bytes, for the reason above: we want any emitLiteral +// calls inside encodeBlock's main loop to use the fast path if possible, which +// requires being able to overrun by inputMargin bytes. Thus, +// minNonLiteralBlockSize equals 1 + 1 + inputMargin. +// +// The C++ code doesn't use this exact threshold, but it could, as discussed at +// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion +// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an +// optimization. It should not affect the encoded form. This is tested by +// TestSameEncodingAsCppShortCopies. +const minNonLiteralBlockSize = 1 + 1 + inputMargin + +// MaxEncodedLen returns the maximum length of a snappy block, given its +// uncompressed length. +// +// It will return a negative value if srcLen is too large to encode. +func MaxEncodedLen(srcLen int) int { + n := uint64(srcLen) + if n > 0xffffffff { + return -1 + } + // Compressed data can be defined as: + // compressed := item* literal* + // item := literal* copy + // + // The trailing literal sequence has a space blowup of at most 62/60 + // since a literal of length 60 needs one tag byte + one extra byte + // for length information. + // + // Item blowup is trickier to measure. Suppose the "copy" op copies + // 4 bytes of data. Because of a special check in the encoding code, + // we produce a 4-byte copy only if the offset is < 65536. Therefore + // the copy op takes 3 bytes to encode, and this type of item leads + // to at most the 62/60 blowup for representing literals. + // + // Suppose the "copy" op copies 5 bytes of data. If the offset is big + // enough, it will take 5 bytes to encode the copy op. Therefore the + // worst case here is a one-byte literal followed by a five-byte copy. + // That is, 6 bytes of input turn into 7 bytes of "compressed" data. 
+ // + // This last factor dominates the blowup, so the final estimate is: + n = 32 + n + n/6 + if n > 0xffffffff { + return -1 + } + return int(n) +} + +var errClosed = errors.New("snappy: Writer is closed") + +// NewWriter returns a new Writer that compresses to w. +// +// The Writer returned does not buffer writes. There is no need to Flush or +// Close such a Writer. +// +// Deprecated: the Writer returned is not suitable for many small writes, only +// for few large writes. Use NewBufferedWriter instead, which is efficient +// regardless of the frequency and shape of the writes, and remember to Close +// that Writer when done. +func NewWriter(w io.Writer) *Writer { + return &Writer{ + w: w, + obuf: make([]byte, obufLen), + } +} + +// NewBufferedWriter returns a new Writer that compresses to w, using the +// framing format described at +// https://github.com/google/snappy/blob/master/framing_format.txt +// +// The Writer returned buffers writes. Users must call Close to guarantee all +// data has been forwarded to the underlying io.Writer. They may also call +// Flush zero or more times before calling Close. +func NewBufferedWriter(w io.Writer) *Writer { + return &Writer{ + w: w, + ibuf: make([]byte, 0, maxBlockSize), + obuf: make([]byte, obufLen), + } +} + +// Writer is an io.Writer that can write Snappy-compressed bytes. +type Writer struct { + w io.Writer + err error + + // ibuf is a buffer for the incoming (uncompressed) bytes. + // + // Its use is optional. For backwards compatibility, Writers created by the + // NewWriter function have ibuf == nil, do not buffer incoming bytes, and + // therefore do not need to be Flush'ed or Close'd. + ibuf []byte + + // obuf is a buffer for the outgoing (compressed) bytes. + obuf []byte + + // wroteStreamHeader is whether we have written the stream header. + wroteStreamHeader bool +} + +// Reset discards the writer's state and switches the Snappy writer to write to +// w. 
This permits reusing a Writer rather than allocating a new one. +func (w *Writer) Reset(writer io.Writer) { + w.w = writer + w.err = nil + if w.ibuf != nil { + w.ibuf = w.ibuf[:0] + } + w.wroteStreamHeader = false +} + +// Write satisfies the io.Writer interface. +func (w *Writer) Write(p []byte) (nRet int, errRet error) { + if w.ibuf == nil { + // Do not buffer incoming bytes. This does not perform or compress well + // if the caller of Writer.Write writes many small slices. This + // behavior is therefore deprecated, but still supported for backwards + // compatibility with code that doesn't explicitly Flush or Close. + return w.write(p) + } + + // The remainder of this method is based on bufio.Writer.Write from the + // standard library. + + for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil { + var n int + if len(w.ibuf) == 0 { + // Large write, empty buffer. + // Write directly from p to avoid copy. + n, _ = w.write(p) + } else { + n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) + w.ibuf = w.ibuf[:len(w.ibuf)+n] + w.Flush() + } + nRet += n + p = p[n:] + } + if w.err != nil { + return nRet, w.err + } + n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) + w.ibuf = w.ibuf[:len(w.ibuf)+n] + nRet += n + return nRet, nil +} + +func (w *Writer) write(p []byte) (nRet int, errRet error) { + if w.err != nil { + return 0, w.err + } + for len(p) > 0 { + obufStart := len(magicChunk) + if !w.wroteStreamHeader { + w.wroteStreamHeader = true + copy(w.obuf, magicChunk) + obufStart = 0 + } + + var uncompressed []byte + if len(p) > maxBlockSize { + uncompressed, p = p[:maxBlockSize], p[maxBlockSize:] + } else { + uncompressed, p = p, nil + } + checksum := crc(uncompressed) + + // Compress the buffer, discarding the result if the improvement + // isn't at least 12.5%. 
+ compressed := Encode(w.obuf[obufHeaderLen:], uncompressed) + chunkType := uint8(chunkTypeCompressedData) + chunkLen := 4 + len(compressed) + obufEnd := obufHeaderLen + len(compressed) + if len(compressed) >= len(uncompressed)-len(uncompressed)/8 { + chunkType = chunkTypeUncompressedData + chunkLen = 4 + len(uncompressed) + obufEnd = obufHeaderLen + } + + // Fill in the per-chunk header that comes before the body. + w.obuf[len(magicChunk)+0] = chunkType + w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0) + w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8) + w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16) + w.obuf[len(magicChunk)+4] = uint8(checksum >> 0) + w.obuf[len(magicChunk)+5] = uint8(checksum >> 8) + w.obuf[len(magicChunk)+6] = uint8(checksum >> 16) + w.obuf[len(magicChunk)+7] = uint8(checksum >> 24) + + if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil { + w.err = err + return nRet, err + } + if chunkType == chunkTypeUncompressedData { + if _, err := w.w.Write(uncompressed); err != nil { + w.err = err + return nRet, err + } + } + nRet += len(uncompressed) + } + return nRet, nil +} + +// Flush flushes the Writer to its underlying io.Writer. +func (w *Writer) Flush() error { + if w.err != nil { + return w.err + } + if len(w.ibuf) == 0 { + return nil + } + w.write(w.ibuf) + w.ibuf = w.ibuf[:0] + return w.err +} + +// Close calls Flush and then closes the Writer. +func (w *Writer) Close() error { + w.Flush() + ret := w.err + if w.err == nil { + w.err = errClosed + } + return ret +} diff --git a/vendor/github.com/golang/snappy/encode_amd64.go b/vendor/github.com/golang/snappy/encode_amd64.go new file mode 100644 index 000000000..150d91bc8 --- /dev/null +++ b/vendor/github.com/golang/snappy/encode_amd64.go @@ -0,0 +1,29 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !appengine +// +build gc +// +build !noasm + +package snappy + +// emitLiteral has the same semantics as in encode_other.go. +// +//go:noescape +func emitLiteral(dst, lit []byte) int + +// emitCopy has the same semantics as in encode_other.go. +// +//go:noescape +func emitCopy(dst []byte, offset, length int) int + +// extendMatch has the same semantics as in encode_other.go. +// +//go:noescape +func extendMatch(src []byte, i, j int) int + +// encodeBlock has the same semantics as in encode_other.go. +// +//go:noescape +func encodeBlock(dst, src []byte) (d int) diff --git a/vendor/github.com/golang/snappy/encode_amd64.s b/vendor/github.com/golang/snappy/encode_amd64.s new file mode 100644 index 000000000..adfd979fe --- /dev/null +++ b/vendor/github.com/golang/snappy/encode_amd64.s @@ -0,0 +1,730 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" + +// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a +// Go toolchain regression. See https://github.com/golang/go/issues/15426 and +// https://github.com/golang/snappy/issues/29 +// +// As a workaround, the package was built with a known good assembler, and +// those instructions were disassembled by "objdump -d" to yield the +// 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 +// style comments, in AT&T asm syntax. Note that rsp here is a physical +// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm). +// The instructions were then encoded as "BYTE $0x.." sequences, which assemble +// fine on Go 1.6. + +// The asm code generally follows the pure Go code in encode_other.go, except +// where marked with a "!!!". 
+ +// ---------------------------------------------------------------------------- + +// func emitLiteral(dst, lit []byte) int +// +// All local variables fit into registers. The register allocation: +// - AX len(lit) +// - BX n +// - DX return value +// - DI &dst[i] +// - R10 &lit[0] +// +// The 24 bytes of stack space is to call runtime·memmove. +// +// The unusual register allocation of local variables, such as R10 for the +// source pointer, matches the allocation used at the call site in encodeBlock, +// which makes it easier to manually inline this function. +TEXT ·emitLiteral(SB), NOSPLIT, $24-56 + MOVQ dst_base+0(FP), DI + MOVQ lit_base+24(FP), R10 + MOVQ lit_len+32(FP), AX + MOVQ AX, DX + MOVL AX, BX + SUBL $1, BX + + CMPL BX, $60 + JLT oneByte + CMPL BX, $256 + JLT twoBytes + +threeBytes: + MOVB $0xf4, 0(DI) + MOVW BX, 1(DI) + ADDQ $3, DI + ADDQ $3, DX + JMP memmove + +twoBytes: + MOVB $0xf0, 0(DI) + MOVB BX, 1(DI) + ADDQ $2, DI + ADDQ $2, DX + JMP memmove + +oneByte: + SHLB $2, BX + MOVB BX, 0(DI) + ADDQ $1, DI + ADDQ $1, DX + +memmove: + MOVQ DX, ret+48(FP) + + // copy(dst[i:], lit) + // + // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push + // DI, R10 and AX as arguments. + MOVQ DI, 0(SP) + MOVQ R10, 8(SP) + MOVQ AX, 16(SP) + CALL runtime·memmove(SB) + RET + +// ---------------------------------------------------------------------------- + +// func emitCopy(dst []byte, offset, length int) int +// +// All local variables fit into registers. The register allocation: +// - AX length +// - SI &dst[0] +// - DI &dst[i] +// - R11 offset +// +// The unusual register allocation of local variables, such as R11 for the +// offset, matches the allocation used at the call site in encodeBlock, which +// makes it easier to manually inline this function. 
+TEXT ·emitCopy(SB), NOSPLIT, $0-48 + MOVQ dst_base+0(FP), DI + MOVQ DI, SI + MOVQ offset+24(FP), R11 + MOVQ length+32(FP), AX + +loop0: + // for length >= 68 { etc } + CMPL AX, $68 + JLT step1 + + // Emit a length 64 copy, encoded as 3 bytes. + MOVB $0xfe, 0(DI) + MOVW R11, 1(DI) + ADDQ $3, DI + SUBL $64, AX + JMP loop0 + +step1: + // if length > 64 { etc } + CMPL AX, $64 + JLE step2 + + // Emit a length 60 copy, encoded as 3 bytes. + MOVB $0xee, 0(DI) + MOVW R11, 1(DI) + ADDQ $3, DI + SUBL $60, AX + +step2: + // if length >= 12 || offset >= 2048 { goto step3 } + CMPL AX, $12 + JGE step3 + CMPL R11, $2048 + JGE step3 + + // Emit the remaining copy, encoded as 2 bytes. + MOVB R11, 1(DI) + SHRL $8, R11 + SHLB $5, R11 + SUBB $4, AX + SHLB $2, AX + ORB AX, R11 + ORB $1, R11 + MOVB R11, 0(DI) + ADDQ $2, DI + + // Return the number of bytes written. + SUBQ SI, DI + MOVQ DI, ret+40(FP) + RET + +step3: + // Emit the remaining copy, encoded as 3 bytes. + SUBL $1, AX + SHLB $2, AX + ORB $2, AX + MOVB AX, 0(DI) + MOVW R11, 1(DI) + ADDQ $3, DI + + // Return the number of bytes written. + SUBQ SI, DI + MOVQ DI, ret+40(FP) + RET + +// ---------------------------------------------------------------------------- + +// func extendMatch(src []byte, i, j int) int +// +// All local variables fit into registers. The register allocation: +// - DX &src[0] +// - SI &src[j] +// - R13 &src[len(src) - 8] +// - R14 &src[len(src)] +// - R15 &src[i] +// +// The unusual register allocation of local variables, such as R15 for a source +// pointer, matches the allocation used at the call site in encodeBlock, which +// makes it easier to manually inline this function. +TEXT ·extendMatch(SB), NOSPLIT, $0-48 + MOVQ src_base+0(FP), DX + MOVQ src_len+8(FP), R14 + MOVQ i+24(FP), R15 + MOVQ j+32(FP), SI + ADDQ DX, R14 + ADDQ DX, R15 + ADDQ DX, SI + MOVQ R14, R13 + SUBQ $8, R13 + +cmp8: + // As long as we are 8 or more bytes before the end of src, we can load and + // compare 8 bytes at a time. 
If those 8 bytes are equal, repeat. + CMPQ SI, R13 + JA cmp1 + MOVQ (R15), AX + MOVQ (SI), BX + CMPQ AX, BX + JNE bsf + ADDQ $8, R15 + ADDQ $8, SI + JMP cmp8 + +bsf: + // If those 8 bytes were not equal, XOR the two 8 byte values, and return + // the index of the first byte that differs. The BSF instruction finds the + // least significant 1 bit, the amd64 architecture is little-endian, and + // the shift by 3 converts a bit index to a byte index. + XORQ AX, BX + BSFQ BX, BX + SHRQ $3, BX + ADDQ BX, SI + + // Convert from &src[ret] to ret. + SUBQ DX, SI + MOVQ SI, ret+40(FP) + RET + +cmp1: + // In src's tail, compare 1 byte at a time. + CMPQ SI, R14 + JAE extendMatchEnd + MOVB (R15), AX + MOVB (SI), BX + CMPB AX, BX + JNE extendMatchEnd + ADDQ $1, R15 + ADDQ $1, SI + JMP cmp1 + +extendMatchEnd: + // Convert from &src[ret] to ret. + SUBQ DX, SI + MOVQ SI, ret+40(FP) + RET + +// ---------------------------------------------------------------------------- + +// func encodeBlock(dst, src []byte) (d int) +// +// All local variables fit into registers, other than "var table". The register +// allocation: +// - AX . . +// - BX . . +// - CX 56 shift (note that amd64 shifts by non-immediates must use CX). +// - DX 64 &src[0], tableSize +// - SI 72 &src[s] +// - DI 80 &dst[d] +// - R9 88 sLimit +// - R10 . &src[nextEmit] +// - R11 96 prevHash, currHash, nextHash, offset +// - R12 104 &src[base], skip +// - R13 . &src[nextS], &src[len(src) - 8] +// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x +// - R15 112 candidate +// +// The second column (56, 64, etc) is the stack offset to spill the registers +// when calling other functions. We could pack this slightly tighter, but it's +// simpler to have a dedicated spill map independent of the function called. +// +// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. 
An +// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill +// local variables (registers) during calls gives 32768 + 56 + 64 = 32888. +TEXT ·encodeBlock(SB), 0, $32888-56 + MOVQ dst_base+0(FP), DI + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), R14 + + // shift, tableSize := uint32(32-8), 1<<8 + MOVQ $24, CX + MOVQ $256, DX + +calcShift: + // for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { + // shift-- + // } + CMPQ DX, $16384 + JGE varTable + CMPQ DX, R14 + JGE varTable + SUBQ $1, CX + SHLQ $1, DX + JMP calcShift + +varTable: + // var table [maxTableSize]uint16 + // + // In the asm code, unlike the Go code, we can zero-initialize only the + // first tableSize elements. Each uint16 element is 2 bytes and each MOVOU + // writes 16 bytes, so we can do only tableSize/8 writes instead of the + // 2048 writes that would zero-initialize all of table's 32768 bytes. + SHRQ $3, DX + LEAQ table-32768(SP), BX + PXOR X0, X0 + +memclr: + MOVOU X0, 0(BX) + ADDQ $16, BX + SUBQ $1, DX + JNZ memclr + + // !!! DX = &src[0] + MOVQ SI, DX + + // sLimit := len(src) - inputMargin + MOVQ R14, R9 + SUBQ $15, R9 + + // !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't + // change for the rest of the function. 
+ MOVQ CX, 56(SP) + MOVQ DX, 64(SP) + MOVQ R9, 88(SP) + + // nextEmit := 0 + MOVQ DX, R10 + + // s := 1 + ADDQ $1, SI + + // nextHash := hash(load32(src, s), shift) + MOVL 0(SI), R11 + IMULL $0x1e35a7bd, R11 + SHRL CX, R11 + +outer: + // for { etc } + + // skip := 32 + MOVQ $32, R12 + + // nextS := s + MOVQ SI, R13 + + // candidate := 0 + MOVQ $0, R15 + +inner0: + // for { etc } + + // s := nextS + MOVQ R13, SI + + // bytesBetweenHashLookups := skip >> 5 + MOVQ R12, R14 + SHRQ $5, R14 + + // nextS = s + bytesBetweenHashLookups + ADDQ R14, R13 + + // skip += bytesBetweenHashLookups + ADDQ R14, R12 + + // if nextS > sLimit { goto emitRemainder } + MOVQ R13, AX + SUBQ DX, AX + CMPQ AX, R9 + JA emitRemainder + + // candidate = int(table[nextHash]) + // XXX: MOVWQZX table-32768(SP)(R11*2), R15 + // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 + BYTE $0x4e + BYTE $0x0f + BYTE $0xb7 + BYTE $0x7c + BYTE $0x5c + BYTE $0x78 + + // table[nextHash] = uint16(s) + MOVQ SI, AX + SUBQ DX, AX + + // XXX: MOVW AX, table-32768(SP)(R11*2) + // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) + BYTE $0x66 + BYTE $0x42 + BYTE $0x89 + BYTE $0x44 + BYTE $0x5c + BYTE $0x78 + + // nextHash = hash(load32(src, nextS), shift) + MOVL 0(R13), R11 + IMULL $0x1e35a7bd, R11 + SHRL CX, R11 + + // if load32(src, s) != load32(src, candidate) { continue } break + MOVL 0(SI), AX + MOVL (DX)(R15*1), BX + CMPL AX, BX + JNE inner0 + +fourByteMatch: + // As per the encode_other.go code: + // + // A 4-byte match has been found. We'll later see etc. + + // !!! Jump to a fast path for short (<= 16 byte) literals. See the comment + // on inputMargin in encode.go. + MOVQ SI, AX + SUBQ R10, AX + CMPQ AX, $16 + JLE emitLiteralFastPath + + // ---------------------------------------- + // Begin inline of the emitLiteral call. 
+ // + // d += emitLiteral(dst[d:], src[nextEmit:s]) + + MOVL AX, BX + SUBL $1, BX + + CMPL BX, $60 + JLT inlineEmitLiteralOneByte + CMPL BX, $256 + JLT inlineEmitLiteralTwoBytes + +inlineEmitLiteralThreeBytes: + MOVB $0xf4, 0(DI) + MOVW BX, 1(DI) + ADDQ $3, DI + JMP inlineEmitLiteralMemmove + +inlineEmitLiteralTwoBytes: + MOVB $0xf0, 0(DI) + MOVB BX, 1(DI) + ADDQ $2, DI + JMP inlineEmitLiteralMemmove + +inlineEmitLiteralOneByte: + SHLB $2, BX + MOVB BX, 0(DI) + ADDQ $1, DI + +inlineEmitLiteralMemmove: + // Spill local variables (registers) onto the stack; call; unspill. + // + // copy(dst[i:], lit) + // + // This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push + // DI, R10 and AX as arguments. + MOVQ DI, 0(SP) + MOVQ R10, 8(SP) + MOVQ AX, 16(SP) + ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)". + MOVQ SI, 72(SP) + MOVQ DI, 80(SP) + MOVQ R15, 112(SP) + CALL runtime·memmove(SB) + MOVQ 56(SP), CX + MOVQ 64(SP), DX + MOVQ 72(SP), SI + MOVQ 80(SP), DI + MOVQ 88(SP), R9 + MOVQ 112(SP), R15 + JMP inner1 + +inlineEmitLiteralEnd: + // End inline of the emitLiteral call. + // ---------------------------------------- + +emitLiteralFastPath: + // !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2". + MOVB AX, BX + SUBB $1, BX + SHLB $2, BX + MOVB BX, (DI) + ADDQ $1, DI + + // !!! Implement the copy from lit to dst as a 16-byte load and store. + // (Encode's documentation says that dst and src must not overlap.) + // + // This always copies 16 bytes, instead of only len(lit) bytes, but that's + // OK. Subsequent iterations will fix up the overrun. + // + // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or + // 16-byte loads and stores. This technique probably wouldn't be as + // effective on architectures that are fussier about alignment. + MOVOU 0(R10), X0 + MOVOU X0, 0(DI) + ADDQ AX, DI + +inner1: + // for { etc } + + // base := s + MOVQ SI, R12 + + // !!! 
offset := base - candidate + MOVQ R12, R11 + SUBQ R15, R11 + SUBQ DX, R11 + + // ---------------------------------------- + // Begin inline of the extendMatch call. + // + // s = extendMatch(src, candidate+4, s+4) + + // !!! R14 = &src[len(src)] + MOVQ src_len+32(FP), R14 + ADDQ DX, R14 + + // !!! R13 = &src[len(src) - 8] + MOVQ R14, R13 + SUBQ $8, R13 + + // !!! R15 = &src[candidate + 4] + ADDQ $4, R15 + ADDQ DX, R15 + + // !!! s += 4 + ADDQ $4, SI + +inlineExtendMatchCmp8: + // As long as we are 8 or more bytes before the end of src, we can load and + // compare 8 bytes at a time. If those 8 bytes are equal, repeat. + CMPQ SI, R13 + JA inlineExtendMatchCmp1 + MOVQ (R15), AX + MOVQ (SI), BX + CMPQ AX, BX + JNE inlineExtendMatchBSF + ADDQ $8, R15 + ADDQ $8, SI + JMP inlineExtendMatchCmp8 + +inlineExtendMatchBSF: + // If those 8 bytes were not equal, XOR the two 8 byte values, and return + // the index of the first byte that differs. The BSF instruction finds the + // least significant 1 bit, the amd64 architecture is little-endian, and + // the shift by 3 converts a bit index to a byte index. + XORQ AX, BX + BSFQ BX, BX + SHRQ $3, BX + ADDQ BX, SI + JMP inlineExtendMatchEnd + +inlineExtendMatchCmp1: + // In src's tail, compare 1 byte at a time. + CMPQ SI, R14 + JAE inlineExtendMatchEnd + MOVB (R15), AX + MOVB (SI), BX + CMPB AX, BX + JNE inlineExtendMatchEnd + ADDQ $1, R15 + ADDQ $1, SI + JMP inlineExtendMatchCmp1 + +inlineExtendMatchEnd: + // End inline of the extendMatch call. + // ---------------------------------------- + + // ---------------------------------------- + // Begin inline of the emitCopy call. + // + // d += emitCopy(dst[d:], base-candidate, s-base) + + // !!! length := s - base + MOVQ SI, AX + SUBQ R12, AX + +inlineEmitCopyLoop0: + // for length >= 68 { etc } + CMPL AX, $68 + JLT inlineEmitCopyStep1 + + // Emit a length 64 copy, encoded as 3 bytes. 
+ MOVB $0xfe, 0(DI) + MOVW R11, 1(DI) + ADDQ $3, DI + SUBL $64, AX + JMP inlineEmitCopyLoop0 + +inlineEmitCopyStep1: + // if length > 64 { etc } + CMPL AX, $64 + JLE inlineEmitCopyStep2 + + // Emit a length 60 copy, encoded as 3 bytes. + MOVB $0xee, 0(DI) + MOVW R11, 1(DI) + ADDQ $3, DI + SUBL $60, AX + +inlineEmitCopyStep2: + // if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 } + CMPL AX, $12 + JGE inlineEmitCopyStep3 + CMPL R11, $2048 + JGE inlineEmitCopyStep3 + + // Emit the remaining copy, encoded as 2 bytes. + MOVB R11, 1(DI) + SHRL $8, R11 + SHLB $5, R11 + SUBB $4, AX + SHLB $2, AX + ORB AX, R11 + ORB $1, R11 + MOVB R11, 0(DI) + ADDQ $2, DI + JMP inlineEmitCopyEnd + +inlineEmitCopyStep3: + // Emit the remaining copy, encoded as 3 bytes. + SUBL $1, AX + SHLB $2, AX + ORB $2, AX + MOVB AX, 0(DI) + MOVW R11, 1(DI) + ADDQ $3, DI + +inlineEmitCopyEnd: + // End inline of the emitCopy call. + // ---------------------------------------- + + // nextEmit = s + MOVQ SI, R10 + + // if s >= sLimit { goto emitRemainder } + MOVQ SI, AX + SUBQ DX, AX + CMPQ AX, R9 + JAE emitRemainder + + // As per the encode_other.go code: + // + // We could immediately etc. 
+ + // x := load64(src, s-1) + MOVQ -1(SI), R14 + + // prevHash := hash(uint32(x>>0), shift) + MOVL R14, R11 + IMULL $0x1e35a7bd, R11 + SHRL CX, R11 + + // table[prevHash] = uint16(s-1) + MOVQ SI, AX + SUBQ DX, AX + SUBQ $1, AX + + // XXX: MOVW AX, table-32768(SP)(R11*2) + // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) + BYTE $0x66 + BYTE $0x42 + BYTE $0x89 + BYTE $0x44 + BYTE $0x5c + BYTE $0x78 + + // currHash := hash(uint32(x>>8), shift) + SHRQ $8, R14 + MOVL R14, R11 + IMULL $0x1e35a7bd, R11 + SHRL CX, R11 + + // candidate = int(table[currHash]) + // XXX: MOVWQZX table-32768(SP)(R11*2), R15 + // XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 + BYTE $0x4e + BYTE $0x0f + BYTE $0xb7 + BYTE $0x7c + BYTE $0x5c + BYTE $0x78 + + // table[currHash] = uint16(s) + ADDQ $1, AX + + // XXX: MOVW AX, table-32768(SP)(R11*2) + // XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) + BYTE $0x66 + BYTE $0x42 + BYTE $0x89 + BYTE $0x44 + BYTE $0x5c + BYTE $0x78 + + // if uint32(x>>8) == load32(src, candidate) { continue } + MOVL (DX)(R15*1), BX + CMPL R14, BX + JEQ inner1 + + // nextHash = hash(uint32(x>>16), shift) + SHRQ $8, R14 + MOVL R14, R11 + IMULL $0x1e35a7bd, R11 + SHRL CX, R11 + + // s++ + ADDQ $1, SI + + // break out of the inner1 for loop, i.e. continue the outer loop. + JMP outer + +emitRemainder: + // if nextEmit < len(src) { etc } + MOVQ src_len+32(FP), AX + ADDQ DX, AX + CMPQ R10, AX + JEQ encodeBlockEnd + + // d += emitLiteral(dst[d:], src[nextEmit:]) + // + // Push args. + MOVQ DI, 0(SP) + MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative. + MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative. + MOVQ R10, 24(SP) + SUBQ R10, AX + MOVQ AX, 32(SP) + MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative. + + // Spill local variables (registers) onto the stack; call; unspill. + MOVQ DI, 80(SP) + CALL ·emitLiteral(SB) + MOVQ 80(SP), DI + + // Finish the "d +=" part of "d += emitLiteral(etc)". 
+ ADDQ 48(SP), DI + +encodeBlockEnd: + MOVQ dst_base+0(FP), AX + SUBQ AX, DI + MOVQ DI, d+48(FP) + RET diff --git a/vendor/github.com/golang/snappy/encode_other.go b/vendor/github.com/golang/snappy/encode_other.go new file mode 100644 index 000000000..dbcae905e --- /dev/null +++ b/vendor/github.com/golang/snappy/encode_other.go @@ -0,0 +1,238 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 appengine !gc noasm + +package snappy + +func load32(b []byte, i int) uint32 { + b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func load64(b []byte, i int) uint64 { + b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= len(lit) && len(lit) <= 65536 +func emitLiteral(dst, lit []byte) int { + i, n := 0, uint(len(lit)-1) + switch { + case n < 60: + dst[0] = uint8(n)<<2 | tagLiteral + i = 1 + case n < 1<<8: + dst[0] = 60<<2 | tagLiteral + dst[1] = uint8(n) + i = 2 + default: + dst[0] = 61<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + i = 3 + } + return i + copy(dst[i:], lit) +} + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= 65535 +// 4 <= length && length <= 65535 +func emitCopy(dst []byte, offset, length int) int { + i := 0 + // The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. 
The + // threshold for this loop is a little higher (at 68 = 64 + 4), and the + // length emitted down below is is a little lower (at 60 = 64 - 4), because + // it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed + // by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as + // a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as + // 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a + // tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an + // encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1. + for length >= 68 { + // Emit a length 64 copy, encoded as 3 bytes. + dst[i+0] = 63<<2 | tagCopy2 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + i += 3 + length -= 64 + } + if length > 64 { + // Emit a length 60 copy, encoded as 3 bytes. + dst[i+0] = 59<<2 | tagCopy2 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + i += 3 + length -= 60 + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[i+0] = uint8(length-1)<<2 | tagCopy2 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + return i + 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + dst[i+1] = uint8(offset) + return i + 2 +} + +// extendMatch returns the largest k such that k <= len(src) and that +// src[i:i+k-j] and src[j:k] have the same contents. +// +// It assumes that: +// 0 <= i && i < j && j <= len(src) +func extendMatch(src []byte, i, j int) int { + for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 { + } + return j +} + +func hash(u, shift uint32) uint32 { + return (u * 0x1e35a7bd) >> shift +} + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. 
+// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlock(dst, src []byte) (d int) { + // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. + // The table element type is uint16, as s < sLimit and sLimit < len(src) + // and len(src) <= maxBlockSize and maxBlockSize == 65536. + const ( + maxTableSize = 1 << 14 + // tableMask is redundant, but helps the compiler eliminate bounds + // checks. + tableMask = maxTableSize - 1 + ) + shift := uint32(32 - 8) + for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { + shift-- + } + // In Go, all array elements are zero-initialized, so there is no advantage + // to a smaller tableSize per se. However, it matches the C++ algorithm, + // and in the asm versions of this code, we can get away with zeroing only + // the first tableSize elements. + var table [maxTableSize]uint16 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + nextHash := hash(load32(src, s), shift) + + for { + // Copied from the C++ snappy implementation: + // + // Heuristic match skipping: If 32 bytes are scanned with no matches + // found, start looking only at every other byte. If 32 more bytes are + // scanned (or skipped), look at every third byte, etc.. When a match + // is found, immediately go back to looking at every byte. 
This is a + // small loss (~5% performance, ~0.1% density) for compressible data + // due to more bookkeeping, but for non-compressible data (such as + // JPEG) it's a huge win since the compressor quickly "realizes" the + // data is incompressible and doesn't bother looking for matches + // everywhere. + // + // The "skip" variable keeps track of how many bytes there are since + // the last match; dividing it by 32 (ie. right-shifting by five) gives + // the number of bytes to move ahead for each iteration. + skip := 32 + + nextS := s + candidate := 0 + for { + s = nextS + bytesBetweenHashLookups := skip >> 5 + nextS = s + bytesBetweenHashLookups + skip += bytesBetweenHashLookups + if nextS > sLimit { + goto emitRemainder + } + candidate = int(table[nextHash&tableMask]) + table[nextHash&tableMask] = uint16(s) + nextHash = hash(load32(src, nextS), shift) + if load32(src, s) == load32(src, candidate) { + break + } + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + + // Extend the 4-byte match as long as possible. 
+ // + // This is an inlined version of: + // s = extendMatch(src, candidate+4, s+4) + s += 4 + for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 { + } + + d += emitCopy(dst[d:], base-candidate, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. + x := load64(src, s-1) + prevHash := hash(uint32(x>>0), shift) + table[prevHash&tableMask] = uint16(s - 1) + currHash := hash(uint32(x>>8), shift) + candidate = int(table[currHash&tableMask]) + table[currHash&tableMask] = uint16(s) + if uint32(x>>8) != load32(src, candidate) { + nextHash = hash(uint32(x>>16), shift) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} diff --git a/vendor/github.com/golang/snappy/snappy.go b/vendor/github.com/golang/snappy/snappy.go new file mode 100644 index 000000000..ece692ea4 --- /dev/null +++ b/vendor/github.com/golang/snappy/snappy.go @@ -0,0 +1,98 @@ +// Copyright 2011 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package snappy implements the Snappy compression format. It aims for very +// high speeds and reasonable compression. +// +// There are actually two Snappy formats: block and stream. They are related, +// but different: trying to decompress block-compressed data as a Snappy stream +// will fail, and vice versa. The block format is the Decode and Encode +// functions and the stream format is the Reader and Writer types. 
+// +// The block format, the more common case, is used when the complete size (the +// number of bytes) of the original data is known upfront, at the time +// compression starts. The stream format, also known as the framing format, is +// for when that isn't always true. +// +// The canonical, C++ implementation is at https://github.com/google/snappy and +// it only implements the block format. +package snappy // import "github.com/golang/snappy" + +import ( + "hash/crc32" +) + +/* +Each encoded block begins with the varint-encoded length of the decoded data, +followed by a sequence of chunks. Chunks begin and end on byte boundaries. The +first byte of each chunk is broken into its 2 least and 6 most significant bits +called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag. +Zero means a literal tag. All other values mean a copy tag. + +For literal tags: + - If m < 60, the next 1 + m bytes are literal bytes. + - Otherwise, let n be the little-endian unsigned integer denoted by the next + m - 59 bytes. The next 1 + n bytes after that are literal bytes. + +For copy tags, length bytes are copied from offset bytes ago, in the style of +Lempel-Ziv compression algorithms. In particular: + - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12). + The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10 + of the offset. The next byte is bits 0-7 of the offset. + - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65). + The length is 1 + m. The offset is the little-endian unsigned integer + denoted by the next 2 bytes. + - For l == 3, this tag is a legacy format that is no longer issued by most + encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in + [1, 65). The length is 1 + m. The offset is the little-endian unsigned + integer denoted by the next 4 bytes. 
+*/ +const ( + tagLiteral = 0x00 + tagCopy1 = 0x01 + tagCopy2 = 0x02 + tagCopy4 = 0x03 +) + +const ( + checksumSize = 4 + chunkHeaderSize = 4 + magicChunk = "\xff\x06\x00\x00" + magicBody + magicBody = "sNaPpY" + + // maxBlockSize is the maximum size of the input to encodeBlock. It is not + // part of the wire format per se, but some parts of the encoder assume + // that an offset fits into a uint16. + // + // Also, for the framing format (Writer type instead of Encode function), + // https://github.com/google/snappy/blob/master/framing_format.txt says + // that "the uncompressed data in a chunk must be no longer than 65536 + // bytes". + maxBlockSize = 65536 + + // maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is + // hard coded to be a const instead of a variable, so that obufLen can also + // be a const. Their equivalence is confirmed by + // TestMaxEncodedLenOfMaxBlockSize. + maxEncodedLenOfMaxBlockSize = 76490 + + obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize + obufLen = obufHeaderLen + maxEncodedLenOfMaxBlockSize +) + +const ( + chunkTypeCompressedData = 0x00 + chunkTypeUncompressedData = 0x01 + chunkTypePadding = 0xfe + chunkTypeStreamIdentifier = 0xff +) + +var crcTable = crc32.MakeTable(crc32.Castagnoli) + +// crc implements the checksum specified in section 3 of +// https://github.com/google/snappy/blob/master/framing_format.txt +func crc(b []byte) uint32 { + c := crc32.Update(0, crcTable, b) + return uint32(c>>15|c<<17) + 0xa282ead8 +} diff --git a/vendor/github.com/opentracing/opentracing-go/.gitignore b/vendor/github.com/opentracing/opentracing-go/.gitignore new file mode 100644 index 000000000..c57100a59 --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/.gitignore @@ -0,0 +1 @@ +coverage.txt diff --git a/vendor/github.com/opentracing/opentracing-go/.travis.yml b/vendor/github.com/opentracing/opentracing-go/.travis.yml new file mode 100644 index 000000000..8d5b75e41 --- /dev/null +++ 
b/vendor/github.com/opentracing/opentracing-go/.travis.yml @@ -0,0 +1,20 @@ +language: go + +matrix: + include: + - go: "1.11.x" + - go: "1.12.x" + - go: "tip" + env: + - LINT=true + - COVERAGE=true + +install: + - if [ "$LINT" == true ]; then go get -u golang.org/x/lint/golint/... ; else echo 'skipping lint'; fi + - go get -u github.com/stretchr/testify/... + +script: + - make test + - go build ./... + - if [ "$LINT" == true ]; then make lint ; else echo 'skipping lint'; fi + - if [ "$COVERAGE" == true ]; then make cover && bash <(curl -s https://codecov.io/bash) ; else echo 'skipping coverage'; fi diff --git a/vendor/github.com/opentracing/opentracing-go/CHANGELOG.md b/vendor/github.com/opentracing/opentracing-go/CHANGELOG.md new file mode 100644 index 000000000..7c14febe1 --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/CHANGELOG.md @@ -0,0 +1,46 @@ +Changes by Version +================== + +1.1.0 (2019-03-23) +------------------- + +Notable changes: +- The library is now released under Apache 2.0 license +- Use Set() instead of Add() in HTTPHeadersCarrier is functionally a breaking change (fixes issue [#159](https://github.com/opentracing/opentracing-go/issues/159)) +- 'golang.org/x/net/context' is replaced with 'context' from the standard library + +List of all changes: + +- Export StartSpanFromContextWithTracer (#214) +- Add IsGlobalTracerRegistered() to indicate if a tracer has been registered (#201) +- Use Set() instead of Add() in HTTPHeadersCarrier (#191) +- Update license to Apache 2.0 (#181) +- Replace 'golang.org/x/net/context' with 'context' (#176) +- Port of Python opentracing/harness/api_check.py to Go (#146) +- Fix race condition in MockSpan.Context() (#170) +- Add PeerHostIPv4.SetString() (#155) +- Add a Noop log field type to log to allow for optional fields (#150) + + +1.0.2 (2017-04-26) +------------------- + +- Add more semantic tags (#139) + + +1.0.1 (2017-02-06) +------------------- + +- Correct spelling in comments +- 
Address race in nextMockID() (#123) +- log: avoid panic marshaling nil error (#131) +- Deprecate InitGlobalTracer in favor of SetGlobalTracer (#128) +- Drop Go 1.5 that fails in Travis (#129) +- Add convenience methods Key() and Value() to log.Field +- Add convenience methods to log.Field (2 years, 6 months ago) + +1.0.0 (2016-09-26) +------------------- + +- This release implements OpenTracing Specification 1.0 (https://opentracing.io/spec) + diff --git a/vendor/github.com/opentracing/opentracing-go/LICENSE b/vendor/github.com/opentracing/opentracing-go/LICENSE new file mode 100644 index 000000000..f0027349e --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2016 The OpenTracing Authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/opentracing/opentracing-go/Makefile b/vendor/github.com/opentracing/opentracing-go/Makefile new file mode 100644 index 000000000..62abb63f5 --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/Makefile @@ -0,0 +1,20 @@ +.DEFAULT_GOAL := test-and-lint + +.PHONY: test-and-lint +test-and-lint: test lint + +.PHONY: test +test: + go test -v -cover -race ./... + +.PHONY: cover +cover: + go test -v -coverprofile=coverage.txt -covermode=atomic -race ./... + +.PHONY: lint +lint: + go fmt ./... + golint ./... + @# Run again with magic to exit non-zero if golint outputs anything. + @! (golint ./... | read dummy) + go vet ./... 
diff --git a/vendor/github.com/opentracing/opentracing-go/README.md b/vendor/github.com/opentracing/opentracing-go/README.md new file mode 100644 index 000000000..6ef1d7c9d --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/README.md @@ -0,0 +1,171 @@ +[![Gitter chat](http://img.shields.io/badge/gitter-join%20chat%20%E2%86%92-brightgreen.svg)](https://gitter.im/opentracing/public) [![Build Status](https://travis-ci.org/opentracing/opentracing-go.svg?branch=master)](https://travis-ci.org/opentracing/opentracing-go) [![GoDoc](https://godoc.org/github.com/opentracing/opentracing-go?status.svg)](http://godoc.org/github.com/opentracing/opentracing-go) +[![Sourcegraph Badge](https://sourcegraph.com/github.com/opentracing/opentracing-go/-/badge.svg)](https://sourcegraph.com/github.com/opentracing/opentracing-go?badge) + +# OpenTracing API for Go + +This package is a Go platform API for OpenTracing. + +## Required Reading + +In order to understand the Go platform API, one must first be familiar with the +[OpenTracing project](https://opentracing.io) and +[terminology](https://opentracing.io/specification/) more specifically. + +## API overview for those adding instrumentation + +Everyday consumers of this `opentracing` package really only need to worry +about a couple of key abstractions: the `StartSpan` function, the `Span` +interface, and binding a `Tracer` at `main()`-time. Here are code snippets +demonstrating some important use cases. + +#### Singleton initialization + +The simplest starting point is `./default_tracer.go`. As early as possible, call + +```go + import "github.com/opentracing/opentracing-go" + import ".../some_tracing_impl" + + func main() { + opentracing.SetGlobalTracer( + // tracing impl specific: + some_tracing_impl.New(...), + ) + ... + } +``` + +#### Non-Singleton initialization + +If you prefer direct control to singletons, manage ownership of the +`opentracing.Tracer` implementation explicitly. 
+ +#### Creating a Span given an existing Go `context.Context` + +If you use `context.Context` in your application, OpenTracing's Go library will +happily rely on it for `Span` propagation. To start a new (blocking child) +`Span`, you can use `StartSpanFromContext`. + +```go + func xyz(ctx context.Context, ...) { + ... + span, ctx := opentracing.StartSpanFromContext(ctx, "operation_name") + defer span.Finish() + span.LogFields( + log.String("event", "soft error"), + log.String("type", "cache timeout"), + log.Int("waited.millis", 1500)) + ... + } +``` + +#### Starting an empty trace by creating a "root span" + +It's always possible to create a "root" `Span` with no parent or other causal +reference. + +```go + func xyz() { + ... + sp := opentracing.StartSpan("operation_name") + defer sp.Finish() + ... + } +``` + +#### Creating a (child) Span given an existing (parent) Span + +```go + func xyz(parentSpan opentracing.Span, ...) { + ... + sp := opentracing.StartSpan( + "operation_name", + opentracing.ChildOf(parentSpan.Context())) + defer sp.Finish() + ... + } +``` + +#### Serializing to the wire + +```go + func makeSomeRequest(ctx context.Context) ... { + if span := opentracing.SpanFromContext(ctx); span != nil { + httpClient := &http.Client{} + httpReq, _ := http.NewRequest("GET", "http://myservice/", nil) + + // Transmit the span's TraceContext as HTTP headers on our + // outbound request. + opentracing.GlobalTracer().Inject( + span.Context(), + opentracing.HTTPHeaders, + opentracing.HTTPHeadersCarrier(httpReq.Header)) + + resp, err := httpClient.Do(httpReq) + ... + } + ... + } +``` + +#### Deserializing from the wire + +```go + http.HandleFunc("/", func(w http.ResponseWriter, req *http.Request) { + var serverSpan opentracing.Span + appSpecificOperationName := ... 
+ wireContext, err := opentracing.GlobalTracer().Extract( + opentracing.HTTPHeaders, + opentracing.HTTPHeadersCarrier(req.Header)) + if err != nil { + // Optionally record something about err here + } + + // Create the span referring to the RPC client if available. + // If wireContext == nil, a root span will be created. + serverSpan = opentracing.StartSpan( + appSpecificOperationName, + ext.RPCServerOption(wireContext)) + + defer serverSpan.Finish() + + ctx := opentracing.ContextWithSpan(context.Background(), serverSpan) + ... + } +``` + +#### Conditionally capture a field using `log.Noop` + +In some situations, you may want to dynamically decide whether or not +to log a field. For example, you may want to capture additional data, +such as a customer ID, in non-production environments: + +```go + func Customer(order *Order) log.Field { + if os.Getenv("ENVIRONMENT") == "dev" { + return log.String("customer", order.Customer.ID) + } + return log.Noop() + } +``` + +#### Goroutine-safety + +The entire public API is goroutine-safe and does not require external +synchronization. + +## API pointers for those implementing a tracing system + +Tracing system implementors may be able to reuse or copy-paste-modify the `basictracer` package, found [here](https://github.com/opentracing/basictracer-go). In particular, see `basictracer.New(...)`. + +## API compatibility + +For the time being, "mild" backwards-incompatible changes may be made without changing the major version number. As OpenTracing and `opentracing-go` mature, backwards compatibility will become more of a priority. + +## Tracer test suite + +A test suite is available in the [harness](https://godoc.org/github.com/opentracing/opentracing-go/harness) package that can assist Tracer implementors to assert that their Tracer is working correctly. + +## Licensing + +[Apache 2.0 License](./LICENSE). 
diff --git a/vendor/github.com/opentracing/opentracing-go/globaltracer.go b/vendor/github.com/opentracing/opentracing-go/globaltracer.go new file mode 100644 index 000000000..4f7066a92 --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/globaltracer.go @@ -0,0 +1,42 @@ +package opentracing + +type registeredTracer struct { + tracer Tracer + isRegistered bool +} + +var ( + globalTracer = registeredTracer{NoopTracer{}, false} +) + +// SetGlobalTracer sets the [singleton] opentracing.Tracer returned by +// GlobalTracer(). Those who use GlobalTracer (rather than directly manage an +// opentracing.Tracer instance) should call SetGlobalTracer as early as +// possible in main(), prior to calling the `StartSpan` global func below. +// Prior to calling `SetGlobalTracer`, any Spans started via the `StartSpan` +// (etc) globals are noops. +func SetGlobalTracer(tracer Tracer) { + globalTracer = registeredTracer{tracer, true} +} + +// GlobalTracer returns the global singleton `Tracer` implementation. +// Before `SetGlobalTracer()` is called, the `GlobalTracer()` is a noop +// implementation that drops all data handed to it. +func GlobalTracer() Tracer { + return globalTracer.tracer +} + +// StartSpan defers to `Tracer.StartSpan`. See `GlobalTracer()`. +func StartSpan(operationName string, opts ...StartSpanOption) Span { + return globalTracer.tracer.StartSpan(operationName, opts...) +} + +// InitGlobalTracer is deprecated. Please use SetGlobalTracer. 
+func InitGlobalTracer(tracer Tracer) { + SetGlobalTracer(tracer) +} + +// IsGlobalTracerRegistered returns a `bool` to indicate if a tracer has been globally registered +func IsGlobalTracerRegistered() bool { + return globalTracer.isRegistered +} diff --git a/vendor/github.com/opentracing/opentracing-go/gocontext.go b/vendor/github.com/opentracing/opentracing-go/gocontext.go new file mode 100644 index 000000000..08c00c04e --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/gocontext.go @@ -0,0 +1,60 @@ +package opentracing + +import "context" + +type contextKey struct{} + +var activeSpanKey = contextKey{} + +// ContextWithSpan returns a new `context.Context` that holds a reference to +// `span`'s SpanContext. +func ContextWithSpan(ctx context.Context, span Span) context.Context { + return context.WithValue(ctx, activeSpanKey, span) +} + +// SpanFromContext returns the `Span` previously associated with `ctx`, or +// `nil` if no such `Span` could be found. +// +// NOTE: context.Context != SpanContext: the former is Go's intra-process +// context propagation mechanism, and the latter houses OpenTracing's per-Span +// identity and baggage information. +func SpanFromContext(ctx context.Context) Span { + val := ctx.Value(activeSpanKey) + if sp, ok := val.(Span); ok { + return sp + } + return nil +} + +// StartSpanFromContext starts and returns a Span with `operationName`, using +// any Span found within `ctx` as a ChildOfRef. If no such parent could be +// found, StartSpanFromContext creates a root (parentless) Span. +// +// The second return value is a context.Context object built around the +// returned Span. +// +// Example usage: +// +// SomeFunction(ctx context.Context, ...) { +// sp, ctx := opentracing.StartSpanFromContext(ctx, "SomeFunction") +// defer sp.Finish() +// ... 
+// } +func StartSpanFromContext(ctx context.Context, operationName string, opts ...StartSpanOption) (Span, context.Context) { + return StartSpanFromContextWithTracer(ctx, GlobalTracer(), operationName, opts...) +} + +// StartSpanFromContextWithTracer starts and returns a span with `operationName` +// using a span found within the context as a ChildOfRef. If that doesn't exist +// it creates a root span. It also returns a context.Context object built +// around the returned span. +// +// It's behavior is identical to StartSpanFromContext except that it takes an explicit +// tracer as opposed to using the global tracer. +func StartSpanFromContextWithTracer(ctx context.Context, tracer Tracer, operationName string, opts ...StartSpanOption) (Span, context.Context) { + if parentSpan := SpanFromContext(ctx); parentSpan != nil { + opts = append(opts, ChildOf(parentSpan.Context())) + } + span := tracer.StartSpan(operationName, opts...) + return span, ContextWithSpan(ctx, span) +} diff --git a/vendor/github.com/opentracing/opentracing-go/log/field.go b/vendor/github.com/opentracing/opentracing-go/log/field.go new file mode 100644 index 000000000..50feea341 --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/log/field.go @@ -0,0 +1,269 @@ +package log + +import ( + "fmt" + "math" +) + +type fieldType int + +const ( + stringType fieldType = iota + boolType + intType + int32Type + uint32Type + int64Type + uint64Type + float32Type + float64Type + errorType + objectType + lazyLoggerType + noopType +) + +// Field instances are constructed via LogBool, LogString, and so on. +// Tracing implementations may then handle them via the Field.Marshal +// method. 
+// +// "heavily influenced by" (i.e., partially stolen from) +// https://github.com/uber-go/zap +type Field struct { + key string + fieldType fieldType + numericVal int64 + stringVal string + interfaceVal interface{} +} + +// String adds a string-valued key:value pair to a Span.LogFields() record +func String(key, val string) Field { + return Field{ + key: key, + fieldType: stringType, + stringVal: val, + } +} + +// Bool adds a bool-valued key:value pair to a Span.LogFields() record +func Bool(key string, val bool) Field { + var numericVal int64 + if val { + numericVal = 1 + } + return Field{ + key: key, + fieldType: boolType, + numericVal: numericVal, + } +} + +// Int adds an int-valued key:value pair to a Span.LogFields() record +func Int(key string, val int) Field { + return Field{ + key: key, + fieldType: intType, + numericVal: int64(val), + } +} + +// Int32 adds an int32-valued key:value pair to a Span.LogFields() record +func Int32(key string, val int32) Field { + return Field{ + key: key, + fieldType: int32Type, + numericVal: int64(val), + } +} + +// Int64 adds an int64-valued key:value pair to a Span.LogFields() record +func Int64(key string, val int64) Field { + return Field{ + key: key, + fieldType: int64Type, + numericVal: val, + } +} + +// Uint32 adds a uint32-valued key:value pair to a Span.LogFields() record +func Uint32(key string, val uint32) Field { + return Field{ + key: key, + fieldType: uint32Type, + numericVal: int64(val), + } +} + +// Uint64 adds a uint64-valued key:value pair to a Span.LogFields() record +func Uint64(key string, val uint64) Field { + return Field{ + key: key, + fieldType: uint64Type, + numericVal: int64(val), + } +} + +// Float32 adds a float32-valued key:value pair to a Span.LogFields() record +func Float32(key string, val float32) Field { + return Field{ + key: key, + fieldType: float32Type, + numericVal: int64(math.Float32bits(val)), + } +} + +// Float64 adds a float64-valued key:value pair to a Span.LogFields() record 
+func Float64(key string, val float64) Field { + return Field{ + key: key, + fieldType: float64Type, + numericVal: int64(math.Float64bits(val)), + } +} + +// Error adds an error with the key "error" to a Span.LogFields() record +func Error(err error) Field { + return Field{ + key: "error", + fieldType: errorType, + interfaceVal: err, + } +} + +// Object adds an object-valued key:value pair to a Span.LogFields() record +func Object(key string, obj interface{}) Field { + return Field{ + key: key, + fieldType: objectType, + interfaceVal: obj, + } +} + +// LazyLogger allows for user-defined, late-bound logging of arbitrary data +type LazyLogger func(fv Encoder) + +// Lazy adds a LazyLogger to a Span.LogFields() record; the tracing +// implementation will call the LazyLogger function at an indefinite time in +// the future (after Lazy() returns). +func Lazy(ll LazyLogger) Field { + return Field{ + fieldType: lazyLoggerType, + interfaceVal: ll, + } +} + +// Noop creates a no-op log field that should be ignored by the tracer. +// It can be used to capture optional fields, for example those that should +// only be logged in non-production environment: +// +// func customerField(order *Order) log.Field { +// if os.Getenv("ENVIRONMENT") == "dev" { +// return log.String("customer", order.Customer.ID) +// } +// return log.Noop() +// } +// +// span.LogFields(log.String("event", "purchase"), customerField(order)) +// +func Noop() Field { + return Field{ + fieldType: noopType, + } +} + +// Encoder allows access to the contents of a Field (via a call to +// Field.Marshal). +// +// Tracer implementations typically provide an implementation of Encoder; +// OpenTracing callers typically do not need to concern themselves with it. 
+type Encoder interface { + EmitString(key, value string) + EmitBool(key string, value bool) + EmitInt(key string, value int) + EmitInt32(key string, value int32) + EmitInt64(key string, value int64) + EmitUint32(key string, value uint32) + EmitUint64(key string, value uint64) + EmitFloat32(key string, value float32) + EmitFloat64(key string, value float64) + EmitObject(key string, value interface{}) + EmitLazyLogger(value LazyLogger) +} + +// Marshal passes a Field instance through to the appropriate +// field-type-specific method of an Encoder. +func (lf Field) Marshal(visitor Encoder) { + switch lf.fieldType { + case stringType: + visitor.EmitString(lf.key, lf.stringVal) + case boolType: + visitor.EmitBool(lf.key, lf.numericVal != 0) + case intType: + visitor.EmitInt(lf.key, int(lf.numericVal)) + case int32Type: + visitor.EmitInt32(lf.key, int32(lf.numericVal)) + case int64Type: + visitor.EmitInt64(lf.key, int64(lf.numericVal)) + case uint32Type: + visitor.EmitUint32(lf.key, uint32(lf.numericVal)) + case uint64Type: + visitor.EmitUint64(lf.key, uint64(lf.numericVal)) + case float32Type: + visitor.EmitFloat32(lf.key, math.Float32frombits(uint32(lf.numericVal))) + case float64Type: + visitor.EmitFloat64(lf.key, math.Float64frombits(uint64(lf.numericVal))) + case errorType: + if err, ok := lf.interfaceVal.(error); ok { + visitor.EmitString(lf.key, err.Error()) + } else { + visitor.EmitString(lf.key, "") + } + case objectType: + visitor.EmitObject(lf.key, lf.interfaceVal) + case lazyLoggerType: + visitor.EmitLazyLogger(lf.interfaceVal.(LazyLogger)) + case noopType: + // intentionally left blank + } +} + +// Key returns the field's key. +func (lf Field) Key() string { + return lf.key +} + +// Value returns the field's value as interface{}. 
+func (lf Field) Value() interface{} { + switch lf.fieldType { + case stringType: + return lf.stringVal + case boolType: + return lf.numericVal != 0 + case intType: + return int(lf.numericVal) + case int32Type: + return int32(lf.numericVal) + case int64Type: + return int64(lf.numericVal) + case uint32Type: + return uint32(lf.numericVal) + case uint64Type: + return uint64(lf.numericVal) + case float32Type: + return math.Float32frombits(uint32(lf.numericVal)) + case float64Type: + return math.Float64frombits(uint64(lf.numericVal)) + case errorType, objectType, lazyLoggerType: + return lf.interfaceVal + case noopType: + return nil + default: + return nil + } +} + +// String returns a string representation of the key and value. +func (lf Field) String() string { + return fmt.Sprint(lf.key, ":", lf.Value()) +} diff --git a/vendor/github.com/opentracing/opentracing-go/log/util.go b/vendor/github.com/opentracing/opentracing-go/log/util.go new file mode 100644 index 000000000..3832feb5c --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/log/util.go @@ -0,0 +1,54 @@ +package log + +import "fmt" + +// InterleavedKVToFields converts keyValues a la Span.LogKV() to a Field slice +// a la Span.LogFields(). 
+func InterleavedKVToFields(keyValues ...interface{}) ([]Field, error) { + if len(keyValues)%2 != 0 { + return nil, fmt.Errorf("non-even keyValues len: %d", len(keyValues)) + } + fields := make([]Field, len(keyValues)/2) + for i := 0; i*2 < len(keyValues); i++ { + key, ok := keyValues[i*2].(string) + if !ok { + return nil, fmt.Errorf( + "non-string key (pair #%d): %T", + i, keyValues[i*2]) + } + switch typedVal := keyValues[i*2+1].(type) { + case bool: + fields[i] = Bool(key, typedVal) + case string: + fields[i] = String(key, typedVal) + case int: + fields[i] = Int(key, typedVal) + case int8: + fields[i] = Int32(key, int32(typedVal)) + case int16: + fields[i] = Int32(key, int32(typedVal)) + case int32: + fields[i] = Int32(key, typedVal) + case int64: + fields[i] = Int64(key, typedVal) + case uint: + fields[i] = Uint64(key, uint64(typedVal)) + case uint64: + fields[i] = Uint64(key, typedVal) + case uint8: + fields[i] = Uint32(key, uint32(typedVal)) + case uint16: + fields[i] = Uint32(key, uint32(typedVal)) + case uint32: + fields[i] = Uint32(key, typedVal) + case float32: + fields[i] = Float32(key, typedVal) + case float64: + fields[i] = Float64(key, typedVal) + default: + // When in doubt, coerce to a string + fields[i] = String(key, fmt.Sprint(typedVal)) + } + } + return fields, nil +} diff --git a/vendor/github.com/opentracing/opentracing-go/noop.go b/vendor/github.com/opentracing/opentracing-go/noop.go new file mode 100644 index 000000000..0d32f692c --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/noop.go @@ -0,0 +1,64 @@ +package opentracing + +import "github.com/opentracing/opentracing-go/log" + +// A NoopTracer is a trivial, minimum overhead implementation of Tracer +// for which all operations are no-ops. +// +// The primary use of this implementation is in libraries, such as RPC +// frameworks, that make tracing an optional feature controlled by the +// end user. 
A no-op implementation allows said libraries to use it +// as the default Tracer and to write instrumentation that does +// not need to keep checking if the tracer instance is nil. +// +// For the same reason, the NoopTracer is the default "global" tracer +// (see GlobalTracer and SetGlobalTracer functions). +// +// WARNING: NoopTracer does not support baggage propagation. +type NoopTracer struct{} + +type noopSpan struct{} +type noopSpanContext struct{} + +var ( + defaultNoopSpanContext = noopSpanContext{} + defaultNoopSpan = noopSpan{} + defaultNoopTracer = NoopTracer{} +) + +const ( + emptyString = "" +) + +// noopSpanContext: +func (n noopSpanContext) ForeachBaggageItem(handler func(k, v string) bool) {} + +// noopSpan: +func (n noopSpan) Context() SpanContext { return defaultNoopSpanContext } +func (n noopSpan) SetBaggageItem(key, val string) Span { return defaultNoopSpan } +func (n noopSpan) BaggageItem(key string) string { return emptyString } +func (n noopSpan) SetTag(key string, value interface{}) Span { return n } +func (n noopSpan) LogFields(fields ...log.Field) {} +func (n noopSpan) LogKV(keyVals ...interface{}) {} +func (n noopSpan) Finish() {} +func (n noopSpan) FinishWithOptions(opts FinishOptions) {} +func (n noopSpan) SetOperationName(operationName string) Span { return n } +func (n noopSpan) Tracer() Tracer { return defaultNoopTracer } +func (n noopSpan) LogEvent(event string) {} +func (n noopSpan) LogEventWithPayload(event string, payload interface{}) {} +func (n noopSpan) Log(data LogData) {} + +// StartSpan belongs to the Tracer interface. +func (n NoopTracer) StartSpan(operationName string, opts ...StartSpanOption) Span { + return defaultNoopSpan +} + +// Inject belongs to the Tracer interface. +func (n NoopTracer) Inject(sp SpanContext, format interface{}, carrier interface{}) error { + return nil +} + +// Extract belongs to the Tracer interface. 
+func (n NoopTracer) Extract(format interface{}, carrier interface{}) (SpanContext, error) { + return nil, ErrSpanContextNotFound +} diff --git a/vendor/github.com/opentracing/opentracing-go/propagation.go b/vendor/github.com/opentracing/opentracing-go/propagation.go new file mode 100644 index 000000000..b0c275eb0 --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/propagation.go @@ -0,0 +1,176 @@ +package opentracing + +import ( + "errors" + "net/http" +) + +/////////////////////////////////////////////////////////////////////////////// +// CORE PROPAGATION INTERFACES: +/////////////////////////////////////////////////////////////////////////////// + +var ( + // ErrUnsupportedFormat occurs when the `format` passed to Tracer.Inject() or + // Tracer.Extract() is not recognized by the Tracer implementation. + ErrUnsupportedFormat = errors.New("opentracing: Unknown or unsupported Inject/Extract format") + + // ErrSpanContextNotFound occurs when the `carrier` passed to + // Tracer.Extract() is valid and uncorrupted but has insufficient + // information to extract a SpanContext. + ErrSpanContextNotFound = errors.New("opentracing: SpanContext not found in Extract carrier") + + // ErrInvalidSpanContext errors occur when Tracer.Inject() is asked to + // operate on a SpanContext which it is not prepared to handle (for + // example, since it was created by a different tracer implementation). + ErrInvalidSpanContext = errors.New("opentracing: SpanContext type incompatible with tracer") + + // ErrInvalidCarrier errors occur when Tracer.Inject() or Tracer.Extract() + // implementations expect a different type of `carrier` than they are + // given. + ErrInvalidCarrier = errors.New("opentracing: Invalid Inject/Extract carrier") + + // ErrSpanContextCorrupted occurs when the `carrier` passed to + // Tracer.Extract() is of the expected type but is corrupted. 
+ ErrSpanContextCorrupted = errors.New("opentracing: SpanContext data corrupted in Extract carrier") +) + +/////////////////////////////////////////////////////////////////////////////// +// BUILTIN PROPAGATION FORMATS: +/////////////////////////////////////////////////////////////////////////////// + +// BuiltinFormat is used to demarcate the values within package `opentracing` +// that are intended for use with the Tracer.Inject() and Tracer.Extract() +// methods. +type BuiltinFormat byte + +const ( + // Binary represents SpanContexts as opaque binary data. + // + // For Tracer.Inject(): the carrier must be an `io.Writer`. + // + // For Tracer.Extract(): the carrier must be an `io.Reader`. + Binary BuiltinFormat = iota + + // TextMap represents SpanContexts as key:value string pairs. + // + // Unlike HTTPHeaders, the TextMap format does not restrict the key or + // value character sets in any way. + // + // For Tracer.Inject(): the carrier must be a `TextMapWriter`. + // + // For Tracer.Extract(): the carrier must be a `TextMapReader`. + TextMap + + // HTTPHeaders represents SpanContexts as HTTP header string pairs. + // + // Unlike TextMap, the HTTPHeaders format requires that the keys and values + // be valid as HTTP headers as-is (i.e., character casing may be unstable + // and special characters are disallowed in keys, values should be + // URL-escaped, etc). + // + // For Tracer.Inject(): the carrier must be a `TextMapWriter`. + // + // For Tracer.Extract(): the carrier must be a `TextMapReader`. + // + // See HTTPHeadersCarrier for an implementation of both TextMapWriter + // and TextMapReader that defers to an http.Header instance for storage. 
+ // For example, Inject(): + // + // carrier := opentracing.HTTPHeadersCarrier(httpReq.Header) + // err := span.Tracer().Inject( + // span.Context(), opentracing.HTTPHeaders, carrier) + // + // Or Extract(): + // + // carrier := opentracing.HTTPHeadersCarrier(httpReq.Header) + // clientContext, err := tracer.Extract( + // opentracing.HTTPHeaders, carrier) + // + HTTPHeaders +) + +// TextMapWriter is the Inject() carrier for the TextMap builtin format. With +// it, the caller can encode a SpanContext for propagation as entries in a map +// of unicode strings. +type TextMapWriter interface { + // Set a key:value pair to the carrier. Multiple calls to Set() for the + // same key leads to undefined behavior. + // + // NOTE: The backing store for the TextMapWriter may contain data unrelated + // to SpanContext. As such, Inject() and Extract() implementations that + // call the TextMapWriter and TextMapReader interfaces must agree on a + // prefix or other convention to distinguish their own key:value pairs. + Set(key, val string) +} + +// TextMapReader is the Extract() carrier for the TextMap builtin format. With it, +// the caller can decode a propagated SpanContext as entries in a map of +// unicode strings. +type TextMapReader interface { + // ForeachKey returns TextMap contents via repeated calls to the `handler` + // function. If any call to `handler` returns a non-nil error, ForeachKey + // terminates and returns that error. + // + // NOTE: The backing store for the TextMapReader may contain data unrelated + // to SpanContext. As such, Inject() and Extract() implementations that + // call the TextMapWriter and TextMapReader interfaces must agree on a + // prefix or other convention to distinguish their own key:value pairs. + // + // The "foreach" callback pattern reduces unnecessary copying in some cases + // and also allows implementations to hold locks while the map is read. 
+ ForeachKey(handler func(key, val string) error) error +} + +// TextMapCarrier allows the use of regular map[string]string +// as both TextMapWriter and TextMapReader. +type TextMapCarrier map[string]string + +// ForeachKey conforms to the TextMapReader interface. +func (c TextMapCarrier) ForeachKey(handler func(key, val string) error) error { + for k, v := range c { + if err := handler(k, v); err != nil { + return err + } + } + return nil +} + +// Set implements Set() of opentracing.TextMapWriter +func (c TextMapCarrier) Set(key, val string) { + c[key] = val +} + +// HTTPHeadersCarrier satisfies both TextMapWriter and TextMapReader. +// +// Example usage for server side: +// +// carrier := opentracing.HTTPHeadersCarrier(httpReq.Header) +// clientContext, err := tracer.Extract(opentracing.HTTPHeaders, carrier) +// +// Example usage for client side: +// +// carrier := opentracing.HTTPHeadersCarrier(httpReq.Header) +// err := tracer.Inject( +// span.Context(), +// opentracing.HTTPHeaders, +// carrier) +// +type HTTPHeadersCarrier http.Header + +// Set conforms to the TextMapWriter interface. +func (c HTTPHeadersCarrier) Set(key, val string) { + h := http.Header(c) + h.Set(key, val) +} + +// ForeachKey conforms to the TextMapReader interface. +func (c HTTPHeadersCarrier) ForeachKey(handler func(key, val string) error) error { + for k, vals := range c { + for _, v := range vals { + if err := handler(k, v); err != nil { + return err + } + } + } + return nil +} diff --git a/vendor/github.com/opentracing/opentracing-go/span.go b/vendor/github.com/opentracing/opentracing-go/span.go new file mode 100644 index 000000000..0d3fb5341 --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/span.go @@ -0,0 +1,189 @@ +package opentracing + +import ( + "time" + + "github.com/opentracing/opentracing-go/log" +) + +// SpanContext represents Span state that must propagate to descendant Spans and across process +// boundaries (e.g., a tuple). 
+type SpanContext interface { + // ForeachBaggageItem grants access to all baggage items stored in the + // SpanContext. + // The handler function will be called for each baggage key/value pair. + // The ordering of items is not guaranteed. + // + // The bool return value indicates if the handler wants to continue iterating + // through the rest of the baggage items; for example if the handler is trying to + // find some baggage item by pattern matching the name, it can return false + // as soon as the item is found to stop further iterations. + ForeachBaggageItem(handler func(k, v string) bool) +} + +// Span represents an active, un-finished span in the OpenTracing system. +// +// Spans are created by the Tracer interface. +type Span interface { + // Sets the end timestamp and finalizes Span state. + // + // With the exception of calls to Context() (which are always allowed), + // Finish() must be the last call made to any span instance, and to do + // otherwise leads to undefined behavior. + Finish() + // FinishWithOptions is like Finish() but with explicit control over + // timestamps and log data. + FinishWithOptions(opts FinishOptions) + + // Context() yields the SpanContext for this Span. Note that the return + // value of Context() is still valid after a call to Span.Finish(), as is + // a call to Span.Context() after a call to Span.Finish(). + Context() SpanContext + + // Sets or changes the operation name. + // + // Returns a reference to this Span for chaining. + SetOperationName(operationName string) Span + + // Adds a tag to the span. + // + // If there is a pre-existing tag set for `key`, it is overwritten. + // + // Tag values can be numeric types, strings, or bools. The behavior of + // other tag value types is undefined at the OpenTracing level. If a + // tracing system does not know how to handle a particular value type, it + // may ignore the tag, but shall not panic. + // + // Returns a reference to this Span for chaining. 
+ SetTag(key string, value interface{}) Span + + // LogFields is an efficient and type-checked way to record key:value + // logging data about a Span, though the programming interface is a little + // more verbose than LogKV(). Here's an example: + // + // span.LogFields( + // log.String("event", "soft error"), + // log.String("type", "cache timeout"), + // log.Int("waited.millis", 1500)) + // + // Also see Span.FinishWithOptions() and FinishOptions.BulkLogData. + LogFields(fields ...log.Field) + + // LogKV is a concise, readable way to record key:value logging data about + // a Span, though unfortunately this also makes it less efficient and less + // type-safe than LogFields(). Here's an example: + // + // span.LogKV( + // "event", "soft error", + // "type", "cache timeout", + // "waited.millis", 1500) + // + // For LogKV (as opposed to LogFields()), the parameters must appear as + // key-value pairs, like + // + // span.LogKV(key1, val1, key2, val2, key3, val3, ...) + // + // The keys must all be strings. The values may be strings, numeric types, + // bools, Go error instances, or arbitrary structs. + // + // (Note to implementors: consider the log.InterleavedKVToFields() helper) + LogKV(alternatingKeyValues ...interface{}) + + // SetBaggageItem sets a key:value pair on this Span and its SpanContext + // that also propagates to descendants of this Span. + // + // SetBaggageItem() enables powerful functionality given a full-stack + // opentracing integration (e.g., arbitrary application data from a mobile + // app can make it, transparently, all the way into the depths of a storage + // system), and with it some powerful costs: use this feature with care. + // + // IMPORTANT NOTE #1: SetBaggageItem() will only propagate baggage items to + // *future* causal descendants of the associated Span. + // + // IMPORTANT NOTE #2: Use this thoughtfully and with care. 
Every key and + // value is copied into every local *and remote* child of the associated + // Span, and that can add up to a lot of network and cpu overhead. + // + // Returns a reference to this Span for chaining. + SetBaggageItem(restrictedKey, value string) Span + + // Gets the value for a baggage item given its key. Returns the empty string + // if the value isn't found in this Span. + BaggageItem(restrictedKey string) string + + // Provides access to the Tracer that created this Span. + Tracer() Tracer + + // Deprecated: use LogFields or LogKV + LogEvent(event string) + // Deprecated: use LogFields or LogKV + LogEventWithPayload(event string, payload interface{}) + // Deprecated: use LogFields or LogKV + Log(data LogData) +} + +// LogRecord is data associated with a single Span log. Every LogRecord +// instance must specify at least one Field. +type LogRecord struct { + Timestamp time.Time + Fields []log.Field +} + +// FinishOptions allows Span.FinishWithOptions callers to override the finish +// timestamp and provide log data via a bulk interface. +type FinishOptions struct { + // FinishTime overrides the Span's finish time, or implicitly becomes + // time.Now() if FinishTime.IsZero(). + // + // FinishTime must resolve to a timestamp that's >= the Span's StartTime + // (per StartSpanOptions). + FinishTime time.Time + + // LogRecords allows the caller to specify the contents of many LogFields() + // calls with a single slice. May be nil. + // + // None of the LogRecord.Timestamp values may be .IsZero() (i.e., they must + // be set explicitly). Also, they must be >= the Span's start timestamp and + // <= the FinishTime (or time.Now() if FinishTime.IsZero()). Otherwise the + // behavior of FinishWithOptions() is undefined. + // + // If specified, the caller hands off ownership of LogRecords at + // FinishWithOptions() invocation time. + // + // If specified, the (deprecated) BulkLogData must be nil or empty. 
+ LogRecords []LogRecord + + // BulkLogData is DEPRECATED. + BulkLogData []LogData +} + +// LogData is DEPRECATED +type LogData struct { + Timestamp time.Time + Event string + Payload interface{} +} + +// ToLogRecord converts a deprecated LogData to a non-deprecated LogRecord +func (ld *LogData) ToLogRecord() LogRecord { + var literalTimestamp time.Time + if ld.Timestamp.IsZero() { + literalTimestamp = time.Now() + } else { + literalTimestamp = ld.Timestamp + } + rval := LogRecord{ + Timestamp: literalTimestamp, + } + if ld.Payload == nil { + rval.Fields = []log.Field{ + log.String("event", ld.Event), + } + } else { + rval.Fields = []log.Field{ + log.String("event", ld.Event), + log.Object("payload", ld.Payload), + } + } + return rval +} diff --git a/vendor/github.com/opentracing/opentracing-go/tracer.go b/vendor/github.com/opentracing/opentracing-go/tracer.go new file mode 100644 index 000000000..715f0cedf --- /dev/null +++ b/vendor/github.com/opentracing/opentracing-go/tracer.go @@ -0,0 +1,304 @@ +package opentracing + +import "time" + +// Tracer is a simple, thin interface for Span creation and SpanContext +// propagation. +type Tracer interface { + + // Create, start, and return a new Span with the given `operationName` and + // incorporate the given StartSpanOption `opts`. (Note that `opts` borrows + // from the "functional options" pattern, per + // http://dave.cheney.net/2014/10/17/functional-options-for-friendly-apis) + // + // A Span with no SpanReference options (e.g., opentracing.ChildOf() or + // opentracing.FollowsFrom()) becomes the root of its own trace. + // + // Examples: + // + // var tracer opentracing.Tracer = ... 
+ // + // // The root-span case: + // sp := tracer.StartSpan("GetFeed") + // + // // The vanilla child span case: + // sp := tracer.StartSpan( + // "GetFeed", + // opentracing.ChildOf(parentSpan.Context())) + // + // // All the bells and whistles: + // sp := tracer.StartSpan( + // "GetFeed", + // opentracing.ChildOf(parentSpan.Context()), + // opentracing.Tag{"user_agent", loggedReq.UserAgent}, + // opentracing.StartTime(loggedReq.Timestamp), + // ) + // + StartSpan(operationName string, opts ...StartSpanOption) Span + + // Inject() takes the `sm` SpanContext instance and injects it for + // propagation within `carrier`. The actual type of `carrier` depends on + // the value of `format`. + // + // OpenTracing defines a common set of `format` values (see BuiltinFormat), + // and each has an expected carrier type. + // + // Other packages may declare their own `format` values, much like the keys + // used by `context.Context` (see https://godoc.org/context#WithValue). + // + // Example usage (sans error handling): + // + // carrier := opentracing.HTTPHeadersCarrier(httpReq.Header) + // err := tracer.Inject( + // span.Context(), + // opentracing.HTTPHeaders, + // carrier) + // + // NOTE: All opentracing.Tracer implementations MUST support all + // BuiltinFormats. + // + // Implementations may return opentracing.ErrUnsupportedFormat if `format` + // is not supported by (or not known by) the implementation. + // + // Implementations may return opentracing.ErrInvalidCarrier or any other + // implementation-specific error if the format is supported but injection + // fails anyway. + // + // See Tracer.Extract(). + Inject(sm SpanContext, format interface{}, carrier interface{}) error + + // Extract() returns a SpanContext instance given `format` and `carrier`. + // + // OpenTracing defines a common set of `format` values (see BuiltinFormat), + // and each has an expected carrier type. 
+ // + // Other packages may declare their own `format` values, much like the keys + // used by `context.Context` (see + // https://godoc.org/golang.org/x/net/context#WithValue). + // + // Example usage (with StartSpan): + // + // + // carrier := opentracing.HTTPHeadersCarrier(httpReq.Header) + // clientContext, err := tracer.Extract(opentracing.HTTPHeaders, carrier) + // + // // ... assuming the ultimate goal here is to resume the trace with a + // // server-side Span: + // var serverSpan opentracing.Span + // if err == nil { + // span = tracer.StartSpan( + // rpcMethodName, ext.RPCServerOption(clientContext)) + // } else { + // span = tracer.StartSpan(rpcMethodName) + // } + // + // + // NOTE: All opentracing.Tracer implementations MUST support all + // BuiltinFormats. + // + // Return values: + // - A successful Extract returns a SpanContext instance and a nil error + // - If there was simply no SpanContext to extract in `carrier`, Extract() + // returns (nil, opentracing.ErrSpanContextNotFound) + // - If `format` is unsupported or unrecognized, Extract() returns (nil, + // opentracing.ErrUnsupportedFormat) + // - If there are more fundamental problems with the `carrier` object, + // Extract() may return opentracing.ErrInvalidCarrier, + // opentracing.ErrSpanContextCorrupted, or implementation-specific + // errors. + // + // See Tracer.Inject(). + Extract(format interface{}, carrier interface{}) (SpanContext, error) +} + +// StartSpanOptions allows Tracer.StartSpan() callers and implementors a +// mechanism to override the start timestamp, specify Span References, and make +// a single Tag or multiple Tags available at Span start time. +// +// StartSpan() callers should look at the StartSpanOption interface and +// implementations available in this package. 
+// +// Tracer implementations can convert a slice of `StartSpanOption` instances +// into a `StartSpanOptions` struct like so: +// +// func StartSpan(opName string, opts ...opentracing.StartSpanOption) { +// sso := opentracing.StartSpanOptions{} +// for _, o := range opts { +// o.Apply(&sso) +// } +// ... +// } +// +type StartSpanOptions struct { + // Zero or more causal references to other Spans (via their SpanContext). + // If empty, start a "root" Span (i.e., start a new trace). + References []SpanReference + + // StartTime overrides the Span's start time, or implicitly becomes + // time.Now() if StartTime.IsZero(). + StartTime time.Time + + // Tags may have zero or more entries; the restrictions on map values are + // identical to those for Span.SetTag(). May be nil. + // + // If specified, the caller hands off ownership of Tags at + // StartSpan() invocation time. + Tags map[string]interface{} +} + +// StartSpanOption instances (zero or more) may be passed to Tracer.StartSpan. +// +// StartSpanOption borrows from the "functional options" pattern, per +// http://dave.cheney.net/2014/10/17/functional-options-for-friendly-apis +type StartSpanOption interface { + Apply(*StartSpanOptions) +} + +// SpanReferenceType is an enum type describing different categories of +// relationships between two Spans. If Span-2 refers to Span-1, the +// SpanReferenceType describes Span-1 from Span-2's perspective. For example, +// ChildOfRef means that Span-1 created Span-2. +// +// NOTE: Span-1 and Span-2 do *not* necessarily depend on each other for +// completion; e.g., Span-2 may be part of a background job enqueued by Span-1, +// or Span-2 may be sitting in a distributed queue behind Span-1. +type SpanReferenceType int + +const ( + // ChildOfRef refers to a parent Span that caused *and* somehow depends + // upon the new child Span. Often (but not always), the parent Span cannot + // finish until the child Span does. 
+ // + // An timing diagram for a ChildOfRef that's blocked on the new Span: + // + // [-Parent Span---------] + // [-Child Span----] + // + // See http://opentracing.io/spec/ + // + // See opentracing.ChildOf() + ChildOfRef SpanReferenceType = iota + + // FollowsFromRef refers to a parent Span that does not depend in any way + // on the result of the new child Span. For instance, one might use + // FollowsFromRefs to describe pipeline stages separated by queues, + // or a fire-and-forget cache insert at the tail end of a web request. + // + // A FollowsFromRef Span is part of the same logical trace as the new Span: + // i.e., the new Span is somehow caused by the work of its FollowsFromRef. + // + // All of the following could be valid timing diagrams for children that + // "FollowFrom" a parent. + // + // [-Parent Span-] [-Child Span-] + // + // + // [-Parent Span--] + // [-Child Span-] + // + // + // [-Parent Span-] + // [-Child Span-] + // + // See http://opentracing.io/spec/ + // + // See opentracing.FollowsFrom() + FollowsFromRef +) + +// SpanReference is a StartSpanOption that pairs a SpanReferenceType and a +// referenced SpanContext. See the SpanReferenceType documentation for +// supported relationships. If SpanReference is created with +// ReferencedContext==nil, it has no effect. Thus it allows for a more concise +// syntax for starting spans: +// +// sc, _ := tracer.Extract(someFormat, someCarrier) +// span := tracer.StartSpan("operation", opentracing.ChildOf(sc)) +// +// The `ChildOf(sc)` option above will not panic if sc == nil, it will just +// not add the parent span reference to the options. +type SpanReference struct { + Type SpanReferenceType + ReferencedContext SpanContext +} + +// Apply satisfies the StartSpanOption interface. +func (r SpanReference) Apply(o *StartSpanOptions) { + if r.ReferencedContext != nil { + o.References = append(o.References, r) + } +} + +// ChildOf returns a StartSpanOption pointing to a dependent parent span. 
+// If sc == nil, the option has no effect. +// +// See ChildOfRef, SpanReference +func ChildOf(sc SpanContext) SpanReference { + return SpanReference{ + Type: ChildOfRef, + ReferencedContext: sc, + } +} + +// FollowsFrom returns a StartSpanOption pointing to a parent Span that caused +// the child Span but does not directly depend on its result in any way. +// If sc == nil, the option has no effect. +// +// See FollowsFromRef, SpanReference +func FollowsFrom(sc SpanContext) SpanReference { + return SpanReference{ + Type: FollowsFromRef, + ReferencedContext: sc, + } +} + +// StartTime is a StartSpanOption that sets an explicit start timestamp for the +// new Span. +type StartTime time.Time + +// Apply satisfies the StartSpanOption interface. +func (t StartTime) Apply(o *StartSpanOptions) { + o.StartTime = time.Time(t) +} + +// Tags are a generic map from an arbitrary string key to an opaque value type. +// The underlying tracing system is responsible for interpreting and +// serializing the values. +type Tags map[string]interface{} + +// Apply satisfies the StartSpanOption interface. +func (t Tags) Apply(o *StartSpanOptions) { + if o.Tags == nil { + o.Tags = make(map[string]interface{}) + } + for k, v := range t { + o.Tags[k] = v + } +} + +// Tag may be passed as a StartSpanOption to add a tag to new spans, +// or its Set method may be used to apply the tag to an existing Span, +// for example: +// +// tracer.StartSpan("opName", Tag{"Key", value}) +// +// or +// +// Tag{"key", value}.Set(span) +type Tag struct { + Key string + Value interface{} +} + +// Apply satisfies the StartSpanOption interface. +func (t Tag) Apply(o *StartSpanOptions) { + if o.Tags == nil { + o.Tags = make(map[string]interface{}) + } + o.Tags[t.Key] = t.Value +} + +// Set applies the tag to an existing Span. 
+func (t Tag) Set(s Span) { + s.SetTag(t.Key, t.Value) +} diff --git a/vendor/github.com/prometheus/client_golang/api/client.go b/vendor/github.com/prometheus/client_golang/api/client.go index db78ce230..2e6a5518e 100644 --- a/vendor/github.com/prometheus/client_golang/api/client.go +++ b/vendor/github.com/prometheus/client_golang/api/client.go @@ -25,6 +25,42 @@ import ( "time" ) +func NewErrorAPI(err error, warnings []string) Error { + if err == nil && warnings == nil { + return nil + } + return &ErrorAPI{err, warnings} +} + +type ErrorAPI struct { + err error + warnings []string +} + +func (w *ErrorAPI) Err() error { + return w.err +} + +func (w *ErrorAPI) Error() string { + if w.err != nil { + return w.err.Error() + } + return "Warnings: " + strings.Join(w.warnings, " , ") +} + +func (w *ErrorAPI) Warnings() []string { + return w.warnings +} + +// Error encapsulates an error + warning +type Error interface { + error + // Err returns the underlying error. + Err() error + // Warnings returns a list of warnings. + Warnings() []string +} + // DefaultRoundTripper is used if no RoundTripper is set in Config. var DefaultRoundTripper http.RoundTripper = &http.Transport{ Proxy: http.ProxyFromEnvironment, @@ -55,14 +91,14 @@ func (cfg *Config) roundTripper() http.RoundTripper { // Client is the interface for an API client. type Client interface { URL(ep string, args map[string]string) *url.URL - Do(context.Context, *http.Request) (*http.Response, []byte, error) + Do(context.Context, *http.Request) (*http.Response, []byte, Error) } // DoGetFallback will attempt to do the request as-is, and on a 405 it will fallback to a GET request. 
-func DoGetFallback(c Client, ctx context.Context, u *url.URL, args url.Values) (*http.Response, []byte, error) { +func DoGetFallback(c Client, ctx context.Context, u *url.URL, args url.Values) (*http.Response, []byte, Error) { req, err := http.NewRequest(http.MethodPost, u.String(), strings.NewReader(args.Encode())) if err != nil { - return nil, nil, err + return nil, nil, NewErrorAPI(err, nil) } req.Header.Set("Content-Type", "application/x-www-form-urlencoded") @@ -71,11 +107,14 @@ func DoGetFallback(c Client, ctx context.Context, u *url.URL, args url.Values) ( u.RawQuery = args.Encode() req, err = http.NewRequest(http.MethodGet, u.String(), nil) if err != nil { - return nil, nil, err + return nil, nil, NewErrorAPI(err, nil) } } else { - return resp, body, err + if err != nil { + return resp, body, NewErrorAPI(err, nil) + } + return resp, body, nil } return c.Do(ctx, req) } @@ -115,7 +154,7 @@ func (c *httpClient) URL(ep string, args map[string]string) *url.URL { return &u } -func (c *httpClient) Do(ctx context.Context, req *http.Request) (*http.Response, []byte, error) { +func (c *httpClient) Do(ctx context.Context, req *http.Request) (*http.Response, []byte, Error) { if ctx != nil { req = req.WithContext(ctx) } @@ -127,7 +166,7 @@ func (c *httpClient) Do(ctx context.Context, req *http.Request) (*http.Response, }() if err != nil { - return nil, nil, err + return nil, nil, NewErrorAPI(err, nil) } var body []byte @@ -147,5 +186,5 @@ func (c *httpClient) Do(ctx context.Context, req *http.Request) (*http.Response, case <-done: } - return resp, body, err + return resp, body, NewErrorAPI(err, nil) } diff --git a/vendor/github.com/prometheus/client_golang/api/prometheus/v1/api.go b/vendor/github.com/prometheus/client_golang/api/prometheus/v1/api.go index 8394c97af..28cdaef69 100644 --- a/vendor/github.com/prometheus/client_golang/api/prometheus/v1/api.go +++ b/vendor/github.com/prometheus/client_golang/api/prometheus/v1/api.go @@ -17,17 +17,105 @@ package v1 import ( 
"context" - "encoding/json" "errors" "fmt" + "math" "net/http" "strconv" + "strings" "time" + "unsafe" + + json "github.com/json-iterator/go" + + "github.com/prometheus/common/model" "github.com/prometheus/client_golang/api" - "github.com/prometheus/common/model" ) +func init() { + json.RegisterTypeEncoderFunc("model.SamplePair", marshalPointJSON, marshalPointJSONIsEmpty) + json.RegisterTypeDecoderFunc("model.SamplePair", unMarshalPointJSON) +} + +func unMarshalPointJSON(ptr unsafe.Pointer, iter *json.Iterator) { + p := (*model.SamplePair)(ptr) + if !iter.ReadArray() { + iter.ReportError("unmarshal model.SamplePair", "SamplePair must be [timestamp, value]") + return + } + t := iter.ReadNumber() + if err := p.Timestamp.UnmarshalJSON([]byte(t)); err != nil { + iter.ReportError("unmarshal model.SamplePair", err.Error()) + return + } + if !iter.ReadArray() { + iter.ReportError("unmarshal model.SamplePair", "SamplePair missing value") + return + } + + f, err := strconv.ParseFloat(iter.ReadString(), 64) + if err != nil { + iter.ReportError("unmarshal model.SamplePair", err.Error()) + return + } + p.Value = model.SampleValue(f) + + if iter.ReadArray() { + iter.ReportError("unmarshal model.SamplePair", "SamplePair has too many values, must be [timestamp, value]") + return + } +} + +func marshalPointJSON(ptr unsafe.Pointer, stream *json.Stream) { + p := *((*model.SamplePair)(ptr)) + stream.WriteArrayStart() + // Write out the timestamp as a float divided by 1000. + // This is ~3x faster than converting to a float. 
+ t := int64(p.Timestamp) + if t < 0 { + stream.WriteRaw(`-`) + t = -t + } + stream.WriteInt64(t / 1000) + fraction := t % 1000 + if fraction != 0 { + stream.WriteRaw(`.`) + if fraction < 100 { + stream.WriteRaw(`0`) + } + if fraction < 10 { + stream.WriteRaw(`0`) + } + stream.WriteInt64(fraction) + } + stream.WriteMore() + stream.WriteRaw(`"`) + + // Taken from https://github.com/json-iterator/go/blob/master/stream_float.go#L71 as a workaround + // to https://github.com/json-iterator/go/issues/365 (jsoniter, to follow json standard, doesn't allow inf/nan) + buf := stream.Buffer() + abs := math.Abs(float64(p.Value)) + fmt := byte('f') + // Note: Must use float32 comparisons for underlying float32 value to get precise cutoffs right. + if abs != 0 { + if abs < 1e-6 || abs >= 1e21 { + fmt = 'e' + } + } + buf = strconv.AppendFloat(buf, float64(p.Value), fmt, -1, 64) + stream.SetBuffer(buf) + + stream.WriteRaw(`"`) + stream.WriteArrayEnd() + +} + +func marshalPointJSONIsEmpty(ptr unsafe.Pointer) bool { + return false +} + const ( statusAPIError = 422 @@ -40,6 +128,7 @@ const ( epLabelValues = apiPrefix + "/label/:name/values" epSeries = apiPrefix + "/series" epTargets = apiPrefix + "/targets" + epTargetsMetadata = apiPrefix + "/targets/metadata" epRules = apiPrefix + "/rules" epSnapshot = apiPrefix + "/admin/tsdb/snapshot" epDeleteSeries = apiPrefix + "/admin/tsdb/delete_series" @@ -63,6 +152,9 @@ type RuleType string // RuleHealth models the health status of a rule. type RuleHealth string +// MetricType models the type of a metric. +type MetricType string + const ( // Possible values for AlertState. 
AlertStateFiring AlertState = "firing" @@ -91,17 +183,40 @@ const ( RuleHealthGood = "ok" RuleHealthUnknown = "unknown" RuleHealthBad = "err" + + // Possible values for MetricType + MetricTypeCounter MetricType = "counter" + MetricTypeGauge MetricType = "gauge" + MetricTypeHistogram MetricType = "histogram" + MetricTypeGaugeHistogram MetricType = "gaugehistogram" + MetricTypeSummary MetricType = "summary" + MetricTypeInfo MetricType = "info" + MetricTypeStateset MetricType = "stateset" + MetricTypeUnknown MetricType = "unknown" ) // Error is an error returned by the API. type Error struct { - Type ErrorType - Msg string - Detail string + Type ErrorType + Msg string + Detail string + warnings []string } func (e *Error) Error() string { - return fmt.Sprintf("%s: %s", e.Type, e.Msg) + if e.Type != "" || e.Msg != "" { + return fmt.Sprintf("%s: %s", e.Type, e.Msg) + } + + return "Warnings: " + strings.Join(e.warnings, " , ") +} + +func (w *Error) Err() error { + return w +} + +func (w *Error) Warnings() []string { + return w.warnings } // Range represents a sliced time range. @@ -115,32 +230,34 @@ type Range struct { // API provides bindings for Prometheus's v1 API. type API interface { // Alerts returns a list of all active alerts. - Alerts(ctx context.Context) (AlertsResult, error) + Alerts(ctx context.Context) (AlertsResult, api.Error) // AlertManagers returns an overview of the current state of the Prometheus alert manager discovery. - AlertManagers(ctx context.Context) (AlertManagersResult, error) + AlertManagers(ctx context.Context) (AlertManagersResult, api.Error) // CleanTombstones removes the deleted data from disk and cleans up the existing tombstones. - CleanTombstones(ctx context.Context) error + CleanTombstones(ctx context.Context) api.Error // Config returns the current Prometheus configuration. 
- Config(ctx context.Context) (ConfigResult, error) + Config(ctx context.Context) (ConfigResult, api.Error) // DeleteSeries deletes data for a selection of series in a time range. - DeleteSeries(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) error + DeleteSeries(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) api.Error // Flags returns the flag values that Prometheus was launched with. - Flags(ctx context.Context) (FlagsResult, error) + Flags(ctx context.Context) (FlagsResult, api.Error) // LabelValues performs a query for the values of the given label. - LabelValues(ctx context.Context, label string) (model.LabelValues, error) + LabelValues(ctx context.Context, label string) (model.LabelValues, api.Error) // Query performs a query for the given time. - Query(ctx context.Context, query string, ts time.Time) (model.Value, error) + Query(ctx context.Context, query string, ts time.Time) (model.Value, api.Error) // QueryRange performs a query for the given range. - QueryRange(ctx context.Context, query string, r Range) (model.Value, error) + QueryRange(ctx context.Context, query string, r Range) (model.Value, api.Error) // Series finds series by label matchers. - Series(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) ([]model.LabelSet, error) + Series(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) ([]model.LabelSet, api.Error) // Snapshot creates a snapshot of all current data into snapshots/- // under the TSDB's data directory and returns the directory as response. - Snapshot(ctx context.Context, skipHead bool) (SnapshotResult, error) + Snapshot(ctx context.Context, skipHead bool) (SnapshotResult, api.Error) // Rules returns a list of alerting and recording rules that are currently loaded. 
- Rules(ctx context.Context) (RulesResult, error) + Rules(ctx context.Context) (RulesResult, api.Error) // Targets returns an overview of the current state of the Prometheus target discovery. - Targets(ctx context.Context) (TargetsResult, error) + Targets(ctx context.Context) (TargetsResult, api.Error) + // TargetsMetadata returns metadata about metrics currently scraped by the target. + TargetsMetadata(ctx context.Context, matchTarget string, metric string, limit string) ([]MetricMetadata, api.Error) } // AlertsResult contains the result from querying the alerts endpoint. @@ -226,7 +343,7 @@ type Alert struct { Annotations model.LabelSet Labels model.LabelSet State AlertState - Value float64 + Value string } // TargetsResult contains the result from querying the targets endpoint. @@ -250,6 +367,15 @@ type DroppedTarget struct { DiscoveredLabels map[string]string `json:"discoveredLabels"` } +// MetricMetadata models the metadata of a metric. +type MetricMetadata struct { + Target map[string]string `json:"target"` + Metric string `json:"metric,omitempty"` + Type MetricType `json:"type"` + Help string `json:"help"` + Unit string `json:"unit"` +} + // queryResult contains result data for a query. 
type queryResult struct { Type model.ValueType `json:"resultType"` @@ -408,73 +534,73 @@ type httpAPI struct { client api.Client } -func (h *httpAPI) Alerts(ctx context.Context) (AlertsResult, error) { +func (h *httpAPI) Alerts(ctx context.Context) (AlertsResult, api.Error) { u := h.client.URL(epAlerts, nil) req, err := http.NewRequest(http.MethodGet, u.String(), nil) if err != nil { - return AlertsResult{}, err + return AlertsResult{}, api.NewErrorAPI(err, nil) } - _, body, err := h.client.Do(ctx, req) - if err != nil { - return AlertsResult{}, err + _, body, apiErr := h.client.Do(ctx, req) + if apiErr != nil { + return AlertsResult{}, apiErr } var res AlertsResult err = json.Unmarshal(body, &res) - return res, err + return res, api.NewErrorAPI(err, nil) } -func (h *httpAPI) AlertManagers(ctx context.Context) (AlertManagersResult, error) { +func (h *httpAPI) AlertManagers(ctx context.Context) (AlertManagersResult, api.Error) { u := h.client.URL(epAlertManagers, nil) req, err := http.NewRequest(http.MethodGet, u.String(), nil) if err != nil { - return AlertManagersResult{}, err + return AlertManagersResult{}, api.NewErrorAPI(err, nil) } - _, body, err := h.client.Do(ctx, req) - if err != nil { - return AlertManagersResult{}, err + _, body, apiErr := h.client.Do(ctx, req) + if apiErr != nil { + return AlertManagersResult{}, apiErr } var res AlertManagersResult err = json.Unmarshal(body, &res) - return res, err + return res, api.NewErrorAPI(err, nil) } -func (h *httpAPI) CleanTombstones(ctx context.Context) error { +func (h *httpAPI) CleanTombstones(ctx context.Context) api.Error { u := h.client.URL(epCleanTombstones, nil) req, err := http.NewRequest(http.MethodPost, u.String(), nil) if err != nil { - return err + return api.NewErrorAPI(err, nil) } - _, _, err = h.client.Do(ctx, req) - return err + _, _, apiErr := h.client.Do(ctx, req) + return apiErr } -func (h *httpAPI) Config(ctx context.Context) (ConfigResult, error) { +func (h *httpAPI) Config(ctx 
context.Context) (ConfigResult, api.Error) { u := h.client.URL(epConfig, nil) req, err := http.NewRequest(http.MethodGet, u.String(), nil) if err != nil { - return ConfigResult{}, err + return ConfigResult{}, api.NewErrorAPI(err, nil) } - _, body, err := h.client.Do(ctx, req) - if err != nil { - return ConfigResult{}, err + _, body, apiErr := h.client.Do(ctx, req) + if apiErr != nil { + return ConfigResult{}, apiErr } var res ConfigResult err = json.Unmarshal(body, &res) - return res, err + return res, api.NewErrorAPI(err, nil) } -func (h *httpAPI) DeleteSeries(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) error { +func (h *httpAPI) DeleteSeries(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) api.Error { u := h.client.URL(epDeleteSeries, nil) q := u.Query() @@ -489,47 +615,47 @@ func (h *httpAPI) DeleteSeries(ctx context.Context, matches []string, startTime req, err := http.NewRequest(http.MethodPost, u.String(), nil) if err != nil { - return err + return api.NewErrorAPI(err, nil) } - _, _, err = h.client.Do(ctx, req) - return err + _, _, apiErr := h.client.Do(ctx, req) + return apiErr } -func (h *httpAPI) Flags(ctx context.Context) (FlagsResult, error) { +func (h *httpAPI) Flags(ctx context.Context) (FlagsResult, api.Error) { u := h.client.URL(epFlags, nil) req, err := http.NewRequest(http.MethodGet, u.String(), nil) if err != nil { - return FlagsResult{}, err + return FlagsResult{}, api.NewErrorAPI(err, nil) } - _, body, err := h.client.Do(ctx, req) - if err != nil { - return FlagsResult{}, err + _, body, apiErr := h.client.Do(ctx, req) + if apiErr != nil { + return FlagsResult{}, apiErr } var res FlagsResult err = json.Unmarshal(body, &res) - return res, err + return res, api.NewErrorAPI(err, nil) } -func (h *httpAPI) LabelValues(ctx context.Context, label string) (model.LabelValues, error) { +func (h *httpAPI) LabelValues(ctx context.Context, label string) (model.LabelValues, api.Error) { u := 
h.client.URL(epLabelValues, map[string]string{"name": label}) req, err := http.NewRequest(http.MethodGet, u.String(), nil) if err != nil { - return nil, err + return nil, api.NewErrorAPI(err, nil) } - _, body, err := h.client.Do(ctx, req) - if err != nil { - return nil, err + _, body, apiErr := h.client.Do(ctx, req) + if apiErr != nil { + return nil, apiErr } var labelValues model.LabelValues err = json.Unmarshal(body, &labelValues) - return labelValues, err + return labelValues, api.NewErrorAPI(err, nil) } -func (h *httpAPI) Query(ctx context.Context, query string, ts time.Time) (model.Value, error) { +func (h *httpAPI) Query(ctx context.Context, query string, ts time.Time) (model.Value, api.Error) { u := h.client.URL(epQuery, nil) q := u.Query() @@ -538,18 +664,16 @@ func (h *httpAPI) Query(ctx context.Context, query string, ts time.Time) (model. q.Set("time", ts.Format(time.RFC3339Nano)) } - _, body, err := api.DoGetFallback(h.client, ctx, u, q) - if err != nil { - return nil, err + _, body, apiErr := api.DoGetFallback(h.client, ctx, u, q) + if apiErr != nil { + return nil, apiErr } var qres queryResult - err = json.Unmarshal(body, &qres) - - return model.Value(qres.v), err + return model.Value(qres.v), api.NewErrorAPI(json.Unmarshal(body, &qres), nil) } -func (h *httpAPI) QueryRange(ctx context.Context, query string, r Range) (model.Value, error) { +func (h *httpAPI) QueryRange(ctx context.Context, query string, r Range) (model.Value, api.Error) { u := h.client.URL(epQueryRange, nil) q := u.Query() @@ -564,18 +688,17 @@ func (h *httpAPI) QueryRange(ctx context.Context, query string, r Range) (model. 
q.Set("end", end) q.Set("step", step) - _, body, err := api.DoGetFallback(h.client, ctx, u, q) - if err != nil { - return nil, err + _, body, apiErr := api.DoGetFallback(h.client, ctx, u, q) + if apiErr != nil { + return nil, apiErr } var qres queryResult - err = json.Unmarshal(body, &qres) - return model.Value(qres.v), err + return model.Value(qres.v), api.NewErrorAPI(json.Unmarshal(body, &qres), nil) } -func (h *httpAPI) Series(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) ([]model.LabelSet, error) { +func (h *httpAPI) Series(ctx context.Context, matches []string, startTime time.Time, endTime time.Time) ([]model.LabelSet, api.Error) { u := h.client.URL(epSeries, nil) q := u.Query() @@ -590,20 +713,20 @@ func (h *httpAPI) Series(ctx context.Context, matches []string, startTime time.T req, err := http.NewRequest(http.MethodGet, u.String(), nil) if err != nil { - return nil, err + return nil, api.NewErrorAPI(err, nil) } - _, body, err := h.client.Do(ctx, req) - if err != nil { - return nil, err + _, body, apiErr := h.client.Do(ctx, req) + if apiErr != nil { + return nil, apiErr } var mset []model.LabelSet err = json.Unmarshal(body, &mset) - return mset, err + return mset, api.NewErrorAPI(err, nil) } -func (h *httpAPI) Snapshot(ctx context.Context, skipHead bool) (SnapshotResult, error) { +func (h *httpAPI) Snapshot(ctx context.Context, skipHead bool) (SnapshotResult, api.Error) { u := h.client.URL(epSnapshot, nil) q := u.Query() @@ -613,53 +736,78 @@ func (h *httpAPI) Snapshot(ctx context.Context, skipHead bool) (SnapshotResult, req, err := http.NewRequest(http.MethodPost, u.String(), nil) if err != nil { - return SnapshotResult{}, err + return SnapshotResult{}, api.NewErrorAPI(err, nil) } - _, body, err := h.client.Do(ctx, req) - if err != nil { - return SnapshotResult{}, err + _, body, apiErr := h.client.Do(ctx, req) + if apiErr != nil { + return SnapshotResult{}, apiErr } var res SnapshotResult err = json.Unmarshal(body, &res) - 
return res, err + return res, api.NewErrorAPI(err, nil) } -func (h *httpAPI) Rules(ctx context.Context) (RulesResult, error) { +func (h *httpAPI) Rules(ctx context.Context) (RulesResult, api.Error) { u := h.client.URL(epRules, nil) req, err := http.NewRequest(http.MethodGet, u.String(), nil) if err != nil { - return RulesResult{}, err + return RulesResult{}, api.NewErrorAPI(err, nil) } - _, body, err := h.client.Do(ctx, req) - if err != nil { - return RulesResult{}, err + _, body, apiErr := h.client.Do(ctx, req) + if apiErr != nil { + return RulesResult{}, apiErr } var res RulesResult err = json.Unmarshal(body, &res) - return res, err + return res, api.NewErrorAPI(err, nil) } -func (h *httpAPI) Targets(ctx context.Context) (TargetsResult, error) { +func (h *httpAPI) Targets(ctx context.Context) (TargetsResult, api.Error) { u := h.client.URL(epTargets, nil) req, err := http.NewRequest(http.MethodGet, u.String(), nil) if err != nil { - return TargetsResult{}, err + return TargetsResult{}, api.NewErrorAPI(err, nil) } - _, body, err := h.client.Do(ctx, req) - if err != nil { - return TargetsResult{}, err + _, body, apiErr := h.client.Do(ctx, req) + if apiErr != nil { + return TargetsResult{}, apiErr } var res TargetsResult err = json.Unmarshal(body, &res) - return res, err + return res, api.NewErrorAPI(err, nil) +} + +func (h *httpAPI) TargetsMetadata(ctx context.Context, matchTarget string, metric string, limit string) ([]MetricMetadata, api.Error) { + u := h.client.URL(epTargetsMetadata, nil) + q := u.Query() + + q.Set("match_target", matchTarget) + q.Set("metric", metric) + q.Set("limit", limit) + + u.RawQuery = q.Encode() + + req, err := http.NewRequest(http.MethodGet, u.String(), nil) + if err != nil { + return nil, api.NewErrorAPI(err, nil) + } + + _, body, apiErr := h.client.Do(ctx, req) + if apiErr != nil { + return nil, apiErr + } + + var res []MetricMetadata + err = json.Unmarshal(body, &res) + return res, api.NewErrorAPI(err, nil) } // apiClient wraps a 
regular client and processes successful API responses. @@ -673,6 +821,7 @@ type apiResponse struct { Data json.RawMessage `json:"data"` ErrorType ErrorType `json:"errorType"` Error string `json:"error"` + Warnings []string `json:"warnings,omitempty"` } func apiError(code int) bool { @@ -690,14 +839,16 @@ func errorTypeAndMsgFor(resp *http.Response) (ErrorType, string) { return ErrBadResponse, fmt.Sprintf("bad response code %d", resp.StatusCode) } -func (c apiClient) Do(ctx context.Context, req *http.Request) (*http.Response, []byte, error) { - resp, body, err := c.Client.Do(ctx, req) - if err != nil { - return resp, body, err +func (c apiClient) Do(ctx context.Context, req *http.Request) (*http.Response, []byte, api.Error) { + resp, body, apiErr := c.Client.Do(ctx, req) + if apiErr != nil { + return resp, body, apiErr } code := resp.StatusCode + var err api.Error + if code/100 != 2 && !apiError(code) { errorType, errorMsg := errorTypeAndMsgFor(resp) return resp, body, &Error{ @@ -710,27 +861,30 @@ func (c apiClient) Do(ctx context.Context, req *http.Request) (*http.Response, [ var result apiResponse if http.StatusNoContent != code { - if err = json.Unmarshal(body, &result); err != nil { + if jsonErr := json.Unmarshal(body, &result); jsonErr != nil { return resp, body, &Error{ Type: ErrBadResponse, - Msg: err.Error(), + Msg: jsonErr.Error(), } } } if apiError(code) != (result.Status == "error") { err = &Error{ - Type: ErrBadResponse, - Msg: "inconsistent body for response code", + Type: ErrBadResponse, + Msg: "inconsistent body for response code", + warnings: result.Warnings, } } if apiError(code) && result.Status == "error" { err = &Error{ - Type: result.ErrorType, - Msg: result.Error, + Type: result.ErrorType, + Msg: result.Error, + warnings: result.Warnings, } } return resp, []byte(result.Data), err + } diff --git a/vendor/github.com/prometheus/client_golang/prometheus/build_info.go b/vendor/github.com/prometheus/client_golang/prometheus/build_info.go new file 
mode 100644 index 000000000..288f0e854 --- /dev/null +++ b/vendor/github.com/prometheus/client_golang/prometheus/build_info.go @@ -0,0 +1,29 @@ +// Copyright 2019 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build go1.12 + +package prometheus + +import "runtime/debug" + +// readBuildInfo is a wrapper around debug.ReadBuildInfo for Go 1.12+. +func readBuildInfo() (path, version, sum string) { + path, version, sum = "unknown", "unknown", "unknown" + if bi, ok := debug.ReadBuildInfo(); ok { + path = bi.Main.Path + version = bi.Main.Version + sum = bi.Main.Sum + } + return +} diff --git a/vendor/github.com/prometheus/client_golang/prometheus/build_info_pre_1.12.go b/vendor/github.com/prometheus/client_golang/prometheus/build_info_pre_1.12.go new file mode 100644 index 000000000..6609e2877 --- /dev/null +++ b/vendor/github.com/prometheus/client_golang/prometheus/build_info_pre_1.12.go @@ -0,0 +1,22 @@ +// Copyright 2019 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !go1.12 + +package prometheus + +// readBuildInfo is a wrapper around debug.ReadBuildInfo for Go versions before +// 1.12. Remove this whole file once the minimum supported Go version is 1.12. +func readBuildInfo() (path, version, sum string) { + return "unknown", "unknown", "unknown" +} diff --git a/vendor/github.com/prometheus/client_golang/prometheus/go_collector.go b/vendor/github.com/prometheus/client_golang/prometheus/go_collector.go index b108ec513..dc9247fed 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/go_collector.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/go_collector.go @@ -36,7 +36,7 @@ type goCollector struct { msMaxAge time.Duration // Maximum allowed age of old memstats. } -// NewGoCollector returns a collector which exports metrics about the current Go +// NewGoCollector returns a collector that exports metrics about the current Go // process. This includes memory stats. To collect those, runtime.ReadMemStats // is called. This requires to “stop the world”, which usually only happens for // garbage collection (GC). Take the following implications into account when @@ -364,3 +364,33 @@ type memStatsMetrics []struct { eval func(*runtime.MemStats) float64 valType ValueType } + +// NewBuildInfoCollector returns a collector collecting a single metric +// "go_build_info" with the constant value 1 and three labels "path", "version", +// and "checksum". Their label values contain the main module path, version, and +// checksum, respectively. The labels will only have meaningful values if the +// binary is built with Go module support and from source code retrieved from +// the source repository (rather than the local file system). This is usually +// accomplished by building from outside of GOPATH, specifying the full address +// of the main package, e.g. 
"GO111MODULE=on go run +// github.com/prometheus/client_golang/examples/random". If built without Go +// module support, all label values will be "unknown". If built with Go module +// support but using the source code from the local file system, the "path" will +// be set appropriately, but "checksum" will be empty and "version" will be +// "(devel)". +// +// This collector uses only the build information for the main module. See +// https://github.com/povilasv/prommod for an example of a collector for the +// module dependencies. +func NewBuildInfoCollector() Collector { + path, version, sum := readBuildInfo() + c := &selfCollector{MustNewConstMetric( + NewDesc( + "go_build_info", + "Build information about the main Go module.", + nil, Labels{"path": path, "version": version, "checksum": sum}, + ), + GaugeValue, 1)} + c.init(c.self) + return c +} diff --git a/vendor/github.com/prometheus/client_golang/prometheus/process_collector.go b/vendor/github.com/prometheus/client_golang/prometheus/process_collector.go index 55176d58c..37d2026ac 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/process_collector.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/process_collector.go @@ -126,7 +126,7 @@ func NewProcessCollector(opts ProcessCollectorOpts) Collector { } // Set up process metric collection if supported by the runtime. 
- if _, err := procfs.NewStat(); err == nil { + if _, err := procfs.NewDefaultFS(); err == nil { c.collectFn = c.processCollect } else { c.collectFn = func(ch chan<- Metric) { @@ -166,7 +166,7 @@ func (c *processCollector) processCollect(ch chan<- Metric) { return } - if stat, err := p.NewStat(); err == nil { + if stat, err := p.Stat(); err == nil { ch <- MustNewConstMetric(c.cpuTotal, CounterValue, stat.CPUTime()) ch <- MustNewConstMetric(c.vsize, GaugeValue, float64(stat.VirtualMemory())) ch <- MustNewConstMetric(c.rss, GaugeValue, float64(stat.ResidentMemory())) @@ -185,7 +185,7 @@ func (c *processCollector) processCollect(ch chan<- Metric) { c.reportError(ch, c.openFDs, err) } - if limits, err := p.NewLimits(); err == nil { + if limits, err := p.Limits(); err == nil { ch <- MustNewConstMetric(c.maxFDs, GaugeValue, float64(limits.OpenFiles)) ch <- MustNewConstMetric(c.maxVsize, GaugeValue, float64(limits.AddressSpace)) } else { diff --git a/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go b/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go index b137c8830..cea5a90fd 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/promhttp/http.go @@ -84,10 +84,32 @@ func Handler() http.Handler { // instrumentation. Use the InstrumentMetricHandler function to apply the same // kind of instrumentation as it is used by the Handler function. 
func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler { - var inFlightSem chan struct{} + var ( + inFlightSem chan struct{} + errCnt = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "promhttp_metric_handler_errors_total", + Help: "Total number of internal errors encountered by the promhttp metric handler.", + }, + []string{"cause"}, + ) + ) + if opts.MaxRequestsInFlight > 0 { inFlightSem = make(chan struct{}, opts.MaxRequestsInFlight) } + if opts.Registry != nil { + // Initialize all possibilites that can occur below. + errCnt.WithLabelValues("gathering") + errCnt.WithLabelValues("encoding") + if err := opts.Registry.Register(errCnt); err != nil { + if are, ok := err.(prometheus.AlreadyRegisteredError); ok { + errCnt = are.ExistingCollector.(*prometheus.CounterVec) + } else { + panic(err) + } + } + } h := http.HandlerFunc(func(rsp http.ResponseWriter, req *http.Request) { if inFlightSem != nil { @@ -106,6 +128,7 @@ func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler { if opts.ErrorLog != nil { opts.ErrorLog.Println("error gathering metrics:", err) } + errCnt.WithLabelValues("gathering").Inc() switch opts.ErrorHandling { case PanicOnError: panic(err) @@ -146,6 +169,7 @@ func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler { if opts.ErrorLog != nil { opts.ErrorLog.Println("error encoding and sending metric family:", err) } + errCnt.WithLabelValues("encoding").Inc() switch opts.ErrorHandling { case PanicOnError: panic(err) @@ -236,9 +260,12 @@ const ( // Ignore errors and try to serve as many metrics as possible. However, // if no metrics can be served, serve an HTTP status code 500 and the // last error message in the body. Only use this in deliberate "best - // effort" metrics collection scenarios. It is recommended to at least - // log errors (by providing an ErrorLog in HandlerOpts) to not mask - // errors completely. + // effort" metrics collection scenarios. 
In this case, it is highly + // recommended to provide other means of detecting errors: By setting an + // ErrorLog in HandlerOpts, the errors are logged. By providing a + // Registry in HandlerOpts, the exposed metrics include an error counter + // "promhttp_metric_handler_errors_total", which can be used for + // alerts. ContinueOnError // Panic upon the first error encountered (useful for "crash only" apps). PanicOnError @@ -261,6 +288,18 @@ type HandlerOpts struct { // logged regardless of the configured ErrorHandling provided ErrorLog // is not nil. ErrorHandling HandlerErrorHandling + // If Registry is not nil, it is used to register a metric + // "promhttp_metric_handler_errors_total", partitioned by "cause". A + // failed registration causes a panic. Note that this error counter is + // different from the instrumentation you get from the various + // InstrumentHandler... helpers. It counts errors that don't necessarily + // result in a non-2xx HTTP status code. There are two typical cases: + // (1) Encoding errors that only happen after streaming of the HTTP body + // has already started (and the status code 200 has been sent). This + // should only happen with custom collectors. (2) Collection errors with + // no effect on the HTTP status code because ErrorHandling is set to + // ContinueOnError. + Registry prometheus.Registerer // If DisableCompression is true, the handler will never compress the // response, even if requested by the client. DisableCompression bool diff --git a/vendor/github.com/prometheus/client_golang/prometheus/summary.go b/vendor/github.com/prometheus/client_golang/prometheus/summary.go index 1574b0fe7..ec663ec3d 100644 --- a/vendor/github.com/prometheus/client_golang/prometheus/summary.go +++ b/vendor/github.com/prometheus/client_golang/prometheus/summary.go @@ -39,7 +39,7 @@ const quantileLabel = "quantile" // A typical use-case is the observation of request latencies. 
By default, a // Summary provides the median, the 90th and the 99th percentile of the latency // as rank estimations. However, the default behavior will change in the -// upcoming v0.10 of the library. There will be no rank estimations at all by +// upcoming v1.0.0 of the library. There will be no rank estimations at all by // default. For a sane transition, it is recommended to set the desired rank // estimations explicitly. // @@ -61,7 +61,7 @@ type Summary interface { // DefObjectives are the default Summary quantile values. // // Deprecated: DefObjectives will not be used as the default objectives in -// v0.10 of the library. The default Summary will have no quantiles then. +// v1.0.0 of the library. The default Summary will have no quantiles then. var ( DefObjectives = map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001} @@ -86,7 +86,7 @@ const ( // mandatory to set Name to a non-empty string. While all other fields are // optional and can safely be left at their zero value, it is recommended to set // a help string and to explicitly set the Objectives field to the desired value -// as the default value will change in the upcoming v0.10 of the library. +// as the default value will change in the upcoming v1.0.0 of the library. type SummaryOpts struct { // Namespace, Subsystem, and Name are components of the fully-qualified // name of the Summary (created by joining these components with @@ -128,7 +128,7 @@ type SummaryOpts struct { // set it to an empty map (i.e. map[float64]float64{}). // // Note that the current value of DefObjectives is deprecated. It will - // be replaced by an empty map in v0.10 of the library. Please + // be replaced by an empty map in v1.0.0 of the library. Please // explicitly set Objectives to the desired value to avoid problems // during the transition. 
Objectives map[float64]float64 diff --git a/vendor/github.com/prometheus/procfs/Makefile b/vendor/github.com/prometheus/procfs/Makefile index 314d1ba56..616a0d25e 100644 --- a/vendor/github.com/prometheus/procfs/Makefile +++ b/vendor/github.com/prometheus/procfs/Makefile @@ -14,6 +14,7 @@ include Makefile.common %/.unpacked: %.ttar + @echo ">> extracting fixtures" ./ttar -C $(dir $*) -x -f $*.ttar touch $@ diff --git a/vendor/github.com/prometheus/procfs/Makefile.common b/vendor/github.com/prometheus/procfs/Makefile.common index 4f18ea587..c7f9ea64f 100644 --- a/vendor/github.com/prometheus/procfs/Makefile.common +++ b/vendor/github.com/prometheus/procfs/Makefile.common @@ -69,7 +69,7 @@ else GO_BUILD_PLATFORM ?= $(GOHOSTOS)-$(GOHOSTARCH) endif -PROMU_VERSION ?= 0.3.0 +PROMU_VERSION ?= 0.4.0 PROMU_URL := https://github.com/prometheus/promu/releases/download/v$(PROMU_VERSION)/promu-$(PROMU_VERSION).$(GO_BUILD_PLATFORM).tar.gz GOLANGCI_LINT := diff --git a/vendor/github.com/prometheus/procfs/README.md b/vendor/github.com/prometheus/procfs/README.md index 209549471..6f8850feb 100644 --- a/vendor/github.com/prometheus/procfs/README.md +++ b/vendor/github.com/prometheus/procfs/README.md @@ -1,7 +1,7 @@ # procfs This procfs package provides functions to retrieve system, kernel and process -metrics from the pseudo-filesystem proc. +metrics from the pseudo-filesystems /proc and /sys. *WARNING*: This package is a work in progress. Its API may still break in backwards-incompatible ways without warnings. Use it at your own risk. @@ -9,3 +9,45 @@ backwards-incompatible ways without warnings. Use it at your own risk. 
[![GoDoc](https://godoc.org/github.com/prometheus/procfs?status.png)](https://godoc.org/github.com/prometheus/procfs) [![Build Status](https://travis-ci.org/prometheus/procfs.svg?branch=master)](https://travis-ci.org/prometheus/procfs) [![Go Report Card](https://goreportcard.com/badge/github.com/prometheus/procfs)](https://goreportcard.com/report/github.com/prometheus/procfs) + +## Usage + +The procfs library is organized by packages based on whether the gathered data is coming from +/proc, /sys, or both. Each package contains an `FS` type which represents the path to either /proc, /sys, or both. For example, current cpu statistics are gathered from +`/proc/stat` and are available via the root procfs package. First, the proc filesystem mount +point is initialized, and then the stat information is read. + +```go +fs, err := procfs.NewFS("/proc") +stats, err := fs.Stat() +``` + +Some sub-packages such as `blockdevice`, require access to both the proc and sys filesystems. + +```go + fs, err := blockdevice.NewFS("/proc", "/sys") + stats, err := fs.ProcDiskstats() +``` + +## Building and Testing + +The procfs library is normally built as part of another application. However, when making +changes to the library, the `make test` command can be used to run the API test suite. + +### Updating Test Fixtures + +The procfs library includes a set of test fixtures which include many example files from +the `/proc` and `/sys` filesystems. These fixtures are included as a [ttar](https://github.com/ideaship/ttar) file +which is extracted automatically during testing. To add/update the test fixtures, first +ensure the `fixtures` directory is up to date by removing the existing directory and then +extracting the ttar file using `make fixtures/.unpacked` or just `make test`. + +```bash +rm -rf fixtures +make test +``` + +Next, make the required changes to the extracted files in the `fixtures` directory. 
When +the changes are complete, run `make update_fixtures` to create a new `fixtures.ttar` file +based on the updated `fixtures` directory. And finally, verify the changes using +`git diff fixtures.ttar`. diff --git a/vendor/github.com/prometheus/procfs/buddyinfo.go b/vendor/github.com/prometheus/procfs/buddyinfo.go index 5cd22a837..63d4229a4 100644 --- a/vendor/github.com/prometheus/procfs/buddyinfo.go +++ b/vendor/github.com/prometheus/procfs/buddyinfo.go @@ -31,18 +31,8 @@ type BuddyInfo struct { Sizes []float64 } -// NewBuddyInfo reads the buddyinfo statistics. -func NewBuddyInfo() ([]BuddyInfo, error) { - fs, err := NewFS(DefaultMountPoint) - if err != nil { - return nil, err - } - - return fs.NewBuddyInfo() -} - // NewBuddyInfo reads the buddyinfo statistics from the specified `proc` filesystem. -func (fs FS) NewBuddyInfo() ([]BuddyInfo, error) { +func (fs FS) BuddyInfo() ([]BuddyInfo, error) { file, err := os.Open(fs.proc.Path("buddyinfo")) if err != nil { return nil, err diff --git a/vendor/github.com/prometheus/procfs/fixtures.ttar b/vendor/github.com/prometheus/procfs/fixtures.ttar index f7f84ef36..951d909af 100644 --- a/vendor/github.com/prometheus/procfs/fixtures.ttar +++ b/vendor/github.com/prometheus/procfs/fixtures.ttar @@ -75,13 +75,13 @@ Max realtime timeout unlimited unlimited us Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: fixtures/proc/26231/mountstats -Lines: 19 +Lines: 20 device rootfs mounted on / with fstype rootfs device sysfs mounted on /sys with fstype sysfs device proc mounted on /proc with fstype proc device /dev/sda1 mounted on / with fstype ext4 device 192.168.1.1:/srv/test mounted on /mnt/nfs/test with fstype nfs4 statvers=1.1 - opts: rw,vers=4.0,rsize=1048576,wsize=1048576,namlen=255,acregmin=3,acregmax=60,acdirmin=30,acdirmax=60,hard,proto=tcp,port=0,timeo=600,retrans=2,sec=sys,clientaddr=192.168.1.5,local_lock=none + opts: 
rw,vers=4.0,rsize=1048576,wsize=1048576,namlen=255,acregmin=3,acregmax=60,acdirmin=30,acdirmax=60,hard,proto=tcp,port=0,timeo=600,retrans=2,sec=sys,mountaddr=192.168.1.1,clientaddr=192.168.1.5,local_lock=none age: 13968 caps: caps=0xfff7,wtmult=512,dtsize=32768,bsize=0,namlen=255 nfsv4: bm0=0xfdffafff,bm1=0xf9be3e,bm2=0x0,acl=0x0,pnfs=not configured @@ -94,6 +94,7 @@ device 192.168.1.1:/srv/test mounted on /mnt/nfs/test with fstype nfs4 statvers= NULL: 0 0 0 0 0 0 0 0 READ: 1298 1298 0 207680 1210292152 6 79386 79407 WRITE: 0 0 0 0 0 0 0 0 + ACCESS: 2927395007 2927394995 0 526931094212 362996810236 18446743919241604546 1667369447 1953587717 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -125,6 +126,63 @@ Lines: 1 26231 (vim) R 5392 7446 5392 34835 7446 4218880 32533 309516 26 82 1677 44 158 99 20 0 1 0 82375 56274944 1981 18446744073709551615 4194304 6294284 140736914091744 140736914087944 139965136429984 0 0 12288 1870679807 0 0 0 17 0 0 0 31 0 0 8391624 8481048 16420864 140736914093252 140736914093279 140736914093279 140736914096107 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: fixtures/proc/26231/status +Lines: 53 + +Name: prometheus +Umask: 0022 +State: S (sleeping) +Tgid: 1 +Ngid: 0 +Pid: 1 +PPid: 0 +TracerPid: 0 +Uid: 0 0 0 0 +Gid: 0 0 0 0 +FDSize: 128 +Groups: +NStgid: 1 +NSpid: 1 +NSpgid: 1 +NSsid: 1 +VmPeak: 58472 kB +VmSize: 58440 kB +VmLck: 0 kB +VmPin: 0 kB +VmHWM: 8028 kB +VmRSS: 6716 kB +RssAnon: 2092 kB +RssFile: 4624 kB +RssShmem: 0 kB +VmData: 2580 kB +VmStk: 136 kB +VmExe: 948 kB +VmLib: 6816 kB +VmPTE: 128 kB +VmPMD: 12 kB +VmSwap: 660 kB +HugetlbPages: 0 kB +Threads: 1 +SigQ: 8/63965 +SigPnd: 0000000000000000 +ShdPnd: 0000000000000000 +SigBlk: 7be3c0fe28014a03 +SigIgn: 0000000000001000 +SigCgt: 00000001800004ec +CapInh: 0000000000000000 +CapPrm: 0000003fffffffff +CapEff: 0000003fffffffff +CapBnd: 0000003fffffffff +CapAmb: 0000000000000000 +Seccomp: 0 
+Cpus_allowed: ff +Cpus_allowed_list: 0-7 +Mems_allowed: 00000000,00000001 +Mems_allowed_list: 0 +voluntary_ctxt_switches: 4742839 +nonvoluntary_ctxt_switches: 1727500 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: fixtures/proc/26232 Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -160,23 +218,23 @@ SymlinkTo: ../../symlinktargets/xyz # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: fixtures/proc/26232/limits Lines: 17 -Limit Soft Limit Hard Limit Units -Max cpu time unlimited unlimited seconds -Max file size unlimited unlimited bytes -Max data size unlimited unlimited bytes -Max stack size 8388608 unlimited bytes -Max core file size 0 unlimited bytes -Max resident set unlimited unlimited bytes -Max processes 29436 29436 processes -Max open files 1024 4096 files -Max locked memory 65536 65536 bytes -Max address space unlimited unlimited bytes -Max file locks unlimited unlimited locks -Max pending signals 29436 29436 signals -Max msgqueue size 819200 819200 bytes -Max nice priority 0 0 -Max realtime priority 0 0 -Max realtime timeout unlimited unlimited us +Limit Soft Limit Hard Limit Units +Max cpu time unlimited unlimited seconds +Max file size unlimited unlimited bytes +Max data size unlimited unlimited bytes +Max stack size 8388608 unlimited bytes +Max core file size 0 unlimited bytes +Max resident set unlimited unlimited bytes +Max processes 29436 29436 processes +Max open files 1024 4096 files +Max locked memory 65536 65536 bytes +Max address space unlimited unlimited bytes +Max file locks unlimited unlimited locks +Max pending signals 29436 29436 signals +Max msgqueue size 819200 819200 bytes +Max nice priority 0 0 +Max realtime priority 0 0 +Max realtime timeout unlimited unlimited us Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: fixtures/proc/26232/root @@ -206,9 +264,9 @@ 
Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: fixtures/proc/buddyinfo Lines: 3 -Node 0, zone DMA 1 0 1 0 2 1 1 0 1 1 3 -Node 0, zone DMA32 759 572 791 475 194 45 12 0 0 0 0 -Node 0, zone Normal 4381 1093 185 1530 567 102 4 0 0 0 0 +Node 0, zone DMA 1 0 1 0 2 1 1 0 1 1 3 +Node 0, zone DMA32 759 572 791 475 194 45 12 0 0 0 0 +Node 0, zone Normal 4381 1093 185 1530 567 102 4 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: fixtures/proc/diskstats @@ -302,13 +360,13 @@ Lines: 26 Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5] [raid4] [raid10] md3 : active raid6 sda1[8] sdh1[7] sdg1[6] sdf1[5] sde1[11] sdd1[3] sdc1[10] sdb1[9] 5853468288 blocks super 1.2 level 6, 64k chunk, algorithm 2 [8/8] [UUUUUUUU] - + md127 : active raid1 sdi2[0] sdj2[1] 312319552 blocks [2/2] [UU] - + md0 : active raid1 sdk[2](S) sdi1[0] sdj1[1] 248896 blocks [2/2] [UU] - + md4 : inactive raid1 sda3[0] sdb3[1] 4883648 blocks [2/2] [UU] @@ -402,6 +460,26 @@ proc4 2 2 10853 proc4ops 72 0 0 0 1098 2 0 0 0 0 8179 5896 0 0 0 0 5900 0 0 2 0 2 0 9609 0 2 150 1272 0 0 0 1236 0 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: fixtures/proc/net/unix +Lines: 6 +Num RefCount Protocol Flags Type St Inode Path +0000000000000000: 00000002 00000000 00010000 0001 01 3442596 /var/run/postgresql/.s.PGSQL.5432 +0000000000000000: 0000000a 00000000 00010000 0005 01 10061 /run/udev/control +0000000000000000: 00000007 00000000 00000000 0002 01 12392 /dev/log +0000000000000000: 00000003 00000000 00000000 0001 03 4787297 /var/run/postgresql/.s.PGSQL.5432 +0000000000000000: 00000003 00000000 00000000 0001 03 5091797 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: fixtures/proc/net/unix_without_inode +Lines: 6 +Num RefCount 
Protocol Flags Type St Path +0000000000000000: 00000002 00000000 00010000 0001 01 /var/run/postgresql/.s.PGSQL.5432 +0000000000000000: 0000000a 00000000 00010000 0005 01 /run/udev/control +0000000000000000: 00000007 00000000 00000000 0002 01 /dev/log +0000000000000000: 00000003 00000000 00000000 0001 03 /var/run/postgresql/.s.PGSQL.5432 +0000000000000000: 00000003 00000000 00000000 0001 03 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: fixtures/proc/net/xfrm_stat Lines: 28 XfrmInError 1 @@ -1107,6 +1185,22 @@ Mode: 644 Directory: fixtures/sys/devices/system Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: fixtures/sys/devices/system/clocksource +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: fixtures/sys/devices/system/clocksource/clocksource0 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: fixtures/sys/devices/system/clocksource/clocksource0/available_clocksource +Lines: 1 +tsc hpet acpi_pm +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: fixtures/sys/devices/system/clocksource/clocksource0/current_clocksource +Lines: 1 +tsc +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: fixtures/sys/devices/system/cpu Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/vendor/github.com/prometheus/procfs/fs.go b/vendor/github.com/prometheus/procfs/fs.go index 9c56c8395..0102ab0fd 100644 --- a/vendor/github.com/prometheus/procfs/fs.go +++ b/vendor/github.com/prometheus/procfs/fs.go @@ -26,8 +26,14 @@ type FS struct { // DefaultMountPoint is the common mount point of the proc filesystem. const DefaultMountPoint = fs.DefaultProcMountPoint +// NewDefaultFS returns a new proc FS mounted under the default proc mountPoint. 
+// It will error if the mount point directory can't be read or is a file. +func NewDefaultFS() (FS, error) { + return NewFS(DefaultMountPoint) +} + // NewFS returns a new proc FS mounted under the given proc mountPoint. It will error -// if the mount point dirctory can't be read or is a file. +// if the mount point directory can't be read or is a file. func NewFS(mountPoint string) (FS, error) { fs, err := fs.NewFS(mountPoint) if err != nil { diff --git a/vendor/github.com/prometheus/procfs/ipvs.go b/vendor/github.com/prometheus/procfs/ipvs.go index 41e645d23..2d6cb8d1c 100644 --- a/vendor/github.com/prometheus/procfs/ipvs.go +++ b/vendor/github.com/prometheus/procfs/ipvs.go @@ -62,18 +62,8 @@ type IPVSBackendStatus struct { Weight uint64 } -// NewIPVSStats reads the IPVS statistics. -func NewIPVSStats() (IPVSStats, error) { - fs, err := NewFS(DefaultMountPoint) - if err != nil { - return IPVSStats{}, err - } - - return fs.NewIPVSStats() -} - -// NewIPVSStats reads the IPVS statistics from the specified `proc` filesystem. -func (fs FS) NewIPVSStats() (IPVSStats, error) { +// IPVSStats reads the IPVS statistics from the specified `proc` filesystem. +func (fs FS) IPVSStats() (IPVSStats, error) { file, err := os.Open(fs.proc.Path("net/ip_vs_stats")) if err != nil { return IPVSStats{}, err @@ -131,18 +121,8 @@ func parseIPVSStats(file io.Reader) (IPVSStats, error) { return stats, nil } -// NewIPVSBackendStatus reads and returns the status of all (virtual,real) server pairs. -func NewIPVSBackendStatus() ([]IPVSBackendStatus, error) { - fs, err := NewFS(DefaultMountPoint) - if err != nil { - return []IPVSBackendStatus{}, err - } - - return fs.NewIPVSBackendStatus() -} - -// NewIPVSBackendStatus reads and returns the status of all (virtual,real) server pairs from the specified `proc` filesystem. 
-func (fs FS) NewIPVSBackendStatus() ([]IPVSBackendStatus, error) { +// IPVSBackendStatus reads and returns the status of all (virtual,real) server pairs from the specified `proc` filesystem. +func (fs FS) IPVSBackendStatus() ([]IPVSBackendStatus, error) { file, err := os.Open(fs.proc.Path("net/ip_vs")) if err != nil { return nil, err diff --git a/vendor/github.com/prometheus/procfs/mdstat.go b/vendor/github.com/prometheus/procfs/mdstat.go index 6ac7a12f9..71c106782 100644 --- a/vendor/github.com/prometheus/procfs/mdstat.go +++ b/vendor/github.com/prometheus/procfs/mdstat.go @@ -42,64 +42,64 @@ type MDStat struct { BlocksSynced int64 } -// ParseMDStat parses an mdstat-file and returns a struct with the relevant infos. -func (fs FS) ParseMDStat() (mdstates []MDStat, err error) { - mdStatusFilePath := fs.proc.Path("mdstat") - content, err := ioutil.ReadFile(mdStatusFilePath) +// MDStat parses an mdstat-file (/proc/mdstat) and returns a slice of +// structs containing the relevant info. More information available here: +// https://raid.wiki.kernel.org/index.php/Mdstat +func (fs FS) MDStat() ([]MDStat, error) { + data, err := ioutil.ReadFile(fs.proc.Path("mdstat")) if err != nil { - return []MDStat{}, fmt.Errorf("error parsing %s: %s", mdStatusFilePath, err) + return nil, fmt.Errorf("error parsing mdstat %s: %s", fs.proc.Path("mdstat"), err) } + mdstat, err := parseMDStat(data) + if err != nil { + return nil, fmt.Errorf("error parsing mdstat %s: %s", fs.proc.Path("mdstat"), err) + } + return mdstat, nil +} - mdStates := []MDStat{} - lines := strings.Split(string(content), "\n") +// parseMDStat parses data from mdstat file (/proc/mdstat) and returns a slice of +// structs containing the relevant info. 
+func parseMDStat(mdstatData []byte) ([]MDStat, error) { + mdStats := []MDStat{} + lines := strings.Split(string(mdstatData), "\n") for i, l := range lines { - if l == "" { - continue - } - if l[0] == ' ' { - continue - } - if strings.HasPrefix(l, "Personalities") || strings.HasPrefix(l, "unused") { + if strings.TrimSpace(l) == "" || l[0] == ' ' || + strings.HasPrefix(l, "Personalities") || strings.HasPrefix(l, "unused") { continue } - mainLine := strings.Split(l, " ") - if len(mainLine) < 3 { - return mdStates, fmt.Errorf("error parsing mdline: %s", l) + deviceFields := strings.Fields(l) + if len(deviceFields) < 3 { + return nil, fmt.Errorf("not enough fields in mdline (expected at least 3): %s", l) } - mdName := mainLine[0] - activityState := mainLine[2] + mdName := deviceFields[0] + activityState := deviceFields[2] if len(lines) <= i+3 { - return mdStates, fmt.Errorf( - "error parsing %s: too few lines for md device %s", - mdStatusFilePath, - mdName, - ) + return mdStats, fmt.Errorf("missing lines for md device %s", mdName) } - active, total, size, err := evalStatusline(lines[i+1]) + active, total, size, err := evalStatusLine(lines[i+1]) if err != nil { - return mdStates, fmt.Errorf("error parsing %s: %s", mdStatusFilePath, err) + return nil, err } - // j is the line number of the syncing-line. - j := i + 2 + syncLineIdx := i + 2 if strings.Contains(lines[i+2], "bitmap") { // skip bitmap line - j = i + 3 + syncLineIdx++ } - // If device is syncing at the moment, get the number of currently + // If device is recovering/syncing at the moment, get the number of currently // synced bytes, otherwise that number equals the size of the device. 
syncedBlocks := size - if strings.Contains(lines[j], "recovery") || strings.Contains(lines[j], "resync") { - syncedBlocks, err = evalBuildline(lines[j]) + if strings.Contains(lines[syncLineIdx], "recovery") || strings.Contains(lines[syncLineIdx], "resync") { + syncedBlocks, err = evalRecoveryLine(lines[syncLineIdx]) if err != nil { - return mdStates, fmt.Errorf("error parsing %s: %s", mdStatusFilePath, err) + return nil, err } } - mdStates = append(mdStates, MDStat{ + mdStats = append(mdStats, MDStat{ Name: mdName, ActivityState: activityState, DisksActive: active, @@ -109,10 +109,10 @@ func (fs FS) ParseMDStat() (mdstates []MDStat, err error) { }) } - return mdStates, nil + return mdStats, nil } -func evalStatusline(statusline string) (active, total, size int64, err error) { +func evalStatusLine(statusline string) (active, total, size int64, err error) { matches := statuslineRE.FindStringSubmatch(statusline) if len(matches) != 4 { return 0, 0, 0, fmt.Errorf("unexpected statusline: %s", statusline) @@ -136,7 +136,7 @@ func evalStatusline(statusline string) (active, total, size int64, err error) { return active, total, size, nil } -func evalBuildline(buildline string) (syncedBlocks int64, err error) { +func evalRecoveryLine(buildline string) (syncedBlocks int64, err error) { matches := buildlineRE.FindStringSubmatch(buildline) if len(matches) != 2 { return 0, fmt.Errorf("unexpected buildline: %s", buildline) diff --git a/vendor/github.com/prometheus/procfs/mountstats.go b/vendor/github.com/prometheus/procfs/mountstats.go index fc385afcf..35b2ef351 100644 --- a/vendor/github.com/prometheus/procfs/mountstats.go +++ b/vendor/github.com/prometheus/procfs/mountstats.go @@ -69,8 +69,8 @@ type MountStats interface { type MountStatsNFS struct { // The version of statistics provided. StatVersion string - // The optional mountaddr of the NFS mount. - MountAddress string + // The mount options of the NFS mount. + Opts map[string]string // The age of the NFS mount. 
Age time.Duration // Statistics related to byte counters for various operations. @@ -181,11 +181,11 @@ type NFSOperationStats struct { // Number of bytes received for this operation, including RPC headers and payload. BytesReceived uint64 // Duration all requests spent queued for transmission before they were sent. - CumulativeQueueTime time.Duration + CumulativeQueueMilliseconds uint64 // Duration it took to get a reply back after the request was transmitted. - CumulativeTotalResponseTime time.Duration + CumulativeTotalResponseMilliseconds uint64 // Duration from when a request was enqueued to when it was completely handled. - CumulativeTotalRequestTime time.Duration + CumulativeTotalRequestMilliseconds uint64 } // A NFSTransportStats contains statistics for the NFS mount RPC requests and @@ -204,7 +204,7 @@ type NFSTransportStats struct { // spent waiting for connections to the server to be established. ConnectIdleTime uint64 // Duration since the NFS mount last saw any RPC traffic. - IdleTime time.Duration + IdleTimeSeconds uint64 // Number of RPC requests for this mount sent to the NFS server. Sends uint64 // Number of RPC responses for this mount received from the NFS server. 
@@ -342,10 +342,15 @@ func parseMountStatsNFS(s *bufio.Scanner, statVersion string) (*MountStatsNFS, e switch ss[0] { case fieldOpts: + if stats.Opts == nil { + stats.Opts = map[string]string{} + } for _, opt := range strings.Split(ss[1], ",") { split := strings.Split(opt, "=") - if len(split) == 2 && split[0] == "mountaddr" { - stats.MountAddress = split[1] + if len(split) == 2 { + stats.Opts[split[0]] = split[1] + } else { + stats.Opts[opt] = "" } } case fieldAge: @@ -519,15 +524,15 @@ func parseNFSOperationStats(s *bufio.Scanner) ([]NFSOperationStats, error) { } ops = append(ops, NFSOperationStats{ - Operation: strings.TrimSuffix(ss[0], ":"), - Requests: ns[0], - Transmissions: ns[1], - MajorTimeouts: ns[2], - BytesSent: ns[3], - BytesReceived: ns[4], - CumulativeQueueTime: time.Duration(ns[5]) * time.Millisecond, - CumulativeTotalResponseTime: time.Duration(ns[6]) * time.Millisecond, - CumulativeTotalRequestTime: time.Duration(ns[7]) * time.Millisecond, + Operation: strings.TrimSuffix(ss[0], ":"), + Requests: ns[0], + Transmissions: ns[1], + MajorTimeouts: ns[2], + BytesSent: ns[3], + BytesReceived: ns[4], + CumulativeQueueMilliseconds: ns[5], + CumulativeTotalResponseMilliseconds: ns[6], + CumulativeTotalRequestMilliseconds: ns[7], }) } @@ -603,7 +608,7 @@ func parseNFSTransportStats(ss []string, statVersion string) (*NFSTransportStats Bind: ns[1], Connect: ns[2], ConnectIdleTime: ns[3], - IdleTime: time.Duration(ns[4]) * time.Second, + IdleTimeSeconds: ns[4], Sends: ns[5], Receives: ns[6], BadTransactionIDs: ns[7], diff --git a/vendor/github.com/prometheus/procfs/net_dev.go b/vendor/github.com/prometheus/procfs/net_dev.go index 0063594e6..a0b7a0119 100644 --- a/vendor/github.com/prometheus/procfs/net_dev.go +++ b/vendor/github.com/prometheus/procfs/net_dev.go @@ -47,23 +47,13 @@ type NetDevLine struct { // are interface names. type NetDev map[string]NetDevLine -// NewNetDev returns kernel/system statistics read from /proc/net/dev. 
-func NewNetDev() (NetDev, error) { - fs, err := NewFS(DefaultMountPoint) - if err != nil { - return nil, err - } - - return fs.NewNetDev() -} - -// NewNetDev returns kernel/system statistics read from /proc/net/dev. -func (fs FS) NewNetDev() (NetDev, error) { +// NetDev returns kernel/system statistics read from /proc/net/dev. +func (fs FS) NetDev() (NetDev, error) { return newNetDev(fs.proc.Path("net/dev")) } -// NewNetDev returns kernel/system statistics read from /proc/[pid]/net/dev. -func (p Proc) NewNetDev() (NetDev, error) { +// NetDev returns kernel/system statistics read from /proc/[pid]/net/dev. +func (p Proc) NetDev() (NetDev, error) { return newNetDev(p.path("net/dev")) } @@ -75,7 +65,7 @@ func newNetDev(file string) (NetDev, error) { } defer f.Close() - nd := NetDev{} + netDev := NetDev{} s := bufio.NewScanner(f) for n := 0; s.Scan(); n++ { // Skip the 2 header lines. @@ -83,20 +73,20 @@ func newNetDev(file string) (NetDev, error) { continue } - line, err := nd.parseLine(s.Text()) + line, err := netDev.parseLine(s.Text()) if err != nil { - return nd, err + return netDev, err } - nd[line.Name] = *line + netDev[line.Name] = *line } - return nd, s.Err() + return netDev, s.Err() } // parseLine parses a single line from the /proc/net/dev file. Header lines // must be filtered prior to calling this method. -func (nd NetDev) parseLine(rawLine string) (*NetDevLine, error) { +func (netDev NetDev) parseLine(rawLine string) (*NetDevLine, error) { parts := strings.SplitN(rawLine, ":", 2) if len(parts) != 2 { return nil, errors.New("invalid net/dev line, missing colon") @@ -185,11 +175,11 @@ func (nd NetDev) parseLine(rawLine string) (*NetDevLine, error) { // Total aggregates the values across interfaces and returns a new NetDevLine. // The Name field will be a sorted comma separated list of interface names. 
-func (nd NetDev) Total() NetDevLine { +func (netDev NetDev) Total() NetDevLine { total := NetDevLine{} - names := make([]string, 0, len(nd)) - for _, ifc := range nd { + names := make([]string, 0, len(netDev)) + for _, ifc := range netDev { names = append(names, ifc.Name) total.RxBytes += ifc.RxBytes total.RxPackets += ifc.RxPackets diff --git a/vendor/github.com/prometheus/procfs/net_unix.go b/vendor/github.com/prometheus/procfs/net_unix.go new file mode 100644 index 000000000..240340a83 --- /dev/null +++ b/vendor/github.com/prometheus/procfs/net_unix.go @@ -0,0 +1,275 @@ +// Copyright 2018 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package procfs + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "strconv" + "strings" +) + +// For the proc file format details, +// see https://elixir.bootlin.com/linux/v4.17/source/net/unix/af_unix.c#L2815 +// and https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/net.h#L48. + +const ( + netUnixKernelPtrIdx = iota + netUnixRefCountIdx + _ + netUnixFlagsIdx + netUnixTypeIdx + netUnixStateIdx + netUnixInodeIdx + + // Inode and Path are optional. 
+ netUnixStaticFieldsCnt = 6 +) + +const ( + netUnixTypeStream = 1 + netUnixTypeDgram = 2 + netUnixTypeSeqpacket = 5 + + netUnixFlagListen = 1 << 16 + + netUnixStateUnconnected = 1 + netUnixStateConnecting = 2 + netUnixStateConnected = 3 + netUnixStateDisconnected = 4 +) + +var errInvalidKernelPtrFmt = errors.New("Invalid Num(the kernel table slot number) format") + +// NetUnixType is the type of the type field. +type NetUnixType uint64 + +// NetUnixFlags is the type of the flags field. +type NetUnixFlags uint64 + +// NetUnixState is the type of the state field. +type NetUnixState uint64 + +// NetUnixLine represents a line of /proc/net/unix. +type NetUnixLine struct { + KernelPtr string + RefCount uint64 + Protocol uint64 + Flags NetUnixFlags + Type NetUnixType + State NetUnixState + Inode uint64 + Path string +} + +// NetUnix holds the data read from /proc/net/unix. +type NetUnix struct { + Rows []*NetUnixLine +} + +// NewNetUnix returns data read from /proc/net/unix. +func NewNetUnix() (*NetUnix, error) { + fs, err := NewFS(DefaultMountPoint) + if err != nil { + return nil, err + } + + return fs.NewNetUnix() +} + +// NewNetUnix returns data read from /proc/net/unix. +func (fs FS) NewNetUnix() (*NetUnix, error) { + return NewNetUnixByPath(fs.proc.Path("net/unix")) +} + +// NewNetUnixByPath returns data read from /proc/net/unix by file path. +// It might returns an error with partial parsed data, if an error occur after some data parsed. +func NewNetUnixByPath(path string) (*NetUnix, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + return NewNetUnixByReader(f) +} + +// NewNetUnixByReader returns data read from /proc/net/unix by a reader. +// It might returns an error with partial parsed data, if an error occur after some data parsed. +func NewNetUnixByReader(reader io.Reader) (*NetUnix, error) { + nu := &NetUnix{ + Rows: make([]*NetUnixLine, 0, 32), + } + scanner := bufio.NewScanner(reader) + // Omit the header line. 
+ scanner.Scan() + header := scanner.Text() + // From the man page of proc(5), it does not contain an Inode field, + // but in actually it exists. + // This code works for both cases. + hasInode := strings.Contains(header, "Inode") + + minFieldsCnt := netUnixStaticFieldsCnt + if hasInode { + minFieldsCnt++ + } + for scanner.Scan() { + line := scanner.Text() + item, err := nu.parseLine(line, hasInode, minFieldsCnt) + if err != nil { + return nu, err + } + nu.Rows = append(nu.Rows, item) + } + + return nu, scanner.Err() +} + +func (u *NetUnix) parseLine(line string, hasInode bool, minFieldsCnt int) (*NetUnixLine, error) { + fields := strings.Fields(line) + fieldsLen := len(fields) + if fieldsLen < minFieldsCnt { + return nil, fmt.Errorf( + "Parse Unix domain failed: expect at least %d fields but got %d", + minFieldsCnt, fieldsLen) + } + kernelPtr, err := u.parseKernelPtr(fields[netUnixKernelPtrIdx]) + if err != nil { + return nil, fmt.Errorf("Parse Unix domain num(%s) failed: %s", fields[netUnixKernelPtrIdx], err) + } + users, err := u.parseUsers(fields[netUnixRefCountIdx]) + if err != nil { + return nil, fmt.Errorf("Parse Unix domain ref count(%s) failed: %s", fields[netUnixRefCountIdx], err) + } + flags, err := u.parseFlags(fields[netUnixFlagsIdx]) + if err != nil { + return nil, fmt.Errorf("Parse Unix domain flags(%s) failed: %s", fields[netUnixFlagsIdx], err) + } + typ, err := u.parseType(fields[netUnixTypeIdx]) + if err != nil { + return nil, fmt.Errorf("Parse Unix domain type(%s) failed: %s", fields[netUnixTypeIdx], err) + } + state, err := u.parseState(fields[netUnixStateIdx]) + if err != nil { + return nil, fmt.Errorf("Parse Unix domain state(%s) failed: %s", fields[netUnixStateIdx], err) + } + var inode uint64 + if hasInode { + inodeStr := fields[netUnixInodeIdx] + inode, err = u.parseInode(inodeStr) + if err != nil { + return nil, fmt.Errorf("Parse Unix domain inode(%s) failed: %s", inodeStr, err) + } + } + + nuLine := &NetUnixLine{ + KernelPtr: kernelPtr, 
+ RefCount: users, + Type: typ, + Flags: flags, + State: state, + Inode: inode, + } + + // Path field is optional. + if fieldsLen > minFieldsCnt { + pathIdx := netUnixInodeIdx + 1 + if !hasInode { + pathIdx-- + } + nuLine.Path = fields[pathIdx] + } + + return nuLine, nil +} + +func (u NetUnix) parseKernelPtr(str string) (string, error) { + if !strings.HasSuffix(str, ":") { + return "", errInvalidKernelPtrFmt + } + return str[:len(str)-1], nil +} + +func (u NetUnix) parseUsers(hexStr string) (uint64, error) { + return strconv.ParseUint(hexStr, 16, 32) +} + +func (u NetUnix) parseProtocol(hexStr string) (uint64, error) { + return strconv.ParseUint(hexStr, 16, 32) +} + +func (u NetUnix) parseType(hexStr string) (NetUnixType, error) { + typ, err := strconv.ParseUint(hexStr, 16, 16) + if err != nil { + return 0, err + } + return NetUnixType(typ), nil +} + +func (u NetUnix) parseFlags(hexStr string) (NetUnixFlags, error) { + flags, err := strconv.ParseUint(hexStr, 16, 32) + if err != nil { + return 0, err + } + return NetUnixFlags(flags), nil +} + +func (u NetUnix) parseState(hexStr string) (NetUnixState, error) { + st, err := strconv.ParseInt(hexStr, 16, 8) + if err != nil { + return 0, err + } + return NetUnixState(st), nil +} + +func (u NetUnix) parseInode(inodeStr string) (uint64, error) { + return strconv.ParseUint(inodeStr, 10, 64) +} + +func (t NetUnixType) String() string { + switch t { + case netUnixTypeStream: + return "stream" + case netUnixTypeDgram: + return "dgram" + case netUnixTypeSeqpacket: + return "seqpacket" + } + return "unknown" +} + +func (f NetUnixFlags) String() string { + switch f { + case netUnixFlagListen: + return "listen" + default: + return "default" + } +} + +func (s NetUnixState) String() string { + switch s { + case netUnixStateUnconnected: + return "unconnected" + case netUnixStateConnecting: + return "connecting" + case netUnixStateConnected: + return "connected" + case netUnixStateDisconnected: + return "disconnected" + } + return 
"unknown" +} diff --git a/vendor/github.com/prometheus/procfs/proc.go b/vendor/github.com/prometheus/procfs/proc.go index 8e38493a8..8a8430147 100644 --- a/vendor/github.com/prometheus/procfs/proc.go +++ b/vendor/github.com/prometheus/procfs/proc.go @@ -54,7 +54,7 @@ func NewProc(pid int) (Proc, error) { if err != nil { return Proc{}, err } - return fs.NewProc(pid) + return fs.Proc(pid) } // AllProcs returns a list of all currently available processes under /proc. @@ -76,11 +76,18 @@ func (fs FS) Self() (Proc, error) { if err != nil { return Proc{}, err } - return fs.NewProc(pid) + return fs.Proc(pid) } // NewProc returns a process for the given pid. +// +// Deprecated: use fs.Proc() instead func (fs FS) NewProc(pid int) (Proc, error) { + return fs.Proc(pid) +} + +// Proc returns a process for the given pid. +func (fs FS) Proc(pid int) (Proc, error) { if _, err := os.Stat(fs.proc.Path(strconv.Itoa(pid))); err != nil { return Proc{}, err } diff --git a/vendor/github.com/prometheus/procfs/proc_io.go b/vendor/github.com/prometheus/procfs/proc_io.go index 0251c83bf..0ff89b1ce 100644 --- a/vendor/github.com/prometheus/procfs/proc_io.go +++ b/vendor/github.com/prometheus/procfs/proc_io.go @@ -39,8 +39,8 @@ type ProcIO struct { CancelledWriteBytes int64 } -// NewIO creates a new ProcIO instance from a given Proc instance. -func (p Proc) NewIO() (ProcIO, error) { +// IO creates a new ProcIO instance from a given Proc instance. +func (p Proc) IO() (ProcIO, error) { pio := ProcIO{} f, err := os.Open(p.path("io")) diff --git a/vendor/github.com/prometheus/procfs/proc_limits.go b/vendor/github.com/prometheus/procfs/proc_limits.go index f04ba6fda..91ee24df8 100644 --- a/vendor/github.com/prometheus/procfs/proc_limits.go +++ b/vendor/github.com/prometheus/procfs/proc_limits.go @@ -78,7 +78,14 @@ var ( ) // NewLimits returns the current soft limits of the process. 
+// +// Deprecated: use p.Limits() instead func (p Proc) NewLimits() (ProcLimits, error) { + return p.Limits() +} + +// Limits returns the current soft limits of the process. +func (p Proc) Limits() (ProcLimits, error) { f, err := os.Open(p.path("limits")) if err != nil { return ProcLimits{}, err diff --git a/vendor/github.com/prometheus/procfs/proc_ns.go b/vendor/github.com/prometheus/procfs/proc_ns.go index d06c26eba..c66740ff7 100644 --- a/vendor/github.com/prometheus/procfs/proc_ns.go +++ b/vendor/github.com/prometheus/procfs/proc_ns.go @@ -29,9 +29,9 @@ type Namespace struct { // Namespaces contains all of the namespaces that the process is contained in. type Namespaces map[string]Namespace -// NewNamespaces reads from /proc/[pid/ns/* to get the namespaces of which the +// Namespaces reads from /proc//ns/* to get the namespaces of which the // process is a member. -func (p Proc) NewNamespaces() (Namespaces, error) { +func (p Proc) Namespaces() (Namespaces, error) { d, err := os.Open(p.path("ns")) if err != nil { return nil, err diff --git a/vendor/github.com/prometheus/procfs/proc_psi.go b/vendor/github.com/prometheus/procfs/proc_psi.go index a23d4c0f0..46fe26626 100644 --- a/vendor/github.com/prometheus/procfs/proc_psi.go +++ b/vendor/github.com/prometheus/procfs/proc_psi.go @@ -51,19 +51,10 @@ type PSIStats struct { Full *PSILine } -// NewPSIStatsForResource reads pressure stall information for the specified -// resource. At time of writing this can be either "cpu", "memory" or "io". -func NewPSIStatsForResource(resource string) (PSIStats, error) { - fs, err := NewFS(DefaultMountPoint) - if err != nil { - return PSIStats{}, err - } - - return fs.NewPSIStatsForResource(resource) -} - -// NewPSIStatsForResource reads pressure stall information from /proc/pressure/ -func (fs FS) NewPSIStatsForResource(resource string) (PSIStats, error) { +// PSIStatsForResource reads pressure stall information for the specified +// resource from /proc/pressure/. 
At time of writing this can be +// either "cpu", "memory" or "io". +func (fs FS) PSIStatsForResource(resource string) (PSIStats, error) { file, err := os.Open(fs.proc.Path(fmt.Sprintf("%s/%s", "pressure", resource))) if err != nil { return PSIStats{}, fmt.Errorf("psi_stats: unavailable for %s", resource) diff --git a/vendor/github.com/prometheus/procfs/proc_stat.go b/vendor/github.com/prometheus/procfs/proc_stat.go index 4c8b03ced..6ed98a8ae 100644 --- a/vendor/github.com/prometheus/procfs/proc_stat.go +++ b/vendor/github.com/prometheus/procfs/proc_stat.go @@ -105,7 +105,14 @@ type ProcStat struct { } // NewStat returns the current status information of the process. +// +// Deprecated: use NewStat() instead func (p Proc) NewStat() (ProcStat, error) { + return p.Stat() +} + +// Stat returns the current status information of the process. +func (p Proc) Stat() (ProcStat, error) { f, err := os.Open(p.path("stat")) if err != nil { return ProcStat{}, err @@ -178,7 +185,7 @@ func (s ProcStat) ResidentMemory() int { // StartTime returns the unix timestamp of the process in seconds. func (s ProcStat) StartTime() (float64, error) { fs := FS{proc: s.proc} - stat, err := fs.NewStat() + stat, err := fs.Stat() if err != nil { return 0, err } diff --git a/vendor/github.com/prometheus/procfs/proc_status.go b/vendor/github.com/prometheus/procfs/proc_status.go new file mode 100644 index 000000000..6b4b61f71 --- /dev/null +++ b/vendor/github.com/prometheus/procfs/proc_status.go @@ -0,0 +1,162 @@ +// Copyright 2018 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package procfs + +import ( + "bytes" + "io/ioutil" + "os" + "strconv" + "strings" +) + +// ProcStat provides status information about the process, +// read from /proc/[pid]/stat. +type ProcStatus struct { + // The process ID. + PID int + // The process name. + Name string + + // Peak virtual memory size. + VmPeak uint64 + // Virtual memory size. + VmSize uint64 + // Locked memory size. + VmLck uint64 + // Pinned memory size. + VmPin uint64 + // Peak resident set size. + VmHWM uint64 + // Resident set size (sum of RssAnnon RssFile and RssShmem). + VmRSS uint64 + // Size of resident anonymous memory. + RssAnon uint64 + // Size of resident file mappings. + RssFile uint64 + // Size of resident shared memory. + RssShmem uint64 + // Size of data segments. + VmData uint64 + // Size of stack segments. + VmStk uint64 + // Size of text segments. + VmExe uint64 + // Shared library code size. + VmLib uint64 + // Page table entries size. + VmPTE uint64 + // Size of second-level page tables. + VmPMD uint64 + // Swapped-out virtual memory size by anonymous private. + VmSwap uint64 + // Size of hugetlb memory portions + HugetlbPages uint64 + + // Number of voluntary context switches. + VoluntaryCtxtSwitches uint64 + // Number of involuntary context switches. + NonVoluntaryCtxtSwitches uint64 +} + +// NewStatus returns the current status information of the process. 
+func (p Proc) NewStatus() (ProcStatus, error) { + f, err := os.Open(p.path("status")) + if err != nil { + return ProcStatus{}, err + } + defer f.Close() + + data, err := ioutil.ReadAll(f) + if err != nil { + return ProcStatus{}, err + } + + s := ProcStatus{PID: p.PID} + + lines := strings.Split(string(data), "\n") + for _, line := range lines { + if !bytes.Contains([]byte(line), []byte(":")) { + continue + } + + kv := strings.SplitN(line, ":", 2) + + // removes spaces + k := string(strings.TrimSpace(kv[0])) + v := string(strings.TrimSpace(kv[1])) + // removes "kB" + v = string(bytes.Trim([]byte(v), " kB")) + + // value to int when possible + // we can skip error check here, 'cause vKBytes is not used when value is a string + vKBytes, _ := strconv.ParseUint(v, 10, 64) + // convert kB to B + vBytes := vKBytes * 1024 + + s.fillStatus(k, v, vKBytes, vBytes) + } + + return s, nil +} + +func (s *ProcStatus) fillStatus(k string, vString string, vUint uint64, vUintBytes uint64) { + switch k { + case "Name": + s.Name = vString + case "VmPeak": + s.VmPeak = vUintBytes + case "VmSize": + s.VmSize = vUintBytes + case "VmLck": + s.VmLck = vUintBytes + case "VmPin": + s.VmPin = vUintBytes + case "VmHWM": + s.VmHWM = vUintBytes + case "VmRSS": + s.VmRSS = vUintBytes + case "RssAnon": + s.RssAnon = vUintBytes + case "RssFile": + s.RssFile = vUintBytes + case "RssShmem": + s.RssShmem = vUintBytes + case "VmData": + s.VmData = vUintBytes + case "VmStk": + s.VmStk = vUintBytes + case "VmExe": + s.VmExe = vUintBytes + case "VmLib": + s.VmLib = vUintBytes + case "VmPTE": + s.VmPTE = vUintBytes + case "VmPMD": + s.VmPMD = vUintBytes + case "VmSwap": + s.VmSwap = vUintBytes + case "HugetlbPages": + s.HugetlbPages = vUintBytes + case "voluntary_ctxt_switches": + s.VoluntaryCtxtSwitches = vUint + case "nonvoluntary_ctxt_switches": + s.NonVoluntaryCtxtSwitches = vUint + } +} + +// TotalCtxtSwitches returns the total context switch. 
+func (s ProcStatus) TotalCtxtSwitches() uint64 { + return s.VoluntaryCtxtSwitches + s.NonVoluntaryCtxtSwitches +} diff --git a/vendor/github.com/prometheus/procfs/stat.go b/vendor/github.com/prometheus/procfs/stat.go index 44c9af1b0..6661ee03a 100644 --- a/vendor/github.com/prometheus/procfs/stat.go +++ b/vendor/github.com/prometheus/procfs/stat.go @@ -20,6 +20,8 @@ import ( "os" "strconv" "strings" + + "github.com/prometheus/procfs/internal/fs" ) // CPUStat shows how much time the cpu spend in various stages. @@ -78,16 +80,6 @@ type Stat struct { SoftIRQ SoftIRQStat } -// NewStat returns kernel/system statistics read from /proc/stat. -func NewStat() (Stat, error) { - fs, err := NewFS(DefaultMountPoint) - if err != nil { - return Stat{}, err - } - - return fs.NewStat() -} - // Parse a cpu statistics line and returns the CPUStat struct plus the cpu id (or -1 for the overall sum). func parseCPUStat(line string) (CPUStat, int64, error) { cpuStat := CPUStat{} @@ -149,9 +141,29 @@ func parseSoftIRQStat(line string) (SoftIRQStat, uint64, error) { return softIRQStat, total, nil } -// NewStat returns an information about current kernel/system statistics. +// NewStat returns information about current cpu/process statistics. +// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt +// +// Deprecated: use fs.Stat() instead +func NewStat() (Stat, error) { + fs, err := NewFS(fs.DefaultProcMountPoint) + if err != nil { + return Stat{}, err + } + return fs.Stat() +} + +// NewStat returns information about current cpu/process statistics. +// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt +// +// Deprecated: use fs.Stat() instead func (fs FS) NewStat() (Stat, error) { - // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt + return fs.Stat() +} + +// Stat returns information about current cpu/process statistics. 
+// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt +func (fs FS) Stat() (Stat, error) { f, err := os.Open(fs.proc.Path("stat")) if err != nil { diff --git a/vendor/github.com/prometheus/procfs/ttar b/vendor/github.com/prometheus/procfs/ttar index b0171a12b..19ef02b8d 100644 --- a/vendor/github.com/prometheus/procfs/ttar +++ b/vendor/github.com/prometheus/procfs/ttar @@ -86,8 +86,10 @@ Usage: $bname [-C ] -c -f (create archive) $bname [-C ] -x -f (extract archive) Options: - -C (change directory) - -v (verbose) + -C (change directory) + -v (verbose) + --recursive-unlink (recursively delete existing directory if path + collides with file or directory to extract) Example: Change to sysfs directory, create ttar file from fixtures directory $bname -C sysfs -c -f sysfs/fixtures.ttar fixtures/ @@ -111,8 +113,9 @@ function set_cmd { } unset VERBOSE +unset RECURSIVE_UNLINK -while getopts :cf:htxvC: opt; do +while getopts :cf:-:htxvC: opt; do case $opt in c) set_cmd "create" @@ -136,6 +139,18 @@ while getopts :cf:htxvC: opt; do C) CDIR=$OPTARG ;; + -) + case $OPTARG in + recursive-unlink) + RECURSIVE_UNLINK="yes" + ;; + *) + echo -e "Error: invalid option -$OPTARG" + echo + usage 1 + ;; + esac + ;; *) echo >&2 "ERROR: invalid option -$OPTARG" echo @@ -212,16 +227,16 @@ function extract { local eof_without_newline if [ "$size" -gt 0 ]; then if [[ "$line" =~ [^\\]EOF ]]; then - # An EOF not preceeded by a backslash indicates that the line + # An EOF not preceded by a backslash indicates that the line # does not end with a newline eof_without_newline=1 else eof_without_newline=0 fi # Replace NULLBYTE with null byte if at beginning of line - # Replace NULLBYTE with null byte unless preceeded by backslash + # Replace NULLBYTE with null byte unless preceded by backslash # Remove one backslash in front of NULLBYTE (if any) - # Remove EOF unless preceeded by backslash + # Remove EOF unless preceded by backslash # Remove one backslash in front of EOF if [ 
$USE_PYTHON -eq 1 ]; then echo -n "$line" | python -c "$PYTHON_EXTRACT_FILTER" >> "$path" @@ -245,7 +260,16 @@ function extract { fi if [[ $line =~ ^Path:\ (.*)$ ]]; then path=${BASH_REMATCH[1]} - if [ -e "$path" ] || [ -L "$path" ]; then + if [ -L "$path" ]; then + rm "$path" + elif [ -d "$path" ]; then + if [ "${RECURSIVE_UNLINK:-}" == "yes" ]; then + rm -r "$path" + else + # Safe because symlinks to directories are dealt with above + rmdir "$path" + fi + elif [ -e "$path" ]; then rm "$path" fi elif [[ $line =~ ^Lines:\ (.*)$ ]]; then @@ -338,8 +362,8 @@ function _create { else < "$file" \ sed 's/EOF/\\EOF/g; - s/NULLBYTE/\\NULLBYTE/g; - s/\x0/NULLBYTE/g; + s/NULLBYTE/\\NULLBYTE/g; + s/\x0/NULLBYTE/g; ' fi if [[ "$eof_without_newline" -eq 1 ]]; then diff --git a/vendor/github.com/prometheus/prometheus/LICENSE b/vendor/github.com/prometheus/prometheus/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/prometheus/prometheus/NOTICE b/vendor/github.com/prometheus/prometheus/NOTICE new file mode 100644 index 000000000..47de2415e --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/NOTICE @@ -0,0 +1,87 @@ +The Prometheus systems and service monitoring server +Copyright 2012-2015 The Prometheus Authors + +This product includes software developed at +SoundCloud Ltd. (http://soundcloud.com/). + + +The following components are included in this product: + +Bootstrap +http://getbootstrap.com +Copyright 2011-2014 Twitter, Inc. 
+Licensed under the MIT License + +bootstrap3-typeahead.js +https://github.com/bassjobsen/Bootstrap-3-Typeahead +Original written by @mdo and @fat +Copyright 2014 Bass Jobsen @bassjobsen +Licensed under the Apache License, Version 2.0 + +fuzzy +https://github.com/mattyork/fuzzy +Original written by @mattyork +Copyright 2012 Matt York +Licensed under the MIT License + +bootstrap-datetimepicker.js +https://github.com/Eonasdan/bootstrap-datetimepicker +Copyright 2015 Jonathan Peterson (@Eonasdan) +Licensed under the MIT License + +moment.js +https://github.com/moment/moment/ +Copyright JS Foundation and other contributors +Licensed under the MIT License + +Rickshaw +https://github.com/shutterstock/rickshaw +Copyright 2011-2014 by Shutterstock Images, LLC +See https://github.com/shutterstock/rickshaw/blob/master/LICENSE for license details + +mustache.js +https://github.com/janl/mustache.js +Copyright 2009 Chris Wanstrath (Ruby) +Copyright 2010-2014 Jan Lehnardt (JavaScript) +Copyright 2010-2015 The mustache.js community +Licensed under the MIT License + +jQuery +https://jquery.org +Copyright jQuery Foundation and other contributors +Licensed under the MIT License + +Go support for Protocol Buffers - Google's data interchange format +http://github.com/golang/protobuf/ +Copyright 2010 The Go Authors +See source code for license details. + +Go support for leveled logs, analogous to +https://code.google.com/p/google-glog/ +Copyright 2013 Google Inc. +Licensed under the Apache License, Version 2.0 + +Support for streaming Protocol Buffer messages for the Go language (golang). +https://github.com/matttproud/golang_protobuf_extensions +Copyright 2013 Matt T. Proud +Licensed under the Apache License, Version 2.0 + +DNS library in Go +http://miek.nl/posts/2014/Aug/16/go-dns-package/ +Copyright 2009 The Go Authors, 2011 Miek Gieben +See https://github.com/miekg/dns/blob/master/LICENSE for license details. 
+ +LevelDB key/value database in Go +https://github.com/syndtr/goleveldb +Copyright 2012 Suryandaru Triandana +See https://github.com/syndtr/goleveldb/blob/master/LICENSE for license details. + +gosnappy - a fork of code.google.com/p/snappy-go +https://github.com/syndtr/gosnappy +Copyright 2011 The Snappy-Go Authors +See https://github.com/syndtr/gosnappy/blob/master/LICENSE for license details. + +go-zookeeper - Native ZooKeeper client for Go +https://github.com/samuel/go-zookeeper +Copyright (c) 2013, Samuel Stauffer +See https://github.com/samuel/go-zookeeper/blob/master/LICENSE for license details. diff --git a/vendor/github.com/prometheus/prometheus/promql/ast.go b/vendor/github.com/prometheus/prometheus/promql/ast.go new file mode 100644 index 000000000..b3ccd2570 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/promql/ast.go @@ -0,0 +1,317 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package promql + +import ( + "fmt" + "time" + + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/local" + "github.com/prometheus/prometheus/storage/metric" +) + +// Node is a generic interface for all nodes in an AST. +// +// Whenever numerous nodes are listed such as in a switch-case statement +// or a chain of function definitions (e.g. String(), expr(), etc.) convention is +// to list them as follows: +// +// - Statements +// - statement types (alphabetical) +// - ... 
+// - Expressions +// - expression types (alphabetical) +// - ... +// +type Node interface { + // String representation of the node that returns the given node when parsed + // as part of a valid query. + String() string +} + +// Statement is a generic interface for all statements. +type Statement interface { + Node + + // stmt ensures that no other type accidentally implements the interface + stmt() +} + +// Statements is a list of statement nodes that implements Node. +type Statements []Statement + +// AlertStmt represents an added alert rule. +type AlertStmt struct { + Name string + Expr Expr + Duration time.Duration + Labels model.LabelSet + Annotations model.LabelSet +} + +// EvalStmt holds an expression and information on the range it should +// be evaluated on. +type EvalStmt struct { + Expr Expr // Expression to be evaluated. + + // The time boundaries for the evaluation. If Start equals End an instant + // is evaluated. + Start, End model.Time + // Time between two evaluated instants for the range [Start:End]. + Interval time.Duration +} + +// RecordStmt represents an added recording rule. +type RecordStmt struct { + Name string + Expr Expr + Labels model.LabelSet +} + +func (*AlertStmt) stmt() {} +func (*EvalStmt) stmt() {} +func (*RecordStmt) stmt() {} + +// Expr is a generic interface for all expression types. +type Expr interface { + Node + + // Type returns the type the expression evaluates to. It does not perform + // in-depth checks as this is done at parsing-time. + Type() model.ValueType + // expr ensures that no other types accidentally implement the interface. + expr() +} + +// Expressions is a list of expression nodes that implements Node. +type Expressions []Expr + +// AggregateExpr represents an aggregation operation on a vector. +type AggregateExpr struct { + Op itemType // The used aggregation operation. + Expr Expr // The vector expression over which is aggregated. + Param Expr // Parameter used by some aggregators. 
+ Grouping model.LabelNames // The labels by which to group the vector. + Without bool // Whether to drop the given labels rather than keep them. + KeepCommonLabels bool // Whether to keep common labels among result elements. +} + +// BinaryExpr represents a binary expression between two child expressions. +type BinaryExpr struct { + Op itemType // The operation of the expression. + LHS, RHS Expr // The operands on the respective sides of the operator. + + // The matching behavior for the operation if both operands are vectors. + // If they are not this field is nil. + VectorMatching *VectorMatching + + // If a comparison operator, return 0/1 rather than filtering. + ReturnBool bool +} + +// Call represents a function call. +type Call struct { + Func *Function // The function that was called. + Args Expressions // Arguments used in the call. +} + +// MatrixSelector represents a matrix selection. +type MatrixSelector struct { + Name string + Range time.Duration + Offset time.Duration + LabelMatchers metric.LabelMatchers + + // The series iterators are populated at query preparation time. + iterators []local.SeriesIterator +} + +// NumberLiteral represents a number. +type NumberLiteral struct { + Val model.SampleValue +} + +// ParenExpr wraps an expression so it cannot be disassembled as a consequence +// of operator precedence. +type ParenExpr struct { + Expr Expr +} + +// StringLiteral represents a string. +type StringLiteral struct { + Val string +} + +// UnaryExpr represents a unary operation on another expression. +// Currently unary operations are only supported for scalars. +type UnaryExpr struct { + Op itemType + Expr Expr +} + +// VectorSelector represents a vector selection. +type VectorSelector struct { + Name string + Offset time.Duration + LabelMatchers metric.LabelMatchers + + // The series iterators are populated at query preparation time. 
+ iterators []local.SeriesIterator +} + +func (e *AggregateExpr) Type() model.ValueType { return model.ValVector } +func (e *Call) Type() model.ValueType { return e.Func.ReturnType } +func (e *MatrixSelector) Type() model.ValueType { return model.ValMatrix } +func (e *NumberLiteral) Type() model.ValueType { return model.ValScalar } +func (e *ParenExpr) Type() model.ValueType { return e.Expr.Type() } +func (e *StringLiteral) Type() model.ValueType { return model.ValString } +func (e *UnaryExpr) Type() model.ValueType { return e.Expr.Type() } +func (e *VectorSelector) Type() model.ValueType { return model.ValVector } +func (e *BinaryExpr) Type() model.ValueType { + if e.LHS.Type() == model.ValScalar && e.RHS.Type() == model.ValScalar { + return model.ValScalar + } + return model.ValVector +} + +func (*AggregateExpr) expr() {} +func (*BinaryExpr) expr() {} +func (*Call) expr() {} +func (*MatrixSelector) expr() {} +func (*NumberLiteral) expr() {} +func (*ParenExpr) expr() {} +func (*StringLiteral) expr() {} +func (*UnaryExpr) expr() {} +func (*VectorSelector) expr() {} + +// VectorMatchCardinality describes the cardinality relationship +// of two vectors in a binary operation. +type VectorMatchCardinality int + +const ( + CardOneToOne VectorMatchCardinality = iota + CardManyToOne + CardOneToMany + CardManyToMany +) + +func (vmc VectorMatchCardinality) String() string { + switch vmc { + case CardOneToOne: + return "one-to-one" + case CardManyToOne: + return "many-to-one" + case CardOneToMany: + return "one-to-many" + case CardManyToMany: + return "many-to-many" + } + panic("promql.VectorMatchCardinality.String: unknown match cardinality") +} + +// VectorMatching describes how elements from two vectors in a binary +// operation are supposed to be matched. +type VectorMatching struct { + // The cardinality of the two vectors. + Card VectorMatchCardinality + // MatchingLabels contains the labels which define equality of a pair of + // elements from the vectors. 
+ MatchingLabels model.LabelNames + // On includes the given label names from matching, + // rather than excluding them. + On bool + // Include contains additional labels that should be included in + // the result from the side with the lower cardinality. + Include model.LabelNames +} + +// Visitor allows visiting a Node and its child nodes. The Visit method is +// invoked for each node encountered by Walk. If the result visitor w is not +// nil, Walk visits each of the children of node with the visitor w, followed +// by a call of w.Visit(nil). +type Visitor interface { + Visit(node Node) (w Visitor) +} + +// Walk traverses an AST in depth-first order: It starts by calling +// v.Visit(node); node must not be nil. If the visitor w returned by +// v.Visit(node) is not nil, Walk is invoked recursively with visitor +// w for each of the non-nil children of node, followed by a call of +// w.Visit(nil). +func Walk(v Visitor, node Node) { + if v = v.Visit(node); v == nil { + return + } + + switch n := node.(type) { + case Statements: + for _, s := range n { + Walk(v, s) + } + case *AlertStmt: + Walk(v, n.Expr) + + case *EvalStmt: + Walk(v, n.Expr) + + case *RecordStmt: + Walk(v, n.Expr) + + case Expressions: + for _, e := range n { + Walk(v, e) + } + case *AggregateExpr: + Walk(v, n.Expr) + + case *BinaryExpr: + Walk(v, n.LHS) + Walk(v, n.RHS) + + case *Call: + Walk(v, n.Args) + + case *ParenExpr: + Walk(v, n.Expr) + + case *UnaryExpr: + Walk(v, n.Expr) + + case *MatrixSelector, *NumberLiteral, *StringLiteral, *VectorSelector: + // nothing to do + + default: + panic(fmt.Errorf("promql.Walk: unhandled node type %T", node)) + } + + v.Visit(nil) +} + +type inspector func(Node) bool + +func (f inspector) Visit(node Node) Visitor { + if f(node) { + return f + } + return nil +} + +// Inspect traverses an AST in depth-first order: It starts by calling +// f(node); node must not be nil. If f returns true, Inspect invokes f +// for all the non-nil children of node, recursively. 
+func Inspect(node Node, f func(Node) bool) { + Walk(inspector(f), node) +} diff --git a/vendor/github.com/prometheus/prometheus/promql/engine.go b/vendor/github.com/prometheus/prometheus/promql/engine.go new file mode 100644 index 000000000..63c0b9606 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/promql/engine.go @@ -0,0 +1,1436 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package promql + +import ( + "container/heap" + "fmt" + "math" + "runtime" + "sort" + "time" + + opentracing "github.com/opentracing/opentracing-go" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" + "github.com/prometheus/common/model" + "golang.org/x/net/context" + + "github.com/prometheus/prometheus/storage/local" + "github.com/prometheus/prometheus/storage/metric" + "github.com/prometheus/prometheus/util/stats" +) + +const ( + namespace = "prometheus" + subsystem = "engine" + queryTag = "query" + + // The largest SampleValue that can be converted to an int64 without overflow. + maxInt64 model.SampleValue = 9223372036854774784 + // The smallest SampleValue that can be converted to an int64 without underflow. 
+ minInt64 model.SampleValue = -9223372036854775808 +) + +var ( + currentQueries = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "queries", + Help: "The current number of queries being executed or waiting.", + }) + maxConcurrentQueries = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "queries_concurrent_max", + Help: "The max number of concurrent queries.", + }) + queryPrepareTime = prometheus.NewSummary( + prometheus.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "query_duration_seconds", + Help: "Query timings", + ConstLabels: prometheus.Labels{"slice": "prepare_time"}, + }, + ) + queryInnerEval = prometheus.NewSummary( + prometheus.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "query_duration_seconds", + Help: "Query timings", + ConstLabels: prometheus.Labels{"slice": "inner_eval"}, + }, + ) + queryResultAppend = prometheus.NewSummary( + prometheus.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "query_duration_seconds", + Help: "Query timings", + ConstLabels: prometheus.Labels{"slice": "result_append"}, + }, + ) + queryResultSort = prometheus.NewSummary( + prometheus.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "query_duration_seconds", + Help: "Query timings", + ConstLabels: prometheus.Labels{"slice": "result_sort"}, + }, + ) +) + +func init() { + prometheus.MustRegister(currentQueries) + prometheus.MustRegister(maxConcurrentQueries) + prometheus.MustRegister(queryPrepareTime) + prometheus.MustRegister(queryInnerEval) + prometheus.MustRegister(queryResultAppend) + prometheus.MustRegister(queryResultSort) +} + +// convertibleToInt64 returns true if v does not over-/underflow an int64. +func convertibleToInt64(v model.SampleValue) bool { + return v <= maxInt64 && v >= minInt64 +} + +// sampleStream is a stream of Values belonging to an attached COWMetric. 
+type sampleStream struct { + Metric metric.Metric + Values []model.SamplePair +} + +// sample is a single sample belonging to a COWMetric. +type sample struct { + Metric metric.Metric + Value model.SampleValue + Timestamp model.Time +} + +// vector is basically only an alias for model.Samples, but the +// contract is that in a Vector, all Samples have the same timestamp. +type vector []*sample + +func (vector) Type() model.ValueType { return model.ValVector } +func (vec vector) String() string { return vec.value().String() } + +func (vec vector) value() model.Vector { + val := make(model.Vector, len(vec)) + for i, s := range vec { + val[i] = &model.Sample{ + Metric: s.Metric.Copy().Metric, + Value: s.Value, + Timestamp: s.Timestamp, + } + } + return val +} + +// matrix is a slice of SampleStreams that implements sort.Interface and +// has a String method. +type matrix []*sampleStream + +func (matrix) Type() model.ValueType { return model.ValMatrix } +func (mat matrix) String() string { return mat.value().String() } + +func (mat matrix) value() model.Matrix { + val := make(model.Matrix, len(mat)) + for i, ss := range mat { + val[i] = &model.SampleStream{ + Metric: ss.Metric.Copy().Metric, + Values: ss.Values, + } + } + return val +} + +// Result holds the resulting value of an execution or an error +// if any occurred. +type Result struct { + Err error + Value model.Value +} + +// Vector returns a vector if the result value is one. An error is returned if +// the result was an error or the result value is not a vector. +func (r *Result) Vector() (model.Vector, error) { + if r.Err != nil { + return nil, r.Err + } + v, ok := r.Value.(model.Vector) + if !ok { + return nil, fmt.Errorf("query result is not a vector") + } + return v, nil +} + +// Matrix returns a matrix. An error is returned if +// the result was an error or the result value is not a matrix. 
+func (r *Result) Matrix() (model.Matrix, error) { + if r.Err != nil { + return nil, r.Err + } + v, ok := r.Value.(model.Matrix) + if !ok { + return nil, fmt.Errorf("query result is not a range vector") + } + return v, nil +} + +// Scalar returns a scalar value. An error is returned if +// the result was an error or the result value is not a scalar. +func (r *Result) Scalar() (*model.Scalar, error) { + if r.Err != nil { + return nil, r.Err + } + v, ok := r.Value.(*model.Scalar) + if !ok { + return nil, fmt.Errorf("query result is not a scalar") + } + return v, nil +} + +func (r *Result) String() string { + if r.Err != nil { + return r.Err.Error() + } + if r.Value == nil { + return "" + } + return r.Value.String() +} + +type ( + // ErrQueryTimeout is returned if a query timed out during processing. + ErrQueryTimeout string + // ErrQueryCanceled is returned if a query was canceled during processing. + ErrQueryCanceled string + // ErrStorage is returned if an error was encountered in the storage layer + // during query handling. + ErrStorage error +) + +func (e ErrQueryTimeout) Error() string { return fmt.Sprintf("query timed out in %s", string(e)) } +func (e ErrQueryCanceled) Error() string { return fmt.Sprintf("query was canceled in %s", string(e)) } + +// A Query is derived from an a raw query string and can be run against an engine +// it is associated with. +type Query interface { + // Exec processes the query and + Exec(ctx context.Context) *Result + // Statement returns the parsed statement of the query. + Statement() Statement + // Stats returns statistics about the lifetime of the query. + Stats() *stats.TimerGroup + // Cancel signals that a running query execution should be aborted. + Cancel() +} + +// query implements the Query interface. +type query struct { + // The original query string. + q string + // Statement of the parsed query. + stmt Statement + // Timer stats for the query execution. 
+ stats *stats.TimerGroup + // Cancellation function for the query. + cancel func() + + // The engine against which the query is executed. + ng *Engine +} + +// Statement implements the Query interface. +func (q *query) Statement() Statement { + return q.stmt +} + +// Stats implements the Query interface. +func (q *query) Stats() *stats.TimerGroup { + return q.stats +} + +// Cancel implements the Query interface. +func (q *query) Cancel() { + if q.cancel != nil { + q.cancel() + } +} + +// Exec implements the Query interface. +func (q *query) Exec(ctx context.Context) *Result { + if span := opentracing.SpanFromContext(ctx); span != nil { + span.SetTag(queryTag, q.stmt.String()) + } + + res, err := q.ng.exec(ctx, q) + return &Result{Err: err, Value: res} +} + +// contextDone returns an error if the context was canceled or timed out. +func contextDone(ctx context.Context, env string) error { + select { + case <-ctx.Done(): + err := ctx.Err() + switch err { + case context.Canceled: + return ErrQueryCanceled(env) + case context.DeadlineExceeded: + return ErrQueryTimeout(env) + default: + return err + } + default: + return nil + } +} + +// Engine handles the lifetime of queries from beginning to end. +// It is connected to a querier. +type Engine struct { + // A Querier constructor against an underlying storage. + queryable Queryable + // The gate limiting the maximum number of concurrent and waiting queries. + gate *queryGate + options *EngineOptions +} + +// Queryable allows opening a storage querier. +type Queryable interface { + Querier() (local.Querier, error) +} + +// NewEngine returns a new engine. +func NewEngine(queryable Queryable, o *EngineOptions) *Engine { + if o == nil { + o = DefaultEngineOptions + } + maxConcurrentQueries.Set(float64(o.MaxConcurrentQueries)) + return &Engine{ + queryable: queryable, + gate: newQueryGate(o.MaxConcurrentQueries), + options: o, + } +} + +// EngineOptions contains configuration parameters for an Engine. 
+type EngineOptions struct { + MaxConcurrentQueries int + Timeout time.Duration +} + +// DefaultEngineOptions are the default engine options. +var DefaultEngineOptions = &EngineOptions{ + MaxConcurrentQueries: 20, + Timeout: 2 * time.Minute, +} + +// NewInstantQuery returns an evaluation query for the given expression at the given time. +func (ng *Engine) NewInstantQuery(qs string, ts model.Time) (Query, error) { + expr, err := ParseExpr(qs) + if err != nil { + return nil, err + } + qry := ng.newQuery(expr, ts, ts, 0) + qry.q = qs + + return qry, nil +} + +// NewRangeQuery returns an evaluation query for the given time range and with +// the resolution set by the interval. +func (ng *Engine) NewRangeQuery(qs string, start, end model.Time, interval time.Duration) (Query, error) { + expr, err := ParseExpr(qs) + if err != nil { + return nil, err + } + if expr.Type() != model.ValVector && expr.Type() != model.ValScalar { + return nil, fmt.Errorf("invalid expression type %q for range query, must be scalar or instant vector", documentedType(expr.Type())) + } + qry := ng.newQuery(expr, start, end, interval) + qry.q = qs + + return qry, nil +} + +func (ng *Engine) newQuery(expr Expr, start, end model.Time, interval time.Duration) *query { + es := &EvalStmt{ + Expr: expr, + Start: start, + End: end, + Interval: interval, + } + qry := &query{ + stmt: es, + ng: ng, + stats: stats.NewTimerGroup(), + } + return qry +} + +// testStmt is an internal helper statement that allows execution +// of an arbitrary function during handling. It is used to test the Engine. +type testStmt func(context.Context) error + +func (testStmt) String() string { return "test statement" } +func (testStmt) stmt() {} + +func (ng *Engine) newTestQuery(f func(context.Context) error) Query { + qry := &query{ + q: "test statement", + stmt: testStmt(f), + ng: ng, + stats: stats.NewTimerGroup(), + } + return qry +} + +// exec executes the query. +// +// At this point per query only one EvalStmt is evaluated. 
Alert and record +// statements are not handled by the Engine. +func (ng *Engine) exec(ctx context.Context, q *query) (model.Value, error) { + currentQueries.Inc() + defer currentQueries.Dec() + ctx, cancel := context.WithTimeout(ctx, ng.options.Timeout) + q.cancel = cancel + + queueTimer := q.stats.GetTimer(stats.ExecQueueTime).Start() + + if err := ng.gate.Start(ctx); err != nil { + return nil, err + } + defer ng.gate.Done() + + queueTimer.Stop() + + // Cancel when execution is done or an error was raised. + defer q.cancel() + + const env = "query execution" + + evalTimer := q.stats.GetTimer(stats.TotalEvalTime).Start() + defer evalTimer.Stop() + + // The base context might already be canceled on the first iteration (e.g. during shutdown). + if err := contextDone(ctx, env); err != nil { + return nil, err + } + + switch s := q.Statement().(type) { + case *EvalStmt: + return ng.execEvalStmt(ctx, q, s) + case testStmt: + return nil, s(ctx) + } + + panic(fmt.Errorf("promql.Engine.exec: unhandled statement of type %T", q.Statement())) +} + +// execEvalStmt evaluates the expression of an evaluation statement for the given time range. +func (ng *Engine) execEvalStmt(ctx context.Context, query *query, s *EvalStmt) (model.Value, error) { + querier, err := ng.queryable.Querier() + if err != nil { + return nil, err + } + defer querier.Close() + + prepareTimer := query.stats.GetTimer(stats.QueryPreparationTime).Start() + err = ng.populateIterators(ctx, querier, s) + prepareTimer.Stop() + queryPrepareTime.Observe(prepareTimer.ElapsedTime().Seconds()) + + if err != nil { + return nil, err + } + defer ng.closeIterators(s) + + evalTimer := query.stats.GetTimer(stats.InnerEvalTime).Start() + // Instant evaluation. + if s.Start == s.End && s.Interval == 0 { + evaluator := &evaluator{ + Timestamp: s.Start, + ctx: ctx, + } + val, err := evaluator.Eval(s.Expr) + if err != nil { + return nil, err + } + + // Turn matrix and vector types with protected metrics into + // model.* types. 
+ switch v := val.(type) { + case vector: + val = v.value() + case matrix: + val = v.value() + } + + evalTimer.Stop() + queryInnerEval.Observe(evalTimer.ElapsedTime().Seconds()) + + return val, nil + } + numSteps := int(s.End.Sub(s.Start) / s.Interval) + + // Range evaluation. + sampleStreams := map[model.Fingerprint]*sampleStream{} + for ts := s.Start; !ts.After(s.End); ts = ts.Add(s.Interval) { + + if err := contextDone(ctx, "range evaluation"); err != nil { + return nil, err + } + + evaluator := &evaluator{ + Timestamp: ts, + ctx: ctx, + } + val, err := evaluator.Eval(s.Expr) + if err != nil { + return nil, err + } + + switch v := val.(type) { + case *model.Scalar: + // As the expression type does not change we can safely default to 0 + // as the fingerprint for scalar expressions. + ss := sampleStreams[0] + if ss == nil { + ss = &sampleStream{Values: make([]model.SamplePair, 0, numSteps)} + sampleStreams[0] = ss + } + ss.Values = append(ss.Values, model.SamplePair{ + Value: v.Value, + Timestamp: v.Timestamp, + }) + case vector: + for _, sample := range v { + fp := sample.Metric.Metric.Fingerprint() + ss := sampleStreams[fp] + if ss == nil { + ss = &sampleStream{ + Metric: sample.Metric, + Values: make([]model.SamplePair, 0, numSteps), + } + sampleStreams[fp] = ss + } + ss.Values = append(ss.Values, model.SamplePair{ + Value: sample.Value, + Timestamp: sample.Timestamp, + }) + } + default: + panic(fmt.Errorf("promql.Engine.exec: invalid expression type %q", val.Type())) + } + } + evalTimer.Stop() + queryInnerEval.Observe(evalTimer.ElapsedTime().Seconds()) + + if err := contextDone(ctx, "expression evaluation"); err != nil { + return nil, err + } + + appendTimer := query.stats.GetTimer(stats.ResultAppendTime).Start() + mat := matrix{} + for _, ss := range sampleStreams { + mat = append(mat, ss) + } + appendTimer.Stop() + queryResultAppend.Observe(appendTimer.ElapsedTime().Seconds()) + + if err := contextDone(ctx, "expression evaluation"); err != nil { + return 
nil, err + } + + // Turn matrix type with protected metric into model.Matrix. + resMatrix := mat.value() + + sortTimer := query.stats.GetTimer(stats.ResultSortTime).Start() + sort.Sort(resMatrix) + sortTimer.Stop() + queryResultSort.Observe(sortTimer.ElapsedTime().Seconds()) + return resMatrix, nil +} + +func (ng *Engine) populateIterators(ctx context.Context, querier local.Querier, s *EvalStmt) error { + var queryErr error + Inspect(s.Expr, func(node Node) bool { + switch n := node.(type) { + case *VectorSelector: + if s.Start.Equal(s.End) { + n.iterators, queryErr = querier.QueryInstant( + ctx, + s.Start.Add(-n.Offset), + StalenessDelta, + n.LabelMatchers..., + ) + } else { + n.iterators, queryErr = querier.QueryRange( + ctx, + s.Start.Add(-n.Offset-StalenessDelta), + s.End.Add(-n.Offset), + n.LabelMatchers..., + ) + } + if queryErr != nil { + return false + } + case *MatrixSelector: + n.iterators, queryErr = querier.QueryRange( + ctx, + s.Start.Add(-n.Offset-n.Range), + s.End.Add(-n.Offset), + n.LabelMatchers..., + ) + if queryErr != nil { + return false + } + } + return true + }) + return queryErr +} + +func (ng *Engine) closeIterators(s *EvalStmt) { + Inspect(s.Expr, func(node Node) bool { + switch n := node.(type) { + case *VectorSelector: + for _, it := range n.iterators { + it.Close() + } + case *MatrixSelector: + for _, it := range n.iterators { + it.Close() + } + } + return true + }) +} + +// An evaluator evaluates given expressions at a fixed timestamp. It is attached to an +// engine through which it connects to a querier and reports errors. On timeout or +// cancellation of its context it terminates. +type evaluator struct { + ctx context.Context + + Timestamp model.Time +} + +// fatalf causes a panic with the input formatted into an error. +func (ev *evaluator) errorf(format string, args ...interface{}) { + ev.error(fmt.Errorf(format, args...)) +} + +// fatal causes a panic with the given error. 
+func (ev *evaluator) error(err error) { + panic(err) +} + +// recover is the handler that turns panics into returns from the top level of evaluation. +func (ev *evaluator) recover(errp *error) { + e := recover() + if e != nil { + if _, ok := e.(runtime.Error); ok { + // Print the stack trace but do not inhibit the running application. + buf := make([]byte, 64<<10) + buf = buf[:runtime.Stack(buf, false)] + + log.Errorf("parser panic: %v\n%s", e, buf) + *errp = fmt.Errorf("unexpected error") + } else { + *errp = e.(error) + } + } +} + +// evalScalar attempts to evaluate e to a scalar value and errors otherwise. +func (ev *evaluator) evalScalar(e Expr) *model.Scalar { + val := ev.eval(e) + sv, ok := val.(*model.Scalar) + if !ok { + ev.errorf("expected scalar but got %s", documentedType(val.Type())) + } + return sv +} + +// evalVector attempts to evaluate e to a vector value and errors otherwise. +func (ev *evaluator) evalVector(e Expr) vector { + val := ev.eval(e) + vec, ok := val.(vector) + if !ok { + ev.errorf("expected instant vector but got %s", documentedType(val.Type())) + } + return vec +} + +// evalInt attempts to evaluate e into an integer and errors otherwise. +func (ev *evaluator) evalInt(e Expr) int64 { + sc := ev.evalScalar(e) + if !convertibleToInt64(sc.Value) { + ev.errorf("scalar value %v overflows int64", sc.Value) + } + return int64(sc.Value) +} + +// evalFloat attempts to evaluate e into a float and errors otherwise. +func (ev *evaluator) evalFloat(e Expr) float64 { + sc := ev.evalScalar(e) + return float64(sc.Value) +} + +// evalMatrix attempts to evaluate e into a matrix and errors otherwise. +// The error message uses the term "range vector" to match the user facing +// documentation. 
+func (ev *evaluator) evalMatrix(e Expr) matrix { + val := ev.eval(e) + mat, ok := val.(matrix) + if !ok { + ev.errorf("expected range vector but got %s", documentedType(val.Type())) + } + return mat +} + +// evalString attempts to evaluate e to a string value and errors otherwise. +func (ev *evaluator) evalString(e Expr) *model.String { + val := ev.eval(e) + sv, ok := val.(*model.String) + if !ok { + ev.errorf("expected string but got %s", documentedType(val.Type())) + } + return sv +} + +// evalOneOf evaluates e and errors unless the result is of one of the given types. +func (ev *evaluator) evalOneOf(e Expr, t1, t2 model.ValueType) model.Value { + val := ev.eval(e) + if val.Type() != t1 && val.Type() != t2 { + ev.errorf("expected %s or %s but got %s", documentedType(t1), documentedType(t2), documentedType(val.Type())) + } + return val +} + +func (ev *evaluator) Eval(expr Expr) (v model.Value, err error) { + defer ev.recover(&err) + return ev.eval(expr), nil +} + +// eval evaluates the given expression as the given AST expression node requires. +func (ev *evaluator) eval(expr Expr) model.Value { + // This is the top-level evaluation method. + // Thus, we check for timeout/cancellation here. 
+ if err := contextDone(ev.ctx, "expression evaluation"); err != nil { + ev.error(err) + } + + switch e := expr.(type) { + case *AggregateExpr: + vector := ev.evalVector(e.Expr) + return ev.aggregation(e.Op, e.Grouping, e.Without, e.KeepCommonLabels, e.Param, vector) + + case *BinaryExpr: + lhs := ev.evalOneOf(e.LHS, model.ValScalar, model.ValVector) + rhs := ev.evalOneOf(e.RHS, model.ValScalar, model.ValVector) + + switch lt, rt := lhs.Type(), rhs.Type(); { + case lt == model.ValScalar && rt == model.ValScalar: + return &model.Scalar{ + Value: scalarBinop(e.Op, lhs.(*model.Scalar).Value, rhs.(*model.Scalar).Value), + Timestamp: ev.Timestamp, + } + + case lt == model.ValVector && rt == model.ValVector: + switch e.Op { + case itemLAND: + return ev.vectorAnd(lhs.(vector), rhs.(vector), e.VectorMatching) + case itemLOR: + return ev.vectorOr(lhs.(vector), rhs.(vector), e.VectorMatching) + case itemLUnless: + return ev.vectorUnless(lhs.(vector), rhs.(vector), e.VectorMatching) + default: + return ev.vectorBinop(e.Op, lhs.(vector), rhs.(vector), e.VectorMatching, e.ReturnBool) + } + case lt == model.ValVector && rt == model.ValScalar: + return ev.vectorScalarBinop(e.Op, lhs.(vector), rhs.(*model.Scalar), false, e.ReturnBool) + + case lt == model.ValScalar && rt == model.ValVector: + return ev.vectorScalarBinop(e.Op, rhs.(vector), lhs.(*model.Scalar), true, e.ReturnBool) + } + + case *Call: + return e.Func.Call(ev, e.Args) + + case *MatrixSelector: + return ev.matrixSelector(e) + + case *NumberLiteral: + return &model.Scalar{Value: e.Val, Timestamp: ev.Timestamp} + + case *ParenExpr: + return ev.eval(e.Expr) + + case *StringLiteral: + return &model.String{Value: e.Val, Timestamp: ev.Timestamp} + + case *UnaryExpr: + se := ev.evalOneOf(e.Expr, model.ValScalar, model.ValVector) + // Only + and - are possible operators. 
+ if e.Op == itemSUB { + switch v := se.(type) { + case *model.Scalar: + v.Value = -v.Value + case vector: + for i, sv := range v { + v[i].Value = -sv.Value + } + } + } + return se + + case *VectorSelector: + return ev.vectorSelector(e) + } + panic(fmt.Errorf("unhandled expression of type: %T", expr)) +} + +// vectorSelector evaluates a *VectorSelector expression. +func (ev *evaluator) vectorSelector(node *VectorSelector) vector { + vec := vector{} + for _, it := range node.iterators { + refTime := ev.Timestamp.Add(-node.Offset) + samplePair := it.ValueAtOrBeforeTime(refTime) + if samplePair.Timestamp.Before(refTime.Add(-StalenessDelta)) { + continue // Sample outside of staleness policy window. + } + vec = append(vec, &sample{ + Metric: it.Metric(), + Value: samplePair.Value, + Timestamp: ev.Timestamp, + }) + } + return vec +} + +// matrixSelector evaluates a *MatrixSelector expression. +func (ev *evaluator) matrixSelector(node *MatrixSelector) matrix { + interval := metric.Interval{ + OldestInclusive: ev.Timestamp.Add(-node.Range - node.Offset), + NewestInclusive: ev.Timestamp.Add(-node.Offset), + } + + sampleStreams := make([]*sampleStream, 0, len(node.iterators)) + for _, it := range node.iterators { + samplePairs := it.RangeValues(interval) + if len(samplePairs) == 0 { + continue + } + + if node.Offset != 0 { + for _, sp := range samplePairs { + sp.Timestamp = sp.Timestamp.Add(node.Offset) + } + } + + sampleStream := &sampleStream{ + Metric: it.Metric(), + Values: samplePairs, + } + sampleStreams = append(sampleStreams, sampleStream) + } + return matrix(sampleStreams) +} + +func (ev *evaluator) vectorAnd(lhs, rhs vector, matching *VectorMatching) vector { + if matching.Card != CardManyToMany { + panic("set operations must only use many-to-many matching") + } + sigf := signatureFunc(matching.On, matching.MatchingLabels...) + + var result vector + // The set of signatures for the right-hand side vector. 
+ rightSigs := map[uint64]struct{}{} + // Add all rhs samples to a map so we can easily find matches later. + for _, rs := range rhs { + rightSigs[sigf(rs.Metric)] = struct{}{} + } + + for _, ls := range lhs { + // If there's a matching entry in the right-hand side vector, add the sample. + if _, ok := rightSigs[sigf(ls.Metric)]; ok { + result = append(result, ls) + } + } + return result +} + +func (ev *evaluator) vectorOr(lhs, rhs vector, matching *VectorMatching) vector { + if matching.Card != CardManyToMany { + panic("set operations must only use many-to-many matching") + } + sigf := signatureFunc(matching.On, matching.MatchingLabels...) + + var result vector + leftSigs := map[uint64]struct{}{} + // Add everything from the left-hand-side vector. + for _, ls := range lhs { + leftSigs[sigf(ls.Metric)] = struct{}{} + result = append(result, ls) + } + // Add all right-hand side elements which have not been added from the left-hand side. + for _, rs := range rhs { + if _, ok := leftSigs[sigf(rs.Metric)]; !ok { + result = append(result, rs) + } + } + return result +} + +func (ev *evaluator) vectorUnless(lhs, rhs vector, matching *VectorMatching) vector { + if matching.Card != CardManyToMany { + panic("set operations must only use many-to-many matching") + } + sigf := signatureFunc(matching.On, matching.MatchingLabels...) + + rightSigs := map[uint64]struct{}{} + for _, rs := range rhs { + rightSigs[sigf(rs.Metric)] = struct{}{} + } + + var result vector + for _, ls := range lhs { + if _, ok := rightSigs[sigf(ls.Metric)]; !ok { + result = append(result, ls) + } + } + return result +} + +// vectorBinop evaluates a binary operation between two vectors, excluding set operators. 
+func (ev *evaluator) vectorBinop(op itemType, lhs, rhs vector, matching *VectorMatching, returnBool bool) vector { + if matching.Card == CardManyToMany { + panic("many-to-many only allowed for set operators") + } + var ( + result = vector{} + sigf = signatureFunc(matching.On, matching.MatchingLabels...) + ) + + // The control flow below handles one-to-one or many-to-one matching. + // For one-to-many, swap sidedness and account for the swap when calculating + // values. + if matching.Card == CardOneToMany { + lhs, rhs = rhs, lhs + } + + // All samples from the rhs hashed by the matching label/values. + rightSigs := map[uint64]*sample{} + + // Add all rhs samples to a map so we can easily find matches later. + for _, rs := range rhs { + sig := sigf(rs.Metric) + // The rhs is guaranteed to be the 'one' side. Having multiple samples + // with the same signature means that the matching is many-to-many. + if _, found := rightSigs[sig]; found { + // Many-to-many matching not allowed. + ev.errorf("many-to-many matching not allowed: matching labels must be unique on one side") + } + rightSigs[sig] = rs + } + + // Tracks the match-signature. For one-to-one operations the value is nil. For many-to-one + // the value is a set of signatures to detect duplicated result elements. + matchedSigs := map[uint64]map[uint64]struct{}{} + + // For all lhs samples find a respective rhs sample and perform + // the binary operation. + for _, ls := range lhs { + sig := sigf(ls.Metric) + + rs, found := rightSigs[sig] // Look for a match in the rhs vector. + if !found { + continue + } + + // Account for potentially swapped sidedness. 
+ vl, vr := ls.Value, rs.Value + if matching.Card == CardOneToMany { + vl, vr = vr, vl + } + value, keep := vectorElemBinop(op, vl, vr) + if returnBool { + if keep { + value = 1.0 + } else { + value = 0.0 + } + } else if !keep { + continue + } + metric := resultMetric(ls.Metric, rs.Metric, op, matching) + + insertedSigs, exists := matchedSigs[sig] + if matching.Card == CardOneToOne { + if exists { + ev.errorf("multiple matches for labels: many-to-one matching must be explicit (group_left/group_right)") + } + matchedSigs[sig] = nil // Set existence to true. + } else { + // In many-to-one matching the grouping labels have to ensure a unique metric + // for the result vector. Check whether those labels have already been added for + // the same matching labels. + insertSig := uint64(metric.Metric.Fingerprint()) + if !exists { + insertedSigs = map[uint64]struct{}{} + matchedSigs[sig] = insertedSigs + } else if _, duplicate := insertedSigs[insertSig]; duplicate { + ev.errorf("multiple matches for labels: grouping labels must ensure unique matches") + } + insertedSigs[insertSig] = struct{}{} + } + + result = append(result, &sample{ + Metric: metric, + Value: value, + Timestamp: ev.Timestamp, + }) + } + return result +} + +// signatureFunc returns a function that calculates the signature for a metric +// ignoring the provided labels. If on, then the given labels are only used instead. +func signatureFunc(on bool, labels ...model.LabelName) func(m metric.Metric) uint64 { + if !on { + return func(m metric.Metric) uint64 { + tmp := m.Metric.Clone() + for _, l := range labels { + delete(tmp, l) + } + delete(tmp, model.MetricNameLabel) + return uint64(tmp.Fingerprint()) + } + } + return func(m metric.Metric) uint64 { + return model.SignatureForLabels(m.Metric, labels...) + } +} + +// resultMetric returns the metric for the given sample(s) based on the vector +// binary operation and the matching options. 
+func resultMetric(lhs, rhs metric.Metric, op itemType, matching *VectorMatching) metric.Metric { + if shouldDropMetricName(op) { + lhs.Del(model.MetricNameLabel) + } + if !matching.On { + if matching.Card == CardOneToOne { + for _, l := range matching.MatchingLabels { + lhs.Del(l) + } + } + for _, ln := range matching.Include { + // Included labels from the `group_x` modifier are taken from the "one"-side. + value := rhs.Metric[ln] + if value != "" { + lhs.Set(ln, rhs.Metric[ln]) + } else { + lhs.Del(ln) + } + } + return lhs + } + // As we definitely write, creating a new metric is the easiest solution. + m := model.Metric{} + if matching.Card == CardOneToOne { + for _, ln := range matching.MatchingLabels { + if v, ok := lhs.Metric[ln]; ok { + m[ln] = v + } + } + } else { + for k, v := range lhs.Metric { + m[k] = v + } + } + for _, ln := range matching.Include { + // Included labels from the `group_x` modifier are taken from the "one"-side . + if v, ok := rhs.Metric[ln]; ok { + m[ln] = v + } else { + delete(m, ln) + } + } + return metric.Metric{Metric: m, Copied: false} +} + +// vectorScalarBinop evaluates a binary operation between a vector and a scalar. +func (ev *evaluator) vectorScalarBinop(op itemType, lhs vector, rhs *model.Scalar, swap, returnBool bool) vector { + vec := make(vector, 0, len(lhs)) + + for _, lhsSample := range lhs { + lv, rv := lhsSample.Value, rhs.Value + // lhs always contains the vector. If the original position was different + // swap for calculating the value. + if swap { + lv, rv = rv, lv + } + value, keep := vectorElemBinop(op, lv, rv) + if returnBool { + if keep { + value = 1.0 + } else { + value = 0.0 + } + keep = true + } + if keep { + lhsSample.Value = value + if shouldDropMetricName(op) { + lhsSample.Metric.Del(model.MetricNameLabel) + } + vec = append(vec, lhsSample) + } + } + return vec +} + +// scalarBinop evaluates a binary operation between two scalars. 
+func scalarBinop(op itemType, lhs, rhs model.SampleValue) model.SampleValue { + switch op { + case itemADD: + return lhs + rhs + case itemSUB: + return lhs - rhs + case itemMUL: + return lhs * rhs + case itemDIV: + return lhs / rhs + case itemPOW: + return model.SampleValue(math.Pow(float64(lhs), float64(rhs))) + case itemMOD: + return model.SampleValue(math.Mod(float64(lhs), float64(rhs))) + case itemEQL: + return btos(lhs == rhs) + case itemNEQ: + return btos(lhs != rhs) + case itemGTR: + return btos(lhs > rhs) + case itemLSS: + return btos(lhs < rhs) + case itemGTE: + return btos(lhs >= rhs) + case itemLTE: + return btos(lhs <= rhs) + } + panic(fmt.Errorf("operator %q not allowed for scalar operations", op)) +} + +// vectorElemBinop evaluates a binary operation between two vector elements. +func vectorElemBinop(op itemType, lhs, rhs model.SampleValue) (model.SampleValue, bool) { + switch op { + case itemADD: + return lhs + rhs, true + case itemSUB: + return lhs - rhs, true + case itemMUL: + return lhs * rhs, true + case itemDIV: + return lhs / rhs, true + case itemPOW: + return model.SampleValue(math.Pow(float64(lhs), float64(rhs))), true + case itemMOD: + return model.SampleValue(math.Mod(float64(lhs), float64(rhs))), true + case itemEQL: + return lhs, lhs == rhs + case itemNEQ: + return lhs, lhs != rhs + case itemGTR: + return lhs, lhs > rhs + case itemLSS: + return lhs, lhs < rhs + case itemGTE: + return lhs, lhs >= rhs + case itemLTE: + return lhs, lhs <= rhs + } + panic(fmt.Errorf("operator %q not allowed for operations between vectors", op)) +} + +// labelIntersection returns the metric of common label/value pairs of two input metrics. 
+func labelIntersection(metric1, metric2 metric.Metric) metric.Metric { + for label, value := range metric1.Metric { + if metric2.Metric[label] != value { + metric1.Del(label) + } + } + return metric1 +} + +type groupedAggregation struct { + labels metric.Metric + value model.SampleValue + valuesSquaredSum model.SampleValue + groupCount int + heap vectorByValueHeap + reverseHeap vectorByReverseValueHeap +} + +// aggregation evaluates an aggregation operation on a vector. +func (ev *evaluator) aggregation(op itemType, grouping model.LabelNames, without bool, keepCommon bool, param Expr, vec vector) vector { + + result := map[uint64]*groupedAggregation{} + var k int64 + if op == itemTopK || op == itemBottomK { + k = ev.evalInt(param) + if k < 1 { + return vector{} + } + } + var q float64 + if op == itemQuantile { + q = ev.evalFloat(param) + } + var valueLabel model.LabelName + if op == itemCountValues { + valueLabel = model.LabelName(ev.evalString(param).Value) + if !without { + grouping = append(grouping, valueLabel) + } + } + + for _, s := range vec { + withoutMetric := s.Metric + if without { + for _, l := range grouping { + withoutMetric.Del(l) + } + withoutMetric.Del(model.MetricNameLabel) + if op == itemCountValues { + withoutMetric.Set(valueLabel, model.LabelValue(s.Value.String())) + } + } else { + if op == itemCountValues { + s.Metric.Set(valueLabel, model.LabelValue(s.Value.String())) + } + } + + var groupingKey uint64 + if without { + groupingKey = uint64(withoutMetric.Metric.Fingerprint()) + } else { + groupingKey = model.SignatureForLabels(s.Metric.Metric, grouping...) + } + + groupedResult, ok := result[groupingKey] + // Add a new group if it doesn't exist. 
+ if !ok { + var m metric.Metric + if keepCommon { + m = s.Metric + m.Del(model.MetricNameLabel) + } else if without { + m = withoutMetric + } else { + m = metric.Metric{ + Metric: model.Metric{}, + Copied: true, + } + for _, l := range grouping { + if v, ok := s.Metric.Metric[l]; ok { + m.Set(l, v) + } + } + } + result[groupingKey] = &groupedAggregation{ + labels: m, + value: s.Value, + valuesSquaredSum: s.Value * s.Value, + groupCount: 1, + } + if op == itemTopK || op == itemQuantile { + result[groupingKey].heap = make(vectorByValueHeap, 0, k) + heap.Push(&result[groupingKey].heap, &sample{Value: s.Value, Metric: s.Metric}) + } else if op == itemBottomK { + result[groupingKey].reverseHeap = make(vectorByReverseValueHeap, 0, k) + heap.Push(&result[groupingKey].reverseHeap, &sample{Value: s.Value, Metric: s.Metric}) + } + continue + } + // Add the sample to the existing group. + if keepCommon { + groupedResult.labels = labelIntersection(groupedResult.labels, s.Metric) + } + + switch op { + case itemSum: + groupedResult.value += s.Value + case itemAvg: + groupedResult.value += s.Value + groupedResult.groupCount++ + case itemMax: + if groupedResult.value < s.Value || math.IsNaN(float64(groupedResult.value)) { + groupedResult.value = s.Value + } + case itemMin: + if groupedResult.value > s.Value || math.IsNaN(float64(groupedResult.value)) { + groupedResult.value = s.Value + } + case itemCount, itemCountValues: + groupedResult.groupCount++ + case itemStdvar, itemStddev: + groupedResult.value += s.Value + groupedResult.valuesSquaredSum += s.Value * s.Value + groupedResult.groupCount++ + case itemTopK: + if int64(len(groupedResult.heap)) < k || groupedResult.heap[0].Value < s.Value || math.IsNaN(float64(groupedResult.heap[0].Value)) { + if int64(len(groupedResult.heap)) == k { + heap.Pop(&groupedResult.heap) + } + heap.Push(&groupedResult.heap, &sample{Value: s.Value, Metric: s.Metric}) + } + case itemBottomK: + if int64(len(groupedResult.reverseHeap)) < k || 
groupedResult.reverseHeap[0].Value > s.Value || math.IsNaN(float64(groupedResult.reverseHeap[0].Value)) { + if int64(len(groupedResult.reverseHeap)) == k { + heap.Pop(&groupedResult.reverseHeap) + } + heap.Push(&groupedResult.reverseHeap, &sample{Value: s.Value, Metric: s.Metric}) + } + case itemQuantile: + groupedResult.heap = append(groupedResult.heap, s) + default: + panic(fmt.Errorf("expected aggregation operator but got %q", op)) + } + } + + // Construct the result vector from the aggregated groups. + resultVector := make(vector, 0, len(result)) + + for _, aggr := range result { + switch op { + case itemAvg: + aggr.value = aggr.value / model.SampleValue(aggr.groupCount) + case itemCount, itemCountValues: + aggr.value = model.SampleValue(aggr.groupCount) + case itemStdvar: + avg := float64(aggr.value) / float64(aggr.groupCount) + aggr.value = model.SampleValue(float64(aggr.valuesSquaredSum)/float64(aggr.groupCount) - avg*avg) + case itemStddev: + avg := float64(aggr.value) / float64(aggr.groupCount) + aggr.value = model.SampleValue(math.Sqrt(float64(aggr.valuesSquaredSum)/float64(aggr.groupCount) - avg*avg)) + case itemTopK: + // The heap keeps the lowest value on top, so reverse it. + sort.Sort(sort.Reverse(aggr.heap)) + for _, v := range aggr.heap { + resultVector = append(resultVector, &sample{ + Metric: v.Metric, + Value: v.Value, + Timestamp: ev.Timestamp, + }) + } + continue // Bypass default append. + case itemBottomK: + // The heap keeps the lowest value on top, so reverse it. + sort.Sort(sort.Reverse(aggr.reverseHeap)) + for _, v := range aggr.reverseHeap { + resultVector = append(resultVector, &sample{ + Metric: v.Metric, + Value: v.Value, + Timestamp: ev.Timestamp, + }) + } + continue // Bypass default append. + case itemQuantile: + aggr.value = model.SampleValue(quantile(q, aggr.heap)) + default: + // For other aggregations, we already have the right value. 
+ } + sample := &sample{ + Metric: aggr.labels, + Value: aggr.value, + Timestamp: ev.Timestamp, + } + resultVector = append(resultVector, sample) + } + return resultVector +} + +// btos returns 1 if b is true, 0 otherwise. +func btos(b bool) model.SampleValue { + if b { + return 1 + } + return 0 +} + +// shouldDropMetricName returns whether the metric name should be dropped in the +// result of the op operation. +func shouldDropMetricName(op itemType) bool { + switch op { + case itemADD, itemSUB, itemDIV, itemMUL, itemMOD: + return true + default: + return false + } +} + +// StalenessDelta determines the time since the last sample after which a time +// series is considered stale. +var StalenessDelta = 5 * time.Minute + +// A queryGate controls the maximum number of concurrently running and waiting queries. +type queryGate struct { + ch chan struct{} +} + +// newQueryGate returns a query gate that limits the number of queries +// being concurrently executed. +func newQueryGate(length int) *queryGate { + return &queryGate{ + ch: make(chan struct{}, length), + } +} + +// Start blocks until the gate has a free spot or the context is done. +func (g *queryGate) Start(ctx context.Context) error { + select { + case <-ctx.Done(): + return contextDone(ctx, "query queue") + case g.ch <- struct{}{}: + return nil + } +} + +// Done releases a single spot in the gate. +func (g *queryGate) Done() { + select { + case <-g.ch: + default: + panic("engine.queryGate.Done: more operations done than started") + } +} + +// documentedType returns the internal type to the equivalent +// user facing terminology as defined in the documentation. 
+func documentedType(t model.ValueType) string { + switch t.String() { + case "vector": + return "instant vector" + case "matrix": + return "range vector" + default: + return t.String() + } +} diff --git a/vendor/github.com/prometheus/prometheus/promql/functions.go b/vendor/github.com/prometheus/prometheus/promql/functions.go new file mode 100644 index 000000000..d4dd01415 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/promql/functions.go @@ -0,0 +1,1338 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package promql + +import ( + "math" + "regexp" + "sort" + "strconv" + "strings" + "time" + + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/metric" +) + +// Function represents a function of the expression language and is +// used by function nodes. +type Function struct { + Name string + ArgTypes []model.ValueType + Variadic int + ReturnType model.ValueType + Call func(ev *evaluator, args Expressions) model.Value +} + +// === time() model.SampleValue === +func funcTime(ev *evaluator, args Expressions) model.Value { + return &model.Scalar{ + Value: model.SampleValue(ev.Timestamp.Unix()), + Timestamp: ev.Timestamp, + } +} + +// extrapolatedRate is a utility function for rate/increase/delta. 
+// It calculates the rate (allowing for counter resets if isCounter is true), +// extrapolates if the first/last sample is close to the boundary, and returns +// the result as either per-second (if isRate is true) or overall. +func extrapolatedRate(ev *evaluator, arg Expr, isCounter bool, isRate bool) model.Value { + ms := arg.(*MatrixSelector) + + rangeStart := ev.Timestamp.Add(-ms.Range - ms.Offset) + rangeEnd := ev.Timestamp.Add(-ms.Offset) + + resultVector := vector{} + + matrixValue := ev.evalMatrix(ms) + for _, samples := range matrixValue { + // No sense in trying to compute a rate without at least two points. Drop + // this vector element. + if len(samples.Values) < 2 { + continue + } + var ( + counterCorrection model.SampleValue + lastValue model.SampleValue + ) + for _, sample := range samples.Values { + currentValue := sample.Value + if isCounter && currentValue < lastValue { + counterCorrection += lastValue + } + lastValue = currentValue + } + resultValue := lastValue - samples.Values[0].Value + counterCorrection + + // Duration between first/last samples and boundary of range. + durationToStart := samples.Values[0].Timestamp.Sub(rangeStart).Seconds() + durationToEnd := rangeEnd.Sub(samples.Values[len(samples.Values)-1].Timestamp).Seconds() + + sampledInterval := samples.Values[len(samples.Values)-1].Timestamp.Sub(samples.Values[0].Timestamp).Seconds() + averageDurationBetweenSamples := sampledInterval / float64(len(samples.Values)-1) + + if isCounter && resultValue > 0 && samples.Values[0].Value >= 0 { + // Counters cannot be negative. If we have any slope at + // all (i.e. resultValue went up), we can extrapolate + // the zero point of the counter. If the duration to the + // zero point is shorter than the durationToStart, we + // take the zero point as the start of the series, + // thereby avoiding extrapolation to negative counter + // values. 
+ durationToZero := sampledInterval * float64(samples.Values[0].Value/resultValue) + if durationToZero < durationToStart { + durationToStart = durationToZero + } + } + + // If the first/last samples are close to the boundaries of the range, + // extrapolate the result. This is as we expect that another sample + // will exist given the spacing between samples we've seen thus far, + // with an allowance for noise. + extrapolationThreshold := averageDurationBetweenSamples * 1.1 + extrapolateToInterval := sampledInterval + + if durationToStart < extrapolationThreshold { + extrapolateToInterval += durationToStart + } else { + extrapolateToInterval += averageDurationBetweenSamples / 2 + } + if durationToEnd < extrapolationThreshold { + extrapolateToInterval += durationToEnd + } else { + extrapolateToInterval += averageDurationBetweenSamples / 2 + } + resultValue = resultValue * model.SampleValue(extrapolateToInterval/sampledInterval) + if isRate { + resultValue = resultValue / model.SampleValue(ms.Range.Seconds()) + } + + resultSample := &sample{ + Metric: samples.Metric, + Value: resultValue, + Timestamp: ev.Timestamp, + } + resultSample.Metric.Del(model.MetricNameLabel) + resultVector = append(resultVector, resultSample) + } + return resultVector +} + +// === delta(matrix model.ValMatrix) Vector === +func funcDelta(ev *evaluator, args Expressions) model.Value { + return extrapolatedRate(ev, args[0], false, false) +} + +// === rate(node model.ValMatrix) Vector === +func funcRate(ev *evaluator, args Expressions) model.Value { + return extrapolatedRate(ev, args[0], true, true) +} + +// === increase(node model.ValMatrix) Vector === +func funcIncrease(ev *evaluator, args Expressions) model.Value { + return extrapolatedRate(ev, args[0], true, false) +} + +// === irate(node model.ValMatrix) Vector === +func funcIrate(ev *evaluator, args Expressions) model.Value { + return instantValue(ev, args[0], true) +} + +// === idelta(node model.ValMatrix) Vector === +func funcIdelta(ev 
*evaluator, args Expressions) model.Value { + return instantValue(ev, args[0], false) +} + +func instantValue(ev *evaluator, arg Expr, isRate bool) model.Value { + resultVector := vector{} + for _, samples := range ev.evalMatrix(arg) { + // No sense in trying to compute a rate without at least two points. Drop + // this vector element. + if len(samples.Values) < 2 { + continue + } + + lastSample := samples.Values[len(samples.Values)-1] + previousSample := samples.Values[len(samples.Values)-2] + + var resultValue model.SampleValue + if isRate && lastSample.Value < previousSample.Value { + // Counter reset. + resultValue = lastSample.Value + } else { + resultValue = lastSample.Value - previousSample.Value + } + + sampledInterval := lastSample.Timestamp.Sub(previousSample.Timestamp) + if sampledInterval == 0 { + // Avoid dividing by 0. + continue + } + + if isRate { + // Convert to per-second. + resultValue /= model.SampleValue(sampledInterval.Seconds()) + } + + resultSample := &sample{ + Metric: samples.Metric, + Value: resultValue, + Timestamp: ev.Timestamp, + } + resultSample.Metric.Del(model.MetricNameLabel) + resultVector = append(resultVector, resultSample) + } + return resultVector +} + +// Calculate the trend value at the given index i in raw data d. +// This is somewhat analogous to the slope of the trend at the given index. +// The argument "s" is the set of computed smoothed values. +// The argument "b" is the set of computed trend factors. +// The argument "d" is the set of raw input values. +func calcTrendValue(i int, sf, tf float64, s, b, d []float64) float64 { + if i == 0 { + return b[0] + } + + x := tf * (s[i] - s[i-1]) + y := (1 - tf) * b[i-1] + + // Cache the computed value. + b[i] = x + y + + return b[i] +} + +// Holt-Winters is similar to a weighted moving average, where historical data has exponentially less influence on the current data. +// Holt-Winter also accounts for trends in data. 
The smoothing factor (0 < sf < 1) affects how historical data will affect the current +// data. A lower smoothing factor increases the influence of historical data. The trend factor (0 < tf < 1) affects +// how trends in historical data will affect the current data. A higher trend factor increases the influence. +// of trends. Algorithm taken from https://en.wikipedia.org/wiki/Exponential_smoothing titled: "Double exponential smoothing". +func funcHoltWinters(ev *evaluator, args Expressions) model.Value { + mat := ev.evalMatrix(args[0]) + + // The smoothing factor argument. + sf := ev.evalFloat(args[1]) + + // The trend factor argument. + tf := ev.evalFloat(args[2]) + + // Sanity check the input. + if sf <= 0 || sf >= 1 { + ev.errorf("invalid smoothing factor. Expected: 0 < sf < 1 got: %f", sf) + } + if tf <= 0 || tf >= 1 { + ev.errorf("invalid trend factor. Expected: 0 < tf < 1 got: %f", sf) + } + + // Make an output vector large enough to hold the entire result. + resultVector := make(vector, 0, len(mat)) + + // Create scratch values. + var s, b, d []float64 + + var l int + for _, samples := range mat { + l = len(samples.Values) + + // Can't do the smoothing operation with less than two points. + if l < 2 { + continue + } + + // Resize scratch values. + if l != len(s) { + s = make([]float64, l) + b = make([]float64, l) + d = make([]float64, l) + } + + // Fill in the d values with the raw values from the input. + for i, v := range samples.Values { + d[i] = float64(v.Value) + } + + // Set initial values. + s[0] = d[0] + b[0] = d[1] - d[0] + + // Run the smoothing operation. + var x, y float64 + for i := 1; i < len(d); i++ { + + // Scale the raw value against the smoothing factor. + x = sf * d[i] + + // Scale the last smoothed value with the trend at this point. 
+ y = (1 - sf) * (s[i-1] + calcTrendValue(i-1, sf, tf, s, b, d)) + + s[i] = x + y + } + + samples.Metric.Del(model.MetricNameLabel) + resultVector = append(resultVector, &sample{ + Metric: samples.Metric, + Value: model.SampleValue(s[len(s)-1]), // The last value in the vector is the smoothed result. + Timestamp: ev.Timestamp, + }) + } + + return resultVector +} + +// === sort(node model.ValVector) Vector === +func funcSort(ev *evaluator, args Expressions) model.Value { + // NaN should sort to the bottom, so take descending sort with NaN first and + // reverse it. + byValueSorter := vectorByReverseValueHeap(ev.evalVector(args[0])) + sort.Sort(sort.Reverse(byValueSorter)) + return vector(byValueSorter) +} + +// === sortDesc(node model.ValVector) Vector === +func funcSortDesc(ev *evaluator, args Expressions) model.Value { + // NaN should sort to the bottom, so take ascending sort with NaN first and + // reverse it. + byValueSorter := vectorByValueHeap(ev.evalVector(args[0])) + sort.Sort(sort.Reverse(byValueSorter)) + return vector(byValueSorter) +} + +// === clamp_max(vector model.ValVector, max Scalar) Vector === +func funcClampMax(ev *evaluator, args Expressions) model.Value { + vec := ev.evalVector(args[0]) + max := ev.evalFloat(args[1]) + for _, el := range vec { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Min(max, float64(el.Value))) + } + return vec +} + +// === clamp_min(vector model.ValVector, min Scalar) Vector === +func funcClampMin(ev *evaluator, args Expressions) model.Value { + vec := ev.evalVector(args[0]) + min := ev.evalFloat(args[1]) + for _, el := range vec { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Max(min, float64(el.Value))) + } + return vec +} + +// === drop_common_labels(node model.ValVector) Vector === +func funcDropCommonLabels(ev *evaluator, args Expressions) model.Value { + vec := ev.evalVector(args[0]) + if len(vec) < 1 { + return vector{} + } + common := model.LabelSet{} + 
for k, v := range vec[0].Metric.Metric { + // TODO(julius): Should we also drop common metric names? + if k == model.MetricNameLabel { + continue + } + common[k] = v + } + + for _, el := range vec[1:] { + for k, v := range common { + if el.Metric.Metric[k] != v { + // Deletion of map entries while iterating over them is safe. + // From http://golang.org/ref/spec#For_statements: + // "If map entries that have not yet been reached are deleted during + // iteration, the corresponding iteration values will not be produced." + delete(common, k) + } + } + } + + for _, el := range vec { + for k := range el.Metric.Metric { + if _, ok := common[k]; ok { + el.Metric.Del(k) + } + } + } + return vec +} + +// === round(vector model.ValVector, toNearest=1 Scalar) Vector === +func funcRound(ev *evaluator, args Expressions) model.Value { + // round returns a number rounded to toNearest. + // Ties are solved by rounding up. + toNearest := float64(1) + if len(args) >= 2 { + toNearest = ev.evalFloat(args[1]) + } + // Invert as it seems to cause fewer floating point accuracy issues. 
+ toNearestInverse := 1.0 / toNearest + + vec := ev.evalVector(args[0]) + for _, el := range vec { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Floor(float64(el.Value)*toNearestInverse+0.5) / toNearestInverse) + } + return vec +} + +// === scalar(node model.ValVector) Scalar === +func funcScalar(ev *evaluator, args Expressions) model.Value { + v := ev.evalVector(args[0]) + if len(v) != 1 { + return &model.Scalar{ + Value: model.SampleValue(math.NaN()), + Timestamp: ev.Timestamp, + } + } + return &model.Scalar{ + Value: model.SampleValue(v[0].Value), + Timestamp: ev.Timestamp, + } +} + +// === count_scalar(vector model.ValVector) model.SampleValue === +func funcCountScalar(ev *evaluator, args Expressions) model.Value { + return &model.Scalar{ + Value: model.SampleValue(len(ev.evalVector(args[0]))), + Timestamp: ev.Timestamp, + } +} + +func aggrOverTime(ev *evaluator, args Expressions, aggrFn func([]model.SamplePair) model.SampleValue) model.Value { + mat := ev.evalMatrix(args[0]) + resultVector := vector{} + + for _, el := range mat { + if len(el.Values) == 0 { + continue + } + + el.Metric.Del(model.MetricNameLabel) + resultVector = append(resultVector, &sample{ + Metric: el.Metric, + Value: aggrFn(el.Values), + Timestamp: ev.Timestamp, + }) + } + return resultVector +} + +// === avg_over_time(matrix model.ValMatrix) Vector === +func funcAvgOverTime(ev *evaluator, args Expressions) model.Value { + return aggrOverTime(ev, args, func(values []model.SamplePair) model.SampleValue { + var sum model.SampleValue + for _, v := range values { + sum += v.Value + } + return sum / model.SampleValue(len(values)) + }) +} + +// === count_over_time(matrix model.ValMatrix) Vector === +func funcCountOverTime(ev *evaluator, args Expressions) model.Value { + return aggrOverTime(ev, args, func(values []model.SamplePair) model.SampleValue { + return model.SampleValue(len(values)) + }) +} + +// === floor(vector model.ValVector) Vector === +func funcFloor(ev 
*evaluator, args Expressions) model.Value { + vector := ev.evalVector(args[0]) + for _, el := range vector { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Floor(float64(el.Value))) + } + return vector +} + +// === max_over_time(matrix model.ValMatrix) Vector === +func funcMaxOverTime(ev *evaluator, args Expressions) model.Value { + return aggrOverTime(ev, args, func(values []model.SamplePair) model.SampleValue { + max := math.Inf(-1) + for _, v := range values { + max = math.Max(max, float64(v.Value)) + } + return model.SampleValue(max) + }) +} + +// === min_over_time(matrix model.ValMatrix) Vector === +func funcMinOverTime(ev *evaluator, args Expressions) model.Value { + return aggrOverTime(ev, args, func(values []model.SamplePair) model.SampleValue { + min := math.Inf(1) + for _, v := range values { + min = math.Min(min, float64(v.Value)) + } + return model.SampleValue(min) + }) +} + +// === sum_over_time(matrix model.ValMatrix) Vector === +func funcSumOverTime(ev *evaluator, args Expressions) model.Value { + return aggrOverTime(ev, args, func(values []model.SamplePair) model.SampleValue { + var sum model.SampleValue + for _, v := range values { + sum += v.Value + } + return sum + }) +} + +// === quantile_over_time(matrix model.ValMatrix) Vector === +func funcQuantileOverTime(ev *evaluator, args Expressions) model.Value { + q := ev.evalFloat(args[0]) + mat := ev.evalMatrix(args[1]) + resultVector := vector{} + + for _, el := range mat { + if len(el.Values) == 0 { + continue + } + + el.Metric.Del(model.MetricNameLabel) + values := make(vectorByValueHeap, 0, len(el.Values)) + for _, v := range el.Values { + values = append(values, &sample{Value: v.Value}) + } + resultVector = append(resultVector, &sample{ + Metric: el.Metric, + Value: model.SampleValue(quantile(q, values)), + Timestamp: ev.Timestamp, + }) + } + return resultVector +} + +// === stddev_over_time(matrix model.ValMatrix) Vector === +func funcStddevOverTime(ev *evaluator, 
args Expressions) model.Value { + return aggrOverTime(ev, args, func(values []model.SamplePair) model.SampleValue { + var sum, squaredSum, count model.SampleValue + for _, v := range values { + sum += v.Value + squaredSum += v.Value * v.Value + count++ + } + avg := sum / count + return model.SampleValue(math.Sqrt(float64(squaredSum/count - avg*avg))) + }) +} + +// === stdvar_over_time(matrix model.ValMatrix) Vector === +func funcStdvarOverTime(ev *evaluator, args Expressions) model.Value { + return aggrOverTime(ev, args, func(values []model.SamplePair) model.SampleValue { + var sum, squaredSum, count model.SampleValue + for _, v := range values { + sum += v.Value + squaredSum += v.Value * v.Value + count++ + } + avg := sum / count + return squaredSum/count - avg*avg + }) +} + +// === abs(vector model.ValVector) Vector === +func funcAbs(ev *evaluator, args Expressions) model.Value { + vector := ev.evalVector(args[0]) + for _, el := range vector { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Abs(float64(el.Value))) + } + return vector +} + +// === absent(vector model.ValVector) Vector === +func funcAbsent(ev *evaluator, args Expressions) model.Value { + if len(ev.evalVector(args[0])) > 0 { + return vector{} + } + m := model.Metric{} + if vs, ok := args[0].(*VectorSelector); ok { + for _, matcher := range vs.LabelMatchers { + if matcher.Type == metric.Equal && matcher.Name != model.MetricNameLabel { + m[matcher.Name] = matcher.Value + } + } + } + return vector{ + &sample{ + Metric: metric.Metric{ + Metric: m, + Copied: true, + }, + Value: 1, + Timestamp: ev.Timestamp, + }, + } +} + +// === ceil(vector model.ValVector) Vector === +func funcCeil(ev *evaluator, args Expressions) model.Value { + vector := ev.evalVector(args[0]) + for _, el := range vector { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Ceil(float64(el.Value))) + } + return vector +} + +// === exp(vector model.ValVector) Vector === +func funcExp(ev 
*evaluator, args Expressions) model.Value { + vector := ev.evalVector(args[0]) + for _, el := range vector { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Exp(float64(el.Value))) + } + return vector +} + +// === sqrt(vector VectorNode) Vector === +func funcSqrt(ev *evaluator, args Expressions) model.Value { + vector := ev.evalVector(args[0]) + for _, el := range vector { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Sqrt(float64(el.Value))) + } + return vector +} + +// === ln(vector model.ValVector) Vector === +func funcLn(ev *evaluator, args Expressions) model.Value { + vector := ev.evalVector(args[0]) + for _, el := range vector { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Log(float64(el.Value))) + } + return vector +} + +// === log2(vector model.ValVector) Vector === +func funcLog2(ev *evaluator, args Expressions) model.Value { + vector := ev.evalVector(args[0]) + for _, el := range vector { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Log2(float64(el.Value))) + } + return vector +} + +// === log10(vector model.ValVector) Vector === +func funcLog10(ev *evaluator, args Expressions) model.Value { + vector := ev.evalVector(args[0]) + for _, el := range vector { + el.Metric.Del(model.MetricNameLabel) + el.Value = model.SampleValue(math.Log10(float64(el.Value))) + } + return vector +} + +// linearRegression performs a least-square linear regression analysis on the +// provided SamplePairs. It returns the slope, and the intercept value at the +// provided time. 
+func linearRegression(samples []model.SamplePair, interceptTime model.Time) (slope, intercept model.SampleValue) { + var ( + n model.SampleValue + sumX, sumY model.SampleValue + sumXY, sumX2 model.SampleValue + ) + for _, sample := range samples { + x := model.SampleValue( + model.Time(sample.Timestamp-interceptTime).UnixNano(), + ) / 1e9 + n += 1.0 + sumY += sample.Value + sumX += x + sumXY += x * sample.Value + sumX2 += x * x + } + covXY := sumXY - sumX*sumY/n + varX := sumX2 - sumX*sumX/n + + slope = covXY / varX + intercept = sumY/n - slope*sumX/n + return slope, intercept +} + +// === deriv(node model.ValMatrix) Vector === +func funcDeriv(ev *evaluator, args Expressions) model.Value { + mat := ev.evalMatrix(args[0]) + resultVector := make(vector, 0, len(mat)) + + for _, samples := range mat { + // No sense in trying to compute a derivative without at least two points. + // Drop this vector element. + if len(samples.Values) < 2 { + continue + } + // We pass in an arbitrary timestamp that is near the values in use + // to avoid floating point accuracy issues, see + // https://github.com/prometheus/prometheus/issues/2674 + slope, _ := linearRegression(samples.Values, samples.Values[0].Timestamp) + resultSample := &sample{ + Metric: samples.Metric, + Value: slope, + Timestamp: ev.Timestamp, + } + resultSample.Metric.Del(model.MetricNameLabel) + resultVector = append(resultVector, resultSample) + } + return resultVector +} + +// === predict_linear(node model.ValMatrix, k model.ValScalar) Vector === +func funcPredictLinear(ev *evaluator, args Expressions) model.Value { + mat := ev.evalMatrix(args[0]) + resultVector := make(vector, 0, len(mat)) + duration := model.SampleValue(ev.evalFloat(args[1])) + + for _, samples := range mat { + // No sense in trying to predict anything without at least two points. + // Drop this vector element. 
+ if len(samples.Values) < 2 { + continue + } + slope, intercept := linearRegression(samples.Values, ev.Timestamp) + resultSample := &sample{ + Metric: samples.Metric, + Value: slope*duration + intercept, + Timestamp: ev.Timestamp, + } + resultSample.Metric.Del(model.MetricNameLabel) + resultVector = append(resultVector, resultSample) + } + return resultVector +} + +// === histogram_quantile(k model.ValScalar, vector model.ValVector) Vector === +func funcHistogramQuantile(ev *evaluator, args Expressions) model.Value { + q := model.SampleValue(ev.evalFloat(args[0])) + inVec := ev.evalVector(args[1]) + + outVec := vector{} + signatureToMetricWithBuckets := map[uint64]*metricWithBuckets{} + for _, el := range inVec { + upperBound, err := strconv.ParseFloat( + string(el.Metric.Metric[model.BucketLabel]), 64, + ) + if err != nil { + // Oops, no bucket label or malformed label value. Skip. + // TODO(beorn7): Issue a warning somehow. + continue + } + signature := model.SignatureWithoutLabels(el.Metric.Metric, excludedLabels) + mb, ok := signatureToMetricWithBuckets[signature] + if !ok { + el.Metric.Del(model.BucketLabel) + el.Metric.Del(model.MetricNameLabel) + mb = &metricWithBuckets{el.Metric, nil} + signatureToMetricWithBuckets[signature] = mb + } + mb.buckets = append(mb.buckets, bucket{upperBound, el.Value}) + } + + for _, mb := range signatureToMetricWithBuckets { + outVec = append(outVec, &sample{ + Metric: mb.metric, + Value: model.SampleValue(bucketQuantile(q, mb.buckets)), + Timestamp: ev.Timestamp, + }) + } + + return outVec +} + +// === resets(matrix model.ValMatrix) Vector === +func funcResets(ev *evaluator, args Expressions) model.Value { + in := ev.evalMatrix(args[0]) + out := make(vector, 0, len(in)) + + for _, samples := range in { + resets := 0 + prev := model.SampleValue(samples.Values[0].Value) + for _, sample := range samples.Values[1:] { + current := sample.Value + if current < prev { + resets++ + } + prev = current + } + + rs := &sample{ + Metric: 
samples.Metric, + Value: model.SampleValue(resets), + Timestamp: ev.Timestamp, + } + rs.Metric.Del(model.MetricNameLabel) + out = append(out, rs) + } + return out +} + +// === changes(matrix model.ValMatrix) Vector === +func funcChanges(ev *evaluator, args Expressions) model.Value { + in := ev.evalMatrix(args[0]) + out := make(vector, 0, len(in)) + + for _, samples := range in { + changes := 0 + prev := model.SampleValue(samples.Values[0].Value) + for _, sample := range samples.Values[1:] { + current := sample.Value + if current != prev && !(math.IsNaN(float64(current)) && math.IsNaN(float64(prev))) { + changes++ + } + prev = current + } + + rs := &sample{ + Metric: samples.Metric, + Value: model.SampleValue(changes), + Timestamp: ev.Timestamp, + } + rs.Metric.Del(model.MetricNameLabel) + out = append(out, rs) + } + return out +} + +// === label_replace(vector model.ValVector, dst_label, replacement, src_labelname, regex model.ValString) Vector === +func funcLabelReplace(ev *evaluator, args Expressions) model.Value { + var ( + vector = ev.evalVector(args[0]) + dst = model.LabelName(ev.evalString(args[1]).Value) + repl = ev.evalString(args[2]).Value + src = model.LabelName(ev.evalString(args[3]).Value) + regexStr = ev.evalString(args[4]).Value + ) + + regex, err := regexp.Compile("^(?:" + regexStr + ")$") + if err != nil { + ev.errorf("invalid regular expression in label_replace(): %s", regexStr) + } + if !model.LabelNameRE.MatchString(string(dst)) { + ev.errorf("invalid destination label name in label_replace(): %s", dst) + } + + outSet := make(map[model.Fingerprint]struct{}, len(vector)) + for _, el := range vector { + srcVal := string(el.Metric.Metric[src]) + indexes := regex.FindStringSubmatchIndex(srcVal) + // If there is no match, no replacement should take place. 
+ if indexes == nil { + continue + } + res := regex.ExpandString([]byte{}, repl, srcVal, indexes) + if len(res) == 0 { + el.Metric.Del(dst) + } else { + el.Metric.Set(dst, model.LabelValue(res)) + } + + fp := el.Metric.Metric.Fingerprint() + if _, exists := outSet[fp]; exists { + ev.errorf("duplicated label set in output of label_replace(): %s", el.Metric.Metric) + } else { + outSet[fp] = struct{}{} + } + } + + return vector +} + +// === label_join(vector model.ValVector, dest_labelname, separator, src_labelname...) Vector === +func funcLabelJoin(ev *evaluator, args Expressions) model.Value { + var ( + vector = ev.evalVector(args[0]) + dst = model.LabelName(ev.evalString(args[1]).Value) + sep = ev.evalString(args[2]).Value + srcLabels = make([]model.LabelName, len(args)-3) + ) + for i := 3; i < len(args); i++ { + src := model.LabelName(ev.evalString(args[i]).Value) + if !model.LabelNameRE.MatchString(string(src)) { + ev.errorf("invalid source label name in label_join(): %s", src) + } + srcLabels[i-3] = src + } + + if !model.LabelNameRE.MatchString(string(dst)) { + ev.errorf("invalid destination label name in label_join(): %s", dst) + } + + outSet := make(map[model.Fingerprint]struct{}, len(vector)) + for _, el := range vector { + srcVals := make([]string, len(srcLabels)) + for i, src := range srcLabels { + srcVals[i] = string(el.Metric.Metric[src]) + } + + strval := strings.Join(srcVals, sep) + if strval == "" { + el.Metric.Del(dst) + } else { + el.Metric.Set(dst, model.LabelValue(strval)) + } + + fp := el.Metric.Metric.Fingerprint() + if _, exists := outSet[fp]; exists { + ev.errorf("duplicated label set in output of label_join(): %s", el.Metric.Metric) + } else { + outSet[fp] = struct{}{} + } + } + return vector +} + +// === vector(s scalar) Vector === +func funcVector(ev *evaluator, args Expressions) model.Value { + return vector{ + &sample{ + Metric: metric.Metric{}, + Value: model.SampleValue(ev.evalFloat(args[0])), + Timestamp: ev.Timestamp, + }, + } +} + +// 
Common code for date related functions. +func dateWrapper(ev *evaluator, args Expressions, f func(time.Time) model.SampleValue) model.Value { + var v vector + if len(args) == 0 { + v = vector{ + &sample{ + Metric: metric.Metric{}, + Value: model.SampleValue(ev.Timestamp.Unix()), + Timestamp: ev.Timestamp, + }, + } + } else { + v = ev.evalVector(args[0]) + } + for _, el := range v { + el.Metric.Del(model.MetricNameLabel) + t := time.Unix(int64(el.Value), 0).UTC() + el.Value = f(t) + } + return v +} + +// === days_in_month(v vector) scalar === +func funcDaysInMonth(ev *evaluator, args Expressions) model.Value { + return dateWrapper(ev, args, func(t time.Time) model.SampleValue { + return model.SampleValue(32 - time.Date(t.Year(), t.Month(), 32, 0, 0, 0, 0, time.UTC).Day()) + }) +} + +// === day_of_month(v vector) scalar === +func funcDayOfMonth(ev *evaluator, args Expressions) model.Value { + return dateWrapper(ev, args, func(t time.Time) model.SampleValue { + return model.SampleValue(t.Day()) + }) +} + +// === day_of_week(v vector) scalar === +func funcDayOfWeek(ev *evaluator, args Expressions) model.Value { + return dateWrapper(ev, args, func(t time.Time) model.SampleValue { + return model.SampleValue(t.Weekday()) + }) +} + +// === hour(v vector) scalar === +func funcHour(ev *evaluator, args Expressions) model.Value { + return dateWrapper(ev, args, func(t time.Time) model.SampleValue { + return model.SampleValue(t.Hour()) + }) +} + +// === minute(v vector) scalar === +func funcMinute(ev *evaluator, args Expressions) model.Value { + return dateWrapper(ev, args, func(t time.Time) model.SampleValue { + return model.SampleValue(t.Minute()) + }) +} + +// === month(v vector) scalar === +func funcMonth(ev *evaluator, args Expressions) model.Value { + return dateWrapper(ev, args, func(t time.Time) model.SampleValue { + return model.SampleValue(t.Month()) + }) +} + +// === year(v vector) scalar === +func funcYear(ev *evaluator, args Expressions) model.Value { + return 
dateWrapper(ev, args, func(t time.Time) model.SampleValue { + return model.SampleValue(t.Year()) + }) +} + +var functions = map[string]*Function{ + "abs": { + Name: "abs", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcAbs, + }, + "absent": { + Name: "absent", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcAbsent, + }, + "avg_over_time": { + Name: "avg_over_time", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcAvgOverTime, + }, + "ceil": { + Name: "ceil", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcCeil, + }, + "changes": { + Name: "changes", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcChanges, + }, + "clamp_max": { + Name: "clamp_max", + ArgTypes: []model.ValueType{model.ValVector, model.ValScalar}, + ReturnType: model.ValVector, + Call: funcClampMax, + }, + "clamp_min": { + Name: "clamp_min", + ArgTypes: []model.ValueType{model.ValVector, model.ValScalar}, + ReturnType: model.ValVector, + Call: funcClampMin, + }, + "count_over_time": { + Name: "count_over_time", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcCountOverTime, + }, + "count_scalar": { + Name: "count_scalar", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValScalar, + Call: funcCountScalar, + }, + "days_in_month": { + Name: "days_in_month", + ArgTypes: []model.ValueType{model.ValVector}, + Variadic: 1, + ReturnType: model.ValVector, + Call: funcDaysInMonth, + }, + "day_of_month": { + Name: "day_of_month", + ArgTypes: []model.ValueType{model.ValVector}, + Variadic: 1, + ReturnType: model.ValVector, + Call: funcDayOfMonth, + }, + "day_of_week": { + Name: "day_of_week", + ArgTypes: []model.ValueType{model.ValVector}, + Variadic: 1, + ReturnType: model.ValVector, + Call: funcDayOfWeek, + }, + "delta": { + Name: "delta", 
+ ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcDelta, + }, + "deriv": { + Name: "deriv", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcDeriv, + }, + "drop_common_labels": { + Name: "drop_common_labels", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcDropCommonLabels, + }, + "exp": { + Name: "exp", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcExp, + }, + "floor": { + Name: "floor", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcFloor, + }, + "histogram_quantile": { + Name: "histogram_quantile", + ArgTypes: []model.ValueType{model.ValScalar, model.ValVector}, + ReturnType: model.ValVector, + Call: funcHistogramQuantile, + }, + "holt_winters": { + Name: "holt_winters", + ArgTypes: []model.ValueType{model.ValMatrix, model.ValScalar, model.ValScalar}, + ReturnType: model.ValVector, + Call: funcHoltWinters, + }, + "hour": { + Name: "hour", + ArgTypes: []model.ValueType{model.ValVector}, + Variadic: 1, + ReturnType: model.ValVector, + Call: funcHour, + }, + "idelta": { + Name: "idelta", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcIdelta, + }, + "increase": { + Name: "increase", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcIncrease, + }, + "irate": { + Name: "irate", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcIrate, + }, + "label_replace": { + Name: "label_replace", + ArgTypes: []model.ValueType{model.ValVector, model.ValString, model.ValString, model.ValString, model.ValString}, + ReturnType: model.ValVector, + Call: funcLabelReplace, + }, + "label_join": { + Name: "label_join", + ArgTypes: []model.ValueType{model.ValVector, model.ValString, model.ValString, model.ValString}, + Variadic: -1, + ReturnType: 
model.ValVector, + Call: funcLabelJoin, + }, + "ln": { + Name: "ln", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcLn, + }, + "log10": { + Name: "log10", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcLog10, + }, + "log2": { + Name: "log2", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcLog2, + }, + "max_over_time": { + Name: "max_over_time", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcMaxOverTime, + }, + "min_over_time": { + Name: "min_over_time", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcMinOverTime, + }, + "minute": { + Name: "minute", + ArgTypes: []model.ValueType{model.ValVector}, + Variadic: 1, + ReturnType: model.ValVector, + Call: funcMinute, + }, + "month": { + Name: "month", + ArgTypes: []model.ValueType{model.ValVector}, + Variadic: 1, + ReturnType: model.ValVector, + Call: funcMonth, + }, + "predict_linear": { + Name: "predict_linear", + ArgTypes: []model.ValueType{model.ValMatrix, model.ValScalar}, + ReturnType: model.ValVector, + Call: funcPredictLinear, + }, + "quantile_over_time": { + Name: "quantile_over_time", + ArgTypes: []model.ValueType{model.ValScalar, model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcQuantileOverTime, + }, + "rate": { + Name: "rate", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcRate, + }, + "resets": { + Name: "resets", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcResets, + }, + "round": { + Name: "round", + ArgTypes: []model.ValueType{model.ValVector, model.ValScalar}, + Variadic: 1, + ReturnType: model.ValVector, + Call: funcRound, + }, + "scalar": { + Name: "scalar", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValScalar, + Call: funcScalar, + }, + "sort": { + Name: "sort", 
+ ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcSort, + }, + "sort_desc": { + Name: "sort_desc", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcSortDesc, + }, + "sqrt": { + Name: "sqrt", + ArgTypes: []model.ValueType{model.ValVector}, + ReturnType: model.ValVector, + Call: funcSqrt, + }, + "stddev_over_time": { + Name: "stddev_over_time", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcStddevOverTime, + }, + "stdvar_over_time": { + Name: "stdvar_over_time", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcStdvarOverTime, + }, + "sum_over_time": { + Name: "sum_over_time", + ArgTypes: []model.ValueType{model.ValMatrix}, + ReturnType: model.ValVector, + Call: funcSumOverTime, + }, + "time": { + Name: "time", + ArgTypes: []model.ValueType{}, + ReturnType: model.ValScalar, + Call: funcTime, + }, + "vector": { + Name: "vector", + ArgTypes: []model.ValueType{model.ValScalar}, + ReturnType: model.ValVector, + Call: funcVector, + }, + "year": { + Name: "year", + ArgTypes: []model.ValueType{model.ValVector}, + Variadic: 1, + ReturnType: model.ValVector, + Call: funcYear, + }, +} + +// getFunction returns a predefined Function object for the given name. 
+func getFunction(name string) (*Function, bool) { + function, ok := functions[name] + return function, ok +} + +type vectorByValueHeap vector + +func (s vectorByValueHeap) Len() int { + return len(s) +} + +func (s vectorByValueHeap) Less(i, j int) bool { + if math.IsNaN(float64(s[i].Value)) { + return true + } + return s[i].Value < s[j].Value +} + +func (s vectorByValueHeap) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s *vectorByValueHeap) Push(x interface{}) { + *s = append(*s, x.(*sample)) +} + +func (s *vectorByValueHeap) Pop() interface{} { + old := *s + n := len(old) + el := old[n-1] + *s = old[0 : n-1] + return el +} + +type vectorByReverseValueHeap vector + +func (s vectorByReverseValueHeap) Len() int { + return len(s) +} + +func (s vectorByReverseValueHeap) Less(i, j int) bool { + if math.IsNaN(float64(s[i].Value)) { + return true + } + return s[i].Value > s[j].Value +} + +func (s vectorByReverseValueHeap) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s *vectorByReverseValueHeap) Push(x interface{}) { + *s = append(*s, x.(*sample)) +} + +func (s *vectorByReverseValueHeap) Pop() interface{} { + old := *s + n := len(old) + el := old[n-1] + *s = old[0 : n-1] + return el +} diff --git a/vendor/github.com/prometheus/prometheus/promql/fuzz.go b/vendor/github.com/prometheus/prometheus/promql/fuzz.go new file mode 100644 index 000000000..e52ccfb25 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/promql/fuzz.go @@ -0,0 +1,87 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Only build when go-fuzz is in use +// +build gofuzz + +package promql + +// PromQL parser fuzzing instrumentation for use with +// https://github.com/dvyukov/go-fuzz. +// +// Fuzz each parser by building appropriately instrumented parser, ex. +// FuzzParseMetric and execute it with it's +// +// go-fuzz-build -func FuzzParseMetric -o FuzzParseMetric.zip github.com/prometheus/prometheus/promql +// +// And then run the tests with the appropriate inputs +// +// go-fuzz -bin FuzzParseMetric.zip -workdir fuzz-data/ParseMetric +// +// Further input samples should go in the folders fuzz-data/ParseMetric/corpus. +// +// Repeat for ParseMetricSeletion, ParseExpr and ParseStmt. + +// Tuning which value is returned from Fuzz*-functions has a strong influence +// on how quick the fuzzer converges on "interesting" cases. At least try +// switching between fuzzMeh (= included in corpus, but not a priority) and +// fuzzDiscard (=don't use this input for re-building later inputs) when +// experimenting. +const ( + fuzzInteresting = 1 + fuzzMeh = 0 + fuzzDiscard = -1 +) + +// Fuzz the metric parser. +// +// Note that his is not the parser for the text-based exposition-format; that +// lives in github.com/prometheus/client_golang/text. +func FuzzParseMetric(in []byte) int { + _, err := ParseMetric(string(in)) + if err == nil { + return fuzzInteresting + } + + return fuzzMeh +} + +// Fuzz the metric selector parser. +func FuzzParseMetricSelector(in []byte) int { + _, err := ParseMetricSelector(string(in)) + if err == nil { + return fuzzInteresting + } + + return fuzzMeh +} + +// Fuzz the expression parser. +func FuzzParseExpr(in []byte) int { + _, err := ParseExpr(string(in)) + if err == nil { + return fuzzInteresting + } + + return fuzzMeh +} + +// Fuzz the parser. 
+func FuzzParseStmts(in []byte) int { + _, err := ParseStmts(string(in)) + if err == nil { + return fuzzInteresting + } + + return fuzzMeh +} diff --git a/vendor/github.com/prometheus/prometheus/promql/lex.go b/vendor/github.com/prometheus/prometheus/promql/lex.go new file mode 100644 index 000000000..efc0b11e8 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/promql/lex.go @@ -0,0 +1,908 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package promql + +import ( + "fmt" + "strings" + "unicode" + "unicode/utf8" +) + +// item represents a token or text string returned from the scanner. +type item struct { + typ itemType // The type of this item. + pos Pos // The starting position, in bytes, of this item in the input string. + val string // The value of this item. +} + +// String returns a descriptive string for the item. 
+func (i item) String() string { + switch { + case i.typ == itemEOF: + return "EOF" + case i.typ == itemError: + return i.val + case i.typ == itemIdentifier || i.typ == itemMetricIdentifier: + return fmt.Sprintf("%q", i.val) + case i.typ.isKeyword(): + return fmt.Sprintf("<%s>", i.val) + case i.typ.isOperator(): + return fmt.Sprintf("", i.val) + case i.typ.isAggregator(): + return fmt.Sprintf("", i.val) + case len(i.val) > 10: + return fmt.Sprintf("%.10q...", i.val) + } + return fmt.Sprintf("%q", i.val) +} + +// isOperator returns true if the item corresponds to a arithmetic or set operator. +// Returns false otherwise. +func (i itemType) isOperator() bool { return i > operatorsStart && i < operatorsEnd } + +// isAggregator returns true if the item belongs to the aggregator functions. +// Returns false otherwise +func (i itemType) isAggregator() bool { return i > aggregatorsStart && i < aggregatorsEnd } + +// isAggregator returns true if the item is an aggregator that takes a parameter. +// Returns false otherwise +func (i itemType) isAggregatorWithParam() bool { + return i == itemTopK || i == itemBottomK || i == itemCountValues || i == itemQuantile +} + +// isKeyword returns true if the item corresponds to a keyword. +// Returns false otherwise. +func (i itemType) isKeyword() bool { return i > keywordsStart && i < keywordsEnd } + +// isCompairsonOperator returns true if the item corresponds to a comparison operator. +// Returns false otherwise. +func (i itemType) isComparisonOperator() bool { + switch i { + case itemEQL, itemNEQ, itemLTE, itemLSS, itemGTE, itemGTR: + return true + default: + return false + } +} + +// isSetOperator returns whether the item corresponds to a set operator. +func (i itemType) isSetOperator() bool { + switch i { + case itemLAND, itemLOR, itemLUnless: + return true + } + return false +} + +// LowestPrec is a constant for operator precedence in expressions. +const LowestPrec = 0 // Non-operators. 
+ +// Precedence returns the operator precedence of the binary +// operator op. If op is not a binary operator, the result +// is LowestPrec. +func (i itemType) precedence() int { + switch i { + case itemLOR: + return 1 + case itemLAND, itemLUnless: + return 2 + case itemEQL, itemNEQ, itemLTE, itemLSS, itemGTE, itemGTR: + return 3 + case itemADD, itemSUB: + return 4 + case itemMUL, itemDIV, itemMOD: + return 5 + case itemPOW: + return 6 + default: + return LowestPrec + } +} + +func (i itemType) isRightAssociative() bool { + switch i { + case itemPOW: + return true + default: + return false + } + +} + +type itemType int + +const ( + itemError itemType = iota // Error occurred, value is error message + itemEOF + itemComment + itemIdentifier + itemMetricIdentifier + itemLeftParen + itemRightParen + itemLeftBrace + itemRightBrace + itemLeftBracket + itemRightBracket + itemComma + itemAssign + itemSemicolon + itemString + itemNumber + itemDuration + itemBlank + itemTimes + + operatorsStart + // Operators. + itemSUB + itemADD + itemMUL + itemMOD + itemDIV + itemLAND + itemLOR + itemLUnless + itemEQL + itemNEQ + itemLTE + itemLSS + itemGTE + itemGTR + itemEQLRegex + itemNEQRegex + itemPOW + operatorsEnd + + aggregatorsStart + // Aggregators. + itemAvg + itemCount + itemSum + itemMin + itemMax + itemStddev + itemStdvar + itemTopK + itemBottomK + itemCountValues + itemQuantile + aggregatorsEnd + + keywordsStart + // Keywords. + itemAlert + itemIf + itemFor + itemLabels + itemAnnotations + itemKeepCommon + itemOffset + itemBy + itemWithout + itemOn + itemIgnoring + itemGroupLeft + itemGroupRight + itemBool + keywordsEnd +) + +var key = map[string]itemType{ + // Operators. + "and": itemLAND, + "or": itemLOR, + "unless": itemLUnless, + + // Aggregators. 
+ "sum": itemSum, + "avg": itemAvg, + "count": itemCount, + "min": itemMin, + "max": itemMax, + "stddev": itemStddev, + "stdvar": itemStdvar, + "topk": itemTopK, + "bottomk": itemBottomK, + "count_values": itemCountValues, + "quantile": itemQuantile, + + // Keywords. + "alert": itemAlert, + "if": itemIf, + "for": itemFor, + "labels": itemLabels, + "annotations": itemAnnotations, + "offset": itemOffset, + "by": itemBy, + "without": itemWithout, + "keep_common": itemKeepCommon, + "on": itemOn, + "ignoring": itemIgnoring, + "group_left": itemGroupLeft, + "group_right": itemGroupRight, + "bool": itemBool, +} + +// These are the default string representations for common items. It does not +// imply that those are the only character sequences that can be lexed to such an item. +var itemTypeStr = map[itemType]string{ + itemLeftParen: "(", + itemRightParen: ")", + itemLeftBrace: "{", + itemRightBrace: "}", + itemLeftBracket: "[", + itemRightBracket: "]", + itemComma: ",", + itemAssign: "=", + itemSemicolon: ";", + itemBlank: "_", + itemTimes: "x", + + itemSUB: "-", + itemADD: "+", + itemMUL: "*", + itemMOD: "%", + itemDIV: "/", + itemEQL: "==", + itemNEQ: "!=", + itemLTE: "<=", + itemLSS: "<", + itemGTE: ">=", + itemGTR: ">", + itemEQLRegex: "=~", + itemNEQRegex: "!~", + itemPOW: "^", +} + +func init() { + // Add keywords to item type strings. + for s, ty := range key { + itemTypeStr[ty] = s + } + // Special numbers. 
+ key["inf"] = itemNumber + key["nan"] = itemNumber +} + +func (i itemType) String() string { + if s, ok := itemTypeStr[i]; ok { + return s + } + return fmt.Sprintf("", i) +} + +func (i item) desc() string { + if _, ok := itemTypeStr[i.typ]; ok { + return i.String() + } + if i.typ == itemEOF { + return i.typ.desc() + } + return fmt.Sprintf("%s %s", i.typ.desc(), i) +} + +func (i itemType) desc() string { + switch i { + case itemError: + return "error" + case itemEOF: + return "end of input" + case itemComment: + return "comment" + case itemIdentifier: + return "identifier" + case itemMetricIdentifier: + return "metric identifier" + case itemString: + return "string" + case itemNumber: + return "number" + case itemDuration: + return "duration" + } + return fmt.Sprintf("%q", i) +} + +const eof = -1 + +// stateFn represents the state of the scanner as a function that returns the next state. +type stateFn func(*lexer) stateFn + +// Pos is the position in a string. +type Pos int + +// lexer holds the state of the scanner. +type lexer struct { + input string // The string being scanned. + state stateFn // The next lexing function to enter. + pos Pos // Current position in the input. + start Pos // Start position of this item. + width Pos // Width of last rune read from input. + lastPos Pos // Position of most recent item returned by nextItem. + items chan item // Channel of scanned items. + + parenDepth int // Nesting depth of ( ) exprs. + braceOpen bool // Whether a { is opened. + bracketOpen bool // Whether a [ is opened. + stringOpen rune // Quote rune of the string currently being read. + + // seriesDesc is set when a series description for the testing + // language is lexed. + seriesDesc bool +} + +// next returns the next rune in the input. 
+func (l *lexer) next() rune { + if int(l.pos) >= len(l.input) { + l.width = 0 + return eof + } + r, w := utf8.DecodeRuneInString(l.input[l.pos:]) + l.width = Pos(w) + l.pos += l.width + return r +} + +// peek returns but does not consume the next rune in the input. +func (l *lexer) peek() rune { + r := l.next() + l.backup() + return r +} + +// backup steps back one rune. Can only be called once per call of next. +func (l *lexer) backup() { + l.pos -= l.width +} + +// emit passes an item back to the client. +func (l *lexer) emit(t itemType) { + l.items <- item{t, l.start, l.input[l.start:l.pos]} + l.start = l.pos +} + +// ignore skips over the pending input before this point. +func (l *lexer) ignore() { + l.start = l.pos +} + +// accept consumes the next rune if it's from the valid set. +func (l *lexer) accept(valid string) bool { + if strings.ContainsRune(valid, l.next()) { + return true + } + l.backup() + return false +} + +// acceptRun consumes a run of runes from the valid set. +func (l *lexer) acceptRun(valid string) { + for strings.ContainsRune(valid, l.next()) { + // consume + } + l.backup() +} + +// lineNumber reports which line we're on, based on the position of +// the previous item returned by nextItem. Doing it this way +// means we don't have to worry about peek double counting. +func (l *lexer) lineNumber() int { + return 1 + strings.Count(l.input[:l.lastPos], "\n") +} + +// linePosition reports at which character in the current line +// we are on. +func (l *lexer) linePosition() int { + lb := strings.LastIndex(l.input[:l.lastPos], "\n") + if lb == -1 { + return 1 + int(l.lastPos) + } + return 1 + int(l.lastPos) - lb +} + +// errorf returns an error token and terminates the scan by passing +// back a nil pointer that will be the next state, terminating l.nextItem. 
+func (l *lexer) errorf(format string, args ...interface{}) stateFn { + l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)} + return nil +} + +// nextItem returns the next item from the input. +func (l *lexer) nextItem() item { + item := <-l.items + l.lastPos = item.pos + return item +} + +// lex creates a new scanner for the input string. +func lex(input string) *lexer { + l := &lexer{ + input: input, + items: make(chan item), + } + go l.run() + return l +} + +// run runs the state machine for the lexer. +func (l *lexer) run() { + for l.state = lexStatements; l.state != nil; { + l.state = l.state(l) + } + close(l.items) +} + +// lineComment is the character that starts a line comment. +const lineComment = "#" + +// lexStatements is the top-level state for lexing. +func lexStatements(l *lexer) stateFn { + if l.braceOpen { + return lexInsideBraces + } + if strings.HasPrefix(l.input[l.pos:], lineComment) { + return lexLineComment + } + + switch r := l.next(); { + case r == eof: + if l.parenDepth != 0 { + return l.errorf("unclosed left parenthesis") + } else if l.bracketOpen { + return l.errorf("unclosed left bracket") + } + l.emit(itemEOF) + return nil + case r == ',': + l.emit(itemComma) + case isSpace(r): + return lexSpace + case r == '*': + l.emit(itemMUL) + case r == '/': + l.emit(itemDIV) + case r == '%': + l.emit(itemMOD) + case r == '+': + l.emit(itemADD) + case r == '-': + l.emit(itemSUB) + case r == '^': + l.emit(itemPOW) + case r == '=': + if t := l.peek(); t == '=' { + l.next() + l.emit(itemEQL) + } else if t == '~' { + return l.errorf("unexpected character after '=': %q", t) + } else { + l.emit(itemAssign) + } + case r == '!': + if t := l.next(); t == '=' { + l.emit(itemNEQ) + } else { + return l.errorf("unexpected character after '!': %q", t) + } + case r == '<': + if t := l.peek(); t == '=' { + l.next() + l.emit(itemLTE) + } else { + l.emit(itemLSS) + } + case r == '>': + if t := l.peek(); t == '=' { + l.next() + l.emit(itemGTE) + } else { 
+ l.emit(itemGTR) + } + case isDigit(r) || (r == '.' && isDigit(l.peek())): + l.backup() + return lexNumberOrDuration + case r == '"' || r == '\'': + l.stringOpen = r + return lexString + case r == '`': + l.stringOpen = r + return lexRawString + case isAlpha(r) || r == ':': + l.backup() + return lexKeywordOrIdentifier + case r == '(': + l.emit(itemLeftParen) + l.parenDepth++ + return lexStatements + case r == ')': + l.emit(itemRightParen) + l.parenDepth-- + if l.parenDepth < 0 { + return l.errorf("unexpected right parenthesis %q", r) + } + return lexStatements + case r == '{': + l.emit(itemLeftBrace) + l.braceOpen = true + return lexInsideBraces(l) + case r == '[': + if l.bracketOpen { + return l.errorf("unexpected left bracket %q", r) + } + l.emit(itemLeftBracket) + l.bracketOpen = true + return lexDuration + case r == ']': + if !l.bracketOpen { + return l.errorf("unexpected right bracket %q", r) + } + l.emit(itemRightBracket) + l.bracketOpen = false + + default: + return l.errorf("unexpected character: %q", r) + } + return lexStatements +} + +// lexInsideBraces scans the inside of a vector selector. Keywords are ignored and +// scanned as identifiers. +func lexInsideBraces(l *lexer) stateFn { + if strings.HasPrefix(l.input[l.pos:], lineComment) { + return lexLineComment + } + + switch r := l.next(); { + case r == eof: + return l.errorf("unexpected end of input inside braces") + case isSpace(r): + return lexSpace + case isAlpha(r): + l.backup() + return lexIdentifier + case r == ',': + l.emit(itemComma) + case r == '"' || r == '\'': + l.stringOpen = r + return lexString + case r == '`': + l.stringOpen = r + return lexRawString + case r == '=': + if l.next() == '~' { + l.emit(itemEQLRegex) + break + } + l.backup() + l.emit(itemEQL) + case r == '!': + switch nr := l.next(); { + case nr == '~': + l.emit(itemNEQRegex) + case nr == '=': + l.emit(itemNEQ) + default: + return l.errorf("unexpected character after '!' 
inside braces: %q", nr) + } + case r == '{': + return l.errorf("unexpected left brace %q", r) + case r == '}': + l.emit(itemRightBrace) + l.braceOpen = false + + if l.seriesDesc { + return lexValueSequence + } + return lexStatements + default: + return l.errorf("unexpected character inside braces: %q", r) + } + return lexInsideBraces +} + +// lexValueSequence scans a value sequence of a series description. +func lexValueSequence(l *lexer) stateFn { + switch r := l.next(); { + case r == eof: + return lexStatements + case isSpace(r): + lexSpace(l) + case r == '+': + l.emit(itemADD) + case r == '-': + l.emit(itemSUB) + case r == 'x': + l.emit(itemTimes) + case r == '_': + l.emit(itemBlank) + case isDigit(r) || (r == '.' && isDigit(l.peek())): + l.backup() + lexNumber(l) + case isAlpha(r): + l.backup() + // We might lex invalid items here but this will be caught by the parser. + return lexKeywordOrIdentifier + default: + return l.errorf("unexpected character in series sequence: %q", r) + } + return lexValueSequence +} + +// lexEscape scans a string escape sequence. The initial escaping character (\) +// has already been seen. +// +// NOTE: This function as well as the helper function digitVal() and associated +// tests have been adapted from the corresponding functions in the "go/scanner" +// package of the Go standard library to work for Prometheus-style strings. +// None of the actual escaping/quoting logic was changed in this function - it +// was only modified to integrate with our lexer. 
+func lexEscape(l *lexer) { + var n int + var base, max uint32 + + ch := l.next() + switch ch { + case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', l.stringOpen: + return + case '0', '1', '2', '3', '4', '5', '6', '7': + n, base, max = 3, 8, 255 + case 'x': + ch = l.next() + n, base, max = 2, 16, 255 + case 'u': + ch = l.next() + n, base, max = 4, 16, unicode.MaxRune + case 'U': + ch = l.next() + n, base, max = 8, 16, unicode.MaxRune + case eof: + l.errorf("escape sequence not terminated") + default: + l.errorf("unknown escape sequence %#U", ch) + } + + var x uint32 + for n > 0 { + d := uint32(digitVal(ch)) + if d >= base { + if ch == eof { + l.errorf("escape sequence not terminated") + } + l.errorf("illegal character %#U in escape sequence", ch) + } + x = x*base + d + ch = l.next() + n-- + } + + if x > max || 0xD800 <= x && x < 0xE000 { + l.errorf("escape sequence is an invalid Unicode code point") + } +} + +// digitVal returns the digit value of a rune or 16 in case the rune does not +// represent a valid digit. +func digitVal(ch rune) int { + switch { + case '0' <= ch && ch <= '9': + return int(ch - '0') + case 'a' <= ch && ch <= 'f': + return int(ch - 'a' + 10) + case 'A' <= ch && ch <= 'F': + return int(ch - 'A' + 10) + } + return 16 // Larger than any legal digit val. +} + +// lexString scans a quoted string. The initial quote has already been seen. +func lexString(l *lexer) stateFn { +Loop: + for { + switch l.next() { + case '\\': + lexEscape(l) + case utf8.RuneError: + return l.errorf("invalid UTF-8 rune") + case eof, '\n': + return l.errorf("unterminated quoted string") + case l.stringOpen: + break Loop + } + } + l.emit(itemString) + return lexStatements +} + +// lexRawString scans a raw quoted string. The initial quote has already been seen. 
+func lexRawString(l *lexer) stateFn { +Loop: + for { + switch l.next() { + case utf8.RuneError: + return l.errorf("invalid UTF-8 rune") + case eof: + return l.errorf("unterminated raw string") + case l.stringOpen: + break Loop + } + } + l.emit(itemString) + return lexStatements +} + +// lexSpace scans a run of space characters. One space has already been seen. +func lexSpace(l *lexer) stateFn { + for isSpace(l.peek()) { + l.next() + } + l.ignore() + return lexStatements +} + +// lexLineComment scans a line comment. Left comment marker is known to be present. +func lexLineComment(l *lexer) stateFn { + l.pos += Pos(len(lineComment)) + for r := l.next(); !isEndOfLine(r) && r != eof; { + r = l.next() + } + l.backup() + l.emit(itemComment) + return lexStatements +} + +func lexDuration(l *lexer) stateFn { + if l.scanNumber() { + return l.errorf("missing unit character in duration") + } + // Next two chars must be a valid unit and a non-alphanumeric. + if l.accept("smhdwy") { + if isAlphaNumeric(l.next()) { + return l.errorf("bad duration syntax: %q", l.input[l.start:l.pos]) + } + l.backup() + l.emit(itemDuration) + return lexStatements + } + return l.errorf("bad duration syntax: %q", l.input[l.start:l.pos]) +} + +// lexNumber scans a number: decimal, hex, oct or float. +func lexNumber(l *lexer) stateFn { + if !l.scanNumber() { + return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) + } + l.emit(itemNumber) + return lexStatements +} + +// lexNumberOrDuration scans a number or a duration item. +func lexNumberOrDuration(l *lexer) stateFn { + if l.scanNumber() { + l.emit(itemNumber) + return lexStatements + } + // Next two chars must be a valid unit and a non-alphanumeric. 
+ if l.accept("smhdwy") { + if isAlphaNumeric(l.next()) { + return l.errorf("bad number or duration syntax: %q", l.input[l.start:l.pos]) + } + l.backup() + l.emit(itemDuration) + return lexStatements + } + return l.errorf("bad number or duration syntax: %q", l.input[l.start:l.pos]) +} + +// scanNumber scans numbers of different formats. The scanned item is +// not necessarily a valid number. This case is caught by the parser. +func (l *lexer) scanNumber() bool { + digits := "0123456789" + // Disallow hexadecimal in series descriptions as the syntax is ambiguous. + if !l.seriesDesc && l.accept("0") && l.accept("xX") { + digits = "0123456789abcdefABCDEF" + } + l.acceptRun(digits) + if l.accept(".") { + l.acceptRun(digits) + } + if l.accept("eE") { + l.accept("+-") + l.acceptRun("0123456789") + } + // Next thing must not be alphanumeric unless it's the times token + // for series repetitions. + if r := l.peek(); (l.seriesDesc && r == 'x') || !isAlphaNumeric(r) { + return true + } + return false +} + +// lexIdentifier scans an alphanumeric identifier. The next character +// is known to be a letter. +func lexIdentifier(l *lexer) stateFn { + for isAlphaNumeric(l.next()) { + // absorb + } + l.backup() + l.emit(itemIdentifier) + return lexStatements +} + +// lexKeywordOrIdentifier scans an alphanumeric identifier which may contain +// a colon rune. If the identifier is a keyword the respective keyword item +// is scanned. +func lexKeywordOrIdentifier(l *lexer) stateFn { +Loop: + for { + switch r := l.next(); { + case isAlphaNumeric(r) || r == ':': + // absorb. 
+ default: + l.backup() + word := l.input[l.start:l.pos] + if kw, ok := key[strings.ToLower(word)]; ok { + l.emit(kw) + } else if !strings.Contains(word, ":") { + l.emit(itemIdentifier) + } else { + l.emit(itemMetricIdentifier) + } + break Loop + } + } + if l.seriesDesc && l.peek() != '{' { + return lexValueSequence + } + return lexStatements +} + +func isSpace(r rune) bool { + return r == ' ' || r == '\t' || r == '\n' || r == '\r' +} + +// isEndOfLine reports whether r is an end-of-line character. +func isEndOfLine(r rune) bool { + return r == '\r' || r == '\n' +} + +// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore. +func isAlphaNumeric(r rune) bool { + return isAlpha(r) || isDigit(r) +} + +// isDigit reports whether r is a digit. Note: we cannot use unicode.IsDigit() +// instead because that also classifies non-Latin digits as digits. See +// https://github.com/prometheus/prometheus/issues/939. +func isDigit(r rune) bool { + return '0' <= r && r <= '9' +} + +// isAlpha reports whether r is an alphabetic or underscore. +func isAlpha(r rune) bool { + return r == '_' || ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') +} + +// isLabel reports whether the string can be used as label. +func isLabel(s string) bool { + if len(s) == 0 || !isAlpha(rune(s[0])) { + return false + } + for _, c := range s[1:] { + if !isAlphaNumeric(c) { + return false + } + } + return true +} diff --git a/vendor/github.com/prometheus/prometheus/promql/parse.go b/vendor/github.com/prometheus/prometheus/promql/parse.go new file mode 100644 index 000000000..6a0ecc8dd --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/promql/parse.go @@ -0,0 +1,1146 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package promql + +import ( + "fmt" + "runtime" + "strconv" + "strings" + "time" + + "github.com/prometheus/common/log" + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/metric" + "github.com/prometheus/prometheus/util/strutil" +) + +type parser struct { + lex *lexer + token [3]item + peekCount int +} + +// ParseErr wraps a parsing error with line and position context. +// If the parsing input was a single line, line will be 0 and omitted +// from the error string. +type ParseErr struct { + Line, Pos int + Err error +} + +func (e *ParseErr) Error() string { + if e.Line == 0 { + return fmt.Sprintf("parse error at char %d: %s", e.Pos, e.Err) + } + return fmt.Sprintf("parse error at line %d, char %d: %s", e.Line, e.Pos, e.Err) +} + +// ParseStmts parses the input and returns the resulting statements or any occurring error. +func ParseStmts(input string) (Statements, error) { + p := newParser(input) + + stmts, err := p.parseStmts() + if err != nil { + return nil, err + } + err = p.typecheck(stmts) + return stmts, err +} + +// ParseExpr returns the expression parsed from the input. 
+func ParseExpr(input string) (Expr, error) { + p := newParser(input) + + expr, err := p.parseExpr() + if err != nil { + return nil, err + } + err = p.typecheck(expr) + return expr, err +} + +// ParseMetric parses the input into a metric +func ParseMetric(input string) (m model.Metric, err error) { + p := newParser(input) + defer p.recover(&err) + + m = p.metric() + if p.peek().typ != itemEOF { + p.errorf("could not parse remaining input %.15q...", p.lex.input[p.lex.lastPos:]) + } + return m, nil +} + +// ParseMetricSelector parses the provided textual metric selector into a list of +// label matchers. +func ParseMetricSelector(input string) (m metric.LabelMatchers, err error) { + p := newParser(input) + defer p.recover(&err) + + name := "" + if t := p.peek().typ; t == itemMetricIdentifier || t == itemIdentifier { + name = p.next().val + } + vs := p.vectorSelector(name) + if p.peek().typ != itemEOF { + p.errorf("could not parse remaining input %.15q...", p.lex.input[p.lex.lastPos:]) + } + return vs.LabelMatchers, nil +} + +// parseSeriesDesc parses the description of a time series. +func parseSeriesDesc(input string) (model.Metric, []sequenceValue, error) { + p := newParser(input) + p.lex.seriesDesc = true + + return p.parseSeriesDesc() +} + +// newParser returns a new parser. +func newParser(input string) *parser { + p := &parser{ + lex: lex(input), + } + return p +} + +// parseStmts parses a sequence of statements from the input. +func (p *parser) parseStmts() (stmts Statements, err error) { + defer p.recover(&err) + stmts = Statements{} + + for p.peek().typ != itemEOF { + if p.peek().typ == itemComment { + continue + } + stmts = append(stmts, p.stmt()) + } + return +} + +// parseExpr parses a single expression from the input. 
+func (p *parser) parseExpr() (expr Expr, err error) { + defer p.recover(&err) + + for p.peek().typ != itemEOF { + if p.peek().typ == itemComment { + continue + } + if expr != nil { + p.errorf("could not parse remaining input %.15q...", p.lex.input[p.lex.lastPos:]) + } + expr = p.expr() + } + + if expr == nil { + p.errorf("no expression found in input") + } + return +} + +// sequenceValue is an omittable value in a sequence of time series values. +type sequenceValue struct { + value model.SampleValue + omitted bool +} + +func (v sequenceValue) String() string { + if v.omitted { + return "_" + } + return v.value.String() +} + +// parseSeriesDesc parses a description of a time series into its metric and value sequence. +func (p *parser) parseSeriesDesc() (m model.Metric, vals []sequenceValue, err error) { + defer p.recover(&err) + + m = p.metric() + + const ctx = "series values" + for { + if p.peek().typ == itemEOF { + break + } + + // Extract blanks. + if p.peek().typ == itemBlank { + p.next() + times := uint64(1) + if p.peek().typ == itemTimes { + p.next() + times, err = strconv.ParseUint(p.expect(itemNumber, ctx).val, 10, 64) + if err != nil { + p.errorf("invalid repetition in %s: %s", ctx, err) + } + } + for i := uint64(0); i < times; i++ { + vals = append(vals, sequenceValue{omitted: true}) + } + continue + } + + // Extract values. + sign := 1.0 + if t := p.peek().typ; t == itemSUB || t == itemADD { + if p.next().typ == itemSUB { + sign = -1 + } + } + k := sign * p.number(p.expect(itemNumber, ctx).val) + vals = append(vals, sequenceValue{ + value: model.SampleValue(k), + }) + + // If there are no offset repetitions specified, proceed with the next value. + if t := p.peek().typ; t == itemNumber || t == itemBlank { + continue + } else if t == itemEOF { + break + } else if t != itemADD && t != itemSUB { + p.errorf("expected next value or relative expansion in %s but got %s", ctx, t.desc()) + } + + // Expand the repeated offsets into values. 
+ sign = 1.0 + if p.next().typ == itemSUB { + sign = -1.0 + } + offset := sign * p.number(p.expect(itemNumber, ctx).val) + p.expect(itemTimes, ctx) + + times, err := strconv.ParseUint(p.expect(itemNumber, ctx).val, 10, 64) + if err != nil { + p.errorf("invalid repetition in %s: %s", ctx, err) + } + + for i := uint64(0); i < times; i++ { + k += offset + vals = append(vals, sequenceValue{ + value: model.SampleValue(k), + }) + } + } + return m, vals, nil +} + +// typecheck checks correct typing of the parsed statements or expression. +func (p *parser) typecheck(node Node) (err error) { + defer p.recover(&err) + + p.checkType(node) + return nil +} + +// next returns the next token. +func (p *parser) next() item { + if p.peekCount > 0 { + p.peekCount-- + } else { + t := p.lex.nextItem() + // Skip comments. + for t.typ == itemComment { + t = p.lex.nextItem() + } + p.token[0] = t + } + if p.token[p.peekCount].typ == itemError { + p.errorf("%s", p.token[p.peekCount].val) + } + return p.token[p.peekCount] +} + +// peek returns but does not consume the next token. +func (p *parser) peek() item { + if p.peekCount > 0 { + return p.token[p.peekCount-1] + } + p.peekCount = 1 + + t := p.lex.nextItem() + // Skip comments. + for t.typ == itemComment { + t = p.lex.nextItem() + } + p.token[0] = t + return p.token[0] +} + +// backup backs the input stream up one token. +func (p *parser) backup() { + p.peekCount++ +} + +// errorf formats the error and terminates processing. +func (p *parser) errorf(format string, args ...interface{}) { + p.error(fmt.Errorf(format, args...)) +} + +// error terminates processing. +func (p *parser) error(err error) { + perr := &ParseErr{ + Line: p.lex.lineNumber(), + Pos: p.lex.linePosition(), + Err: err, + } + if strings.Count(strings.TrimSpace(p.lex.input), "\n") == 0 { + perr.Line = 0 + } + panic(perr) +} + +// expect consumes the next token and guarantees it has the required type. 
+func (p *parser) expect(exp itemType, context string) item { + token := p.next() + if token.typ != exp { + p.errorf("unexpected %s in %s, expected %s", token.desc(), context, exp.desc()) + } + return token +} + +// expectOneOf consumes the next token and guarantees it has one of the required types. +func (p *parser) expectOneOf(exp1, exp2 itemType, context string) item { + token := p.next() + if token.typ != exp1 && token.typ != exp2 { + p.errorf("unexpected %s in %s, expected %s or %s", token.desc(), context, exp1.desc(), exp2.desc()) + } + return token +} + +var errUnexpected = fmt.Errorf("unexpected error") + +// recover is the handler that turns panics into returns from the top level of Parse. +func (p *parser) recover(errp *error) { + e := recover() + if e != nil { + if _, ok := e.(runtime.Error); ok { + // Print the stack trace but do not inhibit the running application. + buf := make([]byte, 64<<10) + buf = buf[:runtime.Stack(buf, false)] + + log.Errorf("parser panic: %v\n%s", e, buf) + *errp = errUnexpected + } else { + *errp = e.(error) + } + } + return +} + +// stmt parses any statement. +// +// alertStatement | recordStatement +// +func (p *parser) stmt() Statement { + switch tok := p.peek(); tok.typ { + case itemAlert: + return p.alertStmt() + case itemIdentifier, itemMetricIdentifier: + return p.recordStmt() + } + p.errorf("no valid statement detected") + return nil +} + +// alertStmt parses an alert rule. +// +// ALERT name IF expr [FOR duration] +// [LABELS label_set] +// [ANNOTATIONS label_set] +// +func (p *parser) alertStmt() *AlertStmt { + const ctx = "alert statement" + + p.expect(itemAlert, ctx) + name := p.expect(itemIdentifier, ctx) + // Alerts require a vector typed expression. + p.expect(itemIf, ctx) + expr := p.expr() + + // Optional for clause. 
+ var ( + duration time.Duration + err error + ) + if p.peek().typ == itemFor { + p.next() + dur := p.expect(itemDuration, ctx) + duration, err = parseDuration(dur.val) + if err != nil { + p.error(err) + } + } + + var ( + labels = model.LabelSet{} + annotations = model.LabelSet{} + ) + if p.peek().typ == itemLabels { + p.expect(itemLabels, ctx) + labels = p.labelSet() + } + if p.peek().typ == itemAnnotations { + p.expect(itemAnnotations, ctx) + annotations = p.labelSet() + } + + return &AlertStmt{ + Name: name.val, + Expr: expr, + Duration: duration, + Labels: labels, + Annotations: annotations, + } +} + +// recordStmt parses a recording rule. +func (p *parser) recordStmt() *RecordStmt { + const ctx = "record statement" + + name := p.expectOneOf(itemIdentifier, itemMetricIdentifier, ctx).val + + var lset model.LabelSet + if p.peek().typ == itemLeftBrace { + lset = p.labelSet() + } + + p.expect(itemAssign, ctx) + expr := p.expr() + + return &RecordStmt{ + Name: name, + Labels: lset, + Expr: expr, + } +} + +// expr parses any expression. +func (p *parser) expr() Expr { + // Parse the starting expression. + expr := p.unaryExpr() + + // Loop through the operations and construct a binary operation tree based + // on the operators' precedence. + for { + // If the next token is not an operator the expression is done. + op := p.peek().typ + if !op.isOperator() { + return expr + } + p.next() // Consume operator. + + // Parse optional operator matching options. Its validity + // is checked in the type-checking stage. + vecMatching := &VectorMatching{ + Card: CardOneToOne, + } + if op.isSetOperator() { + vecMatching.Card = CardManyToMany + } + + returnBool := false + // Parse bool modifier. + if p.peek().typ == itemBool { + if !op.isComparisonOperator() { + p.errorf("bool modifier can only be used on comparison operators") + } + p.next() + returnBool = true + } + + // Parse ON/IGNORING clause. 
+ if p.peek().typ == itemOn || p.peek().typ == itemIgnoring { + if p.peek().typ == itemOn { + vecMatching.On = true + } + p.next() + vecMatching.MatchingLabels = p.labels() + + // Parse grouping. + if t := p.peek().typ; t == itemGroupLeft || t == itemGroupRight { + p.next() + if t == itemGroupLeft { + vecMatching.Card = CardManyToOne + } else { + vecMatching.Card = CardOneToMany + } + if p.peek().typ == itemLeftParen { + vecMatching.Include = p.labels() + } + } + } + + for _, ln := range vecMatching.MatchingLabels { + for _, ln2 := range vecMatching.Include { + if ln == ln2 && vecMatching.On { + p.errorf("label %q must not occur in ON and GROUP clause at once", ln) + } + } + } + + // Parse the next operand. + rhs := p.unaryExpr() + + // Assign the new root based on the precedence of the LHS and RHS operators. + expr = p.balance(expr, op, rhs, vecMatching, returnBool) + } +} + +func (p *parser) balance(lhs Expr, op itemType, rhs Expr, vecMatching *VectorMatching, returnBool bool) *BinaryExpr { + if lhsBE, ok := lhs.(*BinaryExpr); ok { + precd := lhsBE.Op.precedence() - op.precedence() + if (precd < 0) || (precd == 0 && op.isRightAssociative()) { + balanced := p.balance(lhsBE.RHS, op, rhs, vecMatching, returnBool) + if lhsBE.Op.isComparisonOperator() && !lhsBE.ReturnBool && balanced.Type() == model.ValScalar && lhsBE.LHS.Type() == model.ValScalar { + p.errorf("comparisons between scalars must use BOOL modifier") + } + return &BinaryExpr{ + Op: lhsBE.Op, + LHS: lhsBE.LHS, + RHS: balanced, + VectorMatching: lhsBE.VectorMatching, + ReturnBool: lhsBE.ReturnBool, + } + } + } + if op.isComparisonOperator() && !returnBool && rhs.Type() == model.ValScalar && lhs.Type() == model.ValScalar { + p.errorf("comparisons between scalars must use BOOL modifier") + } + return &BinaryExpr{ + Op: op, + LHS: lhs, + RHS: rhs, + VectorMatching: vecMatching, + ReturnBool: returnBool, + } +} + +// unaryExpr parses a unary expression. 
+// +// | | (+|-) | '(' ')' +// +func (p *parser) unaryExpr() Expr { + switch t := p.peek(); t.typ { + case itemADD, itemSUB: + p.next() + e := p.unaryExpr() + + // Simplify unary expressions for number literals. + if nl, ok := e.(*NumberLiteral); ok { + if t.typ == itemSUB { + nl.Val *= -1 + } + return nl + } + return &UnaryExpr{Op: t.typ, Expr: e} + + case itemLeftParen: + p.next() + e := p.expr() + p.expect(itemRightParen, "paren expression") + + return &ParenExpr{Expr: e} + } + e := p.primaryExpr() + + // Expression might be followed by a range selector. + if p.peek().typ == itemLeftBracket { + vs, ok := e.(*VectorSelector) + if !ok { + p.errorf("range specification must be preceded by a metric selector, but follows a %T instead", e) + } + e = p.rangeSelector(vs) + } + + // Parse optional offset. + if p.peek().typ == itemOffset { + offset := p.offset() + + switch s := e.(type) { + case *VectorSelector: + s.Offset = offset + case *MatrixSelector: + s.Offset = offset + default: + p.errorf("offset modifier must be preceded by an instant or range selector, but follows a %T instead", e) + } + } + + return e +} + +// rangeSelector parses a matrix (a.k.a. range) selector based on a given +// vector selector. +// +// '[' ']' +// +func (p *parser) rangeSelector(vs *VectorSelector) *MatrixSelector { + const ctx = "range selector" + p.next() + + var erange time.Duration + var err error + + erangeStr := p.expect(itemDuration, ctx).val + erange, err = parseDuration(erangeStr) + if err != nil { + p.error(err) + } + + p.expect(itemRightBracket, ctx) + + e := &MatrixSelector{ + Name: vs.Name, + LabelMatchers: vs.LabelMatchers, + Range: erange, + } + return e +} + +// number parses a number. 
+func (p *parser) number(val string) float64 { + n, err := strconv.ParseInt(val, 0, 64) + f := float64(n) + if err != nil { + f, err = strconv.ParseFloat(val, 64) + } + if err != nil { + p.errorf("error parsing number: %s", err) + } + return f +} + +// primaryExpr parses a primary expression. +// +// | | | +// +func (p *parser) primaryExpr() Expr { + switch t := p.next(); { + case t.typ == itemNumber: + f := p.number(t.val) + return &NumberLiteral{model.SampleValue(f)} + + case t.typ == itemString: + return &StringLiteral{p.unquoteString(t.val)} + + case t.typ == itemLeftBrace: + // Metric selector without metric name. + p.backup() + return p.vectorSelector("") + + case t.typ == itemIdentifier: + // Check for function call. + if p.peek().typ == itemLeftParen { + return p.call(t.val) + } + fallthrough // Else metric selector. + + case t.typ == itemMetricIdentifier: + return p.vectorSelector(t.val) + + case t.typ.isAggregator(): + p.backup() + return p.aggrExpr() + + default: + p.errorf("no valid expression found") + } + return nil +} + +// labels parses a list of labelnames. +// +// '(' , ... ')' +// +func (p *parser) labels() model.LabelNames { + const ctx = "grouping opts" + + p.expect(itemLeftParen, ctx) + + labels := model.LabelNames{} + if p.peek().typ != itemRightParen { + for { + id := p.next() + if !isLabel(id.val) { + p.errorf("unexpected %s in %s, expected label", id.desc(), ctx) + } + labels = append(labels, model.LabelName(id.val)) + + if p.peek().typ != itemComma { + break + } + p.next() + } + } + p.expect(itemRightParen, ctx) + + return labels +} + +// aggrExpr parses an aggregation expression. 
+// +// () [by ] [keep_common] +// [by ] [keep_common] () +// +func (p *parser) aggrExpr() *AggregateExpr { + const ctx = "aggregation" + + agop := p.next() + if !agop.typ.isAggregator() { + p.errorf("expected aggregation operator but got %s", agop) + } + var grouping model.LabelNames + var keepCommon, without bool + + modifiersFirst := false + + if t := p.peek().typ; t == itemBy || t == itemWithout { + if t == itemWithout { + without = true + } + p.next() + grouping = p.labels() + modifiersFirst = true + } + if p.peek().typ == itemKeepCommon { + p.next() + keepCommon = true + modifiersFirst = true + } + + p.expect(itemLeftParen, ctx) + var param Expr + if agop.typ.isAggregatorWithParam() { + param = p.expr() + p.expect(itemComma, ctx) + } + e := p.expr() + p.expect(itemRightParen, ctx) + + if !modifiersFirst { + if t := p.peek().typ; t == itemBy || t == itemWithout { + if len(grouping) > 0 { + p.errorf("aggregation must only contain one grouping clause") + } + if t == itemWithout { + without = true + } + p.next() + grouping = p.labels() + } + if p.peek().typ == itemKeepCommon { + p.next() + keepCommon = true + } + } + + if keepCommon && without { + p.errorf("cannot use 'keep_common' with 'without'") + } + + return &AggregateExpr{ + Op: agop.typ, + Expr: e, + Param: param, + Grouping: grouping, + Without: without, + KeepCommonLabels: keepCommon, + } +} + +// call parses a function call. +// +// '(' [ , ...] ')' +// +func (p *parser) call(name string) *Call { + const ctx = "function call" + + fn, exist := getFunction(name) + if !exist { + p.errorf("unknown function with name %q", name) + } + + p.expect(itemLeftParen, ctx) + // Might be call without args. + if p.peek().typ == itemRightParen { + p.next() // Consume. + return &Call{fn, nil} + } + + var args []Expr + for { + e := p.expr() + args = append(args, e) + + // Terminate if no more arguments. + if p.peek().typ != itemComma { + break + } + p.next() + } + + // Call must be closed. 
+ p.expect(itemRightParen, ctx) + + return &Call{Func: fn, Args: args} +} + +// labelSet parses a set of label matchers +// +// '{' [ '=' , ... ] '}' +// +func (p *parser) labelSet() model.LabelSet { + set := model.LabelSet{} + for _, lm := range p.labelMatchers(itemEQL) { + set[lm.Name] = lm.Value + } + return set +} + +// labelMatchers parses a set of label matchers. +// +// '{' [ , ... ] '}' +// +func (p *parser) labelMatchers(operators ...itemType) metric.LabelMatchers { + const ctx = "label matching" + + matchers := metric.LabelMatchers{} + + p.expect(itemLeftBrace, ctx) + + // Check if no matchers are provided. + if p.peek().typ == itemRightBrace { + p.next() + return matchers + } + + for { + label := p.expect(itemIdentifier, ctx) + + op := p.next().typ + if !op.isOperator() { + p.errorf("expected label matching operator but got %s", op) + } + var validOp = false + for _, allowedOp := range operators { + if op == allowedOp { + validOp = true + } + } + if !validOp { + p.errorf("operator must be one of %q, is %q", operators, op) + } + + val := p.unquoteString(p.expect(itemString, ctx).val) + + // Map the item to the respective match type. + var matchType metric.MatchType + switch op { + case itemEQL: + matchType = metric.Equal + case itemNEQ: + matchType = metric.NotEqual + case itemEQLRegex: + matchType = metric.RegexMatch + case itemNEQRegex: + matchType = metric.RegexNoMatch + default: + p.errorf("item %q is not a metric match type", op) + } + + m, err := metric.NewLabelMatcher( + matchType, + model.LabelName(label.val), + model.LabelValue(val), + ) + if err != nil { + p.error(err) + } + + matchers = append(matchers, m) + + if p.peek().typ == itemIdentifier { + p.errorf("missing comma before next identifier %q", p.peek().val) + } + + // Terminate list if last matcher. + if p.peek().typ != itemComma { + break + } + p.next() + + // Allow comma after each item in a multi-line listing. 
+ if p.peek().typ == itemRightBrace { + break + } + } + + p.expect(itemRightBrace, ctx) + + return matchers +} + +// metric parses a metric. +// +// +// [] +// +func (p *parser) metric() model.Metric { + name := "" + m := model.Metric{} + + t := p.peek().typ + if t == itemIdentifier || t == itemMetricIdentifier { + name = p.next().val + t = p.peek().typ + } + if t != itemLeftBrace && name == "" { + p.errorf("missing metric name or metric selector") + } + if t == itemLeftBrace { + m = model.Metric(p.labelSet()) + } + if name != "" { + m[model.MetricNameLabel] = model.LabelValue(name) + } + return m +} + +// offset parses an offset modifier. +// +// offset +// +func (p *parser) offset() time.Duration { + const ctx = "offset" + + p.next() + offi := p.expect(itemDuration, ctx) + + offset, err := parseDuration(offi.val) + if err != nil { + p.error(err) + } + + return offset +} + +// vectorSelector parses a new (instant) vector selector. +// +// [] +// [] +// +func (p *parser) vectorSelector(name string) *VectorSelector { + var matchers metric.LabelMatchers + // Parse label matching if any. + if t := p.peek(); t.typ == itemLeftBrace { + matchers = p.labelMatchers(itemEQL, itemNEQ, itemEQLRegex, itemNEQRegex) + } + // Metric name must not be set in the label matchers and before at the same time. + if name != "" { + for _, m := range matchers { + if m.Name == model.MetricNameLabel { + p.errorf("metric name must not be set twice: %q or %q", name, m.Value) + } + } + // Set name label matching. + m, err := metric.NewLabelMatcher(metric.Equal, model.MetricNameLabel, model.LabelValue(name)) + if err != nil { + panic(err) // Must not happen with metric.Equal. + } + matchers = append(matchers, m) + } + + if len(matchers) == 0 { + p.errorf("vector selector must contain label matchers or metric name") + } + // A vector selector must contain at least one non-empty matcher to prevent + // implicit selection of all metrics (e.g. by a typo). 
+ notEmpty := false + for _, lm := range matchers { + if !lm.MatchesEmptyString() { + notEmpty = true + break + } + } + if !notEmpty { + p.errorf("vector selector must contain at least one non-empty matcher") + } + + return &VectorSelector{ + Name: name, + LabelMatchers: matchers, + } +} + +// expectType checks the type of the node and raises an error if it +// is not of the expected type. +func (p *parser) expectType(node Node, want model.ValueType, context string) { + t := p.checkType(node) + if t != want { + p.errorf("expected type %s in %s, got %s", documentedType(want), context, documentedType(t)) + } +} + +// check the types of the children of each node and raise an error +// if they do not form a valid node. +// +// Some of these checks are redundant as the the parsing stage does not allow +// them, but the costs are small and might reveal errors when making changes. +func (p *parser) checkType(node Node) (typ model.ValueType) { + // For expressions the type is determined by their Type function. + // Statements and lists do not have a type but are not invalid either. + switch n := node.(type) { + case Statements, Expressions, Statement: + typ = model.ValNone + case Expr: + typ = n.Type() + default: + p.errorf("unknown node type: %T", node) + } + + // Recursively check correct typing for child nodes and raise + // errors in case of bad typing. 
+ switch n := node.(type) { + case Statements: + for _, s := range n { + p.expectType(s, model.ValNone, "statement list") + } + case *AlertStmt: + p.expectType(n.Expr, model.ValVector, "alert statement") + + case *EvalStmt: + ty := p.checkType(n.Expr) + if ty == model.ValNone { + p.errorf("evaluation statement must have a valid expression type but got %s", documentedType(ty)) + } + + case *RecordStmt: + ty := p.checkType(n.Expr) + if ty != model.ValVector && ty != model.ValScalar { + p.errorf("record statement must have a valid expression of type instant vector or scalar but got %s", documentedType(ty)) + } + + case Expressions: + for _, e := range n { + ty := p.checkType(e) + if ty == model.ValNone { + p.errorf("expression must have a valid expression type but got %s", documentedType(ty)) + } + } + case *AggregateExpr: + if !n.Op.isAggregator() { + p.errorf("aggregation operator expected in aggregation expression but got %q", n.Op) + } + p.expectType(n.Expr, model.ValVector, "aggregation expression") + if n.Op == itemTopK || n.Op == itemBottomK || n.Op == itemQuantile { + p.expectType(n.Param, model.ValScalar, "aggregation parameter") + } + if n.Op == itemCountValues { + p.expectType(n.Param, model.ValString, "aggregation parameter") + } + + case *BinaryExpr: + lt := p.checkType(n.LHS) + rt := p.checkType(n.RHS) + + if !n.Op.isOperator() { + p.errorf("binary expression does not support operator %q", n.Op) + } + if (lt != model.ValScalar && lt != model.ValVector) || (rt != model.ValScalar && rt != model.ValVector) { + p.errorf("binary expression must contain only scalar and instant vector types") + } + + if (lt != model.ValVector || rt != model.ValVector) && n.VectorMatching != nil { + if len(n.VectorMatching.MatchingLabels) > 0 { + p.errorf("vector matching only allowed between instant vectors") + } + n.VectorMatching = nil + } else { + // Both operands are vectors. 
+ if n.Op.isSetOperator() { + if n.VectorMatching.Card == CardOneToMany || n.VectorMatching.Card == CardManyToOne { + p.errorf("no grouping allowed for %q operation", n.Op) + } + if n.VectorMatching.Card != CardManyToMany { + p.errorf("set operations must always be many-to-many") + } + } + } + + if (lt == model.ValScalar || rt == model.ValScalar) && n.Op.isSetOperator() { + p.errorf("set operator %q not allowed in binary scalar expression", n.Op) + } + + case *Call: + nargs := len(n.Func.ArgTypes) + if n.Func.Variadic == 0 { + if nargs != len(n.Args) { + p.errorf("expected %d argument(s) in call to %q, got %d", nargs, n.Func.Name, len(n.Args)) + } + } else { + na := nargs - 1 + if na > len(n.Args) { + p.errorf("expected at least %d argument(s) in call to %q, got %d", na, n.Func.Name, len(n.Args)) + } else if nargsmax := na + n.Func.Variadic; n.Func.Variadic > 0 && nargsmax < len(n.Args) { + p.errorf("expected at most %d argument(s) in call to %q, got %d", nargsmax, n.Func.Name, len(n.Args)) + } + } + + for i, arg := range n.Args { + if i >= len(n.Func.ArgTypes) { + i = len(n.Func.ArgTypes) - 1 + } + p.expectType(arg, n.Func.ArgTypes[i], fmt.Sprintf("call to function %q", n.Func.Name)) + } + + case *ParenExpr: + p.checkType(n.Expr) + + case *UnaryExpr: + if n.Op != itemADD && n.Op != itemSUB { + p.errorf("only + and - operators allowed for unary expressions") + } + if t := p.checkType(n.Expr); t != model.ValScalar && t != model.ValVector { + p.errorf("unary expression only allowed on expressions of type scalar or instant vector, got %q", documentedType(t)) + } + + case *NumberLiteral, *MatrixSelector, *StringLiteral, *VectorSelector: + // Nothing to do for terminals. 
+ + default: + p.errorf("unknown node type: %T", node) + } + return +} + +func (p *parser) unquoteString(s string) string { + unquoted, err := strutil.Unquote(s) + if err != nil { + p.errorf("error unquoting string %q: %s", s, err) + } + return unquoted +} + +func parseDuration(ds string) (time.Duration, error) { + dur, err := model.ParseDuration(ds) + if err != nil { + return 0, err + } + if dur == 0 { + return 0, fmt.Errorf("duration must be greater than 0") + } + return time.Duration(dur), nil +} diff --git a/vendor/github.com/prometheus/prometheus/promql/printer.go b/vendor/github.com/prometheus/prometheus/promql/printer.go new file mode 100644 index 000000000..40ca02e65 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/promql/printer.go @@ -0,0 +1,236 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package promql + +import ( + "fmt" + "sort" + "strings" + "time" + + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/metric" +) + +// Tree returns a string of the tree structure of the given node. +func Tree(node Node) string { + return tree(node, "") +} + +func tree(node Node, level string) string { + if node == nil { + return fmt.Sprintf("%s |---- %T\n", level, node) + } + typs := strings.Split(fmt.Sprintf("%T", node), ".")[1] + + var t string + // Only print the number of statements for readability. 
+ if stmts, ok := node.(Statements); ok { + t = fmt.Sprintf("%s |---- %s :: %d\n", level, typs, len(stmts)) + } else { + t = fmt.Sprintf("%s |---- %s :: %s\n", level, typs, node) + } + + level += " · · ·" + + switch n := node.(type) { + case Statements: + for _, s := range n { + t += tree(s, level) + } + case *AlertStmt: + t += tree(n.Expr, level) + + case *EvalStmt: + t += tree(n.Expr, level) + + case *RecordStmt: + t += tree(n.Expr, level) + + case Expressions: + for _, e := range n { + t += tree(e, level) + } + case *AggregateExpr: + t += tree(n.Expr, level) + + case *BinaryExpr: + t += tree(n.LHS, level) + t += tree(n.RHS, level) + + case *Call: + t += tree(n.Args, level) + + case *ParenExpr: + t += tree(n.Expr, level) + + case *UnaryExpr: + t += tree(n.Expr, level) + + case *MatrixSelector, *NumberLiteral, *StringLiteral, *VectorSelector: + // nothing to do + + default: + panic("promql.Tree: not all node types covered") + } + return t +} + +func (stmts Statements) String() (s string) { + if len(stmts) == 0 { + return "" + } + for _, stmt := range stmts { + s += stmt.String() + s += "\n\n" + } + return s[:len(s)-2] +} + +func (node *AlertStmt) String() string { + s := fmt.Sprintf("ALERT %s", node.Name) + s += fmt.Sprintf("\n\tIF %s", node.Expr) + if node.Duration > 0 { + s += fmt.Sprintf("\n\tFOR %s", model.Duration(node.Duration)) + } + if len(node.Labels) > 0 { + s += fmt.Sprintf("\n\tLABELS %s", node.Labels) + } + if len(node.Annotations) > 0 { + s += fmt.Sprintf("\n\tANNOTATIONS %s", node.Annotations) + } + return s +} + +func (node *EvalStmt) String() string { + return "EVAL " + node.Expr.String() +} + +func (node *RecordStmt) String() string { + s := fmt.Sprintf("%s%s = %s", node.Name, node.Labels, node.Expr) + return s +} + +func (es Expressions) String() (s string) { + if len(es) == 0 { + return "" + } + for _, e := range es { + s += e.String() + s += ", " + } + return s[:len(s)-2] +} + +func (node *AggregateExpr) String() string { + aggrString := 
fmt.Sprintf("%s(", node.Op) + if node.Op.isAggregatorWithParam() { + aggrString += fmt.Sprintf("%s, ", node.Param) + } + aggrString += fmt.Sprintf("%s)", node.Expr) + if len(node.Grouping) > 0 { + var format string + if node.Without { + format = "%s WITHOUT (%s)" + } else { + format = "%s BY (%s)" + } + aggrString = fmt.Sprintf(format, aggrString, node.Grouping) + } + if node.KeepCommonLabels { + aggrString += " KEEP_COMMON" + } + return aggrString +} + +func (node *BinaryExpr) String() string { + returnBool := "" + if node.ReturnBool { + returnBool = " BOOL" + } + + matching := "" + vm := node.VectorMatching + if vm != nil && (len(vm.MatchingLabels) > 0 || vm.On) { + if vm.On { + matching = fmt.Sprintf(" ON(%s)", vm.MatchingLabels) + } else { + matching = fmt.Sprintf(" IGNORING(%s)", vm.MatchingLabels) + } + if vm.Card == CardManyToOne || vm.Card == CardOneToMany { + matching += " GROUP_" + if vm.Card == CardManyToOne { + matching += "LEFT" + } else { + matching += "RIGHT" + } + matching += fmt.Sprintf("(%s)", vm.Include) + } + } + return fmt.Sprintf("%s %s%s%s %s", node.LHS, node.Op, returnBool, matching, node.RHS) +} + +func (node *Call) String() string { + return fmt.Sprintf("%s(%s)", node.Func.Name, node.Args) +} + +func (node *MatrixSelector) String() string { + vecSelector := &VectorSelector{ + Name: node.Name, + LabelMatchers: node.LabelMatchers, + } + offset := "" + if node.Offset != time.Duration(0) { + offset = fmt.Sprintf(" OFFSET %s", model.Duration(node.Offset)) + } + return fmt.Sprintf("%s[%s]%s", vecSelector.String(), model.Duration(node.Range), offset) +} + +func (node *NumberLiteral) String() string { + return fmt.Sprint(node.Val) +} + +func (node *ParenExpr) String() string { + return fmt.Sprintf("(%s)", node.Expr) +} + +func (node *StringLiteral) String() string { + return fmt.Sprintf("%q", node.Val) +} + +func (node *UnaryExpr) String() string { + return fmt.Sprintf("%s%s", node.Op, node.Expr) +} + +func (node *VectorSelector) String() string { 
+ labelStrings := make([]string, 0, len(node.LabelMatchers)-1) + for _, matcher := range node.LabelMatchers { + // Only include the __name__ label if its no equality matching. + if matcher.Name == model.MetricNameLabel && matcher.Type == metric.Equal { + continue + } + labelStrings = append(labelStrings, matcher.String()) + } + offset := "" + if node.Offset != time.Duration(0) { + offset = fmt.Sprintf(" OFFSET %s", model.Duration(node.Offset)) + } + + if len(labelStrings) == 0 { + return fmt.Sprintf("%s%s", node.Name, offset) + } + sort.Strings(labelStrings) + return fmt.Sprintf("%s{%s}%s", node.Name, strings.Join(labelStrings, ","), offset) +} diff --git a/vendor/github.com/prometheus/prometheus/promql/quantile.go b/vendor/github.com/prometheus/prometheus/promql/quantile.go new file mode 100644 index 000000000..4250ec388 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/promql/quantile.go @@ -0,0 +1,185 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package promql + +import ( + "math" + "sort" + + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/metric" +) + +// Helpers to calculate quantiles. + +// excludedLabels are the labels to exclude from signature calculation for +// quantiles. 
+var excludedLabels = map[model.LabelName]struct{}{ + model.MetricNameLabel: {}, + model.BucketLabel: {}, +} + +type bucket struct { + upperBound float64 + count model.SampleValue +} + +// buckets implements sort.Interface. +type buckets []bucket + +func (b buckets) Len() int { return len(b) } +func (b buckets) Swap(i, j int) { b[i], b[j] = b[j], b[i] } +func (b buckets) Less(i, j int) bool { return b[i].upperBound < b[j].upperBound } + +type metricWithBuckets struct { + metric metric.Metric + buckets buckets +} + +// bucketQuantile calculates the quantile 'q' based on the given buckets. The +// buckets will be sorted by upperBound by this function (i.e. no sorting +// needed before calling this function). The quantile value is interpolated +// assuming a linear distribution within a bucket. However, if the quantile +// falls into the highest bucket, the upper bound of the 2nd highest bucket is +// returned. A natural lower bound of 0 is assumed if the upper bound of the +// lowest bucket is greater 0. In that case, interpolation in the lowest bucket +// happens linearly between 0 and the upper bound of the lowest bucket. +// However, if the lowest bucket has an upper bound less or equal 0, this upper +// bound is returned if the quantile falls into the lowest bucket. +// +// There are a number of special cases (once we have a way to report errors +// happening during evaluations of AST functions, we should report those +// explicitly): +// +// If 'buckets' has fewer than 2 elements, NaN is returned. +// +// If the highest bucket is not +Inf, NaN is returned. +// +// If q<0, -Inf is returned. +// +// If q>1, +Inf is returned. 
+func bucketQuantile(q model.SampleValue, buckets buckets) float64 { + if q < 0 { + return math.Inf(-1) + } + if q > 1 { + return math.Inf(+1) + } + if len(buckets) < 2 { + return math.NaN() + } + sort.Sort(buckets) + if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) { + return math.NaN() + } + + ensureMonotonic(buckets) + + rank := q * buckets[len(buckets)-1].count + b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank }) + + if b == len(buckets)-1 { + return buckets[len(buckets)-2].upperBound + } + if b == 0 && buckets[0].upperBound <= 0 { + return buckets[0].upperBound + } + var ( + bucketStart float64 + bucketEnd = buckets[b].upperBound + count = buckets[b].count + ) + if b > 0 { + bucketStart = buckets[b-1].upperBound + count -= buckets[b-1].count + rank -= buckets[b-1].count + } + return bucketStart + (bucketEnd-bucketStart)*float64(rank/count) +} + +// The assumption that bucket counts increase monotonically with increasing +// upperBound may be violated during: +// +// * Recording rule evaluation of histogram_quantile, especially when rate() +// has been applied to the underlying bucket timeseries. +// * Evaluation of histogram_quantile computed over federated bucket +// timeseries, especially when rate() has been applied. +// +// This is because scraped data is not made available to rule evaluation or +// federation atomically, so some buckets are computed with data from the +// most recent scrapes, but the other buckets are missing data from the most +// recent scrape. +// +// Monotonicity is usually guaranteed because if a bucket with upper bound +// u1 has count c1, then any bucket with a higher upper bound u > u1 must +// have counted all c1 observations and perhaps more, so that c >= c1. +// +// Randomly interspersed partial sampling breaks that guarantee, and rate() +// exacerbates it. 
Specifically, suppose bucket le=1000 has a count of 10 from +// 4 samples but the bucket with le=2000 has a count of 7 from 3 samples. The +// monotonicity is broken. It is exacerbated by rate() because under normal +// operation, cumulative counting of buckets will cause the bucket counts to +// diverge such that small differences from missing samples are not a problem. +// rate() removes this divergence.) +// +// bucketQuantile depends on that monotonicity to do a binary search for the +// bucket with the φ-quantile count, so breaking the monotonicity +// guarantee causes bucketQuantile() to return undefined (nonsense) results. +// +// As a somewhat hacky solution until ingestion is atomic per scrape, we +// calculate the "envelope" of the histogram buckets, essentially removing +// any decreases in the count between successive buckets. + +func ensureMonotonic(buckets buckets) { + max := buckets[0].count + for i := range buckets[1:] { + switch { + case buckets[i].count > max: + max = buckets[i].count + case buckets[i].count < max: + buckets[i].count = max + } + } +} + +// qauntile calculates the given quantile of a vector of samples. +// +// The vector will be sorted. +// If 'values' has zero elements, NaN is returned. +// If q<0, -Inf is returned. +// If q>1, +Inf is returned. +func quantile(q float64, values vectorByValueHeap) float64 { + if len(values) == 0 { + return math.NaN() + } + if q < 0 { + return math.Inf(-1) + } + if q > 1 { + return math.Inf(+1) + } + sort.Sort(values) + + n := float64(len(values)) + // When the quantile lies between two samples, + // we use a weighted average of the two samples. 
+ rank := q * (n - 1) + + lowerIndex := math.Max(0, math.Floor(rank)) + upperIndex := math.Min(n-1, lowerIndex+1) + + weight := rank - math.Floor(rank) + return float64(values[int(lowerIndex)].Value)*(1-weight) + float64(values[int(upperIndex)].Value)*weight +} diff --git a/vendor/github.com/prometheus/prometheus/promql/test.go b/vendor/github.com/prometheus/prometheus/promql/test.go new file mode 100644 index 000000000..e65982713 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/promql/test.go @@ -0,0 +1,525 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package promql + +import ( + "fmt" + "io/ioutil" + "math" + "regexp" + "strconv" + "strings" + "time" + + "github.com/prometheus/common/model" + "golang.org/x/net/context" + + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/storage/local" + "github.com/prometheus/prometheus/util/testutil" +) + +var ( + minNormal = math.Float64frombits(0x0010000000000000) // The smallest positive normal value of type float64. + + patSpace = regexp.MustCompile("[\t ]+") + patLoad = regexp.MustCompile(`^load\s+(.+?)$`) + patEvalInstant = regexp.MustCompile(`^eval(?:_(fail|ordered))?\s+instant\s+(?:at\s+(.+?))?\s+(.+)$`) +) + +const ( + testStartTime = model.Time(0) + epsilon = 0.000001 // Relative error allowed for sample values. +) + +// Test is a sequence of read and write commands that are run +// against a test storage. 
+type Test struct { + testutil.T + + cmds []testCommand + + storage local.Storage + closeStorage func() + queryEngine *Engine + context context.Context + cancelCtx context.CancelFunc +} + +// NewTest returns an initialized empty Test. +func NewTest(t testutil.T, input string) (*Test, error) { + test := &Test{ + T: t, + cmds: []testCommand{}, + } + err := test.parse(input) + test.clear() + + return test, err +} + +func newTestFromFile(t testutil.T, filename string) (*Test, error) { + content, err := ioutil.ReadFile(filename) + if err != nil { + return nil, err + } + return NewTest(t, string(content)) +} + +// QueryEngine returns the test's query engine. +func (t *Test) QueryEngine() *Engine { + return t.queryEngine +} + +// Context returns the test's context. +func (t *Test) Context() context.Context { + return t.context +} + +// Storage returns the test's storage. +func (t *Test) Storage() local.Storage { + return t.storage +} + +func raise(line int, format string, v ...interface{}) error { + return &ParseErr{ + Line: line + 1, + Err: fmt.Errorf(format, v...), + } +} + +func (t *Test) parseLoad(lines []string, i int) (int, *loadCmd, error) { + if !patLoad.MatchString(lines[i]) { + return i, nil, raise(i, "invalid load command. (load )") + } + parts := patLoad.FindStringSubmatch(lines[i]) + + gap, err := model.ParseDuration(parts[1]) + if err != nil { + return i, nil, raise(i, "invalid step definition %q: %s", parts[1], err) + } + cmd := newLoadCmd(time.Duration(gap)) + for i+1 < len(lines) { + i++ + defLine := lines[i] + if len(defLine) == 0 { + i-- + break + } + metric, vals, err := parseSeriesDesc(defLine) + if err != nil { + if perr, ok := err.(*ParseErr); ok { + perr.Line = i + 1 + } + return i, nil, err + } + cmd.set(metric, vals...) + } + return i, cmd, nil +} + +func (t *Test) parseEval(lines []string, i int) (int, *evalCmd, error) { + if !patEvalInstant.MatchString(lines[i]) { + return i, nil, raise(i, "invalid evaluation command. 
(eval[_fail|_ordered] instant [at ] ") + } + parts := patEvalInstant.FindStringSubmatch(lines[i]) + var ( + mod = parts[1] + at = parts[2] + qry = parts[3] + ) + expr, err := ParseExpr(qry) + if err != nil { + if perr, ok := err.(*ParseErr); ok { + perr.Line = i + 1 + perr.Pos += strings.Index(lines[i], qry) + } + return i, nil, err + } + + offset, err := model.ParseDuration(at) + if err != nil { + return i, nil, raise(i, "invalid step definition %q: %s", parts[1], err) + } + ts := testStartTime.Add(time.Duration(offset)) + + cmd := newEvalCmd(expr, ts, ts, 0) + switch mod { + case "ordered": + cmd.ordered = true + case "fail": + cmd.fail = true + } + + for j := 1; i+1 < len(lines); j++ { + i++ + defLine := lines[i] + if len(defLine) == 0 { + i-- + break + } + if f, err := parseNumber(defLine); err == nil { + cmd.expect(0, nil, sequenceValue{value: model.SampleValue(f)}) + break + } + metric, vals, err := parseSeriesDesc(defLine) + if err != nil { + if perr, ok := err.(*ParseErr); ok { + perr.Line = i + 1 + } + return i, nil, err + } + + // Currently, we are not expecting any matrices. + if len(vals) > 1 { + return i, nil, raise(i, "expecting multiple values in instant evaluation not allowed") + } + cmd.expect(j, metric, vals...) + } + return i, cmd, nil +} + +// parse the given command sequence and appends it to the test. +func (t *Test) parse(input string) error { + // Trim lines and remove comments. + lines := strings.Split(input, "\n") + for i, l := range lines { + l = strings.TrimSpace(l) + if strings.HasPrefix(l, "#") { + l = "" + } + lines[i] = l + } + var err error + + // Scan for steps line by line. 
+ for i := 0; i < len(lines); i++ { + l := lines[i] + if len(l) == 0 { + continue + } + var cmd testCommand + + switch c := strings.ToLower(patSpace.Split(l, 2)[0]); { + case c == "clear": + cmd = &clearCmd{} + case c == "load": + i, cmd, err = t.parseLoad(lines, i) + case strings.HasPrefix(c, "eval"): + i, cmd, err = t.parseEval(lines, i) + default: + return raise(i, "invalid command %q", l) + } + if err != nil { + return err + } + t.cmds = append(t.cmds, cmd) + } + return nil +} + +// testCommand is an interface that ensures that only the package internal +// types can be a valid command for a test. +type testCommand interface { + testCmd() +} + +func (*clearCmd) testCmd() {} +func (*loadCmd) testCmd() {} +func (*evalCmd) testCmd() {} + +// loadCmd is a command that loads sequences of sample values for specific +// metrics into the storage. +type loadCmd struct { + gap time.Duration + metrics map[model.Fingerprint]model.Metric + defs map[model.Fingerprint][]model.SamplePair +} + +func newLoadCmd(gap time.Duration) *loadCmd { + return &loadCmd{ + gap: gap, + metrics: map[model.Fingerprint]model.Metric{}, + defs: map[model.Fingerprint][]model.SamplePair{}, + } +} + +func (cmd loadCmd) String() string { + return "load" +} + +// set a sequence of sample values for the given metric. +func (cmd *loadCmd) set(m model.Metric, vals ...sequenceValue) { + fp := m.Fingerprint() + + samples := make([]model.SamplePair, 0, len(vals)) + ts := testStartTime + for _, v := range vals { + if !v.omitted { + samples = append(samples, model.SamplePair{ + Timestamp: ts, + Value: v.value, + }) + } + ts = ts.Add(cmd.gap) + } + cmd.defs[fp] = samples + cmd.metrics[fp] = m +} + +// append the defined time series to the storage. 
+func (cmd *loadCmd) append(a storage.SampleAppender) { + for fp, samples := range cmd.defs { + met := cmd.metrics[fp] + for _, smpl := range samples { + s := &model.Sample{ + Metric: met, + Value: smpl.Value, + Timestamp: smpl.Timestamp, + } + a.Append(s) + } + } +} + +// evalCmd is a command that evaluates an expression for the given time (range) +// and expects a specific result. +type evalCmd struct { + expr Expr + start, end model.Time + interval time.Duration + + instant bool + fail, ordered bool + + metrics map[model.Fingerprint]model.Metric + expected map[model.Fingerprint]entry +} + +type entry struct { + pos int + vals []sequenceValue +} + +func (e entry) String() string { + return fmt.Sprintf("%d: %s", e.pos, e.vals) +} + +func newEvalCmd(expr Expr, start, end model.Time, interval time.Duration) *evalCmd { + return &evalCmd{ + expr: expr, + start: start, + end: end, + interval: interval, + instant: start == end && interval == 0, + + metrics: map[model.Fingerprint]model.Metric{}, + expected: map[model.Fingerprint]entry{}, + } +} + +func (ev *evalCmd) String() string { + return "eval" +} + +// expect adds a new metric with a sequence of values to the set of expected +// results for the query. +func (ev *evalCmd) expect(pos int, m model.Metric, vals ...sequenceValue) { + if m == nil { + ev.expected[0] = entry{pos: pos, vals: vals} + return + } + fp := m.Fingerprint() + ev.metrics[fp] = m + ev.expected[fp] = entry{pos: pos, vals: vals} +} + +// compareResult compares the result value with the defined expectation. 
+func (ev *evalCmd) compareResult(result model.Value) error { + switch val := result.(type) { + case model.Matrix: + if ev.instant { + return fmt.Errorf("received range result on instant evaluation") + } + seen := map[model.Fingerprint]bool{} + for pos, v := range val { + fp := v.Metric.Fingerprint() + if _, ok := ev.metrics[fp]; !ok { + return fmt.Errorf("unexpected metric %s in result", v.Metric) + } + exp := ev.expected[fp] + if ev.ordered && exp.pos != pos+1 { + return fmt.Errorf("expected metric %s with %v at position %d but was at %d", v.Metric, exp.vals, exp.pos, pos+1) + } + for i, expVal := range exp.vals { + if !almostEqual(float64(expVal.value), float64(v.Values[i].Value)) { + return fmt.Errorf("expected %v for %s but got %v", expVal, v.Metric, v.Values) + } + } + seen[fp] = true + } + for fp, expVals := range ev.expected { + if !seen[fp] { + return fmt.Errorf("expected metric %s with %v not found", ev.metrics[fp], expVals) + } + } + + case model.Vector: + if !ev.instant { + return fmt.Errorf("received instant result on range evaluation") + } + seen := map[model.Fingerprint]bool{} + for pos, v := range val { + fp := v.Metric.Fingerprint() + if _, ok := ev.metrics[fp]; !ok { + return fmt.Errorf("unexpected metric %s in result", v.Metric) + } + exp := ev.expected[fp] + if ev.ordered && exp.pos != pos+1 { + return fmt.Errorf("expected metric %s with %v at position %d but was at %d", v.Metric, exp.vals, exp.pos, pos+1) + } + if !almostEqual(float64(exp.vals[0].value), float64(v.Value)) { + return fmt.Errorf("expected %v for %s but got %v", exp.vals[0].value, v.Metric, v.Value) + } + + seen[fp] = true + } + for fp, expVals := range ev.expected { + if !seen[fp] { + return fmt.Errorf("expected metric %s with %v not found", ev.metrics[fp], expVals) + } + } + + case *model.Scalar: + if !almostEqual(float64(ev.expected[0].vals[0].value), float64(val.Value)) { + return fmt.Errorf("expected scalar %v but got %v", val.Value, ev.expected[0].vals[0].value) + } + + 
default: + panic(fmt.Errorf("promql.Test.compareResult: unexpected result type %T", result)) + } + return nil +} + +// clearCmd is a command that wipes the test's storage state. +type clearCmd struct{} + +func (cmd clearCmd) String() string { + return "clear" +} + +// Run executes the command sequence of the test. Until the maximum error number +// is reached, evaluation errors do not terminate execution. +func (t *Test) Run() error { + for _, cmd := range t.cmds { + err := t.exec(cmd) + // TODO(fabxc): aggregate command errors, yield diffs for result + // comparison errors. + if err != nil { + return err + } + } + return nil +} + +// exec processes a single step of the test. +func (t *Test) exec(tc testCommand) error { + switch cmd := tc.(type) { + case *clearCmd: + t.clear() + + case *loadCmd: + cmd.append(t.storage) + t.storage.WaitForIndexing() + + case *evalCmd: + q := t.queryEngine.newQuery(cmd.expr, cmd.start, cmd.end, cmd.interval) + res := q.Exec(t.context) + if res.Err != nil { + if cmd.fail { + return nil + } + return fmt.Errorf("error evaluating query: %s", res.Err) + } + if res.Err == nil && cmd.fail { + return fmt.Errorf("expected error evaluating query but got none") + } + + err := cmd.compareResult(res.Value) + if err != nil { + return fmt.Errorf("error in %s %s: %s", cmd, cmd.expr, err) + } + + default: + panic("promql.Test.exec: unknown test command type") + } + return nil +} + +// clear the current test storage of all inserted samples. +func (t *Test) clear() { + if t.closeStorage != nil { + t.closeStorage() + } + if t.cancelCtx != nil { + t.cancelCtx() + } + + var closer testutil.Closer + t.storage, closer = local.NewTestStorage(t, 2) + + t.closeStorage = closer.Close + t.queryEngine = NewEngine(t.storage, nil) + t.context, t.cancelCtx = context.WithCancel(context.Background()) +} + +// Close closes resources associated with the Test. 
+func (t *Test) Close() { + t.cancelCtx() + t.closeStorage() +} + +// samplesAlmostEqual returns true if the two sample lines only differ by a +// small relative error in their sample value. +func almostEqual(a, b float64) bool { + // NaN has no equality but for testing we still want to know whether both values + // are NaN. + if math.IsNaN(a) && math.IsNaN(b) { + return true + } + + // Cf. http://floating-point-gui.de/errors/comparison/ + if a == b { + return true + } + + diff := math.Abs(a - b) + + if a == 0 || b == 0 || diff < minNormal { + return diff < epsilon*minNormal + } + return diff/(math.Abs(a)+math.Abs(b)) < epsilon +} + +func parseNumber(s string) (float64, error) { + n, err := strconv.ParseInt(s, 0, 64) + f := float64(n) + if err != nil { + f, err = strconv.ParseFloat(s, 64) + } + if err != nil { + return 0, fmt.Errorf("error parsing number: %s", err) + } + return f, nil +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/chunk/chunk.go b/vendor/github.com/prometheus/prometheus/storage/local/chunk/chunk.go new file mode 100644 index 000000000..19c36734b --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/chunk/chunk.go @@ -0,0 +1,494 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package chunk + +import ( + "container/list" + "errors" + "fmt" + "io" + "sort" + "sync" + "sync/atomic" + + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/metric" +) + +// ChunkLen is the length of a chunk in bytes. +const ChunkLen = 1024 + +// DefaultEncoding can be changed via a flag. +var DefaultEncoding = DoubleDelta + +var ( + errChunkBoundsExceeded = errors.New("attempted access outside of chunk boundaries") + errAddedToEvictedChunk = errors.New("attempted to add sample to evicted chunk") +) + +// EvictRequest is a request to evict a chunk from memory. +type EvictRequest struct { + Desc *Desc + Evict bool +} + +// Encoding defines which encoding we are using, delta, doubledelta, or varbit +type Encoding byte + +// String implements flag.Value. +func (e Encoding) String() string { + return fmt.Sprintf("%d", e) +} + +// Set implements flag.Value. +func (e *Encoding) Set(s string) error { + switch s { + case "0": + *e = Delta + case "1": + *e = DoubleDelta + case "2": + *e = Varbit + default: + return fmt.Errorf("invalid chunk encoding: %s", s) + } + return nil +} + +const ( + // Delta encoding + Delta Encoding = iota + // DoubleDelta encoding + DoubleDelta + // Varbit encoding + Varbit +) + +// Desc contains meta-data for a chunk. Pay special attention to the +// documented requirements for calling its methods concurrently (WRT pinning and +// locking). The doc comments spell out the requirements for each method, but +// here is an overview and general explanation: +// +// Everything that changes the pinning of the underlying chunk or deals with its +// eviction is protected by a mutex. This affects the following methods: Pin, +// Unpin, RefCount, IsEvicted, MaybeEvict. These methods can be called at any +// time without further prerequisites. +// +// Another group of methods acts on (or sets) the underlying chunk. These +// methods involve no locking. 
They may only be called if the caller has pinned +// the chunk (to guarantee the chunk is not evicted concurrently). Also, the +// caller must make sure nobody else will call these methods concurrently, +// either by holding the sole reference to the Desc (usually during loading +// or creation) or by locking the fingerprint of the series the Desc +// belongs to. The affected methods are: Add, MaybePopulateLastTime, SetChunk. +// +// Finally, there are the special cases FirstTime and LastTime. LastTime requires +// to have locked the fingerprint of the series but the chunk does not need to +// be pinned. That's because the ChunkLastTime field in Desc gets populated +// upon completion of the chunk (when it is still pinned, and which happens +// while the series's fingerprint is locked). Once that has happened, calling +// LastTime does not require the chunk to be loaded anymore. Before that has +// happened, the chunk is pinned anyway. The ChunkFirstTime field in Desc +// is populated upon creation of a Desc, so it is alway safe to call +// FirstTime. The FirstTime method is arguably not needed and only there for +// consistency with LastTime. +type Desc struct { + sync.Mutex // Protects pinning. + C Chunk // nil if chunk is evicted. + rCnt int + ChunkFirstTime model.Time // Populated at creation. Immutable. + ChunkLastTime model.Time // Populated on closing of the chunk, model.Earliest if unset. + + // EvictListElement is nil if the chunk is not in the evict list. + // EvictListElement is _not_ protected by the Desc mutex. + // It must only be touched by the evict list handler in MemorySeriesStorage. + EvictListElement *list.Element +} + +// NewDesc creates a new Desc pointing to the provided chunk. The provided chunk +// is assumed to be not persisted yet. Therefore, the refCount of the new +// Desc is 1 (preventing eviction prior to persisting). 
+func NewDesc(c Chunk, firstTime model.Time) *Desc { + Ops.WithLabelValues(CreateAndPin).Inc() + atomic.AddInt64(&NumMemChunks, 1) + NumMemDescs.Inc() + return &Desc{ + C: c, + rCnt: 1, + ChunkFirstTime: firstTime, + ChunkLastTime: model.Earliest, + } +} + +// Add adds a sample pair to the underlying chunk. For safe concurrent access, +// The chunk must be pinned, and the caller must have locked the fingerprint of +// the series. +func (d *Desc) Add(s model.SamplePair) ([]Chunk, error) { + if d.C == nil { + return nil, errAddedToEvictedChunk + } + return d.C.Add(s) +} + +// Pin increments the refCount by one. Upon increment from 0 to 1, this +// Desc is removed from the evict list. To enable the latter, the +// evictRequests channel has to be provided. This method can be called +// concurrently at any time. +func (d *Desc) Pin(evictRequests chan<- EvictRequest) { + d.Lock() + defer d.Unlock() + + if d.rCnt == 0 { + // Remove ourselves from the evict list. + evictRequests <- EvictRequest{d, false} + } + d.rCnt++ +} + +// Unpin decrements the refCount by one. Upon decrement from 1 to 0, this +// Desc is added to the evict list. To enable the latter, the evictRequests +// channel has to be provided. This method can be called concurrently at any +// time. +func (d *Desc) Unpin(evictRequests chan<- EvictRequest) { + d.Lock() + defer d.Unlock() + + if d.rCnt == 0 { + panic("cannot unpin already unpinned chunk") + } + d.rCnt-- + if d.rCnt == 0 { + // Add ourselves to the back of the evict list. + evictRequests <- EvictRequest{d, true} + } +} + +// RefCount returns the number of pins. This method can be called concurrently +// at any time. +func (d *Desc) RefCount() int { + d.Lock() + defer d.Unlock() + + return d.rCnt +} + +// FirstTime returns the timestamp of the first sample in the chunk. This method +// can be called concurrently at any time. It only returns the immutable +// d.ChunkFirstTime without any locking. Arguably, this method is +// useless. 
However, it provides consistency with the LastTime method. +func (d *Desc) FirstTime() model.Time { + return d.ChunkFirstTime +} + +// LastTime returns the timestamp of the last sample in the chunk. For safe +// concurrent access, this method requires the fingerprint of the time series to +// be locked. +func (d *Desc) LastTime() (model.Time, error) { + if d.ChunkLastTime != model.Earliest || d.C == nil { + return d.ChunkLastTime, nil + } + return d.C.NewIterator().LastTimestamp() +} + +// MaybePopulateLastTime populates the ChunkLastTime from the underlying chunk +// if it has not yet happened. Call this method directly after having added the +// last sample to a chunk or after closing a head chunk due to age. For safe +// concurrent access, the chunk must be pinned, and the caller must have locked +// the fingerprint of the series. +func (d *Desc) MaybePopulateLastTime() error { + if d.ChunkLastTime == model.Earliest && d.C != nil { + t, err := d.C.NewIterator().LastTimestamp() + if err != nil { + return err + } + d.ChunkLastTime = t + } + return nil +} + +// IsEvicted returns whether the chunk is evicted. For safe concurrent access, +// the caller must have locked the fingerprint of the series. +func (d *Desc) IsEvicted() bool { + // Locking required here because we do not want the caller to force + // pinning the chunk first, so it could be evicted while this method is + // called. + d.Lock() + defer d.Unlock() + + return d.C == nil +} + +// SetChunk sets the underlying chunk. The caller must have locked the +// fingerprint of the series and must have "pre-pinned" the chunk (i.e. first +// call Pin and then set the chunk). +func (d *Desc) SetChunk(c Chunk) { + if d.C != nil { + panic("chunk already set") + } + d.C = c +} + +// MaybeEvict evicts the chunk if the refCount is 0. It returns whether the chunk +// is now evicted, which includes the case that the chunk was evicted even +// before this method was called. It can be called concurrently at any time. 
+func (d *Desc) MaybeEvict() bool { + d.Lock() + defer d.Unlock() + + if d.C == nil { + return true + } + if d.rCnt != 0 { + return false + } + if d.ChunkLastTime == model.Earliest { + // This must never happen. + panic("ChunkLastTime not populated for evicted chunk") + } + d.C = nil + Ops.WithLabelValues(Evict).Inc() + atomic.AddInt64(&NumMemChunks, -1) + return true +} + +// Chunk is the interface for all chunks. Chunks are generally not +// goroutine-safe. +type Chunk interface { + // Add adds a SamplePair to the chunks, performs any necessary + // re-encoding, and adds any necessary overflow chunks. It returns the + // new version of the original chunk, followed by overflow chunks, if + // any. The first chunk returned might be the same as the original one + // or a newly allocated version. In any case, take the returned chunk as + // the relevant one and discard the original chunk. + Add(sample model.SamplePair) ([]Chunk, error) + Clone() Chunk + FirstTime() model.Time + NewIterator() Iterator + Marshal(io.Writer) error + MarshalToBuf([]byte) error + Unmarshal(io.Reader) error + UnmarshalFromBuf([]byte) error + Encoding() Encoding + Utilization() float64 + + // Len returns the number of samples in the chunk. Implementations may be + // expensive. + Len() int +} + +// Iterator enables efficient access to the content of a chunk. It is +// generally not safe to use an Iterator concurrently with or after chunk +// mutation. +type Iterator interface { + // Gets the last timestamp in the chunk. + LastTimestamp() (model.Time, error) + // Whether a given timestamp is contained between first and last value + // in the chunk. + Contains(model.Time) (bool, error) + // Scans the next value in the chunk. Directly after the iterator has + // been created, the next value is the first value in the + // chunk. Otherwise, it is the value following the last value scanned or + // found (by one of the Find... methods). 
Returns false if either the + // end of the chunk is reached or an error has occurred. + Scan() bool + // Finds the most recent value at or before the provided time. Returns + // false if either the chunk contains no value at or before the provided + // time, or an error has occurred. + FindAtOrBefore(model.Time) bool + // Finds the oldest value at or after the provided time. Returns false + // if either the chunk contains no value at or after the provided time, + // or an error has occurred. + FindAtOrAfter(model.Time) bool + // Returns the last value scanned (by the scan method) or found (by one + // of the find... methods). It returns model.ZeroSamplePair before any of + // those methods were called. + Value() model.SamplePair + // Returns the last error encountered. In general, an error signals data + // corruption in the chunk and requires quarantining. + Err() error +} + +// RangeValues is a utility function that retrieves all values within the given +// range from an Iterator. +func RangeValues(it Iterator, in metric.Interval) ([]model.SamplePair, error) { + result := []model.SamplePair{} + if !it.FindAtOrAfter(in.OldestInclusive) { + return result, it.Err() + } + for !it.Value().Timestamp.After(in.NewestInclusive) { + result = append(result, it.Value()) + if !it.Scan() { + break + } + } + return result, it.Err() +} + +// addToOverflowChunk is a utility function that creates a new chunk as overflow +// chunk, adds the provided sample to it, and returns a chunk slice containing +// the provided old chunk followed by the new overflow chunk. +func addToOverflowChunk(c Chunk, s model.SamplePair) ([]Chunk, error) { + overflowChunks, err := New().Add(s) + if err != nil { + return nil, err + } + return []Chunk{c, overflowChunks[0]}, nil +} + +// transcodeAndAdd is a utility function that transcodes the dst chunk into the +// provided src chunk (plus the necessary overflow chunks) and then adds the +// provided sample. 
It returns the new chunks (transcoded plus overflow) with +// the new sample at the end. +func transcodeAndAdd(dst Chunk, src Chunk, s model.SamplePair) ([]Chunk, error) { + Ops.WithLabelValues(Transcode).Inc() + + var ( + head = dst + body, NewChunks []Chunk + err error + ) + + it := src.NewIterator() + for it.Scan() { + if NewChunks, err = head.Add(it.Value()); err != nil { + return nil, err + } + body = append(body, NewChunks[:len(NewChunks)-1]...) + head = NewChunks[len(NewChunks)-1] + } + if it.Err() != nil { + return nil, it.Err() + } + + if NewChunks, err = head.Add(s); err != nil { + return nil, err + } + return append(body, NewChunks...), nil +} + +// New creates a new chunk according to the encoding set by the +// DefaultEncoding flag. +func New() Chunk { + chunk, err := NewForEncoding(DefaultEncoding) + if err != nil { + panic(err) + } + return chunk +} + +// NewForEncoding allows configuring what chunk type you want +func NewForEncoding(encoding Encoding) (Chunk, error) { + switch encoding { + case Delta: + return newDeltaEncodedChunk(d1, d0, true, ChunkLen), nil + case DoubleDelta: + return newDoubleDeltaEncodedChunk(d1, d0, true, ChunkLen), nil + case Varbit: + return newVarbitChunk(varbitZeroEncoding), nil + default: + return nil, fmt.Errorf("unknown chunk encoding: %v", encoding) + } +} + +// indexAccessor allows accesses to samples by index. +type indexAccessor interface { + timestampAtIndex(int) model.Time + sampleValueAtIndex(int) model.SampleValue + err() error +} + +// indexAccessingChunkIterator is a chunk iterator for chunks for which an +// indexAccessor implementation exists. +type indexAccessingChunkIterator struct { + len int + pos int + lastValue model.SamplePair + acc indexAccessor +} + +func newIndexAccessingChunkIterator(len int, acc indexAccessor) *indexAccessingChunkIterator { + return &indexAccessingChunkIterator{ + len: len, + pos: -1, + lastValue: model.ZeroSamplePair, + acc: acc, + } +} + +// lastTimestamp implements Iterator. 
+func (it *indexAccessingChunkIterator) LastTimestamp() (model.Time, error) { + return it.acc.timestampAtIndex(it.len - 1), it.acc.err() +} + +// contains implements Iterator. +func (it *indexAccessingChunkIterator) Contains(t model.Time) (bool, error) { + return !t.Before(it.acc.timestampAtIndex(0)) && + !t.After(it.acc.timestampAtIndex(it.len-1)), it.acc.err() +} + +// scan implements Iterator. +func (it *indexAccessingChunkIterator) Scan() bool { + it.pos++ + if it.pos >= it.len { + return false + } + it.lastValue = model.SamplePair{ + Timestamp: it.acc.timestampAtIndex(it.pos), + Value: it.acc.sampleValueAtIndex(it.pos), + } + return it.acc.err() == nil +} + +// findAtOrBefore implements Iterator. +func (it *indexAccessingChunkIterator) FindAtOrBefore(t model.Time) bool { + i := sort.Search(it.len, func(i int) bool { + return it.acc.timestampAtIndex(i).After(t) + }) + if i == 0 || it.acc.err() != nil { + return false + } + it.pos = i - 1 + it.lastValue = model.SamplePair{ + Timestamp: it.acc.timestampAtIndex(i - 1), + Value: it.acc.sampleValueAtIndex(i - 1), + } + return true +} + +// findAtOrAfter implements Iterator. +func (it *indexAccessingChunkIterator) FindAtOrAfter(t model.Time) bool { + i := sort.Search(it.len, func(i int) bool { + return !it.acc.timestampAtIndex(i).Before(t) + }) + if i == it.len || it.acc.err() != nil { + return false + } + it.pos = i + it.lastValue = model.SamplePair{ + Timestamp: it.acc.timestampAtIndex(i), + Value: it.acc.sampleValueAtIndex(i), + } + return true +} + +// value implements Iterator. +func (it *indexAccessingChunkIterator) Value() model.SamplePair { + return it.lastValue +} + +// err implements Iterator. 
+func (it *indexAccessingChunkIterator) Err() error { + return it.acc.err() +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/chunk/delta.go b/vendor/github.com/prometheus/prometheus/storage/local/chunk/delta.go new file mode 100644 index 000000000..4e3fd0645 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/chunk/delta.go @@ -0,0 +1,379 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package chunk + +import ( + "encoding/binary" + "fmt" + "io" + "math" + + "github.com/prometheus/common/model" +) + +// The 21-byte header of a delta-encoded chunk looks like: +// +// - time delta bytes: 1 bytes +// - value delta bytes: 1 bytes +// - is integer: 1 byte +// - base time: 8 bytes +// - base value: 8 bytes +// - used buf bytes: 2 bytes +const ( + deltaHeaderBytes = 21 + + deltaHeaderTimeBytesOffset = 0 + deltaHeaderValueBytesOffset = 1 + deltaHeaderIsIntOffset = 2 + deltaHeaderBaseTimeOffset = 3 + deltaHeaderBaseValueOffset = 11 + deltaHeaderBufLenOffset = 19 +) + +// A deltaEncodedChunk adaptively stores sample timestamps and values with a +// delta encoding of various types (int, float) and bit widths. However, once 8 +// bytes would be needed to encode a delta value, a fall-back to the absolute +// numbers happens (so that timestamps are saved directly as int64 and values as +// float64). It implements the chunk interface. 
+type deltaEncodedChunk []byte + +// newDeltaEncodedChunk returns a newly allocated deltaEncodedChunk. +func newDeltaEncodedChunk(tb, vb deltaBytes, isInt bool, length int) *deltaEncodedChunk { + if tb < 1 { + panic("need at least 1 time delta byte") + } + if length < deltaHeaderBytes+16 { + panic(fmt.Errorf( + "chunk length %d bytes is insufficient, need at least %d", + length, deltaHeaderBytes+16, + )) + } + c := make(deltaEncodedChunk, deltaHeaderIsIntOffset+1, length) + + c[deltaHeaderTimeBytesOffset] = byte(tb) + c[deltaHeaderValueBytesOffset] = byte(vb) + if vb < d8 && isInt { // Only use int for fewer than 8 value delta bytes. + c[deltaHeaderIsIntOffset] = 1 + } else { + c[deltaHeaderIsIntOffset] = 0 + } + + return &c +} + +// Add implements chunk. +func (c deltaEncodedChunk) Add(s model.SamplePair) ([]Chunk, error) { + // TODO(beorn7): Since we return &c, this method might cause an unnecessary allocation. + if c.Len() == 0 { + c = c[:deltaHeaderBytes] + binary.LittleEndian.PutUint64(c[deltaHeaderBaseTimeOffset:], uint64(s.Timestamp)) + binary.LittleEndian.PutUint64(c[deltaHeaderBaseValueOffset:], math.Float64bits(float64(s.Value))) + } + + remainingBytes := cap(c) - len(c) + sampleSize := c.sampleSize() + + // Do we generally have space for another sample in this chunk? If not, + // overflow into a new one. + if remainingBytes < sampleSize { + return addToOverflowChunk(&c, s) + } + + baseValue := c.baseValue() + dt := s.Timestamp - c.baseTime() + if dt < 0 { + return nil, fmt.Errorf("time delta is less than zero: %v", dt) + } + + dv := s.Value - baseValue + tb := c.timeBytes() + vb := c.valueBytes() + isInt := c.isInt() + + // If the new sample is incompatible with the current encoding, reencode the + // existing chunk data into new chunk(s). + + ntb, nvb, nInt := tb, vb, isInt + if isInt && !isInt64(dv) { + // int->float. + nvb = d4 + nInt = false + } else if !isInt && vb == d4 && baseValue+model.SampleValue(float32(dv)) != s.Value { + // float32->float64. 
+ nvb = d8 + } else { + if tb < d8 { + // Maybe more bytes for timestamp. + ntb = max(tb, bytesNeededForUnsignedTimestampDelta(dt)) + } + if c.isInt() && vb < d8 { + // Maybe more bytes for sample value. + nvb = max(vb, bytesNeededForIntegerSampleValueDelta(dv)) + } + } + if tb != ntb || vb != nvb || isInt != nInt { + if len(c)*2 < cap(c) { + return transcodeAndAdd(newDeltaEncodedChunk(ntb, nvb, nInt, cap(c)), &c, s) + } + // Chunk is already half full. Better create a new one and save the transcoding efforts. + return addToOverflowChunk(&c, s) + } + + offset := len(c) + c = c[:offset+sampleSize] + + switch tb { + case d1: + c[offset] = byte(dt) + case d2: + binary.LittleEndian.PutUint16(c[offset:], uint16(dt)) + case d4: + binary.LittleEndian.PutUint32(c[offset:], uint32(dt)) + case d8: + // Store the absolute value (no delta) in case of d8. + binary.LittleEndian.PutUint64(c[offset:], uint64(s.Timestamp)) + default: + return nil, fmt.Errorf("invalid number of bytes for time delta: %d", tb) + } + + offset += int(tb) + + if c.isInt() { + switch vb { + case d0: + // No-op. Constant value is stored as base value. + case d1: + c[offset] = byte(int8(dv)) + case d2: + binary.LittleEndian.PutUint16(c[offset:], uint16(int16(dv))) + case d4: + binary.LittleEndian.PutUint32(c[offset:], uint32(int32(dv))) + // d8 must not happen. Those samples are encoded as float64. + default: + return nil, fmt.Errorf("invalid number of bytes for integer delta: %d", vb) + } + } else { + switch vb { + case d4: + binary.LittleEndian.PutUint32(c[offset:], math.Float32bits(float32(dv))) + case d8: + // Store the absolute value (no delta) in case of d8. + binary.LittleEndian.PutUint64(c[offset:], math.Float64bits(float64(s.Value))) + default: + return nil, fmt.Errorf("invalid number of bytes for floating point delta: %d", vb) + } + } + return []Chunk{&c}, nil +} + +// Clone implements chunk. 
+func (c deltaEncodedChunk) Clone() Chunk { + clone := make(deltaEncodedChunk, len(c), cap(c)) + copy(clone, c) + return &clone +} + +// FirstTime implements chunk. +func (c deltaEncodedChunk) FirstTime() model.Time { + return c.baseTime() +} + +// NewIterator implements chunk. +func (c *deltaEncodedChunk) NewIterator() Iterator { + return newIndexAccessingChunkIterator(c.Len(), &deltaEncodedIndexAccessor{ + c: *c, + baseT: c.baseTime(), + baseV: c.baseValue(), + tBytes: c.timeBytes(), + vBytes: c.valueBytes(), + isInt: c.isInt(), + }) +} + +// Marshal implements chunk. +func (c deltaEncodedChunk) Marshal(w io.Writer) error { + if len(c) > math.MaxUint16 { + panic("chunk buffer length would overflow a 16 bit uint.") + } + binary.LittleEndian.PutUint16(c[deltaHeaderBufLenOffset:], uint16(len(c))) + + n, err := w.Write(c[:cap(c)]) + if err != nil { + return err + } + if n != cap(c) { + return fmt.Errorf("wanted to write %d bytes, wrote %d", cap(c), n) + } + return nil +} + +// MarshalToBuf implements chunk. +func (c deltaEncodedChunk) MarshalToBuf(buf []byte) error { + if len(c) > math.MaxUint16 { + panic("chunk buffer length would overflow a 16 bit uint") + } + binary.LittleEndian.PutUint16(c[deltaHeaderBufLenOffset:], uint16(len(c))) + + n := copy(buf, c) + if n != len(c) { + return fmt.Errorf("wanted to copy %d bytes to buffer, copied %d", len(c), n) + } + return nil +} + +// Unmarshal implements chunk. +func (c *deltaEncodedChunk) Unmarshal(r io.Reader) error { + *c = (*c)[:cap(*c)] + if _, err := io.ReadFull(r, *c); err != nil { + return err + } + return c.setLen() +} + +// UnmarshalFromBuf implements chunk. +func (c *deltaEncodedChunk) UnmarshalFromBuf(buf []byte) error { + *c = (*c)[:cap(*c)] + copy(*c, buf) + return c.setLen() +} + +// setLen sets the length of the underlying slice and performs some sanity checks. 
+func (c *deltaEncodedChunk) setLen() error { + l := binary.LittleEndian.Uint16((*c)[deltaHeaderBufLenOffset:]) + if int(l) > cap(*c) { + return fmt.Errorf("delta chunk length exceeded during unmarshaling: %d", l) + } + if int(l) < deltaHeaderBytes { + return fmt.Errorf("delta chunk length less than header size: %d < %d", l, deltaHeaderBytes) + } + switch c.timeBytes() { + case d1, d2, d4, d8: + // Pass. + default: + return fmt.Errorf("invalid number of time bytes in delta chunk: %d", c.timeBytes()) + } + switch c.valueBytes() { + case d0, d1, d2, d4, d8: + // Pass. + default: + return fmt.Errorf("invalid number of value bytes in delta chunk: %d", c.valueBytes()) + } + *c = (*c)[:l] + return nil +} + +// Encoding implements chunk. +func (c deltaEncodedChunk) Encoding() Encoding { return Delta } + +// Utilization implements chunk. +func (c deltaEncodedChunk) Utilization() float64 { + return float64(len(c)) / float64(cap(c)) +} + +func (c deltaEncodedChunk) timeBytes() deltaBytes { + return deltaBytes(c[deltaHeaderTimeBytesOffset]) +} + +func (c deltaEncodedChunk) valueBytes() deltaBytes { + return deltaBytes(c[deltaHeaderValueBytesOffset]) +} + +func (c deltaEncodedChunk) isInt() bool { + return c[deltaHeaderIsIntOffset] == 1 +} + +func (c deltaEncodedChunk) baseTime() model.Time { + return model.Time(binary.LittleEndian.Uint64(c[deltaHeaderBaseTimeOffset:])) +} + +func (c deltaEncodedChunk) baseValue() model.SampleValue { + return model.SampleValue(math.Float64frombits(binary.LittleEndian.Uint64(c[deltaHeaderBaseValueOffset:]))) +} + +func (c deltaEncodedChunk) sampleSize() int { + return int(c.timeBytes() + c.valueBytes()) +} + +// Len implements Chunk. Runs in constant time. +func (c deltaEncodedChunk) Len() int { + if len(c) < deltaHeaderBytes { + return 0 + } + return (len(c) - deltaHeaderBytes) / c.sampleSize() +} + +// deltaEncodedIndexAccessor implements indexAccessor. 
+type deltaEncodedIndexAccessor struct { + c deltaEncodedChunk + baseT model.Time + baseV model.SampleValue + tBytes, vBytes deltaBytes + isInt bool + lastErr error +} + +func (acc *deltaEncodedIndexAccessor) err() error { + return acc.lastErr +} + +func (acc *deltaEncodedIndexAccessor) timestampAtIndex(idx int) model.Time { + offset := deltaHeaderBytes + idx*int(acc.tBytes+acc.vBytes) + + switch acc.tBytes { + case d1: + return acc.baseT + model.Time(uint8(acc.c[offset])) + case d2: + return acc.baseT + model.Time(binary.LittleEndian.Uint16(acc.c[offset:])) + case d4: + return acc.baseT + model.Time(binary.LittleEndian.Uint32(acc.c[offset:])) + case d8: + // Take absolute value for d8. + return model.Time(binary.LittleEndian.Uint64(acc.c[offset:])) + default: + acc.lastErr = fmt.Errorf("invalid number of bytes for time delta: %d", acc.tBytes) + return model.Earliest + } +} + +func (acc *deltaEncodedIndexAccessor) sampleValueAtIndex(idx int) model.SampleValue { + offset := deltaHeaderBytes + idx*int(acc.tBytes+acc.vBytes) + int(acc.tBytes) + + if acc.isInt { + switch acc.vBytes { + case d0: + return acc.baseV + case d1: + return acc.baseV + model.SampleValue(int8(acc.c[offset])) + case d2: + return acc.baseV + model.SampleValue(int16(binary.LittleEndian.Uint16(acc.c[offset:]))) + case d4: + return acc.baseV + model.SampleValue(int32(binary.LittleEndian.Uint32(acc.c[offset:]))) + // No d8 for ints. + default: + acc.lastErr = fmt.Errorf("invalid number of bytes for integer delta: %d", acc.vBytes) + return 0 + } + } else { + switch acc.vBytes { + case d4: + return acc.baseV + model.SampleValue(math.Float32frombits(binary.LittleEndian.Uint32(acc.c[offset:]))) + case d8: + // Take absolute value for d8. 
+ return model.SampleValue(math.Float64frombits(binary.LittleEndian.Uint64(acc.c[offset:]))) + default: + acc.lastErr = fmt.Errorf("invalid number of bytes for floating point delta: %d", acc.vBytes) + return 0 + } + } +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/chunk/delta_helpers.go b/vendor/github.com/prometheus/prometheus/storage/local/chunk/delta_helpers.go new file mode 100644 index 000000000..81e5d18cb --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/chunk/delta_helpers.go @@ -0,0 +1,84 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package chunk + +import ( + "math" + + "github.com/prometheus/common/model" +) + +type deltaBytes byte + +const ( + d0 deltaBytes = 0 + d1 deltaBytes = 1 + d2 deltaBytes = 2 + d4 deltaBytes = 4 + d8 deltaBytes = 8 +) + +func bytesNeededForUnsignedTimestampDelta(deltaT model.Time) deltaBytes { + switch { + case deltaT > math.MaxUint32: + return d8 + case deltaT > math.MaxUint16: + return d4 + case deltaT > math.MaxUint8: + return d2 + default: + return d1 + } +} + +func bytesNeededForSignedTimestampDelta(deltaT model.Time) deltaBytes { + switch { + case deltaT > math.MaxInt32 || deltaT < math.MinInt32: + return d8 + case deltaT > math.MaxInt16 || deltaT < math.MinInt16: + return d4 + case deltaT > math.MaxInt8 || deltaT < math.MinInt8: + return d2 + default: + return d1 + } +} + +func bytesNeededForIntegerSampleValueDelta(deltaV model.SampleValue) deltaBytes { + switch { + case deltaV < math.MinInt32 || deltaV > math.MaxInt32: + return d8 + case deltaV < math.MinInt16 || deltaV > math.MaxInt16: + return d4 + case deltaV < math.MinInt8 || deltaV > math.MaxInt8: + return d2 + case deltaV != 0: + return d1 + default: + return d0 + } +} + +func max(a, b deltaBytes) deltaBytes { + if a > b { + return a + } + return b +} + +// isInt64 returns true if v can be represented as an int64. +func isInt64(v model.SampleValue) bool { + // Note: Using math.Modf is slower than the conversion approach below. + return model.SampleValue(int64(v)) == v +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/chunk/doubledelta.go b/vendor/github.com/prometheus/prometheus/storage/local/chunk/doubledelta.go new file mode 100644 index 000000000..249c99d54 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/chunk/doubledelta.go @@ -0,0 +1,525 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package chunk + +import ( + "encoding/binary" + "fmt" + "io" + "math" + + "github.com/prometheus/common/model" +) + +// The 37-byte header of a delta-encoded chunk looks like: +// +// - used buf bytes: 2 bytes +// - time double-delta bytes: 1 bytes +// - value double-delta bytes: 1 bytes +// - is integer: 1 byte +// - base time: 8 bytes +// - base value: 8 bytes +// - base time delta: 8 bytes +// - base value delta: 8 bytes +const ( + doubleDeltaHeaderBytes = 37 + doubleDeltaHeaderMinBytes = 21 // header isn't full for chunk w/ one sample + + doubleDeltaHeaderBufLenOffset = 0 + doubleDeltaHeaderTimeBytesOffset = 2 + doubleDeltaHeaderValueBytesOffset = 3 + doubleDeltaHeaderIsIntOffset = 4 + doubleDeltaHeaderBaseTimeOffset = 5 + doubleDeltaHeaderBaseValueOffset = 13 + doubleDeltaHeaderBaseTimeDeltaOffset = 21 + doubleDeltaHeaderBaseValueDeltaOffset = 29 +) + +// A doubleDeltaEncodedChunk adaptively stores sample timestamps and values with +// a double-delta encoding of various types (int, float) and bit widths. A base +// value and timestamp and a base delta for each is saved in the header. The +// payload consists of double-deltas, i.e. deviations from the values and +// timestamps calculated by applying the base value and time and the base deltas. +// However, once 8 bytes would be needed to encode a double-delta value, a +// fall-back to the absolute numbers happens (so that timestamps are saved +// directly as int64 and values as float64). +// doubleDeltaEncodedChunk implements the chunk interface. 
+type doubleDeltaEncodedChunk []byte + +// newDoubleDeltaEncodedChunk returns a newly allocated doubleDeltaEncodedChunk. +func newDoubleDeltaEncodedChunk(tb, vb deltaBytes, isInt bool, length int) *doubleDeltaEncodedChunk { + if tb < 1 { + panic("need at least 1 time delta byte") + } + if length < doubleDeltaHeaderBytes+16 { + panic(fmt.Errorf( + "chunk length %d bytes is insufficient, need at least %d", + length, doubleDeltaHeaderBytes+16, + )) + } + c := make(doubleDeltaEncodedChunk, doubleDeltaHeaderIsIntOffset+1, length) + + c[doubleDeltaHeaderTimeBytesOffset] = byte(tb) + c[doubleDeltaHeaderValueBytesOffset] = byte(vb) + if vb < d8 && isInt { // Only use int for fewer than 8 value double-delta bytes. + c[doubleDeltaHeaderIsIntOffset] = 1 + } else { + c[doubleDeltaHeaderIsIntOffset] = 0 + } + return &c +} + +// Add implements chunk. +func (c doubleDeltaEncodedChunk) Add(s model.SamplePair) ([]Chunk, error) { + // TODO(beorn7): Since we return &c, this method might cause an unnecessary allocation. + if c.Len() == 0 { + return c.addFirstSample(s), nil + } + + tb := c.timeBytes() + vb := c.valueBytes() + + if c.Len() == 1 { + return c.addSecondSample(s, tb, vb) + } + + remainingBytes := cap(c) - len(c) + sampleSize := c.sampleSize() + + // Do we generally have space for another sample in this chunk? If not, + // overflow into a new one. + if remainingBytes < sampleSize { + return addToOverflowChunk(&c, s) + } + + projectedTime := c.baseTime() + model.Time(c.Len())*c.baseTimeDelta() + ddt := s.Timestamp - projectedTime + + projectedValue := c.baseValue() + model.SampleValue(c.Len())*c.baseValueDelta() + ddv := s.Value - projectedValue + + ntb, nvb, nInt := tb, vb, c.isInt() + // If the new sample is incompatible with the current encoding, reencode the + // existing chunk data into new chunk(s). + if c.isInt() && !isInt64(ddv) { + // int->float. 
+ nvb = d4 + nInt = false + } else if !c.isInt() && vb == d4 && projectedValue+model.SampleValue(float32(ddv)) != s.Value { + // float32->float64. + nvb = d8 + } else { + if tb < d8 { + // Maybe more bytes for timestamp. + ntb = max(tb, bytesNeededForSignedTimestampDelta(ddt)) + } + if c.isInt() && vb < d8 { + // Maybe more bytes for sample value. + nvb = max(vb, bytesNeededForIntegerSampleValueDelta(ddv)) + } + } + if tb != ntb || vb != nvb || c.isInt() != nInt { + if len(c)*2 < cap(c) { + return transcodeAndAdd(newDoubleDeltaEncodedChunk(ntb, nvb, nInt, cap(c)), &c, s) + } + // Chunk is already half full. Better create a new one and save the transcoding efforts. + return addToOverflowChunk(&c, s) + } + + offset := len(c) + c = c[:offset+sampleSize] + + switch tb { + case d1: + c[offset] = byte(ddt) + case d2: + binary.LittleEndian.PutUint16(c[offset:], uint16(ddt)) + case d4: + binary.LittleEndian.PutUint32(c[offset:], uint32(ddt)) + case d8: + // Store the absolute value (no delta) in case of d8. + binary.LittleEndian.PutUint64(c[offset:], uint64(s.Timestamp)) + default: + return nil, fmt.Errorf("invalid number of bytes for time delta: %d", tb) + } + + offset += int(tb) + + if c.isInt() { + switch vb { + case d0: + // No-op. Constant delta is stored as base value. + case d1: + c[offset] = byte(int8(ddv)) + case d2: + binary.LittleEndian.PutUint16(c[offset:], uint16(int16(ddv))) + case d4: + binary.LittleEndian.PutUint32(c[offset:], uint32(int32(ddv))) + // d8 must not happen. Those samples are encoded as float64. + default: + return nil, fmt.Errorf("invalid number of bytes for integer delta: %d", vb) + } + } else { + switch vb { + case d4: + binary.LittleEndian.PutUint32(c[offset:], math.Float32bits(float32(ddv))) + case d8: + // Store the absolute value (no delta) in case of d8. 
+ binary.LittleEndian.PutUint64(c[offset:], math.Float64bits(float64(s.Value))) + default: + return nil, fmt.Errorf("invalid number of bytes for floating point delta: %d", vb) + } + } + return []Chunk{&c}, nil +} + +// Clone implements chunk. +func (c doubleDeltaEncodedChunk) Clone() Chunk { + clone := make(doubleDeltaEncodedChunk, len(c), cap(c)) + copy(clone, c) + return &clone +} + +// FirstTime implements chunk. +func (c doubleDeltaEncodedChunk) FirstTime() model.Time { + return c.baseTime() +} + +// NewIterator( implements chunk. +func (c *doubleDeltaEncodedChunk) NewIterator() Iterator { + return newIndexAccessingChunkIterator(c.Len(), &doubleDeltaEncodedIndexAccessor{ + c: *c, + baseT: c.baseTime(), + baseΔT: c.baseTimeDelta(), + baseV: c.baseValue(), + baseΔV: c.baseValueDelta(), + tBytes: c.timeBytes(), + vBytes: c.valueBytes(), + isInt: c.isInt(), + }) +} + +// Marshal implements chunk. +func (c doubleDeltaEncodedChunk) Marshal(w io.Writer) error { + if len(c) > math.MaxUint16 { + panic("chunk buffer length would overflow a 16 bit uint") + } + binary.LittleEndian.PutUint16(c[doubleDeltaHeaderBufLenOffset:], uint16(len(c))) + + n, err := w.Write(c[:cap(c)]) + if err != nil { + return err + } + if n != cap(c) { + return fmt.Errorf("wanted to write %d bytes, wrote %d", cap(c), n) + } + return nil +} + +// MarshalToBuf implements chunk. +func (c doubleDeltaEncodedChunk) MarshalToBuf(buf []byte) error { + if len(c) > math.MaxUint16 { + panic("chunk buffer length would overflow a 16 bit uint") + } + binary.LittleEndian.PutUint16(c[doubleDeltaHeaderBufLenOffset:], uint16(len(c))) + + n := copy(buf, c) + if n != len(c) { + return fmt.Errorf("wanted to copy %d bytes to buffer, copied %d", len(c), n) + } + return nil +} + +// Unmarshal implements chunk. 
+func (c *doubleDeltaEncodedChunk) Unmarshal(r io.Reader) error { + *c = (*c)[:cap(*c)] + if _, err := io.ReadFull(r, *c); err != nil { + return err + } + return c.setLen() +} + +// UnmarshalFromBuf implements chunk. +func (c *doubleDeltaEncodedChunk) UnmarshalFromBuf(buf []byte) error { + *c = (*c)[:cap(*c)] + copy(*c, buf) + return c.setLen() +} + +// setLen sets the length of the underlying slice and performs some sanity checks. +func (c *doubleDeltaEncodedChunk) setLen() error { + l := binary.LittleEndian.Uint16((*c)[doubleDeltaHeaderBufLenOffset:]) + if int(l) > cap(*c) { + return fmt.Errorf("doubledelta chunk length exceeded during unmarshaling: %d", l) + } + if int(l) < doubleDeltaHeaderMinBytes { + return fmt.Errorf("doubledelta chunk length less than header size: %d < %d", l, doubleDeltaHeaderMinBytes) + } + switch c.timeBytes() { + case d1, d2, d4, d8: + // Pass. + default: + return fmt.Errorf("invalid number of time bytes in doubledelta chunk: %d", c.timeBytes()) + } + switch c.valueBytes() { + case d0, d1, d2, d4, d8: + // Pass. + default: + return fmt.Errorf("invalid number of value bytes in doubledelta chunk: %d", c.valueBytes()) + } + *c = (*c)[:l] + return nil +} + +// Encoding implements chunk. +func (c doubleDeltaEncodedChunk) Encoding() Encoding { return DoubleDelta } + +// Utilization implements chunk. 
+func (c doubleDeltaEncodedChunk) Utilization() float64 { + return float64(len(c)-doubleDeltaHeaderIsIntOffset-1) / float64(cap(c)) +} + +func (c doubleDeltaEncodedChunk) baseTime() model.Time { + return model.Time( + binary.LittleEndian.Uint64( + c[doubleDeltaHeaderBaseTimeOffset:], + ), + ) +} + +func (c doubleDeltaEncodedChunk) baseValue() model.SampleValue { + return model.SampleValue( + math.Float64frombits( + binary.LittleEndian.Uint64( + c[doubleDeltaHeaderBaseValueOffset:], + ), + ), + ) +} + +func (c doubleDeltaEncodedChunk) baseTimeDelta() model.Time { + if len(c) < doubleDeltaHeaderBaseTimeDeltaOffset+8 { + return 0 + } + return model.Time( + binary.LittleEndian.Uint64( + c[doubleDeltaHeaderBaseTimeDeltaOffset:], + ), + ) +} + +func (c doubleDeltaEncodedChunk) baseValueDelta() model.SampleValue { + if len(c) < doubleDeltaHeaderBaseValueDeltaOffset+8 { + return 0 + } + return model.SampleValue( + math.Float64frombits( + binary.LittleEndian.Uint64( + c[doubleDeltaHeaderBaseValueDeltaOffset:], + ), + ), + ) +} + +func (c doubleDeltaEncodedChunk) timeBytes() deltaBytes { + return deltaBytes(c[doubleDeltaHeaderTimeBytesOffset]) +} + +func (c doubleDeltaEncodedChunk) valueBytes() deltaBytes { + return deltaBytes(c[doubleDeltaHeaderValueBytesOffset]) +} + +func (c doubleDeltaEncodedChunk) sampleSize() int { + return int(c.timeBytes() + c.valueBytes()) +} + +// Len implements Chunk. Runs in constant time. +func (c doubleDeltaEncodedChunk) Len() int { + if len(c) <= doubleDeltaHeaderIsIntOffset+1 { + return 0 + } + if len(c) <= doubleDeltaHeaderBaseValueOffset+8 { + return 1 + } + return (len(c)-doubleDeltaHeaderBytes)/c.sampleSize() + 2 +} + +func (c doubleDeltaEncodedChunk) isInt() bool { + return c[doubleDeltaHeaderIsIntOffset] == 1 +} + +// addFirstSample is a helper method only used by c.add(). It adds timestamp and +// value as base time and value. 
+func (c doubleDeltaEncodedChunk) addFirstSample(s model.SamplePair) []Chunk { + c = c[:doubleDeltaHeaderBaseValueOffset+8] + binary.LittleEndian.PutUint64( + c[doubleDeltaHeaderBaseTimeOffset:], + uint64(s.Timestamp), + ) + binary.LittleEndian.PutUint64( + c[doubleDeltaHeaderBaseValueOffset:], + math.Float64bits(float64(s.Value)), + ) + return []Chunk{&c} +} + +// addSecondSample is a helper method only used by c.add(). It calculates the +// base delta from the provided sample and adds it to the chunk. +func (c doubleDeltaEncodedChunk) addSecondSample(s model.SamplePair, tb, vb deltaBytes) ([]Chunk, error) { + baseTimeDelta := s.Timestamp - c.baseTime() + if baseTimeDelta < 0 { + return nil, fmt.Errorf("base time delta is less than zero: %v", baseTimeDelta) + } + c = c[:doubleDeltaHeaderBytes] + if tb >= d8 || bytesNeededForUnsignedTimestampDelta(baseTimeDelta) >= d8 { + // If already the base delta needs d8 (or we are at d8 + // already, anyway), we better encode this timestamp + // directly rather than as a delta and switch everything + // to d8. + c[doubleDeltaHeaderTimeBytesOffset] = byte(d8) + binary.LittleEndian.PutUint64( + c[doubleDeltaHeaderBaseTimeDeltaOffset:], + uint64(s.Timestamp), + ) + } else { + binary.LittleEndian.PutUint64( + c[doubleDeltaHeaderBaseTimeDeltaOffset:], + uint64(baseTimeDelta), + ) + } + baseValue := c.baseValue() + baseValueDelta := s.Value - baseValue + if vb >= d8 || baseValue+baseValueDelta != s.Value { + // If we can't reproduce the original sample value (or + // if we are at d8 already, anyway), we better encode + // this value directly rather than as a delta and switch + // everything to d8. 
+ c[doubleDeltaHeaderValueBytesOffset] = byte(d8) + c[doubleDeltaHeaderIsIntOffset] = 0 + binary.LittleEndian.PutUint64( + c[doubleDeltaHeaderBaseValueDeltaOffset:], + math.Float64bits(float64(s.Value)), + ) + } else { + binary.LittleEndian.PutUint64( + c[doubleDeltaHeaderBaseValueDeltaOffset:], + math.Float64bits(float64(baseValueDelta)), + ) + } + return []Chunk{&c}, nil +} + +// doubleDeltaEncodedIndexAccessor implements indexAccessor. +type doubleDeltaEncodedIndexAccessor struct { + c doubleDeltaEncodedChunk + baseT, baseΔT model.Time + baseV, baseΔV model.SampleValue + tBytes, vBytes deltaBytes + isInt bool + lastErr error +} + +func (acc *doubleDeltaEncodedIndexAccessor) err() error { + return acc.lastErr +} + +func (acc *doubleDeltaEncodedIndexAccessor) timestampAtIndex(idx int) model.Time { + if idx == 0 { + return acc.baseT + } + if idx == 1 { + // If time bytes are at d8, the time is saved directly rather + // than as a difference. + if acc.tBytes == d8 { + return acc.baseΔT + } + return acc.baseT + acc.baseΔT + } + + offset := doubleDeltaHeaderBytes + (idx-2)*int(acc.tBytes+acc.vBytes) + + switch acc.tBytes { + case d1: + return acc.baseT + + model.Time(idx)*acc.baseΔT + + model.Time(int8(acc.c[offset])) + case d2: + return acc.baseT + + model.Time(idx)*acc.baseΔT + + model.Time(int16(binary.LittleEndian.Uint16(acc.c[offset:]))) + case d4: + return acc.baseT + + model.Time(idx)*acc.baseΔT + + model.Time(int32(binary.LittleEndian.Uint32(acc.c[offset:]))) + case d8: + // Take absolute value for d8. + return model.Time(binary.LittleEndian.Uint64(acc.c[offset:])) + default: + acc.lastErr = fmt.Errorf("invalid number of bytes for time delta: %d", acc.tBytes) + return model.Earliest + } +} + +func (acc *doubleDeltaEncodedIndexAccessor) sampleValueAtIndex(idx int) model.SampleValue { + if idx == 0 { + return acc.baseV + } + if idx == 1 { + // If value bytes are at d8, the value is saved directly rather + // than as a difference. 
+ if acc.vBytes == d8 { + return acc.baseΔV + } + return acc.baseV + acc.baseΔV + } + + offset := doubleDeltaHeaderBytes + (idx-2)*int(acc.tBytes+acc.vBytes) + int(acc.tBytes) + + if acc.isInt { + switch acc.vBytes { + case d0: + return acc.baseV + + model.SampleValue(idx)*acc.baseΔV + case d1: + return acc.baseV + + model.SampleValue(idx)*acc.baseΔV + + model.SampleValue(int8(acc.c[offset])) + case d2: + return acc.baseV + + model.SampleValue(idx)*acc.baseΔV + + model.SampleValue(int16(binary.LittleEndian.Uint16(acc.c[offset:]))) + case d4: + return acc.baseV + + model.SampleValue(idx)*acc.baseΔV + + model.SampleValue(int32(binary.LittleEndian.Uint32(acc.c[offset:]))) + // No d8 for ints. + default: + acc.lastErr = fmt.Errorf("invalid number of bytes for integer delta: %d", acc.vBytes) + return 0 + } + } else { + switch acc.vBytes { + case d4: + return acc.baseV + + model.SampleValue(idx)*acc.baseΔV + + model.SampleValue(math.Float32frombits(binary.LittleEndian.Uint32(acc.c[offset:]))) + case d8: + // Take absolute value for d8. + return model.SampleValue(math.Float64frombits(binary.LittleEndian.Uint64(acc.c[offset:]))) + default: + acc.lastErr = fmt.Errorf("invalid number of bytes for floating point delta: %d", acc.vBytes) + return 0 + } + } +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/chunk/instrumentation.go b/vendor/github.com/prometheus/prometheus/storage/local/chunk/instrumentation.go new file mode 100644 index 000000000..4dd3231e4 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/chunk/instrumentation.go @@ -0,0 +1,90 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package chunk + +import "github.com/prometheus/client_golang/prometheus" + +// Usually, a separate file for instrumentation is frowned upon. Metrics should +// be close to where they are used. However, the metrics below are set all over +// the place, so we go for a separate instrumentation file in this case. +var ( + Ops = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "chunk_ops_total", + Help: "The total number of chunk operations by their type.", + }, + []string{OpTypeLabel}, + ) + DescOps = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "chunkdesc_ops_total", + Help: "The total number of chunk descriptor operations by their type.", + }, + []string{OpTypeLabel}, + ) + NumMemDescs = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "memory_chunkdescs", + Help: "The current number of chunk descriptors in memory.", + }) +) + +const ( + namespace = "prometheus" + subsystem = "local_storage" + + // OpTypeLabel is the label name for chunk operation types. + OpTypeLabel = "type" + + // Op-types for ChunkOps. + + // CreateAndPin is the label value for create-and-pin chunk ops. + CreateAndPin = "create" // A Desc creation with refCount=1. + // PersistAndUnpin is the label value for persist chunk ops. + PersistAndUnpin = "persist" + // Pin is the label value for pin chunk ops (excludes pin on creation). 
+ Pin = "pin" + // Unpin is the label value for unpin chunk ops (excludes the unpin on persisting). + Unpin = "unpin" + // Clone is the label value for clone chunk ops. + Clone = "clone" + // Transcode is the label value for transcode chunk ops. + Transcode = "transcode" + // Drop is the label value for drop chunk ops. + Drop = "drop" + + // Op-types for ChunkOps and ChunkDescOps. + + // Evict is the label value for evict chunk desc ops. + Evict = "evict" + // Load is the label value for load chunk and chunk desc ops. + Load = "load" +) + +func init() { + prometheus.MustRegister(Ops) + prometheus.MustRegister(DescOps) + prometheus.MustRegister(NumMemDescs) +} + +// NumMemChunks is the total number of chunks in memory. This is a global +// counter, also used internally, so not implemented as metrics. Collected in +// MemorySeriesStorage. +// TODO(beorn7): Having this as an exported global variable is really bad. +var NumMemChunks int64 diff --git a/vendor/github.com/prometheus/prometheus/storage/local/chunk/varbit.go b/vendor/github.com/prometheus/prometheus/storage/local/chunk/varbit.go new file mode 100644 index 000000000..3181a9a76 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/chunk/varbit.go @@ -0,0 +1,1210 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package chunk + +import ( + "encoding/binary" + "fmt" + "io" + "math" + + "github.com/prometheus/common/model" +) + +// The varbit chunk encoding is broadly similar to the double-delta +// chunks. However, it uses a number of different bit-widths to save the +// double-deltas (rather than 1, 2, or 4 bytes). Also, it doesn't use the delta +// of the first two samples of a chunk as the base delta, but uses a "sliding" +// delta, i.e. the delta of the two previous samples. Both differences make +// random access more expensive. Sample values can be encoded with the same +// double-delta scheme as timestamps, but different value encodings can be +// chosen adaptively, among them XOR encoding and "zero" encoding for constant +// sample values. Overall, the varbit encoding results in a much better +// compression ratio (~1.3 bytes per sample compared to ~3.3 bytes per sample +// with double-delta encoding, for typical data sets). +// +// Major parts of the varbit encoding are inspired by the following paper: +// Gorilla: A Fast, Scalable, In-Memory Time Series Database +// T. Pelkonen et al., Facebook Inc. +// http://www.vldb.org/pvldb/vol8/p1816-teller.pdf +// Note that there are significant differences, some due to the way Prometheus +// chunks work, others to optimize for the Prometheus use-case. +// +// Layout of a 1024 byte varbit chunk (big endian, wherever it matters): +// - first time (int64): 8 bytes bit 0000-0063 +// - first value (float64): 8 bytes bit 0064-0127 +// - last time (int64): 8 bytes bit 0128-0191 +// - last value (float64): 8 bytes bit 0192-0255 +// - first Δt (t1-t0, unsigned): 3 bytes bit 0256-0279 +// - flags (byte) 1 byte bit 0280-0287 +// - bit offset for next sample 2 bytes bit 0288-0303 +// - first Δv for value encoding 1, otherwise payload +// 4 bytes bit 0304-0335 +// - payload 973 bytes bit 0336-8119 +// The following only exists if the chunk is still open. Otherwise, it might be +// used by payload. 
+// - bit offset for current ΔΔt=0 count 2 bytes bit 8120-8135 +// - last Δt 3 bytes bit 8136-8159 +// - special bytes for value encoding 4 bytes bit 8160-8191 +// - for encoding 1: last Δv 4 bytes bit 8160-8191 +// - for encoding 2: count of +// - last leading zeros (1 byte) 1 byte bit 8160-8167 +// - last significant bits (1 byte) 1 byte bit 8168-8175 +// +// FLAGS +// +// The two least significant bits of the flags byte define the value encoding +// for the whole chunk, see below. The most significant byte of the flags byte +// is set if the chunk is closed. No samples can be added anymore to a closed +// chunk. Furthermore, the last value of a closed chunk is only saved in the +// header (last time, last value), while in a chunk that is still open, the last +// sample in the payload is the same sample as saved in the header. +// +// The remaining bits in the flags byte are currently unused. +// +// TIMESTAMP ENCODING +// +// The 1st timestamp is saved directly. +// +// The difference to the 2nd timestamp is saved as first Δt. 3 bytes is enough +// for about 4.5h. Since we close a chunk after sitting idle for 1h, this +// limitation has no practical consequences. Should, for whatever reason, a +// larger delta be required, the chunk would be closed, i.e. the new sample is +// added as the last sample to the chunk, and the next sample will be added to a +// new chunk. +// +// From the 3rd timestamp on, a double-delta (ΔΔt) is saved: +// (t_{n} - t_{n-1}) - (t_{n-1} - t_{n-2}) +// To perform that operation, the last Δt is saved at the end of the chunk for +// as long the chunk is not closed yet (see above). +// +// Most of the times, ΔΔt is zero, even with the ms-precision of +// Prometheus. Therefore, we save a ΔΔt of zero as a leading '0' bit followed by +// 7 bits counting the number of consecutive ΔΔt==0 (the count is offset by -1, +// so the range of 0 to 127 represents 1 to 128 repetitions). 
+// +// If ΔΔt != 0, we essentially apply the Gorilla encoding scheme (cf. section +// 4.1.1 in the paper) but with different bit buckets as Prometheus uses ms +// rather than s, and the default scrape interval is 1m rather than 4m). In +// particular: +// +// - If ΔΔt is between [-32,31], store '10' followed by a 6 bit value. This is +// for minor irregularities in the scrape interval. +// +// - If ΔΔt is between [-65536,65535], store '110' followed by a 17 bit +// value. This will typically happen if a scrape is missed completely. +// +// - If ΔΔt is between [-4194304,4194303], store '111' followed by a 23 bit +// value. This spans more than 1h, which is usually enough as we close a +// chunk anyway if it doesn't receive any sample in 1h. +// +// - Should we nevertheless encounter a larger ΔΔt, we simply close the chunk, +// add the new sample as the last of the chunk, and add subsequent samples to +// a new chunk. +// +// VALUE ENCODING +// +// Value encoding can change and is determined by the two least significant bits +// of the 'flags' byte at bit position 280. The encoding can be changed without +// transcoding upon adding the 3rd sample. After that, an encoding change +// results either in transcoding or in closing the chunk. +// +// The 1st sample value is always saved directly. The 2nd sample value is saved +// in the header as the last value. Upon saving the 3rd value, an encoding is +// chosen, and the chunk is prepared accordingly. +// +// The following value encodings exist (with their value in the flags byte): +// +// 0: "Zero encoding". +// +// In many time series, the value simply stays constant over a long time +// (e.g. the "up" time series). In that case, all sample values are determined +// by the 1st value, and no further value encoding is happening at all. The +// payload consists entirely of timestamps. +// +// 1: Integer double-delta encoding. 
+// +// Many Prometheus metrics are integer counters and change in a quite regular +// fashion, similar to timestamps. Thus, the same double-delta encoding can be +// applied. This encoding works like the timestamp encoding described above, but +// with different bit buckets and without counting of repeated ΔΔv=0. The case +// of ΔΔv=0 is represented by a single '0' bit for each occurrence. The first Δv +// is saved as an int32 at bit position 288. The most recent Δv is saved as an +// int32 at the end of the chunk (see above). If Δv cannot be represented as a +// 32 bit signed integer, no integer double-delta encoding can be applied. +// +// Bit buckets (lead-in bytes followed by (signed) value bits): +// - '0': 0 bit +// - '10': 6 bit +// - '110': 13 bit +// - '1110': 20 bit +// - '1111': 33 bit +// Since Δv is restricted to 32 bit, 33 bit are always enough for ΔΔv. +// +// 2: XOR encoding. +// +// This follows almost precisely the Gorilla value encoding (cf. section 4.1.2 +// of the paper). The last count of leading zeros and the last count of +// meaningful bits in the XOR value is saved at the end of the chunk for as long +// as the chunk is not closed yet (see above). Note, though, that the number of +// significant bits is saved as (count-1), i.e. a saved value of 0 means 1 +// significant bit, a saved value of 1 means 2, and so on. Also, we save the +// numbers of leading zeros and significant bits anew if they drop a +// lot. Otherwise, you can easily be locked in with a high number of significant +// bits. +// +// 3: Direct encoding. +// +// If the sample values are just random, it is most efficient to save sample +// values directly as float64. +// +// ZIPPING TIMESTAMPS AND VALUES TOGETHER +// +// Usually, encoded timestamps and encoded values simply alternate. There are +// two exceptions: +// +// (1) With the "zero encoding" for values, the payload only contains +// timestamps. 
+// +// (2) In a consecutive row of up to 128 ΔΔt=0 repeats, the count of timestamps +// determines how many sample values will follow directly after another. + +const ( + varbitMinLength = 128 + varbitMaxLength = 8191 + + // Useful byte offsets. + varbitFirstTimeOffset = 0 + varbitFirstValueOffset = 8 + varbitLastTimeOffset = 16 + varbitLastValueOffset = 24 + varbitFirstTimeDeltaOffset = 32 + varbitFlagOffset = 35 + varbitNextSampleBitOffsetOffset = 36 + varbitFirstValueDeltaOffset = 38 + // The following are in the "footer" and only usable if the chunk is + // still open. + varbitCountOffsetBitOffset = ChunkLen - 9 + varbitLastTimeDeltaOffset = ChunkLen - 7 + varbitLastValueDeltaOffset = ChunkLen - 4 + varbitLastLeadingZerosCountOffset = ChunkLen - 4 + varbitLastSignificantBitsCountOffset = ChunkLen - 3 + + varbitFirstSampleBitOffset uint16 = 0 // Symbolic, don't really read or write here. + varbitSecondSampleBitOffset uint16 = 1 // Symbolic, don't really read or write here. + // varbitThirdSampleBitOffset is a bit special. Depending on the encoding, there can + // be various things at this offset. It's most of the time symbolic, but in the best + // case (zero encoding for values), it will be the real offset for the 3rd sample. + varbitThirdSampleBitOffset uint16 = varbitFirstValueDeltaOffset * 8 + + // If the bit offset for the next sample is above this threshold, no new + // samples can be added to the chunk's payload (because the payload has + // already reached the footer). However, one more sample can be saved in + // the header as the last sample. + varbitNextSampleBitOffsetThreshold = 8 * varbitCountOffsetBitOffset + + varbitMaxTimeDelta = 1 << 24 // What fits into a 3-byte timestamp. 
+) + +type varbitValueEncoding byte + +const ( + varbitZeroEncoding varbitValueEncoding = iota + varbitIntDoubleDeltaEncoding + varbitXOREncoding + varbitDirectEncoding +) + +// varbitWorstCaseBitsPerSample provides the worst-case number of bits needed +// per sample with the various value encodings. The counts already include the +// up to 27 bits taken by a timestamp. +var varbitWorstCaseBitsPerSample = map[varbitValueEncoding]int{ + varbitZeroEncoding: 27 + 0, + varbitIntDoubleDeltaEncoding: 27 + 38, + varbitXOREncoding: 27 + 13 + 64, + varbitDirectEncoding: 27 + 64, +} + +// varbitChunk implements the chunk interface. +type varbitChunk []byte + +// newVarbitChunk returns a newly allocated varbitChunk. For simplicity, all +// varbit chunks must have the length as determined by the ChunkLen constant. +func newVarbitChunk(enc varbitValueEncoding) *varbitChunk { + if ChunkLen < varbitMinLength || ChunkLen > varbitMaxLength { + panic(fmt.Errorf( + "invalid chunk length of %d bytes, need at least %d bytes and at most %d bytes", + ChunkLen, varbitMinLength, varbitMaxLength, + )) + } + if enc > varbitDirectEncoding { + panic(fmt.Errorf("unknown varbit value encoding: %v", enc)) + } + c := make(varbitChunk, ChunkLen) + c.setValueEncoding(enc) + return &c +} + +// Add implements chunk. +func (c *varbitChunk) Add(s model.SamplePair) ([]Chunk, error) { + offset := c.nextSampleOffset() + switch { + case c.closed(): + return addToOverflowChunk(c, s) + case offset > varbitNextSampleBitOffsetThreshold: + return c.addLastSample(s), nil + case offset == varbitFirstSampleBitOffset: + return c.addFirstSample(s), nil + case offset == varbitSecondSampleBitOffset: + return c.addSecondSample(s) + } + return c.addLaterSample(s, offset) +} + +// Clone implements chunk. +func (c varbitChunk) Clone() Chunk { + clone := make(varbitChunk, len(c)) + copy(clone, c) + return &clone +} + +// NewIterator implements chunk. 
+func (c varbitChunk) NewIterator() Iterator { + return newVarbitChunkIterator(c) +} + +// Marshal implements chunk. +func (c varbitChunk) Marshal(w io.Writer) error { + n, err := w.Write(c) + if err != nil { + return err + } + if n != cap(c) { + return fmt.Errorf("wanted to write %d bytes, wrote %d", cap(c), n) + } + return nil +} + +// MarshalToBuf implements chunk. +func (c varbitChunk) MarshalToBuf(buf []byte) error { + n := copy(buf, c) + if n != len(c) { + return fmt.Errorf("wanted to copy %d bytes to buffer, copied %d", len(c), n) + } + return nil +} + +// Unmarshal implements chunk. +func (c varbitChunk) Unmarshal(r io.Reader) error { + _, err := io.ReadFull(r, c) + return err +} + +// UnmarshalFromBuf implements chunk. +func (c varbitChunk) UnmarshalFromBuf(buf []byte) error { + if copied := copy(c, buf); copied != cap(c) { + return fmt.Errorf("insufficient bytes copied from buffer during unmarshaling, want %d, got %d", cap(c), copied) + } + return nil +} + +// Encoding implements chunk. +func (c varbitChunk) Encoding() Encoding { return Varbit } + +// Utilization implements chunk. +func (c varbitChunk) Utilization() float64 { + // 15 bytes is the length of the chunk footer. + return math.Min(float64(c.nextSampleOffset()/8+15)/float64(cap(c)), 1) +} + +// Len implements chunk. Runs in O(n). +func (c varbitChunk) Len() int { + it := c.NewIterator() + i := 0 + for ; it.Scan(); i++ { + } + return i +} + +// FirstTime implements chunk. 
+func (c varbitChunk) FirstTime() model.Time { + return model.Time( + binary.BigEndian.Uint64( + c[varbitFirstTimeOffset:], + ), + ) +} + +func (c varbitChunk) firstValue() model.SampleValue { + return model.SampleValue( + math.Float64frombits( + binary.BigEndian.Uint64( + c[varbitFirstValueOffset:], + ), + ), + ) +} + +func (c varbitChunk) lastTime() model.Time { + return model.Time( + binary.BigEndian.Uint64( + c[varbitLastTimeOffset:], + ), + ) +} + +func (c varbitChunk) lastValue() model.SampleValue { + return model.SampleValue( + math.Float64frombits( + binary.BigEndian.Uint64( + c[varbitLastValueOffset:], + ), + ), + ) +} + +func (c varbitChunk) firstTimeDelta() model.Time { + // Only the first 3 bytes are actually the timestamp, so get rid of the + // last one by bitshifting. + return model.Time(c[varbitFirstTimeDeltaOffset+2]) | + model.Time(c[varbitFirstTimeDeltaOffset+1])<<8 | + model.Time(c[varbitFirstTimeDeltaOffset])<<16 +} + +// firstValueDelta returns an undefined result if the encoding type is not 1. +func (c varbitChunk) firstValueDelta() int32 { + return int32(binary.BigEndian.Uint32(c[varbitFirstValueDeltaOffset:])) +} + +// lastTimeDelta returns an undefined result if the chunk is closed already. +func (c varbitChunk) lastTimeDelta() model.Time { + return model.Time(c[varbitLastTimeDeltaOffset+2]) | + model.Time(c[varbitLastTimeDeltaOffset+1])<<8 | + model.Time(c[varbitLastTimeDeltaOffset])<<16 +} + +// setLastTimeDelta must not be called if the chunk is closed already. It most +// not be called with a time that doesn't fit into 24bit, either. +func (c varbitChunk) setLastTimeDelta(dT model.Time) { + if dT > varbitMaxTimeDelta { + panic("Δt overflows 24 bit") + } + c[varbitLastTimeDeltaOffset] = byte(dT >> 16) + c[varbitLastTimeDeltaOffset+1] = byte(dT >> 8) + c[varbitLastTimeDeltaOffset+2] = byte(dT) +} + +// lastValueDelta returns an undefined result if the chunk is closed already. 
+func (c varbitChunk) lastValueDelta() int32 { + return int32(binary.BigEndian.Uint32(c[varbitLastValueDeltaOffset:])) +} + +// setLastValueDelta must not be called if the chunk is closed already. +func (c varbitChunk) setLastValueDelta(dV int32) { + binary.BigEndian.PutUint32(c[varbitLastValueDeltaOffset:], uint32(dV)) +} + +func (c varbitChunk) nextSampleOffset() uint16 { + return binary.BigEndian.Uint16(c[varbitNextSampleBitOffsetOffset:]) +} + +func (c varbitChunk) setNextSampleOffset(offset uint16) { + binary.BigEndian.PutUint16(c[varbitNextSampleBitOffsetOffset:], offset) +} + +func (c varbitChunk) valueEncoding() varbitValueEncoding { + return varbitValueEncoding(c[varbitFlagOffset] & 0x03) +} + +func (c varbitChunk) setValueEncoding(enc varbitValueEncoding) { + if enc > varbitDirectEncoding { + panic("invalid varbit value encoding") + } + c[varbitFlagOffset] &^= 0x03 // Clear. + c[varbitFlagOffset] |= byte(enc) // Set. +} + +func (c varbitChunk) closed() bool { + return c[varbitFlagOffset] > 0x7F // Most significant bit set. +} + +func (c varbitChunk) zeroDDTRepeats() (repeats uint64, offset uint16) { + offset = binary.BigEndian.Uint16(c[varbitCountOffsetBitOffset:]) + if offset == 0 { + return 0, 0 + } + return c.readBitPattern(offset, 7) + 1, offset +} + +func (c varbitChunk) setZeroDDTRepeats(repeats uint64, offset uint16) { + switch repeats { + case 0: + // Just clear the offset. + binary.BigEndian.PutUint16(c[varbitCountOffsetBitOffset:], 0) + return + case 1: + // First time we set a repeat here, so set the offset. But only + // if we haven't reached the footer yet. (If that's the case, we + // would overwrite ourselves below, and we don't need the offset + // later anyway because no more samples will be added to this + // chunk.) + if offset+7 <= varbitNextSampleBitOffsetThreshold { + binary.BigEndian.PutUint16(c[varbitCountOffsetBitOffset:], offset) + } + default: + // For a change, we are writing somewhere where we have written + // before. 
We need to clear the bits first. + posIn1stByte := offset % 8 + c[offset/8] &^= bitMask[7][posIn1stByte] + if posIn1stByte > 1 { + c[offset/8+1] &^= bitMask[posIn1stByte-1][0] + } + } + c.addBitPattern(offset, repeats-1, 7) +} + +func (c varbitChunk) setLastSample(s model.SamplePair) { + binary.BigEndian.PutUint64( + c[varbitLastTimeOffset:], + uint64(s.Timestamp), + ) + binary.BigEndian.PutUint64( + c[varbitLastValueOffset:], + math.Float64bits(float64(s.Value)), + ) +} + +// addFirstSample is a helper method only used by c.add(). It adds timestamp and +// value as base time and value. +func (c *varbitChunk) addFirstSample(s model.SamplePair) []Chunk { + binary.BigEndian.PutUint64( + (*c)[varbitFirstTimeOffset:], + uint64(s.Timestamp), + ) + binary.BigEndian.PutUint64( + (*c)[varbitFirstValueOffset:], + math.Float64bits(float64(s.Value)), + ) + c.setLastSample(s) // To simplify handling of single-sample chunks. + c.setNextSampleOffset(varbitSecondSampleBitOffset) + return []Chunk{c} +} + +// addSecondSample is a helper method only used by c.add(). It calculates the +// first time delta from the provided sample and adds it to the chunk together +// with the provided sample as the last sample. +func (c *varbitChunk) addSecondSample(s model.SamplePair) ([]Chunk, error) { + firstTimeDelta := s.Timestamp - c.FirstTime() + if firstTimeDelta < 0 { + return nil, fmt.Errorf("first Δt is less than zero: %v", firstTimeDelta) + } + if firstTimeDelta > varbitMaxTimeDelta { + // A time delta too great. Still, we can add it as a last sample + // before overflowing. + return c.addLastSample(s), nil + } + (*c)[varbitFirstTimeDeltaOffset] = byte(firstTimeDelta >> 16) + (*c)[varbitFirstTimeDeltaOffset+1] = byte(firstTimeDelta >> 8) + (*c)[varbitFirstTimeDeltaOffset+2] = byte(firstTimeDelta) + + // Also set firstTimeDelta as the last time delta to be able to use the + // normal methods for adding later samples. 
+ c.setLastTimeDelta(firstTimeDelta) + + c.setLastSample(s) + c.setNextSampleOffset(varbitThirdSampleBitOffset) + return []Chunk{c}, nil +} + +// addLastSample is a helper method only used by c.add() and in other helper +// methods called by c.add(). It simply sets the given sample as the last sample +// in the heador and declares the chunk closed. In other words, addLastSample +// adds the very last sample added to this chunk ever, while setLastSample sets +// the sample most recently added to the chunk so that it can be used for the +// calculations required to add the next sample. +func (c *varbitChunk) addLastSample(s model.SamplePair) []Chunk { + c.setLastSample(s) + (*c)[varbitFlagOffset] |= 0x80 + return []Chunk{c} +} + +// addLaterSample is a helper method only used by c.add(). It adds a third or +// later sample. +func (c *varbitChunk) addLaterSample(s model.SamplePair, offset uint16) ([]Chunk, error) { + var ( + lastTime = c.lastTime() + lastTimeDelta = c.lastTimeDelta() + newTimeDelta = s.Timestamp - lastTime + lastValue = c.lastValue() + encoding = c.valueEncoding() + ) + + if newTimeDelta < 0 { + return nil, fmt.Errorf("Δt is less than zero: %v", newTimeDelta) + } + if offset == varbitThirdSampleBitOffset { + offset, encoding = c.prepForThirdSample(lastValue, s.Value, encoding) + } + if newTimeDelta > varbitMaxTimeDelta { + // A time delta too great. Still, we can add it as a last sample + // before overflowing. + return c.addLastSample(s), nil + } + + // Analyze worst case, does it fit? If not, set new sample as the last. + if int(offset)+varbitWorstCaseBitsPerSample[encoding] > ChunkLen*8 { + return c.addLastSample(s), nil + } + + // Transcoding/overflow decisions first. + if encoding == varbitZeroEncoding && s.Value != lastValue { + // Cannot go on with zero encoding. + if offset > ChunkLen*4 { + // Chunk already half full. Don't transcode, overflow instead. 
+ return addToOverflowChunk(c, s) + } + if isInt32(s.Value - lastValue) { + // Trying int encoding looks promising. + return transcodeAndAdd(newVarbitChunk(varbitIntDoubleDeltaEncoding), c, s) + } + return transcodeAndAdd(newVarbitChunk(varbitXOREncoding), c, s) + } + if encoding == varbitIntDoubleDeltaEncoding && !isInt32(s.Value-lastValue) { + // Cannot go on with int encoding. + if offset > ChunkLen*4 { + // Chunk already half full. Don't transcode, overflow instead. + return addToOverflowChunk(c, s) + } + return transcodeAndAdd(newVarbitChunk(varbitXOREncoding), c, s) + } + + offset, overflow := c.addDDTime(offset, lastTimeDelta, newTimeDelta) + if overflow { + return c.addLastSample(s), nil + } + switch encoding { + case varbitZeroEncoding: + // Nothing to do. + case varbitIntDoubleDeltaEncoding: + offset = c.addDDValue(offset, lastValue, s.Value) + case varbitXOREncoding: + offset = c.addXORValue(offset, lastValue, s.Value) + case varbitDirectEncoding: + offset = c.addBitPattern(offset, math.Float64bits(float64(s.Value)), 64) + default: + return nil, fmt.Errorf("unknown Varbit value encoding: %v", encoding) + } + + c.setNextSampleOffset(offset) + c.setLastSample(s) + return []Chunk{c}, nil +} + +func (c varbitChunk) prepForThirdSample( + lastValue, newValue model.SampleValue, encoding varbitValueEncoding, +) (uint16, varbitValueEncoding) { + var ( + offset = varbitThirdSampleBitOffset + firstValue = c.firstValue() + firstValueDelta = lastValue - firstValue + firstXOR = math.Float64bits(float64(firstValue)) ^ math.Float64bits(float64(lastValue)) + _, firstSignificantBits = countBits(firstXOR) + secondXOR = math.Float64bits(float64(lastValue)) ^ math.Float64bits(float64(newValue)) + _, secondSignificantBits = countBits(secondXOR) + ) + // Now pick an initial encoding and prepare things accordingly. + // However, never pick an encoding "below" the one initially set. 
+ switch { + case encoding == varbitZeroEncoding && lastValue == firstValue && lastValue == newValue: + // Stay at zero encoding. + // No value to be set. + // No offset change required. + case encoding <= varbitIntDoubleDeltaEncoding && isInt32(firstValueDelta): + encoding = varbitIntDoubleDeltaEncoding + binary.BigEndian.PutUint32( + c[varbitFirstValueDeltaOffset:], + uint32(int32(firstValueDelta)), + ) + c.setLastValueDelta(int32(firstValueDelta)) + offset += 32 + case encoding == varbitDirectEncoding || firstSignificantBits+secondSignificantBits > 100: + // Heuristics based on three samples only is a bit weak, + // but if we need 50+13 = 63 bits per sample already + // now, we might be better off going for direct encoding. + encoding = varbitDirectEncoding + // Put bit pattern directly where otherwise the delta would have gone. + binary.BigEndian.PutUint64( + c[varbitFirstValueDeltaOffset:], + math.Float64bits(float64(lastValue)), + ) + offset += 64 + default: + encoding = varbitXOREncoding + offset = c.addXORValue(offset, firstValue, lastValue) + } + c.setValueEncoding(encoding) + c.setNextSampleOffset(offset) + return offset, encoding +} + +// addDDTime requires that lastTimeDelta and newTimeDelta are positive and don't overflow 24bit. +func (c varbitChunk) addDDTime(offset uint16, lastTimeDelta, newTimeDelta model.Time) (newOffset uint16, overflow bool) { + timeDD := newTimeDelta - lastTimeDelta + + if !isSignedIntN(int64(timeDD), 23) { + return offset, true + } + + c.setLastTimeDelta(newTimeDelta) + repeats, repeatsOffset := c.zeroDDTRepeats() + + if timeDD == 0 { + if repeats == 0 || repeats == 128 { + // First zeroDDT, or counter full, prepare new counter. + offset = c.addZeroBit(offset) + repeatsOffset = offset + offset += 7 + repeats = 0 + } + c.setZeroDDTRepeats(repeats+1, repeatsOffset) + return offset, false + } + + // No zero repeat. If we had any before, clear the DDT offset. 
+ c.setZeroDDTRepeats(0, repeatsOffset) + + switch { + case isSignedIntN(int64(timeDD), 6): + offset = c.addOneBitsWithTrailingZero(offset, 1) + offset = c.addSignedInt(offset, int64(timeDD), 6) + case isSignedIntN(int64(timeDD), 17): + offset = c.addOneBitsWithTrailingZero(offset, 2) + offset = c.addSignedInt(offset, int64(timeDD), 17) + case isSignedIntN(int64(timeDD), 23): + offset = c.addOneBits(offset, 3) + offset = c.addSignedInt(offset, int64(timeDD), 23) + default: + panic("unexpected required bits for ΔΔt") + } + return offset, false +} + +// addDDValue requires that newValue-lastValue can be represented with an int32. +func (c varbitChunk) addDDValue(offset uint16, lastValue, newValue model.SampleValue) uint16 { + newValueDelta := int64(newValue - lastValue) + lastValueDelta := c.lastValueDelta() + valueDD := newValueDelta - int64(lastValueDelta) + c.setLastValueDelta(int32(newValueDelta)) + + switch { + case valueDD == 0: + return c.addZeroBit(offset) + case isSignedIntN(valueDD, 6): + offset = c.addOneBitsWithTrailingZero(offset, 1) + return c.addSignedInt(offset, valueDD, 6) + case isSignedIntN(valueDD, 13): + offset = c.addOneBitsWithTrailingZero(offset, 2) + return c.addSignedInt(offset, valueDD, 13) + case isSignedIntN(valueDD, 20): + offset = c.addOneBitsWithTrailingZero(offset, 3) + return c.addSignedInt(offset, valueDD, 20) + case isSignedIntN(valueDD, 33): + offset = c.addOneBits(offset, 4) + return c.addSignedInt(offset, valueDD, 33) + default: + panic("unexpected required bits for ΔΔv") + } +} + +func (c varbitChunk) addXORValue(offset uint16, lastValue, newValue model.SampleValue) uint16 { + lastPattern := math.Float64bits(float64(lastValue)) + newPattern := math.Float64bits(float64(newValue)) + xor := lastPattern ^ newPattern + if xor == 0 { + return c.addZeroBit(offset) + } + + lastLeadingBits := c[varbitLastLeadingZerosCountOffset] + lastSignificantBits := c[varbitLastSignificantBitsCountOffset] + newLeadingBits, newSignificantBits := 
countBits(xor) + + // Short entry if the new significant bits fit into the same box as the + // last significant bits. However, should the new significant bits be + // shorter by 10 or more, go for a long entry instead, as we will + // probably save more (11 bit one-time overhead, potentially more to + // save later). + if newLeadingBits >= lastLeadingBits && + newLeadingBits+newSignificantBits <= lastLeadingBits+lastSignificantBits && + lastSignificantBits-newSignificantBits < 10 { + offset = c.addOneBitsWithTrailingZero(offset, 1) + return c.addBitPattern( + offset, + xor>>(64-lastLeadingBits-lastSignificantBits), + uint16(lastSignificantBits), + ) + } + + // Long entry. + c[varbitLastLeadingZerosCountOffset] = newLeadingBits + c[varbitLastSignificantBitsCountOffset] = newSignificantBits + offset = c.addOneBits(offset, 2) + offset = c.addBitPattern(offset, uint64(newLeadingBits), 5) + offset = c.addBitPattern(offset, uint64(newSignificantBits-1), 6) // Note -1! + return c.addBitPattern( + offset, + xor>>(64-newLeadingBits-newSignificantBits), + uint16(newSignificantBits), + ) +} + +func (c varbitChunk) addZeroBit(offset uint16) uint16 { + if offset < varbitNextSampleBitOffsetThreshold { + // Writing a zero to a never touched area is a no-op. + // Just increase the offset. + return offset + 1 + } + newByte := c[offset/8] &^ bitMask[1][offset%8] + c[offset/8] = newByte + // TODO(beorn7): The two lines above could be written as + // c[offset/8] &^= bitMask[1][offset%8] + // However, that tickles a compiler bug with GOARCH=386. 
+ // See https://github.com/prometheus/prometheus/issues/1509 + return offset + 1 +} + +func (c varbitChunk) addOneBits(offset uint16, n uint16) uint16 { + if n > 7 { + panic("unexpected number of control bits") + } + b := 8 - offset%8 + if b > n { + b = n + } + c[offset/8] |= bitMask[b][offset%8] + offset += b + b = n - b + if b > 0 { + c[offset/8] |= bitMask[b][0] + offset += b + } + return offset +} +func (c varbitChunk) addOneBitsWithTrailingZero(offset uint16, n uint16) uint16 { + offset = c.addOneBits(offset, n) + return c.addZeroBit(offset) +} + +// addSignedInt adds i as a signed integer with n bits. It requires i to be +// representable as such. (Check with isSignedIntN first.) +func (c varbitChunk) addSignedInt(offset uint16, i int64, n uint16) uint16 { + if i < 0 && n < 64 { + i += 1 << n + } + return c.addBitPattern(offset, uint64(i), n) +} + +// addBitPattern adds the last n bits of the given pattern. Other bits in the +// pattern must be 0. +func (c varbitChunk) addBitPattern(offset uint16, pattern uint64, n uint16) uint16 { + var ( + byteOffset = offset / 8 + bitsToWrite = 8 - offset%8 + newOffset = offset + n + ) + + // Clean up the parts of the footer we will write into. (But not more as + // we are still using the value related part of the footer when we have + // already overwritten timestamp related parts.) 
+ if newOffset > varbitNextSampleBitOffsetThreshold { + pos := offset + if pos < varbitNextSampleBitOffsetThreshold { + pos = varbitNextSampleBitOffsetThreshold + } + for pos < newOffset { + posInByte := pos % 8 + bitsToClear := newOffset - pos + if bitsToClear > 8-posInByte { + bitsToClear = 8 - posInByte + } + c[pos/8] &^= bitMask[bitsToClear][posInByte] + pos += bitsToClear + } + } + + for n > 0 { + if n <= bitsToWrite { + c[byteOffset] |= byte(pattern << (bitsToWrite - n)) + break + } + c[byteOffset] |= byte(pattern >> (n - bitsToWrite)) + n -= bitsToWrite + bitsToWrite = 8 + byteOffset++ + } + return newOffset +} + +// readBitPattern reads n bits at the given offset and returns them as the last +// n bits in a uint64. +func (c varbitChunk) readBitPattern(offset, n uint16) uint64 { + var ( + result uint64 + byteOffset = offset / 8 + bitOffset = offset % 8 + trailingBits, bitsToRead uint16 + ) + + for n > 0 { + trailingBits = 0 + bitsToRead = 8 - bitOffset + if bitsToRead > n { + trailingBits = bitsToRead - n + bitsToRead = n + } + result <<= bitsToRead + result |= uint64( + (c[byteOffset] & bitMask[bitsToRead][bitOffset]) >> trailingBits, + ) + n -= bitsToRead + byteOffset++ + bitOffset = 0 + } + return result +} + +type varbitChunkIterator struct { + c varbitChunk + // pos is the bit position within the chunk for the next sample to be + // decoded when scan() is called (i.e. it is _not_ the bit position of + // the sample currently returned by value()). The symbolic values + // varbitFirstSampleBitOffset and varbitSecondSampleBitOffset are also + // used for pos. len is the offset of the first bit in the chunk that is + // not part of the payload. If pos==len, then the iterator is positioned + // behind the last sample in the payload. However, the next call of + // scan() still has to check if the chunk is closed, in which case there + // is one more sample, saved in the header. 
To mark the iterator as + // having scanned that last sample, too, pos is set to len+1. + pos, len uint16 + t, dT model.Time + repeats byte // Repeats of ΔΔt=0. + v model.SampleValue + dV int64 // Only used for int value encoding. + leading, significant uint16 + enc varbitValueEncoding + lastError error + rewound bool + nextT model.Time // Only for rewound state. + nextV model.SampleValue // Only for rewound state. +} + +func newVarbitChunkIterator(c varbitChunk) *varbitChunkIterator { + return &varbitChunkIterator{ + c: c, + len: c.nextSampleOffset(), + t: model.Earliest, + enc: c.valueEncoding(), + significant: 1, + } +} + +// lastTimestamp implements Iterator. +func (it *varbitChunkIterator) LastTimestamp() (model.Time, error) { + if it.len == varbitFirstSampleBitOffset { + // No samples in the chunk yet. + return model.Earliest, it.lastError + } + return it.c.lastTime(), it.lastError +} + +// contains implements Iterator. +func (it *varbitChunkIterator) Contains(t model.Time) (bool, error) { + last, err := it.LastTimestamp() + if err != nil { + it.lastError = err + return false, err + } + return !t.Before(it.c.FirstTime()) && + !t.After(last), it.lastError +} + +// scan implements Iterator. +func (it *varbitChunkIterator) Scan() bool { + if it.lastError != nil { + return false + } + if it.rewound { + it.t = it.nextT + it.v = it.nextV + it.rewound = false + return true + } + if it.pos > it.len { + return false + } + if it.pos == it.len && it.repeats == 0 { + it.pos = it.len + 1 + if !it.c.closed() { + return false + } + it.t = it.c.lastTime() + it.v = it.c.lastValue() + return it.lastError == nil + } + if it.pos == varbitFirstSampleBitOffset { + it.t = it.c.FirstTime() + it.v = it.c.firstValue() + it.pos = varbitSecondSampleBitOffset + return it.lastError == nil + } + if it.pos == varbitSecondSampleBitOffset { + if it.len == varbitThirdSampleBitOffset && !it.c.closed() { + // Special case: Chunk has only two samples. 
+ it.t = it.c.lastTime() + it.v = it.c.lastValue() + it.pos = it.len + 1 + return it.lastError == nil + } + it.dT = it.c.firstTimeDelta() + it.t += it.dT + // Value depends on encoding. + switch it.enc { + case varbitZeroEncoding: + it.pos = varbitThirdSampleBitOffset + case varbitIntDoubleDeltaEncoding: + it.dV = int64(it.c.firstValueDelta()) + it.v += model.SampleValue(it.dV) + it.pos = varbitThirdSampleBitOffset + 32 + case varbitXOREncoding: + it.pos = varbitThirdSampleBitOffset + it.readXOR() + case varbitDirectEncoding: + it.v = model.SampleValue(math.Float64frombits( + binary.BigEndian.Uint64(it.c[varbitThirdSampleBitOffset/8:]), + )) + it.pos = varbitThirdSampleBitOffset + 64 + default: + it.lastError = fmt.Errorf("unknown varbit value encoding: %v", it.enc) + } + return it.lastError == nil + } + // 3rd sample or later does not have special cases anymore. + it.readDDT() + switch it.enc { + case varbitZeroEncoding: + // Do nothing. + case varbitIntDoubleDeltaEncoding: + it.readDDV() + case varbitXOREncoding: + it.readXOR() + case varbitDirectEncoding: + it.v = model.SampleValue(math.Float64frombits(it.readBitPattern(64))) + return it.lastError == nil + default: + it.lastError = fmt.Errorf("unknown varbit value encoding: %v", it.enc) + return false + } + return it.lastError == nil +} + +// findAtOrBefore implements Iterator. +func (it *varbitChunkIterator) FindAtOrBefore(t model.Time) bool { + if it.len == 0 || t.Before(it.c.FirstTime()) { + return false + } + last := it.c.lastTime() + if !t.Before(last) { + it.t = last + it.v = it.c.lastValue() + it.pos = it.len + 1 + return true + } + if t == it.t { + return it.lastError == nil + } + if t.Before(it.t) || it.rewound { + it.reset() + } + + var ( + prevT = model.Earliest + prevV model.SampleValue + ) + for it.Scan() && !t.Before(it.t) { + prevT = it.t + prevV = it.v + // TODO(beorn7): If we are in a repeat, we could iterate forward + // much faster. 
+ } + if t == it.t { + return it.lastError == nil + } + it.rewind(prevT, prevV) + return it.lastError == nil +} + +// findAtOrAfter implements Iterator. +func (it *varbitChunkIterator) FindAtOrAfter(t model.Time) bool { + if it.len == 0 || t.After(it.c.lastTime()) { + return false + } + first := it.c.FirstTime() + if !t.After(first) { + it.reset() + return it.Scan() + } + if t == it.t { + return it.lastError == nil + } + if t.Before(it.t) { + it.reset() + } + for it.Scan() && t.After(it.t) { + // TODO(beorn7): If we are in a repeat, we could iterate forward + // much faster. + } + return it.lastError == nil +} + +// value implements Iterator. +func (it *varbitChunkIterator) Value() model.SamplePair { + return model.SamplePair{ + Timestamp: it.t, + Value: it.v, + } +} + +// err implements Iterator. +func (it *varbitChunkIterator) Err() error { + return it.lastError +} + +func (it *varbitChunkIterator) readDDT() { + if it.repeats > 0 { + it.repeats-- + } else { + switch it.readControlBits(3) { + case 0: + it.repeats = byte(it.readBitPattern(7)) + case 1: + it.dT += model.Time(it.readSignedInt(6)) + case 2: + it.dT += model.Time(it.readSignedInt(17)) + case 3: + it.dT += model.Time(it.readSignedInt(23)) + default: + panic("unexpected number of control bits") + } + } + it.t += it.dT +} + +func (it *varbitChunkIterator) readDDV() { + switch it.readControlBits(4) { + case 0: + // Do nothing. + case 1: + it.dV += it.readSignedInt(6) + case 2: + it.dV += it.readSignedInt(13) + case 3: + it.dV += it.readSignedInt(20) + case 4: + it.dV += it.readSignedInt(33) + default: + panic("unexpected number of control bits") + } + it.v += model.SampleValue(it.dV) +} + +func (it *varbitChunkIterator) readXOR() { + switch it.readControlBits(2) { + case 0: + return + case 1: + // Do nothing right now. All done below. 
+ case 2: + it.leading = uint16(it.readBitPattern(5)) + it.significant = uint16(it.readBitPattern(6)) + 1 + default: + panic("unexpected number of control bits") + } + pattern := math.Float64bits(float64(it.v)) + pattern ^= it.readBitPattern(it.significant) << (64 - it.significant - it.leading) + it.v = model.SampleValue(math.Float64frombits(pattern)) +} + +// readControlBits reads successive 1-bits and stops after reading the first +// 0-bit. It also stops once it has read max bits. It returns the number of read +// 1-bits. +func (it *varbitChunkIterator) readControlBits(max uint16) uint16 { + var count uint16 + for count < max && int(it.pos/8) < len(it.c) { + b := it.c[it.pos/8] & bitMask[1][it.pos%8] + it.pos++ + if b == 0 { + return count + } + count++ + } + if int(it.pos/8) >= len(it.c) { + it.lastError = errChunkBoundsExceeded + } + return count +} + +func (it *varbitChunkIterator) readBitPattern(n uint16) uint64 { + if len(it.c)*8 < int(it.pos)+int(n) { + it.lastError = errChunkBoundsExceeded + return 0 + } + u := it.c.readBitPattern(it.pos, n) + it.pos += n + return u +} + +func (it *varbitChunkIterator) readSignedInt(n uint16) int64 { + u := it.readBitPattern(n) + if n < 64 && u >= 1<<(n-1) { + u -= 1 << n + } + return int64(u) +} + +// reset puts the chunk iterator into the state it had upon creation. +func (it *varbitChunkIterator) reset() { + it.pos = 0 + it.t = model.Earliest + it.dT = 0 + it.repeats = 0 + it.v = 0 + it.dV = 0 + it.leading = 0 + it.significant = 1 + it.rewound = false +} + +// rewind "rewinds" the chunk iterator by one step. Since one cannot simply +// rewind a Varbit chunk, the old values have to be provided by the +// caller. Rewinding an already rewound chunk panics. After a call of scan or +// reset, a chunk can be rewound again. 
+func (it *varbitChunkIterator) rewind(t model.Time, v model.SampleValue) { + if it.rewound { + panic("cannot rewind varbit chunk twice") + } + it.rewound = true + it.nextT = it.t + it.nextV = it.v + it.t = t + it.v = v +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/chunk/varbit_helpers.go b/vendor/github.com/prometheus/prometheus/storage/local/chunk/varbit_helpers.go new file mode 100644 index 000000000..cc637a992 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/chunk/varbit_helpers.go @@ -0,0 +1,75 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package chunk + +import "github.com/prometheus/common/model" + +var ( + // bit masks for consecutive bits in a byte at various offsets. + bitMask = [][]byte{ + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, // 0 bit + {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01}, // 1 bit + {0xC0, 0x60, 0x30, 0x18, 0x0C, 0x06, 0x03, 0x01}, // 2 bit + {0xE0, 0x70, 0x38, 0x1C, 0x0E, 0x07, 0x03, 0x01}, // 3 bit + {0xF0, 0x78, 0x3C, 0x1E, 0x0F, 0x07, 0x03, 0x01}, // 4 bit + {0xF8, 0x7C, 0x3E, 0x1F, 0x0F, 0x07, 0x03, 0x01}, // 5 bit + {0xFC, 0x7E, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01}, // 6 bit + {0xFE, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01}, // 7 bit + {0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01}, // 8 bit + } +) + +// isInt32 returns true if v can be represented as an int32. 
+func isInt32(v model.SampleValue) bool { + return model.SampleValue(int32(v)) == v +} + +// countBits returs the number of leading zero bits and the number of +// significant bits after that in the given bit pattern. The maximum number of +// leading zeros is 31 (so that it can be represented by a 5bit number). Leading +// zeros beyond that are considered part of the significant bits. +func countBits(pattern uint64) (leading, significant byte) { + // TODO(beorn7): This would probably be faster with ugly endless switch + // statements. + if pattern == 0 { + return + } + for pattern < 1<<63 { + leading++ + pattern <<= 1 + } + for pattern > 0 { + significant++ + pattern <<= 1 + } + if leading > 31 { // 5 bit limit. + significant += leading - 31 + leading = 31 + } + return +} + +// isSignedIntN returns if n can be represented as a signed int with the given +// bit length. +func isSignedIntN(i int64, n byte) bool { + upper := int64(1) << (n - 1) + if i >= upper { + return false + } + lower := upper - (1 << n) + if i < lower { + return false + } + return true +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/codable/codable.go b/vendor/github.com/prometheus/prometheus/storage/local/codable/codable.go new file mode 100644 index 000000000..ebabdf456 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/codable/codable.go @@ -0,0 +1,467 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package codable provides types that implement encoding.BinaryMarshaler and +// encoding.BinaryUnmarshaler and functions that help to encode and decode +// primitives. The Prometheus storage backend uses them to persist objects to +// files and to save objects in LevelDB. +// +// The encodings used in this package are designed in a way that objects can be +// unmarshaled from a continuous byte stream, i.e. the information when to stop +// reading is determined by the format. No separate termination information is +// needed. +// +// Strings are encoded as the length of their bytes as a varint followed by +// their bytes. +// +// Slices are encoded as their length as a varint followed by their elements. +// +// Maps are encoded as the number of mappings as a varint, followed by the +// mappings, each of which consists of the key followed by the value. +package codable + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "sync" + + "github.com/prometheus/common/model" +) + +// A byteReader is an io.ByteReader that also implements the vanilla io.Reader +// interface. +type byteReader interface { + io.Reader + io.ByteReader +} + +// bufPool is a pool for staging buffers. Using a pool allows concurrency-safe +// reuse of buffers +var bufPool sync.Pool + +// getBuf returns a buffer from the pool. The length of the returned slice is l. +func getBuf(l int) []byte { + x := bufPool.Get() + if x == nil { + return make([]byte, l) + } + buf := x.([]byte) + if cap(buf) < l { + return make([]byte, l) + } + return buf[:l] +} + +// putBuf returns a buffer to the pool. +func putBuf(buf []byte) { + bufPool.Put(buf) +} + +// EncodeVarint encodes an int64 as a varint and writes it to an io.Writer. +// It returns the number of bytes written. +// This is a GC-friendly implementation that takes the required staging buffer +// from a buffer pool. 
+func EncodeVarint(w io.Writer, i int64) (int, error) { + buf := getBuf(binary.MaxVarintLen64) + defer putBuf(buf) + + bytesWritten := binary.PutVarint(buf, i) + _, err := w.Write(buf[:bytesWritten]) + return bytesWritten, err +} + +// EncodeUvarint encodes an uint64 as a varint and writes it to an io.Writer. +// It returns the number of bytes written. +// This is a GC-friendly implementation that takes the required staging buffer +// from a buffer pool. +func EncodeUvarint(w io.Writer, i uint64) (int, error) { + buf := getBuf(binary.MaxVarintLen64) + defer putBuf(buf) + + bytesWritten := binary.PutUvarint(buf, i) + _, err := w.Write(buf[:bytesWritten]) + return bytesWritten, err +} + +// EncodeUint64 writes an uint64 to an io.Writer in big-endian byte-order. +// This is a GC-friendly implementation that takes the required staging buffer +// from a buffer pool. +func EncodeUint64(w io.Writer, u uint64) error { + buf := getBuf(8) + defer putBuf(buf) + + binary.BigEndian.PutUint64(buf, u) + _, err := w.Write(buf) + return err +} + +// DecodeUint64 reads an uint64 from an io.Reader in big-endian byte-order. +// This is a GC-friendly implementation that takes the required staging buffer +// from a buffer pool. +func DecodeUint64(r io.Reader) (uint64, error) { + buf := getBuf(8) + defer putBuf(buf) + + if _, err := io.ReadFull(r, buf); err != nil { + return 0, err + } + return binary.BigEndian.Uint64(buf), nil +} + +// encodeString writes the varint encoded length followed by the bytes of s to +// b. +func encodeString(b *bytes.Buffer, s string) error { + // Note that this should have used EncodeUvarint but a glitch happened + // while designing the checkpoint format. + if _, err := EncodeVarint(b, int64(len(s))); err != nil { + return err + } + if _, err := b.WriteString(s); err != nil { + return err + } + return nil +} + +// decodeString decodes a string encoded by encodeString. 
+func decodeString(b byteReader) (string, error) { + length, err := binary.ReadVarint(b) + if length < 0 { + err = fmt.Errorf("found negative string length during decoding: %d", length) + } + if err != nil { + return "", err + } + + buf := getBuf(int(length)) + defer putBuf(buf) + + if _, err := io.ReadFull(b, buf); err != nil { + return "", err + } + return string(buf), nil +} + +// A Metric is a model.Metric that implements +// encoding.BinaryMarshaler and encoding.BinaryUnmarshaler. +type Metric model.Metric + +// MarshalBinary implements encoding.BinaryMarshaler. +func (m Metric) MarshalBinary() ([]byte, error) { + buf := &bytes.Buffer{} + // Note that this should have used EncodeUvarint but a glitch happened + // while designing the checkpoint format. + if _, err := EncodeVarint(buf, int64(len(m))); err != nil { + return nil, err + } + for l, v := range m { + if err := encodeString(buf, string(l)); err != nil { + return nil, err + } + if err := encodeString(buf, string(v)); err != nil { + return nil, err + } + } + return buf.Bytes(), nil +} + +// UnmarshalBinary implements encoding.BinaryUnmarshaler. It can be used with the +// zero value of Metric. +func (m *Metric) UnmarshalBinary(buf []byte) error { + return m.UnmarshalFromReader(bytes.NewReader(buf)) +} + +// UnmarshalFromReader unmarshals a Metric from a reader that implements +// both, io.Reader and io.ByteReader. It can be used with the zero value of +// Metric. 
+func (m *Metric) UnmarshalFromReader(r byteReader) error { + numLabelPairs, err := binary.ReadVarint(r) + if numLabelPairs < 0 { + err = fmt.Errorf("found negative numLabelPairs during unmarshaling: %d", numLabelPairs) + } + if err != nil { + return err + } + *m = make(Metric, numLabelPairs) + + for ; numLabelPairs > 0; numLabelPairs-- { + ln, err := decodeString(r) + if err != nil { + return err + } + lv, err := decodeString(r) + if err != nil { + return err + } + (*m)[model.LabelName(ln)] = model.LabelValue(lv) + } + return nil +} + +// A Fingerprint is a model.Fingerprint that implements +// encoding.BinaryMarshaler and encoding.BinaryUnmarshaler. The implementation +// depends on model.Fingerprint to be convertible to uint64. It encodes +// the fingerprint as a big-endian uint64. +type Fingerprint model.Fingerprint + +// MarshalBinary implements encoding.BinaryMarshaler. +func (fp Fingerprint) MarshalBinary() ([]byte, error) { + b := make([]byte, 8) + binary.BigEndian.PutUint64(b, uint64(fp)) + return b, nil +} + +// UnmarshalBinary implements encoding.BinaryUnmarshaler. +func (fp *Fingerprint) UnmarshalBinary(buf []byte) error { + *fp = Fingerprint(binary.BigEndian.Uint64(buf)) + return nil +} + +// FingerprintSet is a map[model.Fingerprint]struct{} that +// implements encoding.BinaryMarshaler and encoding.BinaryUnmarshaler. Its +// binary form is identical to that of Fingerprints. +type FingerprintSet map[model.Fingerprint]struct{} + +// MarshalBinary implements encoding.BinaryMarshaler. +func (fps FingerprintSet) MarshalBinary() ([]byte, error) { + b := make([]byte, binary.MaxVarintLen64+len(fps)*8) + lenBytes := binary.PutVarint(b, int64(len(fps))) + offset := lenBytes + + for fp := range fps { + binary.BigEndian.PutUint64(b[offset:], uint64(fp)) + offset += 8 + } + return b[:len(fps)*8+lenBytes], nil +} + +// UnmarshalBinary implements encoding.BinaryUnmarshaler. 
+func (fps *FingerprintSet) UnmarshalBinary(buf []byte) error { + numFPs, offset := binary.Varint(buf) + if offset <= 0 { + return fmt.Errorf("could not decode length of Fingerprints, varint decoding returned %d", offset) + } + *fps = make(FingerprintSet, numFPs) + + for i := 0; i < int(numFPs); i++ { + (*fps)[model.Fingerprint(binary.BigEndian.Uint64(buf[offset+i*8:]))] = struct{}{} + } + return nil +} + +// Fingerprints is a model.Fingerprints that implements +// encoding.BinaryMarshaler and encoding.BinaryUnmarshaler. Its binary form is +// identical to that of FingerprintSet. +type Fingerprints model.Fingerprints + +// MarshalBinary implements encoding.BinaryMarshaler. +func (fps Fingerprints) MarshalBinary() ([]byte, error) { + b := make([]byte, binary.MaxVarintLen64+len(fps)*8) + lenBytes := binary.PutVarint(b, int64(len(fps))) + + for i, fp := range fps { + binary.BigEndian.PutUint64(b[i*8+lenBytes:], uint64(fp)) + } + return b[:len(fps)*8+lenBytes], nil +} + +// UnmarshalBinary implements encoding.BinaryUnmarshaler. +func (fps *Fingerprints) UnmarshalBinary(buf []byte) error { + numFPs, offset := binary.Varint(buf) + if offset <= 0 { + return fmt.Errorf("could not decode length of Fingerprints, varint decoding returned %d", offset) + } + *fps = make(Fingerprints, numFPs) + + for i := range *fps { + (*fps)[i] = model.Fingerprint(binary.BigEndian.Uint64(buf[offset+i*8:])) + } + return nil +} + +// LabelPair is a model.LabelPair that implements +// encoding.BinaryMarshaler and encoding.BinaryUnmarshaler. +type LabelPair model.LabelPair + +// MarshalBinary implements encoding.BinaryMarshaler. +func (lp LabelPair) MarshalBinary() ([]byte, error) { + buf := &bytes.Buffer{} + if err := encodeString(buf, string(lp.Name)); err != nil { + return nil, err + } + if err := encodeString(buf, string(lp.Value)); err != nil { + return nil, err + } + return buf.Bytes(), nil +} + +// UnmarshalBinary implements encoding.BinaryUnmarshaler. 
+func (lp *LabelPair) UnmarshalBinary(buf []byte) error { + r := bytes.NewReader(buf) + n, err := decodeString(r) + if err != nil { + return err + } + v, err := decodeString(r) + if err != nil { + return err + } + lp.Name = model.LabelName(n) + lp.Value = model.LabelValue(v) + return nil +} + +// LabelName is a model.LabelName that implements +// encoding.BinaryMarshaler and encoding.BinaryUnmarshaler. +type LabelName model.LabelName + +// MarshalBinary implements encoding.BinaryMarshaler. +func (l LabelName) MarshalBinary() ([]byte, error) { + buf := &bytes.Buffer{} + if err := encodeString(buf, string(l)); err != nil { + return nil, err + } + return buf.Bytes(), nil +} + +// UnmarshalBinary implements encoding.BinaryUnmarshaler. +func (l *LabelName) UnmarshalBinary(buf []byte) error { + r := bytes.NewReader(buf) + n, err := decodeString(r) + if err != nil { + return err + } + *l = LabelName(n) + return nil +} + +// LabelValueSet is a map[model.LabelValue]struct{} that implements +// encoding.BinaryMarshaler and encoding.BinaryUnmarshaler. Its binary form is +// identical to that of LabelValues. +type LabelValueSet map[model.LabelValue]struct{} + +// MarshalBinary implements encoding.BinaryMarshaler. +func (vs LabelValueSet) MarshalBinary() ([]byte, error) { + buf := &bytes.Buffer{} + // Note that this should have used EncodeUvarint but a glitch happened + // while designing the checkpoint format. + if _, err := EncodeVarint(buf, int64(len(vs))); err != nil { + return nil, err + } + for v := range vs { + if err := encodeString(buf, string(v)); err != nil { + return nil, err + } + } + return buf.Bytes(), nil +} + +// UnmarshalBinary implements encoding.BinaryUnmarshaler. 
+func (vs *LabelValueSet) UnmarshalBinary(buf []byte) error { + r := bytes.NewReader(buf) + numValues, err := binary.ReadVarint(r) + if numValues < 0 { + err = fmt.Errorf("found negative number of values during unmarshaling: %d", numValues) + } + if err != nil { + return err + } + *vs = make(LabelValueSet, numValues) + + for i := int64(0); i < numValues; i++ { + v, err := decodeString(r) + if err != nil { + return err + } + (*vs)[model.LabelValue(v)] = struct{}{} + } + return nil +} + +// LabelValues is a model.LabelValues that implements +// encoding.BinaryMarshaler and encoding.BinaryUnmarshaler. Its binary form is +// identical to that of LabelValueSet. +type LabelValues model.LabelValues + +// MarshalBinary implements encoding.BinaryMarshaler. +func (vs LabelValues) MarshalBinary() ([]byte, error) { + buf := &bytes.Buffer{} + // Note that this should have used EncodeUvarint but a glitch happened + // while designing the checkpoint format. + if _, err := EncodeVarint(buf, int64(len(vs))); err != nil { + return nil, err + } + for _, v := range vs { + if err := encodeString(buf, string(v)); err != nil { + return nil, err + } + } + return buf.Bytes(), nil +} + +// UnmarshalBinary implements encoding.BinaryUnmarshaler. +func (vs *LabelValues) UnmarshalBinary(buf []byte) error { + r := bytes.NewReader(buf) + numValues, err := binary.ReadVarint(r) + if numValues < 0 { + err = fmt.Errorf("found negative number of values during unmarshaling: %d", numValues) + } + if err != nil { + return err + } + *vs = make(LabelValues, numValues) + + for i := range *vs { + v, err := decodeString(r) + if err != nil { + return err + } + (*vs)[i] = model.LabelValue(v) + } + return nil +} + +// TimeRange is used to define a time range and implements +// encoding.BinaryMarshaler and encoding.BinaryUnmarshaler. +type TimeRange struct { + First, Last model.Time +} + +// MarshalBinary implements encoding.BinaryMarshaler. 
+func (tr TimeRange) MarshalBinary() ([]byte, error) { + buf := &bytes.Buffer{} + if _, err := EncodeVarint(buf, int64(tr.First)); err != nil { + return nil, err + } + if _, err := EncodeVarint(buf, int64(tr.Last)); err != nil { + return nil, err + } + return buf.Bytes(), nil +} + +// UnmarshalBinary implements encoding.BinaryUnmarshaler. +func (tr *TimeRange) UnmarshalBinary(buf []byte) error { + r := bytes.NewReader(buf) + first, err := binary.ReadVarint(r) + if err != nil { + return err + } + last, err := binary.ReadVarint(r) + if err != nil { + return err + } + tr.First = model.Time(first) + tr.Last = model.Time(last) + return nil +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/crashrecovery.go b/vendor/github.com/prometheus/prometheus/storage/local/crashrecovery.go new file mode 100644 index 000000000..822678b4d --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/crashrecovery.go @@ -0,0 +1,559 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package local + +import ( + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strings" + "sync/atomic" + + "github.com/prometheus/common/log" + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/local/chunk" + "github.com/prometheus/prometheus/storage/local/codable" + "github.com/prometheus/prometheus/storage/local/index" +) + +// recoverFromCrash is called by loadSeriesMapAndHeads if the persistence +// appears to be dirty after the loading (either because the loading resulted in +// an error or because the persistence was dirty from the start). Not goroutine +// safe. Only call before anything else is running (except index processing +// queue as started by newPersistence). +func (p *persistence) recoverFromCrash(fingerprintToSeries map[model.Fingerprint]*memorySeries) error { + // TODO(beorn): We need proper tests for the crash recovery. + log.Warn("Starting crash recovery. Prometheus is inoperational until complete.") + log.Warn("To avoid crash recovery in the future, shut down Prometheus with SIGTERM or a HTTP POST to /-/quit.") + + fpsSeen := map[model.Fingerprint]struct{}{} + count := 0 + seriesDirNameFmt := fmt.Sprintf("%%0%dx", seriesDirNameLen) + + // Delete the fingerprint mapping file as it might be stale or + // corrupt. We'll rebuild the mappings as we go. + if err := os.RemoveAll(p.mappingsFileName()); err != nil { + return fmt.Errorf("couldn't remove old fingerprint mapping file %s: %s", p.mappingsFileName(), err) + } + // The mappings to rebuild. 
+ fpm := fpMappings{} + + log.Info("Scanning files.") + for i := 0; i < 1<<(seriesDirNameLen*4); i++ { + dirname := filepath.Join(p.basePath, fmt.Sprintf(seriesDirNameFmt, i)) + dir, err := os.Open(dirname) + if os.IsNotExist(err) { + continue + } + if err != nil { + return err + } + for fis := []os.FileInfo{}; err != io.EOF; fis, err = dir.Readdir(1024) { + if err != nil { + dir.Close() + return err + } + for _, fi := range fis { + fp, ok := p.sanitizeSeries(dirname, fi, fingerprintToSeries, fpm) + if ok { + fpsSeen[fp] = struct{}{} + } + count++ + if count%10000 == 0 { + log.Infof("%d files scanned.", count) + } + } + } + dir.Close() + } + log.Infof("File scan complete. %d series found.", len(fpsSeen)) + + log.Info("Checking for series without series file.") + for fp, s := range fingerprintToSeries { + if _, seen := fpsSeen[fp]; !seen { + // fp exists in fingerprintToSeries, but has no representation on disk. + if s.persistWatermark >= len(s.chunkDescs) { + // Oops, everything including the head chunk was + // already persisted, but nothing on disk. Or + // the persistWatermark is plainly wrong. Thus, + // we lost that series completely. Clean up the + // remnants. + delete(fingerprintToSeries, fp) + if err := p.purgeArchivedMetric(fp); err != nil { + // Purging the archived metric didn't work, so try + // to unindex it, just in case it's in the indexes. + p.unindexMetric(fp, s.metric) + } + log.Warnf("Lost series detected: fingerprint %v, metric %v.", fp, s.metric) + continue + } + // If we are here, the only chunks we have are the chunks in the checkpoint. + // Adjust things accordingly. 
+ if s.persistWatermark > 0 || s.chunkDescsOffset != 0 { + minLostChunks := s.persistWatermark + s.chunkDescsOffset + if minLostChunks <= 0 { + log.Warnf( + "Possible loss of chunks for fingerprint %v, metric %v.", + fp, s.metric, + ) + } else { + log.Warnf( + "Lost at least %d chunks for fingerprint %v, metric %v.", + minLostChunks, fp, s.metric, + ) + } + s.chunkDescs = append( + make([]*chunk.Desc, 0, len(s.chunkDescs)-s.persistWatermark), + s.chunkDescs[s.persistWatermark:]..., + ) + chunk.NumMemDescs.Sub(float64(s.persistWatermark)) + s.persistWatermark = 0 + s.chunkDescsOffset = 0 + } + maybeAddMapping(fp, s.metric, fpm) + fpsSeen[fp] = struct{}{} // Add so that fpsSeen is complete. + } + } + log.Info("Check for series without series file complete.") + + if err := p.cleanUpArchiveIndexes(fingerprintToSeries, fpsSeen, fpm); err != nil { + return err + } + if err := p.rebuildLabelIndexes(fingerprintToSeries); err != nil { + return err + } + // Finally rewrite the mappings file if there are any mappings. + if len(fpm) > 0 { + if err := p.checkpointFPMappings(fpm); err != nil { + return err + } + } + + p.dirtyMtx.Lock() + // Only declare storage clean if it didn't become dirty during crash recovery. + if !p.becameDirty { + p.dirty = false + } + p.dirtyMtx.Unlock() + + log.Warn("Crash recovery complete.") + return nil +} + +// sanitizeSeries sanitizes a series based on its series file as defined by the +// provided directory and FileInfo. The method returns the fingerprint as +// derived from the directory and file name, and whether the provided file has +// been sanitized. A file that failed to be sanitized is moved into the +// "orphaned" sub-directory, if possible. +// +// The following steps are performed: +// +// - A file whose name doesn't comply with the naming scheme of a series file is +// simply moved into the orphaned directory. +// +// - If the size of the series file isn't a multiple of the chunk size, +// extraneous bytes are truncated. 
If the truncation fails, the file is +// moved into the orphaned directory. +// +// - A file that is empty (after truncation) is deleted. +// +// - A series that is not archived (i.e. it is in the fingerprintToSeries map) +// is checked for consistency of its various parameters (like persist +// watermark, offset of chunkDescs etc.). In particular, overlap between an +// in-memory head chunk with the most recent persisted chunk is +// checked. Inconsistencies are rectified. +// +// - A series that is archived (i.e. it is not in the fingerprintToSeries map) +// is checked for its presence in the index of archived series. If it cannot +// be found there, it is moved into the orphaned directory. +func (p *persistence) sanitizeSeries( + dirname string, fi os.FileInfo, + fingerprintToSeries map[model.Fingerprint]*memorySeries, + fpm fpMappings, +) (model.Fingerprint, bool) { + var ( + fp model.Fingerprint + err error + filename = filepath.Join(dirname, fi.Name()) + s *memorySeries + ) + + purge := func() { + if fp != 0 { + var metric model.Metric + if s != nil { + metric = s.metric + } + if err = p.quarantineSeriesFile( + fp, errors.New("purge during crash recovery"), metric, + ); err == nil { + return + } + log. + With("file", filename). + With("error", err). + Error("Failed to move lost series file to orphaned directory.") + } + // If we are here, we are either purging an incorrectly named + // file, or quarantining has failed. So simply delete the file. + if err = os.Remove(filename); err != nil { + log. + With("file", filename). + With("error", err). 
+ Error("Failed to delete lost series file.") + } + } + + if len(fi.Name()) != fpLen-seriesDirNameLen+len(seriesFileSuffix) || + !strings.HasSuffix(fi.Name(), seriesFileSuffix) { + log.Warnf("Unexpected series file name %s.", filename) + purge() + return fp, false + } + if fp, err = model.FingerprintFromString(filepath.Base(dirname) + fi.Name()[:fpLen-seriesDirNameLen]); err != nil { + log.Warnf("Error parsing file name %s: %s", filename, err) + purge() + return fp, false + } + + bytesToTrim := fi.Size() % int64(chunkLenWithHeader) + chunksInFile := int(fi.Size()) / chunkLenWithHeader + modTime := fi.ModTime() + if bytesToTrim != 0 { + log.Warnf( + "Truncating file %s to exactly %d chunks, trimming %d extraneous bytes.", + filename, chunksInFile, bytesToTrim, + ) + f, err := os.OpenFile(filename, os.O_WRONLY, 0640) + if err != nil { + log.Errorf("Could not open file %s: %s", filename, err) + purge() + return fp, false + } + if err := f.Truncate(fi.Size() - bytesToTrim); err != nil { + log.Errorf("Failed to truncate file %s: %s", filename, err) + purge() + return fp, false + } + } + if chunksInFile == 0 { + log.Warnf("No chunks left in file %s.", filename) + purge() + return fp, false + } + + s, ok := fingerprintToSeries[fp] + if ok { // This series is supposed to not be archived. + if s == nil { + panic("fingerprint mapped to nil pointer") + } + maybeAddMapping(fp, s.metric, fpm) + if !p.pedanticChecks && + bytesToTrim == 0 && + s.chunkDescsOffset != -1 && + chunksInFile == s.chunkDescsOffset+s.persistWatermark && + modTime.Equal(s.modTime) { + // Everything is consistent. We are good. + return fp, true + } + // If we are here, we cannot be sure the series file is + // consistent with the checkpoint, so we have to take a closer + // look. + if s.headChunkClosed { + // This is the easy case as we have all chunks on + // disk. Treat this series as a freshly unarchived one + // by loading the chunkDescs and setting all parameters + // based on the loaded chunkDescs. 
+ cds, err := p.loadChunkDescs(fp, 0) + if err != nil { + log.Errorf( + "Failed to load chunk descriptors for metric %v, fingerprint %v: %s", + s.metric, fp, err, + ) + purge() + return fp, false + } + log.Warnf( + "Treating recovered metric %v, fingerprint %v, as freshly unarchived, with %d chunks in series file.", + s.metric, fp, len(cds), + ) + s.chunkDescs = cds + s.chunkDescsOffset = 0 + s.savedFirstTime = cds[0].FirstTime() + s.lastTime, err = cds[len(cds)-1].LastTime() + if err != nil { + log.Errorf( + "Failed to determine time of the last sample for metric %v, fingerprint %v: %s", + s.metric, fp, err, + ) + purge() + return fp, false + } + s.persistWatermark = len(cds) + s.modTime = modTime + // Finally, evict again all chunk.Descs except the latest one to save memory. + s.evictChunkDescs(len(cds) - 1) + return fp, true + } + // This is the tricky one: We have chunks from heads.db, but + // some of those chunks might already be in the series + // file. Strategy: Take the last time of the most recent chunk + // in the series file. Then find the oldest chunk among those + // from heads.db that has a first time later or equal to the + // last time from the series file. Throw away the older chunks + // from heads.db and stitch the parts together. + + // First, throw away the chunkDescs without chunks. 
+ s.chunkDescs = s.chunkDescs[s.persistWatermark:] + chunk.NumMemDescs.Sub(float64(s.persistWatermark)) + cds, err := p.loadChunkDescs(fp, 0) + if err != nil { + log.Errorf( + "Failed to load chunk descriptors for metric %v, fingerprint %v: %s", + s.metric, fp, err, + ) + purge() + return fp, false + } + s.persistWatermark = len(cds) + s.chunkDescsOffset = 0 + s.savedFirstTime = cds[0].FirstTime() + s.modTime = modTime + + lastTime, err := cds[len(cds)-1].LastTime() + if err != nil { + log.Errorf( + "Failed to determine time of the last sample for metric %v, fingerprint %v: %s", + s.metric, fp, err, + ) + purge() + return fp, false + } + keepIdx := -1 + for i, cd := range s.chunkDescs { + if cd.FirstTime() >= lastTime { + keepIdx = i + break + } + } + if keepIdx == -1 { + log.Warnf( + "Recovered metric %v, fingerprint %v: all %d chunks recovered from series file.", + s.metric, fp, chunksInFile, + ) + chunk.NumMemDescs.Sub(float64(len(s.chunkDescs))) + atomic.AddInt64(&chunk.NumMemChunks, int64(-len(s.chunkDescs))) + s.chunkDescs = cds + s.headChunkClosed = true + // Finally, evict again all chunk.Descs except the latest one to save memory. + s.evictChunkDescs(len(cds) - 1) + return fp, true + } + log.Warnf( + "Recovered metric %v, fingerprint %v: recovered %d chunks from series file, recovered %d chunks from checkpoint.", + s.metric, fp, chunksInFile, len(s.chunkDescs)-keepIdx, + ) + chunk.NumMemDescs.Sub(float64(keepIdx)) + atomic.AddInt64(&chunk.NumMemChunks, int64(-keepIdx)) + chunkDescsToEvict := len(cds) + if keepIdx == len(s.chunkDescs) { + // No chunks from series file left, head chunk is evicted, so declare it closed. + s.headChunkClosed = true + chunkDescsToEvict-- // Keep one chunk.Desc in this case to avoid a series with zero chunk.Descs. + } + s.chunkDescs = append(cds, s.chunkDescs[keepIdx:]...) + // Finally, evict again chunk.Descs without chunk to save memory. 
+ s.evictChunkDescs(chunkDescsToEvict) + return fp, true + } + // This series is supposed to be archived. + metric, err := p.archivedMetric(fp) + if err != nil { + log.Errorf( + "Fingerprint %v assumed archived but couldn't be looked up in archived index: %s", + fp, err, + ) + purge() + return fp, false + } + if metric == nil { + log.Warnf( + "Fingerprint %v assumed archived but couldn't be found in archived index.", + fp, + ) + purge() + return fp, false + } + // This series looks like a properly archived one. + maybeAddMapping(fp, metric, fpm) + return fp, true +} + +func (p *persistence) cleanUpArchiveIndexes( + fpToSeries map[model.Fingerprint]*memorySeries, + fpsSeen map[model.Fingerprint]struct{}, + fpm fpMappings, +) error { + log.Info("Cleaning up archive indexes.") + var fp codable.Fingerprint + var m codable.Metric + count := 0 + if err := p.archivedFingerprintToMetrics.ForEach(func(kv index.KeyValueAccessor) error { + count++ + if count%10000 == 0 { + log.Infof("%d archived metrics checked.", count) + } + if err := kv.Key(&fp); err != nil { + return err + } + _, fpSeen := fpsSeen[model.Fingerprint(fp)] + inMemory := false + if fpSeen { + _, inMemory = fpToSeries[model.Fingerprint(fp)] + } + if !fpSeen || inMemory { + if inMemory { + log.Warnf("Archive clean-up: Fingerprint %v is not archived. Purging from archive indexes.", model.Fingerprint(fp)) + } + if !fpSeen { + log.Warnf("Archive clean-up: Fingerprint %v is unknown. Purging from archive indexes.", model.Fingerprint(fp)) + } + // It's fine if the fp is not in the archive indexes. + if _, err := p.archivedFingerprintToMetrics.Delete(fp); err != nil { + return err + } + // Delete from timerange index, too. + _, err := p.archivedFingerprintToTimeRange.Delete(fp) + return err + } + // fp is legitimately archived. Now we need the metric to check for a mapped fingerprint. 
+		if err := kv.Value(&m); err != nil {
+			return err
+		}
+		maybeAddMapping(model.Fingerprint(fp), model.Metric(m), fpm)
+		// Make sure it is in timerange index, too.
+		has, err := p.archivedFingerprintToTimeRange.Has(fp)
+		if err != nil {
+			return err
+		}
+		if has {
+			return nil // All good.
+		}
+		// Supply the fingerprint argument for the %v verb; without it the log
+		// line prints "%!v(MISSING)" (go vet printf catches this).
+		log.Warnf("Archive clean-up: Fingerprint %v is not in time-range index. Unarchiving it for recovery.", model.Fingerprint(fp))
+		// Again, it's fine if fp is not in the archive index.
+		if _, err := p.archivedFingerprintToMetrics.Delete(fp); err != nil {
+			return err
+		}
+		cds, err := p.loadChunkDescs(model.Fingerprint(fp), 0)
+		if err != nil {
+			return err
+		}
+		series, err := newMemorySeries(model.Metric(m), cds, p.seriesFileModTime(model.Fingerprint(fp)))
+		if err != nil {
+			return err
+		}
+		fpToSeries[model.Fingerprint(fp)] = series
+		// Evict all but one chunk.Desc to save memory.
+		series.evictChunkDescs(len(cds) - 1)
+		return nil
+	}); err != nil {
+		return err
+	}
+	count = 0
+	if err := p.archivedFingerprintToTimeRange.ForEach(func(kv index.KeyValueAccessor) error {
+		count++
+		if count%10000 == 0 {
+			log.Infof("%d archived time ranges checked.", count)
+		}
+		if err := kv.Key(&fp); err != nil {
+			return err
+		}
+		has, err := p.archivedFingerprintToMetrics.Has(fp)
+		if err != nil {
+			return err
+		}
+		if has {
+			return nil // All good.
+		}
+		log.Warnf("Archive clean-up: Purging unknown fingerprint %v in time-range index.", fp)
+		deleted, err := p.archivedFingerprintToTimeRange.Delete(fp)
+		if err != nil {
+			return err
+		}
+		if !deleted {
+			log.Errorf("Fingerprint %v to be deleted from archivedFingerprintToTimeRange not found. 
This should never happen.", fp) + } + return nil + }); err != nil { + return err + } + log.Info("Clean-up of archive indexes complete.") + return nil +} + +func (p *persistence) rebuildLabelIndexes( + fpToSeries map[model.Fingerprint]*memorySeries, +) error { + count := 0 + log.Info("Rebuilding label indexes.") + log.Info("Indexing metrics in memory.") + for fp, s := range fpToSeries { + p.indexMetric(fp, s.metric) + count++ + if count%10000 == 0 { + log.Infof("%d metrics queued for indexing.", count) + } + } + log.Info("Indexing archived metrics.") + var fp codable.Fingerprint + var m codable.Metric + if err := p.archivedFingerprintToMetrics.ForEach(func(kv index.KeyValueAccessor) error { + if err := kv.Key(&fp); err != nil { + return err + } + if err := kv.Value(&m); err != nil { + return err + } + p.indexMetric(model.Fingerprint(fp), model.Metric(m)) + count++ + if count%10000 == 0 { + log.Infof("%d metrics queued for indexing.", count) + } + return nil + }); err != nil { + return err + } + log.Info("All requests for rebuilding the label indexes queued. (Actual processing may lag behind.)") + return nil +} + +// maybeAddMapping adds a fingerprint mapping to fpm if the FastFingerprint of m is different from fp. 
+func maybeAddMapping(fp model.Fingerprint, m model.Metric, fpm fpMappings) { + if rawFP := m.FastFingerprint(); rawFP != fp { + log.Warnf( + "Metric %v with fingerprint %v is mapped from raw fingerprint %v.", + m, fp, rawFP, + ) + if mappedFPs, ok := fpm[rawFP]; ok { + mappedFPs[metricToUniqueString(m)] = fp + } else { + fpm[rawFP] = map[string]model.Fingerprint{ + metricToUniqueString(m): fp, + } + } + } +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/heads.go b/vendor/github.com/prometheus/prometheus/storage/local/heads.go new file mode 100644 index 000000000..887659170 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/heads.go @@ -0,0 +1,261 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package local + +import ( + "bufio" + "encoding/binary" + "fmt" + "io" + "os" + "time" + + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/local/chunk" + "github.com/prometheus/prometheus/storage/local/codable" +) + +const ( + headsFileName = "heads.db" + headsTempFileName = "heads.db.tmp" + headsFormatVersion = 2 + headsFormatLegacyVersion = 1 // Can read, but will never write. + headsMagicString = "PrometheusHeads" +) + +// headsScanner is a scanner to read time series with their heads from a +// heads.db file. It follows a similar semantics as the bufio.Scanner. +// It is not safe to use a headsScanner concurrently. 
+type headsScanner struct { + f *os.File + r *bufio.Reader + fp model.Fingerprint // Read after each scan() call that has returned true. + series *memorySeries // Read after each scan() call that has returned true. + version int64 // Read after newHeadsScanner has returned. + seriesTotal uint64 // Read after newHeadsScanner has returned. + seriesCurrent uint64 + chunksToPersistTotal int64 // Read after scan() has returned false. + err error // Read after scan() has returned false. +} + +func newHeadsScanner(filename string) *headsScanner { + hs := &headsScanner{} + defer func() { + if hs.f != nil && hs.err != nil { + hs.f.Close() + } + }() + + if hs.f, hs.err = os.Open(filename); hs.err != nil { + return hs + } + hs.r = bufio.NewReaderSize(hs.f, fileBufSize) + + buf := make([]byte, len(headsMagicString)) + if _, hs.err = io.ReadFull(hs.r, buf); hs.err != nil { + return hs + } + magic := string(buf) + if magic != headsMagicString { + hs.err = fmt.Errorf( + "unexpected magic string, want %q, got %q", + headsMagicString, magic, + ) + return hs + } + hs.version, hs.err = binary.ReadVarint(hs.r) + if (hs.version != headsFormatVersion && hs.version != headsFormatLegacyVersion) || hs.err != nil { + hs.err = fmt.Errorf( + "unknown or unreadable heads format version, want %d, got %d, error: %s", + headsFormatVersion, hs.version, hs.err, + ) + return hs + } + if hs.seriesTotal, hs.err = codable.DecodeUint64(hs.r); hs.err != nil { + return hs + } + return hs +} + +// scan works like bufio.Scanner.Scan. 
+func (hs *headsScanner) scan() bool { + if hs.seriesCurrent == hs.seriesTotal || hs.err != nil { + return false + } + + var ( + seriesFlags byte + fpAsInt uint64 + metric codable.Metric + persistWatermark int64 + modTimeNano int64 + modTime time.Time + chunkDescsOffset int64 + savedFirstTime int64 + numChunkDescs int64 + firstTime int64 + lastTime int64 + encoding byte + ch chunk.Chunk + lastTimeHead model.Time + ) + if seriesFlags, hs.err = hs.r.ReadByte(); hs.err != nil { + return false + } + headChunkPersisted := seriesFlags&flagHeadChunkPersisted != 0 + if fpAsInt, hs.err = codable.DecodeUint64(hs.r); hs.err != nil { + return false + } + hs.fp = model.Fingerprint(fpAsInt) + + if hs.err = metric.UnmarshalFromReader(hs.r); hs.err != nil { + return false + } + if hs.version != headsFormatLegacyVersion { + // persistWatermark only present in v2. + persistWatermark, hs.err = binary.ReadVarint(hs.r) + if persistWatermark < 0 { + hs.err = fmt.Errorf("found negative persist watermark in checkpoint: %d", persistWatermark) + } + if hs.err != nil { + return false + } + modTimeNano, hs.err = binary.ReadVarint(hs.r) + if hs.err != nil { + return false + } + if modTimeNano != -1 { + modTime = time.Unix(0, modTimeNano) + } + } + if chunkDescsOffset, hs.err = binary.ReadVarint(hs.r); hs.err != nil { + return false + } + if savedFirstTime, hs.err = binary.ReadVarint(hs.r); hs.err != nil { + return false + } + + if numChunkDescs, hs.err = binary.ReadVarint(hs.r); hs.err != nil { + return false + } + if numChunkDescs < 0 { + hs.err = fmt.Errorf("found negative number of chunk descriptors in checkpoint: %d", numChunkDescs) + return false + } + + chunkDescs := make([]*chunk.Desc, numChunkDescs) + if hs.version == headsFormatLegacyVersion { + if headChunkPersisted { + persistWatermark = numChunkDescs + } else { + persistWatermark = numChunkDescs - 1 + } + } + headChunkClosed := true // Initial assumption. 
+ for i := int64(0); i < numChunkDescs; i++ { + if i < persistWatermark { + if firstTime, hs.err = binary.ReadVarint(hs.r); hs.err != nil { + return false + } + if lastTime, hs.err = binary.ReadVarint(hs.r); hs.err != nil { + return false + } + chunkDescs[i] = &chunk.Desc{ + ChunkFirstTime: model.Time(firstTime), + ChunkLastTime: model.Time(lastTime), + } + chunk.NumMemDescs.Inc() + } else { + // Non-persisted chunk. + // If there are non-persisted chunks at all, we consider + // the head chunk not to be closed yet. + headChunkClosed = false + if encoding, hs.err = hs.r.ReadByte(); hs.err != nil { + return false + } + if ch, hs.err = chunk.NewForEncoding(chunk.Encoding(encoding)); hs.err != nil { + return false + } + if hs.err = ch.Unmarshal(hs.r); hs.err != nil { + return false + } + cd := chunk.NewDesc(ch, ch.FirstTime()) + if i < numChunkDescs-1 { + // This is NOT the head chunk. So it's a chunk + // to be persisted, and we need to populate lastTime. + hs.chunksToPersistTotal++ + if hs.err = cd.MaybePopulateLastTime(); hs.err != nil { + return false + } + } + chunkDescs[i] = cd + } + } + + if lastTimeHead, hs.err = chunkDescs[len(chunkDescs)-1].LastTime(); hs.err != nil { + return false + } + + hs.series = &memorySeries{ + metric: model.Metric(metric), + chunkDescs: chunkDescs, + persistWatermark: int(persistWatermark), + modTime: modTime, + chunkDescsOffset: int(chunkDescsOffset), + savedFirstTime: model.Time(savedFirstTime), + lastTime: lastTimeHead, + headChunkClosed: headChunkClosed, + } + hs.seriesCurrent++ + return true +} + +// close closes the underlying file if required. +func (hs *headsScanner) close() { + if hs.f != nil { + hs.f.Close() + } +} + +// DumpHeads writes the metadata of the provided heads file in a human-readable +// form. 
+func DumpHeads(filename string, out io.Writer) error { + hs := newHeadsScanner(filename) + defer hs.close() + + if hs.err == nil { + fmt.Fprintf( + out, + ">>> Dumping %d series from heads file %q with format version %d. <<<\n", + hs.seriesTotal, filename, hs.version, + ) + } + for hs.scan() { + s := hs.series + fmt.Fprintf( + out, + "FP=%v\tMETRIC=%s\tlen(chunkDescs)=%d\tpersistWatermark=%d\tchunkDescOffset=%d\tsavedFirstTime=%v\tlastTime=%v\theadChunkClosed=%t\n", + hs.fp, s.metric, len(s.chunkDescs), s.persistWatermark, s.chunkDescsOffset, s.savedFirstTime, s.lastTime, s.headChunkClosed, + ) + } + if hs.err == nil { + fmt.Fprintf( + out, + ">>> Dump complete. %d chunks to persist. <<<\n", + hs.chunksToPersistTotal, + ) + } + return hs.err +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/index/index.go b/vendor/github.com/prometheus/prometheus/storage/local/index/index.go new file mode 100644 index 000000000..1f33d5201 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/index/index.go @@ -0,0 +1,303 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package index provides a number of indexes backed by persistent key-value +// stores. The only supported implementation of a key-value store is currently +// goleveldb, but other implementations can easily be added. 
+package index + +import ( + "os" + "path" + "path/filepath" + + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/local/codable" +) + +// Directory names for LevelDB indices. +const ( + FingerprintToMetricDir = "archived_fingerprint_to_metric" + FingerprintTimeRangeDir = "archived_fingerprint_to_timerange" + LabelNameToLabelValuesDir = "labelname_to_labelvalues" + LabelPairToFingerprintsDir = "labelpair_to_fingerprints" +) + +// LevelDB cache sizes, changeable via flags. +var ( + FingerprintMetricCacheSize = 10 * 1024 * 1024 + FingerprintTimeRangeCacheSize = 5 * 1024 * 1024 + LabelNameLabelValuesCacheSize = 10 * 1024 * 1024 + LabelPairFingerprintsCacheSize = 20 * 1024 * 1024 +) + +// FingerprintMetricMapping is an in-memory map of fingerprints to metrics. +type FingerprintMetricMapping map[model.Fingerprint]model.Metric + +// FingerprintMetricIndex models a database mapping fingerprints to metrics. +type FingerprintMetricIndex struct { + KeyValueStore +} + +// IndexBatch indexes a batch of mappings from fingerprints to metrics. +// +// This method is goroutine-safe, but note that no specific order of execution +// can be guaranteed (especially critical if IndexBatch and UnindexBatch are +// called concurrently for the same fingerprint). +func (i *FingerprintMetricIndex) IndexBatch(mapping FingerprintMetricMapping) error { + b := i.NewBatch() + + for fp, m := range mapping { + if err := b.Put(codable.Fingerprint(fp), codable.Metric(m)); err != nil { + return err + } + } + + return i.Commit(b) +} + +// UnindexBatch unindexes a batch of mappings from fingerprints to metrics. +// +// This method is goroutine-safe, but note that no specific order of execution +// can be guaranteed (especially critical if IndexBatch and UnindexBatch are +// called concurrently for the same fingerprint). 
+func (i *FingerprintMetricIndex) UnindexBatch(mapping FingerprintMetricMapping) error { + b := i.NewBatch() + + for fp := range mapping { + if err := b.Delete(codable.Fingerprint(fp)); err != nil { + return err + } + } + + return i.Commit(b) +} + +// Lookup looks up a metric by fingerprint. Looking up a non-existing +// fingerprint is not an error. In that case, (nil, false, nil) is returned. +// +// This method is goroutine-safe. +func (i *FingerprintMetricIndex) Lookup(fp model.Fingerprint) (metric model.Metric, ok bool, err error) { + ok, err = i.Get(codable.Fingerprint(fp), (*codable.Metric)(&metric)) + return +} + +// NewFingerprintMetricIndex returns a LevelDB-backed FingerprintMetricIndex +// ready to use. +func NewFingerprintMetricIndex(basePath string) (*FingerprintMetricIndex, error) { + fingerprintToMetricDB, err := NewLevelDB(LevelDBOptions{ + Path: filepath.Join(basePath, FingerprintToMetricDir), + CacheSizeBytes: FingerprintMetricCacheSize, + }) + if err != nil { + return nil, err + } + return &FingerprintMetricIndex{ + KeyValueStore: fingerprintToMetricDB, + }, nil +} + +// LabelNameLabelValuesMapping is an in-memory map of label names to +// label values. +type LabelNameLabelValuesMapping map[model.LabelName]codable.LabelValueSet + +// LabelNameLabelValuesIndex is a KeyValueStore that maps existing label names +// to all label values stored for that label name. +type LabelNameLabelValuesIndex struct { + KeyValueStore +} + +// IndexBatch adds a batch of label name to label values mappings to the +// index. A mapping of a label name to an empty slice of label values results in +// a deletion of that mapping from the index. +// +// While this method is fundamentally goroutine-safe, note that the order of +// execution for multiple batches executed concurrently is undefined. 
+func (i *LabelNameLabelValuesIndex) IndexBatch(b LabelNameLabelValuesMapping) error { + batch := i.NewBatch() + + for name, values := range b { + if len(values) == 0 { + if err := batch.Delete(codable.LabelName(name)); err != nil { + return err + } + } else { + if err := batch.Put(codable.LabelName(name), values); err != nil { + return err + } + } + } + + return i.Commit(batch) +} + +// Lookup looks up all label values for a given label name and returns them as +// model.LabelValues (which is a slice). Looking up a non-existing label +// name is not an error. In that case, (nil, false, nil) is returned. +// +// This method is goroutine-safe. +func (i *LabelNameLabelValuesIndex) Lookup(l model.LabelName) (values model.LabelValues, ok bool, err error) { + ok, err = i.Get(codable.LabelName(l), (*codable.LabelValues)(&values)) + return +} + +// LookupSet looks up all label values for a given label name and returns them +// as a set. Looking up a non-existing label name is not an error. In that case, +// (nil, false, nil) is returned. +// +// This method is goroutine-safe. +func (i *LabelNameLabelValuesIndex) LookupSet(l model.LabelName) (values map[model.LabelValue]struct{}, ok bool, err error) { + ok, err = i.Get(codable.LabelName(l), (*codable.LabelValueSet)(&values)) + if values == nil { + values = map[model.LabelValue]struct{}{} + } + return +} + +// NewLabelNameLabelValuesIndex returns a LevelDB-backed +// LabelNameLabelValuesIndex ready to use. +func NewLabelNameLabelValuesIndex(basePath string) (*LabelNameLabelValuesIndex, error) { + labelNameToLabelValuesDB, err := NewLevelDB(LevelDBOptions{ + Path: filepath.Join(basePath, LabelNameToLabelValuesDir), + CacheSizeBytes: LabelNameLabelValuesCacheSize, + }) + if err != nil { + return nil, err + } + return &LabelNameLabelValuesIndex{ + KeyValueStore: labelNameToLabelValuesDB, + }, nil +} + +// DeleteLabelNameLabelValuesIndex deletes the LevelDB-backed +// LabelNameLabelValuesIndex. 
Use only for a not yet opened index. +func DeleteLabelNameLabelValuesIndex(basePath string) error { + return os.RemoveAll(path.Join(basePath, LabelNameToLabelValuesDir)) +} + +// LabelPairFingerprintsMapping is an in-memory map of label pairs to +// fingerprints. +type LabelPairFingerprintsMapping map[model.LabelPair]codable.FingerprintSet + +// LabelPairFingerprintIndex is a KeyValueStore that maps existing label pairs +// to the fingerprints of all metrics containing those label pairs. +type LabelPairFingerprintIndex struct { + KeyValueStore +} + +// IndexBatch indexes a batch of mappings from label pairs to fingerprints. A +// mapping to an empty slice of fingerprints results in deletion of that mapping +// from the index. +// +// While this method is fundamentally goroutine-safe, note that the order of +// execution for multiple batches executed concurrently is undefined. +func (i *LabelPairFingerprintIndex) IndexBatch(m LabelPairFingerprintsMapping) (err error) { + batch := i.NewBatch() + + for pair, fps := range m { + if len(fps) == 0 { + err = batch.Delete(codable.LabelPair(pair)) + } else { + err = batch.Put(codable.LabelPair(pair), fps) + } + + if err != nil { + return err + } + } + + return i.Commit(batch) +} + +// Lookup looks up all fingerprints for a given label pair. Looking up a +// non-existing label pair is not an error. In that case, (nil, false, nil) is +// returned. +// +// This method is goroutine-safe. +func (i *LabelPairFingerprintIndex) Lookup(p model.LabelPair) (fps model.Fingerprints, ok bool, err error) { + ok, err = i.Get((codable.LabelPair)(p), (*codable.Fingerprints)(&fps)) + return +} + +// LookupSet looks up all fingerprints for a given label pair. Looking up a +// non-existing label pair is not an error. In that case, (nil, false, nil) is +// returned. +// +// This method is goroutine-safe. 
+func (i *LabelPairFingerprintIndex) LookupSet(p model.LabelPair) (fps map[model.Fingerprint]struct{}, ok bool, err error) { + ok, err = i.Get((codable.LabelPair)(p), (*codable.FingerprintSet)(&fps)) + if fps == nil { + fps = map[model.Fingerprint]struct{}{} + } + return +} + +// NewLabelPairFingerprintIndex returns a LevelDB-backed +// LabelPairFingerprintIndex ready to use. +func NewLabelPairFingerprintIndex(basePath string) (*LabelPairFingerprintIndex, error) { + labelPairToFingerprintsDB, err := NewLevelDB(LevelDBOptions{ + Path: filepath.Join(basePath, LabelPairToFingerprintsDir), + CacheSizeBytes: LabelPairFingerprintsCacheSize, + }) + if err != nil { + return nil, err + } + return &LabelPairFingerprintIndex{ + KeyValueStore: labelPairToFingerprintsDB, + }, nil +} + +// DeleteLabelPairFingerprintIndex deletes the LevelDB-backed +// LabelPairFingerprintIndex. Use only for a not yet opened index. +func DeleteLabelPairFingerprintIndex(basePath string) error { + return os.RemoveAll(path.Join(basePath, LabelPairToFingerprintsDir)) +} + +// FingerprintTimeRangeIndex models a database tracking the time ranges +// of metrics by their fingerprints. +type FingerprintTimeRangeIndex struct { + KeyValueStore +} + +// Lookup returns the time range for the given fingerprint. Looking up a +// non-existing fingerprint is not an error. In that case, (0, 0, false, nil) is +// returned. +// +// This method is goroutine-safe. +func (i *FingerprintTimeRangeIndex) Lookup(fp model.Fingerprint) (firstTime, lastTime model.Time, ok bool, err error) { + var tr codable.TimeRange + ok, err = i.Get(codable.Fingerprint(fp), &tr) + return tr.First, tr.Last, ok, err +} + +// NewFingerprintTimeRangeIndex returns a LevelDB-backed +// FingerprintTimeRangeIndex ready to use. 
+func NewFingerprintTimeRangeIndex(basePath string) (*FingerprintTimeRangeIndex, error) { + fingerprintTimeRangeDB, err := NewLevelDB(LevelDBOptions{ + Path: filepath.Join(basePath, FingerprintTimeRangeDir), + CacheSizeBytes: FingerprintTimeRangeCacheSize, + }) + if err != nil { + return nil, err + } + return &FingerprintTimeRangeIndex{ + KeyValueStore: fingerprintTimeRangeDB, + }, nil +} + +// DeleteFingerprintTimeRangeIndex deletes the LevelDB-backed +// FingerprintTimeRangeIndex. Use only for a not yet opened index. +func DeleteFingerprintTimeRangeIndex(basePath string) error { + return os.RemoveAll(path.Join(basePath, FingerprintTimeRangeDir)) +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/index/interface.go b/vendor/github.com/prometheus/prometheus/storage/local/index/interface.go new file mode 100644 index 000000000..40080c7f3 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/index/interface.go @@ -0,0 +1,61 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package index + +import "encoding" + +// KeyValueStore persists key/value pairs. Implementations must be fundamentally +// goroutine-safe. However, it is the caller's responsibility that keys and +// values can be safely marshaled and unmarshaled (via the MarshalBinary and +// UnmarshalBinary methods of the keys and values). 
For example, if you call the +// Put method of a KeyValueStore implementation, but the key or the value are +// modified concurrently while being marshaled into its binary representation, +// you obviously have a problem. Methods of KeyValueStore return only after +// (un)marshaling is complete. +type KeyValueStore interface { + Put(key, value encoding.BinaryMarshaler) error + // Get unmarshals the result into value. It returns false if no entry + // could be found for key. If value is nil, Get behaves like Has. + Get(key encoding.BinaryMarshaler, value encoding.BinaryUnmarshaler) (bool, error) + Has(key encoding.BinaryMarshaler) (bool, error) + // Delete returns (false, nil) if key does not exist. + Delete(key encoding.BinaryMarshaler) (bool, error) + + NewBatch() Batch + Commit(b Batch) error + + // ForEach iterates through the complete KeyValueStore and calls the + // supplied function for each mapping. + ForEach(func(kv KeyValueAccessor) error) error + + Close() error +} + +// KeyValueAccessor allows access to the key and value of an entry in a +// KeyValueStore. +type KeyValueAccessor interface { + Key(encoding.BinaryUnmarshaler) error + Value(encoding.BinaryUnmarshaler) error +} + +// Batch allows KeyValueStore mutations to be pooled and committed together. An +// implementation does not have to be goroutine-safe. Never modify a Batch +// concurrently or commit the same batch multiple times concurrently. Marshaling +// of keys and values is guaranteed to be complete when the Put or Delete methods +// have returned. 
+type Batch interface { + Put(key, value encoding.BinaryMarshaler) error + Delete(key encoding.BinaryMarshaler) error + Reset() +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/index/leveldb.go b/vendor/github.com/prometheus/prometheus/storage/local/index/leveldb.go new file mode 100644 index 000000000..c4c46421c --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/index/leveldb.go @@ -0,0 +1,210 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package index + +import ( + "encoding" + + "github.com/syndtr/goleveldb/leveldb" + leveldb_filter "github.com/syndtr/goleveldb/leveldb/filter" + leveldb_iterator "github.com/syndtr/goleveldb/leveldb/iterator" + leveldb_opt "github.com/syndtr/goleveldb/leveldb/opt" + leveldb_util "github.com/syndtr/goleveldb/leveldb/util" +) + +var ( + keyspace = &leveldb_util.Range{ + Start: nil, + Limit: nil, + } + + iteratorOpts = &leveldb_opt.ReadOptions{ + DontFillCache: true, + } +) + +// LevelDB is a LevelDB-backed sorted KeyValueStore. +type LevelDB struct { + storage *leveldb.DB + readOpts *leveldb_opt.ReadOptions + writeOpts *leveldb_opt.WriteOptions +} + +// LevelDBOptions provides options for a LevelDB. +type LevelDBOptions struct { + Path string // Base path to store files. + CacheSizeBytes int +} + +// NewLevelDB returns a newly allocated LevelDB-backed KeyValueStore ready to +// use. 
+func NewLevelDB(o LevelDBOptions) (KeyValueStore, error) { + options := &leveldb_opt.Options{ + BlockCacheCapacity: o.CacheSizeBytes, + Filter: leveldb_filter.NewBloomFilter(10), + } + + storage, err := leveldb.OpenFile(o.Path, options) + if err != nil { + return nil, err + } + + return &LevelDB{ + storage: storage, + readOpts: &leveldb_opt.ReadOptions{}, + writeOpts: &leveldb_opt.WriteOptions{}, + }, nil +} + +// NewBatch implements KeyValueStore. +func (l *LevelDB) NewBatch() Batch { + return &LevelDBBatch{ + batch: &leveldb.Batch{}, + } +} + +// Close implements KeyValueStore. +func (l *LevelDB) Close() error { + return l.storage.Close() +} + +// Get implements KeyValueStore. +func (l *LevelDB) Get(key encoding.BinaryMarshaler, value encoding.BinaryUnmarshaler) (bool, error) { + k, err := key.MarshalBinary() + if err != nil { + return false, err + } + raw, err := l.storage.Get(k, l.readOpts) + if err == leveldb.ErrNotFound { + return false, nil + } + if err != nil { + return false, err + } + if value == nil { + return true, nil + } + return true, value.UnmarshalBinary(raw) +} + +// Has implements KeyValueStore. +func (l *LevelDB) Has(key encoding.BinaryMarshaler) (has bool, err error) { + return l.Get(key, nil) +} + +// Delete implements KeyValueStore. +func (l *LevelDB) Delete(key encoding.BinaryMarshaler) (bool, error) { + k, err := key.MarshalBinary() + if err != nil { + return false, err + } + // Note that Delete returns nil if k does not exist. So we have to test + // for existence with Has first. + if has, err := l.storage.Has(k, l.readOpts); !has || err != nil { + return false, err + } + if err = l.storage.Delete(k, l.writeOpts); err != nil { + return false, err + } + return true, nil +} + +// Put implements KeyValueStore. 
+func (l *LevelDB) Put(key, value encoding.BinaryMarshaler) error { + k, err := key.MarshalBinary() + if err != nil { + return err + } + v, err := value.MarshalBinary() + if err != nil { + return err + } + return l.storage.Put(k, v, l.writeOpts) +} + +// Commit implements KeyValueStore. +func (l *LevelDB) Commit(b Batch) error { + return l.storage.Write(b.(*LevelDBBatch).batch, l.writeOpts) +} + +// ForEach implements KeyValueStore. +func (l *LevelDB) ForEach(cb func(kv KeyValueAccessor) error) error { + snap, err := l.storage.GetSnapshot() + if err != nil { + return err + } + defer snap.Release() + + iter := snap.NewIterator(keyspace, iteratorOpts) + + kv := &levelDBKeyValueAccessor{it: iter} + + for valid := iter.First(); valid; valid = iter.Next() { + if err = iter.Error(); err != nil { + return err + } + + if err := cb(kv); err != nil { + return err + } + } + return nil +} + +// LevelDBBatch is a Batch implementation for LevelDB. +type LevelDBBatch struct { + batch *leveldb.Batch +} + +// Put implements Batch. +func (b *LevelDBBatch) Put(key, value encoding.BinaryMarshaler) error { + k, err := key.MarshalBinary() + if err != nil { + return err + } + v, err := value.MarshalBinary() + if err != nil { + return err + } + b.batch.Put(k, v) + return nil +} + +// Delete implements Batch. +func (b *LevelDBBatch) Delete(key encoding.BinaryMarshaler) error { + k, err := key.MarshalBinary() + if err != nil { + return err + } + b.batch.Delete(k) + return nil +} + +// Reset implements Batch. +func (b *LevelDBBatch) Reset() { + b.batch.Reset() +} + +// levelDBKeyValueAccessor implements KeyValueAccessor. 
+type levelDBKeyValueAccessor struct { + it leveldb_iterator.Iterator +} + +func (i *levelDBKeyValueAccessor) Key(key encoding.BinaryUnmarshaler) error { + return key.UnmarshalBinary(i.it.Key()) +} + +func (i *levelDBKeyValueAccessor) Value(value encoding.BinaryUnmarshaler) error { + return value.UnmarshalBinary(i.it.Value()) +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/instrumentation.go b/vendor/github.com/prometheus/prometheus/storage/local/instrumentation.go new file mode 100644 index 000000000..479e13821 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/instrumentation.go @@ -0,0 +1,46 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package local + +const ( + namespace = "prometheus" + subsystem = "local_storage" + + opTypeLabel = "type" + + // Op-types for seriesOps. + create = "create" + archive = "archive" + unarchive = "unarchive" + memoryPurge = "purge_from_memory" + archivePurge = "purge_from_archive" + requestedPurge = "purge_on_request" + memoryMaintenance = "maintenance_in_memory" + archiveMaintenance = "maintenance_in_archive" + completedQurantine = "quarantine_completed" + droppedQuarantine = "quarantine_dropped" + failedQuarantine = "quarantine_failed" + + seriesLocationLabel = "location" + + // Maintenance types for maintainSeriesDuration. 
+ maintainInMemory = "memory" + maintainArchived = "archived" + + discardReasonLabel = "reason" + + // Reasons to discard samples. + outOfOrderTimestamp = "timestamp_out_of_order" + duplicateSample = "multiple_values_for_timestamp" +) diff --git a/vendor/github.com/prometheus/prometheus/storage/local/interface.go b/vendor/github.com/prometheus/prometheus/storage/local/interface.go new file mode 100644 index 000000000..3f1fda713 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/interface.go @@ -0,0 +1,106 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package local + +import ( + "time" + + "github.com/prometheus/common/model" + "golang.org/x/net/context" + + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/storage/metric" +) + +// Storage ingests and manages samples, along with various indexes. All methods +// are goroutine-safe. Storage implements storage.SampleAppender. +type Storage interface { + // Querier returns a new Querier on the storage. + Querier() (Querier, error) + + // This SampleAppender needs multiple samples for the same fingerprint to be + // submitted in chronological order, from oldest to newest. When Append has + // returned, the appended sample might not be queryable immediately. (Use + // WaitForIndexing to wait for complete processing.) 
The implementation might + // remove labels with empty value from the provided Sample as those labels + // are considered equivalent to a label not present at all. + // + // Appending is throttled if the Storage has too many chunks in memory + // already or has too many chunks waiting for persistence. + storage.SampleAppender + + // Drop all time series associated with the given label matchers. Returns + // the number series that were dropped. + DropMetricsForLabelMatchers(context.Context, ...*metric.LabelMatcher) (int, error) + // Run the various maintenance loops in goroutines. Returns when the + // storage is ready to use. Keeps everything running in the background + // until Stop is called. + Start() error + // Stop shuts down the Storage gracefully, flushes all pending + // operations, stops all maintenance loops,and frees all resources. + Stop() error + // WaitForIndexing returns once all samples in the storage are + // indexed. Indexing is needed for FingerprintsForLabelMatchers and + // LabelValuesForLabelName and may lag behind. + WaitForIndexing() +} + +// Querier allows querying a time series storage. +type Querier interface { + // Close closes the querier. Behavior for subsequent calls to Querier methods + // is undefined. + Close() error + // QueryRange returns a list of series iterators for the selected + // time range and label matchers. The iterators need to be closed + // after usage. + QueryRange(ctx context.Context, from, through model.Time, matchers ...*metric.LabelMatcher) ([]SeriesIterator, error) + // QueryInstant returns a list of series iterators for the selected + // instant and label matchers. The iterators need to be closed after usage. + QueryInstant(ctx context.Context, ts model.Time, stalenessDelta time.Duration, matchers ...*metric.LabelMatcher) ([]SeriesIterator, error) + // MetricsForLabelMatchers returns the metrics from storage that satisfy + // the given sets of label matchers. 
Each set of matchers must contain at + // least one label matcher that does not match the empty string. Otherwise, + // an empty list is returned. Within one set of matchers, the intersection + // of matching series is computed. The final return value will be the union + // of the per-set results. The times from and through are hints for the + // storage to optimize the search. The storage MAY exclude metrics that + // have no samples in the specified interval from the returned map. In + // doubt, specify model.Earliest for from and model.Latest for through. + MetricsForLabelMatchers(ctx context.Context, from, through model.Time, matcherSets ...metric.LabelMatchers) ([]metric.Metric, error) + // LastSampleForLabelMatchers returns the last samples that have been + // ingested for the time series matching the given set of label matchers. + // The label matching behavior is the same as in MetricsForLabelMatchers. + // All returned samples are between the specified cutoff time and now. + LastSampleForLabelMatchers(ctx context.Context, cutoff model.Time, matcherSets ...metric.LabelMatchers) (model.Vector, error) + // Get all of the label values that are associated with a given label name. + LabelValuesForLabelName(context.Context, model.LabelName) (model.LabelValues, error) +} + +// SeriesIterator enables efficient access of sample values in a series. Its +// methods are not goroutine-safe. A SeriesIterator iterates over a snapshot of +// a series, i.e. it is safe to continue using a SeriesIterator after or during +// modifying the corresponding series, but the iterator will represent the state +// of the series prior to the modification. +type SeriesIterator interface { + // Gets the value that is closest before the given time. In case a value + // exists at precisely the given time, that value is returned. If no + // applicable value exists, model.ZeroSamplePair is returned. 
+ ValueAtOrBeforeTime(model.Time) model.SamplePair + // Gets all values contained within a given interval. + RangeValues(metric.Interval) []model.SamplePair + // Returns the metric of the series that the iterator corresponds to. + Metric() metric.Metric + // Closes the iterator and releases the underlying data. + Close() +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/locker.go b/vendor/github.com/prometheus/prometheus/storage/local/locker.go new file mode 100644 index 000000000..85effcdbe --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/locker.go @@ -0,0 +1,79 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package local + +import ( + "sync" + "unsafe" + + "github.com/prometheus/common/model" +) + +const ( + cacheLineSize = 64 +) + +// Avoid false sharing when using array of mutexes. +type paddedMutex struct { + sync.Mutex + pad [cacheLineSize - unsafe.Sizeof(sync.Mutex{})]byte +} + +// fingerprintLocker allows locking individual fingerprints. To limit the number +// of mutexes needed for that, only a fixed number of mutexes are +// allocated. Fingerprints to be locked are assigned to those pre-allocated +// mutexes by their value. Collisions are not detected. If two fingerprints get +// assigned to the same mutex, only one of them can be locked at the same +// time. 
As long as the number of pre-allocated mutexes is much larger than the +// number of goroutines requiring a fingerprint lock concurrently, the loss in +// efficiency is small. However, a goroutine must never lock more than one +// fingerprint at the same time. (In that case a collision would try to acquire +// the same mutex twice). +type fingerprintLocker struct { + fpMtxs []paddedMutex + numFpMtxs uint +} + +// newFingerprintLocker returns a new fingerprintLocker ready for use. At least +// 1024 preallocated mutexes are used, even if preallocatedMutexes is lower. +func newFingerprintLocker(preallocatedMutexes int) *fingerprintLocker { + if preallocatedMutexes < 1024 { + preallocatedMutexes = 1024 + } + return &fingerprintLocker{ + make([]paddedMutex, preallocatedMutexes), + uint(preallocatedMutexes), + } +} + +// Lock locks the given fingerprint. +func (l *fingerprintLocker) Lock(fp model.Fingerprint) { + l.fpMtxs[hashFP(fp)%l.numFpMtxs].Lock() +} + +// Unlock unlocks the given fingerprint. +func (l *fingerprintLocker) Unlock(fp model.Fingerprint) { + l.fpMtxs[hashFP(fp)%l.numFpMtxs].Unlock() +} + +// hashFP simply moves entropy from the most significant 48 bits of the +// fingerprint into the least significant 16 bits (by XORing) so that a simple +// MOD on the result can be used to pick a mutex while still making use of +// changes in more significant bits of the fingerprint. (The fast fingerprinting +// function we use is prone to only change a few bits for similar metrics. We +// really want to make use of every change in the fingerprint to vary mutex +// selection.) 
+func hashFP(fp model.Fingerprint) uint { + return uint(fp ^ (fp >> 32) ^ (fp >> 16)) +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/mapper.go b/vendor/github.com/prometheus/prometheus/storage/local/mapper.go new file mode 100644 index 000000000..0f5c71868 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/mapper.go @@ -0,0 +1,218 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package local + +import ( + "fmt" + "sort" + "strings" + "sync" + "sync/atomic" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" + + "github.com/prometheus/common/model" +) + +const maxMappedFP = 1 << 20 // About 1M fingerprints reserved for mapping. + +var separatorString = string([]byte{model.SeparatorByte}) + +// fpMappings maps original fingerprints to a map of string representations of +// metrics to the truly unique fingerprint. +type fpMappings map[model.Fingerprint]map[string]model.Fingerprint + +// fpMapper is used to map fingerprints in order to work around fingerprint +// collisions. +type fpMapper struct { + // highestMappedFP has to be aligned for atomic operations. + highestMappedFP model.Fingerprint + + mtx sync.RWMutex // Protects mappings. 
+ mappings fpMappings + + fpToSeries *seriesMap + p *persistence + + mappingsCounter prometheus.Counter +} + +// newFPMapper loads the collision map from the persistence and +// returns an fpMapper ready to use. +func newFPMapper(fpToSeries *seriesMap, p *persistence) (*fpMapper, error) { + m := &fpMapper{ + fpToSeries: fpToSeries, + p: p, + mappingsCounter: prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "fingerprint_mappings_total", + Help: "The total number of fingerprints being mapped to avoid collisions.", + }), + } + mappings, nextFP, err := p.loadFPMappings() + if err != nil { + return nil, err + } + m.mappings = mappings + m.mappingsCounter.Add(float64(len(m.mappings))) + m.highestMappedFP = nextFP + return m, nil +} + +// checkpoint persists the current mappings. The caller has to ensure that the +// provided mappings are not changed concurrently. This method is only called +// upon shutdown, when no samples are ingested anymore. +func (m *fpMapper) checkpoint() error { + return m.p.checkpointFPMappings(m.mappings) +} + +// mapFP takes a raw fingerprint (as returned by Metrics.FastFingerprint) and +// returns a truly unique fingerprint. The caller must have locked the raw +// fingerprint. +// +// If an error is encountered, it is returned together with the unchanged raw +// fingerprint. +func (m *fpMapper) mapFP(fp model.Fingerprint, metric model.Metric) model.Fingerprint { + // First check if we are in the reserved FP space, in which case this is + // automatically a collision that has to be mapped. + if fp <= maxMappedFP { + return m.maybeAddMapping(fp, metric) + } + + // Then check the most likely case: This fp belongs to a series that is + // already in memory. + s, ok := m.fpToSeries.get(fp) + if ok { + // FP exists in memory, but is it for the same metric? + if metric.Equal(s.metric) { + // Yupp. We are done. + return fp + } + // Collision detected! 
+ return m.maybeAddMapping(fp, metric) + } + // Metric is not in memory. Before doing the expensive archive lookup, + // check if we have a mapping for this metric in place already. + m.mtx.RLock() + mappedFPs, fpAlreadyMapped := m.mappings[fp] + m.mtx.RUnlock() + if fpAlreadyMapped { + // We indeed have mapped fp historically. + ms := metricToUniqueString(metric) + // fp is locked by the caller, so no further locking of + // 'collisions' required (it is specific to fp). + mappedFP, ok := mappedFPs[ms] + if ok { + // Historical mapping found, return the mapped FP. + return mappedFP + } + } + // If we are here, FP does not exist in memory and is either not mapped + // at all, or existing mappings for FP are not for m. Check if we have + // something for FP in the archive. + archivedMetric, err := m.p.archivedMetric(fp) + if err != nil || archivedMetric == nil { + // Either the archive lookup has returend an error, or fp does + // not exist in the archive. In the former case, the storage has + // been marked as dirty already. We just carry on for as long as + // it goes, assuming that fp does not exist. In either case, + // since now we know (or assume) now that fp does not exist, + // neither in memory nor in archive, we can safely keep it + // unmapped. + return fp + } + // FP exists in archive, but is it for the same metric? + if metric.Equal(archivedMetric) { + // Yupp. We are done. + return fp + } + // Collision detected! + return m.maybeAddMapping(fp, metric) +} + +// maybeAddMapping is only used internally. It takes a detected collision and +// adds it to the collisions map if not yet there. In any case, it returns the +// truly unique fingerprint for the colliding metric. 
+func (m *fpMapper) maybeAddMapping( + fp model.Fingerprint, + collidingMetric model.Metric, +) model.Fingerprint { + ms := metricToUniqueString(collidingMetric) + m.mtx.RLock() + mappedFPs, ok := m.mappings[fp] + m.mtx.RUnlock() + if ok { + // fp is locked by the caller, so no further locking required. + mappedFP, ok := mappedFPs[ms] + if ok { + return mappedFP // Existing mapping. + } + // A new mapping has to be created. + mappedFP = m.nextMappedFP() + mappedFPs[ms] = mappedFP + log.Infof( + "Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.", + fp, collidingMetric, mappedFP, + ) + return mappedFP + } + // This is the first collision for fp. + mappedFP := m.nextMappedFP() + mappedFPs = map[string]model.Fingerprint{ms: mappedFP} + m.mtx.Lock() + m.mappings[fp] = mappedFPs + m.mappingsCounter.Inc() + m.mtx.Unlock() + log.Infof( + "Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.", + fp, collidingMetric, mappedFP, + ) + return mappedFP +} + +func (m *fpMapper) nextMappedFP() model.Fingerprint { + mappedFP := model.Fingerprint(atomic.AddUint64((*uint64)(&m.highestMappedFP), 1)) + if mappedFP > maxMappedFP { + panic(fmt.Errorf("more than %v fingerprints mapped in collision detection", maxMappedFP)) + } + return mappedFP +} + +// Describe implements prometheus.Collector. +func (m *fpMapper) Describe(ch chan<- *prometheus.Desc) { + ch <- m.mappingsCounter.Desc() +} + +// Collect implements prometheus.Collector. +func (m *fpMapper) Collect(ch chan<- prometheus.Metric) { + ch <- m.mappingsCounter +} + +// metricToUniqueString turns a metric into a string in a reproducible and +// unique way, i.e. the same metric will always create the same string, and +// different metrics will always create different strings. 
In a way, it is the +// "ideal" fingerprint function, only that it is more expensive than the +// FastFingerprint function, and its result is not suitable as a key for maps +// and indexes as it might become really large, causing a lot of hashing effort +// in maps and a lot of storage overhead in indexes. +func metricToUniqueString(m model.Metric) string { + parts := make([]string, 0, len(m)) + for ln, lv := range m { + parts = append(parts, string(ln)+separatorString+string(lv)) + } + sort.Strings(parts) + return strings.Join(parts, separatorString) +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/noop_storage.go b/vendor/github.com/prometheus/prometheus/storage/local/noop_storage.go new file mode 100644 index 000000000..70b5a32f1 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/noop_storage.go @@ -0,0 +1,100 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package local + +import ( + "time" + + "github.com/prometheus/common/model" + "golang.org/x/net/context" + + "github.com/prometheus/prometheus/storage/metric" +) + +// NoopStorage is a dummy storage for use when Prometheus's local storage is +// disabled. It throws away any appended samples and returns empty results. +type NoopStorage struct{} + +// Start implements Storage. +func (s *NoopStorage) Start() (err error) { + return nil +} + +// Stop implements Storage. 
+func (s *NoopStorage) Stop() error { + return nil +} + +// WaitForIndexing implements Storage. +func (s *NoopStorage) WaitForIndexing() { +} + +// Querier implements Storage. +func (s *NoopStorage) Querier() (Querier, error) { + return &NoopQuerier{}, nil +} + +// NoopQuerier is a dummy Querier for use when Prometheus's local storage is +// disabled. It is returned by the NoopStorage Querier method and always returns +// empty results. +type NoopQuerier struct{} + +// Close implements Querier. +func (s *NoopQuerier) Close() error { + return nil +} + +// LastSampleForLabelMatchers implements Querier. +func (s *NoopQuerier) LastSampleForLabelMatchers(ctx context.Context, cutoff model.Time, matcherSets ...metric.LabelMatchers) (model.Vector, error) { + return nil, nil +} + +// QueryRange implements Querier +func (s *NoopQuerier) QueryRange(ctx context.Context, from, through model.Time, matchers ...*metric.LabelMatcher) ([]SeriesIterator, error) { + return nil, nil +} + +// QueryInstant implements Querier. +func (s *NoopQuerier) QueryInstant(ctx context.Context, ts model.Time, stalenessDelta time.Duration, matchers ...*metric.LabelMatcher) ([]SeriesIterator, error) { + return nil, nil +} + +// MetricsForLabelMatchers implements Querier. +func (s *NoopQuerier) MetricsForLabelMatchers( + ctx context.Context, + from, through model.Time, + matcherSets ...metric.LabelMatchers, +) ([]metric.Metric, error) { + return nil, nil +} + +// LabelValuesForLabelName implements Querier. +func (s *NoopQuerier) LabelValuesForLabelName(ctx context.Context, labelName model.LabelName) (model.LabelValues, error) { + return nil, nil +} + +// DropMetricsForLabelMatchers implements Storage. +func (s *NoopStorage) DropMetricsForLabelMatchers(ctx context.Context, matchers ...*metric.LabelMatcher) (int, error) { + return 0, nil +} + +// Append implements Storage. +func (s *NoopStorage) Append(sample *model.Sample) error { + return nil +} + +// NeedsThrottling implements Storage. 
+func (s *NoopStorage) NeedsThrottling() bool { + return false +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/persistence.go b/vendor/github.com/prometheus/prometheus/storage/local/persistence.go new file mode 100644 index 000000000..d6edc7b9c --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/persistence.go @@ -0,0 +1,1722 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package local + +import ( + "bufio" + "context" + "encoding/binary" + "fmt" + "io" + "io/ioutil" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/local/chunk" + "github.com/prometheus/prometheus/storage/local/codable" + "github.com/prometheus/prometheus/storage/local/index" + "github.com/prometheus/prometheus/util/flock" +) + +const ( + // Version of the storage as it can be found in the version file. + // Increment to protect against incompatible changes. + Version = 1 + versionFileName = "VERSION" + + seriesFileSuffix = ".db" + seriesTempFileSuffix = ".db.tmp" + seriesDirNameLen = 2 // How many bytes of the fingerprint in dir name. 
+ hintFileSuffix = ".hint" + + mappingsFileName = "mappings.db" + mappingsTempFileName = "mappings.db.tmp" + mappingsFormatVersion = 1 + mappingsMagicString = "PrometheusMappings" + + dirtyFileName = "DIRTY" + + fileBufSize = 1 << 16 // 64kiB. + + chunkHeaderLen = 17 + chunkHeaderTypeOffset = 0 + chunkHeaderFirstTimeOffset = 1 + chunkHeaderLastTimeOffset = 9 + chunkLenWithHeader = chunk.ChunkLen + chunkHeaderLen + chunkMaxBatchSize = 62 // Max no. of chunks to load or write in + // one batch. Note that 62 is the largest number of chunks that fit + // into 64kiB on disk because chunkHeaderLen is added to each 1k chunk. + + indexingMaxBatchSize = 1024 * 1024 + indexingBatchTimeout = 500 * time.Millisecond // Commit batch when idle for that long. + indexingQueueCapacity = 1024 * 256 +) + +var fpLen = len(model.Fingerprint(0).String()) // Length of a fingerprint as string. + +const ( + flagHeadChunkPersisted byte = 1 << iota + // Add more flags here like: + // flagFoo + // flagBar +) + +type indexingOpType byte + +const ( + add indexingOpType = iota + remove +) + +type indexingOp struct { + fingerprint model.Fingerprint + metric model.Metric + opType indexingOpType +} + +// A Persistence is used by a Storage implementation to store samples +// persistently across restarts. The methods are only goroutine-safe if +// explicitly marked as such below. The chunk-related methods persistChunks, +// dropChunks, loadChunks, and loadChunkDescs can be called concurrently with +// each other if each call refers to a different fingerprint. 
+type persistence struct { + basePath string + + archivedFingerprintToMetrics *index.FingerprintMetricIndex + archivedFingerprintToTimeRange *index.FingerprintTimeRangeIndex + labelPairToFingerprints *index.LabelPairFingerprintIndex + labelNameToLabelValues *index.LabelNameLabelValuesIndex + + indexingQueue chan indexingOp + indexingStopped chan struct{} + indexingFlush chan chan int + + indexingQueueLength prometheus.Gauge + indexingQueueCapacity prometheus.Metric + indexingBatchSizes prometheus.Summary + indexingBatchDuration prometheus.Summary + checkpointDuration prometheus.Summary + checkpointLastDuration prometheus.Gauge + checkpointLastSize prometheus.Gauge + checkpointChunksWritten prometheus.Summary + dirtyCounter prometheus.Counter + startedDirty prometheus.Gauge + checkpointing prometheus.Gauge + seriesChunksPersisted prometheus.Histogram + + dirtyMtx sync.Mutex // Protects dirty and becameDirty. + dirty bool // true if persistence was started in dirty state. + becameDirty bool // true if an inconsistency came up during runtime. + pedanticChecks bool // true if crash recovery should check each series. + dirtyFileName string // The file used for locking and to mark dirty state. + fLock flock.Releaser // The file lock to protect against concurrent usage. + + shouldSync syncStrategy + + minShrinkRatio float64 // How much a series file has to shrink to justify dropping chunks. + + bufPool sync.Pool +} + +// newPersistence returns a newly allocated persistence backed by local disk storage, ready to use. 
+func newPersistence( + basePath string, + dirty, pedanticChecks bool, + shouldSync syncStrategy, + minShrinkRatio float64, +) (*persistence, error) { + dirtyPath := filepath.Join(basePath, dirtyFileName) + versionPath := filepath.Join(basePath, versionFileName) + + if versionData, err := ioutil.ReadFile(versionPath); err == nil { + if persistedVersion, err := strconv.Atoi(strings.TrimSpace(string(versionData))); err != nil { + return nil, fmt.Errorf("cannot parse content of %s: %s", versionPath, versionData) + } else if persistedVersion != Version { + return nil, fmt.Errorf("found storage version %d on disk, need version %d - please wipe storage or run a version of Prometheus compatible with storage version %d", persistedVersion, Version, persistedVersion) + } + } else if os.IsNotExist(err) { + // No version file found. Let's create the directory (in case + // it's not there yet) and then check if it is actually + // empty. If not, we have found an old storage directory without + // version file, so we have to bail out. + if err := os.MkdirAll(basePath, 0700); err != nil { + if abspath, e := filepath.Abs(basePath); e == nil { + return nil, fmt.Errorf("cannot create persistent directory %s: %s", abspath, err) + } + return nil, fmt.Errorf("cannot create persistent directory %s: %s", basePath, err) + } + fis, err := ioutil.ReadDir(basePath) + if err != nil { + return nil, err + } + filesPresent := len(fis) + for i := range fis { + switch { + case fis[i].Name() == "lost+found" && fis[i].IsDir(): + filesPresent-- + case strings.HasPrefix(fis[i].Name(), "."): + filesPresent-- + } + } + if filesPresent > 0 { + return nil, fmt.Errorf("found existing files in storage path that do not look like storage files compatible with this version of Prometheus; please delete the files in the storage path or choose a different storage path") + } + // Finally we can write our own version into a new version file. 
+ file, err := os.Create(versionPath) + if err != nil { + return nil, err + } + defer file.Close() + if _, err := fmt.Fprintf(file, "%d\n", Version); err != nil { + return nil, err + } + } else { + return nil, err + } + + fLock, dirtyfileExisted, err := flock.New(dirtyPath) + if err != nil { + log.Errorf("Could not lock %s, Prometheus already running?", dirtyPath) + return nil, err + } + if dirtyfileExisted { + dirty = true + } + + archivedFingerprintToMetrics, err := index.NewFingerprintMetricIndex(basePath) + if err != nil { + // At this point, we could simply blow away the archived + // fingerprint-to-metric index. However, then we would lose + // _all_ archived metrics. So better give the user an + // opportunity to repair the LevelDB with a 3rd party tool. + log.Errorf("Could not open the fingerprint-to-metric index for archived series. Please try a 3rd party tool to repair LevelDB in directory %q. If unsuccessful or undesired, delete the whole directory and restart Prometheus for crash recovery. You will lose all archived time series.", filepath.Join(basePath, index.FingerprintToMetricDir)) + return nil, err + } + archivedFingerprintToTimeRange, err := index.NewFingerprintTimeRangeIndex(basePath) + if err != nil { + // We can recover the archived fingerprint-to-timerange index, + // so blow it away and set ourselves dirty. Then re-open the now + // empty index. 
+ if err := index.DeleteFingerprintTimeRangeIndex(basePath); err != nil { + return nil, err + } + dirty = true + if archivedFingerprintToTimeRange, err = index.NewFingerprintTimeRangeIndex(basePath); err != nil { + return nil, err + } + } + + p := &persistence{ + basePath: basePath, + + archivedFingerprintToMetrics: archivedFingerprintToMetrics, + archivedFingerprintToTimeRange: archivedFingerprintToTimeRange, + + indexingQueue: make(chan indexingOp, indexingQueueCapacity), + indexingStopped: make(chan struct{}), + indexingFlush: make(chan chan int), + + indexingQueueLength: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "indexing_queue_length", + Help: "The number of metrics waiting to be indexed.", + }), + indexingQueueCapacity: prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(namespace, subsystem, "indexing_queue_capacity"), + "The capacity of the indexing queue.", + nil, nil, + ), + prometheus.GaugeValue, + float64(indexingQueueCapacity), + ), + indexingBatchSizes: prometheus.NewSummary( + prometheus.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "indexing_batch_sizes", + Help: "Quantiles for indexing batch sizes (number of metrics per batch).", + }, + ), + indexingBatchDuration: prometheus.NewSummary( + prometheus.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "indexing_batch_duration_seconds", + Help: "Quantiles for batch indexing duration in seconds.", + }, + ), + checkpointLastDuration: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "checkpoint_last_duration_seconds", + Help: "The duration in seconds it took to last checkpoint open chunks and chunks yet to be persisted.", + }), + checkpointDuration: prometheus.NewSummary(prometheus.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Objectives: map[float64]float64{}, + Name: "checkpoint_duration_seconds", + Help: "The 
duration in seconds taken for checkpointing open chunks and chunks yet to be persisted", + }), + checkpointLastSize: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "checkpoint_last_size_bytes", + Help: "The size of the last checkpoint of open chunks and chunks yet to be persisted", + }), + checkpointChunksWritten: prometheus.NewSummary(prometheus.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Objectives: map[float64]float64{}, + Name: "checkpoint_series_chunks_written", + Help: "The number of chunk written per series while checkpointing open chunks and chunks yet to be persisted.", + }), + dirtyCounter: prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "inconsistencies_total", + Help: "A counter incremented each time an inconsistency in the local storage is detected. If this is greater zero, restart the server as soon as possible.", + }), + startedDirty: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "started_dirty", + Help: "Whether the local storage was found to be dirty (and crash recovery occurred) during Prometheus startup.", + }), + checkpointing: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "checkpointing", + Help: "1 if the storage is checkpointing, 0 otherwise.", + }), + seriesChunksPersisted: prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "series_chunks_persisted", + Help: "The number of chunks persisted per series.", + // Even with 4 bytes per sample, you're not going to get more than 85 + // chunks in 6 hours for a time series with 1s resolution. 
+ Buckets: []float64{1, 2, 4, 8, 16, 32, 64, 128}, + }), + dirty: dirty, + pedanticChecks: pedanticChecks, + dirtyFileName: dirtyPath, + fLock: fLock, + shouldSync: shouldSync, + minShrinkRatio: minShrinkRatio, + // Create buffers of length 3*chunkLenWithHeader by default because that is still reasonably small + // and at the same time enough for many uses. The contract is to never return buffer smaller than + // that to the pool so that callers can rely on a minimum buffer size. + bufPool: sync.Pool{New: func() interface{} { return make([]byte, 0, 3*chunkLenWithHeader) }}, + } + + if p.dirty { + // Blow away the label indexes. We'll rebuild them later. + if err := index.DeleteLabelPairFingerprintIndex(basePath); err != nil { + return nil, err + } + if err := index.DeleteLabelNameLabelValuesIndex(basePath); err != nil { + return nil, err + } + } + labelPairToFingerprints, err := index.NewLabelPairFingerprintIndex(basePath) + if err != nil { + return nil, err + } + labelNameToLabelValues, err := index.NewLabelNameLabelValuesIndex(basePath) + if err != nil { + return nil, err + } + p.labelPairToFingerprints = labelPairToFingerprints + p.labelNameToLabelValues = labelNameToLabelValues + + return p, nil +} + +func (p *persistence) run() { + p.processIndexingQueue() +} + +// Describe implements prometheus.Collector. +func (p *persistence) Describe(ch chan<- *prometheus.Desc) { + ch <- p.indexingQueueLength.Desc() + ch <- p.indexingQueueCapacity.Desc() + p.indexingBatchSizes.Describe(ch) + p.indexingBatchDuration.Describe(ch) + ch <- p.checkpointDuration.Desc() + ch <- p.checkpointLastDuration.Desc() + ch <- p.checkpointLastSize.Desc() + ch <- p.checkpointChunksWritten.Desc() + ch <- p.checkpointing.Desc() + ch <- p.dirtyCounter.Desc() + ch <- p.startedDirty.Desc() + ch <- p.seriesChunksPersisted.Desc() +} + +// Collect implements prometheus.Collector. 
+func (p *persistence) Collect(ch chan<- prometheus.Metric) { + p.indexingQueueLength.Set(float64(len(p.indexingQueue))) + + ch <- p.indexingQueueLength + ch <- p.indexingQueueCapacity + p.indexingBatchSizes.Collect(ch) + p.indexingBatchDuration.Collect(ch) + ch <- p.checkpointDuration + ch <- p.checkpointLastDuration + ch <- p.checkpointLastSize + ch <- p.checkpointChunksWritten + ch <- p.checkpointing + ch <- p.dirtyCounter + ch <- p.startedDirty + ch <- p.seriesChunksPersisted +} + +// isDirty returns the dirty flag in a goroutine-safe way. +func (p *persistence) isDirty() bool { + p.dirtyMtx.Lock() + defer p.dirtyMtx.Unlock() + return p.dirty +} + +// setDirty flags the storage as dirty in a goroutine-safe way. The provided +// error will be logged as a reason the first time the storage is flagged as dirty. +func (p *persistence) setDirty(err error) { + p.dirtyCounter.Inc() + p.dirtyMtx.Lock() + defer p.dirtyMtx.Unlock() + if p.becameDirty { + return + } + p.dirty = true + p.becameDirty = true + log.With("error", err).Error("The storage is now inconsistent. Restart Prometheus ASAP to initiate recovery.") +} + +// fingerprintsForLabelPair returns the fingerprints for the given label +// pair. This method is goroutine-safe but take into account that metrics queued +// for indexing with IndexMetric might not have made it into the index +// yet. (Same applies correspondingly to UnindexMetric.) +func (p *persistence) fingerprintsForLabelPair(lp model.LabelPair) model.Fingerprints { + fps, _, err := p.labelPairToFingerprints.Lookup(lp) + if err != nil { + p.setDirty(fmt.Errorf("error in method fingerprintsForLabelPair(%v): %s", lp, err)) + return nil + } + return fps +} + +// labelValuesForLabelName returns the label values for the given label +// name. This method is goroutine-safe but take into account that metrics queued +// for indexing with IndexMetric might not have made it into the index +// yet. (Same applies correspondingly to UnindexMetric.) 
+func (p *persistence) labelValuesForLabelName(ln model.LabelName) (model.LabelValues, error) { + lvs, _, err := p.labelNameToLabelValues.Lookup(ln) + if err != nil { + p.setDirty(fmt.Errorf("error in method labelValuesForLabelName(%v): %s", ln, err)) + return nil, err + } + return lvs, nil +} + +// persistChunks persists a number of consecutive chunks of a series. It is the +// caller's responsibility to not modify the chunks concurrently and to not +// persist or drop anything for the same fingerprint concurrently. It returns +// the (zero-based) index of the first persisted chunk within the series +// file. In case of an error, the returned index is -1 (to avoid the +// misconception that the chunk was written at position 0). +// +// Returning an error signals problems with the series file. In this case, the +// caller should quarantine the series. +func (p *persistence) persistChunks(fp model.Fingerprint, chunks []chunk.Chunk) (index int, err error) { + f, err := p.openChunkFileForWriting(fp) + if err != nil { + return -1, err + } + defer p.closeChunkFile(f) + + if err := p.writeChunks(f, chunks); err != nil { + return -1, err + } + + // Determine index within the file. + offset, err := f.Seek(0, io.SeekCurrent) + if err != nil { + return -1, err + } + index, err = chunkIndexForOffset(offset) + if err != nil { + return -1, err + } + + return index - len(chunks), err +} + +// loadChunks loads a group of chunks of a timeseries by their index. The chunk +// with the earliest time will have index 0, the following ones will have +// incrementally larger indexes. The indexOffset denotes the offset to be added to +// each index in indexes. It is the caller's responsibility to not persist or +// drop anything for the same fingerprint concurrently. 
+func (p *persistence) loadChunks(fp model.Fingerprint, indexes []int, indexOffset int) ([]chunk.Chunk, error) { + f, err := p.openChunkFileForReading(fp) + if err != nil { + return nil, err + } + defer f.Close() + + chunks := make([]chunk.Chunk, 0, len(indexes)) + buf := p.bufPool.Get().([]byte) + defer func() { + // buf may change below. An unwrapped 'defer p.bufPool.Put(buf)' + // would only put back the original buf. + p.bufPool.Put(buf) + }() + + for i := 0; i < len(indexes); i++ { + // This loads chunks in batches. A batch is a streak of + // consecutive chunks, read from disk in one go. + batchSize := 1 + if _, err := f.Seek(offsetForChunkIndex(indexes[i]+indexOffset), io.SeekStart); err != nil { + return nil, err + } + + for ; batchSize < chunkMaxBatchSize && + i+1 < len(indexes) && + indexes[i]+1 == indexes[i+1]; i, batchSize = i+1, batchSize+1 { + } + readSize := batchSize * chunkLenWithHeader + if cap(buf) < readSize { + buf = make([]byte, readSize) + } + buf = buf[:readSize] + + if _, err := io.ReadFull(f, buf); err != nil { + return nil, err + } + for c := 0; c < batchSize; c++ { + chunk, err := chunk.NewForEncoding(chunk.Encoding(buf[c*chunkLenWithHeader+chunkHeaderTypeOffset])) + if err != nil { + return nil, err + } + if err := chunk.UnmarshalFromBuf(buf[c*chunkLenWithHeader+chunkHeaderLen:]); err != nil { + return nil, err + } + chunks = append(chunks, chunk) + } + } + chunk.Ops.WithLabelValues(chunk.Load).Add(float64(len(chunks))) + atomic.AddInt64(&chunk.NumMemChunks, int64(len(chunks))) + return chunks, nil +} + +// loadChunkDescs loads the chunk.Descs for a series from disk. offsetFromEnd is +// the number of chunk.Descs to skip from the end of the series file. It is the +// caller's responsibility to not persist or drop anything for the same +// fingerprint concurrently. 
+func (p *persistence) loadChunkDescs(fp model.Fingerprint, offsetFromEnd int) ([]*chunk.Desc, error) { + f, err := p.openChunkFileForReading(fp) + if os.IsNotExist(err) { + return nil, nil + } + if err != nil { + return nil, err + } + defer f.Close() + + fi, err := f.Stat() + if err != nil { + return nil, err + } + if fi.Size()%int64(chunkLenWithHeader) != 0 { + // The returned error will bubble up and lead to quarantining of the whole series. + return nil, fmt.Errorf( + "size of series file for fingerprint %v is %d, which is not a multiple of the chunk length %d", + fp, fi.Size(), chunkLenWithHeader, + ) + } + + numChunks := int(fi.Size())/chunkLenWithHeader - offsetFromEnd + cds := make([]*chunk.Desc, numChunks) + chunkTimesBuf := make([]byte, 16) + for i := 0; i < numChunks; i++ { + _, err := f.Seek(offsetForChunkIndex(i)+chunkHeaderFirstTimeOffset, io.SeekStart) + if err != nil { + return nil, err + } + + _, err = io.ReadAtLeast(f, chunkTimesBuf, 16) + if err != nil { + return nil, err + } + cds[i] = &chunk.Desc{ + ChunkFirstTime: model.Time(binary.LittleEndian.Uint64(chunkTimesBuf)), + ChunkLastTime: model.Time(binary.LittleEndian.Uint64(chunkTimesBuf[8:])), + } + } + chunk.DescOps.WithLabelValues(chunk.Load).Add(float64(len(cds))) + chunk.NumMemDescs.Add(float64(len(cds))) + return cds, nil +} + +// checkpointSeriesMapAndHeads persists the fingerprint to memory-series mapping +// and all non persisted chunks. Do not call concurrently with +// loadSeriesMapAndHeads. This method will only write heads format v2, but +// loadSeriesMapAndHeads can also understand v1. +// +// Description of the file format (for both, v1 and v2): +// +// (1) Magic string (const headsMagicString). +// +// (2) Varint-encoded format version (const headsFormatVersion). +// +// (3) Number of series in checkpoint as big-endian uint64. +// +// (4) Repeated once per series: +// +// (4.1) A flag byte, see flag constants above. (Present but unused in v2.) 
+// +// (4.2) The fingerprint as big-endian uint64. +// +// (4.3) The metric as defined by codable.Metric. +// +// (4.4) The varint-encoded persistWatermark. (Missing in v1.) +// +// (4.5) The modification time of the series file as nanoseconds elapsed since +// January 1, 1970 UTC. -1 if the modification time is unknown or no series file +// exists yet. (Missing in v1.) +// +// (4.6) The varint-encoded chunkDescsOffset. +// +// (4.6) The varint-encoded savedFirstTime. +// +// (4.7) The varint-encoded number of chunk descriptors. +// +// (4.8) Repeated once per chunk descriptor, oldest to most recent, either +// variant 4.8.1 (if index < persistWatermark) or variant 4.8.2 (if index >= +// persistWatermark). In v1, everything is variant 4.8.1 except for a +// non-persisted head-chunk (determined by the flags). +// +// (4.8.1.1) The varint-encoded first time. +// (4.8.1.2) The varint-encoded last time. +// +// (4.8.2.1) A byte defining the chunk type. +// (4.8.2.2) The chunk itself, marshaled with the Marshal() method. +// +// NOTE: Above, varint encoding is used consistently although uvarint would have +// made more sense in many cases. This was simply a glitch while designing the +// format. +func (p *persistence) checkpointSeriesMapAndHeads( + ctx context.Context, fingerprintToSeries *seriesMap, fpLocker *fingerprintLocker, +) (err error) { + log.Info("Checkpointing in-memory metrics and chunks...") + p.checkpointing.Set(1) + defer p.checkpointing.Set(0) + begin := time.Now() + f, err := os.OpenFile(p.headsTempFileName(), os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0640) + if err != nil { + return err + } + + defer func() { + defer os.Remove(p.headsTempFileName()) // Just in case it was left behind. + + if err != nil { + // If we already had an error, do not bother to sync, + // just close, ignoring any further error. 
+ f.Close() + return + } + syncErr := f.Sync() + closeErr := f.Close() + err = syncErr + if err != nil { + return + } + err = closeErr + if err != nil { + return + } + err = os.Rename(p.headsTempFileName(), p.headsFileName()) + duration := time.Since(begin) + p.checkpointDuration.Observe(duration.Seconds()) + p.checkpointLastDuration.Set(duration.Seconds()) + log.Infof("Done checkpointing in-memory metrics and chunks in %v.", duration) + }() + + w := bufio.NewWriterSize(f, fileBufSize) + + if _, err = w.WriteString(headsMagicString); err != nil { + return err + } + var numberOfSeriesOffset int + if numberOfSeriesOffset, err = codable.EncodeVarint(w, headsFormatVersion); err != nil { + return err + } + numberOfSeriesOffset += len(headsMagicString) + numberOfSeriesInHeader := uint64(fingerprintToSeries.length()) + // We have to write the number of series as uint64 because we might need + // to overwrite it later, and a varint might change byte width then. + if err = codable.EncodeUint64(w, numberOfSeriesInHeader); err != nil { + return err + } + + iter := fingerprintToSeries.iter() + defer func() { + // Consume the iterator in any case to not leak goroutines. + for range iter { + } + }() + + var realNumberOfSeries uint64 + for m := range iter { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + func() { // Wrapped in function to use defer for unlocking the fp. + fpLocker.Lock(m.fp) + defer fpLocker.Unlock(m.fp) + + chunksToPersist := len(m.series.chunkDescs) - m.series.persistWatermark + if len(m.series.chunkDescs) == 0 { + // This series was completely purged or archived + // in the meantime. Ignore. + return + } + realNumberOfSeries++ + + // Sanity checks. + if m.series.chunkDescsOffset < 0 && m.series.persistWatermark > 0 { + panic("encountered unknown chunk desc offset in combination with positive persist watermark") + } + + // These are the values to save in the normal case. 
+ var ( + // persistWatermark is zero as we only checkpoint non-persisted chunks. + persistWatermark int64 + // chunkDescsOffset is shifted by the original persistWatermark for the same reason. + chunkDescsOffset = int64(m.series.chunkDescsOffset + m.series.persistWatermark) + numChunkDescs = int64(chunksToPersist) + ) + // However, in the special case of a series being fully + // persisted but still in memory (i.e. not archived), we + // need to save a "placeholder", for which we use just + // the chunk desc of the last chunk. Values have to be + // adjusted accordingly. (The reason for doing it in + // this weird way is to keep the checkpoint format + // compatible with older versions.) + if chunksToPersist == 0 { + persistWatermark = 1 + chunkDescsOffset-- // Save one chunk desc after all. + numChunkDescs = 1 + } + + // seriesFlags left empty in v2. + if err = w.WriteByte(0); err != nil { + return + } + if err = codable.EncodeUint64(w, uint64(m.fp)); err != nil { + return + } + var buf []byte + buf, err = codable.Metric(m.series.metric).MarshalBinary() + if err != nil { + return + } + if _, err = w.Write(buf); err != nil { + return + } + if _, err = codable.EncodeVarint(w, persistWatermark); err != nil { + return + } + if m.series.modTime.IsZero() { + if _, err = codable.EncodeVarint(w, -1); err != nil { + return + } + } else { + if _, err = codable.EncodeVarint(w, m.series.modTime.UnixNano()); err != nil { + return + } + } + if _, err = codable.EncodeVarint(w, chunkDescsOffset); err != nil { + return + } + if _, err = codable.EncodeVarint(w, int64(m.series.savedFirstTime)); err != nil { + return + } + if _, err = codable.EncodeVarint(w, numChunkDescs); err != nil { + return + } + if chunksToPersist == 0 { + // Save the one placeholder chunk desc for a fully persisted series. 
+ chunkDesc := m.series.chunkDescs[len(m.series.chunkDescs)-1] + if _, err = codable.EncodeVarint(w, int64(chunkDesc.FirstTime())); err != nil { + return + } + lt, err := chunkDesc.LastTime() + if err != nil { + return + } + if _, err = codable.EncodeVarint(w, int64(lt)); err != nil { + return + } + } else { + // Save (only) the non-persisted chunks. + for _, chunkDesc := range m.series.chunkDescs[m.series.persistWatermark:] { + if err = w.WriteByte(byte(chunkDesc.C.Encoding())); err != nil { + return + } + if err = chunkDesc.C.Marshal(w); err != nil { + return + } + p.checkpointChunksWritten.Observe(float64(chunksToPersist)) + } + } + // Series is checkpointed now, so declare it clean. In case the entire + // checkpoint fails later on, this is fine, as the storage's series + // maintenance will mark these series newly dirty again, continuously + // increasing the total number of dirty series as seen by the storage. + // This has the effect of triggering a new checkpoint attempt even + // earlier than if we hadn't incorrectly set "dirty" to "false" here + // already. + m.series.dirty = false + }() + if err != nil { + return err + } + } + if err = w.Flush(); err != nil { + return err + } + if realNumberOfSeries != numberOfSeriesInHeader { + // The number of series has changed in the meantime. + // Rewrite it in the header. + if _, err = f.Seek(int64(numberOfSeriesOffset), io.SeekStart); err != nil { + return err + } + if err = codable.EncodeUint64(f, realNumberOfSeries); err != nil { + return err + } + } + info, err := f.Stat() + if err != nil { + return err + } + p.checkpointLastSize.Set(float64(info.Size())) + return err +} + +// loadSeriesMapAndHeads loads the fingerprint to memory-series mapping and all +// the chunks contained in the checkpoint (and thus not yet persisted to series +// files). The method is capable of loading the checkpoint format v1 and v2. 
If +// recoverable corruption is detected, or if the dirty flag was set from the +// beginning, crash recovery is run, which might take a while. If an +// unrecoverable error is encountered, it is returned. Call this method during +// start-up while nothing else is running in storage land. This method is +// utterly goroutine-unsafe. +func (p *persistence) loadSeriesMapAndHeads() (sm *seriesMap, chunksToPersist int64, err error) { + fingerprintToSeries := make(map[model.Fingerprint]*memorySeries) + sm = &seriesMap{m: fingerprintToSeries} + + defer func() { + if p.dirty { + log.Warn("Persistence layer appears dirty.") + p.startedDirty.Set(1) + err = p.recoverFromCrash(fingerprintToSeries) + if err != nil { + sm = nil + } + } else { + p.startedDirty.Set(0) + } + }() + + hs := newHeadsScanner(p.headsFileName()) + defer hs.close() + for hs.scan() { + fingerprintToSeries[hs.fp] = hs.series + } + if os.IsNotExist(hs.err) { + return sm, 0, nil + } + if hs.err != nil { + p.dirty = true + log. + With("file", p.headsFileName()). + With("error", hs.err). + Error("Error reading heads file.") + return sm, 0, hs.err + } + return sm, hs.chunksToPersistTotal, nil +} + +// dropAndPersistChunks deletes all chunks from a series file whose last sample +// time is before beforeTime, and then appends the provided chunks, leaving out +// those whose last sample time is before beforeTime. It returns the timestamp +// of the first sample in the oldest chunk _not_ dropped, the chunk offset +// within the series file of the first chunk persisted (out of the provided +// chunks, or - if no chunks were provided - the chunk offset where chunks would +// have been persisted, i.e. the end of the file), the number of deleted chunks, +// and true if all chunks of the series have been deleted (in which case the +// returned timestamp will be 0 and must be ignored). It is the caller's +// responsibility to make sure nothing is persisted or loaded for the same +// fingerprint concurrently. 
+// +// Returning an error signals problems with the series file. In this case, the +// caller should quarantine the series. +func (p *persistence) dropAndPersistChunks( + fp model.Fingerprint, beforeTime model.Time, chunks []chunk.Chunk, +) ( + firstTimeNotDropped model.Time, + offset int, + numDropped int, + allDropped bool, + err error, +) { + // Style note: With the many return values, it was decided to use naked + // returns in this method. They make the method more readable, but + // please handle with care! + if len(chunks) > 0 { + // We have chunks to persist. First check if those are already + // too old. If that's the case, the chunks in the series file + // are all too old, too. + i := 0 + for ; i < len(chunks); i++ { + var lt model.Time + lt, err = chunks[i].NewIterator().LastTimestamp() + if err != nil { + return + } + if !lt.Before(beforeTime) { + break + } + } + if i < len(chunks) { + firstTimeNotDropped = chunks[i].FirstTime() + } + if i > 0 || firstTimeNotDropped.Before(beforeTime) { + // Series file has to go. + if numDropped, err = p.deleteSeriesFile(fp); err != nil { + return + } + numDropped += i + if i == len(chunks) { + allDropped = true + return + } + // Now simply persist what has to be persisted to a new file. + _, err = p.persistChunks(fp, chunks[i:]) + return + } + } + + // If we are here, we have to check the series file itself. + f, err := p.openChunkFileForReading(fp) + if os.IsNotExist(err) { + // No series file. Only need to create new file with chunks to + // persist, if there are any. + if len(chunks) == 0 { + allDropped = true + err = nil // Do not report not-exist err. + return + } + offset, err = p.persistChunks(fp, chunks) + return + } + if err != nil { + return + } + defer f.Close() + + fi, err := f.Stat() + if err != nil { + return + } + chunksInFile := int(fi.Size()) / chunkLenWithHeader + totalChunks := chunksInFile + len(chunks) + + // Calculate chunk index from minShrinkRatio, to skip unnecessary chunk header reading. 
+ chunkIndexToStartSeek := 0 + if p.minShrinkRatio < 1 { + chunkIndexToStartSeek = int(math.Floor(float64(totalChunks) * p.minShrinkRatio)) + } + if chunkIndexToStartSeek >= chunksInFile { + chunkIndexToStartSeek = chunksInFile - 1 + } + numDropped = chunkIndexToStartSeek + + headerBuf := make([]byte, chunkHeaderLen) + // Find the first chunk in the file that should be kept. + for ; ; numDropped++ { + _, err = f.Seek(offsetForChunkIndex(numDropped), io.SeekStart) + if err != nil { + return + } + _, err = io.ReadFull(f, headerBuf) + if err == io.EOF { + // Close the file before trying to delete it. This is necessary on Windows + // (this will cause the defer f.Close to fail, but the error is silently ignored) + f.Close() + // We ran into the end of the file without finding any chunks that should + // be kept. Remove the whole file. + if numDropped, err = p.deleteSeriesFile(fp); err != nil { + return + } + if len(chunks) == 0 { + allDropped = true + return + } + offset, err = p.persistChunks(fp, chunks) + return + } + if err != nil { + return + } + lastTime := model.Time( + binary.LittleEndian.Uint64(headerBuf[chunkHeaderLastTimeOffset:]), + ) + if !lastTime.Before(beforeTime) { + break + } + } + + // If numDropped isn't incremented, the minShrinkRatio condition isn't satisfied. + if numDropped == chunkIndexToStartSeek { + // Nothing to drop. Just adjust the return values and append the chunks (if any). + numDropped = 0 + _, err = f.Seek(offsetForChunkIndex(0), io.SeekStart) + if err != nil { + return + } + _, err = io.ReadFull(f, headerBuf) + if err != nil { + return + } + firstTimeNotDropped = model.Time( + binary.LittleEndian.Uint64(headerBuf[chunkHeaderFirstTimeOffset:]), + ) + if len(chunks) > 0 { + offset, err = p.persistChunks(fp, chunks) + } else { + offset = chunksInFile + } + return + } + // If we are here, we have to drop some chunks for real. 
So we need to + // record firstTimeNotDropped from the last read header, seek backwards + // to the beginning of its header, and start copying everything from + // there into a new file. Then append the chunks to the new file. + firstTimeNotDropped = model.Time( + binary.LittleEndian.Uint64(headerBuf[chunkHeaderFirstTimeOffset:]), + ) + chunk.Ops.WithLabelValues(chunk.Drop).Add(float64(numDropped)) + _, err = f.Seek(-chunkHeaderLen, io.SeekCurrent) + if err != nil { + return + } + + temp, err := os.OpenFile(p.tempFileNameForFingerprint(fp), os.O_WRONLY|os.O_CREATE, 0640) + if err != nil { + return + } + defer func() { + // Close the file before trying to rename to it. This is necessary on Windows + // (this will cause the defer f.Close to fail, but the error is silently ignored) + f.Close() + p.closeChunkFile(temp) + if err == nil { + err = os.Rename(p.tempFileNameForFingerprint(fp), p.fileNameForFingerprint(fp)) + } + }() + + written, err := io.Copy(temp, f) + if err != nil { + return + } + offset = int(written / chunkLenWithHeader) + + if len(chunks) > 0 { + if err = p.writeChunks(temp, chunks); err != nil { + return + } + } + return +} + +// deleteSeriesFile deletes a series file belonging to the provided +// fingerprint. It returns the number of chunks that were contained in the +// deleted file. +func (p *persistence) deleteSeriesFile(fp model.Fingerprint) (int, error) { + fname := p.fileNameForFingerprint(fp) + fi, err := os.Stat(fname) + if os.IsNotExist(err) { + // Great. The file is already gone. + return 0, nil + } + if err != nil { + return -1, err + } + numChunks := int(fi.Size() / chunkLenWithHeader) + if err := os.Remove(fname); err != nil { + return -1, err + } + chunk.Ops.WithLabelValues(chunk.Drop).Add(float64(numChunks)) + return numChunks, nil +} + +// quarantineSeriesFile moves a series file to the orphaned directory. 
It also +// writes a hint file with the provided quarantine reason and, if series is +// non-nil, the string representation of the metric. +func (p *persistence) quarantineSeriesFile(fp model.Fingerprint, quarantineReason error, metric model.Metric) error { + var ( + oldName = p.fileNameForFingerprint(fp) + orphanedDir = filepath.Join(p.basePath, "orphaned", filepath.Base(filepath.Dir(oldName))) + newName = filepath.Join(orphanedDir, filepath.Base(oldName)) + hintName = newName[:len(newName)-len(seriesFileSuffix)] + hintFileSuffix + ) + + renameErr := os.MkdirAll(orphanedDir, 0700) + if renameErr != nil { + return renameErr + } + renameErr = os.Rename(oldName, newName) + if os.IsNotExist(renameErr) { + // Source file dosn't exist. That's normal. + renameErr = nil + } + // Write hint file even if the rename ended in an error. At least try... + // And ignore errors writing the hint file. It's best effort. + if f, err := os.Create(hintName); err == nil { + if metric != nil { + f.WriteString(metric.String() + "\n") + } else { + f.WriteString("[UNKNOWN METRIC]\n") + } + if quarantineReason != nil { + f.WriteString(quarantineReason.Error() + "\n") + } else { + f.WriteString("[UNKNOWN REASON]\n") + } + f.Close() + } + return renameErr +} + +// seriesFileModTime returns the modification time of the series file belonging +// to the provided fingerprint. In case of an error, the zero value of time.Time +// is returned. +func (p *persistence) seriesFileModTime(fp model.Fingerprint) time.Time { + var modTime time.Time + if fi, err := os.Stat(p.fileNameForFingerprint(fp)); err == nil { + return fi.ModTime() + } + return modTime +} + +// indexMetric queues the given metric for addition to the indexes needed by +// fingerprintsForLabelPair, labelValuesForLabelName, and +// fingerprintsModifiedBefore. If the queue is full, this method blocks until +// the metric can be queued. This method is goroutine-safe. 
+func (p *persistence) indexMetric(fp model.Fingerprint, m model.Metric) { + p.indexingQueue <- indexingOp{fp, m, add} +} + +// unindexMetric queues references to the given metric for removal from the +// indexes used for fingerprintsForLabelPair, labelValuesForLabelName, and +// fingerprintsModifiedBefore. The index of fingerprints to archived metrics is +// not affected by this removal. (In fact, never call this method for an +// archived metric. To purge an archived metric, call purgeArchivedMetric.) +// If the queue is full, this method blocks until the metric can be queued. This +// method is goroutine-safe. +func (p *persistence) unindexMetric(fp model.Fingerprint, m model.Metric) { + p.indexingQueue <- indexingOp{fp, m, remove} +} + +// waitForIndexing waits until all items in the indexing queue are processed. If +// queue processing is currently on hold (to gather more ops for batching), this +// method will trigger an immediate start of processing. This method is +// goroutine-safe. +func (p *persistence) waitForIndexing() { + wait := make(chan int) + for { + p.indexingFlush <- wait + if <-wait == 0 { + break + } + } +} + +// archiveMetric persists the mapping of the given fingerprint to the given +// metric, together with the first and last timestamp of the series belonging to +// the metric. The caller must have locked the fingerprint. 
+func (p *persistence) archiveMetric( + fp model.Fingerprint, m model.Metric, first, last model.Time, +) { + if err := p.archivedFingerprintToMetrics.Put(codable.Fingerprint(fp), codable.Metric(m)); err != nil { + p.setDirty(fmt.Errorf("error in method archiveMetric inserting fingerprint %v into FingerprintToMetrics: %s", fp, err)) + return + } + if err := p.archivedFingerprintToTimeRange.Put(codable.Fingerprint(fp), codable.TimeRange{First: first, Last: last}); err != nil { + p.setDirty(fmt.Errorf("error in method archiveMetric inserting fingerprint %v into FingerprintToTimeRange: %s", fp, err)) + } +} + +// hasArchivedMetric returns whether the archived metric for the given +// fingerprint exists and if yes, what the first and last timestamp in the +// corresponding series is. This method is goroutine-safe. +func (p *persistence) hasArchivedMetric(fp model.Fingerprint) ( + hasMetric bool, firstTime, lastTime model.Time, +) { + firstTime, lastTime, hasMetric, err := p.archivedFingerprintToTimeRange.Lookup(fp) + if err != nil { + p.setDirty(fmt.Errorf("error in method hasArchivedMetric(%v): %s", fp, err)) + hasMetric = false + } + return hasMetric, firstTime, lastTime +} + +// updateArchivedTimeRange updates an archived time range. The caller must make +// sure that the fingerprint is currently archived (the time range will +// otherwise be added without the corresponding metric in the archive). +func (p *persistence) updateArchivedTimeRange( + fp model.Fingerprint, first, last model.Time, +) error { + return p.archivedFingerprintToTimeRange.Put(codable.Fingerprint(fp), codable.TimeRange{First: first, Last: last}) +} + +// fingerprintsModifiedBefore returns the fingerprints of archived timeseries +// that have live samples before the provided timestamp. This method is +// goroutine-safe. 
+func (p *persistence) fingerprintsModifiedBefore(beforeTime model.Time) ([]model.Fingerprint, error) { + var fp codable.Fingerprint + var tr codable.TimeRange + fps := []model.Fingerprint{} + err := p.archivedFingerprintToTimeRange.ForEach(func(kv index.KeyValueAccessor) error { + if err := kv.Value(&tr); err != nil { + return err + } + if tr.First.Before(beforeTime) { + if err := kv.Key(&fp); err != nil { + return err + } + fps = append(fps, model.Fingerprint(fp)) + } + return nil + }) + return fps, err +} + +// archivedMetric retrieves the archived metric with the given fingerprint. This +// method is goroutine-safe. +func (p *persistence) archivedMetric(fp model.Fingerprint) (model.Metric, error) { + metric, _, err := p.archivedFingerprintToMetrics.Lookup(fp) + if err != nil { + p.setDirty(fmt.Errorf("error in method archivedMetric(%v): %s", fp, err)) + return nil, err + } + return metric, nil +} + +// purgeArchivedMetric deletes an archived fingerprint and its corresponding +// metric entirely. It also queues the metric for un-indexing (no need to call +// unindexMetric for the deleted metric.) It does not touch the series file, +// though. The caller must have locked the fingerprint. +func (p *persistence) purgeArchivedMetric(fp model.Fingerprint) (err error) { + defer func() { + if err != nil { + p.setDirty(fmt.Errorf("error in method purgeArchivedMetric(%v): %s", fp, err)) + } + }() + + metric, err := p.archivedMetric(fp) + if err != nil || metric == nil { + return err + } + deleted, err := p.archivedFingerprintToMetrics.Delete(codable.Fingerprint(fp)) + if err != nil { + return err + } + if !deleted { + log.Errorf("Tried to delete non-archived fingerprint %s from archivedFingerprintToMetrics index. 
This should never happen.", fp) + } + deleted, err = p.archivedFingerprintToTimeRange.Delete(codable.Fingerprint(fp)) + if err != nil { + return err + } + if !deleted { + log.Errorf("Tried to delete non-archived fingerprint %s from archivedFingerprintToTimeRange index. This should never happen.", fp) + } + p.unindexMetric(fp, metric) + return nil +} + +// unarchiveMetric deletes an archived fingerprint and its metric, but (in +// contrast to purgeArchivedMetric) does not un-index the metric. If a metric +// was actually deleted, the method returns true and the first time and last +// time of the deleted metric. The caller must have locked the fingerprint. +func (p *persistence) unarchiveMetric(fp model.Fingerprint) (deletedAnything bool, err error) { + // An error returned here will bubble up and lead to quarantining of the + // series, so no setDirty required. + deleted, err := p.archivedFingerprintToMetrics.Delete(codable.Fingerprint(fp)) + if err != nil || !deleted { + return false, err + } + deleted, err = p.archivedFingerprintToTimeRange.Delete(codable.Fingerprint(fp)) + if err != nil { + return false, err + } + if !deleted { + log.Errorf("Tried to delete non-archived fingerprint %s from archivedFingerprintToTimeRange index. This should never happen.", fp) + } + return true, nil +} + +// close flushes the indexing queue and other buffered data and releases any +// held resources. It also removes the dirty marker file if successful and if +// the persistence is currently not marked as dirty. 
+func (p *persistence) close() error { + close(p.indexingQueue) + <-p.indexingStopped + + var lastError, dirtyFileRemoveError error + if err := p.archivedFingerprintToMetrics.Close(); err != nil { + lastError = err + log.Error("Error closing archivedFingerprintToMetric index DB: ", err) + } + if err := p.archivedFingerprintToTimeRange.Close(); err != nil { + lastError = err + log.Error("Error closing archivedFingerprintToTimeRange index DB: ", err) + } + if err := p.labelPairToFingerprints.Close(); err != nil { + lastError = err + log.Error("Error closing labelPairToFingerprints index DB: ", err) + } + if err := p.labelNameToLabelValues.Close(); err != nil { + lastError = err + log.Error("Error closing labelNameToLabelValues index DB: ", err) + } + if lastError == nil && !p.isDirty() { + dirtyFileRemoveError = os.Remove(p.dirtyFileName) + } + if err := p.fLock.Release(); err != nil { + lastError = err + log.Error("Error releasing file lock: ", err) + } + if dirtyFileRemoveError != nil { + // On Windows, removing the dirty file before unlocking is not + // possible. So remove it here if it failed above. 
+ lastError = os.Remove(p.dirtyFileName) + } + return lastError +} + +func (p *persistence) dirNameForFingerprint(fp model.Fingerprint) string { + fpStr := fp.String() + return filepath.Join(p.basePath, fpStr[0:seriesDirNameLen]) +} + +func (p *persistence) fileNameForFingerprint(fp model.Fingerprint) string { + fpStr := fp.String() + return filepath.Join(p.basePath, fpStr[0:seriesDirNameLen], fpStr[seriesDirNameLen:]+seriesFileSuffix) +} + +func (p *persistence) tempFileNameForFingerprint(fp model.Fingerprint) string { + fpStr := fp.String() + return filepath.Join(p.basePath, fpStr[0:seriesDirNameLen], fpStr[seriesDirNameLen:]+seriesTempFileSuffix) +} + +func (p *persistence) openChunkFileForWriting(fp model.Fingerprint) (*os.File, error) { + if err := os.MkdirAll(p.dirNameForFingerprint(fp), 0700); err != nil { + return nil, err + } + return os.OpenFile(p.fileNameForFingerprint(fp), os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0640) + // NOTE: Although the file was opened for append, + // f.Seek(0, io.SeekCurrent) + // would now return '0, nil', so we cannot check for a consistent file length right now. + // However, the chunkIndexForOffset function is doing that check, so a wrong file length + // would still be detected. +} + +// closeChunkFile first syncs the provided file if mandated so by the sync +// strategy. Then it closes the file. Errors are logged. 
+func (p *persistence) closeChunkFile(f *os.File) { + if p.shouldSync() { + if err := f.Sync(); err != nil { + log.Error("Error syncing file:", err) + } + } + if err := f.Close(); err != nil { + log.Error("Error closing chunk file:", err) + } +} + +func (p *persistence) openChunkFileForReading(fp model.Fingerprint) (*os.File, error) { + return os.Open(p.fileNameForFingerprint(fp)) +} + +func (p *persistence) headsFileName() string { + return filepath.Join(p.basePath, headsFileName) +} + +func (p *persistence) headsTempFileName() string { + return filepath.Join(p.basePath, headsTempFileName) +} + +func (p *persistence) mappingsFileName() string { + return filepath.Join(p.basePath, mappingsFileName) +} + +func (p *persistence) mappingsTempFileName() string { + return filepath.Join(p.basePath, mappingsTempFileName) +} + +func (p *persistence) processIndexingQueue() { + batchSize := 0 + nameToValues := index.LabelNameLabelValuesMapping{} + pairToFPs := index.LabelPairFingerprintsMapping{} + batchTimeout := time.NewTimer(indexingBatchTimeout) + defer batchTimeout.Stop() + + commitBatch := func() { + p.indexingBatchSizes.Observe(float64(batchSize)) + defer func(begin time.Time) { + p.indexingBatchDuration.Observe(time.Since(begin).Seconds()) + }(time.Now()) + + if err := p.labelPairToFingerprints.IndexBatch(pairToFPs); err != nil { + log.Error("Error indexing label pair to fingerprints batch: ", err) + p.setDirty(err) + } + if err := p.labelNameToLabelValues.IndexBatch(nameToValues); err != nil { + log.Error("Error indexing label name to label values batch: ", err) + p.setDirty(err) + } + batchSize = 0 + nameToValues = index.LabelNameLabelValuesMapping{} + pairToFPs = index.LabelPairFingerprintsMapping{} + batchTimeout.Reset(indexingBatchTimeout) + } + + var flush chan chan int +loop: + for { + // Only process flush requests if the queue is currently empty. 
+ if len(p.indexingQueue) == 0 { + flush = p.indexingFlush + } else { + flush = nil + } + select { + case <-batchTimeout.C: + // Only commit if we have something to commit _and_ + // nothing is waiting in the queue to be picked up. That + // prevents a death spiral if the LookupSet calls below + // are slow for some reason. + if batchSize > 0 && len(p.indexingQueue) == 0 { + commitBatch() + } else { + batchTimeout.Reset(indexingBatchTimeout) + } + case r := <-flush: + if batchSize > 0 { + commitBatch() + } + r <- len(p.indexingQueue) + case op, ok := <-p.indexingQueue: + if !ok { + if batchSize > 0 { + commitBatch() + } + break loop + } + + batchSize++ + for ln, lv := range op.metric { + lp := model.LabelPair{Name: ln, Value: lv} + baseFPs, ok := pairToFPs[lp] + if !ok { + var err error + baseFPs, _, err = p.labelPairToFingerprints.LookupSet(lp) + if err != nil { + log.Errorf("Error looking up label pair %v: %s", lp, err) + continue + } + pairToFPs[lp] = baseFPs + } + baseValues, ok := nameToValues[ln] + if !ok { + var err error + baseValues, _, err = p.labelNameToLabelValues.LookupSet(ln) + if err != nil { + log.Errorf("Error looking up label name %v: %s", ln, err) + continue + } + nameToValues[ln] = baseValues + } + switch op.opType { + case add: + baseFPs[op.fingerprint] = struct{}{} + baseValues[lv] = struct{}{} + case remove: + delete(baseFPs, op.fingerprint) + if len(baseFPs) == 0 { + delete(baseValues, lv) + } + default: + panic("unknown op type") + } + } + + if batchSize >= indexingMaxBatchSize { + commitBatch() + } + } + } + close(p.indexingStopped) +} + +// checkpointFPMappings persists the fingerprint mappings. The caller has to +// ensure that the provided mappings are not changed concurrently. This method +// is only called upon shutdown or during crash recovery, when no samples are +// ingested. +// +// Description of the file format, v1: +// +// (1) Magic string (const mappingsMagicString). 
+// +// (2) Uvarint-encoded format version (const mappingsFormatVersion). +// +// (3) Uvarint-encoded number of mappings in fpMappings. +// +// (4) Repeated once per mapping: +// +// (4.1) The raw fingerprint as big-endian uint64. +// +// (4.2) The uvarint-encoded number of sub-mappings for the raw fingerprint. +// +// (4.3) Repeated once per sub-mapping: +// +// (4.3.1) The uvarint-encoded length of the unique metric string. +// (4.3.2) The unique metric string. +// (4.3.3) The mapped fingerprint as big-endian uint64. +func (p *persistence) checkpointFPMappings(fpm fpMappings) (err error) { + log.Info("Checkpointing fingerprint mappings...") + begin := time.Now() + f, err := os.OpenFile(p.mappingsTempFileName(), os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0640) + if err != nil { + return + } + + defer func() { + syncErr := f.Sync() + closeErr := f.Close() + if err != nil { + return + } + err = syncErr + if err != nil { + return + } + err = closeErr + if err != nil { + return + } + err = os.Rename(p.mappingsTempFileName(), p.mappingsFileName()) + duration := time.Since(begin) + log.Infof("Done checkpointing fingerprint mappings in %v.", duration) + }() + + w := bufio.NewWriterSize(f, fileBufSize) + + if _, err = w.WriteString(mappingsMagicString); err != nil { + return + } + if _, err = codable.EncodeUvarint(w, mappingsFormatVersion); err != nil { + return + } + if _, err = codable.EncodeUvarint(w, uint64(len(fpm))); err != nil { + return + } + + for fp, mappings := range fpm { + if err = codable.EncodeUint64(w, uint64(fp)); err != nil { + return + } + if _, err = codable.EncodeUvarint(w, uint64(len(mappings))); err != nil { + return + } + for ms, mappedFP := range mappings { + if _, err = codable.EncodeUvarint(w, uint64(len(ms))); err != nil { + return + } + if _, err = w.WriteString(ms); err != nil { + return + } + if err = codable.EncodeUint64(w, uint64(mappedFP)); err != nil { + return + } + } + } + err = w.Flush() + return +} + +// loadFPMappings loads the fingerprint 
mappings. It also returns the highest +// mapped fingerprint and any error encountered. If p.mappingsFileName is not +// found, the method returns (fpMappings{}, 0, nil). Do not call concurrently +// with checkpointFPMappings. +func (p *persistence) loadFPMappings() (fpMappings, model.Fingerprint, error) { + fpm := fpMappings{} + var highestMappedFP model.Fingerprint + + f, err := os.Open(p.mappingsFileName()) + if os.IsNotExist(err) { + return fpm, 0, nil + } + if err != nil { + return nil, 0, err + } + defer f.Close() + r := bufio.NewReaderSize(f, fileBufSize) + + buf := make([]byte, len(mappingsMagicString)) + if _, err := io.ReadFull(r, buf); err != nil { + return nil, 0, err + } + magic := string(buf) + if magic != mappingsMagicString { + return nil, 0, fmt.Errorf( + "unexpected magic string, want %q, got %q", + mappingsMagicString, magic, + ) + } + version, err := binary.ReadUvarint(r) + if version != mappingsFormatVersion || err != nil { + return nil, 0, fmt.Errorf("unknown fingerprint mappings format version, want %d", mappingsFormatVersion) + } + numRawFPs, err := binary.ReadUvarint(r) + if err != nil { + return nil, 0, err + } + for ; numRawFPs > 0; numRawFPs-- { + rawFP, err := codable.DecodeUint64(r) + if err != nil { + return nil, 0, err + } + numMappings, err := binary.ReadUvarint(r) + if err != nil { + return nil, 0, err + } + mappings := make(map[string]model.Fingerprint, numMappings) + for ; numMappings > 0; numMappings-- { + lenMS, err := binary.ReadUvarint(r) + if err != nil { + return nil, 0, err + } + buf := make([]byte, lenMS) + if _, err := io.ReadFull(r, buf); err != nil { + return nil, 0, err + } + fp, err := codable.DecodeUint64(r) + if err != nil { + return nil, 0, err + } + mappedFP := model.Fingerprint(fp) + if mappedFP > highestMappedFP { + highestMappedFP = mappedFP + } + mappings[string(buf)] = mappedFP + } + fpm[model.Fingerprint(rawFP)] = mappings + } + return fpm, highestMappedFP, nil +} + +func (p *persistence) writeChunks(w 
io.Writer, chunks []chunk.Chunk) error { + b := p.bufPool.Get().([]byte) + defer func() { + // buf may change below. An unwrapped 'defer p.bufPool.Put(buf)' + // would only put back the original buf. + p.bufPool.Put(b) + }() + numChunks := len(chunks) + + for batchSize := chunkMaxBatchSize; len(chunks) > 0; chunks = chunks[batchSize:] { + if batchSize > len(chunks) { + batchSize = len(chunks) + } + writeSize := batchSize * chunkLenWithHeader + if cap(b) < writeSize { + b = make([]byte, writeSize) + } + b = b[:writeSize] + + for i, chunk := range chunks[:batchSize] { + if err := writeChunkHeader(b[i*chunkLenWithHeader:], chunk); err != nil { + return err + } + if err := chunk.MarshalToBuf(b[i*chunkLenWithHeader+chunkHeaderLen:]); err != nil { + return err + } + } + if _, err := w.Write(b); err != nil { + return err + } + } + p.seriesChunksPersisted.Observe(float64(numChunks)) + return nil +} + +func offsetForChunkIndex(i int) int64 { + return int64(i * chunkLenWithHeader) +} + +func chunkIndexForOffset(offset int64) (int, error) { + if int(offset)%chunkLenWithHeader != 0 { + return -1, fmt.Errorf( + "offset %d is not a multiple of on-disk chunk length %d", + offset, chunkLenWithHeader, + ) + } + return int(offset) / chunkLenWithHeader, nil +} + +func writeChunkHeader(header []byte, c chunk.Chunk) error { + header[chunkHeaderTypeOffset] = byte(c.Encoding()) + binary.LittleEndian.PutUint64( + header[chunkHeaderFirstTimeOffset:], + uint64(c.FirstTime()), + ) + lt, err := c.NewIterator().LastTimestamp() + if err != nil { + return err + } + binary.LittleEndian.PutUint64( + header[chunkHeaderLastTimeOffset:], + uint64(lt), + ) + return nil +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/series.go b/vendor/github.com/prometheus/prometheus/storage/local/series.go new file mode 100644 index 000000000..f58371746 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/series.go @@ -0,0 +1,728 @@ +// Copyright 2014 The Prometheus Authors 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package local + +import ( + "fmt" + "sort" + "sync" + "time" + + "github.com/prometheus/common/model" + + "github.com/prometheus/prometheus/storage/local/chunk" + "github.com/prometheus/prometheus/storage/metric" +) + +// fingerprintSeriesPair pairs a fingerprint with a memorySeries pointer. +type fingerprintSeriesPair struct { + fp model.Fingerprint + series *memorySeries +} + +// seriesMap maps fingerprints to memory series. All its methods are +// goroutine-safe. A SeriesMap is effectively is a goroutine-safe version of +// map[model.Fingerprint]*memorySeries. +type seriesMap struct { + mtx sync.RWMutex + m map[model.Fingerprint]*memorySeries +} + +// newSeriesMap returns a newly allocated empty seriesMap. To create a seriesMap +// based on a prefilled map, use an explicit initializer. +func newSeriesMap() *seriesMap { + return &seriesMap{m: make(map[model.Fingerprint]*memorySeries)} +} + +// length returns the number of mappings in the seriesMap. +func (sm *seriesMap) length() int { + sm.mtx.RLock() + defer sm.mtx.RUnlock() + + return len(sm.m) +} + +// get returns a memorySeries for a fingerprint. Return values have the same +// semantics as the native Go map. +func (sm *seriesMap) get(fp model.Fingerprint) (s *memorySeries, ok bool) { + sm.mtx.RLock() + s, ok = sm.m[fp] + // Note that the RUnlock is not done via defer for performance reasons. 
+ // TODO(beorn7): Once https://github.com/golang/go/issues/14939 is + // fixed, revert to the usual defer idiom. + sm.mtx.RUnlock() + return +} + +// put adds a mapping to the seriesMap. It panics if s == nil. +func (sm *seriesMap) put(fp model.Fingerprint, s *memorySeries) { + sm.mtx.Lock() + defer sm.mtx.Unlock() + + if s == nil { + panic("tried to add nil pointer to seriesMap") + } + sm.m[fp] = s +} + +// del removes a mapping from the series Map. +func (sm *seriesMap) del(fp model.Fingerprint) { + sm.mtx.Lock() + defer sm.mtx.Unlock() + + delete(sm.m, fp) +} + +// iter returns a channel that produces all mappings in the seriesMap. The +// channel will be closed once all fingerprints have been received. Not +// consuming all fingerprints from the channel will leak a goroutine. The +// semantics of concurrent modification of seriesMap is the similar as the one +// for iterating over a map with a 'range' clause. However, if the next element +// in iteration order is removed after the current element has been received +// from the channel, it will still be produced by the channel. +func (sm *seriesMap) iter() <-chan fingerprintSeriesPair { + ch := make(chan fingerprintSeriesPair) + go func() { + sm.mtx.RLock() + for fp, s := range sm.m { + sm.mtx.RUnlock() + ch <- fingerprintSeriesPair{fp, s} + sm.mtx.RLock() + } + sm.mtx.RUnlock() + close(ch) + }() + return ch +} + +// sortedFPs returns a sorted slice of all the fingerprints in the seriesMap. +func (sm *seriesMap) sortedFPs() model.Fingerprints { + sm.mtx.RLock() + fps := make(model.Fingerprints, 0, len(sm.m)) + for fp := range sm.m { + fps = append(fps, fp) + } + sm.mtx.RUnlock() + + // Sorting could take some time, so do it outside of the lock. + sort.Sort(fps) + return fps +} + +type memorySeries struct { + metric model.Metric + // Sorted by start time, overlapping chunk ranges are forbidden. 
+ chunkDescs []*chunk.Desc + // The index (within chunkDescs above) of the first chunk.Desc that + // points to a non-persisted chunk. If all chunks are persisted, then + // persistWatermark == len(chunkDescs). + persistWatermark int + // The modification time of the series file. The zero value of time.Time + // is used to mark an unknown modification time. + modTime time.Time + // The chunkDescs in memory might not have all the chunkDescs for the + // chunks that are persisted to disk. The missing chunkDescs are all + // contiguous and at the tail end. chunkDescsOffset is the index of the + // chunk on disk that corresponds to the first chunk.Desc in memory. If + // it is 0, the chunkDescs are all loaded. A value of -1 denotes a + // special case: There are chunks on disk, but the offset to the + // chunkDescs in memory is unknown. Also, in this special case, there is + // no overlap between chunks on disk and chunks in memory (implying that + // upon first persisting of a chunk in memory, the offset has to be + // set). + chunkDescsOffset int + // The savedFirstTime field is used as a fallback when the + // chunkDescsOffset is not 0. It can be used to save the FirstTime of the + // first chunk before its chunk desc is evicted. In doubt, this field is + // just set to the oldest possible timestamp. + savedFirstTime model.Time + // The timestamp of the last sample in this series. Needed for fast + // access for federation and to ensure timestamp monotonicity during + // ingestion. + lastTime model.Time + // The last ingested sample value. Needed for fast access for + // federation. + lastSampleValue model.SampleValue + // Whether lastSampleValue has been set already. + lastSampleValueSet bool + // Whether the current head chunk has already been finished. If true, + // the current head chunk must not be modified anymore. + headChunkClosed bool + // Whether the current head chunk is used by an iterator. 
In that case, + // a non-closed head chunk has to be cloned before more samples are + // appended. + headChunkUsedByIterator bool + // Whether the series is inconsistent with the last checkpoint in a way + // that would require a disk seek during crash recovery. + dirty bool +} + +// newMemorySeries returns a pointer to a newly allocated memorySeries for the +// given metric. chunkDescs and modTime in the new series are set according to +// the provided parameters. chunkDescs can be nil or empty if this is a +// genuinely new time series (i.e. not one that is being unarchived). In that +// case, headChunkClosed is set to false, and firstTime and lastTime are both +// set to model.Earliest. The zero value for modTime can be used if the +// modification time of the series file is unknown (e.g. if this is a genuinely +// new series). +func newMemorySeries(m model.Metric, chunkDescs []*chunk.Desc, modTime time.Time) (*memorySeries, error) { + var err error + firstTime := model.Earliest + lastTime := model.Earliest + if len(chunkDescs) > 0 { + firstTime = chunkDescs[0].FirstTime() + if lastTime, err = chunkDescs[len(chunkDescs)-1].LastTime(); err != nil { + return nil, err + } + } + return &memorySeries{ + metric: m, + chunkDescs: chunkDescs, + headChunkClosed: len(chunkDescs) > 0, + savedFirstTime: firstTime, + lastTime: lastTime, + persistWatermark: len(chunkDescs), + modTime: modTime, + }, nil +} + +// add adds a sample pair to the series. It returns the number of newly +// completed chunks (which are now eligible for persistence). +// +// The caller must have locked the fingerprint of the series. 
+func (s *memorySeries) add(v model.SamplePair) (int, error) { + if len(s.chunkDescs) == 0 || s.headChunkClosed { + newHead := chunk.NewDesc(chunk.New(), v.Timestamp) + s.chunkDescs = append(s.chunkDescs, newHead) + s.headChunkClosed = false + } else if s.headChunkUsedByIterator && s.head().RefCount() > 1 { + // We only need to clone the head chunk if the current head + // chunk was used in an iterator at all and if the refCount is + // still greater than the 1 we always have because the head + // chunk is not yet persisted. The latter is just an + // approximation. We will still clone unnecessarily if an older + // iterator using a previous version of the head chunk is still + // around and keep the head chunk pinned. We needed to track + // pins by version of the head chunk, which is probably not + // worth the effort. + chunk.Ops.WithLabelValues(chunk.Clone).Inc() + // No locking needed here because a non-persisted head chunk can + // not get evicted concurrently. + s.head().C = s.head().C.Clone() + s.headChunkUsedByIterator = false + } + + chunks, err := s.head().Add(v) + if err != nil { + return 0, err + } + s.head().C = chunks[0] + + for _, c := range chunks[1:] { + s.chunkDescs = append(s.chunkDescs, chunk.NewDesc(c, c.FirstTime())) + } + + // Populate lastTime of now-closed chunks. + for _, cd := range s.chunkDescs[len(s.chunkDescs)-len(chunks) : len(s.chunkDescs)-1] { + if err := cd.MaybePopulateLastTime(); err != nil { + return 0, err + } + } + + s.lastTime = v.Timestamp + s.lastSampleValue = v.Value + s.lastSampleValueSet = true + return len(chunks) - 1, nil +} + +// maybeCloseHeadChunk closes the head chunk if it has not been touched for the +// provided duration. It returns whether the head chunk was closed. If the head +// chunk is already closed, the method is a no-op and returns false. +// +// The caller must have locked the fingerprint of the series. 
+func (s *memorySeries) maybeCloseHeadChunk(timeout time.Duration) (bool, error) { + if s.headChunkClosed { + return false, nil + } + if time.Since(s.lastTime.Time()) > timeout { + s.headChunkClosed = true + // Since we cannot modify the head chunk from now on, we + // don't need to bother with cloning anymore. + s.headChunkUsedByIterator = false + return true, s.head().MaybePopulateLastTime() + } + return false, nil +} + +// evictChunkDescs evicts chunkDescs. lenToEvict is the index within the current +// chunkDescs of the oldest chunk that is not evicted. +func (s *memorySeries) evictChunkDescs(lenToEvict int) { + if lenToEvict < 1 { + return + } + if s.chunkDescsOffset < 0 { + panic("chunk desc eviction requested with unknown chunk desc offset") + } + lenToKeep := len(s.chunkDescs) - lenToEvict + s.savedFirstTime = s.firstTime() + s.chunkDescsOffset += lenToEvict + s.persistWatermark -= lenToEvict + chunk.DescOps.WithLabelValues(chunk.Evict).Add(float64(lenToEvict)) + chunk.NumMemDescs.Sub(float64(lenToEvict)) + s.chunkDescs = append( + make([]*chunk.Desc, 0, lenToKeep), + s.chunkDescs[lenToEvict:]..., + ) + s.dirty = true +} + +// dropChunks removes chunkDescs older than t. The caller must have locked the +// fingerprint of the series. +func (s *memorySeries) dropChunks(t model.Time) error { + keepIdx := len(s.chunkDescs) + for i, cd := range s.chunkDescs { + lt, err := cd.LastTime() + if err != nil { + return err + } + if !lt.Before(t) { + keepIdx = i + break + } + } + if keepIdx == len(s.chunkDescs) && !s.headChunkClosed { + // Never drop an open head chunk. + keepIdx-- + } + if keepIdx <= 0 { + // Nothing to drop. 
+ return nil + } + s.chunkDescs = append( + make([]*chunk.Desc, 0, len(s.chunkDescs)-keepIdx), + s.chunkDescs[keepIdx:]..., + ) + s.persistWatermark -= keepIdx + if s.persistWatermark < 0 { + panic("dropped unpersisted chunks from memory") + } + if s.chunkDescsOffset != -1 { + s.chunkDescsOffset += keepIdx + } + chunk.NumMemDescs.Sub(float64(keepIdx)) + s.dirty = true + return nil +} + +// preloadChunks is an internal helper method. +func (s *memorySeries) preloadChunks( + indexes []int, fp model.Fingerprint, mss *MemorySeriesStorage, +) (SeriesIterator, error) { + loadIndexes := []int{} + pinnedChunkDescs := make([]*chunk.Desc, 0, len(indexes)) + for _, idx := range indexes { + cd := s.chunkDescs[idx] + pinnedChunkDescs = append(pinnedChunkDescs, cd) + cd.Pin(mss.evictRequests) // Have to pin everything first to prevent immediate eviction on chunk loading. + if cd.IsEvicted() { + loadIndexes = append(loadIndexes, idx) + } + } + chunk.Ops.WithLabelValues(chunk.Pin).Add(float64(len(pinnedChunkDescs))) + + if len(loadIndexes) > 0 { + if s.chunkDescsOffset == -1 { + panic("requested loading chunks from persistence in a situation where we must not have persisted data for chunk descriptors in memory") + } + chunks, err := mss.loadChunks(fp, loadIndexes, s.chunkDescsOffset) + if err != nil { + // Unpin the chunks since we won't return them as pinned chunks now. 
+ for _, cd := range pinnedChunkDescs { + cd.Unpin(mss.evictRequests) + } + chunk.Ops.WithLabelValues(chunk.Unpin).Add(float64(len(pinnedChunkDescs))) + return nopIter, err + } + for i, c := range chunks { + s.chunkDescs[loadIndexes[i]].SetChunk(c) + } + } + + if !s.headChunkClosed && indexes[len(indexes)-1] == len(s.chunkDescs)-1 { + s.headChunkUsedByIterator = true + } + + curriedQuarantineSeries := func(err error) { + mss.quarantineSeries(fp, s.metric, err) + } + + iter := &boundedIterator{ + it: s.newIterator(pinnedChunkDescs, curriedQuarantineSeries, mss.evictRequests), + start: model.Now().Add(-mss.dropAfter), + } + + return iter, nil +} + +// newIterator returns a new SeriesIterator for the provided chunkDescs (which +// must be pinned). +// +// The caller must have locked the fingerprint of the memorySeries. +func (s *memorySeries) newIterator( + pinnedChunkDescs []*chunk.Desc, + quarantine func(error), + evictRequests chan<- chunk.EvictRequest, +) SeriesIterator { + chunks := make([]chunk.Chunk, 0, len(pinnedChunkDescs)) + for _, cd := range pinnedChunkDescs { + // It's OK to directly access cd.c here (without locking) as the + // series FP is locked and the chunk is pinned. + chunks = append(chunks, cd.C) + } + return &memorySeriesIterator{ + chunks: chunks, + chunkIts: make([]chunk.Iterator, len(chunks)), + quarantine: quarantine, + metric: s.metric, + pinnedChunkDescs: pinnedChunkDescs, + evictRequests: evictRequests, + } +} + +// preloadChunksForInstant preloads chunks for the latest value in the given +// range. If the last sample saved in the memorySeries itself is the latest +// value in the given range, it will in fact preload zero chunks and just take +// that value. 
+func (s *memorySeries) preloadChunksForInstant( + fp model.Fingerprint, + from model.Time, through model.Time, + mss *MemorySeriesStorage, +) (SeriesIterator, error) { + // If we have a lastSamplePair in the series, and this last samplePair + // is in the interval, just take it in a singleSampleSeriesIterator. No + // need to pin or load anything. + lastSample := s.lastSamplePair() + if !through.Before(lastSample.Timestamp) && + !from.After(lastSample.Timestamp) && + lastSample != model.ZeroSamplePair { + iter := &boundedIterator{ + it: &singleSampleSeriesIterator{ + samplePair: lastSample, + metric: s.metric, + }, + start: model.Now().Add(-mss.dropAfter), + } + return iter, nil + } + // If we are here, we are out of luck and have to delegate to the more + // expensive method. + return s.preloadChunksForRange(fp, from, through, mss) +} + +// preloadChunksForRange loads chunks for the given range from the persistence. +// The caller must have locked the fingerprint of the series. +func (s *memorySeries) preloadChunksForRange( + fp model.Fingerprint, + from model.Time, through model.Time, + mss *MemorySeriesStorage, +) (SeriesIterator, error) { + firstChunkDescTime := model.Latest + if len(s.chunkDescs) > 0 { + firstChunkDescTime = s.chunkDescs[0].FirstTime() + } + if s.chunkDescsOffset != 0 && from.Before(firstChunkDescTime) { + cds, err := mss.loadChunkDescs(fp, s.persistWatermark) + if err != nil { + return nopIter, err + } + if s.chunkDescsOffset != -1 && len(cds) != s.chunkDescsOffset { + return nopIter, fmt.Errorf( + "unexpected number of chunk descs loaded for fingerprint %v: expected %d, got %d", + fp, s.chunkDescsOffset, len(cds), + ) + } + s.persistWatermark += len(cds) + s.chunkDescs = append(cds, s.chunkDescs...) 
+ s.chunkDescsOffset = 0 + if len(s.chunkDescs) > 0 { + firstChunkDescTime = s.chunkDescs[0].FirstTime() + } + } + + if len(s.chunkDescs) == 0 || through.Before(firstChunkDescTime) { + return nopIter, nil + } + + // Find first chunk with start time after "from". + fromIdx := sort.Search(len(s.chunkDescs), func(i int) bool { + return s.chunkDescs[i].FirstTime().After(from) + }) + // Find first chunk with start time after "through". + throughIdx := sort.Search(len(s.chunkDescs), func(i int) bool { + return s.chunkDescs[i].FirstTime().After(through) + }) + if fromIdx == len(s.chunkDescs) { + // Even the last chunk starts before "from". Find out if the + // series ends before "from" and we don't need to do anything. + lt, err := s.chunkDescs[len(s.chunkDescs)-1].LastTime() + if err != nil { + return nopIter, err + } + if lt.Before(from) { + return nopIter, nil + } + } + if fromIdx > 0 { + fromIdx-- + } + if throughIdx == len(s.chunkDescs) { + throughIdx-- + } + if fromIdx > throughIdx { + // Guard against nonsensical result. The caller will quarantine the series with a meaningful log entry. + return nopIter, fmt.Errorf("fromIdx=%d is greater than throughIdx=%d, likely caused by data corruption", fromIdx, throughIdx) + } + + pinIndexes := make([]int, 0, throughIdx-fromIdx+1) + for i := fromIdx; i <= throughIdx; i++ { + pinIndexes = append(pinIndexes, i) + } + return s.preloadChunks(pinIndexes, fp, mss) +} + +// head returns a pointer to the head chunk descriptor. The caller must have +// locked the fingerprint of the memorySeries. This method will panic if this +// series has no chunk descriptors. +func (s *memorySeries) head() *chunk.Desc { + return s.chunkDescs[len(s.chunkDescs)-1] +} + +// firstTime returns the timestamp of the first sample in the series. +// +// The caller must have locked the fingerprint of the memorySeries. 
+func (s *memorySeries) firstTime() model.Time { + if s.chunkDescsOffset == 0 && len(s.chunkDescs) > 0 { + return s.chunkDescs[0].FirstTime() + } + return s.savedFirstTime +} + +// lastSamplePair returns the last ingested SamplePair. It returns +// model.ZeroSamplePair if this memorySeries has never received a sample (via the add +// method), which is the case for freshly unarchived series or newly created +// ones and also for all series after a server restart. However, in that case, +// series will most likely be considered stale anyway. +// +// The caller must have locked the fingerprint of the memorySeries. +func (s *memorySeries) lastSamplePair() model.SamplePair { + if !s.lastSampleValueSet { + return model.ZeroSamplePair + } + return model.SamplePair{ + Timestamp: s.lastTime, + Value: s.lastSampleValue, + } +} + +// chunksToPersist returns a slice of chunkDescs eligible for persistence. It's +// the caller's responsibility to actually persist the returned chunks +// afterwards. The method sets the persistWatermark and the dirty flag +// accordingly. +// +// The caller must have locked the fingerprint of the series. +func (s *memorySeries) chunksToPersist() []*chunk.Desc { + newWatermark := len(s.chunkDescs) + if !s.headChunkClosed { + newWatermark-- + } + if newWatermark == s.persistWatermark { + return nil + } + cds := s.chunkDescs[s.persistWatermark:newWatermark] + s.dirty = true + s.persistWatermark = newWatermark + return cds +} + +// memorySeriesIterator implements SeriesIterator. +type memorySeriesIterator struct { + // Last chunk.Iterator used by ValueAtOrBeforeTime. + chunkIt chunk.Iterator + // Caches chunkIterators. + chunkIts []chunk.Iterator + // The actual sample chunks. + chunks []chunk.Chunk + // Call to quarantine the series this iterator belongs to. + quarantine func(error) + // The metric corresponding to the iterator. + metric model.Metric + // Chunks that were pinned for this iterator. 
+ pinnedChunkDescs []*chunk.Desc + // Where to send evict requests when unpinning pinned chunks. + evictRequests chan<- chunk.EvictRequest +} + +// ValueAtOrBeforeTime implements SeriesIterator. +func (it *memorySeriesIterator) ValueAtOrBeforeTime(t model.Time) model.SamplePair { + // The most common case. We are iterating through a chunk. + if it.chunkIt != nil { + containsT, err := it.chunkIt.Contains(t) + if err != nil { + it.quarantine(err) + return model.ZeroSamplePair + } + if containsT { + if it.chunkIt.FindAtOrBefore(t) { + return it.chunkIt.Value() + } + if it.chunkIt.Err() != nil { + it.quarantine(it.chunkIt.Err()) + } + return model.ZeroSamplePair + } + } + + if len(it.chunks) == 0 { + return model.ZeroSamplePair + } + + // Find the last chunk where FirstTime() is before or equal to t. + l := len(it.chunks) - 1 + i := sort.Search(len(it.chunks), func(i int) bool { + return !it.chunks[l-i].FirstTime().After(t) + }) + if i == len(it.chunks) { + // Even the first chunk starts after t. + return model.ZeroSamplePair + } + it.chunkIt = it.chunkIterator(l - i) + if it.chunkIt.FindAtOrBefore(t) { + return it.chunkIt.Value() + } + if it.chunkIt.Err() != nil { + it.quarantine(it.chunkIt.Err()) + } + return model.ZeroSamplePair +} + +// RangeValues implements SeriesIterator. +func (it *memorySeriesIterator) RangeValues(in metric.Interval) []model.SamplePair { + // Find the first chunk for which the first sample is within the interval. + i := sort.Search(len(it.chunks), func(i int) bool { + return !it.chunks[i].FirstTime().Before(in.OldestInclusive) + }) + // Only now check the last timestamp of the previous chunk (which is + // fairly expensive). 
+ if i > 0 { + lt, err := it.chunkIterator(i - 1).LastTimestamp() + if err != nil { + it.quarantine(err) + return nil + } + if !lt.Before(in.OldestInclusive) { + i-- + } + } + + values := []model.SamplePair{} + for j, c := range it.chunks[i:] { + if c.FirstTime().After(in.NewestInclusive) { + break + } + chValues, err := chunk.RangeValues(it.chunkIterator(i+j), in) + if err != nil { + it.quarantine(err) + return nil + } + values = append(values, chValues...) + } + return values +} + +func (it *memorySeriesIterator) Metric() metric.Metric { + return metric.Metric{Metric: it.metric} +} + +// chunkIterator returns the chunk.Iterator for the chunk at position i (and +// creates it if needed). +func (it *memorySeriesIterator) chunkIterator(i int) chunk.Iterator { + chunkIt := it.chunkIts[i] + if chunkIt == nil { + chunkIt = it.chunks[i].NewIterator() + it.chunkIts[i] = chunkIt + } + return chunkIt +} + +func (it *memorySeriesIterator) Close() { + for _, cd := range it.pinnedChunkDescs { + cd.Unpin(it.evictRequests) + } + chunk.Ops.WithLabelValues(chunk.Unpin).Add(float64(len(it.pinnedChunkDescs))) +} + +// singleSampleSeriesIterator implements Series Iterator. It is a "shortcut +// iterator" that returns a single sample only. The sample is saved in the +// iterator itself, so no chunks need to be pinned. +type singleSampleSeriesIterator struct { + samplePair model.SamplePair + metric model.Metric +} + +// ValueAtTime implements SeriesIterator. +func (it *singleSampleSeriesIterator) ValueAtOrBeforeTime(t model.Time) model.SamplePair { + if it.samplePair.Timestamp.After(t) { + return model.ZeroSamplePair + } + return it.samplePair +} + +// RangeValues implements SeriesIterator. 
+func (it *singleSampleSeriesIterator) RangeValues(in metric.Interval) []model.SamplePair { + if it.samplePair.Timestamp.After(in.NewestInclusive) || + it.samplePair.Timestamp.Before(in.OldestInclusive) { + return []model.SamplePair{} + } + return []model.SamplePair{it.samplePair} +} + +func (it *singleSampleSeriesIterator) Metric() metric.Metric { + return metric.Metric{Metric: it.metric} +} + +// Close implements SeriesIterator. +func (it *singleSampleSeriesIterator) Close() {} + +// nopSeriesIterator implements Series Iterator. It never returns any values. +type nopSeriesIterator struct{} + +// ValueAtTime implements SeriesIterator. +func (i nopSeriesIterator) ValueAtOrBeforeTime(t model.Time) model.SamplePair { + return model.ZeroSamplePair +} + +// RangeValues implements SeriesIterator. +func (i nopSeriesIterator) RangeValues(in metric.Interval) []model.SamplePair { + return []model.SamplePair{} +} + +// Metric implements SeriesIterator. +func (i nopSeriesIterator) Metric() metric.Metric { + return metric.Metric{} +} + +// Close implements SeriesIterator. +func (i nopSeriesIterator) Close() {} + +var nopIter nopSeriesIterator // A nopSeriesIterator for convenience. Can be shared. diff --git a/vendor/github.com/prometheus/prometheus/storage/local/storage.go b/vendor/github.com/prometheus/prometheus/storage/local/storage.go new file mode 100644 index 000000000..c1caef67e --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/storage.go @@ -0,0 +1,2029 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Package local contains the local time series storage used by Prometheus. +package local + +import ( + "container/list" + "errors" + "fmt" + "math/rand" + "runtime" + "sort" + "sync" + "sync/atomic" + "time" + + opentracing "github.com/opentracing/opentracing-go" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" + "github.com/prometheus/common/model" + "golang.org/x/net/context" + + "github.com/prometheus/prometheus/storage/local/chunk" + "github.com/prometheus/prometheus/storage/metric" +) + +const ( + evictRequestsCap = 1024 + quarantineRequestsCap = 1024 + + // See waitForNextFP. + fpMaxSweepTime = 6 * time.Hour + fpMaxWaitDuration = 10 * time.Second + + // See handleEvictList. This should be clearly shorter than the usual CG + // interval. On the other hand, each evict check calls ReadMemStats, + // which involves stopping the world (at least up to Go1.8). Hence, + // don't just set this to a very short interval. + evictInterval = time.Second + + // Constants to control the hysteresis of entering and leaving "rushed + // mode". In rushed mode, the dirty series count is ignored for + // checkpointing, series are maintained as frequently as possible, and + // series files are not synced if the adaptive sync strategy is used. + persintenceUrgencyScoreForEnteringRushedMode = 0.8 + persintenceUrgencyScoreForLeavingRushedMode = 0.7 + + // This factor times -storage.local.memory-chunks is the number of + // memory chunks we tolerate before throttling the storage. It is also a + // basis for calculating the persistenceUrgencyScore. + toleranceFactorMemChunks = 1.1 + // This factor times -storage.local.max-chunks-to-persist is the minimum + // required number of chunks waiting for persistence before the number + // of chunks in memory may influence the persistenceUrgencyScore. 
(In + // other words: if there are no chunks to persist, it doesn't help chunk + // eviction if we speed up persistence.) + factorMinChunksToPersist = 0.2 + + // Threshold for when to stop using LabelMatchers to retrieve and + // intersect fingerprints. The rationale here is that looking up more + // fingerprints has diminishing returns if we already have narrowed down + // the possible fingerprints significantly. It is then easier to simply + // lookup the metrics for all the fingerprints and directly compare them + // to the matchers. Since a fingerprint lookup for an Equal matcher is + // much less expensive, there is a lower threshold for that case. + // TODO(beorn7): These numbers need to be tweaked, probably a bit lower. + // 5x higher numbers have resulted in slightly worse performance in a + // real-life production scenario. + fpEqualMatchThreshold = 1000 + fpOtherMatchThreshold = 10000 + + selectorsTag = "selectors" + fromTag = "from" + throughTag = "through" + tsTag = "ts" + numSeries = "num_series" +) + +type quarantineRequest struct { + fp model.Fingerprint + metric model.Metric + reason error +} + +// SyncStrategy is an enum to select a sync strategy for series files. +type SyncStrategy int + +// String implements flag.Value. +func (ss SyncStrategy) String() string { + switch ss { + case Adaptive: + return "adaptive" + case Always: + return "always" + case Never: + return "never" + } + return "" +} + +// Set implements flag.Value. +func (ss *SyncStrategy) Set(s string) error { + switch s { + case "adaptive": + *ss = Adaptive + case "always": + *ss = Always + case "never": + *ss = Never + default: + return fmt.Errorf("invalid sync strategy: %s", s) + } + return nil +} + +// Possible values for SyncStrategy. +const ( + _ SyncStrategy = iota + Never + Always + Adaptive +) + +// A syncStrategy is a function that returns whether series files should be +// synced or not. It does not need to be goroutine safe. 
+type syncStrategy func() bool + +// A MemorySeriesStorage manages series in memory over time, while also +// interfacing with a persistence layer to make time series data persistent +// across restarts and evictable from memory. +type MemorySeriesStorage struct { + // archiveHighWatermark, chunksToPersist, persistUrgency have to be aligned for atomic operations. + archiveHighWatermark model.Time // No archived series has samples after this time. + numChunksToPersist int64 // The number of chunks waiting for persistence. + persistUrgency int32 // Persistence urgency score * 1000, int32 allows atomic operations. + rushed bool // Whether the storage is in rushed mode. + rushedMtx sync.Mutex // Protects rushed. + lastNumGC uint32 // To detect if a GC cycle has run. + throttled chan struct{} // This chan is sent to whenever NeedsThrottling() returns true (for logging). + + fpLocker *fingerprintLocker + fpToSeries *seriesMap + + options *MemorySeriesStorageOptions + + loopStopping, loopStopped chan struct{} + logThrottlingStopped chan struct{} + targetHeapSize uint64 + dropAfter time.Duration + headChunkTimeout time.Duration + checkpointInterval time.Duration + checkpointDirtySeriesLimit int + + persistence *persistence + mapper *fpMapper + + evictList *list.List + evictRequests chan chunk.EvictRequest + evictStopping, evictStopped chan struct{} + + quarantineRequests chan quarantineRequest + quarantineStopping, quarantineStopped chan struct{} + + persistErrors prometheus.Counter + queuedChunksToPersist prometheus.Counter + chunksToPersist prometheus.GaugeFunc + memorySeries prometheus.Gauge + headChunks prometheus.Gauge + dirtySeries prometheus.Gauge + seriesOps *prometheus.CounterVec + ingestedSamples prometheus.Counter + discardedSamples *prometheus.CounterVec + nonExistentSeriesMatches prometheus.Counter + memChunks prometheus.GaugeFunc + maintainSeriesDuration *prometheus.SummaryVec + persistenceUrgencyScore prometheus.GaugeFunc + rushedMode prometheus.GaugeFunc + 
targetHeapSizeBytes prometheus.GaugeFunc +} + +// MemorySeriesStorageOptions contains options needed by +// NewMemorySeriesStorage. It is not safe to leave any of those at their zero +// values. +type MemorySeriesStorageOptions struct { + TargetHeapSize uint64 // Desired maximum heap size. + PersistenceStoragePath string // Location of persistence files. + PersistenceRetentionPeriod time.Duration // Chunks at least that old are dropped. + HeadChunkTimeout time.Duration // Head chunks idle for at least that long may be closed. + CheckpointInterval time.Duration // How often to checkpoint the series map and head chunks. + CheckpointDirtySeriesLimit int // How many dirty series will trigger an early checkpoint. + Dirty bool // Force the storage to consider itself dirty on startup. + PedanticChecks bool // If dirty, perform crash-recovery checks on each series file. + SyncStrategy SyncStrategy // Which sync strategy to apply to series files. + MinShrinkRatio float64 // Minimum ratio a series file has to shrink during truncation. + NumMutexes int // Number of mutexes used for stochastic fingerprint locking. +} + +// NewMemorySeriesStorage returns a newly allocated Storage. Storage.Serve still +// has to be called to start the storage. 
+func NewMemorySeriesStorage(o *MemorySeriesStorageOptions) *MemorySeriesStorage { + s := &MemorySeriesStorage{ + fpLocker: newFingerprintLocker(o.NumMutexes), + + options: o, + + loopStopping: make(chan struct{}), + loopStopped: make(chan struct{}), + logThrottlingStopped: make(chan struct{}), + throttled: make(chan struct{}, 1), + targetHeapSize: o.TargetHeapSize, + dropAfter: o.PersistenceRetentionPeriod, + headChunkTimeout: o.HeadChunkTimeout, + checkpointInterval: o.CheckpointInterval, + checkpointDirtySeriesLimit: o.CheckpointDirtySeriesLimit, + archiveHighWatermark: model.Now().Add(-o.HeadChunkTimeout), + + evictList: list.New(), + evictRequests: make(chan chunk.EvictRequest, evictRequestsCap), + evictStopping: make(chan struct{}), + evictStopped: make(chan struct{}), + + quarantineRequests: make(chan quarantineRequest, quarantineRequestsCap), + quarantineStopping: make(chan struct{}), + quarantineStopped: make(chan struct{}), + + persistErrors: prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "persist_errors_total", + Help: "The total number of errors while writing to the persistence layer.", + }), + queuedChunksToPersist: prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "queued_chunks_to_persist_total", + Help: "The total number of chunks queued for persistence.", + }), + memorySeries: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "memory_series", + Help: "The current number of series in memory.", + }), + headChunks: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "open_head_chunks", + Help: "The current number of open head chunks.", + }), + dirtySeries: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "memory_dirty_series", + Help: "The current number of series that would require a disk seek during 
crash recovery.", + }), + seriesOps: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "series_ops_total", + Help: "The total number of series operations by their type.", + }, + []string{opTypeLabel}, + ), + ingestedSamples: prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "ingested_samples_total", + Help: "The total number of samples ingested.", + }), + discardedSamples: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "out_of_order_samples_total", + Help: "The total number of samples that were discarded because their timestamps were at or before the last received sample for a series.", + }, + []string{discardReasonLabel}, + ), + nonExistentSeriesMatches: prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "non_existent_series_matches_total", + Help: "How often a non-existent series was referred to during label matching or chunk preloading. This is an indication of outdated label indexes.", + }), + memChunks: prometheus.NewGaugeFunc( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "memory_chunks", + Help: "The current number of chunks in memory. The number does not include cloned chunks (i.e. 
chunks without a descriptor).", + }, + func() float64 { return float64(atomic.LoadInt64(&chunk.NumMemChunks)) }, + ), + maintainSeriesDuration: prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "maintain_series_duration_seconds", + Help: "The duration in seconds it took to perform maintenance on a series.", + }, + []string{seriesLocationLabel}, + ), + } + + s.chunksToPersist = prometheus.NewGaugeFunc( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "chunks_to_persist", + Help: "The current number of chunks waiting for persistence.", + }, + func() float64 { + return float64(s.getNumChunksToPersist()) + }, + ) + s.rushedMode = prometheus.NewGaugeFunc( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "rushed_mode", + Help: "1 if the storage is in rushed mode, 0 otherwise.", + }, + func() float64 { + s.rushedMtx.Lock() + defer s.rushedMtx.Unlock() + if s.rushed { + return 1 + } + return 0 + }, + ) + s.persistenceUrgencyScore = prometheus.NewGaugeFunc( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "persistence_urgency_score", + Help: "A score of urgency to persist chunks, 0 is least urgent, 1 most.", + }, + func() float64 { + score, _ := s.getPersistenceUrgencyScore() + return score + }, + ) + s.targetHeapSizeBytes = prometheus.NewGaugeFunc( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "target_heap_size_bytes", + Help: "The configured target heap size in bytes.", + }, + func() float64 { + return float64(s.targetHeapSize) + }, + ) + + // Initialize metric vectors. + // TODO(beorn7): Rework once we have a utility function for it in client_golang. 
+ s.discardedSamples.WithLabelValues(outOfOrderTimestamp) + s.discardedSamples.WithLabelValues(duplicateSample) + s.maintainSeriesDuration.WithLabelValues(maintainInMemory) + s.maintainSeriesDuration.WithLabelValues(maintainArchived) + s.seriesOps.WithLabelValues(create) + s.seriesOps.WithLabelValues(archive) + s.seriesOps.WithLabelValues(unarchive) + s.seriesOps.WithLabelValues(memoryPurge) + s.seriesOps.WithLabelValues(archivePurge) + s.seriesOps.WithLabelValues(requestedPurge) + s.seriesOps.WithLabelValues(memoryMaintenance) + s.seriesOps.WithLabelValues(archiveMaintenance) + s.seriesOps.WithLabelValues(completedQurantine) + s.seriesOps.WithLabelValues(droppedQuarantine) + s.seriesOps.WithLabelValues(failedQuarantine) + + return s +} + +// Start implements Storage. +func (s *MemorySeriesStorage) Start() (err error) { + var syncStrategy syncStrategy + switch s.options.SyncStrategy { + case Never: + syncStrategy = func() bool { return false } + case Always: + syncStrategy = func() bool { return true } + case Adaptive: + syncStrategy = func() bool { + _, rushed := s.getPersistenceUrgencyScore() + return !rushed + } + default: + panic("unknown sync strategy") + } + + var p *persistence + p, err = newPersistence( + s.options.PersistenceStoragePath, + s.options.Dirty, s.options.PedanticChecks, + syncStrategy, + s.options.MinShrinkRatio, + ) + if err != nil { + return err + } + s.persistence = p + // Persistence must start running before loadSeriesMapAndHeads() is called. 
+ go s.persistence.run() + + defer func() { + if err != nil { + if e := p.close(); e != nil { + log.Errorln("Error closing persistence:", e) + } + } + }() + + log.Info("Loading series map and head chunks...") + s.fpToSeries, s.numChunksToPersist, err = p.loadSeriesMapAndHeads() + for _, series := range s.fpToSeries.m { + if !series.headChunkClosed { + s.headChunks.Inc() + } + } + + if err != nil { + return err + } + log.Infof("%d series loaded.", s.fpToSeries.length()) + s.memorySeries.Set(float64(s.fpToSeries.length())) + + s.mapper, err = newFPMapper(s.fpToSeries, p) + if err != nil { + return err + } + + go s.handleEvictList() + go s.handleQuarantine() + go s.logThrottling() + go s.loop() + + return nil +} + +// Stop implements Storage. +func (s *MemorySeriesStorage) Stop() error { + log.Info("Stopping local storage...") + + log.Info("Stopping maintenance loop...") + close(s.loopStopping) + <-s.loopStopped + + log.Info("Stopping series quarantining...") + close(s.quarantineStopping) + <-s.quarantineStopped + + log.Info("Stopping chunk eviction...") + close(s.evictStopping) + <-s.evictStopped + + // One final checkpoint of the series map and the head chunks. + if err := s.persistence.checkpointSeriesMapAndHeads( + context.Background(), s.fpToSeries, s.fpLocker, + ); err != nil { + return err + } + if err := s.mapper.checkpoint(); err != nil { + return err + } + + if err := s.persistence.close(); err != nil { + return err + } + log.Info("Local storage stopped.") + return nil +} + +type memorySeriesStorageQuerier struct { + *MemorySeriesStorage +} + +func (memorySeriesStorageQuerier) Close() error { + return nil +} + +// Querier implements the storage interface. +func (s *MemorySeriesStorage) Querier() (Querier, error) { + return memorySeriesStorageQuerier{s}, nil +} + +// WaitForIndexing implements Storage. +func (s *MemorySeriesStorage) WaitForIndexing() { + s.persistence.waitForIndexing() +} + +// LastSampleForLabelMatchers implements Storage. 
+func (s *MemorySeriesStorage) LastSampleForLabelMatchers(_ context.Context, cutoff model.Time, matcherSets ...metric.LabelMatchers) (model.Vector, error) { + mergedFPs := map[model.Fingerprint]struct{}{} + for _, matchers := range matcherSets { + fps, err := s.fpsForLabelMatchers(cutoff, model.Latest, matchers...) + if err != nil { + return nil, err + } + for fp := range fps { + mergedFPs[fp] = struct{}{} + } + } + + res := make(model.Vector, 0, len(mergedFPs)) + for fp := range mergedFPs { + s.fpLocker.Lock(fp) + + series, ok := s.fpToSeries.get(fp) + if !ok { + // A series could have disappeared between resolving label matchers and here. + s.fpLocker.Unlock(fp) + continue + } + sp := series.lastSamplePair() + res = append(res, &model.Sample{ + Metric: series.metric, + Value: sp.Value, + Timestamp: sp.Timestamp, + }) + s.fpLocker.Unlock(fp) + } + return res, nil +} + +// boundedIterator wraps a SeriesIterator and does not allow fetching +// data from earlier than the configured start time. +type boundedIterator struct { + it SeriesIterator + start model.Time +} + +// ValueAtOrBeforeTime implements the SeriesIterator interface. +func (bit *boundedIterator) ValueAtOrBeforeTime(ts model.Time) model.SamplePair { + if ts < bit.start { + return model.ZeroSamplePair + } + return bit.it.ValueAtOrBeforeTime(ts) +} + +// RangeValues implements the SeriesIterator interface. +func (bit *boundedIterator) RangeValues(interval metric.Interval) []model.SamplePair { + if interval.NewestInclusive < bit.start { + return []model.SamplePair{} + } + if interval.OldestInclusive < bit.start { + interval.OldestInclusive = bit.start + } + return bit.it.RangeValues(interval) +} + +// Metric implements SeriesIterator. +func (bit *boundedIterator) Metric() metric.Metric { + return bit.it.Metric() +} + +// Close implements SeriesIterator. +func (bit *boundedIterator) Close() { + bit.it.Close() +} + +// QueryRange implements Storage. 
+func (s *MemorySeriesStorage) QueryRange(ctx context.Context, from, through model.Time, matchers ...*metric.LabelMatcher) ([]SeriesIterator, error) { + span, _ := opentracing.StartSpanFromContext(ctx, "QueryRange") + span.SetTag(selectorsTag, metric.LabelMatchers(matchers).String()) + span.SetTag(fromTag, int64(from)) + span.SetTag(throughTag, int64(through)) + defer span.Finish() + + if through.Before(from) { + // In that case, nothing will match. + return nil, nil + } + fpSeriesPairs, err := s.seriesForLabelMatchers(from, through, matchers...) + if err != nil { + return nil, err + } + span.SetTag(numSeries, len(fpSeriesPairs)) + iterators := make([]SeriesIterator, 0, len(fpSeriesPairs)) + for _, pair := range fpSeriesPairs { + it := s.preloadChunksForRange(pair, from, through) + iterators = append(iterators, it) + } + return iterators, nil +} + +// QueryInstant implements Storage. +func (s *MemorySeriesStorage) QueryInstant(ctx context.Context, ts model.Time, stalenessDelta time.Duration, matchers ...*metric.LabelMatcher) ([]SeriesIterator, error) { + span, _ := opentracing.StartSpanFromContext(ctx, "QueryInstant") + span.SetTag(selectorsTag, metric.LabelMatchers(matchers).String()) + span.SetTag(tsTag, ts) + defer span.Finish() + + if stalenessDelta < 0 { + panic("negative staleness delta") + } + from := ts.Add(-stalenessDelta) + through := ts + + fpSeriesPairs, err := s.seriesForLabelMatchers(from, through, matchers...) + if err != nil { + return nil, err + } + iterators := make([]SeriesIterator, 0, len(fpSeriesPairs)) + for _, pair := range fpSeriesPairs { + it := s.preloadChunksForInstant(pair, from, through) + iterators = append(iterators, it) + } + return iterators, nil +} + +// fingerprintsForLabelPair returns the fingerprints with the given +// LabelPair. If intersectWith is non-nil, the method will only return +// fingerprints that are also contained in intersectsWith. If mergeWith is +// non-nil, the found fingerprints are added to the given map. 
The returned map +// is the same as the given one. +func (s *MemorySeriesStorage) fingerprintsForLabelPair( + pair model.LabelPair, + mergeWith map[model.Fingerprint]struct{}, + intersectWith map[model.Fingerprint]struct{}, +) map[model.Fingerprint]struct{} { + if mergeWith == nil { + mergeWith = map[model.Fingerprint]struct{}{} + } + for _, fp := range s.persistence.fingerprintsForLabelPair(pair) { + if intersectWith == nil { + mergeWith[fp] = struct{}{} + continue + } + if _, ok := intersectWith[fp]; ok { + mergeWith[fp] = struct{}{} + } + } + return mergeWith +} + +// MetricsForLabelMatchers implements Storage. +func (s *MemorySeriesStorage) MetricsForLabelMatchers( + _ context.Context, + from, through model.Time, + matcherSets ...metric.LabelMatchers, +) ([]metric.Metric, error) { + fpToMetric := map[model.Fingerprint]metric.Metric{} + for _, matchers := range matcherSets { + metrics, err := s.metricsForLabelMatchers(from, through, matchers...) + if err != nil { + return nil, err + } + for fp, m := range metrics { + fpToMetric[fp] = m + } + } + + metrics := make([]metric.Metric, 0, len(fpToMetric)) + for _, m := range fpToMetric { + metrics = append(metrics, m) + } + return metrics, nil +} + +// candidateFPsForLabelMatchers returns candidate FPs for given matchers and remaining matchers to be checked. +func (s *MemorySeriesStorage) candidateFPsForLabelMatchers( + matchers ...*metric.LabelMatcher, +) (map[model.Fingerprint]struct{}, []*metric.LabelMatcher, error) { + sort.Sort(metric.LabelMatchers(matchers)) + + if len(matchers) == 0 || matchers[0].MatchesEmptyString() { + // No matchers at all or even the best matcher matches the empty string. + return nil, nil, nil + } + + var ( + matcherIdx int + candidateFPs map[model.Fingerprint]struct{} + ) + + // Equal matchers. 
+ for ; matcherIdx < len(matchers) && (candidateFPs == nil || len(candidateFPs) > fpEqualMatchThreshold); matcherIdx++ { + m := matchers[matcherIdx] + if m.Type != metric.Equal || m.MatchesEmptyString() { + break + } + candidateFPs = s.fingerprintsForLabelPair( + model.LabelPair{ + Name: m.Name, + Value: m.Value, + }, + nil, + candidateFPs, + ) + if len(candidateFPs) == 0 { + return nil, nil, nil + } + } + + // Other matchers. + for ; matcherIdx < len(matchers) && (candidateFPs == nil || len(candidateFPs) > fpOtherMatchThreshold); matcherIdx++ { + m := matchers[matcherIdx] + if m.MatchesEmptyString() { + break + } + + lvs, err := s.LabelValuesForLabelName(context.TODO(), m.Name) + if err != nil { + return nil, nil, err + } + lvs = m.Filter(lvs) + if len(lvs) == 0 { + return nil, nil, nil + } + fps := map[model.Fingerprint]struct{}{} + for _, lv := range lvs { + s.fingerprintsForLabelPair( + model.LabelPair{ + Name: m.Name, + Value: lv, + }, + fps, + candidateFPs, + ) + } + candidateFPs = fps + if len(candidateFPs) == 0 { + return nil, nil, nil + } + } + return candidateFPs, matchers[matcherIdx:], nil +} + +func (s *MemorySeriesStorage) seriesForLabelMatchers( + from, through model.Time, + matchers ...*metric.LabelMatcher, +) ([]fingerprintSeriesPair, error) { + candidateFPs, matchersToCheck, err := s.candidateFPsForLabelMatchers(matchers...) 
+ if err != nil { + return nil, err + } + + result := []fingerprintSeriesPair{} +FPLoop: + for fp := range candidateFPs { + s.fpLocker.Lock(fp) + series := s.seriesForRange(fp, from, through) + s.fpLocker.Unlock(fp) + + if series == nil { + continue FPLoop + } + + for _, m := range matchersToCheck { + if !m.Match(series.metric[m.Name]) { + continue FPLoop + } + } + result = append(result, fingerprintSeriesPair{fp, series}) + } + return result, nil +} + +func (s *MemorySeriesStorage) fpsForLabelMatchers( + from, through model.Time, + matchers ...*metric.LabelMatcher, +) (map[model.Fingerprint]struct{}, error) { + candidateFPs, matchersToCheck, err := s.candidateFPsForLabelMatchers(matchers...) + if err != nil { + return nil, err + } + +FPLoop: + for fp := range candidateFPs { + s.fpLocker.Lock(fp) + met, _, ok := s.metricForRange(fp, from, through) + s.fpLocker.Unlock(fp) + + if !ok { + delete(candidateFPs, fp) + continue FPLoop + } + + for _, m := range matchersToCheck { + if !m.Match(met[m.Name]) { + delete(candidateFPs, fp) + continue FPLoop + } + } + } + return candidateFPs, nil +} + +func (s *MemorySeriesStorage) metricsForLabelMatchers( + from, through model.Time, + matchers ...*metric.LabelMatcher, +) (map[model.Fingerprint]metric.Metric, error) { + + candidateFPs, matchersToCheck, err := s.candidateFPsForLabelMatchers(matchers...) 
+ if err != nil { + return nil, err + } + + result := map[model.Fingerprint]metric.Metric{} +FPLoop: + for fp := range candidateFPs { + s.fpLocker.Lock(fp) + met, _, ok := s.metricForRange(fp, from, through) + s.fpLocker.Unlock(fp) + + if !ok { + continue FPLoop + } + + for _, m := range matchersToCheck { + if !m.Match(met[m.Name]) { + continue FPLoop + } + } + result[fp] = metric.Metric{Metric: met} + } + return result, nil +} + +// metricForRange returns the metric for the given fingerprint if the +// corresponding time series has samples between 'from' and 'through', together +// with a pointer to the series if it is in memory already. For a series that +// does not have samples between 'from' and 'through', the returned bool is +// false. For an archived series that does contain samples between 'from' and +// 'through', it returns (metric, nil, true). +// +// The caller must have locked the fp. +func (s *MemorySeriesStorage) metricForRange( + fp model.Fingerprint, + from, through model.Time, +) (model.Metric, *memorySeries, bool) { + series, ok := s.fpToSeries.get(fp) + if ok { + if series.lastTime.Before(from) || series.firstTime().After(through) { + return nil, nil, false + } + return series.metric, series, true + } + // From here on, we are only concerned with archived metrics. + // If the high watermark of archived series is before 'from', we are done. + watermark := model.Time(atomic.LoadInt64((*int64)(&s.archiveHighWatermark))) + if watermark < from { + return nil, nil, false + } + if from.After(model.Earliest) || through.Before(model.Latest) { + // The range lookup is relatively cheap, so let's do it first if + // we have a chance the archived metric is not in the range. 
+ has, first, last := s.persistence.hasArchivedMetric(fp) + if !has { + s.nonExistentSeriesMatches.Inc() + return nil, nil, false + } + if first.After(through) || last.Before(from) { + return nil, nil, false + } + } + + metric, err := s.persistence.archivedMetric(fp) + if err != nil { + // archivedMetric has already flagged the storage as dirty in this case. + return nil, nil, false + } + return metric, nil, true +} + +// LabelValuesForLabelName implements Storage. +func (s *MemorySeriesStorage) LabelValuesForLabelName(_ context.Context, labelName model.LabelName) (model.LabelValues, error) { + return s.persistence.labelValuesForLabelName(labelName) +} + +// DropMetricsForLabelMatchers implements Storage. +func (s *MemorySeriesStorage) DropMetricsForLabelMatchers(_ context.Context, matchers ...*metric.LabelMatcher) (int, error) { + fps, err := s.fpsForLabelMatchers(model.Earliest, model.Latest, matchers...) + if err != nil { + return 0, err + } + for fp := range fps { + s.purgeSeries(fp, nil, nil) + } + return len(fps), nil +} + +var ( + // ErrOutOfOrderSample is returned if a sample has a timestamp before the latest + // timestamp in the series it is appended to. + ErrOutOfOrderSample = fmt.Errorf("sample timestamp out of order") + // ErrDuplicateSampleForTimestamp is returned if a sample has the same + // timestamp as the latest sample in the series it is appended to but a + // different value. (Appending an identical sample is a no-op and does + // not cause an error.) + ErrDuplicateSampleForTimestamp = fmt.Errorf("sample with repeated timestamp but different value") +) + +// Append implements Storage. 
+func (s *MemorySeriesStorage) Append(sample *model.Sample) error { + for ln, lv := range sample.Metric { + if len(lv) == 0 { + delete(sample.Metric, ln) + } + } + rawFP := sample.Metric.FastFingerprint() + s.fpLocker.Lock(rawFP) + fp := s.mapper.mapFP(rawFP, sample.Metric) + defer func() { + s.fpLocker.Unlock(fp) + }() // Func wrapper because fp might change below. + if fp != rawFP { + // Switch locks. + s.fpLocker.Unlock(rawFP) + s.fpLocker.Lock(fp) + } + series, err := s.getOrCreateSeries(fp, sample.Metric) + if err != nil { + return err // getOrCreateSeries took care of quarantining already. + } + + if sample.Timestamp == series.lastTime { + // Don't report "no-op appends", i.e. where timestamp and sample + // value are the same as for the last append, as they are a + // common occurrence when using client-side timestamps + // (e.g. Pushgateway or federation). + if sample.Timestamp == series.lastTime && + series.lastSampleValueSet && + sample.Value.Equal(series.lastSampleValue) { + return nil + } + s.discardedSamples.WithLabelValues(duplicateSample).Inc() + return ErrDuplicateSampleForTimestamp // Caused by the caller. + } + if sample.Timestamp < series.lastTime { + s.discardedSamples.WithLabelValues(outOfOrderTimestamp).Inc() + return ErrOutOfOrderSample // Caused by the caller. + } + headChunkWasClosed := series.headChunkClosed + completedChunksCount, err := series.add(model.SamplePair{ + Value: sample.Value, + Timestamp: sample.Timestamp, + }) + if err != nil { + s.quarantineSeries(fp, sample.Metric, err) + return err + } + if headChunkWasClosed { + // Appending to a series with a closed head chunk creates an + // additional open head chunk. + s.headChunks.Inc() + } + s.ingestedSamples.Inc() + s.incNumChunksToPersist(completedChunksCount) + + return nil +} + +// NeedsThrottling implements Storage. 
+func (s *MemorySeriesStorage) NeedsThrottling() bool {
+	if score, _ := s.getPersistenceUrgencyScore(); score >= 1 {
+		select {
+		case s.throttled <- struct{}{}:
+		default: // Do nothing, signal already pending.
+		}
+		return true
+	}
+	return false
+}
+
+// logThrottling handles logging of throttled events and has to be started as a
+// goroutine. It stops once s.loopStopping is closed.
+//
+// Logging strategy: Whenever NeedsThrottling() is called and returns true, a signal
+// is sent to s.throttled. If that happens for the first time, an Error is
+// logged that the storage is now throttled. As long as signals continue to be
+// sent via s.throttled at least once per minute, nothing else is logged. Once
+// no signal has arrived for a minute, an Info is logged that the storage is not
+// throttled anymore. This resets things to the initial state, i.e. once a
+// signal arrives again, the Error will be logged again.
+func (s *MemorySeriesStorage) logThrottling() {
+	timer := time.NewTimer(time.Minute)
+	timer.Stop()
+
+	// Signal exit of the goroutine. Currently only needed by test code.
+	defer close(s.logThrottlingStopped)
+
+	for {
+		select {
+		case <-s.throttled:
+			if !timer.Stop() {
+				select {
+				case <-timer.C:
+				default:
+				}
+				score, _ := s.getPersistenceUrgencyScore()
+				log.
+					With("urgencyScore", score).
+					With("chunksToPersist", s.getNumChunksToPersist()).
+					With("memoryChunks", atomic.LoadInt64(&chunk.NumMemChunks)).
+					Error("Storage needs throttling. Scrapes and rule evaluations will be skipped.")
+			}
+			timer.Reset(time.Minute)
+		case <-timer.C:
+			score, _ := s.getPersistenceUrgencyScore()
+			log.
+				With("urgencyScore", score).
+				With("chunksToPersist", s.getNumChunksToPersist()).
+				With("memoryChunks", atomic.LoadInt64(&chunk.NumMemChunks)).
+ Info("Storage does not need throttling anymore.") + case <-s.loopStopping: + return + } + } +} + +func (s *MemorySeriesStorage) getOrCreateSeries(fp model.Fingerprint, m model.Metric) (*memorySeries, error) { + series, ok := s.fpToSeries.get(fp) + if !ok { + var cds []*chunk.Desc + var modTime time.Time + unarchived, err := s.persistence.unarchiveMetric(fp) + if err != nil { + log.Errorf("Error unarchiving fingerprint %v (metric %v): %v", fp, m, err) + return nil, err + } + if unarchived { + s.seriesOps.WithLabelValues(unarchive).Inc() + // We have to load chunk.Descs anyway to do anything with + // the series, so let's do it right now so that we don't + // end up with a series without any chunk.Descs for a + // while (which is confusing as it makes the series + // appear as archived or purged). + cds, err = s.loadChunkDescs(fp, 0) + if err == nil && len(cds) == 0 { + err = fmt.Errorf("unarchived fingerprint %v (metric %v) has no chunks on disk", fp, m) + } + if err != nil { + s.quarantineSeries(fp, m, err) + return nil, err + } + modTime = s.persistence.seriesFileModTime(fp) + } else { + // This was a genuinely new series, so index the metric. + s.persistence.indexMetric(fp, m) + s.seriesOps.WithLabelValues(create).Inc() + } + series, err = newMemorySeries(m, cds, modTime) + if err != nil { + s.quarantineSeries(fp, m, err) + return nil, err + } + s.fpToSeries.put(fp, series) + s.memorySeries.Inc() + if !series.headChunkClosed { + s.headChunks.Inc() + } + } + return series, nil +} + +// seriesForRange is a helper method for seriesForLabelMatchers. +// +// The caller must have locked the fp. +func (s *MemorySeriesStorage) seriesForRange( + fp model.Fingerprint, + from model.Time, through model.Time, +) *memorySeries { + metric, series, ok := s.metricForRange(fp, from, through) + if !ok { + return nil + } + if series == nil { + series, _ = s.getOrCreateSeries(fp, metric) + // getOrCreateSeries took care of quarantining already, so ignore the error. 
+	}
+	return series
+}
+
+func (s *MemorySeriesStorage) preloadChunksForRange(
+	pair fingerprintSeriesPair,
+	from model.Time, through model.Time,
+) SeriesIterator {
+	fp, series := pair.fp, pair.series
+	if series == nil {
+		return nopIter
+	}
+
+	s.fpLocker.Lock(fp)
+	defer s.fpLocker.Unlock(fp)
+
+	iter, err := series.preloadChunksForRange(fp, from, through, s)
+	if err != nil {
+		s.quarantineSeries(fp, series.metric, err)
+		return nopIter
+	}
+	return iter
+}
+
+func (s *MemorySeriesStorage) preloadChunksForInstant(
+	pair fingerprintSeriesPair,
+	from model.Time, through model.Time,
+) SeriesIterator {
+	fp, series := pair.fp, pair.series
+	if series == nil {
+		return nopIter
+	}
+
+	s.fpLocker.Lock(fp)
+	defer s.fpLocker.Unlock(fp)
+
+	iter, err := series.preloadChunksForInstant(fp, from, through, s)
+	if err != nil {
+		s.quarantineSeries(fp, series.metric, err)
+		return nopIter
+	}
+	return iter
+}
+
+func (s *MemorySeriesStorage) handleEvictList() {
+	// This ticker is supposed to tick at least once per GC cycle. Ideally,
+	// we would handle the evict list after each finished GC cycle, but I
+	// don't know of a way to "subscribe" to that kind of event.
+	ticker := time.NewTicker(evictInterval)
+
+	for {
+		select {
+		case req := <-s.evictRequests:
+			if req.Evict {
+				req.Desc.EvictListElement = s.evictList.PushBack(req.Desc)
+			} else {
+				if req.Desc.EvictListElement != nil {
+					s.evictList.Remove(req.Desc.EvictListElement)
+					req.Desc.EvictListElement = nil
+				}
+			}
+		case <-ticker.C:
+			s.maybeEvict()
+		case <-s.evictStopping:
+			// Drain evictRequests forever in a goroutine to not let
+			// requesters hang.
+			go func() {
+				for {
+					<-s.evictRequests
+				}
+			}()
+			ticker.Stop()
+			log.Info("Chunk eviction stopped.")
+			close(s.evictStopped)
+			return
+		}
+	}
+}
+
+// maybeEvict is a local helper method. Must only be called by handleEvictList.
+func (s *MemorySeriesStorage) maybeEvict() { + ms := runtime.MemStats{} + runtime.ReadMemStats(&ms) + numChunksToEvict := s.calculatePersistUrgency(&ms) + + if numChunksToEvict <= 0 { + return + } + + chunkDescsToEvict := make([]*chunk.Desc, numChunksToEvict) + for i := range chunkDescsToEvict { + e := s.evictList.Front() + if e == nil { + break + } + cd := e.Value.(*chunk.Desc) + cd.EvictListElement = nil + chunkDescsToEvict[i] = cd + s.evictList.Remove(e) + } + // Do the actual eviction in a goroutine as we might otherwise deadlock, + // in the following way: A chunk was Unpinned completely and therefore + // scheduled for eviction. At the time we actually try to evict it, + // another goroutine is pinning the chunk. The pinning goroutine has + // currently locked the chunk and tries to send the evict request (to + // remove the chunk from the evict list) to the evictRequests + // channel. The send blocks because evictRequests is full. However, the + // goroutine that is supposed to empty the channel is waiting for the + // Chunk.Desc lock to try to evict the chunk. + go func() { + for _, cd := range chunkDescsToEvict { + if cd == nil { + break + } + cd.MaybeEvict() + // We don't care if the eviction succeeds. If the chunk + // was pinned in the meantime, it will be added to the + // evict list once it gets Unpinned again. + } + }() +} + +// calculatePersistUrgency calculates and sets s.persistUrgency. Based on the +// calculation, it returns the number of chunks to evict. The runtime.MemStats +// are passed in here for testability. +// +// The persist urgency is calculated by the following formula: +// +// n(toPersist) MAX( h(nextGC), h(current) ) +// p = MIN( 1, --------------------------- * ---------------------------- ) +// n(toPersist) + n(evictable) h(target) +// +// where: +// +// n(toPersist): Number of chunks waiting for persistence. +// n(evictable): Number of evictable chunks. +// h(nextGC): Heap size at which the next GC will kick in (ms.NextGC). 
+// h(current): Current heap size (ms.HeapAlloc). +// h(target): Configured target heap size. +// +// Note that the actual value stored in s.persistUrgency is 1000 times the value +// calculated as above to allow using an int32, which supports atomic +// operations. +// +// If no GC has run after the last call of this method, it will always return 0 +// (no reason to try to evict any more chunks before we have seen the effect of +// the previous eviction). It will also not decrease the persist urgency in this +// case (but it will increase the persist urgency if a higher value was calculated). +// +// If a GC has run after the last call of this method, the following cases apply: +// +// - If MAX( h(nextGC), h(current) ) < h(target), simply return 0. Nothing to +// evict if the heap is still small enough. +// +// - Otherwise, if n(evictable) is 0, also return 0, but set the urgency score +// to 1 to signal that we want to evict chunk but have no evictable chunks +// available. +// +// - Otherwise, calculate the number of chunks to evict and return it: +// +// MAX( h(nextGC), h(current) ) - h(target) +// n(toEvict) = MIN( n(evictable), ---------------------------------------- ) +// c +// +// where c is the size of a chunk. +// +// - In the latter case, the persist urgency might be increased. The final value +// is the following: +// +// n(toEvict) +// MAX( p, ------------ ) +// n(evictable) +// +// Broadly speaking, the persist urgency is based on the ratio of the number of +// chunks we want to evict and the number of chunks that are actually +// evictable. However, in particular for the case where we don't need to evict +// chunks yet, it also takes into account how close the heap has already grown +// to the configured target size, and how big the pool of chunks to persist is +// compared to the number of chunks already evictable. +// +// This is a helper method only to be called by MemorySeriesStorage.maybeEvict. 
+func (s *MemorySeriesStorage) calculatePersistUrgency(ms *runtime.MemStats) int { + var ( + oldUrgency = atomic.LoadInt32(&s.persistUrgency) + newUrgency int32 + numChunksToPersist = s.getNumChunksToPersist() + ) + defer func() { + if newUrgency > 1000 { + newUrgency = 1000 + } + atomic.StoreInt32(&s.persistUrgency, newUrgency) + }() + + // Take the NextGC as the relevant heap size because the heap will grow + // to that size before GC kicks in. However, at times the current heap + // is already larger than NextGC, in which case we take that worse case. + heapSize := ms.NextGC + if ms.HeapAlloc > ms.NextGC { + heapSize = ms.HeapAlloc + } + + if numChunksToPersist > 0 { + newUrgency = int32(1000 * uint64(numChunksToPersist) / uint64(numChunksToPersist+s.evictList.Len()) * heapSize / s.targetHeapSize) + } + + // Only continue if a GC has happened since we were here last time. + if ms.NumGC == s.lastNumGC { + if oldUrgency > newUrgency { + // Never reduce urgency without a GC run. + newUrgency = oldUrgency + } + return 0 + } + s.lastNumGC = ms.NumGC + + if heapSize <= s.targetHeapSize { + return 0 // Heap still small enough, don't evict. + } + if s.evictList.Len() == 0 { + // We want to reduce heap size but there is nothing to evict. + newUrgency = 1000 + return 0 + } + numChunksToEvict := int((heapSize - s.targetHeapSize) / chunk.ChunkLen) + if numChunksToEvict > s.evictList.Len() { + numChunksToEvict = s.evictList.Len() + } + if u := int32(numChunksToEvict * 1000 / s.evictList.Len()); u > newUrgency { + newUrgency = u + } + return numChunksToEvict +} + +// waitForNextFP waits an estimated duration, after which we want to process +// another fingerprint so that we will process all fingerprints in a tenth of +// s.dropAfter assuming that the system is doing nothing else, e.g. if we want +// to drop chunks after 40h, we want to cycle through all fingerprints within +// 4h. The estimation is based on the total number of fingerprints as passed +// in. 
However, the maximum sweep time is capped at fpMaxSweepTime. Also, the +// method will never wait for longer than fpMaxWaitDuration. +// +// The maxWaitDurationFactor can be used to reduce the waiting time if a faster +// processing is required (for example because unpersisted chunks pile up too +// much). +// +// Normally, the method returns true once the wait duration has passed. However, +// if s.loopStopped is closed, it will return false immediately. +func (s *MemorySeriesStorage) waitForNextFP(numberOfFPs int, maxWaitDurationFactor float64) bool { + d := fpMaxWaitDuration + if numberOfFPs != 0 { + sweepTime := s.dropAfter / 10 + if sweepTime > fpMaxSweepTime { + sweepTime = fpMaxSweepTime + } + calculatedWait := time.Duration(float64(sweepTime) / float64(numberOfFPs) * maxWaitDurationFactor) + if calculatedWait < d { + d = calculatedWait + } + } + if d == 0 { + return true + } + t := time.NewTimer(d) + select { + case <-t.C: + return true + case <-s.loopStopping: + return false + } +} + +// cycleThroughMemoryFingerprints returns a channel that emits fingerprints for +// series in memory in a throttled fashion. It continues to cycle through all +// fingerprints in memory until s.loopStopping is closed. +func (s *MemorySeriesStorage) cycleThroughMemoryFingerprints() chan model.Fingerprint { + memoryFingerprints := make(chan model.Fingerprint) + go func() { + defer close(memoryFingerprints) + firstPass := true + + for { + // Initial wait, also important if there are no FPs yet. + if !s.waitForNextFP(s.fpToSeries.length(), 1) { + return + } + begin := time.Now() + fps := s.fpToSeries.sortedFPs() + if firstPass && len(fps) > 0 { + // Start first pass at a random location in the + // key space to cover the whole key space even + // in the case of frequent restarts. 
+ fps = fps[rand.Intn(len(fps)):] + } + count := 0 + for _, fp := range fps { + select { + case memoryFingerprints <- fp: + case <-s.loopStopping: + return + } + // Reduce the wait time according to the urgency score. + score, rushed := s.getPersistenceUrgencyScore() + if rushed { + score = 1 + } + s.waitForNextFP(s.fpToSeries.length(), 1-score) + count++ + } + if count > 0 { + msg := "full" + if firstPass { + msg = "initial partial" + } + log.Infof( + "Completed %s maintenance sweep through %d in-memory fingerprints in %v.", + msg, count, time.Since(begin), + ) + } + firstPass = false + } + }() + + return memoryFingerprints +} + +// cycleThroughArchivedFingerprints returns a channel that emits fingerprints +// for archived series in a throttled fashion. It continues to cycle through all +// archived fingerprints until s.loopStopping is closed. +func (s *MemorySeriesStorage) cycleThroughArchivedFingerprints() chan model.Fingerprint { + archivedFingerprints := make(chan model.Fingerprint) + go func() { + defer close(archivedFingerprints) + + for { + archivedFPs, err := s.persistence.fingerprintsModifiedBefore( + model.Now().Add(-s.dropAfter), + ) + if err != nil { + log.Error("Failed to lookup archived fingerprint ranges: ", err) + s.waitForNextFP(0, 1) + continue + } + // Initial wait, also important if there are no FPs yet. + if !s.waitForNextFP(len(archivedFPs), 1) { + return + } + begin := time.Now() + for _, fp := range archivedFPs { + select { + case archivedFingerprints <- fp: + case <-s.loopStopping: + return + } + // Never speed up maintenance of archived FPs. 
+ s.waitForNextFP(len(archivedFPs), 1) + } + if len(archivedFPs) > 0 { + log.Infof( + "Completed maintenance sweep through %d archived fingerprints in %v.", + len(archivedFPs), time.Since(begin), + ) + } + } + }() + return archivedFingerprints +} + +func (s *MemorySeriesStorage) loop() { + checkpointTimer := time.NewTimer(s.checkpointInterval) + checkpointMinTimer := time.NewTimer(0) + + var dirtySeriesCount int64 + + defer func() { + checkpointTimer.Stop() + checkpointMinTimer.Stop() + log.Info("Maintenance loop stopped.") + close(s.loopStopped) + }() + + memoryFingerprints := s.cycleThroughMemoryFingerprints() + archivedFingerprints := s.cycleThroughArchivedFingerprints() + + checkpointCtx, checkpointCancel := context.WithCancel(context.Background()) + checkpointNow := make(chan struct{}, 1) + + doCheckpoint := func() time.Duration { + start := time.Now() + // We clear this before the checkpoint so that dirtySeriesCount + // is an upper bound. + atomic.StoreInt64(&dirtySeriesCount, 0) + s.dirtySeries.Set(0) + select { + case <-checkpointNow: + // Signal cleared. + default: + // No signal pending. + } + err := s.persistence.checkpointSeriesMapAndHeads( + checkpointCtx, s.fpToSeries, s.fpLocker, + ) + if err == context.Canceled { + log.Info("Checkpoint canceled.") + } else if err != nil { + s.persistErrors.Inc() + log.Errorln("Error while checkpointing:", err) + } + return time.Since(start) + } + + // Checkpoints can happen concurrently with maintenance so even with heavy + // checkpointing there will still be sufficient progress on maintenance. 
+ checkpointLoopStopped := make(chan struct{}) + go func() { + for { + select { + case <-checkpointCtx.Done(): + checkpointLoopStopped <- struct{}{} + return + case <-checkpointMinTimer.C: + var took time.Duration + select { + case <-checkpointCtx.Done(): + checkpointLoopStopped <- struct{}{} + return + case <-checkpointTimer.C: + took = doCheckpoint() + case <-checkpointNow: + if !checkpointTimer.Stop() { + <-checkpointTimer.C + } + took = doCheckpoint() + } + checkpointMinTimer.Reset(took) + checkpointTimer.Reset(s.checkpointInterval) + } + } + }() + +loop: + for { + select { + case <-s.loopStopping: + checkpointCancel() + break loop + case fp := <-memoryFingerprints: + if s.maintainMemorySeries(fp, model.Now().Add(-s.dropAfter)) { + dirty := atomic.AddInt64(&dirtySeriesCount, 1) + s.dirtySeries.Set(float64(dirty)) + // Check if we have enough "dirty" series so that we need an early checkpoint. + // However, if we are already behind persisting chunks, creating a checkpoint + // would be counterproductive, as it would slow down chunk persisting even more, + // while in a situation like that, where we are clearly lacking speed of disk + // maintenance, the best we can do for crash recovery is to persist chunks as + // quickly as possible. So only checkpoint if we are not in rushed mode. + if _, rushed := s.getPersistenceUrgencyScore(); !rushed && + dirty >= int64(s.checkpointDirtySeriesLimit) { + select { + case checkpointNow <- struct{}{}: + // Signal sent. + default: + // Signal already pending. + } + } + } + case fp := <-archivedFingerprints: + s.maintainArchivedSeries(fp, model.Now().Add(-s.dropAfter)) + } + } + // Wait until both channels are closed. + for range memoryFingerprints { + } + for range archivedFingerprints { + } + <-checkpointLoopStopped +} + +// maintainMemorySeries maintains a series that is in memory (i.e. not +// archived). It returns true if the method has changed from clean to dirty +// (i.e. 
it is inconsistent with the latest checkpoint now so that in case of a +// crash a recovery operation that requires a disk seek needed to be applied). +// +// The method first closes the head chunk if it was not touched for the duration +// of headChunkTimeout. +// +// Then it determines the chunks that need to be purged and the chunks that need +// to be persisted. Depending on the result, it does the following: +// +// - If all chunks of a series need to be purged, the whole series is deleted +// for good and the method returns false. (Detecting non-existence of a series +// file does not require a disk seek.) +// +// - If any chunks need to be purged (but not all of them), it purges those +// chunks from memory and rewrites the series file on disk, leaving out the +// purged chunks and appending all chunks not yet persisted (with the exception +// of a still open head chunk). +// +// - If no chunks on disk need to be purged, but chunks need to be persisted, +// those chunks are simply appended to the existing series file (or the file is +// created if it does not exist yet). +// +// - If no chunks need to be purged and no chunks need to be persisted, nothing +// happens in this step. +// +// Next, the method checks if all chunks in the series are evicted. In that +// case, it archives the series and returns true. +// +// Finally, it evicts chunk.Descs if there are too many. +func (s *MemorySeriesStorage) maintainMemorySeries( + fp model.Fingerprint, beforeTime model.Time, +) (becameDirty bool) { + defer func(begin time.Time) { + s.maintainSeriesDuration.WithLabelValues(maintainInMemory).Observe( + time.Since(begin).Seconds(), + ) + }(time.Now()) + + s.fpLocker.Lock(fp) + defer s.fpLocker.Unlock(fp) + + series, ok := s.fpToSeries.get(fp) + if !ok { + // Series is actually not in memory, perhaps archived or dropped in the meantime. 
+ return false + } + + defer s.seriesOps.WithLabelValues(memoryMaintenance).Inc() + + closed, err := series.maybeCloseHeadChunk(s.headChunkTimeout) + if err != nil { + s.quarantineSeries(fp, series.metric, err) + s.persistErrors.Inc() + } + if closed { + s.incNumChunksToPersist(1) + s.headChunks.Dec() + } + + seriesWasDirty := series.dirty + + if s.writeMemorySeries(fp, series, beforeTime) { + // Series is gone now, we are done. + return false + } + + iOldestNotEvicted := -1 + for i, cd := range series.chunkDescs { + if !cd.IsEvicted() { + iOldestNotEvicted = i + break + } + } + + // Archive if all chunks are evicted. Also make sure the last sample has + // an age of at least headChunkTimeout (which is very likely anyway). + if iOldestNotEvicted == -1 && model.Now().Sub(series.lastTime) > s.headChunkTimeout { + s.fpToSeries.del(fp) + s.memorySeries.Dec() + s.persistence.archiveMetric(fp, series.metric, series.firstTime(), series.lastTime) + s.seriesOps.WithLabelValues(archive).Inc() + oldWatermark := atomic.LoadInt64((*int64)(&s.archiveHighWatermark)) + if oldWatermark < int64(series.lastTime) { + if !atomic.CompareAndSwapInt64( + (*int64)(&s.archiveHighWatermark), + oldWatermark, int64(series.lastTime), + ) { + panic("s.archiveHighWatermark modified outside of maintainMemorySeries") + } + } + return + } + // If we are here, the series is not archived, so check for chunk.Desc + // eviction next. + series.evictChunkDescs(iOldestNotEvicted) + + return series.dirty && !seriesWasDirty +} + +// writeMemorySeries (re-)writes a memory series file. While doing so, it drops +// chunks older than beforeTime from both the series file (if it exists) as well +// as from memory. The provided chunksToPersist are appended to the newly +// written series file. If no chunks need to be purged, but chunksToPersist is +// not empty, those chunks are simply appended to the series file. If the series +// contains no chunks after dropping old chunks, it is purged entirely. 
In that +// case, the method returns true. +// +// If a persist error is encountered, the series is queued for quarantine. In +// that case, the method returns true, too, because the series should not be +// processed anymore (even if it will only be gone for real once quarantining +// has been completed). +// +// The caller must have locked the fp. +func (s *MemorySeriesStorage) writeMemorySeries( + fp model.Fingerprint, series *memorySeries, beforeTime model.Time, +) bool { + var ( + persistErr error + cds = series.chunksToPersist() + ) + + defer func() { + if persistErr != nil { + s.quarantineSeries(fp, series.metric, persistErr) + s.persistErrors.Inc() + } + // The following is done even in case of an error to ensure + // correct counter bookkeeping and to not pin chunks in memory + // that belong to a series that is scheduled for quarantine + // anyway. + for _, cd := range cds { + cd.Unpin(s.evictRequests) + } + s.incNumChunksToPersist(-len(cds)) + chunk.Ops.WithLabelValues(chunk.PersistAndUnpin).Add(float64(len(cds))) + series.modTime = s.persistence.seriesFileModTime(fp) + }() + + // Get the actual chunks from underneath the chunk.Descs. + // No lock required as chunks still to persist cannot be evicted. + chunks := make([]chunk.Chunk, len(cds)) + for i, cd := range cds { + chunks[i] = cd.C + } + + if !series.firstTime().Before(beforeTime) { + // Oldest sample not old enough, just append chunks, if any. + if len(cds) == 0 { + return false + } + var offset int + offset, persistErr = s.persistence.persistChunks(fp, chunks) + if persistErr != nil { + return true + } + if series.chunkDescsOffset == -1 { + // This is the first chunk persisted for a newly created + // series that had prior chunks on disk. Finally, we can + // set the chunkDescsOffset. 
+ series.chunkDescsOffset = offset + } + return false + } + + newFirstTime, offset, numDroppedFromPersistence, allDroppedFromPersistence, persistErr := + s.persistence.dropAndPersistChunks(fp, beforeTime, chunks) + if persistErr != nil { + return true + } + if persistErr = series.dropChunks(beforeTime); persistErr != nil { + return true + } + if len(series.chunkDescs) == 0 && allDroppedFromPersistence { + // All chunks dropped from both memory and persistence. Delete the series for good. + s.fpToSeries.del(fp) + s.memorySeries.Dec() + s.seriesOps.WithLabelValues(memoryPurge).Inc() + s.persistence.unindexMetric(fp, series.metric) + return true + } + series.savedFirstTime = newFirstTime + if series.chunkDescsOffset == -1 { + series.chunkDescsOffset = offset + } else { + series.chunkDescsOffset -= numDroppedFromPersistence + if series.chunkDescsOffset < 0 { + persistErr = errors.New("dropped more chunks from persistence than from memory") + series.chunkDescsOffset = 0 + return true + } + } + return false +} + +// maintainArchivedSeries drops chunks older than beforeTime from an archived +// series. If the series contains no chunks after that, it is purged entirely. +func (s *MemorySeriesStorage) maintainArchivedSeries(fp model.Fingerprint, beforeTime model.Time) { + defer func(begin time.Time) { + s.maintainSeriesDuration.WithLabelValues(maintainArchived).Observe( + time.Since(begin).Seconds(), + ) + }(time.Now()) + + s.fpLocker.Lock(fp) + defer s.fpLocker.Unlock(fp) + + has, firstTime, lastTime := s.persistence.hasArchivedMetric(fp) + if !has || !firstTime.Before(beforeTime) { + // Oldest sample not old enough, or metric purged or unarchived in the meantime. + return + } + + defer s.seriesOps.WithLabelValues(archiveMaintenance).Inc() + + newFirstTime, _, _, allDropped, err := s.persistence.dropAndPersistChunks(fp, beforeTime, nil) + if err != nil { + // TODO(beorn7): Should quarantine the series. 
+ s.persistErrors.Inc() + log.Error("Error dropping persisted chunks: ", err) + } + if allDropped { + if err := s.persistence.purgeArchivedMetric(fp); err != nil { + s.persistErrors.Inc() + // purgeArchivedMetric logs the error already. + } + s.seriesOps.WithLabelValues(archivePurge).Inc() + return + } + if err := s.persistence.updateArchivedTimeRange(fp, newFirstTime, lastTime); err != nil { + s.persistErrors.Inc() + log.Errorf("Error updating archived time range for fingerprint %v: %s", fp, err) + } +} + +// See persistence.loadChunks for detailed explanation. +func (s *MemorySeriesStorage) loadChunks(fp model.Fingerprint, indexes []int, indexOffset int) ([]chunk.Chunk, error) { + return s.persistence.loadChunks(fp, indexes, indexOffset) +} + +// See persistence.loadChunkDescs for detailed explanation. +func (s *MemorySeriesStorage) loadChunkDescs(fp model.Fingerprint, offsetFromEnd int) ([]*chunk.Desc, error) { + return s.persistence.loadChunkDescs(fp, offsetFromEnd) +} + +// getNumChunksToPersist returns chunksToPersist in a goroutine-safe way. +func (s *MemorySeriesStorage) getNumChunksToPersist() int { + return int(atomic.LoadInt64(&s.numChunksToPersist)) +} + +// incNumChunksToPersist increments chunksToPersist in a goroutine-safe way. Use a +// negative 'by' to decrement. +func (s *MemorySeriesStorage) incNumChunksToPersist(by int) { + atomic.AddInt64(&s.numChunksToPersist, int64(by)) + if by > 0 { + s.queuedChunksToPersist.Add(float64(by)) + } +} + +// getPersistenceUrgencyScore returns an urgency score for the speed of +// persisting chunks. The score is between 0 and 1, where 0 means no urgency at +// all and 1 means highest urgency. It also returns if the storage is in +// "rushed mode". +// +// The storage enters "rushed mode" if the score exceeds +// persintenceUrgencyScoreForEnteringRushedMode at the time this method is +// called. 
It will leave "rushed mode" if, at a later time this method is +// called, the score is below persintenceUrgencyScoreForLeavingRushedMode. +// "Rushed mode" plays a role for the adaptive series-sync-strategy. It also +// switches off early checkpointing (due to dirty series), and it makes series +// maintenance happen as quickly as possible. +// +// A score of 1 will trigger throttling of sample ingestion. +// +// It is safe to call this method concurrently. +func (s *MemorySeriesStorage) getPersistenceUrgencyScore() (float64, bool) { + s.rushedMtx.Lock() + defer s.rushedMtx.Unlock() + + score := float64(atomic.LoadInt32(&s.persistUrgency)) / 1000 + if score > 1 { + score = 1 + } + + if s.rushed { + // We are already in rushed mode. If the score is still above + // persintenceUrgencyScoreForLeavingRushedMode, return the score + // and leave things as they are. + if score > persintenceUrgencyScoreForLeavingRushedMode { + return score, true + } + // We are out of rushed mode! + s.rushed = false + log. + With("urgencyScore", score). + With("chunksToPersist", s.getNumChunksToPersist()). + With("memoryChunks", atomic.LoadInt64(&chunk.NumMemChunks)). + Info("Storage has left rushed mode.") + return score, false + } + if score > persintenceUrgencyScoreForEnteringRushedMode { + // Enter rushed mode. + s.rushed = true + log. + With("urgencyScore", score). + With("chunksToPersist", s.getNumChunksToPersist()). + With("memoryChunks", atomic.LoadInt64(&chunk.NumMemChunks)). + Warn("Storage has entered rushed mode.") + } + return score, s.rushed +} + +// quarantineSeries registers the provided fingerprint for quarantining. It +// always returns immediately. Quarantine requests are processed +// asynchronously. If there are too many requests queued, they are simply +// dropped. +// +// Quarantining means that the series file is moved to the orphaned directory, +// and all its traces are removed from indices. 
Call this method if an +// unrecoverable error is detected while dealing with a series, and pass in the +// encountered error. It will be saved as a hint in the orphaned directory. +func (s *MemorySeriesStorage) quarantineSeries(fp model.Fingerprint, metric model.Metric, err error) { + req := quarantineRequest{fp: fp, metric: metric, reason: err} + select { + case s.quarantineRequests <- req: + // Request submitted. + default: + log. + With("fingerprint", fp). + With("metric", metric). + With("reason", err). + Warn("Quarantine queue full. Dropped quarantine request.") + s.seriesOps.WithLabelValues(droppedQuarantine).Inc() + } +} + +func (s *MemorySeriesStorage) handleQuarantine() { + for { + select { + case req := <-s.quarantineRequests: + s.purgeSeries(req.fp, req.metric, req.reason) + log. + With("fingerprint", req.fp). + With("metric", req.metric). + With("reason", req.reason). + Warn("Series quarantined.") + case <-s.quarantineStopping: + log.Info("Series quarantining stopped.") + close(s.quarantineStopped) + return + } + } + +} + +// purgeSeries removes all traces of a series. If a non-nil quarantine reason is +// provided, the series file will not be deleted completely, but moved to the +// orphaned directory with the reason and the metric in a hint file. The +// provided metric might be nil if unknown. +func (s *MemorySeriesStorage) purgeSeries(fp model.Fingerprint, m model.Metric, quarantineReason error) { + s.fpLocker.Lock(fp) + + var ( + series *memorySeries + ok bool + ) + + if series, ok = s.fpToSeries.get(fp); ok { + s.fpToSeries.del(fp) + s.memorySeries.Dec() + m = series.metric + + // Adjust s.chunksToPersist and chunk.NumMemChunks down by + // the number of chunks in this series that are not + // persisted yet. Persisted chunks will be deducted from + // chunk.NumMemChunks upon eviction. 
+ numChunksNotYetPersisted := len(series.chunkDescs) - series.persistWatermark + atomic.AddInt64(&chunk.NumMemChunks, int64(-numChunksNotYetPersisted)) + if !series.headChunkClosed { + // Head chunk wasn't counted as waiting for persistence yet. + // (But it was counted as a chunk in memory.) + numChunksNotYetPersisted-- + } + s.incNumChunksToPersist(-numChunksNotYetPersisted) + + } else { + s.persistence.purgeArchivedMetric(fp) // Ignoring error. There is nothing we can do. + } + if m != nil { + // If we know a metric now, unindex it in any case. + // purgeArchivedMetric might have done so already, but we cannot + // be sure. Unindexing in idempotent, though. + s.persistence.unindexMetric(fp, m) + } + // Attempt to delete/quarantine the series file in any case. + if quarantineReason == nil { + // No reason stated, simply delete the file. + if _, err := s.persistence.deleteSeriesFile(fp); err != nil { + log. + With("fingerprint", fp). + With("metric", m). + With("error", err). + Error("Error deleting series file.") + } + s.seriesOps.WithLabelValues(requestedPurge).Inc() + } else { + if err := s.persistence.quarantineSeriesFile(fp, quarantineReason, m); err == nil { + s.seriesOps.WithLabelValues(completedQurantine).Inc() + } else { + s.seriesOps.WithLabelValues(failedQuarantine).Inc() + log. + With("fingerprint", fp). + With("metric", m). + With("reason", quarantineReason). + With("error", err). + Error("Error quarantining series file.") + } + } + + s.fpLocker.Unlock(fp) +} + +// Describe implements prometheus.Collector. 
+func (s *MemorySeriesStorage) Describe(ch chan<- *prometheus.Desc) { + s.persistence.Describe(ch) + s.mapper.Describe(ch) + + ch <- s.persistErrors.Desc() + ch <- s.queuedChunksToPersist.Desc() + ch <- s.chunksToPersist.Desc() + ch <- s.memorySeries.Desc() + ch <- s.headChunks.Desc() + ch <- s.dirtySeries.Desc() + s.seriesOps.Describe(ch) + ch <- s.ingestedSamples.Desc() + s.discardedSamples.Describe(ch) + ch <- s.nonExistentSeriesMatches.Desc() + ch <- s.memChunks.Desc() + s.maintainSeriesDuration.Describe(ch) + ch <- s.persistenceUrgencyScore.Desc() + ch <- s.rushedMode.Desc() + ch <- s.targetHeapSizeBytes.Desc() +} + +// Collect implements prometheus.Collector. +func (s *MemorySeriesStorage) Collect(ch chan<- prometheus.Metric) { + s.persistence.Collect(ch) + s.mapper.Collect(ch) + + ch <- s.persistErrors + ch <- s.queuedChunksToPersist + ch <- s.chunksToPersist + ch <- s.memorySeries + ch <- s.headChunks + ch <- s.dirtySeries + s.seriesOps.Collect(ch) + ch <- s.ingestedSamples + s.discardedSamples.Collect(ch) + ch <- s.nonExistentSeriesMatches + ch <- s.memChunks + s.maintainSeriesDuration.Collect(ch) + ch <- s.persistenceUrgencyScore + ch <- s.rushedMode + ch <- s.targetHeapSizeBytes +} diff --git a/vendor/github.com/prometheus/prometheus/storage/local/test_helpers.go b/vendor/github.com/prometheus/prometheus/storage/local/test_helpers.go new file mode 100644 index 000000000..7b6cc51e6 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/local/test_helpers.go @@ -0,0 +1,72 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// NOTE ON FILENAME: Do not rename this file helpers_test.go (which might appear +// an obvious choice). We need NewTestStorage in tests outside of the local +// package, too. On the other hand, moving NewTestStorage in its own package +// would cause circular dependencies in the tests in packages local. + +package local + +import ( + "time" + + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/storage/local/chunk" + "github.com/prometheus/prometheus/util/testutil" +) + +type testStorageCloser struct { + storage Storage + directory testutil.Closer +} + +func (t *testStorageCloser) Close() { + if err := t.storage.Stop(); err != nil { + panic(err) + } + t.directory.Close() +} + +// NewTestStorage creates a storage instance backed by files in a temporary +// directory. The returned storage is already in serving state. Upon closing the +// returned test.Closer, the temporary directory is cleaned up. +func NewTestStorage(t testutil.T, encoding chunk.Encoding) (*MemorySeriesStorage, testutil.Closer) { + chunk.DefaultEncoding = encoding + directory := testutil.NewTemporaryDirectory("test_storage", t) + o := &MemorySeriesStorageOptions{ + TargetHeapSize: 1000000000, + PersistenceRetentionPeriod: 24 * time.Hour * 365 * 100, // Enough to never trigger purging. 
+ PersistenceStoragePath: directory.Path(), + HeadChunkTimeout: 5 * time.Minute, + CheckpointInterval: time.Hour, + SyncStrategy: Adaptive, + } + storage := NewMemorySeriesStorage(o) + storage.archiveHighWatermark = model.Latest + if err := storage.Start(); err != nil { + directory.Close() + t.Fatalf("Error creating storage: %s", err) + } + + closer := &testStorageCloser{ + storage: storage, + directory: directory, + } + + return storage, closer +} + +func makeFingerprintSeriesPair(s *MemorySeriesStorage, fp model.Fingerprint) fingerprintSeriesPair { + return fingerprintSeriesPair{fp, s.seriesForRange(fp, model.Earliest, model.Latest)} +} diff --git a/vendor/github.com/prometheus/prometheus/storage/metric/matcher.go b/vendor/github.com/prometheus/prometheus/storage/metric/matcher.go new file mode 100644 index 000000000..2f451e27f --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/metric/matcher.go @@ -0,0 +1,209 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import ( + "fmt" + "regexp" + "strings" + + "github.com/prometheus/common/model" +) + +// MatchType is an enum for label matching types. +type MatchType int + +// Possible MatchTypes. 
+const ( + Equal MatchType = iota + NotEqual + RegexMatch + RegexNoMatch +) + +func (m MatchType) String() string { + typeToStr := map[MatchType]string{ + Equal: "=", + NotEqual: "!=", + RegexMatch: "=~", + RegexNoMatch: "!~", + } + if str, ok := typeToStr[m]; ok { + return str + } + panic("unknown match type") +} + +// LabelMatchers is a slice of LabelMatcher objects. By implementing the +// sort.Interface, it is sortable by cardinality score, i.e. after sorting, the +// LabelMatcher that is expected to yield the fewest matches is first in the +// slice, and LabelMatchers that match the empty string are last. +type LabelMatchers []*LabelMatcher + +func (lms LabelMatchers) Len() int { return len(lms) } +func (lms LabelMatchers) Swap(i, j int) { lms[i], lms[j] = lms[j], lms[i] } +func (lms LabelMatchers) Less(i, j int) bool { return lms[i].score < lms[j].score } + +func (lms LabelMatchers) String() string { + result := make([]string, 0, len(lms)) + for _, lm := range lms { + result = append(result, lm.String()) + } + return strings.Join(result, ",") +} + +// LabelMatcher models the matching of a label. Create with NewLabelMatcher. +type LabelMatcher struct { + Type MatchType + Name model.LabelName + Value model.LabelValue + re *regexp.Regexp + score float64 // Cardinality score, between 0 and 1, 0 is lowest cardinality. +} + +// NewLabelMatcher returns a LabelMatcher object ready to use. +func NewLabelMatcher(matchType MatchType, name model.LabelName, value model.LabelValue) (*LabelMatcher, error) { + m := &LabelMatcher{ + Type: matchType, + Name: name, + Value: value, + } + if matchType == RegexMatch || matchType == RegexNoMatch { + re, err := regexp.Compile("^(?:" + string(value) + ")$") + if err != nil { + return nil, err + } + m.re = re + } + m.calculateScore() + return m, nil +} + +// calculateScore is a helper method only called in the constructor. 
It +// calculates the cardinality score upfront, so that sorting by it is faster and +// doesn't change internal state of the matcher. +// +// The score is based on a pretty bad but still quite helpful heuristics for +// now. Note that this is an interim solution until the work in progress to +// properly intersect matchers is complete. We intend to not invest any further +// effort into tweaking the score calculation, as this could easily devolve into +// a rabbit hole. +// +// The heuristics works along the following lines: +// +// - A matcher that is known to match nothing would have a score of 0. (This +// case doesn't happen in the scope of this method.) +// +// - A matcher that matches the empty string has a score of 1. +// +// - Equal matchers have a score <= 0.5. The order in score for other matchers +// are RegexMatch, RegexNoMatch, NotEqual. +// +// - There are a number of score adjustments for known "magic" parts, like +// instance labels, metric names containing a colon (which are probably +// recording rules) and such. +// +// - On top, there is a tiny adjustment for the length of the matcher, following +// the blunt expectation that a long label name and/or value is more specific +// and will therefore have a lower cardinality. +// +// To reiterate on the above: PLEASE RESIST THE TEMPTATION TO TWEAK THIS +// METHOD. IT IS "MAGIC" ENOUGH ALREADY AND WILL GO AWAY WITH THE UPCOMING MORE +// POWERFUL INDEXING. +func (m *LabelMatcher) calculateScore() { + if m.Match("") { + m.score = 1 + return + } + // lengthCorrection is between 0 (for length 0) and 0.1 (for length +Inf). + lengthCorrection := 0.1 * (1 - 1/float64(len(m.Name)+len(m.Value)+1)) + switch m.Type { + case Equal: + m.score = 0.3 - lengthCorrection + case RegexMatch: + m.score = 0.6 - lengthCorrection + case RegexNoMatch: + m.score = 0.8 + lengthCorrection + case NotEqual: + m.score = 0.9 + lengthCorrection + } + if m.Type != Equal { + // Don't bother anymore in this case. 
+ return + } + switch m.Name { + case model.InstanceLabel: + // Matches only metrics from a single instance, which clearly + // limits the damage. + m.score -= 0.2 + case model.JobLabel: + // The usual case is a relatively low number of jobs with many + // metrics each. + m.score += 0.1 + case model.BucketLabel, model.QuantileLabel: + // Magic labels for buckets and quantiles will match copiously. + m.score += 0.2 + case model.MetricNameLabel: + if strings.Contains(string(m.Value), ":") { + // Probably a recording rule with limited cardinality. + m.score -= 0.1 + return + } + if m.Value == "up" || m.Value == "scrape_duration_seconds" { + // Synthetic metrics which are contained in every scrape + // exactly once. There might be less frequent metric + // names, but the worst case is limited here, so give it + // a bump. + m.score -= 0.05 + return + } + } +} + +// MatchesEmptyString returns true if the LabelMatcher matches the empty string. +func (m *LabelMatcher) MatchesEmptyString() bool { + return m.score >= 1 +} + +func (m *LabelMatcher) String() string { + return fmt.Sprintf("%s%s%q", m.Name, m.Type, m.Value) +} + +// Match returns true if the label matcher matches the supplied label value. +func (m *LabelMatcher) Match(v model.LabelValue) bool { + switch m.Type { + case Equal: + return m.Value == v + case NotEqual: + return m.Value != v + case RegexMatch: + return m.re.MatchString(string(v)) + case RegexNoMatch: + return !m.re.MatchString(string(v)) + default: + panic("invalid match type") + } +} + +// Filter takes a list of label values and returns all label values which match +// the label matcher. 
+func (m *LabelMatcher) Filter(in model.LabelValues) model.LabelValues { + out := model.LabelValues{} + for _, v := range in { + if m.Match(v) { + out = append(out, v) + } + } + return out +} diff --git a/vendor/github.com/prometheus/prometheus/storage/metric/metric.go b/vendor/github.com/prometheus/prometheus/storage/metric/metric.go new file mode 100644 index 000000000..7328ac7a8 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/metric/metric.go @@ -0,0 +1,63 @@ +// Copyright 2014 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import "github.com/prometheus/common/model" + +// Metric wraps a model.Metric and copies it upon modification if Copied is false. +type Metric struct { + Copied bool + Metric model.Metric +} + +// Set sets a label name in the wrapped Metric to a given value and copies the +// Metric initially, if it is not already a copy. +func (m *Metric) Set(ln model.LabelName, lv model.LabelValue) { + m.Copy() + m.Metric[ln] = lv +} + +// Del deletes a given label name from the wrapped Metric and copies the +// Metric initially, if it is not already a copy. +func (m *Metric) Del(ln model.LabelName) { + m.Copy() + delete(m.Metric, ln) +} + +// Get the value for the given label name. An empty value is returned +// if the label does not exist in the metric. 
+func (m *Metric) Get(ln model.LabelName) model.LabelValue { + return m.Metric[ln] +} + +// Gets behaves as Get but the returned boolean is false iff the label +// does not exist. +func (m *Metric) Gets(ln model.LabelName) (model.LabelValue, bool) { + lv, ok := m.Metric[ln] + return lv, ok +} + +// Copy the underlying Metric if it is not already a copy. +func (m *Metric) Copy() *Metric { + if !m.Copied { + m.Metric = m.Metric.Clone() + m.Copied = true + } + return m +} + +// String implements fmt.Stringer. +func (m Metric) String() string { + return m.Metric.String() +} diff --git a/vendor/github.com/prometheus/prometheus/storage/metric/sample.go b/vendor/github.com/prometheus/prometheus/storage/metric/sample.go new file mode 100644 index 000000000..a30c2b456 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/metric/sample.go @@ -0,0 +1,22 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import "github.com/prometheus/common/model" + +// Interval describes the inclusive interval between two Timestamps. 
+type Interval struct { + OldestInclusive model.Time + NewestInclusive model.Time +} diff --git a/vendor/github.com/prometheus/prometheus/storage/storage.go b/vendor/github.com/prometheus/prometheus/storage/storage.go new file mode 100644 index 000000000..5acae673e --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/storage/storage.go @@ -0,0 +1,76 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package storage + +import ( + "github.com/prometheus/common/model" +) + +// SampleAppender is the interface to append samples to both, local and remote +// storage. All methods are goroutine-safe. +type SampleAppender interface { + // Append appends a sample to the underlying storage. Depending on the + // storage implementation, there are different guarantees for the fate + // of the sample after Append has returned. Remote storage + // implementation will simply drop samples if they cannot keep up with + // sending samples. Local storage implementations will only drop metrics + // upon unrecoverable errors. + Append(*model.Sample) error + // NeedsThrottling returns true if the underlying storage wishes to not + // receive any more samples. Append will still work but might lead to + // undue resource usage. It is recommended to call NeedsThrottling once + // before an upcoming batch of Append calls (e.g. 
a full scrape of a + // target or the evaluation of a rule group) and only proceed with the + // batch if NeedsThrottling returns false. In that way, the result of a + // scrape or of an evaluation of a rule group will always be appended + // completely or not at all, and the work of scraping or evaluation will + // not be performed in vain. Also, a call of NeedsThrottling is + // potentially expensive, so limiting the number of calls is reasonable. + // + // Only SampleAppenders for which it is considered critical to receive + // each and every sample should ever return true. SampleAppenders that + // tolerate not receiving all samples should always return false and + // instead drop samples as they see fit to avoid overload. + NeedsThrottling() bool +} + +// Fanout is a SampleAppender that appends every sample to each SampleAppender +// in its list. +type Fanout []SampleAppender + +// Append implements SampleAppender. It appends the provided sample to all +// SampleAppenders in the Fanout slice and waits for each append to complete +// before proceeding with the next. +// If any of the SampleAppenders returns an error, the first one is returned +// at the end. +func (f Fanout) Append(s *model.Sample) error { + var err error + for _, a := range f { + if e := a.Append(s); e != nil && err == nil { + err = e + } + } + return err +} + +// NeedsThrottling returns true if at least one of the SampleAppenders in the +// Fanout slice is throttled. 
+func (f Fanout) NeedsThrottling() bool { + for _, a := range f { + if a.NeedsThrottling() { + return true + } + } + return false +} diff --git a/vendor/github.com/prometheus/prometheus/util/flock/flock.go b/vendor/github.com/prometheus/prometheus/util/flock/flock.go new file mode 100644 index 000000000..5dc22a2fa --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/flock/flock.go @@ -0,0 +1,46 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package flock provides portable file locking. It is essentially ripped out +// from the code of github.com/syndtr/goleveldb. Strange enough that the +// standard library does not provide this functionality. Once this package has +// proven to work as expected, we should probably turn it into a separate +// general purpose package for humanity. +package flock + +import ( + "os" + "path/filepath" +) + +// Releaser provides the Release method to release a file lock. +type Releaser interface { + Release() error +} + +// New locks the file with the provided name. If the file does not exist, it is +// created. The returned Releaser is used to release the lock. existed is true +// if the file to lock already existed. A non-nil error is returned if the +// locking has failed. Neither this function nor the returned Releaser is +// goroutine-safe. 
+func New(fileName string) (r Releaser, existed bool, err error) { + if err = os.MkdirAll(filepath.Dir(fileName), 0755); err != nil { + return + } + + _, err = os.Stat(fileName) + existed = err == nil + + r, err = newLock(fileName) + return +} diff --git a/vendor/github.com/prometheus/prometheus/util/flock/flock_plan9.go b/vendor/github.com/prometheus/prometheus/util/flock/flock_plan9.go new file mode 100644 index 000000000..004e85c0f --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/flock/flock_plan9.go @@ -0,0 +1,32 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package flock + +import "os" + +type plan9Lock struct { + f *os.File +} + +func (l *plan9Lock) Release() error { + return l.f.Close() +} + +func newLock(fileName string) (Releaser, error) { + f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, os.ModeExclusive|0644) + if err != nil { + return nil, err + } + return &plan9Lock{f}, nil +} diff --git a/vendor/github.com/prometheus/prometheus/util/flock/flock_solaris.go b/vendor/github.com/prometheus/prometheus/util/flock/flock_solaris.go new file mode 100644 index 000000000..299fc8744 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/flock/flock_solaris.go @@ -0,0 +1,59 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build solaris + +package flock + +import ( + "os" + "syscall" +) + +type unixLock struct { + f *os.File +} + +func (l *unixLock) Release() error { + if err := l.set(false); err != nil { + return err + } + return l.f.Close() +} + +func (l *unixLock) set(lock bool) error { + flock := syscall.Flock_t{ + Type: syscall.F_UNLCK, + Start: 0, + Len: 0, + Whence: 1, + } + if lock { + flock.Type = syscall.F_WRLCK + } + return syscall.FcntlFlock(l.f.Fd(), syscall.F_SETLK, &flock) +} + +func newLock(fileName string) (Releaser, error) { + f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0644) + if err != nil { + return nil, err + } + l := &unixLock{f} + err = l.set(true) + if err != nil { + f.Close() + return nil, err + } + return l, nil +} diff --git a/vendor/github.com/prometheus/prometheus/util/flock/flock_unix.go b/vendor/github.com/prometheus/prometheus/util/flock/flock_unix.go new file mode 100644 index 000000000..7d71f8fc0 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/flock/flock_unix.go @@ -0,0 +1,54 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// +build darwin dragonfly freebsd linux netbsd openbsd + +package flock + +import ( + "os" + "syscall" +) + +type unixLock struct { + f *os.File +} + +func (l *unixLock) Release() error { + if err := l.set(false); err != nil { + return err + } + return l.f.Close() +} + +func (l *unixLock) set(lock bool) error { + how := syscall.LOCK_UN + if lock { + how = syscall.LOCK_EX + } + return syscall.Flock(int(l.f.Fd()), how|syscall.LOCK_NB) +} + +func newLock(fileName string) (Releaser, error) { + f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE, 0644) + if err != nil { + return nil, err + } + l := &unixLock{f} + err = l.set(true) + if err != nil { + f.Close() + return nil, err + } + return l, nil +} diff --git a/vendor/github.com/prometheus/prometheus/util/flock/flock_windows.go b/vendor/github.com/prometheus/prometheus/util/flock/flock_windows.go new file mode 100644 index 000000000..bf7266f14 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/flock/flock_windows.go @@ -0,0 +1,36 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package flock + +import "syscall" + +type windowsLock struct { + fd syscall.Handle +} + +func (fl *windowsLock) Release() error { + return syscall.Close(fl.fd) +} + +func newLock(fileName string) (Releaser, error) { + pathp, err := syscall.UTF16PtrFromString(fileName) + if err != nil { + return nil, err + } + fd, err := syscall.CreateFile(pathp, syscall.GENERIC_READ|syscall.GENERIC_WRITE, 0, nil, syscall.CREATE_ALWAYS, syscall.FILE_ATTRIBUTE_NORMAL, 0) + if err != nil { + return nil, err + } + return &windowsLock{fd}, nil +} diff --git a/vendor/github.com/prometheus/prometheus/util/stats/query_stats.go b/vendor/github.com/prometheus/prometheus/util/stats/query_stats.go new file mode 100644 index 000000000..3d7ad0e83 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/stats/query_stats.go @@ -0,0 +1,48 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stats + +// QueryTiming identifies the code area or functionality in which time is spent +// during a query. +type QueryTiming int + +// Query timings. +const ( + TotalEvalTime QueryTiming = iota + ResultSortTime + QueryPreparationTime + InnerEvalTime + ResultAppendTime + ExecQueueTime +) + +// Return a string representation of a QueryTiming identifier. 
+func (s QueryTiming) String() string { + switch s { + case TotalEvalTime: + return "Total eval time" + case ResultSortTime: + return "Result sorting time" + case QueryPreparationTime: + return "Query preparation time" + case InnerEvalTime: + return "Inner eval time" + case ResultAppendTime: + return "Result append time" + case ExecQueueTime: + return "Exec queue wait time" + default: + return "Unknown query timing" + } +} diff --git a/vendor/github.com/prometheus/prometheus/util/stats/timer.go b/vendor/github.com/prometheus/prometheus/util/stats/timer.go new file mode 100644 index 000000000..3d3ee7309 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/stats/timer.go @@ -0,0 +1,108 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stats + +import ( + "bytes" + "fmt" + "sort" + "time" +) + +// A Timer that can be started and stopped and accumulates the total time it +// was running (the time between Start() and Stop()). +type Timer struct { + name fmt.Stringer + created time.Time + start time.Time + duration time.Duration +} + +// Start the timer. +func (t *Timer) Start() *Timer { + t.start = time.Now() + return t +} + +// Stop the timer. +func (t *Timer) Stop() { + t.duration += time.Since(t.start) +} + +// ElapsedTime returns the time that passed since starting the timer. 
+func (t *Timer) ElapsedTime() time.Duration { + return time.Since(t.start) +} + +// Return a string representation of the Timer. +func (t *Timer) String() string { + return fmt.Sprintf("%s: %s", t.name, t.duration) +} + +// A TimerGroup represents a group of timers relevant to a single query. +type TimerGroup struct { + timers map[fmt.Stringer]*Timer +} + +// NewTimerGroup constructs a new TimerGroup. +func NewTimerGroup() *TimerGroup { + return &TimerGroup{timers: map[fmt.Stringer]*Timer{}} +} + +// GetTimer gets (and creates, if necessary) the Timer for a given code section. +func (t *TimerGroup) GetTimer(name fmt.Stringer) *Timer { + if timer, exists := t.timers[name]; exists { + return timer + } + timer := &Timer{ + name: name, + created: time.Now(), + } + t.timers[name] = timer + return timer +} + +// Timers is a slice of Timer pointers that implements Len and Swap from +// sort.Interface. +type Timers []*Timer + +type byCreationTimeSorter struct{ Timers } + +// Len implements sort.Interface. +func (t Timers) Len() int { + return len(t) +} + +// Swap implements sort.Interface. +func (t Timers) Swap(i, j int) { + t[i], t[j] = t[j], t[i] +} + +func (s byCreationTimeSorter) Less(i, j int) bool { + return s.Timers[i].created.Before(s.Timers[j].created) +} + +// Return a string representation of a TimerGroup. 
+func (t *TimerGroup) String() string { + timers := byCreationTimeSorter{} + for _, timer := range t.timers { + timers.Timers = append(timers.Timers, timer) + } + sort.Sort(timers) + result := &bytes.Buffer{} + for _, timer := range timers.Timers { + fmt.Fprintf(result, "%s\n", timer) + } + return result.String() +} diff --git a/vendor/github.com/prometheus/prometheus/util/strutil/quote.go b/vendor/github.com/prometheus/prometheus/util/strutil/quote.go new file mode 100644 index 000000000..981ad473d --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/strutil/quote.go @@ -0,0 +1,223 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strutil + +import ( + "errors" + "unicode/utf8" +) + +// ErrSyntax indicates that a value does not have the right syntax for the target type. +var ErrSyntax = errors.New("invalid syntax") + +// Unquote interprets s as a single-quoted, double-quoted, or backquoted +// Prometheus query language string literal, returning the string value that s +// quotes. +// +// NOTE: This function as well as the necessary helper functions below +// (unquoteChar, contains, unhex) and associated tests have been adapted from +// the corresponding functions in the "strconv" package of the Go standard +// library to work for Prometheus-style strings. Go's special-casing for single +// quotes was removed and single quoted strings are now treated the same as +// double quoted ones. 
+func Unquote(s string) (t string, err error) { + n := len(s) + if n < 2 { + return "", ErrSyntax + } + quote := s[0] + if quote != s[n-1] { + return "", ErrSyntax + } + s = s[1 : n-1] + + if quote == '`' { + if contains(s, '`') { + return "", ErrSyntax + } + return s, nil + } + if quote != '"' && quote != '\'' { + return "", ErrSyntax + } + if contains(s, '\n') { + return "", ErrSyntax + } + + // Is it trivial? Avoid allocation. + if !contains(s, '\\') && !contains(s, quote) { + return s, nil + } + + var runeTmp [utf8.UTFMax]byte + buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations. + for len(s) > 0 { + c, multibyte, ss, err := unquoteChar(s, quote) + if err != nil { + return "", err + } + s = ss + if c < utf8.RuneSelf || !multibyte { + buf = append(buf, byte(c)) + } else { + n := utf8.EncodeRune(runeTmp[:], c) + buf = append(buf, runeTmp[:n]...) + } + } + return string(buf), nil +} + +// unquoteChar decodes the first character or byte in the escaped string +// or character literal represented by the string s. +// It returns four values: +// +// 1) value, the decoded Unicode code point or byte value; +// 2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; +// 3) tail, the remainder of the string after the character; and +// 4) an error that will be nil if the character is syntactically valid. +// +// The second argument, quote, specifies the type of literal being parsed +// and therefore which escaped quote character is permitted. +// If set to a single quote, it permits the sequence \' and disallows unescaped '. +// If set to a double quote, it permits \" and disallows unescaped ". +// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped. 
+func unquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) { + // easy cases + switch c := s[0]; { + case c == quote && (quote == '\'' || quote == '"'): + err = ErrSyntax + return + case c >= utf8.RuneSelf: + r, size := utf8.DecodeRuneInString(s) + return r, true, s[size:], nil + case c != '\\': + return rune(s[0]), false, s[1:], nil + } + + // Hard case: c is backslash. + if len(s) <= 1 { + err = ErrSyntax + return + } + c := s[1] + s = s[2:] + + switch c { + case 'a': + value = '\a' + case 'b': + value = '\b' + case 'f': + value = '\f' + case 'n': + value = '\n' + case 'r': + value = '\r' + case 't': + value = '\t' + case 'v': + value = '\v' + case 'x', 'u', 'U': + n := 0 + switch c { + case 'x': + n = 2 + case 'u': + n = 4 + case 'U': + n = 8 + } + var v rune + if len(s) < n { + err = ErrSyntax + return + } + for j := 0; j < n; j++ { + x, ok := unhex(s[j]) + if !ok { + err = ErrSyntax + return + } + v = v<<4 | x + } + s = s[n:] + if c == 'x' { + // Single-byte string, possibly not UTF-8. + value = v + break + } + if v > utf8.MaxRune { + err = ErrSyntax + return + } + value = v + multibyte = true + case '0', '1', '2', '3', '4', '5', '6', '7': + v := rune(c) - '0' + if len(s) < 2 { + err = ErrSyntax + return + } + for j := 0; j < 2; j++ { // One digit already; two more. + x := rune(s[j]) - '0' + if x < 0 || x > 7 { + err = ErrSyntax + return + } + v = (v << 3) | x + } + s = s[2:] + if v > 255 { + err = ErrSyntax + return + } + value = v + case '\\': + value = '\\' + case '\'', '"': + if c != quote { + err = ErrSyntax + return + } + value = rune(c) + default: + err = ErrSyntax + return + } + tail = s + return +} + +// contains reports whether the string contains the byte c. 
+func contains(s string, c byte) bool { + for i := 0; i < len(s); i++ { + if s[i] == c { + return true + } + } + return false +} + +func unhex(b byte) (v rune, ok bool) { + c := rune(b) + switch { + case '0' <= c && c <= '9': + return c - '0', true + case 'a' <= c && c <= 'f': + return c - 'a' + 10, true + case 'A' <= c && c <= 'F': + return c - 'A' + 10, true + } + return +} diff --git a/vendor/github.com/prometheus/prometheus/util/strutil/strconv.go b/vendor/github.com/prometheus/prometheus/util/strutil/strconv.go new file mode 100644 index 000000000..3d96e4faf --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/strutil/strconv.go @@ -0,0 +1,44 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strutil + +import ( + "fmt" + "net/url" + "regexp" +) + +var ( + invalidLabelCharRE = regexp.MustCompile(`[^a-zA-Z0-9_]`) +) + +// TableLinkForExpression creates an escaped relative link to the table view of +// the provided expression. +func TableLinkForExpression(expr string) string { + escapedExpression := url.QueryEscape(expr) + return fmt.Sprintf("/graph?g0.expr=%s&g0.tab=1", escapedExpression) +} + +// GraphLinkForExpression creates an escaped relative link to the graph view of +// the provided expression. 
+func GraphLinkForExpression(expr string) string { + escapedExpression := url.QueryEscape(expr) + return fmt.Sprintf("/graph?g0.expr=%s&g0.tab=0", escapedExpression) +} + +// SanitizeLabelName replaces anything that doesn't match +// client_label.LabelNameRE with an underscore. +func SanitizeLabelName(name string) string { + return invalidLabelCharRE.ReplaceAllString(name, "_") +} diff --git a/vendor/github.com/prometheus/prometheus/util/testutil/directory.go b/vendor/github.com/prometheus/prometheus/util/testutil/directory.go new file mode 100644 index 000000000..d3c9c926f --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/testutil/directory.go @@ -0,0 +1,129 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "io/ioutil" + "os" +) + +const ( + // The base directory used for test emissions, which instructs the operating + // system to use the default temporary directory as the base or TMPDIR + // environment variable. + defaultDirectory = "" + + // NilCloser is a no-op Closer. + NilCloser = nilCloser(true) + + // The number of times that a TemporaryDirectory will retry its removal + temporaryDirectoryRemoveRetries = 2 +) + +type ( + // Closer is the interface that wraps the Close method. + Closer interface { + // Close reaps the underlying directory and its children. The directory + // could be deleted by its users already. 
+ Close() + } + + nilCloser bool + + // TemporaryDirectory models a closeable path for transient POSIX disk + // activities. + TemporaryDirectory interface { + Closer + + // Path returns the underlying path for access. + Path() string + } + + // temporaryDirectory is kept as a private type due to private fields and + // their interactions. + temporaryDirectory struct { + path string + tester T + } + + callbackCloser struct { + fn func() + } + + // T implements the needed methods of testing.TB so that we do not need + // to actually import testing (which has the side effect of adding all + // the test flags, which we do not want in non-test binaries even if + // they make use of these utilities for some reason). + T interface { + Fatal(args ...interface{}) + Fatalf(format string, args ...interface{}) + } +) + +func (c nilCloser) Close() { +} + +func (c callbackCloser) Close() { + c.fn() +} + +// NewCallbackCloser returns a Closer that calls the provided function upon +// closing. +func NewCallbackCloser(fn func()) Closer { + return &callbackCloser{ + fn: fn, + } +} + +func (t temporaryDirectory) Close() { + retries := temporaryDirectoryRemoveRetries + err := os.RemoveAll(t.path) + for err != nil && retries > 0 { + switch { + case os.IsNotExist(err): + err = nil + default: + retries-- + err = os.RemoveAll(t.path) + } + } + if err != nil { + t.tester.Fatal(err) + } +} + +func (t temporaryDirectory) Path() string { + return t.path +} + +// NewTemporaryDirectory creates a new temporary directory for transient POSIX +// activities. 
+func NewTemporaryDirectory(name string, t T) (handler TemporaryDirectory) { + var ( + directory string + err error + ) + + directory, err = ioutil.TempDir(defaultDirectory, name) + if err != nil { + t.Fatal(err) + } + + handler = temporaryDirectory{ + path: directory, + tester: t, + } + + return +} diff --git a/vendor/github.com/prometheus/prometheus/util/testutil/error.go b/vendor/github.com/prometheus/prometheus/util/testutil/error.go new file mode 100644 index 000000000..a132abf8f --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/testutil/error.go @@ -0,0 +1,31 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +// ErrorEqual compares Go errors for equality. +func ErrorEqual(left, right error) bool { + if left == right { + return true + } + + if left != nil && right != nil { + if left.Error() == right.Error() { + return true + } + + return false + } + + return false +} diff --git a/vendor/github.com/prometheus/prometheus/util/testutil/roundtrip.go b/vendor/github.com/prometheus/prometheus/util/testutil/roundtrip.go new file mode 100644 index 000000000..996d11f36 --- /dev/null +++ b/vendor/github.com/prometheus/prometheus/util/testutil/roundtrip.go @@ -0,0 +1,47 @@ +// Copyright 2017 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "net/http" +) + +type roundTrip struct { + theResponse *http.Response + theError error +} + +func (rt *roundTrip) RoundTrip(r *http.Request) (*http.Response, error) { + return rt.theResponse, rt.theError +} + +type roundTripCheckRequest struct { + checkRequest func(*http.Request) + roundTrip +} + +func (rt *roundTripCheckRequest) RoundTrip(r *http.Request) (*http.Response, error) { + rt.checkRequest(r) + return rt.theResponse, rt.theError +} + +// NewRoundTripCheckRequest creates a new instance of a type that implements http.RoundTripper, +// which before returning theResponse and theError, executes checkRequest against a http.Request. +func NewRoundTripCheckRequest(checkRequest func(*http.Request), theResponse *http.Response, theError error) http.RoundTripper { + return &roundTripCheckRequest{ + checkRequest: checkRequest, + roundTrip: roundTrip{ + theResponse: theResponse, + theError: theError}} +} diff --git a/vendor/github.com/syndtr/goleveldb/LICENSE b/vendor/github.com/syndtr/goleveldb/LICENSE new file mode 100644 index 000000000..4a772d1ab --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/LICENSE @@ -0,0 +1,24 @@ +Copyright 2012 Suryandaru Triandana +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/batch.go b/vendor/github.com/syndtr/goleveldb/leveldb/batch.go new file mode 100644 index 000000000..225920002 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/batch.go @@ -0,0 +1,349 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "encoding/binary" + "fmt" + "io" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +// ErrBatchCorrupted records reason of batch corruption. This error will be +// wrapped with errors.ErrCorrupted. 
+type ErrBatchCorrupted struct { + Reason string +} + +func (e *ErrBatchCorrupted) Error() string { + return fmt.Sprintf("leveldb: batch corrupted: %s", e.Reason) +} + +func newErrBatchCorrupted(reason string) error { + return errors.NewErrCorrupted(storage.FileDesc{}, &ErrBatchCorrupted{reason}) +} + +const ( + batchHeaderLen = 8 + 4 + batchGrowRec = 3000 + batchBufioSize = 16 +) + +// BatchReplay wraps basic batch operations. +type BatchReplay interface { + Put(key, value []byte) + Delete(key []byte) +} + +type batchIndex struct { + keyType keyType + keyPos, keyLen int + valuePos, valueLen int +} + +func (index batchIndex) k(data []byte) []byte { + return data[index.keyPos : index.keyPos+index.keyLen] +} + +func (index batchIndex) v(data []byte) []byte { + if index.valueLen != 0 { + return data[index.valuePos : index.valuePos+index.valueLen] + } + return nil +} + +func (index batchIndex) kv(data []byte) (key, value []byte) { + return index.k(data), index.v(data) +} + +// Batch is a write batch. +type Batch struct { + data []byte + index []batchIndex + + // internalLen is sums of key/value pair length plus 8-bytes internal key. 
+ internalLen int +} + +func (b *Batch) grow(n int) { + o := len(b.data) + if cap(b.data)-o < n { + div := 1 + if len(b.index) > batchGrowRec { + div = len(b.index) / batchGrowRec + } + ndata := make([]byte, o, o+n+o/div) + copy(ndata, b.data) + b.data = ndata + } +} + +func (b *Batch) appendRec(kt keyType, key, value []byte) { + n := 1 + binary.MaxVarintLen32 + len(key) + if kt == keyTypeVal { + n += binary.MaxVarintLen32 + len(value) + } + b.grow(n) + index := batchIndex{keyType: kt} + o := len(b.data) + data := b.data[:o+n] + data[o] = byte(kt) + o++ + o += binary.PutUvarint(data[o:], uint64(len(key))) + index.keyPos = o + index.keyLen = len(key) + o += copy(data[o:], key) + if kt == keyTypeVal { + o += binary.PutUvarint(data[o:], uint64(len(value))) + index.valuePos = o + index.valueLen = len(value) + o += copy(data[o:], value) + } + b.data = data[:o] + b.index = append(b.index, index) + b.internalLen += index.keyLen + index.valueLen + 8 +} + +// Put appends 'put operation' of the given key/value pair to the batch. +// It is safe to modify the contents of the argument after Put returns but not +// before. +func (b *Batch) Put(key, value []byte) { + b.appendRec(keyTypeVal, key, value) +} + +// Delete appends 'delete operation' of the given key to the batch. +// It is safe to modify the contents of the argument after Delete returns but +// not before. +func (b *Batch) Delete(key []byte) { + b.appendRec(keyTypeDel, key, nil) +} + +// Dump dumps batch contents. The returned slice can be loaded into the +// batch using Load method. +// The returned slice is not its own copy, so the contents should not be +// modified. +func (b *Batch) Dump() []byte { + return b.data +} + +// Load loads given slice into the batch. Previous contents of the batch +// will be discarded. +// The given slice will not be copied and will be used as batch buffer, so +// it is not safe to modify the contents of the slice. 
+func (b *Batch) Load(data []byte) error { + return b.decode(data, -1) +} + +// Replay replays batch contents. +func (b *Batch) Replay(r BatchReplay) error { + for _, index := range b.index { + switch index.keyType { + case keyTypeVal: + r.Put(index.k(b.data), index.v(b.data)) + case keyTypeDel: + r.Delete(index.k(b.data)) + } + } + return nil +} + +// Len returns number of records in the batch. +func (b *Batch) Len() int { + return len(b.index) +} + +// Reset resets the batch. +func (b *Batch) Reset() { + b.data = b.data[:0] + b.index = b.index[:0] + b.internalLen = 0 +} + +func (b *Batch) replayInternal(fn func(i int, kt keyType, k, v []byte) error) error { + for i, index := range b.index { + if err := fn(i, index.keyType, index.k(b.data), index.v(b.data)); err != nil { + return err + } + } + return nil +} + +func (b *Batch) append(p *Batch) { + ob := len(b.data) + oi := len(b.index) + b.data = append(b.data, p.data...) + b.index = append(b.index, p.index...) + b.internalLen += p.internalLen + + // Updating index offset. 
+ if ob != 0 { + for ; oi < len(b.index); oi++ { + index := &b.index[oi] + index.keyPos += ob + if index.valueLen != 0 { + index.valuePos += ob + } + } + } +} + +func (b *Batch) decode(data []byte, expectedLen int) error { + b.data = data + b.index = b.index[:0] + b.internalLen = 0 + err := decodeBatch(data, func(i int, index batchIndex) error { + b.index = append(b.index, index) + b.internalLen += index.keyLen + index.valueLen + 8 + return nil + }) + if err != nil { + return err + } + if expectedLen >= 0 && len(b.index) != expectedLen { + return newErrBatchCorrupted(fmt.Sprintf("invalid records length: %d vs %d", expectedLen, len(b.index))) + } + return nil +} + +func (b *Batch) putMem(seq uint64, mdb *memdb.DB) error { + var ik []byte + for i, index := range b.index { + ik = makeInternalKey(ik, index.k(b.data), seq+uint64(i), index.keyType) + if err := mdb.Put(ik, index.v(b.data)); err != nil { + return err + } + } + return nil +} + +func (b *Batch) revertMem(seq uint64, mdb *memdb.DB) error { + var ik []byte + for i, index := range b.index { + ik = makeInternalKey(ik, index.k(b.data), seq+uint64(i), index.keyType) + if err := mdb.Delete(ik); err != nil { + return err + } + } + return nil +} + +func newBatch() interface{} { + return &Batch{} +} + +func decodeBatch(data []byte, fn func(i int, index batchIndex) error) error { + var index batchIndex + for i, o := 0, 0; o < len(data); i++ { + // Key type. + index.keyType = keyType(data[o]) + if index.keyType > keyTypeVal { + return newErrBatchCorrupted(fmt.Sprintf("bad record: invalid type %#x", uint(index.keyType))) + } + o++ + + // Key. + x, n := binary.Uvarint(data[o:]) + o += n + if n <= 0 || o+int(x) > len(data) { + return newErrBatchCorrupted("bad record: invalid key length") + } + index.keyPos = o + index.keyLen = int(x) + o += index.keyLen + + // Value. 
+ if index.keyType == keyTypeVal { + x, n = binary.Uvarint(data[o:]) + o += n + if n <= 0 || o+int(x) > len(data) { + return newErrBatchCorrupted("bad record: invalid value length") + } + index.valuePos = o + index.valueLen = int(x) + o += index.valueLen + } else { + index.valuePos = 0 + index.valueLen = 0 + } + + if err := fn(i, index); err != nil { + return err + } + } + return nil +} + +func decodeBatchToMem(data []byte, expectSeq uint64, mdb *memdb.DB) (seq uint64, batchLen int, err error) { + seq, batchLen, err = decodeBatchHeader(data) + if err != nil { + return 0, 0, err + } + if seq < expectSeq { + return 0, 0, newErrBatchCorrupted("invalid sequence number") + } + data = data[batchHeaderLen:] + var ik []byte + var decodedLen int + err = decodeBatch(data, func(i int, index batchIndex) error { + if i >= batchLen { + return newErrBatchCorrupted("invalid records length") + } + ik = makeInternalKey(ik, index.k(data), seq+uint64(i), index.keyType) + if err := mdb.Put(ik, index.v(data)); err != nil { + return err + } + decodedLen++ + return nil + }) + if err == nil && decodedLen != batchLen { + err = newErrBatchCorrupted(fmt.Sprintf("invalid records length: %d vs %d", batchLen, decodedLen)) + } + return +} + +func encodeBatchHeader(dst []byte, seq uint64, batchLen int) []byte { + dst = ensureBuffer(dst, batchHeaderLen) + binary.LittleEndian.PutUint64(dst, seq) + binary.LittleEndian.PutUint32(dst[8:], uint32(batchLen)) + return dst +} + +func decodeBatchHeader(data []byte) (seq uint64, batchLen int, err error) { + if len(data) < batchHeaderLen { + return 0, 0, newErrBatchCorrupted("too short") + } + + seq = binary.LittleEndian.Uint64(data) + batchLen = int(binary.LittleEndian.Uint32(data[8:])) + if batchLen < 0 { + return 0, 0, newErrBatchCorrupted("invalid records length") + } + return +} + +func batchesLen(batches []*Batch) int { + batchLen := 0 + for _, batch := range batches { + batchLen += batch.Len() + } + return batchLen +} + +func writeBatchesWithHeader(wr 
io.Writer, batches []*Batch, seq uint64) error { + if _, err := wr.Write(encodeBatchHeader(nil, seq, batchesLen(batches))); err != nil { + return err + } + for _, batch := range batches { + if _, err := wr.Write(batch.data); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/cache/cache.go b/vendor/github.com/syndtr/goleveldb/leveldb/cache/cache.go new file mode 100644 index 000000000..c36ad3235 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/cache/cache.go @@ -0,0 +1,704 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package cache provides interface and implementation of a cache algorithms. +package cache + +import ( + "sync" + "sync/atomic" + "unsafe" + + "github.com/syndtr/goleveldb/leveldb/util" +) + +// Cacher provides interface to implements a caching functionality. +// An implementation must be safe for concurrent use. +type Cacher interface { + // Capacity returns cache capacity. + Capacity() int + + // SetCapacity sets cache capacity. + SetCapacity(capacity int) + + // Promote promotes the 'cache node'. + Promote(n *Node) + + // Ban evicts the 'cache node' and prevent subsequent 'promote'. + Ban(n *Node) + + // Evict evicts the 'cache node'. + Evict(n *Node) + + // EvictNS evicts 'cache node' with the given namespace. + EvictNS(ns uint64) + + // EvictAll evicts all 'cache node'. + EvictAll() + + // Close closes the 'cache tree' + Close() error +} + +// Value is a 'cacheable object'. It may implements util.Releaser, if +// so the the Release method will be called once object is released. +type Value interface{} + +// NamespaceGetter provides convenient wrapper for namespace. +type NamespaceGetter struct { + Cache *Cache + NS uint64 +} + +// Get simply calls Cache.Get() method. 
+func (g *NamespaceGetter) Get(key uint64, setFunc func() (size int, value Value)) *Handle { + return g.Cache.Get(g.NS, key, setFunc) +} + +// The hash tables implementation is based on: +// "Dynamic-Sized Nonblocking Hash Tables", by Yujie Liu, +// Kunlong Zhang, and Michael Spear. +// ACM Symposium on Principles of Distributed Computing, Jul 2014. + +const ( + mInitialSize = 1 << 4 + mOverflowThreshold = 1 << 5 + mOverflowGrowThreshold = 1 << 7 +) + +type mBucket struct { + mu sync.Mutex + node []*Node + frozen bool +} + +func (b *mBucket) freeze() []*Node { + b.mu.Lock() + defer b.mu.Unlock() + if !b.frozen { + b.frozen = true + } + return b.node +} + +func (b *mBucket) get(r *Cache, h *mNode, hash uint32, ns, key uint64, noset bool) (done, added bool, n *Node) { + b.mu.Lock() + + if b.frozen { + b.mu.Unlock() + return + } + + // Scan the node. + for _, n := range b.node { + if n.hash == hash && n.ns == ns && n.key == key { + atomic.AddInt32(&n.ref, 1) + b.mu.Unlock() + return true, false, n + } + } + + // Get only. + if noset { + b.mu.Unlock() + return true, false, nil + } + + // Create node. + n = &Node{ + r: r, + hash: hash, + ns: ns, + key: key, + ref: 1, + } + // Add node to bucket. + b.node = append(b.node, n) + bLen := len(b.node) + b.mu.Unlock() + + // Update counter. + grow := atomic.AddInt32(&r.nodes, 1) >= h.growThreshold + if bLen > mOverflowThreshold { + grow = grow || atomic.AddInt32(&h.overflow, 1) >= mOverflowGrowThreshold + } + + // Grow. 
+ if grow && atomic.CompareAndSwapInt32(&h.resizeInProgess, 0, 1) { + nhLen := len(h.buckets) << 1 + nh := &mNode{ + buckets: make([]unsafe.Pointer, nhLen), + mask: uint32(nhLen) - 1, + pred: unsafe.Pointer(h), + growThreshold: int32(nhLen * mOverflowThreshold), + shrinkThreshold: int32(nhLen >> 1), + } + ok := atomic.CompareAndSwapPointer(&r.mHead, unsafe.Pointer(h), unsafe.Pointer(nh)) + if !ok { + panic("BUG: failed swapping head") + } + go nh.initBuckets() + } + + return true, true, n +} + +func (b *mBucket) delete(r *Cache, h *mNode, hash uint32, ns, key uint64) (done, deleted bool) { + b.mu.Lock() + + if b.frozen { + b.mu.Unlock() + return + } + + // Scan the node. + var ( + n *Node + bLen int + ) + for i := range b.node { + n = b.node[i] + if n.ns == ns && n.key == key { + if atomic.LoadInt32(&n.ref) == 0 { + deleted = true + + // Call releaser. + if n.value != nil { + if r, ok := n.value.(util.Releaser); ok { + r.Release() + } + n.value = nil + } + + // Remove node from bucket. + b.node = append(b.node[:i], b.node[i+1:]...) + bLen = len(b.node) + } + break + } + } + b.mu.Unlock() + + if deleted { + // Call OnDel. + for _, f := range n.onDel { + f() + } + + // Update counter. + atomic.AddInt32(&r.size, int32(n.size)*-1) + shrink := atomic.AddInt32(&r.nodes, -1) < h.shrinkThreshold + if bLen >= mOverflowThreshold { + atomic.AddInt32(&h.overflow, -1) + } + + // Shrink. 
+ if shrink && len(h.buckets) > mInitialSize && atomic.CompareAndSwapInt32(&h.resizeInProgess, 0, 1) { + nhLen := len(h.buckets) >> 1 + nh := &mNode{ + buckets: make([]unsafe.Pointer, nhLen), + mask: uint32(nhLen) - 1, + pred: unsafe.Pointer(h), + growThreshold: int32(nhLen * mOverflowThreshold), + shrinkThreshold: int32(nhLen >> 1), + } + ok := atomic.CompareAndSwapPointer(&r.mHead, unsafe.Pointer(h), unsafe.Pointer(nh)) + if !ok { + panic("BUG: failed swapping head") + } + go nh.initBuckets() + } + } + + return true, deleted +} + +type mNode struct { + buckets []unsafe.Pointer // []*mBucket + mask uint32 + pred unsafe.Pointer // *mNode + resizeInProgess int32 + + overflow int32 + growThreshold int32 + shrinkThreshold int32 +} + +func (n *mNode) initBucket(i uint32) *mBucket { + if b := (*mBucket)(atomic.LoadPointer(&n.buckets[i])); b != nil { + return b + } + + p := (*mNode)(atomic.LoadPointer(&n.pred)) + if p != nil { + var node []*Node + if n.mask > p.mask { + // Grow. + pb := (*mBucket)(atomic.LoadPointer(&p.buckets[i&p.mask])) + if pb == nil { + pb = p.initBucket(i & p.mask) + } + m := pb.freeze() + // Split nodes. + for _, x := range m { + if x.hash&n.mask == i { + node = append(node, x) + } + } + } else { + // Shrink. + pb0 := (*mBucket)(atomic.LoadPointer(&p.buckets[i])) + if pb0 == nil { + pb0 = p.initBucket(i) + } + pb1 := (*mBucket)(atomic.LoadPointer(&p.buckets[i+uint32(len(n.buckets))])) + if pb1 == nil { + pb1 = p.initBucket(i + uint32(len(n.buckets))) + } + m0 := pb0.freeze() + m1 := pb1.freeze() + // Merge nodes. + node = make([]*Node, 0, len(m0)+len(m1)) + node = append(node, m0...) + node = append(node, m1...) 
+ } + b := &mBucket{node: node} + if atomic.CompareAndSwapPointer(&n.buckets[i], nil, unsafe.Pointer(b)) { + if len(node) > mOverflowThreshold { + atomic.AddInt32(&n.overflow, int32(len(node)-mOverflowThreshold)) + } + return b + } + } + + return (*mBucket)(atomic.LoadPointer(&n.buckets[i])) +} + +func (n *mNode) initBuckets() { + for i := range n.buckets { + n.initBucket(uint32(i)) + } + atomic.StorePointer(&n.pred, nil) +} + +// Cache is a 'cache map'. +type Cache struct { + mu sync.RWMutex + mHead unsafe.Pointer // *mNode + nodes int32 + size int32 + cacher Cacher + closed bool +} + +// NewCache creates a new 'cache map'. The cacher is optional and +// may be nil. +func NewCache(cacher Cacher) *Cache { + h := &mNode{ + buckets: make([]unsafe.Pointer, mInitialSize), + mask: mInitialSize - 1, + growThreshold: int32(mInitialSize * mOverflowThreshold), + shrinkThreshold: 0, + } + for i := range h.buckets { + h.buckets[i] = unsafe.Pointer(&mBucket{}) + } + r := &Cache{ + mHead: unsafe.Pointer(h), + cacher: cacher, + } + return r +} + +func (r *Cache) getBucket(hash uint32) (*mNode, *mBucket) { + h := (*mNode)(atomic.LoadPointer(&r.mHead)) + i := hash & h.mask + b := (*mBucket)(atomic.LoadPointer(&h.buckets[i])) + if b == nil { + b = h.initBucket(i) + } + return h, b +} + +func (r *Cache) delete(n *Node) bool { + for { + h, b := r.getBucket(n.hash) + done, deleted := b.delete(r, h, n.hash, n.ns, n.key) + if done { + return deleted + } + } +} + +// Nodes returns number of 'cache node' in the map. +func (r *Cache) Nodes() int { + return int(atomic.LoadInt32(&r.nodes)) +} + +// Size returns sums of 'cache node' size in the map. +func (r *Cache) Size() int { + return int(atomic.LoadInt32(&r.size)) +} + +// Capacity returns cache capacity. +func (r *Cache) Capacity() int { + if r.cacher == nil { + return 0 + } + return r.cacher.Capacity() +} + +// SetCapacity sets cache capacity. 
+func (r *Cache) SetCapacity(capacity int) { + if r.cacher != nil { + r.cacher.SetCapacity(capacity) + } +} + +// Get gets 'cache node' with the given namespace and key. +// If cache node is not found and setFunc is not nil, Get will atomically creates +// the 'cache node' by calling setFunc. Otherwise Get will returns nil. +// +// The returned 'cache handle' should be released after use by calling Release +// method. +func (r *Cache) Get(ns, key uint64, setFunc func() (size int, value Value)) *Handle { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return nil + } + + hash := murmur32(ns, key, 0xf00) + for { + h, b := r.getBucket(hash) + done, _, n := b.get(r, h, hash, ns, key, setFunc == nil) + if done { + if n != nil { + n.mu.Lock() + if n.value == nil { + if setFunc == nil { + n.mu.Unlock() + n.unref() + return nil + } + + n.size, n.value = setFunc() + if n.value == nil { + n.size = 0 + n.mu.Unlock() + n.unref() + return nil + } + atomic.AddInt32(&r.size, int32(n.size)) + } + n.mu.Unlock() + if r.cacher != nil { + r.cacher.Promote(n) + } + return &Handle{unsafe.Pointer(n)} + } + + break + } + } + return nil +} + +// Delete removes and ban 'cache node' with the given namespace and key. +// A banned 'cache node' will never inserted into the 'cache tree'. Ban +// only attributed to the particular 'cache node', so when a 'cache node' +// is recreated it will not be banned. +// +// If onDel is not nil, then it will be executed if such 'cache node' +// doesn't exist or once the 'cache node' is released. +// +// Delete return true is such 'cache node' exist. 
+func (r *Cache) Delete(ns, key uint64, onDel func()) bool { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return false + } + + hash := murmur32(ns, key, 0xf00) + for { + h, b := r.getBucket(hash) + done, _, n := b.get(r, h, hash, ns, key, true) + if done { + if n != nil { + if onDel != nil { + n.mu.Lock() + n.onDel = append(n.onDel, onDel) + n.mu.Unlock() + } + if r.cacher != nil { + r.cacher.Ban(n) + } + n.unref() + return true + } + + break + } + } + + if onDel != nil { + onDel() + } + + return false +} + +// Evict evicts 'cache node' with the given namespace and key. This will +// simply call Cacher.Evict. +// +// Evict return true is such 'cache node' exist. +func (r *Cache) Evict(ns, key uint64) bool { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return false + } + + hash := murmur32(ns, key, 0xf00) + for { + h, b := r.getBucket(hash) + done, _, n := b.get(r, h, hash, ns, key, true) + if done { + if n != nil { + if r.cacher != nil { + r.cacher.Evict(n) + } + n.unref() + return true + } + + break + } + } + + return false +} + +// EvictNS evicts 'cache node' with the given namespace. This will +// simply call Cacher.EvictNS. +func (r *Cache) EvictNS(ns uint64) { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return + } + + if r.cacher != nil { + r.cacher.EvictNS(ns) + } +} + +// EvictAll evicts all 'cache node'. This will simply call Cacher.EvictAll. +func (r *Cache) EvictAll() { + r.mu.RLock() + defer r.mu.RUnlock() + if r.closed { + return + } + + if r.cacher != nil { + r.cacher.EvictAll() + } +} + +// Close closes the 'cache map' and forcefully releases all 'cache node'. +func (r *Cache) Close() error { + r.mu.Lock() + if !r.closed { + r.closed = true + + h := (*mNode)(r.mHead) + h.initBuckets() + + for i := range h.buckets { + b := (*mBucket)(h.buckets[i]) + for _, n := range b.node { + // Call releaser. + if n.value != nil { + if r, ok := n.value.(util.Releaser); ok { + r.Release() + } + n.value = nil + } + + // Call OnDel. 
+ for _, f := range n.onDel { + f() + } + n.onDel = nil + } + } + } + r.mu.Unlock() + + // Avoid deadlock. + if r.cacher != nil { + if err := r.cacher.Close(); err != nil { + return err + } + } + return nil +} + +// CloseWeak closes the 'cache map' and evict all 'cache node' from cacher, but +// unlike Close it doesn't forcefully releases 'cache node'. +func (r *Cache) CloseWeak() error { + r.mu.Lock() + if !r.closed { + r.closed = true + } + r.mu.Unlock() + + // Avoid deadlock. + if r.cacher != nil { + r.cacher.EvictAll() + if err := r.cacher.Close(); err != nil { + return err + } + } + return nil +} + +// Node is a 'cache node'. +type Node struct { + r *Cache + + hash uint32 + ns, key uint64 + + mu sync.Mutex + size int + value Value + + ref int32 + onDel []func() + + CacheData unsafe.Pointer +} + +// NS returns this 'cache node' namespace. +func (n *Node) NS() uint64 { + return n.ns +} + +// Key returns this 'cache node' key. +func (n *Node) Key() uint64 { + return n.key +} + +// Size returns this 'cache node' size. +func (n *Node) Size() int { + return n.size +} + +// Value returns this 'cache node' value. +func (n *Node) Value() Value { + return n.value +} + +// Ref returns this 'cache node' ref counter. +func (n *Node) Ref() int32 { + return atomic.LoadInt32(&n.ref) +} + +// GetHandle returns an handle for this 'cache node'. +func (n *Node) GetHandle() *Handle { + if atomic.AddInt32(&n.ref, 1) <= 1 { + panic("BUG: Node.GetHandle on zero ref") + } + return &Handle{unsafe.Pointer(n)} +} + +func (n *Node) unref() { + if atomic.AddInt32(&n.ref, -1) == 0 { + n.r.delete(n) + } +} + +func (n *Node) unrefLocked() { + if atomic.AddInt32(&n.ref, -1) == 0 { + n.r.mu.RLock() + if !n.r.closed { + n.r.delete(n) + } + n.r.mu.RUnlock() + } +} + +// Handle is a 'cache handle' of a 'cache node'. +type Handle struct { + n unsafe.Pointer // *Node +} + +// Value returns the value of the 'cache node'. 
+func (h *Handle) Value() Value { + n := (*Node)(atomic.LoadPointer(&h.n)) + if n != nil { + return n.value + } + return nil +} + +// Release releases this 'cache handle'. +// It is safe to call release multiple times. +func (h *Handle) Release() { + nPtr := atomic.LoadPointer(&h.n) + if nPtr != nil && atomic.CompareAndSwapPointer(&h.n, nPtr, nil) { + n := (*Node)(nPtr) + n.unrefLocked() + } +} + +func murmur32(ns, key uint64, seed uint32) uint32 { + const ( + m = uint32(0x5bd1e995) + r = 24 + ) + + k1 := uint32(ns >> 32) + k2 := uint32(ns) + k3 := uint32(key >> 32) + k4 := uint32(key) + + k1 *= m + k1 ^= k1 >> r + k1 *= m + + k2 *= m + k2 ^= k2 >> r + k2 *= m + + k3 *= m + k3 ^= k3 >> r + k3 *= m + + k4 *= m + k4 ^= k4 >> r + k4 *= m + + h := seed + + h *= m + h ^= k1 + h *= m + h ^= k2 + h *= m + h ^= k3 + h *= m + h ^= k4 + + h ^= h >> 13 + h *= m + h ^= h >> 15 + + return h +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/cache/lru.go b/vendor/github.com/syndtr/goleveldb/leveldb/cache/lru.go new file mode 100644 index 000000000..d9a84cde1 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/cache/lru.go @@ -0,0 +1,195 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package cache + +import ( + "sync" + "unsafe" +) + +type lruNode struct { + n *Node + h *Handle + ban bool + + next, prev *lruNode +} + +func (n *lruNode) insert(at *lruNode) { + x := at.next + at.next = n + n.prev = at + n.next = x + x.prev = n +} + +func (n *lruNode) remove() { + if n.prev != nil { + n.prev.next = n.next + n.next.prev = n.prev + n.prev = nil + n.next = nil + } else { + panic("BUG: removing removed node") + } +} + +type lru struct { + mu sync.Mutex + capacity int + used int + recent lruNode +} + +func (r *lru) reset() { + r.recent.next = &r.recent + r.recent.prev = &r.recent + r.used = 0 +} + +func (r *lru) Capacity() int { + r.mu.Lock() + defer r.mu.Unlock() + return r.capacity +} + +func (r *lru) SetCapacity(capacity int) { + var evicted []*lruNode + + r.mu.Lock() + r.capacity = capacity + for r.used > r.capacity { + rn := r.recent.prev + if rn == nil { + panic("BUG: invalid LRU used or capacity counter") + } + rn.remove() + rn.n.CacheData = nil + r.used -= rn.n.Size() + evicted = append(evicted, rn) + } + r.mu.Unlock() + + for _, rn := range evicted { + rn.h.Release() + } +} + +func (r *lru) Promote(n *Node) { + var evicted []*lruNode + + r.mu.Lock() + if n.CacheData == nil { + if n.Size() <= r.capacity { + rn := &lruNode{n: n, h: n.GetHandle()} + rn.insert(&r.recent) + n.CacheData = unsafe.Pointer(rn) + r.used += n.Size() + + for r.used > r.capacity { + rn := r.recent.prev + if rn == nil { + panic("BUG: invalid LRU used or capacity counter") + } + rn.remove() + rn.n.CacheData = nil + r.used -= rn.n.Size() + evicted = append(evicted, rn) + } + } + } else { + rn := (*lruNode)(n.CacheData) + if !rn.ban { + rn.remove() + rn.insert(&r.recent) + } + } + r.mu.Unlock() + + for _, rn := range evicted { + rn.h.Release() + } +} + +func (r *lru) Ban(n *Node) { + r.mu.Lock() + if n.CacheData == nil { + n.CacheData = unsafe.Pointer(&lruNode{n: n, ban: true}) + } else { + rn := (*lruNode)(n.CacheData) + if !rn.ban { + rn.remove() + rn.ban = true + r.used 
-= rn.n.Size() + r.mu.Unlock() + + rn.h.Release() + rn.h = nil + return + } + } + r.mu.Unlock() +} + +func (r *lru) Evict(n *Node) { + r.mu.Lock() + rn := (*lruNode)(n.CacheData) + if rn == nil || rn.ban { + r.mu.Unlock() + return + } + n.CacheData = nil + r.mu.Unlock() + + rn.h.Release() +} + +func (r *lru) EvictNS(ns uint64) { + var evicted []*lruNode + + r.mu.Lock() + for e := r.recent.prev; e != &r.recent; { + rn := e + e = e.prev + if rn.n.NS() == ns { + rn.remove() + rn.n.CacheData = nil + r.used -= rn.n.Size() + evicted = append(evicted, rn) + } + } + r.mu.Unlock() + + for _, rn := range evicted { + rn.h.Release() + } +} + +func (r *lru) EvictAll() { + r.mu.Lock() + back := r.recent.prev + for rn := back; rn != &r.recent; rn = rn.prev { + rn.n.CacheData = nil + } + r.reset() + r.mu.Unlock() + + for rn := back; rn != &r.recent; rn = rn.prev { + rn.h.Release() + } +} + +func (r *lru) Close() error { + return nil +} + +// NewLRU create a new LRU-cache. +func NewLRU(capacity int) Cacher { + r := &lru{capacity: capacity} + r.reset() + return r +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/comparer.go b/vendor/github.com/syndtr/goleveldb/leveldb/comparer.go new file mode 100644 index 000000000..448402b82 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/comparer.go @@ -0,0 +1,67 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/comparer" +) + +type iComparer struct { + ucmp comparer.Comparer +} + +func (icmp *iComparer) uName() string { + return icmp.ucmp.Name() +} + +func (icmp *iComparer) uCompare(a, b []byte) int { + return icmp.ucmp.Compare(a, b) +} + +func (icmp *iComparer) uSeparator(dst, a, b []byte) []byte { + return icmp.ucmp.Separator(dst, a, b) +} + +func (icmp *iComparer) uSuccessor(dst, b []byte) []byte { + return icmp.ucmp.Successor(dst, b) +} + +func (icmp *iComparer) Name() string { + return icmp.uName() +} + +func (icmp *iComparer) Compare(a, b []byte) int { + x := icmp.uCompare(internalKey(a).ukey(), internalKey(b).ukey()) + if x == 0 { + if m, n := internalKey(a).num(), internalKey(b).num(); m > n { + return -1 + } else if m < n { + return 1 + } + } + return x +} + +func (icmp *iComparer) Separator(dst, a, b []byte) []byte { + ua, ub := internalKey(a).ukey(), internalKey(b).ukey() + dst = icmp.uSeparator(dst, ua, ub) + if dst != nil && len(dst) < len(ua) && icmp.uCompare(ua, dst) < 0 { + // Append earliest possible number. + return append(dst, keyMaxNumBytes...) + } + return nil +} + +func (icmp *iComparer) Successor(dst, b []byte) []byte { + ub := internalKey(b).ukey() + dst = icmp.uSuccessor(dst, ub) + if dst != nil && len(dst) < len(ub) && icmp.uCompare(ub, dst) < 0 { + // Append earliest possible number. + return append(dst, keyMaxNumBytes...) + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/comparer/bytes_comparer.go b/vendor/github.com/syndtr/goleveldb/leveldb/comparer/bytes_comparer.go new file mode 100644 index 000000000..abf9fb65c --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/comparer/bytes_comparer.go @@ -0,0 +1,51 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package comparer + +import "bytes" + +type bytesComparer struct{} + +func (bytesComparer) Compare(a, b []byte) int { + return bytes.Compare(a, b) +} + +func (bytesComparer) Name() string { + return "leveldb.BytewiseComparator" +} + +func (bytesComparer) Separator(dst, a, b []byte) []byte { + i, n := 0, len(a) + if n > len(b) { + n = len(b) + } + for ; i < n && a[i] == b[i]; i++ { + } + if i >= n { + // Do not shorten if one string is a prefix of the other + } else if c := a[i]; c < 0xff && c+1 < b[i] { + dst = append(dst, a[:i+1]...) + dst[len(dst)-1]++ + return dst + } + return nil +} + +func (bytesComparer) Successor(dst, b []byte) []byte { + for i, c := range b { + if c != 0xff { + dst = append(dst, b[:i+1]...) + dst[len(dst)-1]++ + return dst + } + } + return nil +} + +// DefaultComparer are default implementation of the Comparer interface. +// It uses the natural ordering, consistent with bytes.Compare. +var DefaultComparer = bytesComparer{} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/comparer/comparer.go b/vendor/github.com/syndtr/goleveldb/leveldb/comparer/comparer.go new file mode 100644 index 000000000..2c522db23 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/comparer/comparer.go @@ -0,0 +1,57 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package comparer provides interface and implementation for ordering +// sets of data. +package comparer + +// BasicComparer is the interface that wraps the basic Compare method. +type BasicComparer interface { + // Compare returns -1, 0, or +1 depending on whether a is 'less than', + // 'equal to' or 'greater than' b. The two arguments can only be 'equal' + // if their contents are exactly equal. Furthermore, the empty slice + // must be 'less than' any non-empty slice. 
+ Compare(a, b []byte) int +} + +// Comparer defines a total ordering over the space of []byte keys: a 'less +// than' relationship. +type Comparer interface { + BasicComparer + + // Name returns name of the comparer. + // + // The Level-DB on-disk format stores the comparer name, and opening a + // database with a different comparer from the one it was created with + // will result in an error. + // + // An implementation to a new name whenever the comparer implementation + // changes in a way that will cause the relative ordering of any two keys + // to change. + // + // Names starting with "leveldb." are reserved and should not be used + // by any users of this package. + Name() string + + // Bellow are advanced functions used to reduce the space requirements + // for internal data structures such as index blocks. + + // Separator appends a sequence of bytes x to dst such that a <= x && x < b, + // where 'less than' is consistent with Compare. An implementation should + // return nil if x equal to a. + // + // Either contents of a or b should not by any means modified. Doing so + // may cause corruption on the internal state. + Separator(dst, a, b []byte) []byte + + // Successor appends a sequence of bytes x to dst such that x >= b, where + // 'less than' is consistent with Compare. An implementation should return + // nil if x equal to b. + // + // Contents of b should not by any means modified. Doing so may cause + // corruption on the internal state. + Successor(dst, b []byte) []byte +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db.go b/vendor/github.com/syndtr/goleveldb/leveldb/db.go new file mode 100644 index 000000000..90fedf7bd --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db.go @@ -0,0 +1,1179 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package leveldb + +import ( + "container/list" + "fmt" + "io" + "os" + "runtime" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/journal" + "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/table" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// DB is a LevelDB database. +type DB struct { + // Need 64-bit alignment. + seq uint64 + + // Stats. Need 64-bit alignment. + cWriteDelay int64 // The cumulative duration of write delays + cWriteDelayN int32 // The cumulative number of write delays + inWritePaused int32 // The indicator whether write operation is paused by compaction + aliveSnaps, aliveIters int32 + + // Session. + s *session + + // MemDB. + memMu sync.RWMutex + memPool chan *memdb.DB + mem, frozenMem *memDB + journal *journal.Writer + journalWriter storage.Writer + journalFd storage.FileDesc + frozenJournalFd storage.FileDesc + frozenSeq uint64 + + // Snapshot. + snapsMu sync.Mutex + snapsList *list.List + + // Write. + batchPool sync.Pool + writeMergeC chan writeMerge + writeMergedC chan bool + writeLockC chan struct{} + writeAckC chan error + writeDelay time.Duration + writeDelayN int + tr *Transaction + + // Compaction. + compCommitLk sync.Mutex + tcompCmdC chan cCmd + tcompPauseC chan chan<- struct{} + mcompCmdC chan cCmd + compErrC chan error + compPerErrC chan error + compErrSetC chan error + compWriteLocking bool + compStats cStats + memdbMaxLevel int // For testing. + + // Close. 
+ closeW sync.WaitGroup + closeC chan struct{} + closed uint32 + closer io.Closer +} + +func openDB(s *session) (*DB, error) { + s.log("db@open opening") + start := time.Now() + db := &DB{ + s: s, + // Initial sequence + seq: s.stSeqNum, + // MemDB + memPool: make(chan *memdb.DB, 1), + // Snapshot + snapsList: list.New(), + // Write + batchPool: sync.Pool{New: newBatch}, + writeMergeC: make(chan writeMerge), + writeMergedC: make(chan bool), + writeLockC: make(chan struct{}, 1), + writeAckC: make(chan error), + // Compaction + tcompCmdC: make(chan cCmd), + tcompPauseC: make(chan chan<- struct{}), + mcompCmdC: make(chan cCmd), + compErrC: make(chan error), + compPerErrC: make(chan error), + compErrSetC: make(chan error), + // Close + closeC: make(chan struct{}), + } + + // Read-only mode. + readOnly := s.o.GetReadOnly() + + if readOnly { + // Recover journals (read-only mode). + if err := db.recoverJournalRO(); err != nil { + return nil, err + } + } else { + // Recover journals. + if err := db.recoverJournal(); err != nil { + return nil, err + } + + // Remove any obsolete files. + if err := db.checkAndCleanFiles(); err != nil { + // Close journal. + if db.journal != nil { + db.journal.Close() + db.journalWriter.Close() + } + return nil, err + } + + } + + // Doesn't need to be included in the wait group. + go db.compactionError() + go db.mpoolDrain() + + if readOnly { + db.SetReadOnly() + } else { + db.closeW.Add(2) + go db.tCompaction() + go db.mCompaction() + // go db.jWriter() + } + + s.logf("db@open done T·%v", time.Since(start)) + + runtime.SetFinalizer(db, (*DB).Close) + return db, nil +} + +// Open opens or creates a DB for the given storage. +// The DB will be created if not exist, unless ErrorIfMissing is true. +// Also, if ErrorIfExist is true and the DB exist Open will returns +// os.ErrExist error. +// +// Open will return an error with type of ErrCorrupted if corruption +// detected in the DB. 
Use errors.IsCorrupted to test whether an error is +// due to corruption. Corrupted DB can be recovered with Recover function. +// +// The returned DB instance is safe for concurrent use. +// The DB must be closed after use, by calling Close method. +func Open(stor storage.Storage, o *opt.Options) (db *DB, err error) { + s, err := newSession(stor, o) + if err != nil { + return + } + defer func() { + if err != nil { + s.close() + s.release() + } + }() + + err = s.recover() + if err != nil { + if !os.IsNotExist(err) || s.o.GetErrorIfMissing() || s.o.GetReadOnly() { + return + } + err = s.create() + if err != nil { + return + } + } else if s.o.GetErrorIfExist() { + err = os.ErrExist + return + } + + return openDB(s) +} + +// OpenFile opens or creates a DB for the given path. +// The DB will be created if not exist, unless ErrorIfMissing is true. +// Also, if ErrorIfExist is true and the DB exist OpenFile will returns +// os.ErrExist error. +// +// OpenFile uses standard file-system backed storage implementation as +// described in the leveldb/storage package. +// +// OpenFile will return an error with type of ErrCorrupted if corruption +// detected in the DB. Use errors.IsCorrupted to test whether an error is +// due to corruption. Corrupted DB can be recovered with Recover function. +// +// The returned DB instance is safe for concurrent use. +// The DB must be closed after use, by calling Close method. +func OpenFile(path string, o *opt.Options) (db *DB, err error) { + stor, err := storage.OpenFile(path, o.GetReadOnly()) + if err != nil { + return + } + db, err = Open(stor, o) + if err != nil { + stor.Close() + } else { + db.closer = stor + } + return +} + +// Recover recovers and opens a DB with missing or corrupted manifest files +// for the given storage. It will ignore any manifest files, valid or not. +// The DB must already exist or it will returns an error. +// Also, Recover will ignore ErrorIfMissing and ErrorIfExist options. 
+// +// The returned DB instance is safe for concurrent use. +// The DB must be closed after use, by calling Close method. +func Recover(stor storage.Storage, o *opt.Options) (db *DB, err error) { + s, err := newSession(stor, o) + if err != nil { + return + } + defer func() { + if err != nil { + s.close() + s.release() + } + }() + + err = recoverTable(s, o) + if err != nil { + return + } + return openDB(s) +} + +// RecoverFile recovers and opens a DB with missing or corrupted manifest files +// for the given path. It will ignore any manifest files, valid or not. +// The DB must already exist or it will returns an error. +// Also, Recover will ignore ErrorIfMissing and ErrorIfExist options. +// +// RecoverFile uses standard file-system backed storage implementation as described +// in the leveldb/storage package. +// +// The returned DB instance is safe for concurrent use. +// The DB must be closed after use, by calling Close method. +func RecoverFile(path string, o *opt.Options) (db *DB, err error) { + stor, err := storage.OpenFile(path, false) + if err != nil { + return + } + db, err = Recover(stor, o) + if err != nil { + stor.Close() + } else { + db.closer = stor + } + return +} + +func recoverTable(s *session, o *opt.Options) error { + o = dupOptions(o) + // Mask StrictReader, lets StrictRecovery doing its job. + o.Strict &= ^opt.StrictReader + + // Get all tables and sort it by file number. + fds, err := s.stor.List(storage.TypeTable) + if err != nil { + return err + } + sortFds(fds) + + var ( + maxSeq uint64 + recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int + + // We will drop corrupted table. 
+ strict = o.GetStrict(opt.StrictRecovery) + noSync = o.GetNoSync() + + rec = &sessionRecord{} + bpool = util.NewBufferPool(o.GetBlockSize() + 5) + ) + buildTable := func(iter iterator.Iterator) (tmpFd storage.FileDesc, size int64, err error) { + tmpFd = s.newTemp() + writer, err := s.stor.Create(tmpFd) + if err != nil { + return + } + defer func() { + writer.Close() + if err != nil { + s.stor.Remove(tmpFd) + tmpFd = storage.FileDesc{} + } + }() + + // Copy entries. + tw := table.NewWriter(writer, o) + for iter.Next() { + key := iter.Key() + if validInternalKey(key) { + err = tw.Append(key, iter.Value()) + if err != nil { + return + } + } + } + err = iter.Error() + if err != nil && !errors.IsCorrupted(err) { + return + } + err = tw.Close() + if err != nil { + return + } + if !noSync { + err = writer.Sync() + if err != nil { + return + } + } + size = int64(tw.BytesLen()) + return + } + recoverTable := func(fd storage.FileDesc) error { + s.logf("table@recovery recovering @%d", fd.Num) + reader, err := s.stor.Open(fd) + if err != nil { + return err + } + var closed bool + defer func() { + if !closed { + reader.Close() + } + }() + + // Get file size. + size, err := reader.Seek(0, 2) + if err != nil { + return err + } + + var ( + tSeq uint64 + tgoodKey, tcorruptedKey, tcorruptedBlock int + imin, imax []byte + ) + tr, err := table.NewReader(reader, size, fd, nil, bpool, o) + if err != nil { + return err + } + iter := tr.NewIterator(nil, nil) + if itererr, ok := iter.(iterator.ErrorCallbackSetter); ok { + itererr.SetErrorCallback(func(err error) { + if errors.IsCorrupted(err) { + s.logf("table@recovery block corruption @%d %q", fd.Num, err) + tcorruptedBlock++ + } + }) + } + + // Scan the table. + for iter.Next() { + key := iter.Key() + _, seq, _, kerr := parseInternalKey(key) + if kerr != nil { + tcorruptedKey++ + continue + } + tgoodKey++ + if seq > tSeq { + tSeq = seq + } + if imin == nil { + imin = append([]byte{}, key...) + } + imax = append(imax[:0], key...) 
+ } + if err := iter.Error(); err != nil && !errors.IsCorrupted(err) { + iter.Release() + return err + } + iter.Release() + + goodKey += tgoodKey + corruptedKey += tcorruptedKey + corruptedBlock += tcorruptedBlock + + if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) { + droppedTable++ + s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq) + return nil + } + + if tgoodKey > 0 { + if tcorruptedKey > 0 || tcorruptedBlock > 0 { + // Rebuild the table. + s.logf("table@recovery rebuilding @%d", fd.Num) + iter := tr.NewIterator(nil, nil) + tmpFd, newSize, err := buildTable(iter) + iter.Release() + if err != nil { + return err + } + closed = true + reader.Close() + if err := s.stor.Rename(tmpFd, fd); err != nil { + return err + } + size = newSize + } + if tSeq > maxSeq { + maxSeq = tSeq + } + recoveredKey += tgoodKey + // Add table to level 0. + rec.addTable(0, fd.Num, size, imin, imax) + s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", fd.Num, tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq) + } else { + droppedTable++ + s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", fd.Num, tcorruptedKey, tcorruptedBlock, size) + } + + return nil + } + + // Recover all tables. + if len(fds) > 0 { + s.logf("table@recovery F·%d", len(fds)) + + // Mark file number as used. + s.markFileNum(fds[len(fds)-1].Num) + + for _, fd := range fds { + if err := recoverTable(fd); err != nil { + return err + } + } + + s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(fds), recoveredKey, goodKey, corruptedKey, maxSeq) + } + + // Set sequence number. + rec.setSeqNum(maxSeq) + + // Create new manifest. + if err := s.create(); err != nil { + return err + } + + // Commit. + return s.commit(rec) +} + +func (db *DB) recoverJournal() error { + // Get all journals and sort it by file number. 
+ rawFds, err := db.s.stor.List(storage.TypeJournal) + if err != nil { + return err + } + sortFds(rawFds) + + // Journals that will be recovered. + var fds []storage.FileDesc + for _, fd := range rawFds { + if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum { + fds = append(fds, fd) + } + } + + var ( + ofd storage.FileDesc // Obsolete file. + rec = &sessionRecord{} + ) + + // Recover journals. + if len(fds) > 0 { + db.logf("journal@recovery F·%d", len(fds)) + + // Mark file number as used. + db.s.markFileNum(fds[len(fds)-1].Num) + + var ( + // Options. + strict = db.s.o.GetStrict(opt.StrictJournal) + checksum = db.s.o.GetStrict(opt.StrictJournalChecksum) + writeBuffer = db.s.o.GetWriteBuffer() + + jr *journal.Reader + mdb = memdb.New(db.s.icmp, writeBuffer) + buf = &util.Buffer{} + batchSeq uint64 + batchLen int + ) + + for _, fd := range fds { + db.logf("journal@recovery recovering @%d", fd.Num) + + fr, err := db.s.stor.Open(fd) + if err != nil { + return err + } + + // Create or reset journal reader instance. + if jr == nil { + jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum) + } else { + jr.Reset(fr, dropper{db.s, fd}, strict, checksum) + } + + // Flush memdb and remove obsolete journal file. + if !ofd.Zero() { + if mdb.Len() > 0 { + if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil { + fr.Close() + return err + } + } + + rec.setJournalNum(fd.Num) + rec.setSeqNum(db.seq) + if err := db.s.commit(rec); err != nil { + fr.Close() + return err + } + rec.resetAddedTables() + + db.s.stor.Remove(ofd) + ofd = storage.FileDesc{} + } + + // Replay journal to memdb. + mdb.Reset() + for { + r, err := jr.Next() + if err != nil { + if err == io.EOF { + break + } + + fr.Close() + return errors.SetFd(err, fd) + } + + buf.Reset() + if _, err := buf.ReadFrom(r); err != nil { + if err == io.ErrUnexpectedEOF { + // This is error returned due to corruption, with strict == false. 
+ continue + } + + fr.Close() + return errors.SetFd(err, fd) + } + batchSeq, batchLen, err = decodeBatchToMem(buf.Bytes(), db.seq, mdb) + if err != nil { + if !strict && errors.IsCorrupted(err) { + db.s.logf("journal error: %v (skipped)", err) + // We won't apply sequence number as it might be corrupted. + continue + } + + fr.Close() + return errors.SetFd(err, fd) + } + + // Save sequence number. + db.seq = batchSeq + uint64(batchLen) + + // Flush it if large enough. + if mdb.Size() >= writeBuffer { + if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil { + fr.Close() + return err + } + + mdb.Reset() + } + } + + fr.Close() + ofd = fd + } + + // Flush the last memdb. + if mdb.Len() > 0 { + if _, err := db.s.flushMemdb(rec, mdb, 0); err != nil { + return err + } + } + } + + // Create a new journal. + if _, err := db.newMem(0); err != nil { + return err + } + + // Commit. + rec.setJournalNum(db.journalFd.Num) + rec.setSeqNum(db.seq) + if err := db.s.commit(rec); err != nil { + // Close journal on error. + if db.journal != nil { + db.journal.Close() + db.journalWriter.Close() + } + return err + } + + // Remove the last obsolete journal file. + if !ofd.Zero() { + db.s.stor.Remove(ofd) + } + + return nil +} + +func (db *DB) recoverJournalRO() error { + // Get all journals and sort it by file number. + rawFds, err := db.s.stor.List(storage.TypeJournal) + if err != nil { + return err + } + sortFds(rawFds) + + // Journals that will be recovered. + var fds []storage.FileDesc + for _, fd := range rawFds { + if fd.Num >= db.s.stJournalNum || fd.Num == db.s.stPrevJournalNum { + fds = append(fds, fd) + } + } + + var ( + // Options. + strict = db.s.o.GetStrict(opt.StrictJournal) + checksum = db.s.o.GetStrict(opt.StrictJournalChecksum) + writeBuffer = db.s.o.GetWriteBuffer() + + mdb = memdb.New(db.s.icmp, writeBuffer) + ) + + // Recover journals. 
+ if len(fds) > 0 { + db.logf("journal@recovery RO·Mode F·%d", len(fds)) + + var ( + jr *journal.Reader + buf = &util.Buffer{} + batchSeq uint64 + batchLen int + ) + + for _, fd := range fds { + db.logf("journal@recovery recovering @%d", fd.Num) + + fr, err := db.s.stor.Open(fd) + if err != nil { + return err + } + + // Create or reset journal reader instance. + if jr == nil { + jr = journal.NewReader(fr, dropper{db.s, fd}, strict, checksum) + } else { + jr.Reset(fr, dropper{db.s, fd}, strict, checksum) + } + + // Replay journal to memdb. + for { + r, err := jr.Next() + if err != nil { + if err == io.EOF { + break + } + + fr.Close() + return errors.SetFd(err, fd) + } + + buf.Reset() + if _, err := buf.ReadFrom(r); err != nil { + if err == io.ErrUnexpectedEOF { + // This is error returned due to corruption, with strict == false. + continue + } + + fr.Close() + return errors.SetFd(err, fd) + } + batchSeq, batchLen, err = decodeBatchToMem(buf.Bytes(), db.seq, mdb) + if err != nil { + if !strict && errors.IsCorrupted(err) { + db.s.logf("journal error: %v (skipped)", err) + // We won't apply sequence number as it might be corrupted. + continue + } + + fr.Close() + return errors.SetFd(err, fd) + } + + // Save sequence number. + db.seq = batchSeq + uint64(batchLen) + } + + fr.Close() + } + } + + // Set memDB. + db.mem = &memDB{db: db, DB: mdb, ref: 1} + + return nil +} + +func memGet(mdb *memdb.DB, ikey internalKey, icmp *iComparer) (ok bool, mv []byte, err error) { + mk, mv, err := mdb.Find(ikey) + if err == nil { + ukey, _, kt, kerr := parseInternalKey(mk) + if kerr != nil { + // Shouldn't have had happen. 
+ panic(kerr) + } + if icmp.uCompare(ukey, ikey.ukey()) == 0 { + if kt == keyTypeDel { + return true, nil, ErrNotFound + } + return true, mv, nil + + } + } else if err != ErrNotFound { + return true, nil, err + } + return +} + +func (db *DB) get(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) { + ikey := makeInternalKey(nil, key, seq, keyTypeSeek) + + if auxm != nil { + if ok, mv, me := memGet(auxm, ikey, db.s.icmp); ok { + return append([]byte{}, mv...), me + } + } + + em, fm := db.getMems() + for _, m := range [...]*memDB{em, fm} { + if m == nil { + continue + } + defer m.decref() + + if ok, mv, me := memGet(m.DB, ikey, db.s.icmp); ok { + return append([]byte{}, mv...), me + } + } + + v := db.s.version() + value, cSched, err := v.get(auxt, ikey, ro, false) + v.release() + if cSched { + // Trigger table compaction. + db.compTrigger(db.tcompCmdC) + } + return +} + +func nilIfNotFound(err error) error { + if err == ErrNotFound { + return nil + } + return err +} + +func (db *DB) has(auxm *memdb.DB, auxt tFiles, key []byte, seq uint64, ro *opt.ReadOptions) (ret bool, err error) { + ikey := makeInternalKey(nil, key, seq, keyTypeSeek) + + if auxm != nil { + if ok, _, me := memGet(auxm, ikey, db.s.icmp); ok { + return me == nil, nilIfNotFound(me) + } + } + + em, fm := db.getMems() + for _, m := range [...]*memDB{em, fm} { + if m == nil { + continue + } + defer m.decref() + + if ok, _, me := memGet(m.DB, ikey, db.s.icmp); ok { + return me == nil, nilIfNotFound(me) + } + } + + v := db.s.version() + _, cSched, err := v.get(auxt, ikey, ro, true) + v.release() + if cSched { + // Trigger table compaction. + db.compTrigger(db.tcompCmdC) + } + if err == nil { + ret = true + } else if err == ErrNotFound { + err = nil + } + return +} + +// Get gets the value for the given key. It returns ErrNotFound if the +// DB does not contains the key. 
+// +// The returned slice is its own copy, it is safe to modify the contents +// of the returned slice. +// It is safe to modify the contents of the argument after Get returns. +func (db *DB) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { + err = db.ok() + if err != nil { + return + } + + se := db.acquireSnapshot() + defer db.releaseSnapshot(se) + return db.get(nil, nil, key, se.seq, ro) +} + +// Has returns true if the DB does contains the given key. +// +// It is safe to modify the contents of the argument after Has returns. +func (db *DB) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error) { + err = db.ok() + if err != nil { + return + } + + se := db.acquireSnapshot() + defer db.releaseSnapshot(se) + return db.has(nil, nil, key, se.seq, ro) +} + +// NewIterator returns an iterator for the latest snapshot of the +// underlying DB. +// The returned iterator is not safe for concurrent use, but it is safe to use +// multiple iterators concurrently, with each in a dedicated goroutine. +// It is also safe to use an iterator concurrently with modifying its +// underlying DB. The resultant key/value pairs are guaranteed to be +// consistent. +// +// Slice allows slicing the iterator to only contains keys in the given +// range. A nil Range.Start is treated as a key before all keys in the +// DB. And a nil Range.Limit is treated as a key after all keys in +// the DB. +// +// WARNING: Any slice returned by interator (e.g. slice returned by calling +// Iterator.Key() or Iterator.Key() methods), its content should not be modified +// unless noted otherwise. +// +// The iterator must be released after use, by calling Release method. +// +// Also read Iterator documentation of the leveldb/iterator package. 
+func (db *DB) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + if err := db.ok(); err != nil { + return iterator.NewEmptyIterator(err) + } + + se := db.acquireSnapshot() + defer db.releaseSnapshot(se) + // Iterator holds 'version' lock, 'version' is immutable so snapshot + // can be released after iterator created. + return db.newIterator(nil, nil, se.seq, slice, ro) +} + +// GetSnapshot returns a latest snapshot of the underlying DB. A snapshot +// is a frozen snapshot of a DB state at a particular point in time. The +// content of snapshot are guaranteed to be consistent. +// +// The snapshot must be released after use, by calling Release method. +func (db *DB) GetSnapshot() (*Snapshot, error) { + if err := db.ok(); err != nil { + return nil, err + } + + return db.newSnapshot(), nil +} + +// GetProperty returns value of the given property name. +// +// Property names: +// leveldb.num-files-at-level{n} +// Returns the number of files at level 'n'. +// leveldb.stats +// Returns statistics of the underlying DB. +// leveldb.iostats +// Returns statistics of effective disk read and write. +// leveldb.writedelay +// Returns cumulative write delay caused by compaction. +// leveldb.sstables +// Returns sstables list for each level. +// leveldb.blockpool +// Returns block pool stats. +// leveldb.cachedblock +// Returns size of cached block. +// leveldb.openedtables +// Returns number of opened tables. +// leveldb.alivesnaps +// Returns number of alive snapshots. +// leveldb.aliveiters +// Returns number of alive iterators. +func (db *DB) GetProperty(name string) (value string, err error) { + err = db.ok() + if err != nil { + return + } + + const prefix = "leveldb." 
+ if !strings.HasPrefix(name, prefix) { + return "", ErrNotFound + } + p := name[len(prefix):] + + v := db.s.version() + defer v.release() + + numFilesPrefix := "num-files-at-level" + switch { + case strings.HasPrefix(p, numFilesPrefix): + var level uint + var rest string + n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest) + if n != 1 { + err = ErrNotFound + } else { + value = fmt.Sprint(v.tLen(int(level))) + } + case p == "stats": + value = "Compactions\n" + + " Level | Tables | Size(MB) | Time(sec) | Read(MB) | Write(MB)\n" + + "-------+------------+---------------+---------------+---------------+---------------\n" + for level, tables := range v.levels { + duration, read, write := db.compStats.getStat(level) + if len(tables) == 0 && duration == 0 { + continue + } + value += fmt.Sprintf(" %3d | %10d | %13.5f | %13.5f | %13.5f | %13.5f\n", + level, len(tables), float64(tables.size())/1048576.0, duration.Seconds(), + float64(read)/1048576.0, float64(write)/1048576.0) + } + case p == "iostats": + value = fmt.Sprintf("Read(MB):%.5f Write(MB):%.5f", + float64(db.s.stor.reads())/1048576.0, + float64(db.s.stor.writes())/1048576.0) + case p == "writedelay": + writeDelayN, writeDelay := atomic.LoadInt32(&db.cWriteDelayN), time.Duration(atomic.LoadInt64(&db.cWriteDelay)) + paused := atomic.LoadInt32(&db.inWritePaused) == 1 + value = fmt.Sprintf("DelayN:%d Delay:%s Paused:%t", writeDelayN, writeDelay, paused) + case p == "sstables": + for level, tables := range v.levels { + value += fmt.Sprintf("--- level %d ---\n", level) + for _, t := range tables { + value += fmt.Sprintf("%d:%d[%q .. 
%q]\n", t.fd.Num, t.size, t.imin, t.imax) + } + } + case p == "blockpool": + value = fmt.Sprintf("%v", db.s.tops.bpool) + case p == "cachedblock": + if db.s.tops.bcache != nil { + value = fmt.Sprintf("%d", db.s.tops.bcache.Size()) + } else { + value = "" + } + case p == "openedtables": + value = fmt.Sprintf("%d", db.s.tops.cache.Size()) + case p == "alivesnaps": + value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveSnaps)) + case p == "aliveiters": + value = fmt.Sprintf("%d", atomic.LoadInt32(&db.aliveIters)) + default: + err = ErrNotFound + } + + return +} + +// DBStats is database statistics. +type DBStats struct { + WriteDelayCount int32 + WriteDelayDuration time.Duration + WritePaused bool + + AliveSnapshots int32 + AliveIterators int32 + + IOWrite uint64 + IORead uint64 + + BlockCacheSize int + OpenedTablesCount int + + LevelSizes []int64 + LevelTablesCounts []int + LevelRead []int64 + LevelWrite []int64 + LevelDurations []time.Duration +} + +// Stats populates s with database statistics. 
+func (db *DB) Stats(s *DBStats) error { + err := db.ok() + if err != nil { + return err + } + + s.IORead = db.s.stor.reads() + s.IOWrite = db.s.stor.writes() + s.WriteDelayCount = atomic.LoadInt32(&db.cWriteDelayN) + s.WriteDelayDuration = time.Duration(atomic.LoadInt64(&db.cWriteDelay)) + s.WritePaused = atomic.LoadInt32(&db.inWritePaused) == 1 + + s.OpenedTablesCount = db.s.tops.cache.Size() + if db.s.tops.bcache != nil { + s.BlockCacheSize = db.s.tops.bcache.Size() + } else { + s.BlockCacheSize = 0 + } + + s.AliveIterators = atomic.LoadInt32(&db.aliveIters) + s.AliveSnapshots = atomic.LoadInt32(&db.aliveSnaps) + + s.LevelDurations = s.LevelDurations[:0] + s.LevelRead = s.LevelRead[:0] + s.LevelWrite = s.LevelWrite[:0] + s.LevelSizes = s.LevelSizes[:0] + s.LevelTablesCounts = s.LevelTablesCounts[:0] + + v := db.s.version() + defer v.release() + + for level, tables := range v.levels { + duration, read, write := db.compStats.getStat(level) + if len(tables) == 0 && duration == 0 { + continue + } + s.LevelDurations = append(s.LevelDurations, duration) + s.LevelRead = append(s.LevelRead, read) + s.LevelWrite = append(s.LevelWrite, write) + s.LevelSizes = append(s.LevelSizes, tables.size()) + s.LevelTablesCounts = append(s.LevelTablesCounts, len(tables)) + } + + return nil +} + +// SizeOf calculates approximate sizes of the given key ranges. +// The length of the returned sizes are equal with the length of the given +// ranges. The returned sizes measure storage space usage, so if the user +// data compresses by a factor of ten, the returned sizes will be one-tenth +// the size of the corresponding user data size. +// The results may not include the sizes of recently written data. 
+func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) { + if err := db.ok(); err != nil { + return nil, err + } + + v := db.s.version() + defer v.release() + + sizes := make(Sizes, 0, len(ranges)) + for _, r := range ranges { + imin := makeInternalKey(nil, r.Start, keyMaxSeq, keyTypeSeek) + imax := makeInternalKey(nil, r.Limit, keyMaxSeq, keyTypeSeek) + start, err := v.offsetOf(imin) + if err != nil { + return nil, err + } + limit, err := v.offsetOf(imax) + if err != nil { + return nil, err + } + var size int64 + if limit >= start { + size = limit - start + } + sizes = append(sizes, size) + } + + return sizes, nil +} + +// Close closes the DB. This will also releases any outstanding snapshot, +// abort any in-flight compaction and discard open transaction. +// +// It is not safe to close a DB until all outstanding iterators are released. +// It is valid to call Close multiple times. Other methods should not be +// called after the DB has been closed. +func (db *DB) Close() error { + if !db.setClosed() { + return ErrClosed + } + + start := time.Now() + db.log("db@close closing") + + // Clear the finalizer. + runtime.SetFinalizer(db, nil) + + // Get compaction error. + var err error + select { + case err = <-db.compErrC: + if err == ErrReadOnly { + err = nil + } + default: + } + + // Signal all goroutines. + close(db.closeC) + + // Discard open transaction. + if db.tr != nil { + db.tr.Discard() + } + + // Acquire writer lock. + db.writeLockC <- struct{}{} + + // Wait for all gorotines to exit. + db.closeW.Wait() + + // Closes journal. + if db.journal != nil { + db.journal.Close() + db.journalWriter.Close() + db.journal = nil + db.journalWriter = nil + } + + if db.writeDelayN > 0 { + db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay) + } + + // Close session. 
+ db.s.close() + db.logf("db@close done T·%v", time.Since(start)) + db.s.release() + + if db.closer != nil { + if err1 := db.closer.Close(); err == nil { + err = err1 + } + db.closer = nil + } + + // Clear memdbs. + db.clearMems() + + return err +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go new file mode 100644 index 000000000..0c1b9a53b --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go @@ -0,0 +1,854 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "sync" + "time" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +var ( + errCompactionTransactExiting = errors.New("leveldb: compaction transact exiting") +) + +type cStat struct { + duration time.Duration + read int64 + write int64 +} + +func (p *cStat) add(n *cStatStaging) { + p.duration += n.duration + p.read += n.read + p.write += n.write +} + +func (p *cStat) get() (duration time.Duration, read, write int64) { + return p.duration, p.read, p.write +} + +type cStatStaging struct { + start time.Time + duration time.Duration + on bool + read int64 + write int64 +} + +func (p *cStatStaging) startTimer() { + if !p.on { + p.start = time.Now() + p.on = true + } +} + +func (p *cStatStaging) stopTimer() { + if p.on { + p.duration += time.Since(p.start) + p.on = false + } +} + +type cStats struct { + lk sync.Mutex + stats []cStat +} + +func (p *cStats) addStat(level int, n *cStatStaging) { + p.lk.Lock() + if level >= len(p.stats) { + newStats := make([]cStat, level+1) + copy(newStats, p.stats) + p.stats = newStats + } + p.stats[level].add(n) + p.lk.Unlock() +} + +func (p *cStats) getStat(level int) (duration time.Duration, read, write int64) { + 
p.lk.Lock() + defer p.lk.Unlock() + if level < len(p.stats) { + return p.stats[level].get() + } + return +} + +func (db *DB) compactionError() { + var err error +noerr: + // No error. + for { + select { + case err = <-db.compErrSetC: + switch { + case err == nil: + case err == ErrReadOnly, errors.IsCorrupted(err): + goto hasperr + default: + goto haserr + } + case <-db.closeC: + return + } + } +haserr: + // Transient error. + for { + select { + case db.compErrC <- err: + case err = <-db.compErrSetC: + switch { + case err == nil: + goto noerr + case err == ErrReadOnly, errors.IsCorrupted(err): + goto hasperr + default: + } + case <-db.closeC: + return + } + } +hasperr: + // Persistent error. + for { + select { + case db.compErrC <- err: + case db.compPerErrC <- err: + case db.writeLockC <- struct{}{}: + // Hold write lock, so that write won't pass-through. + db.compWriteLocking = true + case <-db.closeC: + if db.compWriteLocking { + // We should release the lock or Close will hang. + <-db.writeLockC + } + return + } + } +} + +type compactionTransactCounter int + +func (cnt *compactionTransactCounter) incr() { + *cnt++ +} + +type compactionTransactInterface interface { + run(cnt *compactionTransactCounter) error + revert() error +} + +func (db *DB) compactionTransact(name string, t compactionTransactInterface) { + defer func() { + if x := recover(); x != nil { + if x == errCompactionTransactExiting { + if err := t.revert(); err != nil { + db.logf("%s revert error %q", name, err) + } + } + panic(x) + } + }() + + const ( + backoffMin = 1 * time.Second + backoffMax = 8 * time.Second + backoffMul = 2 * time.Second + ) + var ( + backoff = backoffMin + backoffT = time.NewTimer(backoff) + lastCnt = compactionTransactCounter(0) + + disableBackoff = db.s.o.GetDisableCompactionBackoff() + ) + for n := 0; ; n++ { + // Check whether the DB is closed. 
+ if db.isClosed() { + db.logf("%s exiting", name) + db.compactionExitTransact() + } else if n > 0 { + db.logf("%s retrying N·%d", name, n) + } + + // Execute. + cnt := compactionTransactCounter(0) + err := t.run(&cnt) + if err != nil { + db.logf("%s error I·%d %q", name, cnt, err) + } + + // Set compaction error status. + select { + case db.compErrSetC <- err: + case perr := <-db.compPerErrC: + if err != nil { + db.logf("%s exiting (persistent error %q)", name, perr) + db.compactionExitTransact() + } + case <-db.closeC: + db.logf("%s exiting", name) + db.compactionExitTransact() + } + if err == nil { + return + } + if errors.IsCorrupted(err) { + db.logf("%s exiting (corruption detected)", name) + db.compactionExitTransact() + } + + if !disableBackoff { + // Reset backoff duration if counter is advancing. + if cnt > lastCnt { + backoff = backoffMin + lastCnt = cnt + } + + // Backoff. + backoffT.Reset(backoff) + if backoff < backoffMax { + backoff *= backoffMul + if backoff > backoffMax { + backoff = backoffMax + } + } + select { + case <-backoffT.C: + case <-db.closeC: + db.logf("%s exiting", name) + db.compactionExitTransact() + } + } + } +} + +type compactionTransactFunc struct { + runFunc func(cnt *compactionTransactCounter) error + revertFunc func() error +} + +func (t *compactionTransactFunc) run(cnt *compactionTransactCounter) error { + return t.runFunc(cnt) +} + +func (t *compactionTransactFunc) revert() error { + if t.revertFunc != nil { + return t.revertFunc() + } + return nil +} + +func (db *DB) compactionTransactFunc(name string, run func(cnt *compactionTransactCounter) error, revert func() error) { + db.compactionTransact(name, &compactionTransactFunc{run, revert}) +} + +func (db *DB) compactionExitTransact() { + panic(errCompactionTransactExiting) +} + +func (db *DB) compactionCommit(name string, rec *sessionRecord) { + db.compCommitLk.Lock() + defer db.compCommitLk.Unlock() // Defer is necessary. 
+ db.compactionTransactFunc(name+"@commit", func(cnt *compactionTransactCounter) error { + return db.s.commit(rec) + }, nil) +} + +func (db *DB) memCompaction() { + mdb := db.getFrozenMem() + if mdb == nil { + return + } + defer mdb.decref() + + db.logf("memdb@flush N·%d S·%s", mdb.Len(), shortenb(mdb.Size())) + + // Don't compact empty memdb. + if mdb.Len() == 0 { + db.logf("memdb@flush skipping") + // drop frozen memdb + db.dropFrozenMem() + return + } + + // Pause table compaction. + resumeC := make(chan struct{}) + select { + case db.tcompPauseC <- (chan<- struct{})(resumeC): + case <-db.compPerErrC: + close(resumeC) + resumeC = nil + case <-db.closeC: + db.compactionExitTransact() + } + + var ( + rec = &sessionRecord{} + stats = &cStatStaging{} + flushLevel int + ) + + // Generate tables. + db.compactionTransactFunc("memdb@flush", func(cnt *compactionTransactCounter) (err error) { + stats.startTimer() + flushLevel, err = db.s.flushMemdb(rec, mdb.DB, db.memdbMaxLevel) + stats.stopTimer() + return + }, func() error { + for _, r := range rec.addedTables { + db.logf("memdb@flush revert @%d", r.num) + if err := db.s.stor.Remove(storage.FileDesc{Type: storage.TypeTable, Num: r.num}); err != nil { + return err + } + } + return nil + }) + + rec.setJournalNum(db.journalFd.Num) + rec.setSeqNum(db.frozenSeq) + + // Commit. + stats.startTimer() + db.compactionCommit("memdb", rec) + stats.stopTimer() + + db.logf("memdb@flush committed F·%d T·%v", len(rec.addedTables), stats.duration) + + for _, r := range rec.addedTables { + stats.write += r.size + } + db.compStats.addStat(flushLevel, stats) + + // Drop frozen memdb. + db.dropFrozenMem() + + // Resume table compaction. + if resumeC != nil { + select { + case <-resumeC: + close(resumeC) + case <-db.closeC: + db.compactionExitTransact() + } + } + + // Trigger table compaction. 
+ db.compTrigger(db.tcompCmdC) +} + +type tableCompactionBuilder struct { + db *DB + s *session + c *compaction + rec *sessionRecord + stat0, stat1 *cStatStaging + + snapHasLastUkey bool + snapLastUkey []byte + snapLastSeq uint64 + snapIter int + snapKerrCnt int + snapDropCnt int + + kerrCnt int + dropCnt int + + minSeq uint64 + strict bool + tableSize int + + tw *tWriter +} + +func (b *tableCompactionBuilder) appendKV(key, value []byte) error { + // Create new table if not already. + if b.tw == nil { + // Check for pause event. + if b.db != nil { + select { + case ch := <-b.db.tcompPauseC: + b.db.pauseCompaction(ch) + case <-b.db.closeC: + b.db.compactionExitTransact() + default: + } + } + + // Create new table. + var err error + b.tw, err = b.s.tops.create() + if err != nil { + return err + } + } + + // Write key/value into table. + return b.tw.append(key, value) +} + +func (b *tableCompactionBuilder) needFlush() bool { + return b.tw.tw.BytesLen() >= b.tableSize +} + +func (b *tableCompactionBuilder) flush() error { + t, err := b.tw.finish() + if err != nil { + return err + } + b.rec.addTableFile(b.c.sourceLevel+1, t) + b.stat1.write += t.size + b.s.logf("table@build created L%d@%d N·%d S·%s %q:%q", b.c.sourceLevel+1, t.fd.Num, b.tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax) + b.tw = nil + return nil +} + +func (b *tableCompactionBuilder) cleanup() { + if b.tw != nil { + b.tw.drop() + b.tw = nil + } +} + +func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error { + snapResumed := b.snapIter > 0 + hasLastUkey := b.snapHasLastUkey // The key might has zero length, so this is necessary. + lastUkey := append([]byte{}, b.snapLastUkey...) + lastSeq := b.snapLastSeq + b.kerrCnt = b.snapKerrCnt + b.dropCnt = b.snapDropCnt + // Restore compaction state. 
+ b.c.restore() + + defer b.cleanup() + + b.stat1.startTimer() + defer b.stat1.stopTimer() + + iter := b.c.newIterator() + defer iter.Release() + for i := 0; iter.Next(); i++ { + // Incr transact counter. + cnt.incr() + + // Skip until last state. + if i < b.snapIter { + continue + } + + resumed := false + if snapResumed { + resumed = true + snapResumed = false + } + + ikey := iter.Key() + ukey, seq, kt, kerr := parseInternalKey(ikey) + + if kerr == nil { + shouldStop := !resumed && b.c.shouldStopBefore(ikey) + + if !hasLastUkey || b.s.icmp.uCompare(lastUkey, ukey) != 0 { + // First occurrence of this user key. + + // Only rotate tables if ukey doesn't hop across. + if b.tw != nil && (shouldStop || b.needFlush()) { + if err := b.flush(); err != nil { + return err + } + + // Creates snapshot of the state. + b.c.save() + b.snapHasLastUkey = hasLastUkey + b.snapLastUkey = append(b.snapLastUkey[:0], lastUkey...) + b.snapLastSeq = lastSeq + b.snapIter = i + b.snapKerrCnt = b.kerrCnt + b.snapDropCnt = b.dropCnt + } + + hasLastUkey = true + lastUkey = append(lastUkey[:0], ukey...) + lastSeq = keyMaxSeq + } + + switch { + case lastSeq <= b.minSeq: + // Dropped because newer entry for same user key exist + fallthrough // (A) + case kt == keyTypeDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey): + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger seq numbers + // (3) data in layers that are being compacted here and have + // smaller seq numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + lastSeq = seq + b.dropCnt++ + continue + default: + lastSeq = seq + } + } else { + if b.strict { + return kerr + } + + // Don't drop corrupted keys. 
+ hasLastUkey = false + lastUkey = lastUkey[:0] + lastSeq = keyMaxSeq + b.kerrCnt++ + } + + if err := b.appendKV(ikey, iter.Value()); err != nil { + return err + } + } + + if err := iter.Error(); err != nil { + return err + } + + // Finish last table. + if b.tw != nil && !b.tw.empty() { + return b.flush() + } + return nil +} + +func (b *tableCompactionBuilder) revert() error { + for _, at := range b.rec.addedTables { + b.s.logf("table@build revert @%d", at.num) + if err := b.s.stor.Remove(storage.FileDesc{Type: storage.TypeTable, Num: at.num}); err != nil { + return err + } + } + return nil +} + +func (db *DB) tableCompaction(c *compaction, noTrivial bool) { + defer c.release() + + rec := &sessionRecord{} + rec.addCompPtr(c.sourceLevel, c.imax) + + if !noTrivial && c.trivial() { + t := c.levels[0][0] + db.logf("table@move L%d@%d -> L%d", c.sourceLevel, t.fd.Num, c.sourceLevel+1) + rec.delTable(c.sourceLevel, t.fd.Num) + rec.addTableFile(c.sourceLevel+1, t) + db.compactionCommit("table-move", rec) + return + } + + var stats [2]cStatStaging + for i, tables := range c.levels { + for _, t := range tables { + stats[i].read += t.size + // Insert deleted tables into record + rec.delTable(c.sourceLevel+i, t.fd.Num) + } + } + sourceSize := int(stats[0].read + stats[1].read) + minSeq := db.minSeq() + db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.sourceLevel, len(c.levels[0]), c.sourceLevel+1, len(c.levels[1]), shortenb(sourceSize), minSeq) + + b := &tableCompactionBuilder{ + db: db, + s: db.s, + c: c, + rec: rec, + stat1: &stats[1], + minSeq: minSeq, + strict: db.s.o.GetStrict(opt.StrictCompaction), + tableSize: db.s.o.GetCompactionTableSize(c.sourceLevel + 1), + } + db.compactionTransact("table@build", b) + + // Commit. 
+ stats[1].startTimer() + db.compactionCommit("table", rec) + stats[1].stopTimer() + + resultSize := int(stats[1].write) + db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration) + + // Save compaction stats + for i := range stats { + db.compStats.addStat(c.sourceLevel+1, &stats[i]) + } +} + +func (db *DB) tableRangeCompaction(level int, umin, umax []byte) error { + db.logf("table@compaction range L%d %q:%q", level, umin, umax) + if level >= 0 { + if c := db.s.getCompactionRange(level, umin, umax, true); c != nil { + db.tableCompaction(c, true) + } + } else { + // Retry until nothing to compact. + for { + compacted := false + + // Scan for maximum level with overlapped tables. + v := db.s.version() + m := 1 + for i := m; i < len(v.levels); i++ { + tables := v.levels[i] + if tables.overlaps(db.s.icmp, umin, umax, false) { + m = i + } + } + v.release() + + for level := 0; level < m; level++ { + if c := db.s.getCompactionRange(level, umin, umax, false); c != nil { + db.tableCompaction(c, true) + compacted = true + } + } + + if !compacted { + break + } + } + } + + return nil +} + +func (db *DB) tableAutoCompaction() { + if c := db.s.pickCompaction(); c != nil { + db.tableCompaction(c, false) + } +} + +func (db *DB) tableNeedCompaction() bool { + v := db.s.version() + defer v.release() + return v.needCompaction() +} + +// resumeWrite returns an indicator whether we should resume write operation if enough level0 files are compacted. 
+func (db *DB) resumeWrite() bool { + v := db.s.version() + defer v.release() + if v.tLen(0) < db.s.o.GetWriteL0PauseTrigger() { + return true + } + return false +} + +func (db *DB) pauseCompaction(ch chan<- struct{}) { + select { + case ch <- struct{}{}: + case <-db.closeC: + db.compactionExitTransact() + } +} + +type cCmd interface { + ack(err error) +} + +type cAuto struct { + // Note for table compaction, an non-empty ackC represents it's a compaction waiting command. + ackC chan<- error +} + +func (r cAuto) ack(err error) { + if r.ackC != nil { + defer func() { + recover() + }() + r.ackC <- err + } +} + +type cRange struct { + level int + min, max []byte + ackC chan<- error +} + +func (r cRange) ack(err error) { + if r.ackC != nil { + defer func() { + recover() + }() + r.ackC <- err + } +} + +// This will trigger auto compaction but will not wait for it. +func (db *DB) compTrigger(compC chan<- cCmd) { + select { + case compC <- cAuto{}: + default: + } +} + +// This will trigger auto compaction and/or wait for all compaction to be done. +func (db *DB) compTriggerWait(compC chan<- cCmd) (err error) { + ch := make(chan error) + defer close(ch) + // Send cmd. + select { + case compC <- cAuto{ch}: + case err = <-db.compErrC: + return + case <-db.closeC: + return ErrClosed + } + // Wait cmd. + select { + case err = <-ch: + case err = <-db.compErrC: + case <-db.closeC: + return ErrClosed + } + return err +} + +// Send range compaction request. +func (db *DB) compTriggerRange(compC chan<- cCmd, level int, min, max []byte) (err error) { + ch := make(chan error) + defer close(ch) + // Send cmd. + select { + case compC <- cRange{level, min, max, ch}: + case err := <-db.compErrC: + return err + case <-db.closeC: + return ErrClosed + } + // Wait cmd. 
+ select { + case err = <-ch: + case err = <-db.compErrC: + case <-db.closeC: + return ErrClosed + } + return err +} + +func (db *DB) mCompaction() { + var x cCmd + + defer func() { + if x := recover(); x != nil { + if x != errCompactionTransactExiting { + panic(x) + } + } + if x != nil { + x.ack(ErrClosed) + } + db.closeW.Done() + }() + + for { + select { + case x = <-db.mcompCmdC: + switch x.(type) { + case cAuto: + db.memCompaction() + x.ack(nil) + x = nil + default: + panic("leveldb: unknown command") + } + case <-db.closeC: + return + } + } +} + +func (db *DB) tCompaction() { + var ( + x cCmd + waitQ []cCmd + ) + + defer func() { + if x := recover(); x != nil { + if x != errCompactionTransactExiting { + panic(x) + } + } + for i := range waitQ { + waitQ[i].ack(ErrClosed) + waitQ[i] = nil + } + if x != nil { + x.ack(ErrClosed) + } + db.closeW.Done() + }() + + for { + if db.tableNeedCompaction() { + select { + case x = <-db.tcompCmdC: + case ch := <-db.tcompPauseC: + db.pauseCompaction(ch) + continue + case <-db.closeC: + return + default: + } + // Resume write operation as soon as possible. + if len(waitQ) > 0 && db.resumeWrite() { + for i := range waitQ { + waitQ[i].ack(nil) + waitQ[i] = nil + } + waitQ = waitQ[:0] + } + } else { + for i := range waitQ { + waitQ[i].ack(nil) + waitQ[i] = nil + } + waitQ = waitQ[:0] + select { + case x = <-db.tcompCmdC: + case ch := <-db.tcompPauseC: + db.pauseCompaction(ch) + continue + case <-db.closeC: + return + } + } + if x != nil { + switch cmd := x.(type) { + case cAuto: + if cmd.ackC != nil { + // Check the write pause state before caching it. 
+ if db.resumeWrite() { + x.ack(nil) + } else { + waitQ = append(waitQ, x) + } + } + case cRange: + x.ack(db.tableRangeCompaction(cmd.level, cmd.min, cmd.max)) + default: + panic("leveldb: unknown command") + } + x = nil + } + db.tableAutoCompaction() + } +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_iter.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_iter.go new file mode 100644 index 000000000..03c24cdab --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_iter.go @@ -0,0 +1,360 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "errors" + "math/rand" + "runtime" + "sync" + "sync/atomic" + + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +var ( + errInvalidInternalKey = errors.New("leveldb: Iterator: invalid internal key") +) + +type memdbReleaser struct { + once sync.Once + m *memDB +} + +func (mr *memdbReleaser) Release() { + mr.once.Do(func() { + mr.m.decref() + }) +} + +func (db *DB) newRawIterator(auxm *memDB, auxt tFiles, slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + strict := opt.GetStrict(db.s.o.Options, ro, opt.StrictReader) + em, fm := db.getMems() + v := db.s.version() + + tableIts := v.getIterators(slice, ro) + n := len(tableIts) + len(auxt) + 3 + its := make([]iterator.Iterator, 0, n) + + if auxm != nil { + ami := auxm.NewIterator(slice) + ami.SetReleaser(&memdbReleaser{m: auxm}) + its = append(its, ami) + } + for _, t := range auxt { + its = append(its, v.s.tops.newIterator(t, slice, ro)) + } + + emi := em.NewIterator(slice) + emi.SetReleaser(&memdbReleaser{m: em}) + its = append(its, emi) + if fm != nil { + fmi := fm.NewIterator(slice) + fmi.SetReleaser(&memdbReleaser{m: fm}) + its = append(its, fmi) + } + its = append(its, tableIts...) 
+ mi := iterator.NewMergedIterator(its, db.s.icmp, strict) + mi.SetReleaser(&versionReleaser{v: v}) + return mi +} + +func (db *DB) newIterator(auxm *memDB, auxt tFiles, seq uint64, slice *util.Range, ro *opt.ReadOptions) *dbIter { + var islice *util.Range + if slice != nil { + islice = &util.Range{} + if slice.Start != nil { + islice.Start = makeInternalKey(nil, slice.Start, keyMaxSeq, keyTypeSeek) + } + if slice.Limit != nil { + islice.Limit = makeInternalKey(nil, slice.Limit, keyMaxSeq, keyTypeSeek) + } + } + rawIter := db.newRawIterator(auxm, auxt, islice, ro) + iter := &dbIter{ + db: db, + icmp: db.s.icmp, + iter: rawIter, + seq: seq, + strict: opt.GetStrict(db.s.o.Options, ro, opt.StrictReader), + key: make([]byte, 0), + value: make([]byte, 0), + } + atomic.AddInt32(&db.aliveIters, 1) + runtime.SetFinalizer(iter, (*dbIter).Release) + return iter +} + +func (db *DB) iterSamplingRate() int { + return rand.Intn(2 * db.s.o.GetIteratorSamplingRate()) +} + +type dir int + +const ( + dirReleased dir = iota - 1 + dirSOI + dirEOI + dirBackward + dirForward +) + +// dbIter represent an interator states over a database session. 
+type dbIter struct { + db *DB + icmp *iComparer + iter iterator.Iterator + seq uint64 + strict bool + + smaplingGap int + dir dir + key []byte + value []byte + err error + releaser util.Releaser +} + +func (i *dbIter) sampleSeek() { + ikey := i.iter.Key() + i.smaplingGap -= len(ikey) + len(i.iter.Value()) + for i.smaplingGap < 0 { + i.smaplingGap += i.db.iterSamplingRate() + i.db.sampleSeek(ikey) + } +} + +func (i *dbIter) setErr(err error) { + i.err = err + i.key = nil + i.value = nil +} + +func (i *dbIter) iterErr() { + if err := i.iter.Error(); err != nil { + i.setErr(err) + } +} + +func (i *dbIter) Valid() bool { + return i.err == nil && i.dir > dirEOI +} + +func (i *dbIter) First() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if i.iter.First() { + i.dir = dirSOI + return i.next() + } + i.dir = dirEOI + i.iterErr() + return false +} + +func (i *dbIter) Last() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if i.iter.Last() { + return i.prev() + } + i.dir = dirSOI + i.iterErr() + return false +} + +func (i *dbIter) Seek(key []byte) bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + ikey := makeInternalKey(nil, key, i.seq, keyTypeSeek) + if i.iter.Seek(ikey) { + i.dir = dirSOI + return i.next() + } + i.dir = dirEOI + i.iterErr() + return false +} + +func (i *dbIter) next() bool { + for { + if ukey, seq, kt, kerr := parseInternalKey(i.iter.Key()); kerr == nil { + i.sampleSeek() + if seq <= i.seq { + switch kt { + case keyTypeDel: + // Skip deleted key. + i.key = append(i.key[:0], ukey...) + i.dir = dirForward + case keyTypeVal: + if i.dir == dirSOI || i.icmp.uCompare(ukey, i.key) > 0 { + i.key = append(i.key[:0], ukey...) + i.value = append(i.value[:0], i.iter.Value()...) 
+ i.dir = dirForward + return true + } + } + } + } else if i.strict { + i.setErr(kerr) + break + } + if !i.iter.Next() { + i.dir = dirEOI + i.iterErr() + break + } + } + return false +} + +func (i *dbIter) Next() bool { + if i.dir == dirEOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if !i.iter.Next() || (i.dir == dirBackward && !i.iter.Next()) { + i.dir = dirEOI + i.iterErr() + return false + } + return i.next() +} + +func (i *dbIter) prev() bool { + i.dir = dirBackward + del := true + if i.iter.Valid() { + for { + if ukey, seq, kt, kerr := parseInternalKey(i.iter.Key()); kerr == nil { + i.sampleSeek() + if seq <= i.seq { + if !del && i.icmp.uCompare(ukey, i.key) < 0 { + return true + } + del = (kt == keyTypeDel) + if !del { + i.key = append(i.key[:0], ukey...) + i.value = append(i.value[:0], i.iter.Value()...) + } + } + } else if i.strict { + i.setErr(kerr) + return false + } + if !i.iter.Prev() { + break + } + } + } + if del { + i.dir = dirSOI + i.iterErr() + return false + } + return true +} + +func (i *dbIter) Prev() bool { + if i.dir == dirSOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + switch i.dir { + case dirEOI: + return i.Last() + case dirForward: + for i.iter.Prev() { + if ukey, _, _, kerr := parseInternalKey(i.iter.Key()); kerr == nil { + i.sampleSeek() + if i.icmp.uCompare(ukey, i.key) < 0 { + goto cont + } + } else if i.strict { + i.setErr(kerr) + return false + } + } + i.dir = dirSOI + i.iterErr() + return false + } + +cont: + return i.prev() +} + +func (i *dbIter) Key() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.key +} + +func (i *dbIter) Value() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.value +} + +func (i *dbIter) Release() { + if i.dir != dirReleased { + // Clear the finalizer. 
+ runtime.SetFinalizer(i, nil) + + if i.releaser != nil { + i.releaser.Release() + i.releaser = nil + } + + i.dir = dirReleased + i.key = nil + i.value = nil + i.iter.Release() + i.iter = nil + atomic.AddInt32(&i.db.aliveIters, -1) + i.db = nil + } +} + +func (i *dbIter) SetReleaser(releaser util.Releaser) { + if i.dir == dirReleased { + panic(util.ErrReleased) + } + if i.releaser != nil && releaser != nil { + panic(util.ErrHasReleaser) + } + i.releaser = releaser +} + +func (i *dbIter) Error() error { + return i.err +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_snapshot.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_snapshot.go new file mode 100644 index 000000000..c2ad70c84 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_snapshot.go @@ -0,0 +1,187 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "container/list" + "fmt" + "runtime" + "sync" + "sync/atomic" + + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +type snapshotElement struct { + seq uint64 + ref int + e *list.Element +} + +// Acquires a snapshot, based on latest sequence. +func (db *DB) acquireSnapshot() *snapshotElement { + db.snapsMu.Lock() + defer db.snapsMu.Unlock() + + seq := db.getSeq() + + if e := db.snapsList.Back(); e != nil { + se := e.Value.(*snapshotElement) + if se.seq == seq { + se.ref++ + return se + } else if seq < se.seq { + panic("leveldb: sequence number is not increasing") + } + } + se := &snapshotElement{seq: seq, ref: 1} + se.e = db.snapsList.PushBack(se) + return se +} + +// Releases given snapshot element. 
+func (db *DB) releaseSnapshot(se *snapshotElement) { + db.snapsMu.Lock() + defer db.snapsMu.Unlock() + + se.ref-- + if se.ref == 0 { + db.snapsList.Remove(se.e) + se.e = nil + } else if se.ref < 0 { + panic("leveldb: Snapshot: negative element reference") + } +} + +// Gets minimum sequence that not being snapshotted. +func (db *DB) minSeq() uint64 { + db.snapsMu.Lock() + defer db.snapsMu.Unlock() + + if e := db.snapsList.Front(); e != nil { + return e.Value.(*snapshotElement).seq + } + + return db.getSeq() +} + +// Snapshot is a DB snapshot. +type Snapshot struct { + db *DB + elem *snapshotElement + mu sync.RWMutex + released bool +} + +// Creates new snapshot object. +func (db *DB) newSnapshot() *Snapshot { + snap := &Snapshot{ + db: db, + elem: db.acquireSnapshot(), + } + atomic.AddInt32(&db.aliveSnaps, 1) + runtime.SetFinalizer(snap, (*Snapshot).Release) + return snap +} + +func (snap *Snapshot) String() string { + return fmt.Sprintf("leveldb.Snapshot{%d}", snap.elem.seq) +} + +// Get gets the value for the given key. It returns ErrNotFound if +// the DB does not contains the key. +// +// The caller should not modify the contents of the returned slice, but +// it is safe to modify the contents of the argument after Get returns. +func (snap *Snapshot) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { + err = snap.db.ok() + if err != nil { + return + } + snap.mu.RLock() + defer snap.mu.RUnlock() + if snap.released { + err = ErrSnapshotReleased + return + } + return snap.db.get(nil, nil, key, snap.elem.seq, ro) +} + +// Has returns true if the DB does contains the given key. +// +// It is safe to modify the contents of the argument after Get returns. 
+func (snap *Snapshot) Has(key []byte, ro *opt.ReadOptions) (ret bool, err error) { + err = snap.db.ok() + if err != nil { + return + } + snap.mu.RLock() + defer snap.mu.RUnlock() + if snap.released { + err = ErrSnapshotReleased + return + } + return snap.db.has(nil, nil, key, snap.elem.seq, ro) +} + +// NewIterator returns an iterator for the snapshot of the underlying DB. +// The returned iterator is not safe for concurrent use, but it is safe to use +// multiple iterators concurrently, with each in a dedicated goroutine. +// It is also safe to use an iterator concurrently with modifying its +// underlying DB. The resultant key/value pairs are guaranteed to be +// consistent. +// +// Slice allows slicing the iterator to only contains keys in the given +// range. A nil Range.Start is treated as a key before all keys in the +// DB. And a nil Range.Limit is treated as a key after all keys in +// the DB. +// +// WARNING: Any slice returned by interator (e.g. slice returned by calling +// Iterator.Key() or Iterator.Value() methods), its content should not be +// modified unless noted otherwise. +// +// The iterator must be released after use, by calling Release method. +// Releasing the snapshot doesn't mean releasing the iterator too, the +// iterator would be still valid until released. +// +// Also read Iterator documentation of the leveldb/iterator package. +func (snap *Snapshot) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + if err := snap.db.ok(); err != nil { + return iterator.NewEmptyIterator(err) + } + snap.mu.Lock() + defer snap.mu.Unlock() + if snap.released { + return iterator.NewEmptyIterator(ErrSnapshotReleased) + } + // Since iterator already hold version ref, it doesn't need to + // hold snapshot ref. + return snap.db.newIterator(nil, nil, snap.elem.seq, slice, ro) +} + +// Release releases the snapshot. 
This will not release any returned +// iterators, the iterators would still be valid until released or the +// underlying DB is closed. +// +// Other methods should not be called after the snapshot has been released. +func (snap *Snapshot) Release() { + snap.mu.Lock() + defer snap.mu.Unlock() + + if !snap.released { + // Clear the finalizer. + runtime.SetFinalizer(snap, nil) + + snap.released = true + snap.db.releaseSnapshot(snap.elem) + atomic.AddInt32(&snap.db.aliveSnaps, -1) + snap.db = nil + snap.elem = nil + } +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_state.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_state.go new file mode 100644 index 000000000..65e1c54bb --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_state.go @@ -0,0 +1,239 @@ +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "errors" + "sync/atomic" + "time" + + "github.com/syndtr/goleveldb/leveldb/journal" + "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +var ( + errHasFrozenMem = errors.New("has frozen mem") +) + +type memDB struct { + db *DB + *memdb.DB + ref int32 +} + +func (m *memDB) getref() int32 { + return atomic.LoadInt32(&m.ref) +} + +func (m *memDB) incref() { + atomic.AddInt32(&m.ref, 1) +} + +func (m *memDB) decref() { + if ref := atomic.AddInt32(&m.ref, -1); ref == 0 { + // Only put back memdb with std capacity. + if m.Capacity() == m.db.s.o.GetWriteBuffer() { + m.Reset() + m.db.mpoolPut(m.DB) + } + m.db = nil + m.DB = nil + } else if ref < 0 { + panic("negative memdb ref") + } +} + +// Get latest sequence number. +func (db *DB) getSeq() uint64 { + return atomic.LoadUint64(&db.seq) +} + +// Atomically adds delta to seq. 
+func (db *DB) addSeq(delta uint64) { + atomic.AddUint64(&db.seq, delta) +} + +func (db *DB) setSeq(seq uint64) { + atomic.StoreUint64(&db.seq, seq) +} + +func (db *DB) sampleSeek(ikey internalKey) { + v := db.s.version() + if v.sampleSeek(ikey) { + // Trigger table compaction. + db.compTrigger(db.tcompCmdC) + } + v.release() +} + +func (db *DB) mpoolPut(mem *memdb.DB) { + if !db.isClosed() { + select { + case db.memPool <- mem: + default: + } + } +} + +func (db *DB) mpoolGet(n int) *memDB { + var mdb *memdb.DB + select { + case mdb = <-db.memPool: + default: + } + if mdb == nil || mdb.Capacity() < n { + mdb = memdb.New(db.s.icmp, maxInt(db.s.o.GetWriteBuffer(), n)) + } + return &memDB{ + db: db, + DB: mdb, + } +} + +func (db *DB) mpoolDrain() { + ticker := time.NewTicker(30 * time.Second) + for { + select { + case <-ticker.C: + select { + case <-db.memPool: + default: + } + case <-db.closeC: + ticker.Stop() + // Make sure the pool is drained. + select { + case <-db.memPool: + case <-time.After(time.Second): + } + close(db.memPool) + return + } + } +} + +// Create new memdb and froze the old one; need external synchronization. +// newMem only called synchronously by the writer. +func (db *DB) newMem(n int) (mem *memDB, err error) { + fd := storage.FileDesc{Type: storage.TypeJournal, Num: db.s.allocFileNum()} + w, err := db.s.stor.Create(fd) + if err != nil { + db.s.reuseFileNum(fd.Num) + return + } + + db.memMu.Lock() + defer db.memMu.Unlock() + + if db.frozenMem != nil { + return nil, errHasFrozenMem + } + + if db.journal == nil { + db.journal = journal.NewWriter(w) + } else { + db.journal.Reset(w) + db.journalWriter.Close() + db.frozenJournalFd = db.journalFd + } + db.journalWriter = w + db.journalFd = fd + db.frozenMem = db.mem + mem = db.mpoolGet(n) + mem.incref() // for self + mem.incref() // for caller + db.mem = mem + // The seq only incremented by the writer. And whoever called newMem + // should hold write lock, so no need additional synchronization here. 
+ db.frozenSeq = db.seq + return +} + +// Get all memdbs. +func (db *DB) getMems() (e, f *memDB) { + db.memMu.RLock() + defer db.memMu.RUnlock() + if db.mem != nil { + db.mem.incref() + } else if !db.isClosed() { + panic("nil effective mem") + } + if db.frozenMem != nil { + db.frozenMem.incref() + } + return db.mem, db.frozenMem +} + +// Get effective memdb. +func (db *DB) getEffectiveMem() *memDB { + db.memMu.RLock() + defer db.memMu.RUnlock() + if db.mem != nil { + db.mem.incref() + } else if !db.isClosed() { + panic("nil effective mem") + } + return db.mem +} + +// Check whether we has frozen memdb. +func (db *DB) hasFrozenMem() bool { + db.memMu.RLock() + defer db.memMu.RUnlock() + return db.frozenMem != nil +} + +// Get frozen memdb. +func (db *DB) getFrozenMem() *memDB { + db.memMu.RLock() + defer db.memMu.RUnlock() + if db.frozenMem != nil { + db.frozenMem.incref() + } + return db.frozenMem +} + +// Drop frozen memdb; assume that frozen memdb isn't nil. +func (db *DB) dropFrozenMem() { + db.memMu.Lock() + if err := db.s.stor.Remove(db.frozenJournalFd); err != nil { + db.logf("journal@remove removing @%d %q", db.frozenJournalFd.Num, err) + } else { + db.logf("journal@remove removed @%d", db.frozenJournalFd.Num) + } + db.frozenJournalFd = storage.FileDesc{} + db.frozenMem.decref() + db.frozenMem = nil + db.memMu.Unlock() +} + +// Clear mems ptr; used by DB.Close(). +func (db *DB) clearMems() { + db.memMu.Lock() + db.mem = nil + db.frozenMem = nil + db.memMu.Unlock() +} + +// Set closed flag; return true if not already closed. +func (db *DB) setClosed() bool { + return atomic.CompareAndSwapUint32(&db.closed, 0, 1) +} + +// Check whether DB was closed. +func (db *DB) isClosed() bool { + return atomic.LoadUint32(&db.closed) != 0 +} + +// Check read ok status. 
+func (db *DB) ok() error { + if db.isClosed() { + return ErrClosed + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_transaction.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_transaction.go new file mode 100644 index 000000000..1a0000188 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_transaction.go @@ -0,0 +1,329 @@ +// Copyright (c) 2016, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "errors" + "sync" + "time" + + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +var errTransactionDone = errors.New("leveldb: transaction already closed") + +// Transaction is the transaction handle. +type Transaction struct { + db *DB + lk sync.RWMutex + seq uint64 + mem *memDB + tables tFiles + ikScratch []byte + rec sessionRecord + stats cStatStaging + closed bool +} + +// Get gets the value for the given key. It returns ErrNotFound if the +// DB does not contains the key. +// +// The returned slice is its own copy, it is safe to modify the contents +// of the returned slice. +// It is safe to modify the contents of the argument after Get returns. +func (tr *Transaction) Get(key []byte, ro *opt.ReadOptions) ([]byte, error) { + tr.lk.RLock() + defer tr.lk.RUnlock() + if tr.closed { + return nil, errTransactionDone + } + return tr.db.get(tr.mem.DB, tr.tables, key, tr.seq, ro) +} + +// Has returns true if the DB does contains the given key. +// +// It is safe to modify the contents of the argument after Has returns. 
+func (tr *Transaction) Has(key []byte, ro *opt.ReadOptions) (bool, error) { + tr.lk.RLock() + defer tr.lk.RUnlock() + if tr.closed { + return false, errTransactionDone + } + return tr.db.has(tr.mem.DB, tr.tables, key, tr.seq, ro) +} + +// NewIterator returns an iterator for the latest snapshot of the transaction. +// The returned iterator is not safe for concurrent use, but it is safe to use +// multiple iterators concurrently, with each in a dedicated goroutine. +// It is also safe to use an iterator concurrently while writes to the +// transaction. The resultant key/value pairs are guaranteed to be consistent. +// +// Slice allows slicing the iterator to only contains keys in the given +// range. A nil Range.Start is treated as a key before all keys in the +// DB. And a nil Range.Limit is treated as a key after all keys in +// the DB. +// +// WARNING: Any slice returned by interator (e.g. slice returned by calling +// Iterator.Key() or Iterator.Key() methods), its content should not be modified +// unless noted otherwise. +// +// The iterator must be released after use, by calling Release method. +// +// Also read Iterator documentation of the leveldb/iterator package. +func (tr *Transaction) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + tr.lk.RLock() + defer tr.lk.RUnlock() + if tr.closed { + return iterator.NewEmptyIterator(errTransactionDone) + } + tr.mem.incref() + return tr.db.newIterator(tr.mem, tr.tables, tr.seq, slice, ro) +} + +func (tr *Transaction) flush() error { + // Flush memdb. 
+ if tr.mem.Len() != 0 { + tr.stats.startTimer() + iter := tr.mem.NewIterator(nil) + t, n, err := tr.db.s.tops.createFrom(iter) + iter.Release() + tr.stats.stopTimer() + if err != nil { + return err + } + if tr.mem.getref() == 1 { + tr.mem.Reset() + } else { + tr.mem.decref() + tr.mem = tr.db.mpoolGet(0) + tr.mem.incref() + } + tr.tables = append(tr.tables, t) + tr.rec.addTableFile(0, t) + tr.stats.write += t.size + tr.db.logf("transaction@flush created L0@%d N·%d S·%s %q:%q", t.fd.Num, n, shortenb(int(t.size)), t.imin, t.imax) + } + return nil +} + +func (tr *Transaction) put(kt keyType, key, value []byte) error { + tr.ikScratch = makeInternalKey(tr.ikScratch, key, tr.seq+1, kt) + if tr.mem.Free() < len(tr.ikScratch)+len(value) { + if err := tr.flush(); err != nil { + return err + } + } + if err := tr.mem.Put(tr.ikScratch, value); err != nil { + return err + } + tr.seq++ + return nil +} + +// Put sets the value for the given key. It overwrites any previous value +// for that key; a DB is not a multi-map. +// Please note that the transaction is not compacted until committed, so if you +// writes 10 same keys, then those 10 same keys are in the transaction. +// +// It is safe to modify the contents of the arguments after Put returns. +func (tr *Transaction) Put(key, value []byte, wo *opt.WriteOptions) error { + tr.lk.Lock() + defer tr.lk.Unlock() + if tr.closed { + return errTransactionDone + } + return tr.put(keyTypeVal, key, value) +} + +// Delete deletes the value for the given key. +// Please note that the transaction is not compacted until committed, so if you +// writes 10 same keys, then those 10 same keys are in the transaction. +// +// It is safe to modify the contents of the arguments after Delete returns. +func (tr *Transaction) Delete(key []byte, wo *opt.WriteOptions) error { + tr.lk.Lock() + defer tr.lk.Unlock() + if tr.closed { + return errTransactionDone + } + return tr.put(keyTypeDel, key, nil) +} + +// Write apply the given batch to the transaction. 
The batch will be applied +// sequentially. +// Please note that the transaction is not compacted until committed, so if you +// writes 10 same keys, then those 10 same keys are in the transaction. +// +// It is safe to modify the contents of the arguments after Write returns. +func (tr *Transaction) Write(b *Batch, wo *opt.WriteOptions) error { + if b == nil || b.Len() == 0 { + return nil + } + + tr.lk.Lock() + defer tr.lk.Unlock() + if tr.closed { + return errTransactionDone + } + return b.replayInternal(func(i int, kt keyType, k, v []byte) error { + return tr.put(kt, k, v) + }) +} + +func (tr *Transaction) setDone() { + tr.closed = true + tr.db.tr = nil + tr.mem.decref() + <-tr.db.writeLockC +} + +// Commit commits the transaction. If error is not nil, then the transaction is +// not committed, it can then either be retried or discarded. +// +// Other methods should not be called after transaction has been committed. +func (tr *Transaction) Commit() error { + if err := tr.db.ok(); err != nil { + return err + } + + tr.lk.Lock() + defer tr.lk.Unlock() + if tr.closed { + return errTransactionDone + } + if err := tr.flush(); err != nil { + // Return error, lets user decide either to retry or discard + // transaction. + return err + } + if len(tr.tables) != 0 { + // Committing transaction. + tr.rec.setSeqNum(tr.seq) + tr.db.compCommitLk.Lock() + tr.stats.startTimer() + var cerr error + for retry := 0; retry < 3; retry++ { + cerr = tr.db.s.commit(&tr.rec) + if cerr != nil { + tr.db.logf("transaction@commit error R·%d %q", retry, cerr) + select { + case <-time.After(time.Second): + case <-tr.db.closeC: + tr.db.logf("transaction@commit exiting") + tr.db.compCommitLk.Unlock() + return cerr + } + } else { + // Success. Set db.seq. + tr.db.setSeq(tr.seq) + break + } + } + tr.stats.stopTimer() + if cerr != nil { + // Return error, lets user decide either to retry or discard + // transaction. + return cerr + } + + // Update compaction stats. 
This is safe as long as we hold compCommitLk. + tr.db.compStats.addStat(0, &tr.stats) + + // Trigger table auto-compaction. + tr.db.compTrigger(tr.db.tcompCmdC) + tr.db.compCommitLk.Unlock() + + // Additionally, wait compaction when certain threshold reached. + // Ignore error, returns error only if transaction can't be committed. + tr.db.waitCompaction() + } + // Only mark as done if transaction committed successfully. + tr.setDone() + return nil +} + +func (tr *Transaction) discard() { + // Discard transaction. + for _, t := range tr.tables { + tr.db.logf("transaction@discard @%d", t.fd.Num) + if err1 := tr.db.s.stor.Remove(t.fd); err1 == nil { + tr.db.s.reuseFileNum(t.fd.Num) + } + } +} + +// Discard discards the transaction. +// +// Other methods should not be called after transaction has been discarded. +func (tr *Transaction) Discard() { + tr.lk.Lock() + if !tr.closed { + tr.discard() + tr.setDone() + } + tr.lk.Unlock() +} + +func (db *DB) waitCompaction() error { + if db.s.tLen(0) >= db.s.o.GetWriteL0PauseTrigger() { + return db.compTriggerWait(db.tcompCmdC) + } + return nil +} + +// OpenTransaction opens an atomic DB transaction. Only one transaction can be +// opened at a time. Subsequent call to Write and OpenTransaction will be blocked +// until in-flight transaction is committed or discarded. +// The returned transaction handle is safe for concurrent use. +// +// Transaction is expensive and can overwhelm compaction, especially if +// transaction size is small. Use with caution. +// +// The transaction must be closed once done, either by committing or discarding +// the transaction. +// Closing the DB will discard open transaction. +func (db *DB) OpenTransaction() (*Transaction, error) { + if err := db.ok(); err != nil { + return nil, err + } + + // The write happen synchronously. 
+ select { + case db.writeLockC <- struct{}{}: + case err := <-db.compPerErrC: + return nil, err + case <-db.closeC: + return nil, ErrClosed + } + + if db.tr != nil { + panic("leveldb: has open transaction") + } + + // Flush current memdb. + if db.mem != nil && db.mem.Len() != 0 { + if _, err := db.rotateMem(0, true); err != nil { + return nil, err + } + } + + // Wait compaction when certain threshold reached. + if err := db.waitCompaction(); err != nil { + return nil, err + } + + tr := &Transaction{ + db: db, + seq: db.seq, + mem: db.mpoolGet(0), + } + tr.mem.incref() + db.tr = tr + return tr, nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_util.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_util.go new file mode 100644 index 000000000..3f0654894 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_util.go @@ -0,0 +1,102 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// Reader is the interface that wraps basic Get and NewIterator methods. +// This interface implemented by both DB and Snapshot. +type Reader interface { + Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) + NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator +} + +// Sizes is list of size. +type Sizes []int64 + +// Sum returns sum of the sizes. +func (sizes Sizes) Sum() int64 { + var sum int64 + for _, size := range sizes { + sum += size + } + return sum +} + +// Logging. +func (db *DB) log(v ...interface{}) { db.s.log(v...) } +func (db *DB) logf(format string, v ...interface{}) { db.s.logf(format, v...) } + +// Check and clean files. 
+func (db *DB) checkAndCleanFiles() error { + v := db.s.version() + defer v.release() + + tmap := make(map[int64]bool) + for _, tables := range v.levels { + for _, t := range tables { + tmap[t.fd.Num] = false + } + } + + fds, err := db.s.stor.List(storage.TypeAll) + if err != nil { + return err + } + + var nt int + var rem []storage.FileDesc + for _, fd := range fds { + keep := true + switch fd.Type { + case storage.TypeManifest: + keep = fd.Num >= db.s.manifestFd.Num + case storage.TypeJournal: + if !db.frozenJournalFd.Zero() { + keep = fd.Num >= db.frozenJournalFd.Num + } else { + keep = fd.Num >= db.journalFd.Num + } + case storage.TypeTable: + _, keep = tmap[fd.Num] + if keep { + tmap[fd.Num] = true + nt++ + } + } + + if !keep { + rem = append(rem, fd) + } + } + + if nt != len(tmap) { + var mfds []storage.FileDesc + for num, present := range tmap { + if !present { + mfds = append(mfds, storage.FileDesc{Type: storage.TypeTable, Num: num}) + db.logf("db@janitor table missing @%d", num) + } + } + return errors.NewErrCorrupted(storage.FileDesc{}, &errors.ErrMissingFiles{Fds: mfds}) + } + + db.logf("db@janitor F·%d G·%d", len(fds), len(rem)) + for _, fd := range rem { + db.logf("db@janitor removing %s-%d", fd.Type, fd.Num) + if err := db.s.stor.Remove(fd); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_write.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_write.go new file mode 100644 index 000000000..db0c1bece --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_write.go @@ -0,0 +1,464 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package leveldb + +import ( + "sync/atomic" + "time" + + "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +func (db *DB) writeJournal(batches []*Batch, seq uint64, sync bool) error { + wr, err := db.journal.Next() + if err != nil { + return err + } + if err := writeBatchesWithHeader(wr, batches, seq); err != nil { + return err + } + if err := db.journal.Flush(); err != nil { + return err + } + if sync { + return db.journalWriter.Sync() + } + return nil +} + +func (db *DB) rotateMem(n int, wait bool) (mem *memDB, err error) { + retryLimit := 3 +retry: + // Wait for pending memdb compaction. + err = db.compTriggerWait(db.mcompCmdC) + if err != nil { + return + } + retryLimit-- + + // Create new memdb and journal. + mem, err = db.newMem(n) + if err != nil { + if err == errHasFrozenMem { + if retryLimit <= 0 { + panic("BUG: still has frozen memdb") + } + goto retry + } + return + } + + // Schedule memdb compaction. + if wait { + err = db.compTriggerWait(db.mcompCmdC) + } else { + db.compTrigger(db.mcompCmdC) + } + return +} + +func (db *DB) flush(n int) (mdb *memDB, mdbFree int, err error) { + delayed := false + slowdownTrigger := db.s.o.GetWriteL0SlowdownTrigger() + pauseTrigger := db.s.o.GetWriteL0PauseTrigger() + flush := func() (retry bool) { + mdb = db.getEffectiveMem() + if mdb == nil { + err = ErrClosed + return false + } + defer func() { + if retry { + mdb.decref() + mdb = nil + } + }() + tLen := db.s.tLen(0) + mdbFree = mdb.Free() + switch { + case tLen >= slowdownTrigger && !delayed: + delayed = true + time.Sleep(time.Millisecond) + case mdbFree >= n: + return false + case tLen >= pauseTrigger: + delayed = true + // Set the write paused flag explicitly. + atomic.StoreInt32(&db.inWritePaused, 1) + err = db.compTriggerWait(db.tcompCmdC) + // Unset the write paused flag. 
+ atomic.StoreInt32(&db.inWritePaused, 0) + if err != nil { + return false + } + default: + // Allow memdb to grow if it has no entry. + if mdb.Len() == 0 { + mdbFree = n + } else { + mdb.decref() + mdb, err = db.rotateMem(n, false) + if err == nil { + mdbFree = mdb.Free() + } else { + mdbFree = 0 + } + } + return false + } + return true + } + start := time.Now() + for flush() { + } + if delayed { + db.writeDelay += time.Since(start) + db.writeDelayN++ + } else if db.writeDelayN > 0 { + db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay) + atomic.AddInt32(&db.cWriteDelayN, int32(db.writeDelayN)) + atomic.AddInt64(&db.cWriteDelay, int64(db.writeDelay)) + db.writeDelay = 0 + db.writeDelayN = 0 + } + return +} + +type writeMerge struct { + sync bool + batch *Batch + keyType keyType + key, value []byte +} + +func (db *DB) unlockWrite(overflow bool, merged int, err error) { + for i := 0; i < merged; i++ { + db.writeAckC <- err + } + if overflow { + // Pass lock to the next write (that failed to merge). + db.writeMergedC <- false + } else { + // Release lock. + <-db.writeLockC + } +} + +// ourBatch is batch that we can modify. +func (db *DB) writeLocked(batch, ourBatch *Batch, merge, sync bool) error { + // Try to flush memdb. This method would also trying to throttle writes + // if it is too fast and compaction cannot catch-up. + mdb, mdbFree, err := db.flush(batch.internalLen) + if err != nil { + db.unlockWrite(false, 0, err) + return err + } + defer mdb.decref() + + var ( + overflow bool + merged int + batches = []*Batch{batch} + ) + + if merge { + // Merge limit. + var mergeLimit int + if batch.internalLen > 128<<10 { + mergeLimit = (1 << 20) - batch.internalLen + } else { + mergeLimit = 128 << 10 + } + mergeCap := mdbFree - batch.internalLen + if mergeLimit > mergeCap { + mergeLimit = mergeCap + } + + merge: + for mergeLimit > 0 { + select { + case incoming := <-db.writeMergeC: + if incoming.batch != nil { + // Merge batch. 
+ if incoming.batch.internalLen > mergeLimit { + overflow = true + break merge + } + batches = append(batches, incoming.batch) + mergeLimit -= incoming.batch.internalLen + } else { + // Merge put. + internalLen := len(incoming.key) + len(incoming.value) + 8 + if internalLen > mergeLimit { + overflow = true + break merge + } + if ourBatch == nil { + ourBatch = db.batchPool.Get().(*Batch) + ourBatch.Reset() + batches = append(batches, ourBatch) + } + // We can use same batch since concurrent write doesn't + // guarantee write order. + ourBatch.appendRec(incoming.keyType, incoming.key, incoming.value) + mergeLimit -= internalLen + } + sync = sync || incoming.sync + merged++ + db.writeMergedC <- true + + default: + break merge + } + } + } + + // Release ourBatch if any. + if ourBatch != nil { + defer db.batchPool.Put(ourBatch) + } + + // Seq number. + seq := db.seq + 1 + + // Write journal. + if err := db.writeJournal(batches, seq, sync); err != nil { + db.unlockWrite(overflow, merged, err) + return err + } + + // Put batches. + for _, batch := range batches { + if err := batch.putMem(seq, mdb.DB); err != nil { + panic(err) + } + seq += uint64(batch.Len()) + } + + // Incr seq number. + db.addSeq(uint64(batchesLen(batches))) + + // Rotate memdb if it's reach the threshold. + if batch.internalLen >= mdbFree { + db.rotateMem(0, false) + } + + db.unlockWrite(overflow, merged, nil) + return nil +} + +// Write apply the given batch to the DB. The batch records will be applied +// sequentially. Write might be used concurrently, when used concurrently and +// batch is small enough, write will try to merge the batches. Set NoWriteMerge +// option to true to disable write merge. +// +// It is safe to modify the contents of the arguments after Write returns but +// not before. Write will not modify content of the batch. 
+func (db *DB) Write(batch *Batch, wo *opt.WriteOptions) error { + if err := db.ok(); err != nil || batch == nil || batch.Len() == 0 { + return err + } + + // If the batch size is larger than write buffer, it may justified to write + // using transaction instead. Using transaction the batch will be written + // into tables directly, skipping the journaling. + if batch.internalLen > db.s.o.GetWriteBuffer() && !db.s.o.GetDisableLargeBatchTransaction() { + tr, err := db.OpenTransaction() + if err != nil { + return err + } + if err := tr.Write(batch, wo); err != nil { + tr.Discard() + return err + } + return tr.Commit() + } + + merge := !wo.GetNoWriteMerge() && !db.s.o.GetNoWriteMerge() + sync := wo.GetSync() && !db.s.o.GetNoSync() + + // Acquire write lock. + if merge { + select { + case db.writeMergeC <- writeMerge{sync: sync, batch: batch}: + if <-db.writeMergedC { + // Write is merged. + return <-db.writeAckC + } + // Write is not merged, the write lock is handed to us. Continue. + case db.writeLockC <- struct{}{}: + // Write lock acquired. + case err := <-db.compPerErrC: + // Compaction error. + return err + case <-db.closeC: + // Closed + return ErrClosed + } + } else { + select { + case db.writeLockC <- struct{}{}: + // Write lock acquired. + case err := <-db.compPerErrC: + // Compaction error. + return err + case <-db.closeC: + // Closed + return ErrClosed + } + } + + return db.writeLocked(batch, nil, merge, sync) +} + +func (db *DB) putRec(kt keyType, key, value []byte, wo *opt.WriteOptions) error { + if err := db.ok(); err != nil { + return err + } + + merge := !wo.GetNoWriteMerge() && !db.s.o.GetNoWriteMerge() + sync := wo.GetSync() && !db.s.o.GetNoSync() + + // Acquire write lock. + if merge { + select { + case db.writeMergeC <- writeMerge{sync: sync, keyType: kt, key: key, value: value}: + if <-db.writeMergedC { + // Write is merged. + return <-db.writeAckC + } + // Write is not merged, the write lock is handed to us. Continue. 
+ case db.writeLockC <- struct{}{}: + // Write lock acquired. + case err := <-db.compPerErrC: + // Compaction error. + return err + case <-db.closeC: + // Closed + return ErrClosed + } + } else { + select { + case db.writeLockC <- struct{}{}: + // Write lock acquired. + case err := <-db.compPerErrC: + // Compaction error. + return err + case <-db.closeC: + // Closed + return ErrClosed + } + } + + batch := db.batchPool.Get().(*Batch) + batch.Reset() + batch.appendRec(kt, key, value) + return db.writeLocked(batch, batch, merge, sync) +} + +// Put sets the value for the given key. It overwrites any previous value +// for that key; a DB is not a multi-map. Write merge also applies for Put, see +// Write. +// +// It is safe to modify the contents of the arguments after Put returns but not +// before. +func (db *DB) Put(key, value []byte, wo *opt.WriteOptions) error { + return db.putRec(keyTypeVal, key, value, wo) +} + +// Delete deletes the value for the given key. Delete will not returns error if +// key doesn't exist. Write merge also applies for Delete, see Write. +// +// It is safe to modify the contents of the arguments after Delete returns but +// not before. +func (db *DB) Delete(key []byte, wo *opt.WriteOptions) error { + return db.putRec(keyTypeDel, key, nil, wo) +} + +func isMemOverlaps(icmp *iComparer, mem *memdb.DB, min, max []byte) bool { + iter := mem.NewIterator(nil) + defer iter.Release() + return (max == nil || (iter.First() && icmp.uCompare(max, internalKey(iter.Key()).ukey()) >= 0)) && + (min == nil || (iter.Last() && icmp.uCompare(min, internalKey(iter.Key()).ukey()) <= 0)) +} + +// CompactRange compacts the underlying DB for the given key range. +// In particular, deleted and overwritten versions are discarded, +// and the data is rearranged to reduce the cost of operations +// needed to access the data. This operation should typically only +// be invoked by users who understand the underlying implementation. 
+// +// A nil Range.Start is treated as a key before all keys in the DB. +// And a nil Range.Limit is treated as a key after all keys in the DB. +// Therefore if both is nil then it will compact entire DB. +func (db *DB) CompactRange(r util.Range) error { + if err := db.ok(); err != nil { + return err + } + + // Lock writer. + select { + case db.writeLockC <- struct{}{}: + case err := <-db.compPerErrC: + return err + case <-db.closeC: + return ErrClosed + } + + // Check for overlaps in memdb. + mdb := db.getEffectiveMem() + if mdb == nil { + return ErrClosed + } + defer mdb.decref() + if isMemOverlaps(db.s.icmp, mdb.DB, r.Start, r.Limit) { + // Memdb compaction. + if _, err := db.rotateMem(0, false); err != nil { + <-db.writeLockC + return err + } + <-db.writeLockC + if err := db.compTriggerWait(db.mcompCmdC); err != nil { + return err + } + } else { + <-db.writeLockC + } + + // Table compaction. + return db.compTriggerRange(db.tcompCmdC, -1, r.Start, r.Limit) +} + +// SetReadOnly makes DB read-only. It will stay read-only until reopened. +func (db *DB) SetReadOnly() error { + if err := db.ok(); err != nil { + return err + } + + // Lock writer. + select { + case db.writeLockC <- struct{}{}: + db.compWriteLocking = true + case err := <-db.compPerErrC: + return err + case <-db.closeC: + return ErrClosed + } + + // Set compaction read-only. + select { + case db.compErrSetC <- ErrReadOnly: + case perr := <-db.compPerErrC: + return perr + case <-db.closeC: + return ErrClosed + } + + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/doc.go b/vendor/github.com/syndtr/goleveldb/leveldb/doc.go new file mode 100644 index 000000000..be768e573 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/doc.go @@ -0,0 +1,92 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +// Package leveldb provides implementation of LevelDB key/value database. +// +// Create or open a database: +// +// // The returned DB instance is safe for concurrent use. Which mean that all +// // DB's methods may be called concurrently from multiple goroutine. +// db, err := leveldb.OpenFile("path/to/db", nil) +// ... +// defer db.Close() +// ... +// +// Read or modify the database content: +// +// // Remember that the contents of the returned slice should not be modified. +// data, err := db.Get([]byte("key"), nil) +// ... +// err = db.Put([]byte("key"), []byte("value"), nil) +// ... +// err = db.Delete([]byte("key"), nil) +// ... +// +// Iterate over database content: +// +// iter := db.NewIterator(nil, nil) +// for iter.Next() { +// // Remember that the contents of the returned slice should not be modified, and +// // only valid until the next call to Next. +// key := iter.Key() +// value := iter.Value() +// ... +// } +// iter.Release() +// err = iter.Error() +// ... +// +// Iterate over subset of database content with a particular prefix: +// iter := db.NewIterator(util.BytesPrefix([]byte("foo-")), nil) +// for iter.Next() { +// // Use key/value. +// ... +// } +// iter.Release() +// err = iter.Error() +// ... +// +// Seek-then-Iterate: +// +// iter := db.NewIterator(nil, nil) +// for ok := iter.Seek(key); ok; ok = iter.Next() { +// // Use key/value. +// ... +// } +// iter.Release() +// err = iter.Error() +// ... +// +// Iterate over subset of database content: +// +// iter := db.NewIterator(&util.Range{Start: []byte("foo"), Limit: []byte("xoo")}, nil) +// for iter.Next() { +// // Use key/value. +// ... +// } +// iter.Release() +// err = iter.Error() +// ... +// +// Batch writes: +// +// batch := new(leveldb.Batch) +// batch.Put([]byte("foo"), []byte("value")) +// batch.Put([]byte("bar"), []byte("another value")) +// batch.Delete([]byte("baz")) +// err = db.Write(batch, nil) +// ... 
+// +// Use bloom filter: +// +// o := &opt.Options{ +// Filter: filter.NewBloomFilter(10), +// } +// db, err := leveldb.OpenFile("path/to/db", o) +// ... +// defer db.Close() +// ... +package leveldb diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/errors.go b/vendor/github.com/syndtr/goleveldb/leveldb/errors.go new file mode 100644 index 000000000..de2649812 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/errors.go @@ -0,0 +1,20 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/errors" +) + +// Common errors. +var ( + ErrNotFound = errors.ErrNotFound + ErrReadOnly = errors.New("leveldb: read-only mode") + ErrSnapshotReleased = errors.New("leveldb: snapshot released") + ErrIterReleased = errors.New("leveldb: iterator released") + ErrClosed = errors.New("leveldb: closed") +) diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/errors/errors.go b/vendor/github.com/syndtr/goleveldb/leveldb/errors/errors.go new file mode 100644 index 000000000..8d6146b6f --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/errors/errors.go @@ -0,0 +1,78 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package errors provides common error types used throughout leveldb. +package errors + +import ( + "errors" + "fmt" + + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// Common errors. +var ( + ErrNotFound = New("leveldb: not found") + ErrReleased = util.ErrReleased + ErrHasReleaser = util.ErrHasReleaser +) + +// New returns an error that formats as the given text. 
+func New(text string) error { + return errors.New(text) +} + +// ErrCorrupted is the type that wraps errors that indicate corruption in +// the database. +type ErrCorrupted struct { + Fd storage.FileDesc + Err error +} + +func (e *ErrCorrupted) Error() string { + if !e.Fd.Zero() { + return fmt.Sprintf("%v [file=%v]", e.Err, e.Fd) + } + return e.Err.Error() +} + +// NewErrCorrupted creates new ErrCorrupted error. +func NewErrCorrupted(fd storage.FileDesc, err error) error { + return &ErrCorrupted{fd, err} +} + +// IsCorrupted returns a boolean indicating whether the error is indicating +// a corruption. +func IsCorrupted(err error) bool { + switch err.(type) { + case *ErrCorrupted: + return true + case *storage.ErrCorrupted: + return true + } + return false +} + +// ErrMissingFiles is the type that indicating a corruption due to missing +// files. ErrMissingFiles always wrapped with ErrCorrupted. +type ErrMissingFiles struct { + Fds []storage.FileDesc +} + +func (e *ErrMissingFiles) Error() string { return "file missing" } + +// SetFd sets 'file info' of the given error with the given file. +// Currently only ErrCorrupted is supported, otherwise will do nothing. +func SetFd(err error, fd storage.FileDesc) error { + switch x := err.(type) { + case *ErrCorrupted: + x.Fd = fd + return x + } + return err +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/filter.go b/vendor/github.com/syndtr/goleveldb/leveldb/filter.go new file mode 100644 index 000000000..e961e420d --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/filter.go @@ -0,0 +1,31 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/filter" +) + +type iFilter struct { + filter.Filter +} + +func (f iFilter) Contains(filter, key []byte) bool { + return f.Filter.Contains(filter, internalKey(key).ukey()) +} + +func (f iFilter) NewGenerator() filter.FilterGenerator { + return iFilterGenerator{f.Filter.NewGenerator()} +} + +type iFilterGenerator struct { + filter.FilterGenerator +} + +func (g iFilterGenerator) Add(key []byte) { + g.FilterGenerator.Add(internalKey(key).ukey()) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/filter/bloom.go b/vendor/github.com/syndtr/goleveldb/leveldb/filter/bloom.go new file mode 100644 index 000000000..bab0e9970 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/filter/bloom.go @@ -0,0 +1,116 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package filter + +import ( + "github.com/syndtr/goleveldb/leveldb/util" +) + +func bloomHash(key []byte) uint32 { + return util.Hash(key, 0xbc9f1d34) +} + +type bloomFilter int + +// The bloom filter serializes its parameters and is backward compatible +// with respect to them. Therefor, its parameters are not added to its +// name. +func (bloomFilter) Name() string { + return "leveldb.BuiltinBloomFilter" +} + +func (f bloomFilter) Contains(filter, key []byte) bool { + nBytes := len(filter) - 1 + if nBytes < 1 { + return false + } + nBits := uint32(nBytes * 8) + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + k := filter[nBytes] + if k > 30 { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. 
+ return true + } + + kh := bloomHash(key) + delta := (kh >> 17) | (kh << 15) // Rotate right 17 bits + for j := uint8(0); j < k; j++ { + bitpos := kh % nBits + if (uint32(filter[bitpos/8]) & (1 << (bitpos % 8))) == 0 { + return false + } + kh += delta + } + return true +} + +func (f bloomFilter) NewGenerator() FilterGenerator { + // Round down to reduce probing cost a little bit. + k := uint8(f * 69 / 100) // 0.69 =~ ln(2) + if k < 1 { + k = 1 + } else if k > 30 { + k = 30 + } + return &bloomFilterGenerator{ + n: int(f), + k: k, + } +} + +type bloomFilterGenerator struct { + n int + k uint8 + + keyHashes []uint32 +} + +func (g *bloomFilterGenerator) Add(key []byte) { + // Use double-hashing to generate a sequence of hash values. + // See analysis in [Kirsch,Mitzenmacher 2006]. + g.keyHashes = append(g.keyHashes, bloomHash(key)) +} + +func (g *bloomFilterGenerator) Generate(b Buffer) { + // Compute bloom filter size (in both bits and bytes) + nBits := uint32(len(g.keyHashes) * g.n) + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. + if nBits < 64 { + nBits = 64 + } + nBytes := (nBits + 7) / 8 + nBits = nBytes * 8 + + dest := b.Alloc(int(nBytes) + 1) + dest[nBytes] = g.k + for _, kh := range g.keyHashes { + delta := (kh >> 17) | (kh << 15) // Rotate right 17 bits + for j := uint8(0); j < g.k; j++ { + bitpos := kh % nBits + dest[bitpos/8] |= (1 << (bitpos % 8)) + kh += delta + } + } + + g.keyHashes = g.keyHashes[:0] +} + +// NewBloomFilter creates a new initialized bloom filter for given +// bitsPerKey. +// +// Since bitsPerKey is persisted individually for each bloom filter +// serialization, bloom filters are backwards compatible with respect to +// changing bitsPerKey. This means that no big performance penalty will +// be experienced when changing the parameter. See documentation for +// opt.Options.Filter for more information. 
+func NewBloomFilter(bitsPerKey int) Filter { + return bloomFilter(bitsPerKey) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/filter/filter.go b/vendor/github.com/syndtr/goleveldb/leveldb/filter/filter.go new file mode 100644 index 000000000..7a925c5a8 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/filter/filter.go @@ -0,0 +1,60 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package filter provides interface and implementation of probabilistic +// data structure. +// +// The filter is resposible for creating small filter from a set of keys. +// These filter will then used to test whether a key is a member of the set. +// In many cases, a filter can cut down the number of disk seeks from a +// handful to a single disk seek per DB.Get call. +package filter + +// Buffer is the interface that wraps basic Alloc, Write and WriteByte methods. +type Buffer interface { + // Alloc allocs n bytes of slice from the buffer. This also advancing + // write offset. + Alloc(n int) []byte + + // Write appends the contents of p to the buffer. + Write(p []byte) (n int, err error) + + // WriteByte appends the byte c to the buffer. + WriteByte(c byte) error +} + +// Filter is the filter. +type Filter interface { + // Name returns the name of this policy. + // + // Note that if the filter encoding changes in an incompatible way, + // the name returned by this method must be changed. Otherwise, old + // incompatible filters may be passed to methods of this type. + Name() string + + // NewGenerator creates a new filter generator. + NewGenerator() FilterGenerator + + // Contains returns true if the filter contains the given key. + // + // The filter are filters generated by the filter generator. + Contains(filter, key []byte) bool +} + +// FilterGenerator is the filter generator. 
+type FilterGenerator interface { + // Add adds a key to the filter generator. + // + // The key may become invalid after call to this method end, therefor + // key must be copied if implementation require keeping key for later + // use. The key should not modified directly, doing so may cause + // undefined results. + Add(key []byte) + + // Generate generates filters based on keys passed so far. After call + // to Generate the filter generator maybe resetted, depends on implementation. + Generate(b Buffer) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go new file mode 100644 index 000000000..a23ab05f7 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/array_iter.go @@ -0,0 +1,184 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package iterator + +import ( + "github.com/syndtr/goleveldb/leveldb/util" +) + +// BasicArray is the interface that wraps basic Len and Search method. +type BasicArray interface { + // Len returns length of the array. + Len() int + + // Search finds smallest index that point to a key that is greater + // than or equal to the given key. + Search(key []byte) int +} + +// Array is the interface that wraps BasicArray and basic Index method. +type Array interface { + BasicArray + + // Index returns key/value pair with index of i. + Index(i int) (key, value []byte) +} + +// Array is the interface that wraps BasicArray and basic Get method. +type ArrayIndexer interface { + BasicArray + + // Get returns a new data iterator with index of i. 
+ Get(i int) Iterator +} + +type basicArrayIterator struct { + util.BasicReleaser + array BasicArray + pos int + err error +} + +func (i *basicArrayIterator) Valid() bool { + return i.pos >= 0 && i.pos < i.array.Len() && !i.Released() +} + +func (i *basicArrayIterator) First() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + if i.array.Len() == 0 { + i.pos = -1 + return false + } + i.pos = 0 + return true +} + +func (i *basicArrayIterator) Last() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + n := i.array.Len() + if n == 0 { + i.pos = 0 + return false + } + i.pos = n - 1 + return true +} + +func (i *basicArrayIterator) Seek(key []byte) bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + n := i.array.Len() + if n == 0 { + i.pos = 0 + return false + } + i.pos = i.array.Search(key) + if i.pos >= n { + return false + } + return true +} + +func (i *basicArrayIterator) Next() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + i.pos++ + if n := i.array.Len(); i.pos >= n { + i.pos = n + return false + } + return true +} + +func (i *basicArrayIterator) Prev() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + i.pos-- + if i.pos < 0 { + i.pos = -1 + return false + } + return true +} + +func (i *basicArrayIterator) Error() error { return i.err } + +type arrayIterator struct { + basicArrayIterator + array Array + pos int + key, value []byte +} + +func (i *arrayIterator) updateKV() { + if i.pos == i.basicArrayIterator.pos { + return + } + i.pos = i.basicArrayIterator.pos + if i.Valid() { + i.key, i.value = i.array.Index(i.pos) + } else { + i.key = nil + i.value = nil + } +} + +func (i *arrayIterator) Key() []byte { + i.updateKV() + return i.key +} + +func (i *arrayIterator) Value() []byte { + i.updateKV() + return i.value +} + +type arrayIteratorIndexer struct { + basicArrayIterator + array ArrayIndexer +} + +func (i *arrayIteratorIndexer) Get() 
Iterator { + if i.Valid() { + return i.array.Get(i.basicArrayIterator.pos) + } + return nil +} + +// NewArrayIterator returns an iterator from the given array. +func NewArrayIterator(array Array) Iterator { + return &arrayIterator{ + basicArrayIterator: basicArrayIterator{array: array, pos: -1}, + array: array, + pos: -1, + } +} + +// NewArrayIndexer returns an index iterator from the given array. +func NewArrayIndexer(array ArrayIndexer) IteratorIndexer { + return &arrayIteratorIndexer{ + basicArrayIterator: basicArrayIterator{array: array, pos: -1}, + array: array, + } +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go new file mode 100644 index 000000000..939adbb93 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go @@ -0,0 +1,242 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package iterator + +import ( + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// IteratorIndexer is the interface that wraps CommonIterator and basic Get +// method. IteratorIndexer provides index for indexed iterator. +type IteratorIndexer interface { + CommonIterator + + // Get returns a new data iterator for the current position, or nil if + // done. 
+ Get() Iterator +} + +type indexedIterator struct { + util.BasicReleaser + index IteratorIndexer + strict bool + + data Iterator + err error + errf func(err error) + closed bool +} + +func (i *indexedIterator) setData() { + if i.data != nil { + i.data.Release() + } + i.data = i.index.Get() +} + +func (i *indexedIterator) clearData() { + if i.data != nil { + i.data.Release() + } + i.data = nil +} + +func (i *indexedIterator) indexErr() { + if err := i.index.Error(); err != nil { + if i.errf != nil { + i.errf(err) + } + i.err = err + } +} + +func (i *indexedIterator) dataErr() bool { + if err := i.data.Error(); err != nil { + if i.errf != nil { + i.errf(err) + } + if i.strict || !errors.IsCorrupted(err) { + i.err = err + return true + } + } + return false +} + +func (i *indexedIterator) Valid() bool { + return i.data != nil && i.data.Valid() +} + +func (i *indexedIterator) First() bool { + if i.err != nil { + return false + } else if i.Released() { + i.err = ErrIterReleased + return false + } + + if !i.index.First() { + i.indexErr() + i.clearData() + return false + } + i.setData() + return i.Next() +} + +func (i *indexedIterator) Last() bool { + if i.err != nil { + return false + } else if i.Released() { + i.err = ErrIterReleased + return false + } + + if !i.index.Last() { + i.indexErr() + i.clearData() + return false + } + i.setData() + if !i.data.Last() { + if i.dataErr() { + return false + } + i.clearData() + return i.Prev() + } + return true +} + +func (i *indexedIterator) Seek(key []byte) bool { + if i.err != nil { + return false + } else if i.Released() { + i.err = ErrIterReleased + return false + } + + if !i.index.Seek(key) { + i.indexErr() + i.clearData() + return false + } + i.setData() + if !i.data.Seek(key) { + if i.dataErr() { + return false + } + i.clearData() + return i.Next() + } + return true +} + +func (i *indexedIterator) Next() bool { + if i.err != nil { + return false + } else if i.Released() { + i.err = ErrIterReleased + return false + } + + 
switch { + case i.data != nil && !i.data.Next(): + if i.dataErr() { + return false + } + i.clearData() + fallthrough + case i.data == nil: + if !i.index.Next() { + i.indexErr() + return false + } + i.setData() + return i.Next() + } + return true +} + +func (i *indexedIterator) Prev() bool { + if i.err != nil { + return false + } else if i.Released() { + i.err = ErrIterReleased + return false + } + + switch { + case i.data != nil && !i.data.Prev(): + if i.dataErr() { + return false + } + i.clearData() + fallthrough + case i.data == nil: + if !i.index.Prev() { + i.indexErr() + return false + } + i.setData() + if !i.data.Last() { + if i.dataErr() { + return false + } + i.clearData() + return i.Prev() + } + } + return true +} + +func (i *indexedIterator) Key() []byte { + if i.data == nil { + return nil + } + return i.data.Key() +} + +func (i *indexedIterator) Value() []byte { + if i.data == nil { + return nil + } + return i.data.Value() +} + +func (i *indexedIterator) Release() { + i.clearData() + i.index.Release() + i.BasicReleaser.Release() +} + +func (i *indexedIterator) Error() error { + if i.err != nil { + return i.err + } + if err := i.index.Error(); err != nil { + return err + } + return nil +} + +func (i *indexedIterator) SetErrorCallback(f func(err error)) { + i.errf = f +} + +// NewIndexedIterator returns an 'indexed iterator'. An index is iterator +// that returns another iterator, a 'data iterator'. A 'data iterator' is the +// iterator that contains actual key/value pairs. +// +// If strict is true the any 'corruption errors' (i.e errors.IsCorrupted(err) == true) +// won't be ignored and will halt 'indexed iterator', otherwise the iterator will +// continue to the next 'data iterator'. Corruption on 'index iterator' will not be +// ignored and will halt the iterator. 
+func NewIndexedIterator(index IteratorIndexer, strict bool) Iterator { + return &indexedIterator{index: index, strict: strict} +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/iterator/iter.go b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/iter.go new file mode 100644 index 000000000..96fb0f685 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/iter.go @@ -0,0 +1,132 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package iterator provides interface and implementation to traverse over +// contents of a database. +package iterator + +import ( + "errors" + + "github.com/syndtr/goleveldb/leveldb/util" +) + +var ( + ErrIterReleased = errors.New("leveldb/iterator: iterator released") +) + +// IteratorSeeker is the interface that wraps the 'seeks method'. +type IteratorSeeker interface { + // First moves the iterator to the first key/value pair. If the iterator + // only contains one key/value pair then First and Last would moves + // to the same key/value pair. + // It returns whether such pair exist. + First() bool + + // Last moves the iterator to the last key/value pair. If the iterator + // only contains one key/value pair then First and Last would moves + // to the same key/value pair. + // It returns whether such pair exist. + Last() bool + + // Seek moves the iterator to the first key/value pair whose key is greater + // than or equal to the given key. + // It returns whether such pair exist. + // + // It is safe to modify the contents of the argument after Seek returns. + Seek(key []byte) bool + + // Next moves the iterator to the next key/value pair. + // It returns false if the iterator is exhausted. + Next() bool + + // Prev moves the iterator to the previous key/value pair. + // It returns false if the iterator is exhausted. 
+ Prev() bool +} + +// CommonIterator is the interface that wraps common iterator methods. +type CommonIterator interface { + IteratorSeeker + + // util.Releaser is the interface that wraps basic Release method. + // When called Release will releases any resources associated with the + // iterator. + util.Releaser + + // util.ReleaseSetter is the interface that wraps the basic SetReleaser + // method. + util.ReleaseSetter + + // TODO: Remove this when ready. + Valid() bool + + // Error returns any accumulated error. Exhausting all the key/value pairs + // is not considered to be an error. + Error() error +} + +// Iterator iterates over a DB's key/value pairs in key order. +// +// When encounter an error any 'seeks method' will return false and will +// yield no key/value pairs. The error can be queried by calling the Error +// method. Calling Release is still necessary. +// +// An iterator must be released after use, but it is not necessary to read +// an iterator until exhaustion. +// Also, an iterator is not necessarily safe for concurrent use, but it is +// safe to use multiple iterators concurrently, with each in a dedicated +// goroutine. +type Iterator interface { + CommonIterator + + // Key returns the key of the current key/value pair, or nil if done. + // The caller should not modify the contents of the returned slice, and + // its contents may change on the next call to any 'seeks method'. + Key() []byte + + // Value returns the value of the current key/value pair, or nil if done. + // The caller should not modify the contents of the returned slice, and + // its contents may change on the next call to any 'seeks method'. + Value() []byte +} + +// ErrorCallbackSetter is the interface that wraps basic SetErrorCallback +// method. +// +// ErrorCallbackSetter implemented by indexed and merged iterator. +type ErrorCallbackSetter interface { + // SetErrorCallback allows set an error callback of the corresponding + // iterator. Use nil to clear the callback. 
+ SetErrorCallback(f func(err error)) +} + +type emptyIterator struct { + util.BasicReleaser + err error +} + +func (i *emptyIterator) rErr() { + if i.err == nil && i.Released() { + i.err = ErrIterReleased + } +} + +func (*emptyIterator) Valid() bool { return false } +func (i *emptyIterator) First() bool { i.rErr(); return false } +func (i *emptyIterator) Last() bool { i.rErr(); return false } +func (i *emptyIterator) Seek(key []byte) bool { i.rErr(); return false } +func (i *emptyIterator) Next() bool { i.rErr(); return false } +func (i *emptyIterator) Prev() bool { i.rErr(); return false } +func (*emptyIterator) Key() []byte { return nil } +func (*emptyIterator) Value() []byte { return nil } +func (i *emptyIterator) Error() error { return i.err } + +// NewEmptyIterator creates an empty iterator. The err parameter can be +// nil, but if not nil the given err will be returned by Error method. +func NewEmptyIterator(err error) Iterator { + return &emptyIterator{err: err} +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go new file mode 100644 index 000000000..1a7e29df8 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go @@ -0,0 +1,304 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package iterator + +import ( + "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/util" +) + +type dir int + +const ( + dirReleased dir = iota - 1 + dirSOI + dirEOI + dirBackward + dirForward +) + +type mergedIterator struct { + cmp comparer.Comparer + iters []Iterator + strict bool + + keys [][]byte + index int + dir dir + err error + errf func(err error) + releaser util.Releaser +} + +func assertKey(key []byte) []byte { + if key == nil { + panic("leveldb/iterator: nil key") + } + return key +} + +func (i *mergedIterator) iterErr(iter Iterator) bool { + if err := iter.Error(); err != nil { + if i.errf != nil { + i.errf(err) + } + if i.strict || !errors.IsCorrupted(err) { + i.err = err + return true + } + } + return false +} + +func (i *mergedIterator) Valid() bool { + return i.err == nil && i.dir > dirEOI +} + +func (i *mergedIterator) First() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + for x, iter := range i.iters { + switch { + case iter.First(): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + } + i.dir = dirSOI + return i.next() +} + +func (i *mergedIterator) Last() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + for x, iter := range i.iters { + switch { + case iter.Last(): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + } + i.dir = dirEOI + return i.prev() +} + +func (i *mergedIterator) Seek(key []byte) bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + for x, iter := range i.iters { + switch { + case iter.Seek(key): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + 
} + i.dir = dirSOI + return i.next() +} + +func (i *mergedIterator) next() bool { + var key []byte + if i.dir == dirForward { + key = i.keys[i.index] + } + for x, tkey := range i.keys { + if tkey != nil && (key == nil || i.cmp.Compare(tkey, key) < 0) { + key = tkey + i.index = x + } + } + if key == nil { + i.dir = dirEOI + return false + } + i.dir = dirForward + return true +} + +func (i *mergedIterator) Next() bool { + if i.dir == dirEOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + switch i.dir { + case dirSOI: + return i.First() + case dirBackward: + key := append([]byte{}, i.keys[i.index]...) + if !i.Seek(key) { + return false + } + return i.Next() + } + + x := i.index + iter := i.iters[x] + switch { + case iter.Next(): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + return i.next() +} + +func (i *mergedIterator) prev() bool { + var key []byte + if i.dir == dirBackward { + key = i.keys[i.index] + } + for x, tkey := range i.keys { + if tkey != nil && (key == nil || i.cmp.Compare(tkey, key) > 0) { + key = tkey + i.index = x + } + } + if key == nil { + i.dir = dirSOI + return false + } + i.dir = dirBackward + return true +} + +func (i *mergedIterator) Prev() bool { + if i.dir == dirSOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + switch i.dir { + case dirEOI: + return i.Last() + case dirForward: + key := append([]byte{}, i.keys[i.index]...) 
+ for x, iter := range i.iters { + if x == i.index { + continue + } + seek := iter.Seek(key) + switch { + case seek && iter.Prev(), !seek && iter.Last(): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + } + } + + x := i.index + iter := i.iters[x] + switch { + case iter.Prev(): + i.keys[x] = assertKey(iter.Key()) + case i.iterErr(iter): + return false + default: + i.keys[x] = nil + } + return i.prev() +} + +func (i *mergedIterator) Key() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.keys[i.index] +} + +func (i *mergedIterator) Value() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.iters[i.index].Value() +} + +func (i *mergedIterator) Release() { + if i.dir != dirReleased { + i.dir = dirReleased + for _, iter := range i.iters { + iter.Release() + } + i.iters = nil + i.keys = nil + if i.releaser != nil { + i.releaser.Release() + i.releaser = nil + } + } +} + +func (i *mergedIterator) SetReleaser(releaser util.Releaser) { + if i.dir == dirReleased { + panic(util.ErrReleased) + } + if i.releaser != nil && releaser != nil { + panic(util.ErrHasReleaser) + } + i.releaser = releaser +} + +func (i *mergedIterator) Error() error { + return i.err +} + +func (i *mergedIterator) SetErrorCallback(f func(err error)) { + i.errf = f +} + +// NewMergedIterator returns an iterator that merges its input. Walking the +// resultant iterator will return all key/value pairs of all input iterators +// in strictly increasing key order, as defined by cmp. +// The input's key ranges may overlap, but there are assumed to be no duplicate +// keys: if iters[i] contains a key k then iters[j] will not contain that key k. +// None of the iters may be nil. +// +// If strict is true the any 'corruption errors' (i.e errors.IsCorrupted(err) == true) +// won't be ignored and will halt 'merged iterator', otherwise the iterator will +// continue to the next 'input iterator'. 
+func NewMergedIterator(iters []Iterator, cmp comparer.Comparer, strict bool) Iterator { + return &mergedIterator{ + iters: iters, + cmp: cmp, + strict: strict, + keys: make([][]byte, len(iters)), + } +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/journal/journal.go b/vendor/github.com/syndtr/goleveldb/leveldb/journal/journal.go new file mode 100644 index 000000000..d094c3d0f --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/journal/journal.go @@ -0,0 +1,524 @@ +// Copyright 2011 The LevelDB-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Taken from: https://code.google.com/p/leveldb-go/source/browse/leveldb/record/record.go?r=1d5ccbe03246da926391ee12d1c6caae054ff4b0 +// License, authors and contributors informations can be found at bellow URLs respectively: +// https://code.google.com/p/leveldb-go/source/browse/LICENSE +// https://code.google.com/p/leveldb-go/source/browse/AUTHORS +// https://code.google.com/p/leveldb-go/source/browse/CONTRIBUTORS + +// Package journal reads and writes sequences of journals. Each journal is a stream +// of bytes that completes before the next journal starts. +// +// When reading, call Next to obtain an io.Reader for the next journal. Next will +// return io.EOF when there are no more journals. It is valid to call Next +// without reading the current journal to exhaustion. +// +// When writing, call Next to obtain an io.Writer for the next journal. Calling +// Next finishes the current journal. Call Close to finish the final journal. +// +// Optionally, call Flush to finish the current journal and flush the underlying +// writer without starting a new journal. To start a new journal after flushing, +// call Next. +// +// Neither Readers or Writers are safe to use concurrently. 
+// +// Example code: +// func read(r io.Reader) ([]string, error) { +// var ss []string +// journals := journal.NewReader(r, nil, true, true) +// for { +// j, err := journals.Next() +// if err == io.EOF { +// break +// } +// if err != nil { +// return nil, err +// } +// s, err := ioutil.ReadAll(j) +// if err != nil { +// return nil, err +// } +// ss = append(ss, string(s)) +// } +// return ss, nil +// } +// +// func write(w io.Writer, ss []string) error { +// journals := journal.NewWriter(w) +// for _, s := range ss { +// j, err := journals.Next() +// if err != nil { +// return err +// } +// if _, err := j.Write([]byte(s)), err != nil { +// return err +// } +// } +// return journals.Close() +// } +// +// The wire format is that the stream is divided into 32KiB blocks, and each +// block contains a number of tightly packed chunks. Chunks cannot cross block +// boundaries. The last block may be shorter than 32 KiB. Any unused bytes in a +// block must be zero. +// +// A journal maps to one or more chunks. Each chunk has a 7 byte header (a 4 +// byte checksum, a 2 byte little-endian uint16 length, and a 1 byte chunk type) +// followed by a payload. The checksum is over the chunk type and the payload. +// +// There are four chunk types: whether the chunk is the full journal, or the +// first, middle or last chunk of a multi-chunk journal. A multi-chunk journal +// has one first chunk, zero or more middle chunks, and one last chunk. +// +// The wire format allows for limited recovery in the face of data corruption: +// on a format error (such as a checksum mismatch), the reader moves to the +// next block and looks for the next full or first chunk. +package journal + +import ( + "encoding/binary" + "fmt" + "io" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// These constants are part of the wire format and should not be changed. 
+const ( + fullChunkType = 1 + firstChunkType = 2 + middleChunkType = 3 + lastChunkType = 4 +) + +const ( + blockSize = 32 * 1024 + headerSize = 7 +) + +type flusher interface { + Flush() error +} + +// ErrCorrupted is the error type that generated by corrupted block or chunk. +type ErrCorrupted struct { + Size int + Reason string +} + +func (e *ErrCorrupted) Error() string { + return fmt.Sprintf("leveldb/journal: block/chunk corrupted: %s (%d bytes)", e.Reason, e.Size) +} + +// Dropper is the interface that wrap simple Drop method. The Drop +// method will be called when the journal reader dropping a block or chunk. +type Dropper interface { + Drop(err error) +} + +// Reader reads journals from an underlying io.Reader. +type Reader struct { + // r is the underlying reader. + r io.Reader + // the dropper. + dropper Dropper + // strict flag. + strict bool + // checksum flag. + checksum bool + // seq is the sequence number of the current journal. + seq int + // buf[i:j] is the unread portion of the current chunk's payload. + // The low bound, i, excludes the chunk header. + i, j int + // n is the number of bytes of buf that are valid. Once reading has started, + // only the final block can have n < blockSize. + n int + // last is whether the current chunk is the last chunk of the journal. + last bool + // err is any accumulated error. + err error + // buf is the buffer. + buf [blockSize]byte +} + +// NewReader returns a new reader. The dropper may be nil, and if +// strict is true then corrupted or invalid chunk will halt the journal +// reader entirely. 
+func NewReader(r io.Reader, dropper Dropper, strict, checksum bool) *Reader { + return &Reader{ + r: r, + dropper: dropper, + strict: strict, + checksum: checksum, + last: true, + } +} + +var errSkip = errors.New("leveldb/journal: skipped") + +func (r *Reader) corrupt(n int, reason string, skip bool) error { + if r.dropper != nil { + r.dropper.Drop(&ErrCorrupted{n, reason}) + } + if r.strict && !skip { + r.err = errors.NewErrCorrupted(storage.FileDesc{}, &ErrCorrupted{n, reason}) + return r.err + } + return errSkip +} + +// nextChunk sets r.buf[r.i:r.j] to hold the next chunk's payload, reading the +// next block into the buffer if necessary. +func (r *Reader) nextChunk(first bool) error { + for { + if r.j+headerSize <= r.n { + checksum := binary.LittleEndian.Uint32(r.buf[r.j+0 : r.j+4]) + length := binary.LittleEndian.Uint16(r.buf[r.j+4 : r.j+6]) + chunkType := r.buf[r.j+6] + unprocBlock := r.n - r.j + if checksum == 0 && length == 0 && chunkType == 0 { + // Drop entire block. + r.i = r.n + r.j = r.n + return r.corrupt(unprocBlock, "zero header", false) + } + if chunkType < fullChunkType || chunkType > lastChunkType { + // Drop entire block. + r.i = r.n + r.j = r.n + return r.corrupt(unprocBlock, fmt.Sprintf("invalid chunk type %#x", chunkType), false) + } + r.i = r.j + headerSize + r.j = r.j + headerSize + int(length) + if r.j > r.n { + // Drop entire block. + r.i = r.n + r.j = r.n + return r.corrupt(unprocBlock, "chunk length overflows block", false) + } else if r.checksum && checksum != util.NewCRC(r.buf[r.i-1:r.j]).Value() { + // Drop entire block. + r.i = r.n + r.j = r.n + return r.corrupt(unprocBlock, "checksum mismatch", false) + } + if first && chunkType != fullChunkType && chunkType != firstChunkType { + chunkLength := (r.j - r.i) + headerSize + r.i = r.j + // Report the error, but skip it. + return r.corrupt(chunkLength, "orphan chunk", true) + } + r.last = chunkType == fullChunkType || chunkType == lastChunkType + return nil + } + + // The last block. 
+ if r.n < blockSize && r.n > 0 { + if !first { + return r.corrupt(0, "missing chunk part", false) + } + r.err = io.EOF + return r.err + } + + // Read block. + n, err := io.ReadFull(r.r, r.buf[:]) + if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF { + return err + } + if n == 0 { + if !first { + return r.corrupt(0, "missing chunk part", false) + } + r.err = io.EOF + return r.err + } + r.i, r.j, r.n = 0, 0, n + } +} + +// Next returns a reader for the next journal. It returns io.EOF if there are no +// more journals. The reader returned becomes stale after the next Next call, +// and should no longer be used. If strict is false, the reader will returns +// io.ErrUnexpectedEOF error when found corrupted journal. +func (r *Reader) Next() (io.Reader, error) { + r.seq++ + if r.err != nil { + return nil, r.err + } + r.i = r.j + for { + if err := r.nextChunk(true); err == nil { + break + } else if err != errSkip { + return nil, err + } + } + return &singleReader{r, r.seq, nil}, nil +} + +// Reset resets the journal reader, allows reuse of the journal reader. Reset returns +// last accumulated error. 
+func (r *Reader) Reset(reader io.Reader, dropper Dropper, strict, checksum bool) error { + r.seq++ + err := r.err + r.r = reader + r.dropper = dropper + r.strict = strict + r.checksum = checksum + r.i = 0 + r.j = 0 + r.n = 0 + r.last = true + r.err = nil + return err +} + +type singleReader struct { + r *Reader + seq int + err error +} + +func (x *singleReader) Read(p []byte) (int, error) { + r := x.r + if r.seq != x.seq { + return 0, errors.New("leveldb/journal: stale reader") + } + if x.err != nil { + return 0, x.err + } + if r.err != nil { + return 0, r.err + } + for r.i == r.j { + if r.last { + return 0, io.EOF + } + x.err = r.nextChunk(false) + if x.err != nil { + if x.err == errSkip { + x.err = io.ErrUnexpectedEOF + } + return 0, x.err + } + } + n := copy(p, r.buf[r.i:r.j]) + r.i += n + return n, nil +} + +func (x *singleReader) ReadByte() (byte, error) { + r := x.r + if r.seq != x.seq { + return 0, errors.New("leveldb/journal: stale reader") + } + if x.err != nil { + return 0, x.err + } + if r.err != nil { + return 0, r.err + } + for r.i == r.j { + if r.last { + return 0, io.EOF + } + x.err = r.nextChunk(false) + if x.err != nil { + if x.err == errSkip { + x.err = io.ErrUnexpectedEOF + } + return 0, x.err + } + } + c := r.buf[r.i] + r.i++ + return c, nil +} + +// Writer writes journals to an underlying io.Writer. +type Writer struct { + // w is the underlying writer. + w io.Writer + // seq is the sequence number of the current journal. + seq int + // f is w as a flusher. + f flusher + // buf[i:j] is the bytes that will become the current chunk. + // The low bound, i, includes the chunk header. + i, j int + // buf[:written] has already been written to w. + // written is zero unless Flush has been called. + written int + // first is whether the current chunk is the first chunk of the journal. + first bool + // pending is whether a chunk is buffered but not yet written. + pending bool + // err is any accumulated error. + err error + // buf is the buffer. 
+ buf [blockSize]byte +} + +// NewWriter returns a new Writer. +func NewWriter(w io.Writer) *Writer { + f, _ := w.(flusher) + return &Writer{ + w: w, + f: f, + } +} + +// fillHeader fills in the header for the pending chunk. +func (w *Writer) fillHeader(last bool) { + if w.i+headerSize > w.j || w.j > blockSize { + panic("leveldb/journal: bad writer state") + } + if last { + if w.first { + w.buf[w.i+6] = fullChunkType + } else { + w.buf[w.i+6] = lastChunkType + } + } else { + if w.first { + w.buf[w.i+6] = firstChunkType + } else { + w.buf[w.i+6] = middleChunkType + } + } + binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], util.NewCRC(w.buf[w.i+6:w.j]).Value()) + binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-headerSize)) +} + +// writeBlock writes the buffered block to the underlying writer, and reserves +// space for the next chunk's header. +func (w *Writer) writeBlock() { + _, w.err = w.w.Write(w.buf[w.written:]) + w.i = 0 + w.j = headerSize + w.written = 0 +} + +// writePending finishes the current journal and writes the buffer to the +// underlying writer. +func (w *Writer) writePending() { + if w.err != nil { + return + } + if w.pending { + w.fillHeader(true) + w.pending = false + } + _, w.err = w.w.Write(w.buf[w.written:w.j]) + w.written = w.j +} + +// Close finishes the current journal and closes the writer. +func (w *Writer) Close() error { + w.seq++ + w.writePending() + if w.err != nil { + return w.err + } + w.err = errors.New("leveldb/journal: closed Writer") + return nil +} + +// Flush finishes the current journal, writes to the underlying writer, and +// flushes it if that writer implements interface{ Flush() error }. +func (w *Writer) Flush() error { + w.seq++ + w.writePending() + if w.err != nil { + return w.err + } + if w.f != nil { + w.err = w.f.Flush() + return w.err + } + return nil +} + +// Reset resets the journal writer, allows reuse of the journal writer. Reset +// will also closes the journal writer if not already. 
+func (w *Writer) Reset(writer io.Writer) (err error) { + w.seq++ + if w.err == nil { + w.writePending() + err = w.err + } + w.w = writer + w.f, _ = writer.(flusher) + w.i = 0 + w.j = 0 + w.written = 0 + w.first = false + w.pending = false + w.err = nil + return +} + +// Next returns a writer for the next journal. The writer returned becomes stale +// after the next Close, Flush or Next call, and should no longer be used. +func (w *Writer) Next() (io.Writer, error) { + w.seq++ + if w.err != nil { + return nil, w.err + } + if w.pending { + w.fillHeader(true) + } + w.i = w.j + w.j = w.j + headerSize + // Check if there is room in the block for the header. + if w.j > blockSize { + // Fill in the rest of the block with zeroes. + for k := w.i; k < blockSize; k++ { + w.buf[k] = 0 + } + w.writeBlock() + if w.err != nil { + return nil, w.err + } + } + w.first = true + w.pending = true + return singleWriter{w, w.seq}, nil +} + +type singleWriter struct { + w *Writer + seq int +} + +func (x singleWriter) Write(p []byte) (int, error) { + w := x.w + if w.seq != x.seq { + return 0, errors.New("leveldb/journal: stale writer") + } + if w.err != nil { + return 0, w.err + } + n0 := len(p) + for len(p) > 0 { + // Write a block, if it is full. + if w.j == blockSize { + w.fillHeader(false) + w.writeBlock() + if w.err != nil { + return 0, w.err + } + w.first = false + } + // Copy bytes into the buffer. + n := copy(w.buf[w.j:], p) + w.j += n + p = p[n:] + } + return n0, nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/key.go b/vendor/github.com/syndtr/goleveldb/leveldb/key.go new file mode 100644 index 000000000..ad8f51ec8 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/key.go @@ -0,0 +1,143 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package leveldb + +import ( + "encoding/binary" + "fmt" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +// ErrInternalKeyCorrupted records internal key corruption. +type ErrInternalKeyCorrupted struct { + Ikey []byte + Reason string +} + +func (e *ErrInternalKeyCorrupted) Error() string { + return fmt.Sprintf("leveldb: internal key %q corrupted: %s", e.Ikey, e.Reason) +} + +func newErrInternalKeyCorrupted(ikey []byte, reason string) error { + return errors.NewErrCorrupted(storage.FileDesc{}, &ErrInternalKeyCorrupted{append([]byte{}, ikey...), reason}) +} + +type keyType uint + +func (kt keyType) String() string { + switch kt { + case keyTypeDel: + return "d" + case keyTypeVal: + return "v" + } + return fmt.Sprintf("", uint(kt)) +} + +// Value types encoded as the last component of internal keys. +// Don't modify; this value are saved to disk. +const ( + keyTypeDel = keyType(0) + keyTypeVal = keyType(1) +) + +// keyTypeSeek defines the keyType that should be passed when constructing an +// internal key for seeking to a particular sequence number (since we +// sort sequence numbers in decreasing order and the value type is +// embedded as the low 8 bits in the sequence number in internal keys, +// we need to use the highest-numbered ValueType, not the lowest). +const keyTypeSeek = keyTypeVal + +const ( + // Maximum value possible for sequence number; the 8-bits are + // used by value type, so its can packed together in single + // 64-bit integer. + keyMaxSeq = (uint64(1) << 56) - 1 + // Maximum value possible for packed sequence number and type. + keyMaxNum = (keyMaxSeq << 8) | uint64(keyTypeSeek) +) + +// Maximum number encoded in bytes. 
+var keyMaxNumBytes = make([]byte, 8) + +func init() { + binary.LittleEndian.PutUint64(keyMaxNumBytes, keyMaxNum) +} + +type internalKey []byte + +func makeInternalKey(dst, ukey []byte, seq uint64, kt keyType) internalKey { + if seq > keyMaxSeq { + panic("leveldb: invalid sequence number") + } else if kt > keyTypeVal { + panic("leveldb: invalid type") + } + + dst = ensureBuffer(dst, len(ukey)+8) + copy(dst, ukey) + binary.LittleEndian.PutUint64(dst[len(ukey):], (seq<<8)|uint64(kt)) + return internalKey(dst) +} + +func parseInternalKey(ik []byte) (ukey []byte, seq uint64, kt keyType, err error) { + if len(ik) < 8 { + return nil, 0, 0, newErrInternalKeyCorrupted(ik, "invalid length") + } + num := binary.LittleEndian.Uint64(ik[len(ik)-8:]) + seq, kt = uint64(num>>8), keyType(num&0xff) + if kt > keyTypeVal { + return nil, 0, 0, newErrInternalKeyCorrupted(ik, "invalid type") + } + ukey = ik[:len(ik)-8] + return +} + +func validInternalKey(ik []byte) bool { + _, _, _, err := parseInternalKey(ik) + return err == nil +} + +func (ik internalKey) assert() { + if ik == nil { + panic("leveldb: nil internalKey") + } + if len(ik) < 8 { + panic(fmt.Sprintf("leveldb: internal key %q, len=%d: invalid length", []byte(ik), len(ik))) + } +} + +func (ik internalKey) ukey() []byte { + ik.assert() + return ik[:len(ik)-8] +} + +func (ik internalKey) num() uint64 { + ik.assert() + return binary.LittleEndian.Uint64(ik[len(ik)-8:]) +} + +func (ik internalKey) parseNum() (seq uint64, kt keyType) { + num := ik.num() + seq, kt = uint64(num>>8), keyType(num&0xff) + if kt > keyTypeVal { + panic(fmt.Sprintf("leveldb: internal key %q, len=%d: invalid type %#x", []byte(ik), len(ik), kt)) + } + return +} + +func (ik internalKey) String() string { + if ik == nil { + return "" + } + + if ukey, seq, kt, err := parseInternalKey(ik); err == nil { + return fmt.Sprintf("%s,%s%d", shorten(string(ukey)), kt, seq) + } + return fmt.Sprintf("", []byte(ik)) +} diff --git 
a/vendor/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go b/vendor/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go new file mode 100644 index 000000000..824e47f5f --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go @@ -0,0 +1,479 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package memdb provides in-memory key/value database implementation. +package memdb + +import ( + "math/rand" + "sync" + + "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// Common errors. +var ( + ErrNotFound = errors.ErrNotFound + ErrIterReleased = errors.New("leveldb/memdb: iterator released") +) + +const tMaxHeight = 12 + +type dbIter struct { + util.BasicReleaser + p *DB + slice *util.Range + node int + forward bool + key, value []byte + err error +} + +func (i *dbIter) fill(checkStart, checkLimit bool) bool { + if i.node != 0 { + n := i.p.nodeData[i.node] + m := n + i.p.nodeData[i.node+nKey] + i.key = i.p.kvData[n:m] + if i.slice != nil { + switch { + case checkLimit && i.slice.Limit != nil && i.p.cmp.Compare(i.key, i.slice.Limit) >= 0: + fallthrough + case checkStart && i.slice.Start != nil && i.p.cmp.Compare(i.key, i.slice.Start) < 0: + i.node = 0 + goto bail + } + } + i.value = i.p.kvData[m : m+i.p.nodeData[i.node+nVal]] + return true + } +bail: + i.key = nil + i.value = nil + return false +} + +func (i *dbIter) Valid() bool { + return i.node != 0 +} + +func (i *dbIter) First() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + i.forward = true + i.p.mu.RLock() + defer i.p.mu.RUnlock() + if i.slice != nil && i.slice.Start != nil { + i.node, _ = i.p.findGE(i.slice.Start, false) + } else { + i.node = i.p.nodeData[nNext] + } + return i.fill(false, 
true) +} + +func (i *dbIter) Last() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + i.forward = false + i.p.mu.RLock() + defer i.p.mu.RUnlock() + if i.slice != nil && i.slice.Limit != nil { + i.node = i.p.findLT(i.slice.Limit) + } else { + i.node = i.p.findLast() + } + return i.fill(true, false) +} + +func (i *dbIter) Seek(key []byte) bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + i.forward = true + i.p.mu.RLock() + defer i.p.mu.RUnlock() + if i.slice != nil && i.slice.Start != nil && i.p.cmp.Compare(key, i.slice.Start) < 0 { + key = i.slice.Start + } + i.node, _ = i.p.findGE(key, false) + return i.fill(false, true) +} + +func (i *dbIter) Next() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + if i.node == 0 { + if !i.forward { + return i.First() + } + return false + } + i.forward = true + i.p.mu.RLock() + defer i.p.mu.RUnlock() + i.node = i.p.nodeData[i.node+nNext] + return i.fill(false, true) +} + +func (i *dbIter) Prev() bool { + if i.Released() { + i.err = ErrIterReleased + return false + } + + if i.node == 0 { + if i.forward { + return i.Last() + } + return false + } + i.forward = false + i.p.mu.RLock() + defer i.p.mu.RUnlock() + i.node = i.p.findLT(i.key) + return i.fill(true, false) +} + +func (i *dbIter) Key() []byte { + return i.key +} + +func (i *dbIter) Value() []byte { + return i.value +} + +func (i *dbIter) Error() error { return i.err } + +func (i *dbIter) Release() { + if !i.Released() { + i.p = nil + i.node = 0 + i.key = nil + i.value = nil + i.BasicReleaser.Release() + } +} + +const ( + nKV = iota + nKey + nVal + nHeight + nNext +) + +// DB is an in-memory key/value database. 
+type DB struct { + cmp comparer.BasicComparer + rnd *rand.Rand + + mu sync.RWMutex + kvData []byte + // Node data: + // [0] : KV offset + // [1] : Key length + // [2] : Value length + // [3] : Height + // [3..height] : Next nodes + nodeData []int + prevNode [tMaxHeight]int + maxHeight int + n int + kvSize int +} + +func (p *DB) randHeight() (h int) { + const branching = 4 + h = 1 + for h < tMaxHeight && p.rnd.Int()%branching == 0 { + h++ + } + return +} + +// Must hold RW-lock if prev == true, as it use shared prevNode slice. +func (p *DB) findGE(key []byte, prev bool) (int, bool) { + node := 0 + h := p.maxHeight - 1 + for { + next := p.nodeData[node+nNext+h] + cmp := 1 + if next != 0 { + o := p.nodeData[next] + cmp = p.cmp.Compare(p.kvData[o:o+p.nodeData[next+nKey]], key) + } + if cmp < 0 { + // Keep searching in this list + node = next + } else { + if prev { + p.prevNode[h] = node + } else if cmp == 0 { + return next, true + } + if h == 0 { + return next, cmp == 0 + } + h-- + } + } +} + +func (p *DB) findLT(key []byte) int { + node := 0 + h := p.maxHeight - 1 + for { + next := p.nodeData[node+nNext+h] + o := p.nodeData[next] + if next == 0 || p.cmp.Compare(p.kvData[o:o+p.nodeData[next+nKey]], key) >= 0 { + if h == 0 { + break + } + h-- + } else { + node = next + } + } + return node +} + +func (p *DB) findLast() int { + node := 0 + h := p.maxHeight - 1 + for { + next := p.nodeData[node+nNext+h] + if next == 0 { + if h == 0 { + break + } + h-- + } else { + node = next + } + } + return node +} + +// Put sets the value for the given key. It overwrites any previous value +// for that key; a DB is not a multi-map. +// +// It is safe to modify the contents of the arguments after Put returns. +func (p *DB) Put(key []byte, value []byte) error { + p.mu.Lock() + defer p.mu.Unlock() + + if node, exact := p.findGE(key, true); exact { + kvOffset := len(p.kvData) + p.kvData = append(p.kvData, key...) + p.kvData = append(p.kvData, value...) 
+ p.nodeData[node] = kvOffset + m := p.nodeData[node+nVal] + p.nodeData[node+nVal] = len(value) + p.kvSize += len(value) - m + return nil + } + + h := p.randHeight() + if h > p.maxHeight { + for i := p.maxHeight; i < h; i++ { + p.prevNode[i] = 0 + } + p.maxHeight = h + } + + kvOffset := len(p.kvData) + p.kvData = append(p.kvData, key...) + p.kvData = append(p.kvData, value...) + // Node + node := len(p.nodeData) + p.nodeData = append(p.nodeData, kvOffset, len(key), len(value), h) + for i, n := range p.prevNode[:h] { + m := n + nNext + i + p.nodeData = append(p.nodeData, p.nodeData[m]) + p.nodeData[m] = node + } + + p.kvSize += len(key) + len(value) + p.n++ + return nil +} + +// Delete deletes the value for the given key. It returns ErrNotFound if +// the DB does not contain the key. +// +// It is safe to modify the contents of the arguments after Delete returns. +func (p *DB) Delete(key []byte) error { + p.mu.Lock() + defer p.mu.Unlock() + + node, exact := p.findGE(key, true) + if !exact { + return ErrNotFound + } + + h := p.nodeData[node+nHeight] + for i, n := range p.prevNode[:h] { + m := n + nNext + i + p.nodeData[m] = p.nodeData[p.nodeData[m]+nNext+i] + } + + p.kvSize -= p.nodeData[node+nKey] + p.nodeData[node+nVal] + p.n-- + return nil +} + +// Contains returns true if the given key are in the DB. +// +// It is safe to modify the contents of the arguments after Contains returns. +func (p *DB) Contains(key []byte) bool { + p.mu.RLock() + _, exact := p.findGE(key, false) + p.mu.RUnlock() + return exact +} + +// Get gets the value for the given key. It returns error.ErrNotFound if the +// DB does not contain the key. +// +// The caller should not modify the contents of the returned slice, but +// it is safe to modify the contents of the argument after Get returns. 
+func (p *DB) Get(key []byte) (value []byte, err error) { + p.mu.RLock() + if node, exact := p.findGE(key, false); exact { + o := p.nodeData[node] + p.nodeData[node+nKey] + value = p.kvData[o : o+p.nodeData[node+nVal]] + } else { + err = ErrNotFound + } + p.mu.RUnlock() + return +} + +// Find finds key/value pair whose key is greater than or equal to the +// given key. It returns ErrNotFound if the table doesn't contain +// such pair. +// +// The caller should not modify the contents of the returned slice, but +// it is safe to modify the contents of the argument after Find returns. +func (p *DB) Find(key []byte) (rkey, value []byte, err error) { + p.mu.RLock() + if node, _ := p.findGE(key, false); node != 0 { + n := p.nodeData[node] + m := n + p.nodeData[node+nKey] + rkey = p.kvData[n:m] + value = p.kvData[m : m+p.nodeData[node+nVal]] + } else { + err = ErrNotFound + } + p.mu.RUnlock() + return +} + +// NewIterator returns an iterator of the DB. +// The returned iterator is not safe for concurrent use, but it is safe to use +// multiple iterators concurrently, with each in a dedicated goroutine. +// It is also safe to use an iterator concurrently with modifying its +// underlying DB. However, the resultant key/value pairs are not guaranteed +// to be a consistent snapshot of the DB at a particular point in time. +// +// Slice allows slicing the iterator to only contains keys in the given +// range. A nil Range.Start is treated as a key before all keys in the +// DB. And a nil Range.Limit is treated as a key after all keys in +// the DB. +// +// WARNING: Any slice returned by interator (e.g. slice returned by calling +// Iterator.Key() or Iterator.Key() methods), its content should not be modified +// unless noted otherwise. +// +// The iterator must be released after use, by calling Release method. +// +// Also read Iterator documentation of the leveldb/iterator package. 
+func (p *DB) NewIterator(slice *util.Range) iterator.Iterator { + return &dbIter{p: p, slice: slice} +} + +// Capacity returns keys/values buffer capacity. +func (p *DB) Capacity() int { + p.mu.RLock() + defer p.mu.RUnlock() + return cap(p.kvData) +} + +// Size returns sum of keys and values length. Note that deleted +// key/value will not be accounted for, but it will still consume +// the buffer, since the buffer is append only. +func (p *DB) Size() int { + p.mu.RLock() + defer p.mu.RUnlock() + return p.kvSize +} + +// Free returns keys/values free buffer before need to grow. +func (p *DB) Free() int { + p.mu.RLock() + defer p.mu.RUnlock() + return cap(p.kvData) - len(p.kvData) +} + +// Len returns the number of entries in the DB. +func (p *DB) Len() int { + p.mu.RLock() + defer p.mu.RUnlock() + return p.n +} + +// Reset resets the DB to initial empty state. Allows reuse the buffer. +func (p *DB) Reset() { + p.mu.Lock() + p.rnd = rand.New(rand.NewSource(0xdeadbeef)) + p.maxHeight = 1 + p.n = 0 + p.kvSize = 0 + p.kvData = p.kvData[:0] + p.nodeData = p.nodeData[:nNext+tMaxHeight] + p.nodeData[nKV] = 0 + p.nodeData[nKey] = 0 + p.nodeData[nVal] = 0 + p.nodeData[nHeight] = tMaxHeight + for n := 0; n < tMaxHeight; n++ { + p.nodeData[nNext+n] = 0 + p.prevNode[n] = 0 + } + p.mu.Unlock() +} + +// New creates a new initialized in-memory key/value DB. The capacity +// is the initial key/value buffer capacity. The capacity is advisory, +// not enforced. +// +// This DB is append-only, deleting an entry would remove entry node but not +// reclaim KV buffer. +// +// The returned DB instance is safe for concurrent use. 
+func New(cmp comparer.BasicComparer, capacity int) *DB { + p := &DB{ + cmp: cmp, + rnd: rand.New(rand.NewSource(0xdeadbeef)), + maxHeight: 1, + kvData: make([]byte, 0, capacity), + nodeData: make([]int, 4+tMaxHeight), + } + p.nodeData[nHeight] = tMaxHeight + return p +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/opt/options.go b/vendor/github.com/syndtr/goleveldb/leveldb/opt/options.go new file mode 100644 index 000000000..528b16423 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/opt/options.go @@ -0,0 +1,697 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package opt provides sets of options used by LevelDB. +package opt + +import ( + "math" + + "github.com/syndtr/goleveldb/leveldb/cache" + "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/filter" +) + +const ( + KiB = 1024 + MiB = KiB * 1024 + GiB = MiB * 1024 +) + +var ( + DefaultBlockCacher = LRUCacher + DefaultBlockCacheCapacity = 8 * MiB + DefaultBlockRestartInterval = 16 + DefaultBlockSize = 4 * KiB + DefaultCompactionExpandLimitFactor = 25 + DefaultCompactionGPOverlapsFactor = 10 + DefaultCompactionL0Trigger = 4 + DefaultCompactionSourceLimitFactor = 1 + DefaultCompactionTableSize = 2 * MiB + DefaultCompactionTableSizeMultiplier = 1.0 + DefaultCompactionTotalSize = 10 * MiB + DefaultCompactionTotalSizeMultiplier = 10.0 + DefaultCompressionType = SnappyCompression + DefaultIteratorSamplingRate = 1 * MiB + DefaultOpenFilesCacher = LRUCacher + DefaultOpenFilesCacheCapacity = 500 + DefaultWriteBuffer = 4 * MiB + DefaultWriteL0PauseTrigger = 12 + DefaultWriteL0SlowdownTrigger = 8 +) + +// Cacher is a caching algorithm. 
+type Cacher interface { + New(capacity int) cache.Cacher +} + +type CacherFunc struct { + NewFunc func(capacity int) cache.Cacher +} + +func (f *CacherFunc) New(capacity int) cache.Cacher { + if f.NewFunc != nil { + return f.NewFunc(capacity) + } + return nil +} + +func noCacher(int) cache.Cacher { return nil } + +var ( + // LRUCacher is the LRU-cache algorithm. + LRUCacher = &CacherFunc{cache.NewLRU} + + // NoCacher is the value to disable caching algorithm. + NoCacher = &CacherFunc{} +) + +// Compression is the 'sorted table' block compression algorithm to use. +type Compression uint + +func (c Compression) String() string { + switch c { + case DefaultCompression: + return "default" + case NoCompression: + return "none" + case SnappyCompression: + return "snappy" + } + return "invalid" +} + +const ( + DefaultCompression Compression = iota + NoCompression + SnappyCompression + nCompression +) + +// Strict is the DB 'strict level'. +type Strict uint + +const ( + // If present then a corrupted or invalid chunk or block in manifest + // journal will cause an error instead of being dropped. + // This will prevent database with corrupted manifest to be opened. + StrictManifest Strict = 1 << iota + + // If present then journal chunk checksum will be verified. + StrictJournalChecksum + + // If present then a corrupted or invalid chunk or block in journal + // will cause an error instead of being dropped. + // This will prevent database with corrupted journal to be opened. + StrictJournal + + // If present then 'sorted table' block checksum will be verified. + // This has effect on both 'read operation' and compaction. + StrictBlockChecksum + + // If present then a corrupted 'sorted table' will fails compaction. + // The database will enter read-only mode. + StrictCompaction + + // If present then a corrupted 'sorted table' will halts 'read operation'. + StrictReader + + // If present then leveldb.Recover will drop corrupted 'sorted table'. 
+ StrictRecovery + + // This only applicable for ReadOptions, if present then this ReadOptions + // 'strict level' will override global ones. + StrictOverride + + // StrictAll enables all strict flags. + StrictAll = StrictManifest | StrictJournalChecksum | StrictJournal | StrictBlockChecksum | StrictCompaction | StrictReader | StrictRecovery + + // DefaultStrict is the default strict flags. Specify any strict flags + // will override default strict flags as whole (i.e. not OR'ed). + DefaultStrict = StrictJournalChecksum | StrictBlockChecksum | StrictCompaction | StrictReader + + // NoStrict disables all strict flags. Override default strict flags. + NoStrict = ^StrictAll +) + +// Options holds the optional parameters for the DB at large. +type Options struct { + // AltFilters defines one or more 'alternative filters'. + // 'alternative filters' will be used during reads if a filter block + // does not match with the 'effective filter'. + // + // The default value is nil + AltFilters []filter.Filter + + // BlockCacher provides cache algorithm for LevelDB 'sorted table' block caching. + // Specify NoCacher to disable caching algorithm. + // + // The default value is LRUCacher. + BlockCacher Cacher + + // BlockCacheCapacity defines the capacity of the 'sorted table' block caching. + // Use -1 for zero, this has same effect as specifying NoCacher to BlockCacher. + // + // The default value is 8MiB. + BlockCacheCapacity int + + // BlockCacheEvictRemoved allows enable forced-eviction on cached block belonging + // to removed 'sorted table'. + // + // The default if false. + BlockCacheEvictRemoved bool + + // BlockRestartInterval is the number of keys between restart points for + // delta encoding of keys. + // + // The default value is 16. + BlockRestartInterval int + + // BlockSize is the minimum uncompressed size in bytes of each 'sorted table' + // block. + // + // The default value is 4KiB. 
+ BlockSize int + + // CompactionExpandLimitFactor limits compaction size after expanded. + // This will be multiplied by table size limit at compaction target level. + // + // The default value is 25. + CompactionExpandLimitFactor int + + // CompactionGPOverlapsFactor limits overlaps in grandparent (Level + 2) that a + // single 'sorted table' generates. + // This will be multiplied by table size limit at grandparent level. + // + // The default value is 10. + CompactionGPOverlapsFactor int + + // CompactionL0Trigger defines number of 'sorted table' at level-0 that will + // trigger compaction. + // + // The default value is 4. + CompactionL0Trigger int + + // CompactionSourceLimitFactor limits compaction source size. This doesn't apply to + // level-0. + // This will be multiplied by table size limit at compaction target level. + // + // The default value is 1. + CompactionSourceLimitFactor int + + // CompactionTableSize limits size of 'sorted table' that compaction generates. + // The limits for each level will be calculated as: + // CompactionTableSize * (CompactionTableSizeMultiplier ^ Level) + // The multiplier for each level can also fine-tuned using CompactionTableSizeMultiplierPerLevel. + // + // The default value is 2MiB. + CompactionTableSize int + + // CompactionTableSizeMultiplier defines multiplier for CompactionTableSize. + // + // The default value is 1. + CompactionTableSizeMultiplier float64 + + // CompactionTableSizeMultiplierPerLevel defines per-level multiplier for + // CompactionTableSize. + // Use zero to skip a level. + // + // The default value is nil. + CompactionTableSizeMultiplierPerLevel []float64 + + // CompactionTotalSize limits total size of 'sorted table' for each level. + // The limits for each level will be calculated as: + // CompactionTotalSize * (CompactionTotalSizeMultiplier ^ Level) + // The multiplier for each level can also fine-tuned using + // CompactionTotalSizeMultiplierPerLevel. + // + // The default value is 10MiB. 
+ CompactionTotalSize int + + // CompactionTotalSizeMultiplier defines multiplier for CompactionTotalSize. + // + // The default value is 10. + CompactionTotalSizeMultiplier float64 + + // CompactionTotalSizeMultiplierPerLevel defines per-level multiplier for + // CompactionTotalSize. + // Use zero to skip a level. + // + // The default value is nil. + CompactionTotalSizeMultiplierPerLevel []float64 + + // Comparer defines a total ordering over the space of []byte keys: a 'less + // than' relationship. The same comparison algorithm must be used for reads + // and writes over the lifetime of the DB. + // + // The default value uses the same ordering as bytes.Compare. + Comparer comparer.Comparer + + // Compression defines the 'sorted table' block compression to use. + // + // The default value (DefaultCompression) uses snappy compression. + Compression Compression + + // DisableBufferPool allows disable use of util.BufferPool functionality. + // + // The default value is false. + DisableBufferPool bool + + // DisableBlockCache allows disable use of cache.Cache functionality on + // 'sorted table' block. + // + // The default value is false. + DisableBlockCache bool + + // DisableCompactionBackoff allows disable compaction retry backoff. + // + // The default value is false. + DisableCompactionBackoff bool + + // DisableLargeBatchTransaction allows disabling switch-to-transaction mode + // on large batch write. If enable batch writes large than WriteBuffer will + // use transaction. + // + // The default is false. + DisableLargeBatchTransaction bool + + // ErrorIfExist defines whether an error should returned if the DB already + // exist. + // + // The default value is false. + ErrorIfExist bool + + // ErrorIfMissing defines whether an error should returned if the DB is + // missing. If false then the database will be created if missing, otherwise + // an error will be returned. + // + // The default value is false. 
+ ErrorIfMissing bool + + // Filter defines an 'effective filter' to use. An 'effective filter' + // if defined will be used to generate per-table filter block. + // The filter name will be stored on disk. + // During reads LevelDB will try to find matching filter from + // 'effective filter' and 'alternative filters'. + // + // Filter can be changed after a DB has been created. It is recommended + // to put old filter to the 'alternative filters' to mitigate lack of + // filter during transition period. + // + // A filter is used to reduce disk reads when looking for a specific key. + // + // The default value is nil. + Filter filter.Filter + + // IteratorSamplingRate defines approximate gap (in bytes) between read + // sampling of an iterator. The samples will be used to determine when + // compaction should be triggered. + // + // The default is 1MiB. + IteratorSamplingRate int + + // NoSync allows completely disable fsync. + // + // The default is false. + NoSync bool + + // NoWriteMerge allows disabling write merge. + // + // The default is false. + NoWriteMerge bool + + // OpenFilesCacher provides cache algorithm for open files caching. + // Specify NoCacher to disable caching algorithm. + // + // The default value is LRUCacher. + OpenFilesCacher Cacher + + // OpenFilesCacheCapacity defines the capacity of the open files caching. + // Use -1 for zero, this has same effect as specifying NoCacher to OpenFilesCacher. + // + // The default value is 500. + OpenFilesCacheCapacity int + + // If true then opens DB in read-only mode. + // + // The default value is false. + ReadOnly bool + + // Strict defines the DB strict level. + Strict Strict + + // WriteBuffer defines maximum size of a 'memdb' before flushed to + // 'sorted table'. 'memdb' is an in-memory DB backed by an on-disk + // unsorted journal. + // + // LevelDB may held up to two 'memdb' at the same time. + // + // The default value is 4MiB. 
+ WriteBuffer int + + // WriteL0StopTrigger defines number of 'sorted table' at level-0 that will + // pause write. + // + // The default value is 12. + WriteL0PauseTrigger int + + // WriteL0SlowdownTrigger defines number of 'sorted table' at level-0 that + // will trigger write slowdown. + // + // The default value is 8. + WriteL0SlowdownTrigger int +} + +func (o *Options) GetAltFilters() []filter.Filter { + if o == nil { + return nil + } + return o.AltFilters +} + +func (o *Options) GetBlockCacher() Cacher { + if o == nil || o.BlockCacher == nil { + return DefaultBlockCacher + } else if o.BlockCacher == NoCacher { + return nil + } + return o.BlockCacher +} + +func (o *Options) GetBlockCacheCapacity() int { + if o == nil || o.BlockCacheCapacity == 0 { + return DefaultBlockCacheCapacity + } else if o.BlockCacheCapacity < 0 { + return 0 + } + return o.BlockCacheCapacity +} + +func (o *Options) GetBlockCacheEvictRemoved() bool { + if o == nil { + return false + } + return o.BlockCacheEvictRemoved +} + +func (o *Options) GetBlockRestartInterval() int { + if o == nil || o.BlockRestartInterval <= 0 { + return DefaultBlockRestartInterval + } + return o.BlockRestartInterval +} + +func (o *Options) GetBlockSize() int { + if o == nil || o.BlockSize <= 0 { + return DefaultBlockSize + } + return o.BlockSize +} + +func (o *Options) GetCompactionExpandLimit(level int) int { + factor := DefaultCompactionExpandLimitFactor + if o != nil && o.CompactionExpandLimitFactor > 0 { + factor = o.CompactionExpandLimitFactor + } + return o.GetCompactionTableSize(level+1) * factor +} + +func (o *Options) GetCompactionGPOverlaps(level int) int { + factor := DefaultCompactionGPOverlapsFactor + if o != nil && o.CompactionGPOverlapsFactor > 0 { + factor = o.CompactionGPOverlapsFactor + } + return o.GetCompactionTableSize(level+2) * factor +} + +func (o *Options) GetCompactionL0Trigger() int { + if o == nil || o.CompactionL0Trigger == 0 { + return DefaultCompactionL0Trigger + } + return 
o.CompactionL0Trigger +} + +func (o *Options) GetCompactionSourceLimit(level int) int { + factor := DefaultCompactionSourceLimitFactor + if o != nil && o.CompactionSourceLimitFactor > 0 { + factor = o.CompactionSourceLimitFactor + } + return o.GetCompactionTableSize(level+1) * factor +} + +func (o *Options) GetCompactionTableSize(level int) int { + var ( + base = DefaultCompactionTableSize + mult float64 + ) + if o != nil { + if o.CompactionTableSize > 0 { + base = o.CompactionTableSize + } + if level < len(o.CompactionTableSizeMultiplierPerLevel) && o.CompactionTableSizeMultiplierPerLevel[level] > 0 { + mult = o.CompactionTableSizeMultiplierPerLevel[level] + } else if o.CompactionTableSizeMultiplier > 0 { + mult = math.Pow(o.CompactionTableSizeMultiplier, float64(level)) + } + } + if mult == 0 { + mult = math.Pow(DefaultCompactionTableSizeMultiplier, float64(level)) + } + return int(float64(base) * mult) +} + +func (o *Options) GetCompactionTotalSize(level int) int64 { + var ( + base = DefaultCompactionTotalSize + mult float64 + ) + if o != nil { + if o.CompactionTotalSize > 0 { + base = o.CompactionTotalSize + } + if level < len(o.CompactionTotalSizeMultiplierPerLevel) && o.CompactionTotalSizeMultiplierPerLevel[level] > 0 { + mult = o.CompactionTotalSizeMultiplierPerLevel[level] + } else if o.CompactionTotalSizeMultiplier > 0 { + mult = math.Pow(o.CompactionTotalSizeMultiplier, float64(level)) + } + } + if mult == 0 { + mult = math.Pow(DefaultCompactionTotalSizeMultiplier, float64(level)) + } + return int64(float64(base) * mult) +} + +func (o *Options) GetComparer() comparer.Comparer { + if o == nil || o.Comparer == nil { + return comparer.DefaultComparer + } + return o.Comparer +} + +func (o *Options) GetCompression() Compression { + if o == nil || o.Compression <= DefaultCompression || o.Compression >= nCompression { + return DefaultCompressionType + } + return o.Compression +} + +func (o *Options) GetDisableBufferPool() bool { + if o == nil { + return false + 
} + return o.DisableBufferPool +} + +func (o *Options) GetDisableBlockCache() bool { + if o == nil { + return false + } + return o.DisableBlockCache +} + +func (o *Options) GetDisableCompactionBackoff() bool { + if o == nil { + return false + } + return o.DisableCompactionBackoff +} + +func (o *Options) GetDisableLargeBatchTransaction() bool { + if o == nil { + return false + } + return o.DisableLargeBatchTransaction +} + +func (o *Options) GetErrorIfExist() bool { + if o == nil { + return false + } + return o.ErrorIfExist +} + +func (o *Options) GetErrorIfMissing() bool { + if o == nil { + return false + } + return o.ErrorIfMissing +} + +func (o *Options) GetFilter() filter.Filter { + if o == nil { + return nil + } + return o.Filter +} + +func (o *Options) GetIteratorSamplingRate() int { + if o == nil || o.IteratorSamplingRate <= 0 { + return DefaultIteratorSamplingRate + } + return o.IteratorSamplingRate +} + +func (o *Options) GetNoSync() bool { + if o == nil { + return false + } + return o.NoSync +} + +func (o *Options) GetNoWriteMerge() bool { + if o == nil { + return false + } + return o.NoWriteMerge +} + +func (o *Options) GetOpenFilesCacher() Cacher { + if o == nil || o.OpenFilesCacher == nil { + return DefaultOpenFilesCacher + } + if o.OpenFilesCacher == NoCacher { + return nil + } + return o.OpenFilesCacher +} + +func (o *Options) GetOpenFilesCacheCapacity() int { + if o == nil || o.OpenFilesCacheCapacity == 0 { + return DefaultOpenFilesCacheCapacity + } else if o.OpenFilesCacheCapacity < 0 { + return 0 + } + return o.OpenFilesCacheCapacity +} + +func (o *Options) GetReadOnly() bool { + if o == nil { + return false + } + return o.ReadOnly +} + +func (o *Options) GetStrict(strict Strict) bool { + if o == nil || o.Strict == 0 { + return DefaultStrict&strict != 0 + } + return o.Strict&strict != 0 +} + +func (o *Options) GetWriteBuffer() int { + if o == nil || o.WriteBuffer <= 0 { + return DefaultWriteBuffer + } + return o.WriteBuffer +} + +func (o *Options) 
GetWriteL0PauseTrigger() int { + if o == nil || o.WriteL0PauseTrigger == 0 { + return DefaultWriteL0PauseTrigger + } + return o.WriteL0PauseTrigger +} + +func (o *Options) GetWriteL0SlowdownTrigger() int { + if o == nil || o.WriteL0SlowdownTrigger == 0 { + return DefaultWriteL0SlowdownTrigger + } + return o.WriteL0SlowdownTrigger +} + +// ReadOptions holds the optional parameters for 'read operation'. The +// 'read operation' includes Get, Find and NewIterator. +type ReadOptions struct { + // DontFillCache defines whether block reads for this 'read operation' + // should be cached. If false then the block will be cached. This does + // not affects already cached block. + // + // The default value is false. + DontFillCache bool + + // Strict will be OR'ed with global DB 'strict level' unless StrictOverride + // is present. Currently only StrictReader that has effect here. + Strict Strict +} + +func (ro *ReadOptions) GetDontFillCache() bool { + if ro == nil { + return false + } + return ro.DontFillCache +} + +func (ro *ReadOptions) GetStrict(strict Strict) bool { + if ro == nil { + return false + } + return ro.Strict&strict != 0 +} + +// WriteOptions holds the optional parameters for 'write operation'. The +// 'write operation' includes Write, Put and Delete. +type WriteOptions struct { + // NoWriteMerge allows disabling write merge. + // + // The default is false. + NoWriteMerge bool + + // Sync is whether to sync underlying writes from the OS buffer cache + // through to actual disk, if applicable. Setting Sync can result in + // slower writes. + // + // If false, and the machine crashes, then some recent writes may be lost. + // Note that if it is just the process that crashes (and the machine does + // not) then no writes will be lost. + // + // In other words, Sync being false has the same semantics as a write + // system call. Sync being true means write followed by fsync. + // + // The default value is false. 
+ Sync bool +} + +func (wo *WriteOptions) GetNoWriteMerge() bool { + if wo == nil { + return false + } + return wo.NoWriteMerge +} + +func (wo *WriteOptions) GetSync() bool { + if wo == nil { + return false + } + return wo.Sync +} + +func GetStrict(o *Options, ro *ReadOptions, strict Strict) bool { + if ro.GetStrict(StrictOverride) { + return ro.GetStrict(strict) + } else { + return o.GetStrict(strict) || ro.GetStrict(strict) + } +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/options.go b/vendor/github.com/syndtr/goleveldb/leveldb/options.go new file mode 100644 index 000000000..b072b1ac4 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/options.go @@ -0,0 +1,107 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/filter" + "github.com/syndtr/goleveldb/leveldb/opt" +) + +func dupOptions(o *opt.Options) *opt.Options { + newo := &opt.Options{} + if o != nil { + *newo = *o + } + if newo.Strict == 0 { + newo.Strict = opt.DefaultStrict + } + return newo +} + +func (s *session) setOptions(o *opt.Options) { + no := dupOptions(o) + // Alternative filters. + if filters := o.GetAltFilters(); len(filters) > 0 { + no.AltFilters = make([]filter.Filter, len(filters)) + for i, filter := range filters { + no.AltFilters[i] = &iFilter{filter} + } + } + // Comparer. + s.icmp = &iComparer{o.GetComparer()} + no.Comparer = s.icmp + // Filter. 
+ if filter := o.GetFilter(); filter != nil { + no.Filter = &iFilter{filter} + } + + s.o = &cachedOptions{Options: no} + s.o.cache() +} + +const optCachedLevel = 7 + +type cachedOptions struct { + *opt.Options + + compactionExpandLimit []int + compactionGPOverlaps []int + compactionSourceLimit []int + compactionTableSize []int + compactionTotalSize []int64 +} + +func (co *cachedOptions) cache() { + co.compactionExpandLimit = make([]int, optCachedLevel) + co.compactionGPOverlaps = make([]int, optCachedLevel) + co.compactionSourceLimit = make([]int, optCachedLevel) + co.compactionTableSize = make([]int, optCachedLevel) + co.compactionTotalSize = make([]int64, optCachedLevel) + + for level := 0; level < optCachedLevel; level++ { + co.compactionExpandLimit[level] = co.Options.GetCompactionExpandLimit(level) + co.compactionGPOverlaps[level] = co.Options.GetCompactionGPOverlaps(level) + co.compactionSourceLimit[level] = co.Options.GetCompactionSourceLimit(level) + co.compactionTableSize[level] = co.Options.GetCompactionTableSize(level) + co.compactionTotalSize[level] = co.Options.GetCompactionTotalSize(level) + } +} + +func (co *cachedOptions) GetCompactionExpandLimit(level int) int { + if level < optCachedLevel { + return co.compactionExpandLimit[level] + } + return co.Options.GetCompactionExpandLimit(level) +} + +func (co *cachedOptions) GetCompactionGPOverlaps(level int) int { + if level < optCachedLevel { + return co.compactionGPOverlaps[level] + } + return co.Options.GetCompactionGPOverlaps(level) +} + +func (co *cachedOptions) GetCompactionSourceLimit(level int) int { + if level < optCachedLevel { + return co.compactionSourceLimit[level] + } + return co.Options.GetCompactionSourceLimit(level) +} + +func (co *cachedOptions) GetCompactionTableSize(level int) int { + if level < optCachedLevel { + return co.compactionTableSize[level] + } + return co.Options.GetCompactionTableSize(level) +} + +func (co *cachedOptions) GetCompactionTotalSize(level int) int64 { + if level 
< optCachedLevel { + return co.compactionTotalSize[level] + } + return co.Options.GetCompactionTotalSize(level) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/session.go b/vendor/github.com/syndtr/goleveldb/leveldb/session.go new file mode 100644 index 000000000..3f391f934 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/session.go @@ -0,0 +1,210 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "fmt" + "io" + "os" + "sync" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/journal" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +// ErrManifestCorrupted records manifest corruption. This error will be +// wrapped with errors.ErrCorrupted. +type ErrManifestCorrupted struct { + Field string + Reason string +} + +func (e *ErrManifestCorrupted) Error() string { + return fmt.Sprintf("leveldb: manifest corrupted (field '%s'): %s", e.Field, e.Reason) +} + +func newErrManifestCorrupted(fd storage.FileDesc, field, reason string) error { + return errors.NewErrCorrupted(fd, &ErrManifestCorrupted{field, reason}) +} + +// session represent a persistent database session. +type session struct { + // Need 64-bit alignment. 
+ stNextFileNum int64 // current unused file number + stJournalNum int64 // current journal file number; need external synchronization + stPrevJournalNum int64 // prev journal file number; no longer used; for compatibility with older version of leveldb + stTempFileNum int64 + stSeqNum uint64 // last mem compacted seq; need external synchronization + + stor *iStorage + storLock storage.Locker + o *cachedOptions + icmp *iComparer + tops *tOps + fileRef map[int64]int + + manifest *journal.Writer + manifestWriter storage.Writer + manifestFd storage.FileDesc + + stCompPtrs []internalKey // compaction pointers; need external synchronization + stVersion *version // current version + vmu sync.Mutex +} + +// Creates new initialized session instance. +func newSession(stor storage.Storage, o *opt.Options) (s *session, err error) { + if stor == nil { + return nil, os.ErrInvalid + } + storLock, err := stor.Lock() + if err != nil { + return + } + s = &session{ + stor: newIStorage(stor), + storLock: storLock, + fileRef: make(map[int64]int), + } + s.setOptions(o) + s.tops = newTableOps(s) + s.setVersion(newVersion(s)) + s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock Ke·KeyError D·DroppedEntry L·Level Q·SeqNum T·TimeElapsed") + return +} + +// Close session. +func (s *session) close() { + s.tops.close() + if s.manifest != nil { + s.manifest.Close() + } + if s.manifestWriter != nil { + s.manifestWriter.Close() + } + s.manifest = nil + s.manifestWriter = nil + s.setVersion(&version{s: s, closing: true}) +} + +// Release session lock. +func (s *session) release() { + s.storLock.Unlock() +} + +// Create a new database session; need external synchronization. +func (s *session) create() error { + // create manifest + return s.newManifest(nil, nil) +} + +// Recover a database session; need external synchronization. 
+func (s *session) recover() (err error) { + defer func() { + if os.IsNotExist(err) { + // Don't return os.ErrNotExist if the underlying storage contains + // other files that belong to LevelDB. So the DB won't get trashed. + if fds, _ := s.stor.List(storage.TypeAll); len(fds) > 0 { + err = &errors.ErrCorrupted{Fd: storage.FileDesc{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}} + } + } + }() + + fd, err := s.stor.GetMeta() + if err != nil { + return + } + + reader, err := s.stor.Open(fd) + if err != nil { + return + } + defer reader.Close() + + var ( + // Options. + strict = s.o.GetStrict(opt.StrictManifest) + + jr = journal.NewReader(reader, dropper{s, fd}, strict, true) + rec = &sessionRecord{} + staging = s.stVersion.newStaging() + ) + for { + var r io.Reader + r, err = jr.Next() + if err != nil { + if err == io.EOF { + err = nil + break + } + return errors.SetFd(err, fd) + } + + err = rec.decode(r) + if err == nil { + // save compact pointers + for _, r := range rec.compPtrs { + s.setCompPtr(r.level, internalKey(r.ikey)) + } + // commit record to version staging + staging.commit(rec) + } else { + err = errors.SetFd(err, fd) + if strict || !errors.IsCorrupted(err) { + return + } + s.logf("manifest error: %v (skipped)", errors.SetFd(err, fd)) + } + rec.resetCompPtrs() + rec.resetAddedTables() + rec.resetDeletedTables() + } + + switch { + case !rec.has(recComparer): + return newErrManifestCorrupted(fd, "comparer", "missing") + case rec.comparer != s.icmp.uName(): + return newErrManifestCorrupted(fd, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer)) + case !rec.has(recNextFileNum): + return newErrManifestCorrupted(fd, "next-file-num", "missing") + case !rec.has(recJournalNum): + return newErrManifestCorrupted(fd, "journal-file-num", "missing") + case !rec.has(recSeqNum): + return newErrManifestCorrupted(fd, "seq-num", "missing") + } + + s.manifestFd = fd + s.setVersion(staging.finish()) + 
s.setNextFileNum(rec.nextFileNum) + s.recordCommited(rec) + return nil +} + +// Commit session; need external synchronization. +func (s *session) commit(r *sessionRecord) (err error) { + v := s.version() + defer v.release() + + // spawn new version based on current version + nv := v.spawn(r) + + if s.manifest == nil { + // manifest journal writer not yet created, create one + err = s.newManifest(r, nv) + } else { + err = s.flushManifest(r) + } + + // finally, apply new version if no error rise + if err == nil { + s.setVersion(nv) + } + + return +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/session_compaction.go b/vendor/github.com/syndtr/goleveldb/leveldb/session_compaction.go new file mode 100644 index 000000000..089cd00b2 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/session_compaction.go @@ -0,0 +1,302 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "sync/atomic" + + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/opt" +) + +func (s *session) pickMemdbLevel(umin, umax []byte, maxLevel int) int { + v := s.version() + defer v.release() + return v.pickMemdbLevel(umin, umax, maxLevel) +} + +func (s *session) flushMemdb(rec *sessionRecord, mdb *memdb.DB, maxLevel int) (int, error) { + // Create sorted table. + iter := mdb.NewIterator(nil) + defer iter.Release() + t, n, err := s.tops.createFrom(iter) + if err != nil { + return 0, err + } + + // Pick level other than zero can cause compaction issue with large + // bulk insert and delete on strictly incrementing key-space. The + // problem is that the small deletion markers trapped at lower level, + // while key/value entries keep growing at higher level. 
Since the + // key-space is strictly incrementing it will not overlaps with + // higher level, thus maximum possible level is always picked, while + // overlapping deletion marker pushed into lower level. + // See: https://github.com/syndtr/goleveldb/issues/127. + flushLevel := s.pickMemdbLevel(t.imin.ukey(), t.imax.ukey(), maxLevel) + rec.addTableFile(flushLevel, t) + + s.logf("memdb@flush created L%d@%d N·%d S·%s %q:%q", flushLevel, t.fd.Num, n, shortenb(int(t.size)), t.imin, t.imax) + return flushLevel, nil +} + +// Pick a compaction based on current state; need external synchronization. +func (s *session) pickCompaction() *compaction { + v := s.version() + + var sourceLevel int + var t0 tFiles + if v.cScore >= 1 { + sourceLevel = v.cLevel + cptr := s.getCompPtr(sourceLevel) + tables := v.levels[sourceLevel] + for _, t := range tables { + if cptr == nil || s.icmp.Compare(t.imax, cptr) > 0 { + t0 = append(t0, t) + break + } + } + if len(t0) == 0 { + t0 = append(t0, tables[0]) + } + } else { + if p := atomic.LoadPointer(&v.cSeek); p != nil { + ts := (*tSet)(p) + sourceLevel = ts.level + t0 = append(t0, ts.table) + } else { + v.release() + return nil + } + } + + return newCompaction(s, v, sourceLevel, t0) +} + +// Create compaction from given level and range; need external synchronization. +func (s *session) getCompactionRange(sourceLevel int, umin, umax []byte, noLimit bool) *compaction { + v := s.version() + + if sourceLevel >= len(v.levels) { + v.release() + return nil + } + + t0 := v.levels[sourceLevel].getOverlaps(nil, s.icmp, umin, umax, sourceLevel == 0) + if len(t0) == 0 { + v.release() + return nil + } + + // Avoid compacting too much in one shot in case the range is large. + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. 
+ if !noLimit && sourceLevel > 0 { + limit := int64(v.s.o.GetCompactionSourceLimit(sourceLevel)) + total := int64(0) + for i, t := range t0 { + total += t.size + if total >= limit { + s.logf("table@compaction limiting F·%d -> F·%d", len(t0), i+1) + t0 = t0[:i+1] + break + } + } + } + + return newCompaction(s, v, sourceLevel, t0) +} + +func newCompaction(s *session, v *version, sourceLevel int, t0 tFiles) *compaction { + c := &compaction{ + s: s, + v: v, + sourceLevel: sourceLevel, + levels: [2]tFiles{t0, nil}, + maxGPOverlaps: int64(s.o.GetCompactionGPOverlaps(sourceLevel)), + tPtrs: make([]int, len(v.levels)), + } + c.expand() + c.save() + return c +} + +// compaction represent a compaction state. +type compaction struct { + s *session + v *version + + sourceLevel int + levels [2]tFiles + maxGPOverlaps int64 + + gp tFiles + gpi int + seenKey bool + gpOverlappedBytes int64 + imin, imax internalKey + tPtrs []int + released bool + + snapGPI int + snapSeenKey bool + snapGPOverlappedBytes int64 + snapTPtrs []int +} + +func (c *compaction) save() { + c.snapGPI = c.gpi + c.snapSeenKey = c.seenKey + c.snapGPOverlappedBytes = c.gpOverlappedBytes + c.snapTPtrs = append(c.snapTPtrs[:0], c.tPtrs...) +} + +func (c *compaction) restore() { + c.gpi = c.snapGPI + c.seenKey = c.snapSeenKey + c.gpOverlappedBytes = c.snapGPOverlappedBytes + c.tPtrs = append(c.tPtrs[:0], c.snapTPtrs...) +} + +func (c *compaction) release() { + if !c.released { + c.released = true + c.v.release() + } +} + +// Expand compacted tables; need external synchronization. +func (c *compaction) expand() { + limit := int64(c.s.o.GetCompactionExpandLimit(c.sourceLevel)) + vt0 := c.v.levels[c.sourceLevel] + vt1 := tFiles{} + if level := c.sourceLevel + 1; level < len(c.v.levels) { + vt1 = c.v.levels[level] + } + + t0, t1 := c.levels[0], c.levels[1] + imin, imax := t0.getRange(c.s.icmp) + // We expand t0 here just incase ukey hop across tables. 
+ t0 = vt0.getOverlaps(t0, c.s.icmp, imin.ukey(), imax.ukey(), c.sourceLevel == 0) + if len(t0) != len(c.levels[0]) { + imin, imax = t0.getRange(c.s.icmp) + } + t1 = vt1.getOverlaps(t1, c.s.icmp, imin.ukey(), imax.ukey(), false) + // Get entire range covered by compaction. + amin, amax := append(t0, t1...).getRange(c.s.icmp) + + // See if we can grow the number of inputs in "sourceLevel" without + // changing the number of "sourceLevel+1" files we pick up. + if len(t1) > 0 { + exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), c.sourceLevel == 0) + if len(exp0) > len(t0) && t1.size()+exp0.size() < limit { + xmin, xmax := exp0.getRange(c.s.icmp) + exp1 := vt1.getOverlaps(nil, c.s.icmp, xmin.ukey(), xmax.ukey(), false) + if len(exp1) == len(t1) { + c.s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)", + c.sourceLevel, c.sourceLevel+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())), + len(exp0), shortenb(int(exp0.size())), len(exp1), shortenb(int(exp1.size()))) + imin, imax = xmin, xmax + t0, t1 = exp0, exp1 + amin, amax = append(t0, t1...).getRange(c.s.icmp) + } + } + } + + // Compute the set of grandparent files that overlap this compaction + // (parent == sourceLevel+1; grandparent == sourceLevel+2) + if level := c.sourceLevel + 2; level < len(c.v.levels) { + c.gp = c.v.levels[level].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false) + } + + c.levels[0], c.levels[1] = t0, t1 + c.imin, c.imax = imin, imax +} + +// Check whether compaction is trivial. 
+func (c *compaction) trivial() bool { + return len(c.levels[0]) == 1 && len(c.levels[1]) == 0 && c.gp.size() <= c.maxGPOverlaps +} + +func (c *compaction) baseLevelForKey(ukey []byte) bool { + for level := c.sourceLevel + 2; level < len(c.v.levels); level++ { + tables := c.v.levels[level] + for c.tPtrs[level] < len(tables) { + t := tables[c.tPtrs[level]] + if c.s.icmp.uCompare(ukey, t.imax.ukey()) <= 0 { + // We've advanced far enough. + if c.s.icmp.uCompare(ukey, t.imin.ukey()) >= 0 { + // Key falls in this file's range, so definitely not base level. + return false + } + break + } + c.tPtrs[level]++ + } + } + return true +} + +func (c *compaction) shouldStopBefore(ikey internalKey) bool { + for ; c.gpi < len(c.gp); c.gpi++ { + gp := c.gp[c.gpi] + if c.s.icmp.Compare(ikey, gp.imax) <= 0 { + break + } + if c.seenKey { + c.gpOverlappedBytes += gp.size + } + } + c.seenKey = true + + if c.gpOverlappedBytes > c.maxGPOverlaps { + // Too much overlap for current output; start new output. + c.gpOverlappedBytes = 0 + return true + } + return false +} + +// Creates an iterator. +func (c *compaction) newIterator() iterator.Iterator { + // Creates iterator slice. + icap := len(c.levels) + if c.sourceLevel == 0 { + // Special case for level-0. + icap = len(c.levels[0]) + 1 + } + its := make([]iterator.Iterator, 0, icap) + + // Options. + ro := &opt.ReadOptions{ + DontFillCache: true, + Strict: opt.StrictOverride, + } + strict := c.s.o.GetStrict(opt.StrictCompaction) + if strict { + ro.Strict |= opt.StrictReader + } + + for i, tables := range c.levels { + if len(tables) == 0 { + continue + } + + // Level-0 is not sorted and may overlaps each other. 
+ if c.sourceLevel+i == 0 { + for _, t := range tables { + its = append(its, c.s.tops.newIterator(t, nil, ro)) + } + } else { + it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict) + its = append(its, it) + } + } + + return iterator.NewMergedIterator(its, c.s.icmp, strict) +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/session_record.go b/vendor/github.com/syndtr/goleveldb/leveldb/session_record.go new file mode 100644 index 000000000..854e1aa6f --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/session_record.go @@ -0,0 +1,323 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "bufio" + "encoding/binary" + "io" + "strings" + + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +type byteReader interface { + io.Reader + io.ByteReader +} + +// These numbers are written to disk and should not be changed. +const ( + recComparer = 1 + recJournalNum = 2 + recNextFileNum = 3 + recSeqNum = 4 + recCompPtr = 5 + recDelTable = 6 + recAddTable = 7 + // 8 was used for large value refs + recPrevJournalNum = 9 +) + +type cpRecord struct { + level int + ikey internalKey +} + +type atRecord struct { + level int + num int64 + size int64 + imin internalKey + imax internalKey +} + +type dtRecord struct { + level int + num int64 +} + +type sessionRecord struct { + hasRec int + comparer string + journalNum int64 + prevJournalNum int64 + nextFileNum int64 + seqNum uint64 + compPtrs []cpRecord + addedTables []atRecord + deletedTables []dtRecord + + scratch [binary.MaxVarintLen64]byte + err error +} + +func (p *sessionRecord) has(rec int) bool { + return p.hasRec&(1< +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package leveldb + +import ( + "fmt" + "sync/atomic" + + "github.com/syndtr/goleveldb/leveldb/journal" + "github.com/syndtr/goleveldb/leveldb/storage" +) + +// Logging. + +type dropper struct { + s *session + fd storage.FileDesc +} + +func (d dropper) Drop(err error) { + if e, ok := err.(*journal.ErrCorrupted); ok { + d.s.logf("journal@drop %s-%d S·%s %q", d.fd.Type, d.fd.Num, shortenb(e.Size), e.Reason) + } else { + d.s.logf("journal@drop %s-%d %q", d.fd.Type, d.fd.Num, err) + } +} + +func (s *session) log(v ...interface{}) { s.stor.Log(fmt.Sprint(v...)) } +func (s *session) logf(format string, v ...interface{}) { s.stor.Log(fmt.Sprintf(format, v...)) } + +// File utils. + +func (s *session) newTemp() storage.FileDesc { + num := atomic.AddInt64(&s.stTempFileNum, 1) - 1 + return storage.FileDesc{Type: storage.TypeTemp, Num: num} +} + +func (s *session) addFileRef(fd storage.FileDesc, ref int) int { + ref += s.fileRef[fd.Num] + if ref > 0 { + s.fileRef[fd.Num] = ref + } else if ref == 0 { + delete(s.fileRef, fd.Num) + } else { + panic(fmt.Sprintf("negative ref: %v", fd)) + } + return ref +} + +// Session state. + +// Get current version. This will incr version ref, must call +// version.release (exactly once) after use. +func (s *session) version() *version { + s.vmu.Lock() + defer s.vmu.Unlock() + s.stVersion.incref() + return s.stVersion +} + +func (s *session) tLen(level int) int { + s.vmu.Lock() + defer s.vmu.Unlock() + return s.stVersion.tLen(level) +} + +// Set current version to v. +func (s *session) setVersion(v *version) { + s.vmu.Lock() + defer s.vmu.Unlock() + // Hold by session. It is important to call this first before releasing + // current version, otherwise the still used files might get released. + v.incref() + if s.stVersion != nil { + // Release current version. + s.stVersion.releaseNB() + } + s.stVersion = v +} + +// Get current unused file number. 
+func (s *session) nextFileNum() int64 { + return atomic.LoadInt64(&s.stNextFileNum) +} + +// Set current unused file number to num. +func (s *session) setNextFileNum(num int64) { + atomic.StoreInt64(&s.stNextFileNum, num) +} + +// Mark file number as used. +func (s *session) markFileNum(num int64) { + nextFileNum := num + 1 + for { + old, x := s.stNextFileNum, nextFileNum + if old > x { + x = old + } + if atomic.CompareAndSwapInt64(&s.stNextFileNum, old, x) { + break + } + } +} + +// Allocate a file number. +func (s *session) allocFileNum() int64 { + return atomic.AddInt64(&s.stNextFileNum, 1) - 1 +} + +// Reuse given file number. +func (s *session) reuseFileNum(num int64) { + for { + old, x := s.stNextFileNum, num + if old != x+1 { + x = old + } + if atomic.CompareAndSwapInt64(&s.stNextFileNum, old, x) { + break + } + } +} + +// Set compaction ptr at given level; need external synchronization. +func (s *session) setCompPtr(level int, ik internalKey) { + if level >= len(s.stCompPtrs) { + newCompPtrs := make([]internalKey, level+1) + copy(newCompPtrs, s.stCompPtrs) + s.stCompPtrs = newCompPtrs + } + s.stCompPtrs[level] = append(internalKey{}, ik...) +} + +// Get compaction ptr at given level; need external synchronization. +func (s *session) getCompPtr(level int) internalKey { + if level >= len(s.stCompPtrs) { + return nil + } + return s.stCompPtrs[level] +} + +// Manifest related utils. + +// Fill given session record obj with current states; need external +// synchronization. +func (s *session) fillRecord(r *sessionRecord, snapshot bool) { + r.setNextFileNum(s.nextFileNum()) + + if snapshot { + if !r.has(recJournalNum) { + r.setJournalNum(s.stJournalNum) + } + + if !r.has(recSeqNum) { + r.setSeqNum(s.stSeqNum) + } + + for level, ik := range s.stCompPtrs { + if ik != nil { + r.addCompPtr(level, ik) + } + } + + r.setComparer(s.icmp.uName()) + } +} + +// Mark if record has been committed, this will update session state; +// need external synchronization. 
+func (s *session) recordCommited(rec *sessionRecord) { + if rec.has(recJournalNum) { + s.stJournalNum = rec.journalNum + } + + if rec.has(recPrevJournalNum) { + s.stPrevJournalNum = rec.prevJournalNum + } + + if rec.has(recSeqNum) { + s.stSeqNum = rec.seqNum + } + + for _, r := range rec.compPtrs { + s.setCompPtr(r.level, internalKey(r.ikey)) + } +} + +// Create a new manifest file; need external synchronization. +func (s *session) newManifest(rec *sessionRecord, v *version) (err error) { + fd := storage.FileDesc{Type: storage.TypeManifest, Num: s.allocFileNum()} + writer, err := s.stor.Create(fd) + if err != nil { + return + } + jw := journal.NewWriter(writer) + + if v == nil { + v = s.version() + defer v.release() + } + if rec == nil { + rec = &sessionRecord{} + } + s.fillRecord(rec, true) + v.fillRecord(rec) + + defer func() { + if err == nil { + s.recordCommited(rec) + if s.manifest != nil { + s.manifest.Close() + } + if s.manifestWriter != nil { + s.manifestWriter.Close() + } + if !s.manifestFd.Zero() { + s.stor.Remove(s.manifestFd) + } + s.manifestFd = fd + s.manifestWriter = writer + s.manifest = jw + } else { + writer.Close() + s.stor.Remove(fd) + s.reuseFileNum(fd.Num) + } + }() + + w, err := jw.Next() + if err != nil { + return + } + err = rec.encode(w) + if err != nil { + return + } + err = jw.Flush() + if err != nil { + return + } + err = s.stor.SetMeta(fd) + return +} + +// Flush record to disk. 
+func (s *session) flushManifest(rec *sessionRecord) (err error) { + s.fillRecord(rec, false) + w, err := s.manifest.Next() + if err != nil { + return + } + err = rec.encode(w) + if err != nil { + return + } + err = s.manifest.Flush() + if err != nil { + return + } + if !s.o.GetNoSync() { + err = s.manifestWriter.Sync() + if err != nil { + return + } + } + s.recordCommited(rec) + return +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage.go new file mode 100644 index 000000000..d45fb5dfe --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage.go @@ -0,0 +1,63 @@ +package leveldb + +import ( + "github.com/syndtr/goleveldb/leveldb/storage" + "sync/atomic" +) + +type iStorage struct { + storage.Storage + read uint64 + write uint64 +} + +func (c *iStorage) Open(fd storage.FileDesc) (storage.Reader, error) { + r, err := c.Storage.Open(fd) + return &iStorageReader{r, c}, err +} + +func (c *iStorage) Create(fd storage.FileDesc) (storage.Writer, error) { + w, err := c.Storage.Create(fd) + return &iStorageWriter{w, c}, err +} + +func (c *iStorage) reads() uint64 { + return atomic.LoadUint64(&c.read) +} + +func (c *iStorage) writes() uint64 { + return atomic.LoadUint64(&c.write) +} + +// newIStorage returns the given storage wrapped by iStorage. 
+func newIStorage(s storage.Storage) *iStorage { + return &iStorage{s, 0, 0} +} + +type iStorageReader struct { + storage.Reader + c *iStorage +} + +func (r *iStorageReader) Read(p []byte) (n int, err error) { + n, err = r.Reader.Read(p) + atomic.AddUint64(&r.c.read, uint64(n)) + return n, err +} + +func (r *iStorageReader) ReadAt(p []byte, off int64) (n int, err error) { + n, err = r.Reader.ReadAt(p, off) + atomic.AddUint64(&r.c.read, uint64(n)) + return n, err +} + +type iStorageWriter struct { + storage.Writer + c *iStorage +} + +func (w *iStorageWriter) Write(p []byte) (n int, err error) { + n, err = w.Writer.Write(p) + atomic.AddUint64(&w.c.write, uint64(n)) + return n, err +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go new file mode 100644 index 000000000..9ba71fd6d --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage.go @@ -0,0 +1,671 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reservefs. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package storage + +import ( + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "runtime" + "sort" + "strconv" + "strings" + "sync" + "time" +) + +var ( + errFileOpen = errors.New("leveldb/storage: file still open") + errReadOnly = errors.New("leveldb/storage: storage is read-only") +) + +type fileLock interface { + release() error +} + +type fileStorageLock struct { + fs *fileStorage +} + +func (lock *fileStorageLock) Unlock() { + if lock.fs != nil { + lock.fs.mu.Lock() + defer lock.fs.mu.Unlock() + if lock.fs.slock == lock { + lock.fs.slock = nil + } + } +} + +type int64Slice []int64 + +func (p int64Slice) Len() int { return len(p) } +func (p int64Slice) Less(i, j int) bool { return p[i] < p[j] } +func (p int64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +func writeFileSynced(filename string, data []byte, perm os.FileMode) error { + f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm) + if err != nil { + return err + } + n, err := f.Write(data) + if err == nil && n < len(data) { + err = io.ErrShortWrite + } + if err1 := f.Sync(); err == nil { + err = err1 + } + if err1 := f.Close(); err == nil { + err = err1 + } + return err +} + +const logSizeThreshold = 1024 * 1024 // 1 MiB + +// fileStorage is a file-system backed storage. +type fileStorage struct { + path string + readOnly bool + + mu sync.Mutex + flock fileLock + slock *fileStorageLock + logw *os.File + logSize int64 + buf []byte + // Opened file counter; if open < 0 means closed. + open int + day int +} + +// OpenFile returns a new filesystem-backed storage implementation with the given +// path. This also acquire a file lock, so any subsequent attempt to open the +// same path will fail. +// +// The storage must be closed after use, by calling Close method. 
+func OpenFile(path string, readOnly bool) (Storage, error) { + if fi, err := os.Stat(path); err == nil { + if !fi.IsDir() { + return nil, fmt.Errorf("leveldb/storage: open %s: not a directory", path) + } + } else if os.IsNotExist(err) && !readOnly { + if err := os.MkdirAll(path, 0755); err != nil { + return nil, err + } + } else { + return nil, err + } + + flock, err := newFileLock(filepath.Join(path, "LOCK"), readOnly) + if err != nil { + return nil, err + } + + defer func() { + if err != nil { + flock.release() + } + }() + + var ( + logw *os.File + logSize int64 + ) + if !readOnly { + logw, err = os.OpenFile(filepath.Join(path, "LOG"), os.O_WRONLY|os.O_CREATE, 0644) + if err != nil { + return nil, err + } + logSize, err = logw.Seek(0, os.SEEK_END) + if err != nil { + logw.Close() + return nil, err + } + } + + fs := &fileStorage{ + path: path, + readOnly: readOnly, + flock: flock, + logw: logw, + logSize: logSize, + } + runtime.SetFinalizer(fs, (*fileStorage).Close) + return fs, nil +} + +func (fs *fileStorage) Lock() (Locker, error) { + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return nil, ErrClosed + } + if fs.readOnly { + return &fileStorageLock{}, nil + } + if fs.slock != nil { + return nil, ErrLocked + } + fs.slock = &fileStorageLock{fs: fs} + return fs.slock, nil +} + +func itoa(buf []byte, i int, wid int) []byte { + u := uint(i) + if u == 0 && wid <= 1 { + return append(buf, '0') + } + + // Assemble decimal in reverse order. + var b [32]byte + bp := len(b) + for ; u > 0 || wid > 0; u /= 10 { + bp-- + wid-- + b[bp] = byte(u%10) + '0' + } + return append(buf, b[bp:]...) +} + +func (fs *fileStorage) printDay(t time.Time) { + if fs.day == t.Day() { + return + } + fs.day = t.Day() + fs.logw.Write([]byte("=============== " + t.Format("Jan 2, 2006 (MST)") + " ===============\n")) +} + +func (fs *fileStorage) doLog(t time.Time, str string) { + if fs.logSize > logSizeThreshold { + // Rotate log file. 
+ fs.logw.Close() + fs.logw = nil + fs.logSize = 0 + rename(filepath.Join(fs.path, "LOG"), filepath.Join(fs.path, "LOG.old")) + } + if fs.logw == nil { + var err error + fs.logw, err = os.OpenFile(filepath.Join(fs.path, "LOG"), os.O_WRONLY|os.O_CREATE, 0644) + if err != nil { + return + } + // Force printDay on new log file. + fs.day = 0 + } + fs.printDay(t) + hour, min, sec := t.Clock() + msec := t.Nanosecond() / 1e3 + // time + fs.buf = itoa(fs.buf[:0], hour, 2) + fs.buf = append(fs.buf, ':') + fs.buf = itoa(fs.buf, min, 2) + fs.buf = append(fs.buf, ':') + fs.buf = itoa(fs.buf, sec, 2) + fs.buf = append(fs.buf, '.') + fs.buf = itoa(fs.buf, msec, 6) + fs.buf = append(fs.buf, ' ') + // write + fs.buf = append(fs.buf, []byte(str)...) + fs.buf = append(fs.buf, '\n') + n, _ := fs.logw.Write(fs.buf) + fs.logSize += int64(n) +} + +func (fs *fileStorage) Log(str string) { + if !fs.readOnly { + t := time.Now() + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return + } + fs.doLog(t, str) + } +} + +func (fs *fileStorage) log(str string) { + if !fs.readOnly { + fs.doLog(time.Now(), str) + } +} + +func (fs *fileStorage) setMeta(fd FileDesc) error { + content := fsGenName(fd) + "\n" + // Check and backup old CURRENT file. + currentPath := filepath.Join(fs.path, "CURRENT") + if _, err := os.Stat(currentPath); err == nil { + b, err := ioutil.ReadFile(currentPath) + if err != nil { + fs.log(fmt.Sprintf("backup CURRENT: %v", err)) + return err + } + if string(b) == content { + // Content not changed, do nothing. + return nil + } + if err := writeFileSynced(currentPath+".bak", b, 0644); err != nil { + fs.log(fmt.Sprintf("backup CURRENT: %v", err)) + return err + } + } else if !os.IsNotExist(err) { + return err + } + path := fmt.Sprintf("%s.%d", filepath.Join(fs.path, "CURRENT"), fd.Num) + if err := writeFileSynced(path, []byte(content), 0644); err != nil { + fs.log(fmt.Sprintf("create CURRENT.%d: %v", fd.Num, err)) + return err + } + // Replace CURRENT file. 
+ if err := rename(path, currentPath); err != nil { + fs.log(fmt.Sprintf("rename CURRENT.%d: %v", fd.Num, err)) + return err + } + // Sync root directory. + if err := syncDir(fs.path); err != nil { + fs.log(fmt.Sprintf("syncDir: %v", err)) + return err + } + return nil +} + +func (fs *fileStorage) SetMeta(fd FileDesc) error { + if !FileDescOk(fd) { + return ErrInvalidFile + } + if fs.readOnly { + return errReadOnly + } + + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return ErrClosed + } + return fs.setMeta(fd) +} + +func (fs *fileStorage) GetMeta() (FileDesc, error) { + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return FileDesc{}, ErrClosed + } + dir, err := os.Open(fs.path) + if err != nil { + return FileDesc{}, err + } + names, err := dir.Readdirnames(0) + // Close the dir first before checking for Readdirnames error. + if ce := dir.Close(); ce != nil { + fs.log(fmt.Sprintf("close dir: %v", ce)) + } + if err != nil { + return FileDesc{}, err + } + // Try this in order: + // - CURRENT.[0-9]+ ('pending rename' file, descending order) + // - CURRENT + // - CURRENT.bak + // + // Skip corrupted file or file that point to a missing target file. 
+ type currentFile struct { + name string + fd FileDesc + } + tryCurrent := func(name string) (*currentFile, error) { + b, err := ioutil.ReadFile(filepath.Join(fs.path, name)) + if err != nil { + if os.IsNotExist(err) { + err = os.ErrNotExist + } + return nil, err + } + var fd FileDesc + if len(b) < 1 || b[len(b)-1] != '\n' || !fsParseNamePtr(string(b[:len(b)-1]), &fd) { + fs.log(fmt.Sprintf("%s: corrupted content: %q", name, b)) + err := &ErrCorrupted{ + Err: errors.New("leveldb/storage: corrupted or incomplete CURRENT file"), + } + return nil, err + } + if _, err := os.Stat(filepath.Join(fs.path, fsGenName(fd))); err != nil { + if os.IsNotExist(err) { + fs.log(fmt.Sprintf("%s: missing target file: %s", name, fd)) + err = os.ErrNotExist + } + return nil, err + } + return ¤tFile{name: name, fd: fd}, nil + } + tryCurrents := func(names []string) (*currentFile, error) { + var ( + cur *currentFile + // Last corruption error. + lastCerr error + ) + for _, name := range names { + var err error + cur, err = tryCurrent(name) + if err == nil { + break + } else if err == os.ErrNotExist { + // Fallback to the next file. + } else if isCorrupted(err) { + lastCerr = err + // Fallback to the next file. + } else { + // In case the error is due to permission, etc. + return nil, err + } + } + if cur == nil { + err := os.ErrNotExist + if lastCerr != nil { + err = lastCerr + } + return nil, err + } + return cur, nil + } + + // Try 'pending rename' files. 
+ var nums []int64 + for _, name := range names { + if strings.HasPrefix(name, "CURRENT.") && name != "CURRENT.bak" { + i, err := strconv.ParseInt(name[8:], 10, 64) + if err == nil { + nums = append(nums, i) + } + } + } + var ( + pendCur *currentFile + pendErr = os.ErrNotExist + pendNames []string + ) + if len(nums) > 0 { + sort.Sort(sort.Reverse(int64Slice(nums))) + pendNames = make([]string, len(nums)) + for i, num := range nums { + pendNames[i] = fmt.Sprintf("CURRENT.%d", num) + } + pendCur, pendErr = tryCurrents(pendNames) + if pendErr != nil && pendErr != os.ErrNotExist && !isCorrupted(pendErr) { + return FileDesc{}, pendErr + } + } + + // Try CURRENT and CURRENT.bak. + curCur, curErr := tryCurrents([]string{"CURRENT", "CURRENT.bak"}) + if curErr != nil && curErr != os.ErrNotExist && !isCorrupted(curErr) { + return FileDesc{}, curErr + } + + // pendCur takes precedence, but guards against obsolete pendCur. + if pendCur != nil && (curCur == nil || pendCur.fd.Num > curCur.fd.Num) { + curCur = pendCur + } + + if curCur != nil { + // Restore CURRENT file to proper state. + if !fs.readOnly && (curCur.name != "CURRENT" || len(pendNames) != 0) { + // Ignore setMeta errors, however don't delete obsolete files if we + // catch error. + if err := fs.setMeta(curCur.fd); err == nil { + // Remove 'pending rename' files. + for _, name := range pendNames { + if err := os.Remove(filepath.Join(fs.path, name)); err != nil { + fs.log(fmt.Sprintf("remove %s: %v", name, err)) + } + } + } + } + return curCur.fd, nil + } + + // Nothing found. + if isCorrupted(pendErr) { + return FileDesc{}, pendErr + } + return FileDesc{}, curErr +} + +func (fs *fileStorage) List(ft FileType) (fds []FileDesc, err error) { + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return nil, ErrClosed + } + dir, err := os.Open(fs.path) + if err != nil { + return + } + names, err := dir.Readdirnames(0) + // Close the dir first before checking for Readdirnames error. 
+ if cerr := dir.Close(); cerr != nil { + fs.log(fmt.Sprintf("close dir: %v", cerr)) + } + if err == nil { + for _, name := range names { + if fd, ok := fsParseName(name); ok && fd.Type&ft != 0 { + fds = append(fds, fd) + } + } + } + return +} + +func (fs *fileStorage) Open(fd FileDesc) (Reader, error) { + if !FileDescOk(fd) { + return nil, ErrInvalidFile + } + + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return nil, ErrClosed + } + of, err := os.OpenFile(filepath.Join(fs.path, fsGenName(fd)), os.O_RDONLY, 0) + if err != nil { + if fsHasOldName(fd) && os.IsNotExist(err) { + of, err = os.OpenFile(filepath.Join(fs.path, fsGenOldName(fd)), os.O_RDONLY, 0) + if err == nil { + goto ok + } + } + return nil, err + } +ok: + fs.open++ + return &fileWrap{File: of, fs: fs, fd: fd}, nil +} + +func (fs *fileStorage) Create(fd FileDesc) (Writer, error) { + if !FileDescOk(fd) { + return nil, ErrInvalidFile + } + if fs.readOnly { + return nil, errReadOnly + } + + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return nil, ErrClosed + } + of, err := os.OpenFile(filepath.Join(fs.path, fsGenName(fd)), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + if err != nil { + return nil, err + } + fs.open++ + return &fileWrap{File: of, fs: fs, fd: fd}, nil +} + +func (fs *fileStorage) Remove(fd FileDesc) error { + if !FileDescOk(fd) { + return ErrInvalidFile + } + if fs.readOnly { + return errReadOnly + } + + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return ErrClosed + } + err := os.Remove(filepath.Join(fs.path, fsGenName(fd))) + if err != nil { + if fsHasOldName(fd) && os.IsNotExist(err) { + if e1 := os.Remove(filepath.Join(fs.path, fsGenOldName(fd))); !os.IsNotExist(e1) { + fs.log(fmt.Sprintf("remove %s: %v (old name)", fd, err)) + err = e1 + } + } else { + fs.log(fmt.Sprintf("remove %s: %v", fd, err)) + } + } + return err +} + +func (fs *fileStorage) Rename(oldfd, newfd FileDesc) error { + if !FileDescOk(oldfd) || !FileDescOk(newfd) { + return 
ErrInvalidFile + } + if oldfd == newfd { + return nil + } + if fs.readOnly { + return errReadOnly + } + + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return ErrClosed + } + return rename(filepath.Join(fs.path, fsGenName(oldfd)), filepath.Join(fs.path, fsGenName(newfd))) +} + +func (fs *fileStorage) Close() error { + fs.mu.Lock() + defer fs.mu.Unlock() + if fs.open < 0 { + return ErrClosed + } + // Clear the finalizer. + runtime.SetFinalizer(fs, nil) + + if fs.open > 0 { + fs.log(fmt.Sprintf("close: warning, %d files still open", fs.open)) + } + fs.open = -1 + if fs.logw != nil { + fs.logw.Close() + } + return fs.flock.release() +} + +type fileWrap struct { + *os.File + fs *fileStorage + fd FileDesc + closed bool +} + +func (fw *fileWrap) Sync() error { + if err := fw.File.Sync(); err != nil { + return err + } + if fw.fd.Type == TypeManifest { + // Also sync parent directory if file type is manifest. + // See: https://code.google.com/p/leveldb/issues/detail?id=190. + if err := syncDir(fw.fs.path); err != nil { + fw.fs.log(fmt.Sprintf("syncDir: %v", err)) + return err + } + } + return nil +} + +func (fw *fileWrap) Close() error { + fw.fs.mu.Lock() + defer fw.fs.mu.Unlock() + if fw.closed { + return ErrClosed + } + fw.closed = true + fw.fs.open-- + err := fw.File.Close() + if err != nil { + fw.fs.log(fmt.Sprintf("close %s: %v", fw.fd, err)) + } + return err +} + +func fsGenName(fd FileDesc) string { + switch fd.Type { + case TypeManifest: + return fmt.Sprintf("MANIFEST-%06d", fd.Num) + case TypeJournal: + return fmt.Sprintf("%06d.log", fd.Num) + case TypeTable: + return fmt.Sprintf("%06d.ldb", fd.Num) + case TypeTemp: + return fmt.Sprintf("%06d.tmp", fd.Num) + default: + panic("invalid file type") + } +} + +func fsHasOldName(fd FileDesc) bool { + return fd.Type == TypeTable +} + +func fsGenOldName(fd FileDesc) string { + switch fd.Type { + case TypeTable: + return fmt.Sprintf("%06d.sst", fd.Num) + } + return fsGenName(fd) +} + +func fsParseName(name 
string) (fd FileDesc, ok bool) { + var tail string + _, err := fmt.Sscanf(name, "%d.%s", &fd.Num, &tail) + if err == nil { + switch tail { + case "log": + fd.Type = TypeJournal + case "ldb", "sst": + fd.Type = TypeTable + case "tmp": + fd.Type = TypeTemp + default: + return + } + return fd, true + } + n, _ := fmt.Sscanf(name, "MANIFEST-%d%s", &fd.Num, &tail) + if n == 1 { + fd.Type = TypeManifest + return fd, true + } + return +} + +func fsParseNamePtr(name string, fd *FileDesc) bool { + _fd, ok := fsParseName(name) + if fd != nil { + *fd = _fd + } + return ok +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_nacl.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_nacl.go new file mode 100644 index 000000000..5545aeef2 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_nacl.go @@ -0,0 +1,34 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// +build nacl + +package storage + +import ( + "os" + "syscall" +) + +func newFileLock(path string, readOnly bool) (fl fileLock, err error) { + return nil, syscall.ENOTSUP +} + +func setFileLock(f *os.File, readOnly, lock bool) error { + return syscall.ENOTSUP +} + +func rename(oldpath, newpath string) error { + return syscall.ENOTSUP +} + +func isErrInvalid(err error) bool { + return false +} + +func syncDir(name string) error { + return syscall.ENOTSUP +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_plan9.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_plan9.go new file mode 100644 index 000000000..b82979801 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_plan9.go @@ -0,0 +1,63 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. 
+// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package storage + +import ( + "os" +) + +type plan9FileLock struct { + f *os.File +} + +func (fl *plan9FileLock) release() error { + return fl.f.Close() +} + +func newFileLock(path string, readOnly bool) (fl fileLock, err error) { + var ( + flag int + perm os.FileMode + ) + if readOnly { + flag = os.O_RDONLY + } else { + flag = os.O_RDWR + perm = os.ModeExclusive + } + f, err := os.OpenFile(path, flag, perm) + if os.IsNotExist(err) { + f, err = os.OpenFile(path, flag|os.O_CREATE, perm|0644) + } + if err != nil { + return + } + fl = &plan9FileLock{f: f} + return +} + +func rename(oldpath, newpath string) error { + if _, err := os.Stat(newpath); err == nil { + if err := os.Remove(newpath); err != nil { + return err + } + } + + return os.Rename(oldpath, newpath) +} + +func syncDir(name string) error { + f, err := os.Open(name) + if err != nil { + return err + } + defer f.Close() + if err := f.Sync(); err != nil { + return err + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go new file mode 100644 index 000000000..79901ee4a --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_solaris.go @@ -0,0 +1,81 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +// +build solaris + +package storage + +import ( + "os" + "syscall" +) + +type unixFileLock struct { + f *os.File +} + +func (fl *unixFileLock) release() error { + if err := setFileLock(fl.f, false, false); err != nil { + return err + } + return fl.f.Close() +} + +func newFileLock(path string, readOnly bool) (fl fileLock, err error) { + var flag int + if readOnly { + flag = os.O_RDONLY + } else { + flag = os.O_RDWR + } + f, err := os.OpenFile(path, flag, 0) + if os.IsNotExist(err) { + f, err = os.OpenFile(path, flag|os.O_CREATE, 0644) + } + if err != nil { + return + } + err = setFileLock(f, readOnly, true) + if err != nil { + f.Close() + return + } + fl = &unixFileLock{f: f} + return +} + +func setFileLock(f *os.File, readOnly, lock bool) error { + flock := syscall.Flock_t{ + Type: syscall.F_UNLCK, + Start: 0, + Len: 0, + Whence: 1, + } + if lock { + if readOnly { + flock.Type = syscall.F_RDLCK + } else { + flock.Type = syscall.F_WRLCK + } + } + return syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &flock) +} + +func rename(oldpath, newpath string) error { + return os.Rename(oldpath, newpath) +} + +func syncDir(name string) error { + f, err := os.Open(name) + if err != nil { + return err + } + defer f.Close() + if err := f.Sync(); err != nil { + return err + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go new file mode 100644 index 000000000..d75f66a9e --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_unix.go @@ -0,0 +1,98 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +// +build darwin dragonfly freebsd linux netbsd openbsd + +package storage + +import ( + "os" + "syscall" +) + +type unixFileLock struct { + f *os.File +} + +func (fl *unixFileLock) release() error { + if err := setFileLock(fl.f, false, false); err != nil { + return err + } + return fl.f.Close() +} + +func newFileLock(path string, readOnly bool) (fl fileLock, err error) { + var flag int + if readOnly { + flag = os.O_RDONLY + } else { + flag = os.O_RDWR + } + f, err := os.OpenFile(path, flag, 0) + if os.IsNotExist(err) { + f, err = os.OpenFile(path, flag|os.O_CREATE, 0644) + } + if err != nil { + return + } + err = setFileLock(f, readOnly, true) + if err != nil { + f.Close() + return + } + fl = &unixFileLock{f: f} + return +} + +func setFileLock(f *os.File, readOnly, lock bool) error { + how := syscall.LOCK_UN + if lock { + if readOnly { + how = syscall.LOCK_SH + } else { + how = syscall.LOCK_EX + } + } + return syscall.Flock(int(f.Fd()), how|syscall.LOCK_NB) +} + +func rename(oldpath, newpath string) error { + return os.Rename(oldpath, newpath) +} + +func isErrInvalid(err error) bool { + if err == os.ErrInvalid { + return true + } + // Go < 1.8 + if syserr, ok := err.(*os.SyscallError); ok && syserr.Err == syscall.EINVAL { + return true + } + // Go >= 1.8 returns *os.PathError instead + if patherr, ok := err.(*os.PathError); ok && patherr.Err == syscall.EINVAL { + return true + } + return false +} + +func syncDir(name string) error { + // As per fsync manpage, Linux seems to expect fsync on directory, however + // some system don't support this, so we will ignore syscall.EINVAL. + // + // From fsync(2): + // Calling fsync() does not necessarily ensure that the entry in the + // directory containing the file has also reached disk. For that an + // explicit fsync() on a file descriptor for the directory is also needed. 
+ f, err := os.Open(name) + if err != nil { + return err + } + defer f.Close() + if err := f.Sync(); err != nil && !isErrInvalid(err) { + return err + } + return nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_windows.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_windows.go new file mode 100644 index 000000000..899335fd7 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/file_storage_windows.go @@ -0,0 +1,78 @@ +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package storage + +import ( + "syscall" + "unsafe" +) + +var ( + modkernel32 = syscall.NewLazyDLL("kernel32.dll") + + procMoveFileExW = modkernel32.NewProc("MoveFileExW") +) + +const ( + _MOVEFILE_REPLACE_EXISTING = 1 +) + +type windowsFileLock struct { + fd syscall.Handle +} + +func (fl *windowsFileLock) release() error { + return syscall.Close(fl.fd) +} + +func newFileLock(path string, readOnly bool) (fl fileLock, err error) { + pathp, err := syscall.UTF16PtrFromString(path) + if err != nil { + return + } + var access, shareMode uint32 + if readOnly { + access = syscall.GENERIC_READ + shareMode = syscall.FILE_SHARE_READ + } else { + access = syscall.GENERIC_READ | syscall.GENERIC_WRITE + } + fd, err := syscall.CreateFile(pathp, access, shareMode, nil, syscall.OPEN_EXISTING, syscall.FILE_ATTRIBUTE_NORMAL, 0) + if err == syscall.ERROR_FILE_NOT_FOUND { + fd, err = syscall.CreateFile(pathp, access, shareMode, nil, syscall.OPEN_ALWAYS, syscall.FILE_ATTRIBUTE_NORMAL, 0) + } + if err != nil { + return + } + fl = &windowsFileLock{fd: fd} + return +} + +func moveFileEx(from *uint16, to *uint16, flags uint32) error { + r1, _, e1 := syscall.Syscall(procMoveFileExW.Addr(), 3, uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(to)), uintptr(flags)) + if r1 == 0 { + if e1 != 0 { + return error(e1) + } + return 
syscall.EINVAL + } + return nil +} + +func rename(oldpath, newpath string) error { + from, err := syscall.UTF16PtrFromString(oldpath) + if err != nil { + return err + } + to, err := syscall.UTF16PtrFromString(newpath) + if err != nil { + return err + } + return moveFileEx(from, to, _MOVEFILE_REPLACE_EXISTING) +} + +func syncDir(name string) error { return nil } diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/mem_storage.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/mem_storage.go new file mode 100644 index 000000000..838f1bee1 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/mem_storage.go @@ -0,0 +1,222 @@ +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package storage + +import ( + "bytes" + "os" + "sync" +) + +const typeShift = 4 + +// Verify at compile-time that typeShift is large enough to cover all FileType +// values by confirming that 0 == 0. +var _ [0]struct{} = [TypeAll >> typeShift]struct{}{} + +type memStorageLock struct { + ms *memStorage +} + +func (lock *memStorageLock) Unlock() { + ms := lock.ms + ms.mu.Lock() + defer ms.mu.Unlock() + if ms.slock == lock { + ms.slock = nil + } + return +} + +// memStorage is a memory-backed storage. +type memStorage struct { + mu sync.Mutex + slock *memStorageLock + files map[uint64]*memFile + meta FileDesc +} + +// NewMemStorage returns a new memory-backed storage implementation. 
+func NewMemStorage() Storage { + return &memStorage{ + files: make(map[uint64]*memFile), + } +} + +func (ms *memStorage) Lock() (Locker, error) { + ms.mu.Lock() + defer ms.mu.Unlock() + if ms.slock != nil { + return nil, ErrLocked + } + ms.slock = &memStorageLock{ms: ms} + return ms.slock, nil +} + +func (*memStorage) Log(str string) {} + +func (ms *memStorage) SetMeta(fd FileDesc) error { + if !FileDescOk(fd) { + return ErrInvalidFile + } + + ms.mu.Lock() + ms.meta = fd + ms.mu.Unlock() + return nil +} + +func (ms *memStorage) GetMeta() (FileDesc, error) { + ms.mu.Lock() + defer ms.mu.Unlock() + if ms.meta.Zero() { + return FileDesc{}, os.ErrNotExist + } + return ms.meta, nil +} + +func (ms *memStorage) List(ft FileType) ([]FileDesc, error) { + ms.mu.Lock() + var fds []FileDesc + for x := range ms.files { + fd := unpackFile(x) + if fd.Type&ft != 0 { + fds = append(fds, fd) + } + } + ms.mu.Unlock() + return fds, nil +} + +func (ms *memStorage) Open(fd FileDesc) (Reader, error) { + if !FileDescOk(fd) { + return nil, ErrInvalidFile + } + + ms.mu.Lock() + defer ms.mu.Unlock() + if m, exist := ms.files[packFile(fd)]; exist { + if m.open { + return nil, errFileOpen + } + m.open = true + return &memReader{Reader: bytes.NewReader(m.Bytes()), ms: ms, m: m}, nil + } + return nil, os.ErrNotExist +} + +func (ms *memStorage) Create(fd FileDesc) (Writer, error) { + if !FileDescOk(fd) { + return nil, ErrInvalidFile + } + + x := packFile(fd) + ms.mu.Lock() + defer ms.mu.Unlock() + m, exist := ms.files[x] + if exist { + if m.open { + return nil, errFileOpen + } + m.Reset() + } else { + m = &memFile{} + ms.files[x] = m + } + m.open = true + return &memWriter{memFile: m, ms: ms}, nil +} + +func (ms *memStorage) Remove(fd FileDesc) error { + if !FileDescOk(fd) { + return ErrInvalidFile + } + + x := packFile(fd) + ms.mu.Lock() + defer ms.mu.Unlock() + if _, exist := ms.files[x]; exist { + delete(ms.files, x) + return nil + } + return os.ErrNotExist +} + +func (ms *memStorage) 
Rename(oldfd, newfd FileDesc) error { + if !FileDescOk(oldfd) || !FileDescOk(newfd) { + return ErrInvalidFile + } + if oldfd == newfd { + return nil + } + + oldx := packFile(oldfd) + newx := packFile(newfd) + ms.mu.Lock() + defer ms.mu.Unlock() + oldm, exist := ms.files[oldx] + if !exist { + return os.ErrNotExist + } + newm, exist := ms.files[newx] + if (exist && newm.open) || oldm.open { + return errFileOpen + } + delete(ms.files, oldx) + ms.files[newx] = oldm + return nil +} + +func (*memStorage) Close() error { return nil } + +type memFile struct { + bytes.Buffer + open bool +} + +type memReader struct { + *bytes.Reader + ms *memStorage + m *memFile + closed bool +} + +func (mr *memReader) Close() error { + mr.ms.mu.Lock() + defer mr.ms.mu.Unlock() + if mr.closed { + return ErrClosed + } + mr.m.open = false + return nil +} + +type memWriter struct { + *memFile + ms *memStorage + closed bool +} + +func (*memWriter) Sync() error { return nil } + +func (mw *memWriter) Close() error { + mw.ms.mu.Lock() + defer mw.ms.mu.Unlock() + if mw.closed { + return ErrClosed + } + mw.memFile.open = false + return nil +} + +func packFile(fd FileDesc) uint64 { + return uint64(fd.Num)<> typeShift)} +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/storage/storage.go b/vendor/github.com/syndtr/goleveldb/leveldb/storage/storage.go new file mode 100644 index 000000000..4e4a72425 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/storage/storage.go @@ -0,0 +1,187 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package storage provides storage abstraction for LevelDB. +package storage + +import ( + "errors" + "fmt" + "io" +) + +// FileType represent a file type. +type FileType int + +// File types. 
+const ( + TypeManifest FileType = 1 << iota + TypeJournal + TypeTable + TypeTemp + + TypeAll = TypeManifest | TypeJournal | TypeTable | TypeTemp +) + +func (t FileType) String() string { + switch t { + case TypeManifest: + return "manifest" + case TypeJournal: + return "journal" + case TypeTable: + return "table" + case TypeTemp: + return "temp" + } + return fmt.Sprintf("", t) +} + +// Common error. +var ( + ErrInvalidFile = errors.New("leveldb/storage: invalid file for argument") + ErrLocked = errors.New("leveldb/storage: already locked") + ErrClosed = errors.New("leveldb/storage: closed") +) + +// ErrCorrupted is the type that wraps errors that indicate corruption of +// a file. Package storage has its own type instead of using +// errors.ErrCorrupted to prevent circular import. +type ErrCorrupted struct { + Fd FileDesc + Err error +} + +func isCorrupted(err error) bool { + switch err.(type) { + case *ErrCorrupted: + return true + } + return false +} + +func (e *ErrCorrupted) Error() string { + if !e.Fd.Zero() { + return fmt.Sprintf("%v [file=%v]", e.Err, e.Fd) + } + return e.Err.Error() +} + +// Syncer is the interface that wraps basic Sync method. +type Syncer interface { + // Sync commits the current contents of the file to stable storage. + Sync() error +} + +// Reader is the interface that groups the basic Read, Seek, ReadAt and Close +// methods. +type Reader interface { + io.ReadSeeker + io.ReaderAt + io.Closer +} + +// Writer is the interface that groups the basic Write, Sync and Close +// methods. +type Writer interface { + io.WriteCloser + Syncer +} + +// Locker is the interface that wraps Unlock method. +type Locker interface { + Unlock() +} + +// FileDesc is a 'file descriptor'. 
+type FileDesc struct { + Type FileType + Num int64 +} + +func (fd FileDesc) String() string { + switch fd.Type { + case TypeManifest: + return fmt.Sprintf("MANIFEST-%06d", fd.Num) + case TypeJournal: + return fmt.Sprintf("%06d.log", fd.Num) + case TypeTable: + return fmt.Sprintf("%06d.ldb", fd.Num) + case TypeTemp: + return fmt.Sprintf("%06d.tmp", fd.Num) + default: + return fmt.Sprintf("%#x-%d", fd.Type, fd.Num) + } +} + +// Zero returns true if fd == (FileDesc{}). +func (fd FileDesc) Zero() bool { + return fd == (FileDesc{}) +} + +// FileDescOk returns true if fd is a valid 'file descriptor'. +func FileDescOk(fd FileDesc) bool { + switch fd.Type { + case TypeManifest: + case TypeJournal: + case TypeTable: + case TypeTemp: + default: + return false + } + return fd.Num >= 0 +} + +// Storage is the storage. A storage instance must be safe for concurrent use. +type Storage interface { + // Lock locks the storage. Any subsequent attempt to call Lock will fail + // until the last lock released. + // Caller should call Unlock method after use. + Lock() (Locker, error) + + // Log logs a string. This is used for logging. + // An implementation may write to a file, stdout or simply do nothing. + Log(str string) + + // SetMeta store 'file descriptor' that can later be acquired using GetMeta + // method. The 'file descriptor' should point to a valid file. + // SetMeta should be implemented in such way that changes should happen + // atomically. + SetMeta(fd FileDesc) error + + // GetMeta returns 'file descriptor' stored in meta. The 'file descriptor' + // can be updated using SetMeta method. + // Returns os.ErrNotExist if meta doesn't store any 'file descriptor', or + // 'file descriptor' point to nonexistent file. + GetMeta() (FileDesc, error) + + // List returns file descriptors that match the given file types. + // The file types may be OR'ed together. + List(ft FileType) ([]FileDesc, error) + + // Open opens file with the given 'file descriptor' read-only. 
+ // Returns os.ErrNotExist error if the file does not exist. + // Returns ErrClosed if the underlying storage is closed. + Open(fd FileDesc) (Reader, error) + + // Create creates file with the given 'file descriptor', truncate if already + // exist and opens write-only. + // Returns ErrClosed if the underlying storage is closed. + Create(fd FileDesc) (Writer, error) + + // Remove removes file with the given 'file descriptor'. + // Returns ErrClosed if the underlying storage is closed. + Remove(fd FileDesc) error + + // Rename renames file from oldfd to newfd. + // Returns ErrClosed if the underlying storage is closed. + Rename(oldfd, newfd FileDesc) error + + // Close closes the storage. + // It is valid to call Close multiple times. Other methods should not be + // called after the storage has been closed. + Close() error +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/table.go b/vendor/github.com/syndtr/goleveldb/leveldb/table.go new file mode 100644 index 000000000..1fac60d05 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/table.go @@ -0,0 +1,531 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "fmt" + "sort" + "sync/atomic" + + "github.com/syndtr/goleveldb/leveldb/cache" + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/table" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// tFile holds basic information about a table. +type tFile struct { + fd storage.FileDesc + seekLeft int32 + size int64 + imin, imax internalKey +} + +// Returns true if given key is after largest key of this table. 
+func (t *tFile) after(icmp *iComparer, ukey []byte) bool { + return ukey != nil && icmp.uCompare(ukey, t.imax.ukey()) > 0 +} + +// Returns true if given key is before smallest key of this table. +func (t *tFile) before(icmp *iComparer, ukey []byte) bool { + return ukey != nil && icmp.uCompare(ukey, t.imin.ukey()) < 0 +} + +// Returns true if given key range overlaps with this table key range. +func (t *tFile) overlaps(icmp *iComparer, umin, umax []byte) bool { + return !t.after(icmp, umin) && !t.before(icmp, umax) +} + +// Cosumes one seek and return current seeks left. +func (t *tFile) consumeSeek() int32 { + return atomic.AddInt32(&t.seekLeft, -1) +} + +// Creates new tFile. +func newTableFile(fd storage.FileDesc, size int64, imin, imax internalKey) *tFile { + f := &tFile{ + fd: fd, + size: size, + imin: imin, + imax: imax, + } + + // We arrange to automatically compact this file after + // a certain number of seeks. Let's assume: + // (1) One seek costs 10ms + // (2) Writing or reading 1MB costs 10ms (100MB/s) + // (3) A compaction of 1MB does 25MB of IO: + // 1MB read from this level + // 10-12MB read from next level (boundaries may be misaligned) + // 10-12MB written to next level + // This implies that 25 seeks cost the same as the compaction + // of 1MB of data. I.e., one seek costs approximately the + // same as the compaction of 40KB of data. We are a little + // conservative and allow approximately one seek for every 16KB + // of data before triggering a compaction. + f.seekLeft = int32(size / 16384) + if f.seekLeft < 100 { + f.seekLeft = 100 + } + + return f +} + +func tableFileFromRecord(r atRecord) *tFile { + return newTableFile(storage.FileDesc{Type: storage.TypeTable, Num: r.num}, r.size, r.imin, r.imax) +} + +// tFiles hold multiple tFile. 
+type tFiles []*tFile + +func (tf tFiles) Len() int { return len(tf) } +func (tf tFiles) Swap(i, j int) { tf[i], tf[j] = tf[j], tf[i] } + +func (tf tFiles) nums() string { + x := "[ " + for i, f := range tf { + if i != 0 { + x += ", " + } + x += fmt.Sprint(f.fd.Num) + } + x += " ]" + return x +} + +// Returns true if i smallest key is less than j. +// This used for sort by key in ascending order. +func (tf tFiles) lessByKey(icmp *iComparer, i, j int) bool { + a, b := tf[i], tf[j] + n := icmp.Compare(a.imin, b.imin) + if n == 0 { + return a.fd.Num < b.fd.Num + } + return n < 0 +} + +// Returns true if i file number is greater than j. +// This used for sort by file number in descending order. +func (tf tFiles) lessByNum(i, j int) bool { + return tf[i].fd.Num > tf[j].fd.Num +} + +// Sorts tables by key in ascending order. +func (tf tFiles) sortByKey(icmp *iComparer) { + sort.Sort(&tFilesSortByKey{tFiles: tf, icmp: icmp}) +} + +// Sorts tables by file number in descending order. +func (tf tFiles) sortByNum() { + sort.Sort(&tFilesSortByNum{tFiles: tf}) +} + +// Returns sum of all tables size. +func (tf tFiles) size() (sum int64) { + for _, t := range tf { + sum += t.size + } + return sum +} + +// Searches smallest index of tables whose its smallest +// key is after or equal with given key. +func (tf tFiles) searchMin(icmp *iComparer, ikey internalKey) int { + return sort.Search(len(tf), func(i int) bool { + return icmp.Compare(tf[i].imin, ikey) >= 0 + }) +} + +// Searches smallest index of tables whose its largest +// key is after or equal with given key. +func (tf tFiles) searchMax(icmp *iComparer, ikey internalKey) int { + return sort.Search(len(tf), func(i int) bool { + return icmp.Compare(tf[i].imax, ikey) >= 0 + }) +} + +// Returns true if given key range overlaps with one or more +// tables key range. If unsorted is true then binary search will not be used. 
+func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) bool { + if unsorted { + // Check against all files. + for _, t := range tf { + if t.overlaps(icmp, umin, umax) { + return true + } + } + return false + } + + i := 0 + if len(umin) > 0 { + // Find the earliest possible internal key for min. + i = tf.searchMax(icmp, makeInternalKey(nil, umin, keyMaxSeq, keyTypeSeek)) + } + if i >= len(tf) { + // Beginning of range is after all files, so no overlap. + return false + } + return !tf[i].before(icmp, umax) +} + +// Returns tables whose its key range overlaps with given key range. +// Range will be expanded if ukey found hop across tables. +// If overlapped is true then the search will be restarted if umax +// expanded. +// The dst content will be overwritten. +func (tf tFiles) getOverlaps(dst tFiles, icmp *iComparer, umin, umax []byte, overlapped bool) tFiles { + dst = dst[:0] + for i := 0; i < len(tf); { + t := tf[i] + if t.overlaps(icmp, umin, umax) { + if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 { + umin = t.imin.ukey() + dst = dst[:0] + i = 0 + continue + } else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 { + umax = t.imax.ukey() + // Restart search if it is overlapped. + if overlapped { + dst = dst[:0] + i = 0 + continue + } + } + + dst = append(dst, t) + } + i++ + } + + return dst +} + +// Returns tables key range. +func (tf tFiles) getRange(icmp *iComparer) (imin, imax internalKey) { + for i, t := range tf { + if i == 0 { + imin, imax = t.imin, t.imax + continue + } + if icmp.Compare(t.imin, imin) < 0 { + imin = t.imin + } + if icmp.Compare(t.imax, imax) > 0 { + imax = t.imax + } + } + + return +} + +// Creates iterator index from tables. 
+func (tf tFiles) newIndexIterator(tops *tOps, icmp *iComparer, slice *util.Range, ro *opt.ReadOptions) iterator.IteratorIndexer { + if slice != nil { + var start, limit int + if slice.Start != nil { + start = tf.searchMax(icmp, internalKey(slice.Start)) + } + if slice.Limit != nil { + limit = tf.searchMin(icmp, internalKey(slice.Limit)) + } else { + limit = tf.Len() + } + tf = tf[start:limit] + } + return iterator.NewArrayIndexer(&tFilesArrayIndexer{ + tFiles: tf, + tops: tops, + icmp: icmp, + slice: slice, + ro: ro, + }) +} + +// Tables iterator index. +type tFilesArrayIndexer struct { + tFiles + tops *tOps + icmp *iComparer + slice *util.Range + ro *opt.ReadOptions +} + +func (a *tFilesArrayIndexer) Search(key []byte) int { + return a.searchMax(a.icmp, internalKey(key)) +} + +func (a *tFilesArrayIndexer) Get(i int) iterator.Iterator { + if i == 0 || i == a.Len()-1 { + return a.tops.newIterator(a.tFiles[i], a.slice, a.ro) + } + return a.tops.newIterator(a.tFiles[i], nil, a.ro) +} + +// Helper type for sortByKey. +type tFilesSortByKey struct { + tFiles + icmp *iComparer +} + +func (x *tFilesSortByKey) Less(i, j int) bool { + return x.lessByKey(x.icmp, i, j) +} + +// Helper type for sortByNum. +type tFilesSortByNum struct { + tFiles +} + +func (x *tFilesSortByNum) Less(i, j int) bool { + return x.lessByNum(i, j) +} + +// Table operations. +type tOps struct { + s *session + noSync bool + evictRemoved bool + cache *cache.Cache + bcache *cache.Cache + bpool *util.BufferPool +} + +// Creates an empty table and returns table writer. +func (t *tOps) create() (*tWriter, error) { + fd := storage.FileDesc{Type: storage.TypeTable, Num: t.s.allocFileNum()} + fw, err := t.s.stor.Create(fd) + if err != nil { + return nil, err + } + return &tWriter{ + t: t, + fd: fd, + w: fw, + tw: table.NewWriter(fw, t.s.o.Options), + }, nil +} + +// Builds table from src iterator. 
+func (t *tOps) createFrom(src iterator.Iterator) (f *tFile, n int, err error) { + w, err := t.create() + if err != nil { + return + } + + defer func() { + if err != nil { + w.drop() + } + }() + + for src.Next() { + err = w.append(src.Key(), src.Value()) + if err != nil { + return + } + } + err = src.Error() + if err != nil { + return + } + + n = w.tw.EntriesLen() + f, err = w.finish() + return +} + +// Opens table. It returns a cache handle, which should +// be released after use. +func (t *tOps) open(f *tFile) (ch *cache.Handle, err error) { + ch = t.cache.Get(0, uint64(f.fd.Num), func() (size int, value cache.Value) { + var r storage.Reader + r, err = t.s.stor.Open(f.fd) + if err != nil { + return 0, nil + } + + var bcache *cache.NamespaceGetter + if t.bcache != nil { + bcache = &cache.NamespaceGetter{Cache: t.bcache, NS: uint64(f.fd.Num)} + } + + var tr *table.Reader + tr, err = table.NewReader(r, f.size, f.fd, bcache, t.bpool, t.s.o.Options) + if err != nil { + r.Close() + return 0, nil + } + return 1, tr + + }) + if ch == nil && err == nil { + err = ErrClosed + } + return +} + +// Finds key/value pair whose key is greater than or equal to the +// given key. +func (t *tOps) find(f *tFile, key []byte, ro *opt.ReadOptions) (rkey, rvalue []byte, err error) { + ch, err := t.open(f) + if err != nil { + return nil, nil, err + } + defer ch.Release() + return ch.Value().(*table.Reader).Find(key, true, ro) +} + +// Finds key that is greater than or equal to the given key. +func (t *tOps) findKey(f *tFile, key []byte, ro *opt.ReadOptions) (rkey []byte, err error) { + ch, err := t.open(f) + if err != nil { + return nil, err + } + defer ch.Release() + return ch.Value().(*table.Reader).FindKey(key, true, ro) +} + +// Returns approximate offset of the given key. 
+func (t *tOps) offsetOf(f *tFile, key []byte) (offset int64, err error) {
+	ch, err := t.open(f)
+	if err != nil {
+		return
+	}
+	defer ch.Release()
+	return ch.Value().(*table.Reader).OffsetOf(key)
+}
+
+// Creates an iterator from the given table.
+func (t *tOps) newIterator(f *tFile, slice *util.Range, ro *opt.ReadOptions) iterator.Iterator {
+	ch, err := t.open(f)
+	if err != nil {
+		return iterator.NewEmptyIterator(err)
+	}
+	iter := ch.Value().(*table.Reader).NewIterator(slice, ro)
+	iter.SetReleaser(ch)
+	return iter
+}
+
+// Removes table from persistent storage. It waits until
+// no one uses the table.
+func (t *tOps) remove(f *tFile) {
+	t.cache.Delete(0, uint64(f.fd.Num), func() {
+		if err := t.s.stor.Remove(f.fd); err != nil {
+			t.s.logf("table@remove removing @%d %q", f.fd.Num, err)
+		} else {
+			t.s.logf("table@remove removed @%d", f.fd.Num)
+		}
+		if t.evictRemoved && t.bcache != nil {
+			t.bcache.EvictNS(uint64(f.fd.Num))
+		}
+	})
+}
+
+// Closes the table ops instance. It will close all tables,
+// regardless of whether they are still used or not.
+func (t *tOps) close() {
+	t.bpool.Close()
+	t.cache.Close()
+	if t.bcache != nil {
+		t.bcache.CloseWeak()
+	}
+}
+
+// Creates new initialized table ops instance.
+func newTableOps(s *session) *tOps {
+	var (
+		cacher cache.Cacher
+		bcache *cache.Cache
+		bpool  *util.BufferPool
+	)
+	if s.o.GetOpenFilesCacheCapacity() > 0 {
+		cacher = cache.NewLRU(s.o.GetOpenFilesCacheCapacity())
+	}
+	if !s.o.GetDisableBlockCache() {
+		var bcacher cache.Cacher
+		if s.o.GetBlockCacheCapacity() > 0 {
+			bcacher = s.o.GetBlockCacher().New(s.o.GetBlockCacheCapacity())
+		}
+		bcache = cache.NewCache(bcacher)
+	}
+	if !s.o.GetDisableBufferPool() {
+		bpool = util.NewBufferPool(s.o.GetBlockSize() + 5)
+	}
+	return &tOps{
+		s:            s,
+		noSync:       s.o.GetNoSync(),
+		evictRemoved: s.o.GetBlockCacheEvictRemoved(),
+		cache:        cache.NewCache(cacher),
+		bcache:       bcache,
+		bpool:        bpool,
+	}
+}
+
+// tWriter wraps the table writer.
It keep track of file descriptor +// and added key range. +type tWriter struct { + t *tOps + + fd storage.FileDesc + w storage.Writer + tw *table.Writer + + first, last []byte +} + +// Append key/value pair to the table. +func (w *tWriter) append(key, value []byte) error { + if w.first == nil { + w.first = append([]byte{}, key...) + } + w.last = append(w.last[:0], key...) + return w.tw.Append(key, value) +} + +// Returns true if the table is empty. +func (w *tWriter) empty() bool { + return w.first == nil +} + +// Closes the storage.Writer. +func (w *tWriter) close() { + if w.w != nil { + w.w.Close() + w.w = nil + } +} + +// Finalizes the table and returns table file. +func (w *tWriter) finish() (f *tFile, err error) { + defer w.close() + err = w.tw.Close() + if err != nil { + return + } + if !w.t.noSync { + err = w.w.Sync() + if err != nil { + return + } + } + f = newTableFile(w.fd, int64(w.tw.BytesLen()), internalKey(w.first), internalKey(w.last)) + return +} + +// Drops the table. +func (w *tWriter) drop() { + w.close() + w.t.s.stor.Remove(w.fd) + w.t.s.reuseFileNum(w.fd.Num) + w.tw = nil + w.first = nil + w.last = nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/table/reader.go b/vendor/github.com/syndtr/goleveldb/leveldb/table/reader.go new file mode 100644 index 000000000..496feb6fb --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/table/reader.go @@ -0,0 +1,1139 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package table + +import ( + "encoding/binary" + "fmt" + "io" + "sort" + "strings" + "sync" + + "github.com/golang/snappy" + + "github.com/syndtr/goleveldb/leveldb/cache" + "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" + "github.com/syndtr/goleveldb/leveldb/filter" + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" +) + +// Reader errors. +var ( + ErrNotFound = errors.ErrNotFound + ErrReaderReleased = errors.New("leveldb/table: reader released") + ErrIterReleased = errors.New("leveldb/table: iterator released") +) + +// ErrCorrupted describes error due to corruption. This error will be wrapped +// with errors.ErrCorrupted. +type ErrCorrupted struct { + Pos int64 + Size int64 + Kind string + Reason string +} + +func (e *ErrCorrupted) Error() string { + return fmt.Sprintf("leveldb/table: corruption on %s (pos=%d): %s", e.Kind, e.Pos, e.Reason) +} + +func max(x, y int) int { + if x > y { + return x + } + return y +} + +type block struct { + bpool *util.BufferPool + bh blockHandle + data []byte + restartsLen int + restartsOffset int +} + +func (b *block) seek(cmp comparer.Comparer, rstart, rlimit int, key []byte) (index, offset int, err error) { + index = sort.Search(b.restartsLen-rstart-(b.restartsLen-rlimit), func(i int) bool { + offset := int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*(rstart+i):])) + offset++ // shared always zero, since this is a restart point + v1, n1 := binary.Uvarint(b.data[offset:]) // key length + _, n2 := binary.Uvarint(b.data[offset+n1:]) // value length + m := offset + n1 + n2 + return cmp.Compare(b.data[m:m+int(v1)], key) > 0 + }) + rstart - 1 + if index < rstart { + // The smallest key is greater-than key sought. 
+ index = rstart + } + offset = int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*index:])) + return +} + +func (b *block) restartIndex(rstart, rlimit, offset int) int { + return sort.Search(b.restartsLen-rstart-(b.restartsLen-rlimit), func(i int) bool { + return int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*(rstart+i):])) > offset + }) + rstart - 1 +} + +func (b *block) restartOffset(index int) int { + return int(binary.LittleEndian.Uint32(b.data[b.restartsOffset+4*index:])) +} + +func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) { + if offset >= b.restartsOffset { + if offset != b.restartsOffset { + err = &ErrCorrupted{Reason: "entries offset not aligned"} + } + return + } + v0, n0 := binary.Uvarint(b.data[offset:]) // Shared prefix length + v1, n1 := binary.Uvarint(b.data[offset+n0:]) // Key length + v2, n2 := binary.Uvarint(b.data[offset+n0+n1:]) // Value length + m := n0 + n1 + n2 + n = m + int(v1) + int(v2) + if n0 <= 0 || n1 <= 0 || n2 <= 0 || offset+n > b.restartsOffset { + err = &ErrCorrupted{Reason: "entries corrupted"} + return + } + key = b.data[offset+m : offset+m+int(v1)] + value = b.data[offset+m+int(v1) : offset+n] + nShared = int(v0) + return +} + +func (b *block) Release() { + b.bpool.Put(b.data) + b.bpool = nil + b.data = nil +} + +type dir int + +const ( + dirReleased dir = iota - 1 + dirSOI + dirEOI + dirBackward + dirForward +) + +type blockIter struct { + tr *Reader + block *block + blockReleaser util.Releaser + releaser util.Releaser + key, value []byte + offset int + // Previous offset, only filled by Next. + prevOffset int + prevNode []int + prevKeys []byte + restartIndex int + // Iterator direction. + dir dir + // Restart index slice range. + riStart int + riLimit int + // Offset slice range. + offsetStart int + offsetRealStart int + offsetLimit int + // Error. 
+ err error +} + +func (i *blockIter) sErr(err error) { + i.err = err + i.key = nil + i.value = nil + i.prevNode = nil + i.prevKeys = nil +} + +func (i *blockIter) reset() { + if i.dir == dirBackward { + i.prevNode = i.prevNode[:0] + i.prevKeys = i.prevKeys[:0] + } + i.restartIndex = i.riStart + i.offset = i.offsetStart + i.dir = dirSOI + i.key = i.key[:0] + i.value = nil +} + +func (i *blockIter) isFirst() bool { + switch i.dir { + case dirForward: + return i.prevOffset == i.offsetRealStart + case dirBackward: + return len(i.prevNode) == 1 && i.restartIndex == i.riStart + } + return false +} + +func (i *blockIter) isLast() bool { + switch i.dir { + case dirForward, dirBackward: + return i.offset == i.offsetLimit + } + return false +} + +func (i *blockIter) First() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if i.dir == dirBackward { + i.prevNode = i.prevNode[:0] + i.prevKeys = i.prevKeys[:0] + } + i.dir = dirSOI + return i.Next() +} + +func (i *blockIter) Last() bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + if i.dir == dirBackward { + i.prevNode = i.prevNode[:0] + i.prevKeys = i.prevKeys[:0] + } + i.dir = dirEOI + return i.Prev() +} + +func (i *blockIter) Seek(key []byte) bool { + if i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + ri, offset, err := i.block.seek(i.tr.cmp, i.riStart, i.riLimit, key) + if err != nil { + i.sErr(err) + return false + } + i.restartIndex = ri + i.offset = max(i.offsetStart, offset) + if i.dir == dirSOI || i.dir == dirEOI { + i.dir = dirForward + } + for i.Next() { + if i.tr.cmp.Compare(i.key, key) >= 0 { + return true + } + } + return false +} + +func (i *blockIter) Next() bool { + if i.dir == dirEOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + 
+ if i.dir == dirSOI { + i.restartIndex = i.riStart + i.offset = i.offsetStart + } else if i.dir == dirBackward { + i.prevNode = i.prevNode[:0] + i.prevKeys = i.prevKeys[:0] + } + for i.offset < i.offsetRealStart { + key, value, nShared, n, err := i.block.entry(i.offset) + if err != nil { + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) + return false + } + if n == 0 { + i.dir = dirEOI + return false + } + i.key = append(i.key[:nShared], key...) + i.value = value + i.offset += n + } + if i.offset >= i.offsetLimit { + i.dir = dirEOI + if i.offset != i.offsetLimit { + i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned")) + } + return false + } + key, value, nShared, n, err := i.block.entry(i.offset) + if err != nil { + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) + return false + } + if n == 0 { + i.dir = dirEOI + return false + } + i.key = append(i.key[:nShared], key...) + i.value = value + i.prevOffset = i.offset + i.offset += n + i.dir = dirForward + return true +} + +func (i *blockIter) Prev() bool { + if i.dir == dirSOI || i.err != nil { + return false + } else if i.dir == dirReleased { + i.err = ErrIterReleased + return false + } + + var ri int + if i.dir == dirForward { + // Change direction. + i.offset = i.prevOffset + if i.offset == i.offsetRealStart { + i.dir = dirSOI + return false + } + ri = i.block.restartIndex(i.restartIndex, i.riLimit, i.offset) + i.dir = dirBackward + } else if i.dir == dirEOI { + // At the end of iterator. + i.restartIndex = i.riLimit + i.offset = i.offsetLimit + if i.offset == i.offsetRealStart { + i.dir = dirSOI + return false + } + ri = i.riLimit - 1 + i.dir = dirBackward + } else if len(i.prevNode) == 1 { + // This is the end of a restart range. + i.offset = i.prevNode[0] + i.prevNode = i.prevNode[:0] + if i.restartIndex == i.riStart { + i.dir = dirSOI + return false + } + i.restartIndex-- + ri = i.restartIndex + } else { + // In the middle of restart range, get from cache. 
+ n := len(i.prevNode) - 3 + node := i.prevNode[n:] + i.prevNode = i.prevNode[:n] + // Get the key. + ko := node[0] + i.key = append(i.key[:0], i.prevKeys[ko:]...) + i.prevKeys = i.prevKeys[:ko] + // Get the value. + vo := node[1] + vl := vo + node[2] + i.value = i.block.data[vo:vl] + i.offset = vl + return true + } + // Build entries cache. + i.key = i.key[:0] + i.value = nil + offset := i.block.restartOffset(ri) + if offset == i.offset { + ri-- + if ri < 0 { + i.dir = dirSOI + return false + } + offset = i.block.restartOffset(ri) + } + i.prevNode = append(i.prevNode, offset) + for { + key, value, nShared, n, err := i.block.entry(offset) + if err != nil { + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) + return false + } + if offset >= i.offsetRealStart { + if i.value != nil { + // Appends 3 variables: + // 1. Previous keys offset + // 2. Value offset in the data block + // 3. Value length + i.prevNode = append(i.prevNode, len(i.prevKeys), offset-len(i.value), len(i.value)) + i.prevKeys = append(i.prevKeys, i.key...) + } + i.value = value + } + i.key = append(i.key[:nShared], key...) + offset += n + // Stop if target offset reached. 
+ if offset >= i.offset { + if offset != i.offset { + i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned")) + return false + } + + break + } + } + i.restartIndex = ri + i.offset = offset + return true +} + +func (i *blockIter) Key() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.key +} + +func (i *blockIter) Value() []byte { + if i.err != nil || i.dir <= dirEOI { + return nil + } + return i.value +} + +func (i *blockIter) Release() { + if i.dir != dirReleased { + i.tr = nil + i.block = nil + i.prevNode = nil + i.prevKeys = nil + i.key = nil + i.value = nil + i.dir = dirReleased + if i.blockReleaser != nil { + i.blockReleaser.Release() + i.blockReleaser = nil + } + if i.releaser != nil { + i.releaser.Release() + i.releaser = nil + } + } +} + +func (i *blockIter) SetReleaser(releaser util.Releaser) { + if i.dir == dirReleased { + panic(util.ErrReleased) + } + if i.releaser != nil && releaser != nil { + panic(util.ErrHasReleaser) + } + i.releaser = releaser +} + +func (i *blockIter) Valid() bool { + return i.err == nil && (i.dir == dirBackward || i.dir == dirForward) +} + +func (i *blockIter) Error() error { + return i.err +} + +type filterBlock struct { + bpool *util.BufferPool + data []byte + oOffset int + baseLg uint + filtersNum int +} + +func (b *filterBlock) contains(filter filter.Filter, offset uint64, key []byte) bool { + i := int(offset >> b.baseLg) + if i < b.filtersNum { + o := b.data[b.oOffset+i*4:] + n := int(binary.LittleEndian.Uint32(o)) + m := int(binary.LittleEndian.Uint32(o[4:])) + if n < m && m <= b.oOffset { + return filter.Contains(b.data[n:m], key) + } else if n == m { + return false + } + } + return true +} + +func (b *filterBlock) Release() { + b.bpool.Put(b.data) + b.bpool = nil + b.data = nil +} + +type indexIter struct { + *blockIter + tr *Reader + slice *util.Range + // Options + fillCache bool +} + +func (i *indexIter) Get() iterator.Iterator { + value := i.Value() + if value == nil { + 
return nil + } + dataBH, n := decodeBlockHandle(value) + if n == 0 { + return iterator.NewEmptyIterator(i.tr.newErrCorruptedBH(i.tr.indexBH, "bad data block handle")) + } + + var slice *util.Range + if i.slice != nil && (i.blockIter.isFirst() || i.blockIter.isLast()) { + slice = i.slice + } + return i.tr.getDataIterErr(dataBH, slice, i.tr.verifyChecksum, i.fillCache) +} + +// Reader is a table reader. +type Reader struct { + mu sync.RWMutex + fd storage.FileDesc + reader io.ReaderAt + cache *cache.NamespaceGetter + err error + bpool *util.BufferPool + // Options + o *opt.Options + cmp comparer.Comparer + filter filter.Filter + verifyChecksum bool + + dataEnd int64 + metaBH, indexBH, filterBH blockHandle + indexBlock *block + filterBlock *filterBlock +} + +func (r *Reader) blockKind(bh blockHandle) string { + switch bh.offset { + case r.metaBH.offset: + return "meta-block" + case r.indexBH.offset: + return "index-block" + case r.filterBH.offset: + if r.filterBH.length > 0 { + return "filter-block" + } + } + return "data-block" +} + +func (r *Reader) newErrCorrupted(pos, size int64, kind, reason string) error { + return &errors.ErrCorrupted{Fd: r.fd, Err: &ErrCorrupted{Pos: pos, Size: size, Kind: kind, Reason: reason}} +} + +func (r *Reader) newErrCorruptedBH(bh blockHandle, reason string) error { + return r.newErrCorrupted(int64(bh.offset), int64(bh.length), r.blockKind(bh), reason) +} + +func (r *Reader) fixErrCorruptedBH(bh blockHandle, err error) error { + if cerr, ok := err.(*ErrCorrupted); ok { + cerr.Pos = int64(bh.offset) + cerr.Size = int64(bh.length) + cerr.Kind = r.blockKind(bh) + return &errors.ErrCorrupted{Fd: r.fd, Err: cerr} + } + return err +} + +func (r *Reader) readRawBlock(bh blockHandle, verifyChecksum bool) ([]byte, error) { + data := r.bpool.Get(int(bh.length + blockTrailerLen)) + if _, err := r.reader.ReadAt(data, int64(bh.offset)); err != nil && err != io.EOF { + return nil, err + } + + if verifyChecksum { + n := bh.length + 1 + checksum0 := 
binary.LittleEndian.Uint32(data[n:]) + checksum1 := util.NewCRC(data[:n]).Value() + if checksum0 != checksum1 { + r.bpool.Put(data) + return nil, r.newErrCorruptedBH(bh, fmt.Sprintf("checksum mismatch, want=%#x got=%#x", checksum0, checksum1)) + } + } + + switch data[bh.length] { + case blockTypeNoCompression: + data = data[:bh.length] + case blockTypeSnappyCompression: + decLen, err := snappy.DecodedLen(data[:bh.length]) + if err != nil { + r.bpool.Put(data) + return nil, r.newErrCorruptedBH(bh, err.Error()) + } + decData := r.bpool.Get(decLen) + decData, err = snappy.Decode(decData, data[:bh.length]) + r.bpool.Put(data) + if err != nil { + r.bpool.Put(decData) + return nil, r.newErrCorruptedBH(bh, err.Error()) + } + data = decData + default: + r.bpool.Put(data) + return nil, r.newErrCorruptedBH(bh, fmt.Sprintf("unknown compression type %#x", data[bh.length])) + } + return data, nil +} + +func (r *Reader) readBlock(bh blockHandle, verifyChecksum bool) (*block, error) { + data, err := r.readRawBlock(bh, verifyChecksum) + if err != nil { + return nil, err + } + restartsLen := int(binary.LittleEndian.Uint32(data[len(data)-4:])) + b := &block{ + bpool: r.bpool, + bh: bh, + data: data, + restartsLen: restartsLen, + restartsOffset: len(data) - (restartsLen+1)*4, + } + return b, nil +} + +func (r *Reader) readBlockCached(bh blockHandle, verifyChecksum, fillCache bool) (*block, util.Releaser, error) { + if r.cache != nil { + var ( + err error + ch *cache.Handle + ) + if fillCache { + ch = r.cache.Get(bh.offset, func() (size int, value cache.Value) { + var b *block + b, err = r.readBlock(bh, verifyChecksum) + if err != nil { + return 0, nil + } + return cap(b.data), b + }) + } else { + ch = r.cache.Get(bh.offset, nil) + } + if ch != nil { + b, ok := ch.Value().(*block) + if !ok { + ch.Release() + return nil, nil, errors.New("leveldb/table: inconsistent block type") + } + return b, ch, err + } else if err != nil { + return nil, nil, err + } + } + + b, err := r.readBlock(bh, 
verifyChecksum) + return b, b, err +} + +func (r *Reader) readFilterBlock(bh blockHandle) (*filterBlock, error) { + data, err := r.readRawBlock(bh, true) + if err != nil { + return nil, err + } + n := len(data) + if n < 5 { + return nil, r.newErrCorruptedBH(bh, "too short") + } + m := n - 5 + oOffset := int(binary.LittleEndian.Uint32(data[m:])) + if oOffset > m { + return nil, r.newErrCorruptedBH(bh, "invalid data-offsets offset") + } + b := &filterBlock{ + bpool: r.bpool, + data: data, + oOffset: oOffset, + baseLg: uint(data[n-1]), + filtersNum: (m - oOffset) / 4, + } + return b, nil +} + +func (r *Reader) readFilterBlockCached(bh blockHandle, fillCache bool) (*filterBlock, util.Releaser, error) { + if r.cache != nil { + var ( + err error + ch *cache.Handle + ) + if fillCache { + ch = r.cache.Get(bh.offset, func() (size int, value cache.Value) { + var b *filterBlock + b, err = r.readFilterBlock(bh) + if err != nil { + return 0, nil + } + return cap(b.data), b + }) + } else { + ch = r.cache.Get(bh.offset, nil) + } + if ch != nil { + b, ok := ch.Value().(*filterBlock) + if !ok { + ch.Release() + return nil, nil, errors.New("leveldb/table: inconsistent block type") + } + return b, ch, err + } else if err != nil { + return nil, nil, err + } + } + + b, err := r.readFilterBlock(bh) + return b, b, err +} + +func (r *Reader) getIndexBlock(fillCache bool) (b *block, rel util.Releaser, err error) { + if r.indexBlock == nil { + return r.readBlockCached(r.indexBH, true, fillCache) + } + return r.indexBlock, util.NoopReleaser{}, nil +} + +func (r *Reader) getFilterBlock(fillCache bool) (*filterBlock, util.Releaser, error) { + if r.filterBlock == nil { + return r.readFilterBlockCached(r.filterBH, fillCache) + } + return r.filterBlock, util.NoopReleaser{}, nil +} + +func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Range, inclLimit bool) *blockIter { + bi := &blockIter{ + tr: r, + block: b, + blockReleaser: bReleaser, + // Valid key should never be 
nil.
+		key:             make([]byte, 0),
+		dir:             dirSOI,
+		riStart:         0,
+		riLimit:         b.restartsLen,
+		offsetStart:     0,
+		offsetRealStart: 0,
+		offsetLimit:     b.restartsOffset,
+	}
+	if slice != nil {
+		if slice.Start != nil {
+			if bi.Seek(slice.Start) {
+				bi.riStart = b.restartIndex(bi.restartIndex, b.restartsLen, bi.prevOffset)
+				bi.offsetStart = b.restartOffset(bi.riStart)
+				bi.offsetRealStart = bi.prevOffset
+			} else {
+				bi.riStart = b.restartsLen
+				bi.offsetStart = b.restartsOffset
+				bi.offsetRealStart = b.restartsOffset
+			}
+		}
+		if slice.Limit != nil {
+			if bi.Seek(slice.Limit) && (!inclLimit || bi.Next()) {
+				bi.offsetLimit = bi.prevOffset
+				bi.riLimit = bi.restartIndex + 1
+			}
+		}
+		bi.reset()
+		if bi.offsetStart > bi.offsetLimit {
+			bi.sErr(errors.New("leveldb/table: invalid slice range"))
+		}
+	}
+	return bi
+}
+
+func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator {
+	b, rel, err := r.readBlockCached(dataBH, verifyChecksum, fillCache)
+	if err != nil {
+		return iterator.NewEmptyIterator(err)
+	}
+	return r.newBlockIter(b, rel, slice, false)
+}
+
+func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+
+	if r.err != nil {
+		return iterator.NewEmptyIterator(r.err)
+	}
+
+	return r.getDataIter(dataBH, slice, verifyChecksum, fillCache)
+}
+
+// NewIterator creates an iterator from the table.
+//
+// Slice allows slicing the iterator to only contain keys in the given
+// range. A nil Range.Start is treated as a key before all keys in the
+// table. And a nil Range.Limit is treated as a key after all keys in
+// the table.
+//
+// WARNING: Any slice returned by iterator (e.g. slice returned by calling
+// Iterator.Key() or Iterator.Value() methods), its content should not be modified
+// unless noted otherwise.
+//
+// The returned iterator is not safe for concurrent use and should be released
+// after use.
+// +// Also read Iterator documentation of the leveldb/iterator package. +func (r *Reader) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.Iterator { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + return iterator.NewEmptyIterator(r.err) + } + + fillCache := !ro.GetDontFillCache() + indexBlock, rel, err := r.getIndexBlock(fillCache) + if err != nil { + return iterator.NewEmptyIterator(err) + } + index := &indexIter{ + blockIter: r.newBlockIter(indexBlock, rel, slice, true), + tr: r, + slice: slice, + fillCache: !ro.GetDontFillCache(), + } + return iterator.NewIndexedIterator(index, opt.GetStrict(r.o, ro, opt.StrictReader)) +} + +func (r *Reader) find(key []byte, filtered bool, ro *opt.ReadOptions, noValue bool) (rkey, value []byte, err error) { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + err = r.err + return + } + + indexBlock, rel, err := r.getIndexBlock(true) + if err != nil { + return + } + defer rel.Release() + + index := r.newBlockIter(indexBlock, nil, nil, true) + defer index.Release() + + if !index.Seek(key) { + if err = index.Error(); err == nil { + err = ErrNotFound + } + return + } + + dataBH, n := decodeBlockHandle(index.Value()) + if n == 0 { + r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") + return nil, nil, r.err + } + + // The filter should only used for exact match. + if filtered && r.filter != nil { + filterBlock, frel, ferr := r.getFilterBlock(true) + if ferr == nil { + if !filterBlock.contains(r.filter, dataBH.offset, key) { + frel.Release() + return nil, nil, ErrNotFound + } + frel.Release() + } else if !errors.IsCorrupted(ferr) { + return nil, nil, ferr + } + } + + data := r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache()) + if !data.Seek(key) { + data.Release() + if err = data.Error(); err != nil { + return + } + + // The nearest greater-than key is the first key of the next block. 
+ if !index.Next() { + if err = index.Error(); err == nil { + err = ErrNotFound + } + return + } + + dataBH, n = decodeBlockHandle(index.Value()) + if n == 0 { + r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") + return nil, nil, r.err + } + + data = r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache()) + if !data.Next() { + data.Release() + if err = data.Error(); err == nil { + err = ErrNotFound + } + return + } + } + + // Key doesn't use block buffer, no need to copy the buffer. + rkey = data.Key() + if !noValue { + if r.bpool == nil { + value = data.Value() + } else { + // Value does use block buffer, and since the buffer will be + // recycled, it need to be copied. + value = append([]byte{}, data.Value()...) + } + } + data.Release() + return +} + +// Find finds key/value pair whose key is greater than or equal to the +// given key. It returns ErrNotFound if the table doesn't contain +// such pair. +// If filtered is true then the nearest 'block' will be checked against +// 'filter data' (if present) and will immediately return ErrNotFound if +// 'filter data' indicates that such pair doesn't exist. +// +// The caller may modify the contents of the returned slice as it is its +// own copy. +// It is safe to modify the contents of the argument after Find returns. +func (r *Reader) Find(key []byte, filtered bool, ro *opt.ReadOptions) (rkey, value []byte, err error) { + return r.find(key, filtered, ro, false) +} + +// FindKey finds key that is greater than or equal to the given key. +// It returns ErrNotFound if the table doesn't contain such key. +// If filtered is true then the nearest 'block' will be checked against +// 'filter data' (if present) and will immediately return ErrNotFound if +// 'filter data' indicates that such key doesn't exist. +// +// The caller may modify the contents of the returned slice as it is its +// own copy. +// It is safe to modify the contents of the argument after Find returns. 
+func (r *Reader) FindKey(key []byte, filtered bool, ro *opt.ReadOptions) (rkey []byte, err error) { + rkey, _, err = r.find(key, filtered, ro, true) + return +} + +// Get gets the value for the given key. It returns errors.ErrNotFound +// if the table does not contain the key. +// +// The caller may modify the contents of the returned slice as it is its +// own copy. +// It is safe to modify the contents of the argument after Find returns. +func (r *Reader) Get(key []byte, ro *opt.ReadOptions) (value []byte, err error) { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + err = r.err + return + } + + rkey, value, err := r.find(key, false, ro, false) + if err == nil && r.cmp.Compare(rkey, key) != 0 { + value = nil + err = ErrNotFound + } + return +} + +// OffsetOf returns approximate offset for the given key. +// +// It is safe to modify the contents of the argument after Get returns. +func (r *Reader) OffsetOf(key []byte) (offset int64, err error) { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + err = r.err + return + } + + indexBlock, rel, err := r.readBlockCached(r.indexBH, true, true) + if err != nil { + return + } + defer rel.Release() + + index := r.newBlockIter(indexBlock, nil, nil, true) + defer index.Release() + if index.Seek(key) { + dataBH, n := decodeBlockHandle(index.Value()) + if n == 0 { + r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") + return + } + offset = int64(dataBH.offset) + return + } + err = index.Error() + if err == nil { + offset = r.dataEnd + } + return +} + +// Release implements util.Releaser. +// It also close the file if it is an io.Closer. 
+func (r *Reader) Release() { + r.mu.Lock() + defer r.mu.Unlock() + + if closer, ok := r.reader.(io.Closer); ok { + closer.Close() + } + if r.indexBlock != nil { + r.indexBlock.Release() + r.indexBlock = nil + } + if r.filterBlock != nil { + r.filterBlock.Release() + r.filterBlock = nil + } + r.reader = nil + r.cache = nil + r.bpool = nil + r.err = ErrReaderReleased +} + +// NewReader creates a new initialized table reader for the file. +// The fi, cache and bpool is optional and can be nil. +// +// The returned table reader instance is safe for concurrent use. +func NewReader(f io.ReaderAt, size int64, fd storage.FileDesc, cache *cache.NamespaceGetter, bpool *util.BufferPool, o *opt.Options) (*Reader, error) { + if f == nil { + return nil, errors.New("leveldb/table: nil file") + } + + r := &Reader{ + fd: fd, + reader: f, + cache: cache, + bpool: bpool, + o: o, + cmp: o.GetComparer(), + verifyChecksum: o.GetStrict(opt.StrictBlockChecksum), + } + + if size < footerLen { + r.err = r.newErrCorrupted(0, size, "table", "too small") + return r, nil + } + + footerPos := size - footerLen + var footer [footerLen]byte + if _, err := r.reader.ReadAt(footer[:], footerPos); err != nil && err != io.EOF { + return nil, err + } + if string(footer[footerLen-len(magic):footerLen]) != magic { + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad magic number") + return r, nil + } + + var n int + // Decode the metaindex block handle. + r.metaBH, n = decodeBlockHandle(footer[:]) + if n == 0 { + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad metaindex block handle") + return r, nil + } + + // Decode the index block handle. + r.indexBH, n = decodeBlockHandle(footer[n:]) + if n == 0 { + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad index block handle") + return r, nil + } + + // Read metaindex block. 
+ metaBlock, err := r.readBlock(r.metaBH, true) + if err != nil { + if errors.IsCorrupted(err) { + r.err = err + return r, nil + } + return nil, err + } + + // Set data end. + r.dataEnd = int64(r.metaBH.offset) + + // Read metaindex. + metaIter := r.newBlockIter(metaBlock, nil, nil, true) + for metaIter.Next() { + key := string(metaIter.Key()) + if !strings.HasPrefix(key, "filter.") { + continue + } + fn := key[7:] + if f0 := o.GetFilter(); f0 != nil && f0.Name() == fn { + r.filter = f0 + } else { + for _, f0 := range o.GetAltFilters() { + if f0.Name() == fn { + r.filter = f0 + break + } + } + } + if r.filter != nil { + filterBH, n := decodeBlockHandle(metaIter.Value()) + if n == 0 { + continue + } + r.filterBH = filterBH + // Update data end. + r.dataEnd = int64(filterBH.offset) + break + } + } + metaIter.Release() + metaBlock.Release() + + // Cache index and filter block locally, since we don't have global cache. + if cache == nil { + r.indexBlock, err = r.readBlock(r.indexBH, true) + if err != nil { + if errors.IsCorrupted(err) { + r.err = err + return r, nil + } + return nil, err + } + if r.filter != nil { + r.filterBlock, err = r.readFilterBlock(r.filterBH) + if err != nil { + if !errors.IsCorrupted(err) { + return nil, err + } + + // Don't use filter then. + r.filter = nil + } + } + } + + return r, nil +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/table/table.go b/vendor/github.com/syndtr/goleveldb/leveldb/table/table.go new file mode 100644 index 000000000..beacdc1f0 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/table/table.go @@ -0,0 +1,177 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package table allows read and write sorted key/value. 
+package table + +import ( + "encoding/binary" +) + +/* +Table: + +Table is consist of one or more data blocks, an optional filter block +a metaindex block, an index block and a table footer. Metaindex block +is a special block used to keep parameters of the table, such as filter +block name and its block handle. Index block is a special block used to +keep record of data blocks offset and length, index block use one as +restart interval. The key used by index block are the last key of preceding +block, shorter separator of adjacent blocks or shorter successor of the +last key of the last block. Filter block is an optional block contains +sequence of filter data generated by a filter generator. + +Table data structure: + + optional + / + +--------------+--------------+--------------+------+-------+-----------------+-------------+--------+ + | data block 1 | ... | data block n | filter block | metaindex block | index block | footer | + +--------------+--------------+--------------+--------------+-----------------+-------------+--------+ + + Each block followed by a 5-bytes trailer contains compression type and checksum. + +Table block trailer: + + +---------------------------+-------------------+ + | compression type (1-byte) | checksum (4-byte) | + +---------------------------+-------------------+ + + The checksum is a CRC-32 computed using Castagnoli's polynomial. Compression + type also included in the checksum. + +Table footer: + + +------------------- 40-bytes -------------------+ + / \ + +------------------------+--------------------+------+-----------------+ + | metaindex block handle / index block handle / ---- | magic (8-bytes) | + +------------------------+--------------------+------+-----------------+ + + The magic are first 64-bit of SHA-1 sum of "http://code.google.com/p/leveldb/". + +NOTE: All fixed-length integer are little-endian. +*/ + +/* +Block: + +Block is consist of one or more key/value entries and a block trailer. 
+Block entry shares key prefix with its preceding key until a restart +point reached. A block should contains at least one restart point. +First restart point are always zero. + +Block data structure: + + + restart point + restart point (depends on restart interval) + / / + +---------------+---------------+---------------+---------------+---------+ + | block entry 1 | block entry 2 | ... | block entry n | trailer | + +---------------+---------------+---------------+---------------+---------+ + +Key/value entry: + + +---- key len ----+ + / \ + +-------+---------+-----------+---------+--------------------+--------------+----------------+ + | shared (varint) | not shared (varint) | value len (varint) | key (varlen) | value (varlen) | + +-----------------+---------------------+--------------------+--------------+----------------+ + + Block entry shares key prefix with its preceding key: + Conditions: + restart_interval=2 + entry one : key=deck,value=v1 + entry two : key=dock,value=v2 + entry three: key=duck,value=v3 + The entries will be encoded as follow: + + + restart point (offset=0) + restart point (offset=16) + / / + +-----+-----+-----+----------+--------+-----+-----+-----+---------+--------+-----+-----+-----+----------+--------+ + | 0 | 4 | 2 | "deck" | "v1" | 1 | 3 | 2 | "ock" | "v2" | 0 | 4 | 2 | "duck" | "v3" | + +-----+-----+-----+----------+--------+-----+-----+-----+---------+--------+-----+-----+-----+----------+--------+ + \ / \ / \ / + +----------- entry one -----------+ +----------- entry two ----------+ +---------- entry three ----------+ + + The block trailer will contains two restart points: + + +------------+-----------+--------+ + | 0 | 16 | 2 | + +------------+-----------+---+----+ + \ / \ + +-- restart points --+ + restart points length + +Block trailer: + + +-- 4-bytes --+ + / \ + +-----------------+-----------------+-----------------+------------------------------+ + | restart point 1 | .... 
| restart point n | restart points len (4-bytes) | + +-----------------+-----------------+-----------------+------------------------------+ + + +NOTE: All fixed-length integer are little-endian. +*/ + +/* +Filter block: + +Filter block consist of one or more filter data and a filter block trailer. +The trailer contains filter data offsets, a trailer offset and a 1-byte base Lg. + +Filter block data structure: + + + offset 1 + offset 2 + offset n + trailer offset + / / / / + +---------------+---------------+---------------+---------+ + | filter data 1 | ... | filter data n | trailer | + +---------------+---------------+---------------+---------+ + +Filter block trailer: + + +- 4-bytes -+ + / \ + +---------------+---------------+---------------+-------------------------------+------------------+ + | data 1 offset | .... | data n offset | data-offsets offset (4-bytes) | base Lg (1-byte) | + +-------------- +---------------+---------------+-------------------------------+------------------+ + + +NOTE: All fixed-length integer are little-endian. +*/ + +const ( + blockTrailerLen = 5 + footerLen = 48 + + magic = "\x57\xfb\x80\x8b\x24\x75\x47\xdb" + + // The block type gives the per-block compression format. + // These constants are part of the file format and should not be changed. 
+ blockTypeNoCompression = 0 + blockTypeSnappyCompression = 1 + + // Generate new filter every 2KB of data + filterBaseLg = 11 + filterBase = 1 << filterBaseLg +) + +type blockHandle struct { + offset, length uint64 +} + +func decodeBlockHandle(src []byte) (blockHandle, int) { + offset, n := binary.Uvarint(src) + length, m := binary.Uvarint(src[n:]) + if n == 0 || m == 0 { + return blockHandle{}, 0 + } + return blockHandle{offset, length}, n + m +} + +func encodeBlockHandle(dst []byte, b blockHandle) int { + n := binary.PutUvarint(dst, b.offset) + m := binary.PutUvarint(dst[n:], b.length) + return n + m +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/table/writer.go b/vendor/github.com/syndtr/goleveldb/leveldb/table/writer.go new file mode 100644 index 000000000..b96b271d8 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/table/writer.go @@ -0,0 +1,375 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package table + +import ( + "encoding/binary" + "errors" + "fmt" + "io" + + "github.com/golang/snappy" + + "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/filter" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +func sharedPrefixLen(a, b []byte) int { + i, n := 0, len(a) + if n > len(b) { + n = len(b) + } + for i < n && a[i] == b[i] { + i++ + } + return i +} + +type blockWriter struct { + restartInterval int + buf util.Buffer + nEntries int + prevKey []byte + restarts []uint32 + scratch []byte +} + +func (w *blockWriter) append(key, value []byte) { + nShared := 0 + if w.nEntries%w.restartInterval == 0 { + w.restarts = append(w.restarts, uint32(w.buf.Len())) + } else { + nShared = sharedPrefixLen(w.prevKey, key) + } + n := binary.PutUvarint(w.scratch[0:], uint64(nShared)) + n += binary.PutUvarint(w.scratch[n:], uint64(len(key)-nShared)) + n += binary.PutUvarint(w.scratch[n:], uint64(len(value))) + w.buf.Write(w.scratch[:n]) + w.buf.Write(key[nShared:]) + w.buf.Write(value) + w.prevKey = append(w.prevKey[:0], key...) + w.nEntries++ +} + +func (w *blockWriter) finish() { + // Write restarts entry. + if w.nEntries == 0 { + // Must have at least one restart entry. 
+ w.restarts = append(w.restarts, 0) + } + w.restarts = append(w.restarts, uint32(len(w.restarts))) + for _, x := range w.restarts { + buf4 := w.buf.Alloc(4) + binary.LittleEndian.PutUint32(buf4, x) + } +} + +func (w *blockWriter) reset() { + w.buf.Reset() + w.nEntries = 0 + w.restarts = w.restarts[:0] +} + +func (w *blockWriter) bytesLen() int { + restartsLen := len(w.restarts) + if restartsLen == 0 { + restartsLen = 1 + } + return w.buf.Len() + 4*restartsLen + 4 +} + +type filterWriter struct { + generator filter.FilterGenerator + buf util.Buffer + nKeys int + offsets []uint32 +} + +func (w *filterWriter) add(key []byte) { + if w.generator == nil { + return + } + w.generator.Add(key) + w.nKeys++ +} + +func (w *filterWriter) flush(offset uint64) { + if w.generator == nil { + return + } + for x := int(offset / filterBase); x > len(w.offsets); { + w.generate() + } +} + +func (w *filterWriter) finish() { + if w.generator == nil { + return + } + // Generate last keys. + + if w.nKeys > 0 { + w.generate() + } + w.offsets = append(w.offsets, uint32(w.buf.Len())) + for _, x := range w.offsets { + buf4 := w.buf.Alloc(4) + binary.LittleEndian.PutUint32(buf4, x) + } + w.buf.WriteByte(filterBaseLg) +} + +func (w *filterWriter) generate() { + // Record offset. + w.offsets = append(w.offsets, uint32(w.buf.Len())) + // Generate filters. + if w.nKeys > 0 { + w.generator.Generate(&w.buf) + w.nKeys = 0 + } +} + +// Writer is a table writer. +type Writer struct { + writer io.Writer + err error + // Options + cmp comparer.Comparer + filter filter.Filter + compression opt.Compression + blockSize int + + dataBlock blockWriter + indexBlock blockWriter + filterBlock filterWriter + pendingBH blockHandle + offset uint64 + nEntries int + // Scratch allocated enough for 5 uvarint. Block writer should not use + // first 20-bytes since it will be used to encode block handle, which + // then passed to the block writer itself. 
+ scratch [50]byte + comparerScratch []byte + compressionScratch []byte +} + +func (w *Writer) writeBlock(buf *util.Buffer, compression opt.Compression) (bh blockHandle, err error) { + // Compress the buffer if necessary. + var b []byte + if compression == opt.SnappyCompression { + // Allocate scratch enough for compression and block trailer. + if n := snappy.MaxEncodedLen(buf.Len()) + blockTrailerLen; len(w.compressionScratch) < n { + w.compressionScratch = make([]byte, n) + } + compressed := snappy.Encode(w.compressionScratch, buf.Bytes()) + n := len(compressed) + b = compressed[:n+blockTrailerLen] + b[n] = blockTypeSnappyCompression + } else { + tmp := buf.Alloc(blockTrailerLen) + tmp[0] = blockTypeNoCompression + b = buf.Bytes() + } + + // Calculate the checksum. + n := len(b) - 4 + checksum := util.NewCRC(b[:n]).Value() + binary.LittleEndian.PutUint32(b[n:], checksum) + + // Write the buffer to the file. + _, err = w.writer.Write(b) + if err != nil { + return + } + bh = blockHandle{w.offset, uint64(len(b) - blockTrailerLen)} + w.offset += uint64(len(b)) + return +} + +func (w *Writer) flushPendingBH(key []byte) { + if w.pendingBH.length == 0 { + return + } + var separator []byte + if len(key) == 0 { + separator = w.cmp.Successor(w.comparerScratch[:0], w.dataBlock.prevKey) + } else { + separator = w.cmp.Separator(w.comparerScratch[:0], w.dataBlock.prevKey, key) + } + if separator == nil { + separator = w.dataBlock.prevKey + } else { + w.comparerScratch = separator + } + n := encodeBlockHandle(w.scratch[:20], w.pendingBH) + // Append the block handle to the index block. + w.indexBlock.append(separator, w.scratch[:n]) + // Reset prev key of the data block. + w.dataBlock.prevKey = w.dataBlock.prevKey[:0] + // Clear pending block handle. 
+ w.pendingBH = blockHandle{} +} + +func (w *Writer) finishBlock() error { + w.dataBlock.finish() + bh, err := w.writeBlock(&w.dataBlock.buf, w.compression) + if err != nil { + return err + } + w.pendingBH = bh + // Reset the data block. + w.dataBlock.reset() + // Flush the filter block. + w.filterBlock.flush(w.offset) + return nil +} + +// Append appends key/value pair to the table. The keys passed must +// be in increasing order. +// +// It is safe to modify the contents of the arguments after Append returns. +func (w *Writer) Append(key, value []byte) error { + if w.err != nil { + return w.err + } + if w.nEntries > 0 && w.cmp.Compare(w.dataBlock.prevKey, key) >= 0 { + w.err = fmt.Errorf("leveldb/table: Writer: keys are not in increasing order: %q, %q", w.dataBlock.prevKey, key) + return w.err + } + + w.flushPendingBH(key) + // Append key/value pair to the data block. + w.dataBlock.append(key, value) + // Add key to the filter block. + w.filterBlock.add(key) + + // Finish the data block if block size target reached. + if w.dataBlock.bytesLen() >= w.blockSize { + if err := w.finishBlock(); err != nil { + w.err = err + return w.err + } + } + w.nEntries++ + return nil +} + +// BlocksLen returns number of blocks written so far. +func (w *Writer) BlocksLen() int { + n := w.indexBlock.nEntries + if w.pendingBH.length > 0 { + // Includes the pending block. + n++ + } + return n +} + +// EntriesLen returns number of entries added so far. +func (w *Writer) EntriesLen() int { + return w.nEntries +} + +// BytesLen returns number of bytes written so far. +func (w *Writer) BytesLen() int { + return int(w.offset) +} + +// Close will finalize the table. Calling Append is not possible +// after Close, but calling BlocksLen, EntriesLen and BytesLen +// is still possible. +func (w *Writer) Close() error { + if w.err != nil { + return w.err + } + + // Write the last data block. Or empty data block if there + // aren't any data blocks at all. 
+ if w.dataBlock.nEntries > 0 || w.nEntries == 0 { + if err := w.finishBlock(); err != nil { + w.err = err + return w.err + } + } + w.flushPendingBH(nil) + + // Write the filter block. + var filterBH blockHandle + w.filterBlock.finish() + if buf := &w.filterBlock.buf; buf.Len() > 0 { + filterBH, w.err = w.writeBlock(buf, opt.NoCompression) + if w.err != nil { + return w.err + } + } + + // Write the metaindex block. + if filterBH.length > 0 { + key := []byte("filter." + w.filter.Name()) + n := encodeBlockHandle(w.scratch[:20], filterBH) + w.dataBlock.append(key, w.scratch[:n]) + } + w.dataBlock.finish() + metaindexBH, err := w.writeBlock(&w.dataBlock.buf, w.compression) + if err != nil { + w.err = err + return w.err + } + + // Write the index block. + w.indexBlock.finish() + indexBH, err := w.writeBlock(&w.indexBlock.buf, w.compression) + if err != nil { + w.err = err + return w.err + } + + // Write the table footer. + footer := w.scratch[:footerLen] + for i := range footer { + footer[i] = 0 + } + n := encodeBlockHandle(footer, metaindexBH) + encodeBlockHandle(footer[n:], indexBH) + copy(footer[footerLen-len(magic):], magic) + if _, err := w.writer.Write(footer); err != nil { + w.err = err + return w.err + } + w.offset += footerLen + + w.err = errors.New("leveldb/table: writer is closed") + return nil +} + +// NewWriter creates a new initialized table writer for the file. +// +// Table writer is not safe for concurrent use. +func NewWriter(f io.Writer, o *opt.Options) *Writer { + w := &Writer{ + writer: f, + cmp: o.GetComparer(), + filter: o.GetFilter(), + compression: o.GetCompression(), + blockSize: o.GetBlockSize(), + comparerScratch: make([]byte, 0), + } + // data block + w.dataBlock.restartInterval = o.GetBlockRestartInterval() + // The first 20-bytes are used for encoding block handle. 
+ w.dataBlock.scratch = w.scratch[20:] + // index block + w.indexBlock.restartInterval = 1 + w.indexBlock.scratch = w.scratch[20:] + // filter block + if w.filter != nil { + w.filterBlock.generator = w.filter.NewGenerator() + w.filterBlock.flush(0) + } + return w +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util.go b/vendor/github.com/syndtr/goleveldb/leveldb/util.go new file mode 100644 index 000000000..0e2b519e5 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util.go @@ -0,0 +1,98 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "fmt" + "sort" + + "github.com/syndtr/goleveldb/leveldb/storage" +) + +func shorten(str string) string { + if len(str) <= 8 { + return str + } + return str[:3] + ".." + str[len(str)-3:] +} + +var bunits = [...]string{"", "Ki", "Mi", "Gi", "Ti"} + +func shortenb(bytes int) string { + i := 0 + for ; bytes > 1024 && i < 4; i++ { + bytes /= 1024 + } + return fmt.Sprintf("%d%sB", bytes, bunits[i]) +} + +func sshortenb(bytes int) string { + if bytes == 0 { + return "~" + } + sign := "+" + if bytes < 0 { + sign = "-" + bytes *= -1 + } + i := 0 + for ; bytes > 1024 && i < 4; i++ { + bytes /= 1024 + } + return fmt.Sprintf("%s%d%sB", sign, bytes, bunits[i]) +} + +func sint(x int) string { + if x == 0 { + return "~" + } + sign := "+" + if x < 0 { + sign = "-" + x *= -1 + } + return fmt.Sprintf("%s%d", sign, x) +} + +func minInt(a, b int) int { + if a < b { + return a + } + return b +} + +func maxInt(a, b int) int { + if a > b { + return a + } + return b +} + +type fdSorter []storage.FileDesc + +func (p fdSorter) Len() int { + return len(p) +} + +func (p fdSorter) Less(i, j int) bool { + return p[i].Num < p[j].Num +} + +func (p fdSorter) Swap(i, j int) { + p[i], p[j] = p[j], p[i] +} + +func sortFds(fds []storage.FileDesc) { + sort.Sort(fdSorter(fds)) +} + 
+func ensureBuffer(b []byte, n int) []byte { + if cap(b) < n { + return make([]byte, n) + } + return b[:n] +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer.go new file mode 100644 index 000000000..21de24255 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer.go @@ -0,0 +1,293 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package util + +// This a copy of Go std bytes.Buffer with some modification +// and some features stripped. + +import ( + "bytes" + "io" +) + +// A Buffer is a variable-sized buffer of bytes with Read and Write methods. +// The zero value for Buffer is an empty buffer ready to use. +type Buffer struct { + buf []byte // contents are the bytes buf[off : len(buf)] + off int // read at &buf[off], write at &buf[len(buf)] + bootstrap [64]byte // memory to hold first slice; helps small buffers (Printf) avoid allocation. +} + +// Bytes returns a slice of the contents of the unread portion of the buffer; +// len(b.Bytes()) == b.Len(). If the caller changes the contents of the +// returned slice, the contents of the buffer will change provided there +// are no intervening method calls on the Buffer. +func (b *Buffer) Bytes() []byte { return b.buf[b.off:] } + +// String returns the contents of the unread portion of the buffer +// as a string. If the Buffer is a nil pointer, it returns "". +func (b *Buffer) String() string { + if b == nil { + // Special case, useful in debugging. + return "" + } + return string(b.buf[b.off:]) +} + +// Len returns the number of bytes of the unread portion of the buffer; +// b.Len() == len(b.Bytes()). +func (b *Buffer) Len() int { return len(b.buf) - b.off } + +// Truncate discards all but the first n unread bytes from the buffer. +// It panics if n is negative or greater than the length of the buffer. 
+func (b *Buffer) Truncate(n int) { + switch { + case n < 0 || n > b.Len(): + panic("leveldb/util.Buffer: truncation out of range") + case n == 0: + // Reuse buffer space. + b.off = 0 + } + b.buf = b.buf[0 : b.off+n] +} + +// Reset resets the buffer so it has no content. +// b.Reset() is the same as b.Truncate(0). +func (b *Buffer) Reset() { b.Truncate(0) } + +// grow grows the buffer to guarantee space for n more bytes. +// It returns the index where bytes should be written. +// If the buffer can't grow it will panic with bytes.ErrTooLarge. +func (b *Buffer) grow(n int) int { + m := b.Len() + // If buffer is empty, reset to recover space. + if m == 0 && b.off != 0 { + b.Truncate(0) + } + if len(b.buf)+n > cap(b.buf) { + var buf []byte + if b.buf == nil && n <= len(b.bootstrap) { + buf = b.bootstrap[0:] + } else if m+n <= cap(b.buf)/2 { + // We can slide things down instead of allocating a new + // slice. We only need m+n <= cap(b.buf) to slide, but + // we instead let capacity get twice as large so we + // don't spend all our time copying. + copy(b.buf[:], b.buf[b.off:]) + buf = b.buf[:m] + } else { + // not enough space anywhere + buf = makeSlice(2*cap(b.buf) + n) + copy(buf, b.buf[b.off:]) + } + b.buf = buf + b.off = 0 + } + b.buf = b.buf[0 : b.off+m+n] + return b.off + m +} + +// Alloc allocs n bytes of slice from the buffer, growing the buffer as +// needed. If n is negative, Alloc will panic. +// If the buffer can't grow it will panic with bytes.ErrTooLarge. +func (b *Buffer) Alloc(n int) []byte { + if n < 0 { + panic("leveldb/util.Buffer.Alloc: negative count") + } + m := b.grow(n) + return b.buf[m:] +} + +// Grow grows the buffer's capacity, if necessary, to guarantee space for +// another n bytes. After Grow(n), at least n bytes can be written to the +// buffer without another allocation. +// If n is negative, Grow will panic. +// If the buffer can't grow it will panic with bytes.ErrTooLarge. 
+func (b *Buffer) Grow(n int) { + if n < 0 { + panic("leveldb/util.Buffer.Grow: negative count") + } + m := b.grow(n) + b.buf = b.buf[0:m] +} + +// Write appends the contents of p to the buffer, growing the buffer as +// needed. The return value n is the length of p; err is always nil. If the +// buffer becomes too large, Write will panic with bytes.ErrTooLarge. +func (b *Buffer) Write(p []byte) (n int, err error) { + m := b.grow(len(p)) + return copy(b.buf[m:], p), nil +} + +// MinRead is the minimum slice size passed to a Read call by +// Buffer.ReadFrom. As long as the Buffer has at least MinRead bytes beyond +// what is required to hold the contents of r, ReadFrom will not grow the +// underlying buffer. +const MinRead = 512 + +// ReadFrom reads data from r until EOF and appends it to the buffer, growing +// the buffer as needed. The return value n is the number of bytes read. Any +// error except io.EOF encountered during the read is also returned. If the +// buffer becomes too large, ReadFrom will panic with bytes.ErrTooLarge. +func (b *Buffer) ReadFrom(r io.Reader) (n int64, err error) { + // If buffer is empty, reset to recover space. + if b.off >= len(b.buf) { + b.Truncate(0) + } + for { + if free := cap(b.buf) - len(b.buf); free < MinRead { + // not enough space at end + newBuf := b.buf + if b.off+free < MinRead { + // not enough space using beginning of buffer; + // double buffer capacity + newBuf = makeSlice(2*cap(b.buf) + MinRead) + } + copy(newBuf, b.buf[b.off:]) + b.buf = newBuf[:len(b.buf)-b.off] + b.off = 0 + } + m, e := r.Read(b.buf[len(b.buf):cap(b.buf)]) + b.buf = b.buf[0 : len(b.buf)+m] + n += int64(m) + if e == io.EOF { + break + } + if e != nil { + return n, e + } + } + return n, nil // err is EOF, so return nil explicitly +} + +// makeSlice allocates a slice of size n. If the allocation fails, it panics +// with bytes.ErrTooLarge. +func makeSlice(n int) []byte { + // If the make fails, give a known error. 
+ defer func() { + if recover() != nil { + panic(bytes.ErrTooLarge) + } + }() + return make([]byte, n) +} + +// WriteTo writes data to w until the buffer is drained or an error occurs. +// The return value n is the number of bytes written; it always fits into an +// int, but it is int64 to match the io.WriterTo interface. Any error +// encountered during the write is also returned. +func (b *Buffer) WriteTo(w io.Writer) (n int64, err error) { + if b.off < len(b.buf) { + nBytes := b.Len() + m, e := w.Write(b.buf[b.off:]) + if m > nBytes { + panic("leveldb/util.Buffer.WriteTo: invalid Write count") + } + b.off += m + n = int64(m) + if e != nil { + return n, e + } + // all bytes should have been written, by definition of + // Write method in io.Writer + if m != nBytes { + return n, io.ErrShortWrite + } + } + // Buffer is now empty; reset. + b.Truncate(0) + return +} + +// WriteByte appends the byte c to the buffer, growing the buffer as needed. +// The returned error is always nil, but is included to match bufio.Writer's +// WriteByte. If the buffer becomes too large, WriteByte will panic with +// bytes.ErrTooLarge. +func (b *Buffer) WriteByte(c byte) error { + m := b.grow(1) + b.buf[m] = c + return nil +} + +// Read reads the next len(p) bytes from the buffer or until the buffer +// is drained. The return value n is the number of bytes read. If the +// buffer has no data to return, err is io.EOF (unless len(p) is zero); +// otherwise it is nil. +func (b *Buffer) Read(p []byte) (n int, err error) { + if b.off >= len(b.buf) { + // Buffer is empty, reset to recover space. + b.Truncate(0) + if len(p) == 0 { + return + } + return 0, io.EOF + } + n = copy(p, b.buf[b.off:]) + b.off += n + return +} + +// Next returns a slice containing the next n bytes from the buffer, +// advancing the buffer as if the bytes had been returned by Read. +// If there are fewer than n bytes in the buffer, Next returns the entire buffer. 
+// The slice is only valid until the next call to a read or write method. +func (b *Buffer) Next(n int) []byte { + m := b.Len() + if n > m { + n = m + } + data := b.buf[b.off : b.off+n] + b.off += n + return data +} + +// ReadByte reads and returns the next byte from the buffer. +// If no byte is available, it returns error io.EOF. +func (b *Buffer) ReadByte() (c byte, err error) { + if b.off >= len(b.buf) { + // Buffer is empty, reset to recover space. + b.Truncate(0) + return 0, io.EOF + } + c = b.buf[b.off] + b.off++ + return c, nil +} + +// ReadBytes reads until the first occurrence of delim in the input, +// returning a slice containing the data up to and including the delimiter. +// If ReadBytes encounters an error before finding a delimiter, +// it returns the data read before the error and the error itself (often io.EOF). +// ReadBytes returns err != nil if and only if the returned data does not end in +// delim. +func (b *Buffer) ReadBytes(delim byte) (line []byte, err error) { + slice, err := b.readSlice(delim) + // return a copy of slice. The buffer's backing array may + // be overwritten by later calls. + line = append(line, slice...) + return +} + +// readSlice is like ReadBytes but returns a reference to internal buffer data. +func (b *Buffer) readSlice(delim byte) (line []byte, err error) { + i := bytes.IndexByte(b.buf[b.off:], delim) + end := b.off + i + 1 + if i < 0 { + end = len(b.buf) + err = io.EOF + } + line = b.buf[b.off:end] + b.off = end + return line, err +} + +// NewBuffer creates and initializes a new Buffer using buf as its initial +// contents. It is intended to prepare a Buffer to read existing data. It +// can also be used to size the internal buffer for writing. To do that, +// buf should have the desired capacity but a length of zero. +// +// In most cases, new(Buffer) (or just declaring a Buffer variable) is +// sufficient to initialize a Buffer. 
+func NewBuffer(buf []byte) *Buffer { return &Buffer{buf: buf} } diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go new file mode 100644 index 000000000..2f3db974a --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go @@ -0,0 +1,239 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package util + +import ( + "fmt" + "sync" + "sync/atomic" + "time" +) + +type buffer struct { + b []byte + miss int +} + +// BufferPool is a 'buffer pool'. +type BufferPool struct { + pool [6]chan []byte + size [5]uint32 + sizeMiss [5]uint32 + sizeHalf [5]uint32 + baseline [4]int + baseline0 int + + mu sync.RWMutex + closed bool + closeC chan struct{} + + get uint32 + put uint32 + half uint32 + less uint32 + equal uint32 + greater uint32 + miss uint32 +} + +func (p *BufferPool) poolNum(n int) int { + if n <= p.baseline0 && n > p.baseline0/2 { + return 0 + } + for i, x := range p.baseline { + if n <= x { + return i + 1 + } + } + return len(p.baseline) + 1 +} + +// Get returns buffer with length of n. +func (p *BufferPool) Get(n int) []byte { + if p == nil { + return make([]byte, n) + } + + p.mu.RLock() + defer p.mu.RUnlock() + + if p.closed { + return make([]byte, n) + } + + atomic.AddUint32(&p.get, 1) + + poolNum := p.poolNum(n) + pool := p.pool[poolNum] + if poolNum == 0 { + // Fast path. 
+ select { + case b := <-pool: + switch { + case cap(b) > n: + if cap(b)-n >= n { + atomic.AddUint32(&p.half, 1) + select { + case pool <- b: + default: + } + return make([]byte, n) + } else { + atomic.AddUint32(&p.less, 1) + return b[:n] + } + case cap(b) == n: + atomic.AddUint32(&p.equal, 1) + return b[:n] + default: + atomic.AddUint32(&p.greater, 1) + } + default: + atomic.AddUint32(&p.miss, 1) + } + + return make([]byte, n, p.baseline0) + } else { + sizePtr := &p.size[poolNum-1] + + select { + case b := <-pool: + switch { + case cap(b) > n: + if cap(b)-n >= n { + atomic.AddUint32(&p.half, 1) + sizeHalfPtr := &p.sizeHalf[poolNum-1] + if atomic.AddUint32(sizeHalfPtr, 1) == 20 { + atomic.StoreUint32(sizePtr, uint32(cap(b)/2)) + atomic.StoreUint32(sizeHalfPtr, 0) + } else { + select { + case pool <- b: + default: + } + } + return make([]byte, n) + } else { + atomic.AddUint32(&p.less, 1) + return b[:n] + } + case cap(b) == n: + atomic.AddUint32(&p.equal, 1) + return b[:n] + default: + atomic.AddUint32(&p.greater, 1) + if uint32(cap(b)) >= atomic.LoadUint32(sizePtr) { + select { + case pool <- b: + default: + } + } + } + default: + atomic.AddUint32(&p.miss, 1) + } + + if size := atomic.LoadUint32(sizePtr); uint32(n) > size { + if size == 0 { + atomic.CompareAndSwapUint32(sizePtr, 0, uint32(n)) + } else { + sizeMissPtr := &p.sizeMiss[poolNum-1] + if atomic.AddUint32(sizeMissPtr, 1) == 20 { + atomic.StoreUint32(sizePtr, uint32(n)) + atomic.StoreUint32(sizeMissPtr, 0) + } + } + return make([]byte, n) + } else { + return make([]byte, n, size) + } + } +} + +// Put adds given buffer to the pool. 
+func (p *BufferPool) Put(b []byte) { + if p == nil { + return + } + + p.mu.RLock() + defer p.mu.RUnlock() + + if p.closed { + return + } + + atomic.AddUint32(&p.put, 1) + + pool := p.pool[p.poolNum(cap(b))] + select { + case pool <- b: + default: + } + +} + +func (p *BufferPool) Close() { + if p == nil { + return + } + + p.mu.Lock() + if !p.closed { + p.closed = true + p.closeC <- struct{}{} + } + p.mu.Unlock() +} + +func (p *BufferPool) String() string { + if p == nil { + return "" + } + + return fmt.Sprintf("BufferPool{B·%d Z·%v Zm·%v Zh·%v G·%d P·%d H·%d <·%d =·%d >·%d M·%d}", + p.baseline0, p.size, p.sizeMiss, p.sizeHalf, p.get, p.put, p.half, p.less, p.equal, p.greater, p.miss) +} + +func (p *BufferPool) drain() { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + for { + select { + case <-ticker.C: + for _, ch := range p.pool { + select { + case <-ch: + default: + } + } + case <-p.closeC: + close(p.closeC) + for _, ch := range p.pool { + close(ch) + } + return + } + } +} + +// NewBufferPool creates a new initialized 'buffer pool'. +func NewBufferPool(baseline int) *BufferPool { + if baseline <= 0 { + panic("baseline can't be <= 0") + } + p := &BufferPool{ + baseline0: baseline, + baseline: [...]int{baseline / 4, baseline / 2, baseline * 2, baseline * 4}, + closeC: make(chan struct{}, 1), + } + for i, cap := range []int{2, 2, 4, 4, 2, 1} { + p.pool[i] = make(chan []byte, cap) + } + go p.drain() + return p +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/crc32.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/crc32.go new file mode 100644 index 000000000..631c9d610 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/crc32.go @@ -0,0 +1,30 @@ +// Copyright 2011 The LevelDB-Go Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package util + +import ( + "hash/crc32" +) + +var table = crc32.MakeTable(crc32.Castagnoli) + +// CRC is a CRC-32 checksum computed using Castagnoli's polynomial. +type CRC uint32 + +// NewCRC creates a new crc based on the given bytes. +func NewCRC(b []byte) CRC { + return CRC(0).Update(b) +} + +// Update updates the crc with the given bytes. +func (c CRC) Update(b []byte) CRC { + return CRC(crc32.Update(uint32(c), table, b)) +} + +// Value returns a masked crc. +func (c CRC) Value() uint32 { + return uint32(c>>15|c<<17) + 0xa282ead8 +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/hash.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/hash.go new file mode 100644 index 000000000..7f3fa4e2c --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/hash.go @@ -0,0 +1,48 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package util + +import ( + "encoding/binary" +) + +// Hash return hash of the given data. 
+func Hash(data []byte, seed uint32) uint32 { + // Similar to murmur hash + const ( + m = uint32(0xc6a4a793) + r = uint32(24) + ) + var ( + h = seed ^ (uint32(len(data)) * m) + i int + ) + + for n := len(data) - len(data)%4; i < n; i += 4 { + h += binary.LittleEndian.Uint32(data[i:]) + h *= m + h ^= (h >> 16) + } + + switch len(data) - i { + default: + panic("not reached") + case 3: + h += uint32(data[i+2]) << 16 + fallthrough + case 2: + h += uint32(data[i+1]) << 8 + fallthrough + case 1: + h += uint32(data[i]) + h *= m + h ^= (h >> r) + case 0: + } + + return h +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/range.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/range.go new file mode 100644 index 000000000..85159583d --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/range.go @@ -0,0 +1,32 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package util + +// Range is a key range. +type Range struct { + // Start of the key range, include in the range. + Start []byte + + // Limit of the key range, not include in the range. + Limit []byte +} + +// BytesPrefix returns key range that satisfy the given prefix. +// This only applicable for the standard 'bytes comparer'. +func BytesPrefix(prefix []byte) *Range { + var limit []byte + for i := len(prefix) - 1; i >= 0; i-- { + c := prefix[i] + if c < 0xff { + limit = make([]byte, i+1) + copy(limit, prefix) + limit[i] = c + 1 + break + } + } + return &Range{prefix, limit} +} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/util/util.go b/vendor/github.com/syndtr/goleveldb/leveldb/util/util.go new file mode 100644 index 000000000..80614afc5 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/util/util.go @@ -0,0 +1,73 @@ +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. 
+// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package util provides utilities used throughout leveldb. +package util + +import ( + "errors" +) + +var ( + ErrReleased = errors.New("leveldb: resource already relesed") + ErrHasReleaser = errors.New("leveldb: releaser already defined") +) + +// Releaser is the interface that wraps the basic Release method. +type Releaser interface { + // Release releases associated resources. Release should always success + // and can be called multiple times without causing error. + Release() +} + +// ReleaseSetter is the interface that wraps the basic SetReleaser method. +type ReleaseSetter interface { + // SetReleaser associates the given releaser to the resources. The + // releaser will be called once coresponding resources released. + // Calling SetReleaser with nil will clear the releaser. + // + // This will panic if a releaser already present or coresponding + // resource is already released. Releaser should be cleared first + // before assigned a new one. + SetReleaser(releaser Releaser) +} + +// BasicReleaser provides basic implementation of Releaser and ReleaseSetter. +type BasicReleaser struct { + releaser Releaser + released bool +} + +// Released returns whether Release method already called. +func (r *BasicReleaser) Released() bool { + return r.released +} + +// Release implements Releaser.Release. +func (r *BasicReleaser) Release() { + if !r.released { + if r.releaser != nil { + r.releaser.Release() + r.releaser = nil + } + r.released = true + } +} + +// SetReleaser implements ReleaseSetter.SetReleaser. 
+func (r *BasicReleaser) SetReleaser(releaser Releaser) { + if r.released { + panic(ErrReleased) + } + if r.releaser != nil && releaser != nil { + panic(ErrHasReleaser) + } + r.releaser = releaser +} + +type NoopReleaser struct{} + +func (NoopReleaser) Release() {} diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/version.go b/vendor/github.com/syndtr/goleveldb/leveldb/version.go new file mode 100644 index 000000000..73f272af5 --- /dev/null +++ b/vendor/github.com/syndtr/goleveldb/leveldb/version.go @@ -0,0 +1,528 @@ +// Copyright (c) 2012, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package leveldb + +import ( + "fmt" + "sync/atomic" + "unsafe" + + "github.com/syndtr/goleveldb/leveldb/iterator" + "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/util" +) + +type tSet struct { + level int + table *tFile +} + +type version struct { + s *session + + levels []tFiles + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by computeCompaction() + cLevel int + cScore float64 + + cSeek unsafe.Pointer + + closing bool + ref int + released bool +} + +func newVersion(s *session) *version { + return &version{s: s} +} + +func (v *version) incref() { + if v.released { + panic("already released") + } + + v.ref++ + if v.ref == 1 { + // Incr file ref. 
+ for _, tt := range v.levels { + for _, t := range tt { + v.s.addFileRef(t.fd, 1) + } + } + } +} + +func (v *version) releaseNB() { + v.ref-- + if v.ref > 0 { + return + } else if v.ref < 0 { + panic("negative version ref") + } + + for _, tt := range v.levels { + for _, t := range tt { + if v.s.addFileRef(t.fd, -1) == 0 { + v.s.tops.remove(t) + } + } + } + + v.released = true +} + +func (v *version) release() { + v.s.vmu.Lock() + v.releaseNB() + v.s.vmu.Unlock() +} + +func (v *version) walkOverlapping(aux tFiles, ikey internalKey, f func(level int, t *tFile) bool, lf func(level int) bool) { + ukey := ikey.ukey() + + // Aux level. + if aux != nil { + for _, t := range aux { + if t.overlaps(v.s.icmp, ukey, ukey) { + if !f(-1, t) { + return + } + } + } + + if lf != nil && !lf(-1) { + return + } + } + + // Walk tables level-by-level. + for level, tables := range v.levels { + if len(tables) == 0 { + continue + } + + if level == 0 { + // Level-0 files may overlap each other. Find all files that + // overlap ukey. + for _, t := range tables { + if t.overlaps(v.s.icmp, ukey, ukey) { + if !f(level, t) { + return + } + } + } + } else { + if i := tables.searchMax(v.s.icmp, ikey); i < len(tables) { + t := tables[i] + if v.s.icmp.uCompare(ukey, t.imin.ukey()) >= 0 { + if !f(level, t) { + return + } + } + } + } + + if lf != nil && !lf(level) { + return + } + } +} + +func (v *version) get(aux tFiles, ikey internalKey, ro *opt.ReadOptions, noValue bool) (value []byte, tcomp bool, err error) { + if v.closing { + return nil, false, ErrClosed + } + + ukey := ikey.ukey() + + var ( + tset *tSet + tseek bool + + // Level-0. + zfound bool + zseq uint64 + zkt keyType + zval []byte + ) + + err = ErrNotFound + + // Since entries never hop across level, finding key/value + // in smaller level make later levels irrelevant. 
+ v.walkOverlapping(aux, ikey, func(level int, t *tFile) bool { + if level >= 0 && !tseek { + if tset == nil { + tset = &tSet{level, t} + } else { + tseek = true + } + } + + var ( + fikey, fval []byte + ferr error + ) + if noValue { + fikey, ferr = v.s.tops.findKey(t, ikey, ro) + } else { + fikey, fval, ferr = v.s.tops.find(t, ikey, ro) + } + + switch ferr { + case nil: + case ErrNotFound: + return true + default: + err = ferr + return false + } + + if fukey, fseq, fkt, fkerr := parseInternalKey(fikey); fkerr == nil { + if v.s.icmp.uCompare(ukey, fukey) == 0 { + // Level <= 0 may overlaps each-other. + if level <= 0 { + if fseq >= zseq { + zfound = true + zseq = fseq + zkt = fkt + zval = fval + } + } else { + switch fkt { + case keyTypeVal: + value = fval + err = nil + case keyTypeDel: + default: + panic("leveldb: invalid internalKey type") + } + return false + } + } + } else { + err = fkerr + return false + } + + return true + }, func(level int) bool { + if zfound { + switch zkt { + case keyTypeVal: + value = zval + err = nil + case keyTypeDel: + default: + panic("leveldb: invalid internalKey type") + } + return false + } + + return true + }) + + if tseek && tset.table.consumeSeek() <= 0 { + tcomp = atomic.CompareAndSwapPointer(&v.cSeek, nil, unsafe.Pointer(tset)) + } + + return +} + +func (v *version) sampleSeek(ikey internalKey) (tcomp bool) { + var tset *tSet + + v.walkOverlapping(nil, ikey, func(level int, t *tFile) bool { + if tset == nil { + tset = &tSet{level, t} + return true + } + if tset.table.consumeSeek() <= 0 { + tcomp = atomic.CompareAndSwapPointer(&v.cSeek, nil, unsafe.Pointer(tset)) + } + return false + }, nil) + + return +} + +func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []iterator.Iterator) { + strict := opt.GetStrict(v.s.o.Options, ro, opt.StrictReader) + for level, tables := range v.levels { + if level == 0 { + // Merge all level zero files together since they may overlap. 
+ for _, t := range tables { + its = append(its, v.s.tops.newIterator(t, slice, ro)) + } + } else if len(tables) != 0 { + its = append(its, iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict)) + } + } + return +} + +func (v *version) newStaging() *versionStaging { + return &versionStaging{base: v} +} + +// Spawn a new version based on this version. +func (v *version) spawn(r *sessionRecord) *version { + staging := v.newStaging() + staging.commit(r) + return staging.finish() +} + +func (v *version) fillRecord(r *sessionRecord) { + for level, tables := range v.levels { + for _, t := range tables { + r.addTableFile(level, t) + } + } +} + +func (v *version) tLen(level int) int { + if level < len(v.levels) { + return len(v.levels[level]) + } + return 0 +} + +func (v *version) offsetOf(ikey internalKey) (n int64, err error) { + for level, tables := range v.levels { + for _, t := range tables { + if v.s.icmp.Compare(t.imax, ikey) <= 0 { + // Entire file is before "ikey", so just add the file size + n += t.size + } else if v.s.icmp.Compare(t.imin, ikey) > 0 { + // Entire file is after "ikey", so ignore + if level > 0 { + // Files other than level 0 are sorted by meta->min, so + // no further files in this level will contain data for + // "ikey". + break + } + } else { + // "ikey" falls in the range for this table. Add the + // approximate offset of "ikey" within the table. 
+ if m, err := v.s.tops.offsetOf(t, ikey); err == nil { + n += m + } else { + return 0, err + } + } + } + } + + return +} + +func (v *version) pickMemdbLevel(umin, umax []byte, maxLevel int) (level int) { + if maxLevel > 0 { + if len(v.levels) == 0 { + return maxLevel + } + if !v.levels[0].overlaps(v.s.icmp, umin, umax, true) { + var overlaps tFiles + for ; level < maxLevel; level++ { + if pLevel := level + 1; pLevel >= len(v.levels) { + return maxLevel + } else if v.levels[pLevel].overlaps(v.s.icmp, umin, umax, false) { + break + } + if gpLevel := level + 2; gpLevel < len(v.levels) { + overlaps = v.levels[gpLevel].getOverlaps(overlaps, v.s.icmp, umin, umax, false) + if overlaps.size() > int64(v.s.o.GetCompactionGPOverlaps(level)) { + break + } + } + } + } + } + return +} + +func (v *version) computeCompaction() { + // Precomputed best level for next compaction + bestLevel := int(-1) + bestScore := float64(-1) + + statFiles := make([]int, len(v.levels)) + statSizes := make([]string, len(v.levels)) + statScore := make([]string, len(v.levels)) + statTotSize := int64(0) + + for level, tables := range v.levels { + var score float64 + size := tables.size() + if level == 0 { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compaction. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). 
+ score = float64(len(tables)) / float64(v.s.o.GetCompactionL0Trigger()) + } else { + score = float64(size) / float64(v.s.o.GetCompactionTotalSize(level)) + } + + if score > bestScore { + bestLevel = level + bestScore = score + } + + statFiles[level] = len(tables) + statSizes[level] = shortenb(int(size)) + statScore[level] = fmt.Sprintf("%.2f", score) + statTotSize += size + } + + v.cLevel = bestLevel + v.cScore = bestScore + + v.s.logf("version@stat F·%v S·%s%v Sc·%v", statFiles, shortenb(int(statTotSize)), statSizes, statScore) +} + +func (v *version) needCompaction() bool { + return v.cScore >= 1 || atomic.LoadPointer(&v.cSeek) != nil +} + +type tablesScratch struct { + added map[int64]atRecord + deleted map[int64]struct{} +} + +type versionStaging struct { + base *version + levels []tablesScratch +} + +func (p *versionStaging) getScratch(level int) *tablesScratch { + if level >= len(p.levels) { + newLevels := make([]tablesScratch, level+1) + copy(newLevels, p.levels) + p.levels = newLevels + } + return &(p.levels[level]) +} + +func (p *versionStaging) commit(r *sessionRecord) { + // Deleted tables. + for _, r := range r.deletedTables { + scratch := p.getScratch(r.level) + if r.level < len(p.base.levels) && len(p.base.levels[r.level]) > 0 { + if scratch.deleted == nil { + scratch.deleted = make(map[int64]struct{}) + } + scratch.deleted[r.num] = struct{}{} + } + if scratch.added != nil { + delete(scratch.added, r.num) + } + } + + // New tables. + for _, r := range r.addedTables { + scratch := p.getScratch(r.level) + if scratch.added == nil { + scratch.added = make(map[int64]atRecord) + } + scratch.added[r.num] = r + if scratch.deleted != nil { + delete(scratch.deleted, r.num) + } + } +} + +func (p *versionStaging) finish() *version { + // Build new version. 
+ nv := newVersion(p.base.s) + numLevel := len(p.levels) + if len(p.base.levels) > numLevel { + numLevel = len(p.base.levels) + } + nv.levels = make([]tFiles, numLevel) + for level := 0; level < numLevel; level++ { + var baseTabels tFiles + if level < len(p.base.levels) { + baseTabels = p.base.levels[level] + } + + if level < len(p.levels) { + scratch := p.levels[level] + + var nt tFiles + // Prealloc list if possible. + if n := len(baseTabels) + len(scratch.added) - len(scratch.deleted); n > 0 { + nt = make(tFiles, 0, n) + } + + // Base tables. + for _, t := range baseTabels { + if _, ok := scratch.deleted[t.fd.Num]; ok { + continue + } + if _, ok := scratch.added[t.fd.Num]; ok { + continue + } + nt = append(nt, t) + } + + // New tables. + for _, r := range scratch.added { + nt = append(nt, tableFileFromRecord(r)) + } + + if len(nt) != 0 { + // Sort tables. + if level == 0 { + nt.sortByNum() + } else { + nt.sortByKey(p.base.s.icmp) + } + + nv.levels[level] = nt + } + } else { + nv.levels[level] = baseTabels + } + } + + // Trim levels. + n := len(nv.levels) + for ; n > 0 && nv.levels[n-1] == nil; n-- { + } + nv.levels = nv.levels[:n] + + // Compute compaction score for new version. 
+ nv.computeCompaction() + + return nv +} + +type versionReleaser struct { + v *version + once bool +} + +func (vr *versionReleaser) Release() { + v := vr.v + v.s.vmu.Lock() + if !vr.once { + v.releaseNB() + vr.once = true + } + v.s.vmu.Unlock() +} diff --git a/vendor/modules.txt b/vendor/modules.txt index cf05e33e0..89c3d8f33 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -265,6 +265,8 @@ github.com/golang/protobuf/ptypes/empty github.com/golang/protobuf/ptypes/struct github.com/golang/protobuf/ptypes/timestamp github.com/golang/protobuf/ptypes/wrappers +# github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db => github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db +github.com/golang/snappy # github.com/google/go-cmp v0.3.0 => github.com/google/go-cmp v0.3.0 github.com/google/go-cmp/cmp github.com/google/go-cmp/cmp/internal/diff @@ -490,6 +492,9 @@ github.com/opencontainers/image-spec/specs-go/v1 # github.com/openshift/api v0.0.0-20180801171038-322a19404e37 => github.com/openshift/api v0.0.0-20180801171038-322a19404e37 github.com/openshift/api/apps/v1 github.com/openshift/api/project/v1 +# github.com/opentracing/opentracing-go v1.1.0 => github.com/opentracing/opentracing-go v1.1.0 +github.com/opentracing/opentracing-go +github.com/opentracing/opentracing-go/log # github.com/pborman/uuid v1.2.0 => github.com/pborman/uuid v1.2.0 github.com/pborman/uuid # github.com/pelletier/go-buffruneio v0.2.0 => github.com/pelletier/go-buffruneio v0.2.0 @@ -535,7 +540,7 @@ github.com/projectcalico/libcalico-go/lib/selector/tokenizer github.com/projectcalico/libcalico-go/lib/set github.com/projectcalico/libcalico-go/lib/validator/v3 github.com/projectcalico/libcalico-go/lib/watch -# github.com/prometheus/client_golang v0.9.3 => github.com/prometheus/client_golang v0.9.3 +# github.com/prometheus/client_golang v0.9.4 => github.com/prometheus/client_golang v0.9.4 github.com/prometheus/client_golang/api github.com/prometheus/client_golang/api/prometheus/v1 
github.com/prometheus/client_golang/prometheus @@ -543,14 +548,26 @@ github.com/prometheus/client_golang/prometheus/internal github.com/prometheus/client_golang/prometheus/promhttp # github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90 => github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90 github.com/prometheus/client_model/go -# github.com/prometheus/common v0.4.0 => github.com/prometheus/common v0.4.0 +# github.com/prometheus/common v0.4.1 => github.com/prometheus/common v0.4.0 github.com/prometheus/common/expfmt github.com/prometheus/common/internal/bitbucket.org/ww/goautoneg github.com/prometheus/common/log github.com/prometheus/common/model -# github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084 => github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084 +# github.com/prometheus/procfs v0.0.2 => github.com/prometheus/procfs v0.0.2 github.com/prometheus/procfs github.com/prometheus/procfs/internal/fs +# github.com/prometheus/prometheus v1.8.2 => github.com/prometheus/prometheus v1.8.2 +github.com/prometheus/prometheus/promql +github.com/prometheus/prometheus/storage +github.com/prometheus/prometheus/storage/local +github.com/prometheus/prometheus/storage/local/chunk +github.com/prometheus/prometheus/storage/local/codable +github.com/prometheus/prometheus/storage/local/index +github.com/prometheus/prometheus/storage/metric +github.com/prometheus/prometheus/util/flock +github.com/prometheus/prometheus/util/stats +github.com/prometheus/prometheus/util/strutil +github.com/prometheus/prometheus/util/testutil # github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a => github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a github.com/rcrowley/go-metrics # github.com/satori/go.uuid v1.2.0 => github.com/satori/go.uuid v1.2.0 @@ -586,6 +603,19 @@ github.com/stretchr/objx # github.com/stretchr/testify v1.4.0 => github.com/stretchr/testify v1.4.0 github.com/stretchr/testify/assert 
github.com/stretchr/testify/mock +# github.com/syndtr/goleveldb v1.0.0 => github.com/syndtr/goleveldb v1.0.0 +github.com/syndtr/goleveldb/leveldb +github.com/syndtr/goleveldb/leveldb/cache +github.com/syndtr/goleveldb/leveldb/comparer +github.com/syndtr/goleveldb/leveldb/errors +github.com/syndtr/goleveldb/leveldb/filter +github.com/syndtr/goleveldb/leveldb/iterator +github.com/syndtr/goleveldb/leveldb/journal +github.com/syndtr/goleveldb/leveldb/memdb +github.com/syndtr/goleveldb/leveldb/opt +github.com/syndtr/goleveldb/leveldb/storage +github.com/syndtr/goleveldb/leveldb/table +github.com/syndtr/goleveldb/leveldb/util # github.com/xanzy/ssh-agent v0.2.1 => github.com/xanzy/ssh-agent v0.2.1 github.com/xanzy/ssh-agent # github.com/yashtewari/glob-intersection v0.0.0-20180916065949-5c77d914dd0b => github.com/yashtewari/glob-intersection v0.0.0-20180916065949-5c77d914dd0b