Skip to content

Commit

Permalink
MGMT-19840: Gather operational metrics from installercache
Browse files Browse the repository at this point in the history
The intent of this PR is to track the following statistics, implemented as counters that are incremented from the applicable parts of the solution.

	counterDescriptionInstallerCachePrunedHardlink           "Counts the number of times the installercache pruned a hardlink for being too old"
	counterDescriptionInstallerCacheGetReleaseOK             "Counts the number of times that a release was fetched successfully"
	counterDescriptionInstallerCacheGetReleaseTimeout        "Counts the number of times that a release timed out or had the context cancelled"
	counterDescriptionInstallerCacheGetReleaseError          "Counts the number of times that a release fetch resulted in error"
	counterDescriptionInstallerCacheReleaseCached            "Counts the number of times that a release was found in the cache"
	counterDescriptionInstallerCacheReleaseExtracted         "Counts the number of times that a release was extracted"
	counterDescriptionInstallerCacheTryEviction              "Counts the number of times that the eviction function was called"
	counterDescriptionInstallerCacheReleaseEvicted           "Counts the number of times that a release was evicted"

This, combined with the event-based metrics gathered in openshift#7156, should provide enough information to track the behaviour of the cache.
  • Loading branch information
paul-maidment committed Feb 6, 2025
1 parent 2fbd268 commit f7b0e81
Show file tree
Hide file tree
Showing 6 changed files with 297 additions and 20 deletions.
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ func main() {
Options.BMConfig.S3EndpointURL = newUrl

Options.InstallerCacheConfig.CacheDir = filepath.Join(Options.GeneratorConfig.GetWorkingDirectory(), "installercache")
installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, diskStatsHelper, log)
installerCache, err := installercache.New(Options.InstallerCacheConfig, eventsHandler, metricsManager, diskStatsHelper, log)
failOnError(err, "failed to instantiate installercache")

generator := generator.New(log, objectHandler, Options.GeneratorConfig, providerRegistry, manifestsApi, eventsHandler, installerCache)
Expand Down
20 changes: 15 additions & 5 deletions internal/ignition/installmanifests_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ var _ = Describe("Bootstrap Ignition Update", func() {
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler *eventsapi.MockHandler
installerCache *installercache.Installers
metricsAPI *metrics.MockAPI
)

BeforeEach(func() {
Expand All @@ -105,12 +106,13 @@ var _ = Describe("Bootstrap Ignition Update", func() {
err1 = os.WriteFile(examplePath, []byte(bootstrap1), 0600)
Expect(err1).NotTo(HaveOccurred())
ctrl = gomock.NewController(GinkgoT())
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
mockS3Client = s3wrapper.NewMockAPI(ctrl)
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
Expand Down Expand Up @@ -262,6 +264,7 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6
ctrl *gomock.Controller
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand All @@ -286,12 +289,13 @@ SV4bRR9i0uf+xQ/oYRvugQ25Q7EahO5hJIWRf4aULbk36Zpw3++v2KFnF26zqwB6
ctrl = gomock.NewController(GinkgoT())
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down Expand Up @@ -456,6 +460,7 @@ var _ = Describe("createHostIgnitions", func() {
workDir string
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand All @@ -476,13 +481,14 @@ var _ = Describe("createHostIgnitions", func() {
mockS3Client = s3wrapper.NewMockAPI(ctrl)
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
cluster = testCluster()
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down Expand Up @@ -1740,6 +1746,7 @@ var _ = Describe("Bare metal host generation", func() {
ctrl *gomock.Controller
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand All @@ -1750,13 +1757,14 @@ var _ = Describe("Bare metal host generation", func() {
ctrl = gomock.NewController(GinkgoT())
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
ReleaseFetchRetryInterval: 1 * time.Microsecond,
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down Expand Up @@ -1850,6 +1858,7 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() {
ctrl *gomock.Controller
manifestsAPI *manifestsapi.MockManifestsAPI
eventsHandler eventsapi.Handler
metricsAPI *metrics.MockAPI
installerCache *installercache.Installers
)

Expand Down Expand Up @@ -1881,13 +1890,14 @@ var _ = Describe("Import Cluster TLS Certs for ephemeral installer", func() {
ctrl = gomock.NewController(GinkgoT())
manifestsAPI = manifestsapi.NewMockManifestsAPI(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
installerCacheConfig := installercache.Config{
CacheDir: filepath.Join(workDir, "some-dir", "installercache"),
MaxCapacity: installercache.Size(5),
MaxReleaseSize: installercache.Size(5),
ReleaseFetchRetryInterval: 1 * time.Microsecond,
}
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
installerCache, err = installercache.New(installerCacheConfig, eventsHandler, metricsAPI, metrics.NewOSDiskStatsHelper(logrus.New()), logrus.New())
Expect(err).NotTo(HaveOccurred())
})

Expand Down
21 changes: 17 additions & 4 deletions internal/installercache/installercache.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ type Installers struct {
eventsHandler eventsapi.Handler
diskStatsHelper metrics.DiskStatsHelper
config Config
metricsAPI metrics.API
}

type Size int64
Expand Down Expand Up @@ -116,7 +117,7 @@ func (rl *Release) Cleanup(ctx context.Context) error {
}

// New constructs an installer cache with a given storage capacity
func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) {
func New(config Config, eventsHandler eventsapi.Handler, metricsAPI metrics.API, diskStatsHelper metrics.DiskStatsHelper, log logrus.FieldLogger) (*Installers, error) {
if config.MaxCapacity > 0 && config.MaxReleaseSize == 0 {
return nil, fmt.Errorf("config.MaxReleaseSize (%d bytes) must not be zero", config.MaxReleaseSize)
}
Expand All @@ -128,6 +129,7 @@ func New(config Config, eventsHandler eventsapi.Handler, diskStatsHelper metrics
eventsHandler: eventsHandler,
diskStatsHelper: diskStatsHelper,
config: config,
metricsAPI: metricsAPI,
}, nil
}

Expand All @@ -138,14 +140,20 @@ func (i *Installers) Get(ctx context.Context, releaseID, releaseIDMirror, pullSe
for {
select {
case <-ctx.Done():
return nil, ctx.Err()
err := ctx.Err()
if err == context.DeadlineExceeded {
i.metricsAPI.InstallerCacheGetReleaseTimeout()
}
return nil, err
default:
release, err := i.get(releaseID, releaseIDMirror, pullSecret, ocRelease, ocpVersion, clusterID)
if err == nil {
i.metricsAPI.InstallerCacheGetReleaseOK()
return release, nil
}
_, isCapacityError := err.(*errorInsufficientCacheCapacity)
if !isCapacityError {
i.metricsAPI.InstallerCacheGetReleaseError()
return nil, errors.Wrapf(err, "failed to get installer path for release %s", releaseID)
}
time.Sleep(i.config.ReleaseFetchRetryInterval)
Expand All @@ -164,6 +172,7 @@ func (i *Installers) getDiskUsageIncludingHardlinks() (uint64, error) {
func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pullSecret, ocpVersion string, ocRelease oc.Release) (extractDuration float64, cached bool, err error) {
_, err = os.Stat(path)
if err == nil {
i.metricsAPI.InstallerCacheReleaseCached(releaseID)
return 0, true, nil // release was found in the cache
}
if !os.IsNotExist(err) {
Expand All @@ -181,6 +190,7 @@ func (i *Installers) extractReleaseIfNeeded(path, releaseID, releaseIDMirror, pu
if err != nil {
return 0, false, err
}
i.metricsAPI.InstallerCacheReleaseExtracted(releaseID)
return time.Since(extractStartTime).Seconds(), false, nil
}

Expand Down Expand Up @@ -247,6 +257,7 @@ func (i *Installers) shouldEvict(totalUsed int64) (shouldEvict bool) {
//
// Locking must be done outside evict() to avoid contentions.
func (i *Installers) evict() bool {
i.metricsAPI.InstallerCacheTryEviction()
// store the file paths
files := NewPriorityQueue(&fileInfo{})
links := make([]*fileInfo, 0)
Expand Down Expand Up @@ -312,6 +323,7 @@ func (i *Installers) evictFile(filePath string) error {
if err != nil {
return err
}
i.metricsAPI.InstallerCacheReleaseEvicted()
// if the parent directory was left empty,
// remove it to avoid dangling directories
parentDir := path.Dir(filePath)
Expand All @@ -334,10 +346,11 @@ func (i *Installers) pruneExpiredHardLinks(links []*fileInfo, gracePeriod time.D
grace := graceTime.Unix()
if finfo.info.ModTime().Unix() < grace {
i.log.Infof("attempting to prune hard link %s", finfo.path)
err := os.Remove(finfo.path)
if err != nil {
if err := os.Remove(finfo.path); err != nil {
i.log.WithError(err).Errorf("failed to prune hard link %s", finfo.path)
continue
}
i.metricsAPI.InstallerCachePrunedHardLink()
}
}
}
35 changes: 26 additions & 9 deletions internal/installercache/installercache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ var _ = Describe("installer cache", func() {
manager *Installers
cacheDir string
eventsHandler *eventsapi.MockHandler
metricsAPI *metrics.MockAPI
ctx context.Context
diskStatsHelper metrics.DiskStatsHelper
)
Expand All @@ -85,17 +86,17 @@ var _ = Describe("installer cache", func() {
}

BeforeEach(func() {

ctrl = gomock.NewController(GinkgoT())
diskStatsHelper = metrics.NewOSDiskStatsHelper(logrus.New())
mockRelease = oc.NewMockRelease(ctrl)
eventsHandler = eventsapi.NewMockHandler(ctrl)
metricsAPI = metrics.NewMockAPI(ctrl)
var err error
cacheDir, err = os.MkdirTemp("/tmp", "cacheDir")
Expect(err).NotTo(HaveOccurred())
Expect(os.Mkdir(filepath.Join(cacheDir, "quay.io"), 0755)).To(Succeed())
Expect(os.Mkdir(filepath.Join(filepath.Join(cacheDir, "quay.io"), "release-dev"), 0755)).To(Succeed())
manager, err = New(getInstallerCacheConfig(12, 5), eventsHandler, diskStatsHelper, logrus.New())
manager, err = New(getInstallerCacheConfig(12, 5), eventsHandler, metricsAPI, diskStatsHelper, logrus.New())
Expect(err).NotTo(HaveOccurred())
ctx = context.TODO()
})
Expand Down Expand Up @@ -138,8 +139,12 @@ var _ = Describe("installer cache", func() {
fname := filepath.Join(workdir, releaseID)
if !expectCached {
mockReleaseCalls(releaseID, version)
metricsAPI.EXPECT().InstallerCacheReleaseExtracted(releaseID).Times(1)
}
expectEventsSent()
mockReleaseCalls(releaseID, version)
expectEventsSent()
metricsAPI.EXPECT().InstallerCacheGetReleaseOK().Times(1)
l, err := manager.Get(ctx, releaseID, "mirror", "pull-secret", mockRelease, version, clusterID)
Expect(err).ShouldNot(HaveOccurred())
Expect(l.releaseID).To(Equal(releaseID))
Expand Down Expand Up @@ -189,6 +194,11 @@ var _ = Describe("installer cache", func() {
runTest := func(t test, manager *Installers) (*Release, error) {
expectEventsSent()
mockReleaseCalls(t.releaseID, t.version)
metricsAPI.EXPECT().InstallerCacheReleaseCached(t.releaseID).AnyTimes()
metricsAPI.EXPECT().InstallerCacheReleaseExtracted(t.releaseID).AnyTimes()
metricsAPI.EXPECT().InstallerCacheGetReleaseOK().AnyTimes()
metricsAPI.EXPECT().InstallerCacheTryEviction().AnyTimes()
metricsAPI.EXPECT().InstallerCacheReleaseEvicted().AnyTimes()
return manager.Get(ctx, t.releaseID, "mirror", "pull-secret", mockRelease, t.version, t.clusterID)
}

Expand Down Expand Up @@ -221,7 +231,7 @@ var _ = Describe("installer cache", func() {
// returns the first error encountered or nil if no error encountered.
runParallelTest := func(maxCapacity int64, maxReleaseSize int64, tests []test) error {
var err error
manager, err = New(getInstallerCacheConfig(maxCapacity, maxReleaseSize), eventsHandler, diskStatsHelper, getLogger())
manager, err = New(getInstallerCacheConfig(maxCapacity, maxReleaseSize), eventsHandler, metricsAPI, diskStatsHelper, getLogger())
Expect(err).ToNot(HaveOccurred())
var wg sync.WaitGroup
var reportedError error
Expand Down Expand Up @@ -290,30 +300,30 @@ var _ = Describe("installer cache", func() {
})

It("Should raise error on construction if max release size is larger than cache and cache is enabled", func() {
_, err := New(getInstallerCacheConfig(5, 10), eventsHandler, diskStatsHelper, logrus.New())
_, err := New(getInstallerCacheConfig(5, 10), eventsHandler, metricsAPI, diskStatsHelper, logrus.New())
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(Equal("config.MaxReleaseSize (10 bytes) must not be greater than config.MaxCapacity (5 bytes)"))
})

It("Should raise error on construction if max release size is zero and cache is enabled", func() {
_, err := New(getInstallerCacheConfig(5, 0), eventsHandler, diskStatsHelper, logrus.New())
_, err := New(getInstallerCacheConfig(5, 0), eventsHandler, metricsAPI, diskStatsHelper, logrus.New())
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(Equal("config.MaxReleaseSize (0 bytes) must not be zero"))
})

It("Should not raise error on construction if max release size is larger than cache and cache eviction is disabled", func() {
_, err := New(getInstallerCacheConfig(0, 10), eventsHandler, diskStatsHelper, logrus.New())
_, err := New(getInstallerCacheConfig(0, 10), eventsHandler, metricsAPI, diskStatsHelper, logrus.New())
Expect(err).ToNot(HaveOccurred())
})

It("Should not raise error on construction if max release size is zero and cache eviction is disabled", func() {
_, err := New(getInstallerCacheConfig(0, 0), eventsHandler, diskStatsHelper, logrus.New())
_, err := New(getInstallerCacheConfig(0, 0), eventsHandler, metricsAPI, diskStatsHelper, logrus.New())
Expect(err).ToNot(HaveOccurred())
})

It("when cache limit is zero - eviction is skipped", func() {
var err error
manager, err = New(getInstallerCacheConfig(0, 5), eventsHandler, diskStatsHelper, logrus.New())
manager, err = New(getInstallerCacheConfig(0, 5), eventsHandler, metricsAPI, diskStatsHelper, logrus.New())
Expect(err).ToNot(HaveOccurred())
clusterId := strfmt.UUID(uuid.New().String())
r1, _ := testGet("4.8", "4.8.0", clusterId, false)
Expand All @@ -333,7 +343,10 @@ var _ = Describe("installer cache", func() {
clusterId := strfmt.UUID(uuid.New().String())
_, _ = testGet("4.8", "4.8.0", clusterId, false)
r2, _ := testGet("4.9", "4.9.0", clusterId, false)
metricsAPI.EXPECT().InstallerCacheReleaseCached("4.8").Times(1)
r1, _ := testGet("4.8", "4.8.0", clusterId, true)
metricsAPI.EXPECT().InstallerCacheTryEviction().Times(1)
metricsAPI.EXPECT().InstallerCacheReleaseEvicted().Times(1)
r3, _ := testGet("4.10", "4.10.0", clusterId, false)

By("verify that the oldest file was deleted")
Expand All @@ -351,6 +364,8 @@ var _ = Describe("installer cache", func() {
clusterId := strfmt.UUID(uuid.New().String())
r1, _ := testGet("4.8", "4.8.0", clusterId, false)
r2, _ := testGet("4.9", "4.9.0", clusterId, false)
metricsAPI.EXPECT().InstallerCacheTryEviction().Times(1)
metricsAPI.EXPECT().InstallerCacheReleaseEvicted().Times(1)
r3, _ := testGet("4.10", "4.10.0", clusterId, false)

By("verify that the oldest file was deleted")
Expand All @@ -371,6 +386,8 @@ var _ = Describe("installer cache", func() {
version := "4.10.0"
clusterID := strfmt.UUID(uuid.NewString())
mockReleaseCalls(releaseID, version)
metricsAPI.EXPECT().InstallerCacheReleaseExtracted(releaseID).Times(1)
metricsAPI.EXPECT().InstallerCacheGetReleaseOK().Times(1)
l, err := manager.Get(ctx, releaseID, releaseMirrorID, "pull-secret", mockRelease, version, clusterID)
Expect(err).ShouldNot(HaveOccurred())
Expect(l.releaseID).To(Equal(releaseID))
Expand All @@ -389,7 +406,7 @@ var _ = Describe("installer cache", func() {

numberOfLinks := 10
numberOfExpiredLinks := 5

metricsAPI.EXPECT().InstallerCachePrunedHardLink().Times(numberOfExpiredLinks)
directory, err := os.MkdirTemp("", "testPruneExpiredHardLinks")
Expect(err).ToNot(HaveOccurred())

Expand Down
Loading

0 comments on commit f7b0e81

Please sign in to comment.