diff --git a/dora/core/client/fs/src/main/java/alluxio/client/file/cache/LocalCacheManager.java b/dora/core/client/fs/src/main/java/alluxio/client/file/cache/LocalCacheManager.java index 7d1d206158ed..3683e4f61722 100644 --- a/dora/core/client/fs/src/main/java/alluxio/client/file/cache/LocalCacheManager.java +++ b/dora/core/client/fs/src/main/java/alluxio/client/file/cache/LocalCacheManager.java @@ -32,6 +32,7 @@ import alluxio.file.ReadTargetBuffer; import alluxio.metrics.MetricKey; import alluxio.metrics.MetricsSystem; +import alluxio.metrics.MultiDimensionalMetricsSystem; import alluxio.network.protocol.databuffer.DataFileChannel; import alluxio.resource.LockResource; @@ -209,6 +210,7 @@ public Optional getDataFileChannel( DataFileChannel dataFileChannel = pageInfo.getLocalCacheDir().getPageStore() .getDataFileChannel(pageInfo.getPageId(), pageOffset, bytesToRead, cacheContext.isTemporary()); + MultiDimensionalMetricsSystem.CACHED_DATA_READ.inc(bytesToRead); MetricsSystem.counter(MetricKey.CLIENT_CACHE_HIT_REQUESTS.getName()).inc(); MetricsSystem.meter(MetricKey.CLIENT_CACHE_BYTES_READ_CACHE.getName()).mark(bytesToRead); cacheContext.incrementCounter(MetricKey.CLIENT_CACHE_BYTES_READ_CACHE.getMetricName(), BYTE, @@ -501,6 +503,7 @@ private PutResult putAttempt(PageId pageId, ByteBuffer page, CacheContext cacheC try { pageStoreDir.getPageStore().delete(victim); // Bytes evicted from the cache + MultiDimensionalMetricsSystem.CACHED_EVICTED_DATA.inc(victimPageInfo.getPageSize()); MetricsSystem.meter(MetricKey.CLIENT_CACHE_BYTES_EVICTED.getName()) .mark(victimPageInfo.getPageSize()); // Errors when adding pages @@ -620,6 +623,7 @@ public int get(PageId pageId, int pageOffset, int bytesToRead, ReadTargetBuffer } return -1; } + MultiDimensionalMetricsSystem.CACHED_DATA_READ.inc(bytesRead); MetricsSystem.meter(MetricKey.CLIENT_CACHE_BYTES_READ_CACHE.getName()).mark(bytesRead); cacheContext.incrementCounter(MetricKey.CLIENT_CACHE_BYTES_READ_CACHE.getMetricName(), BYTE, bytesRead); diff --git a/dora/core/common/src/main/java/alluxio/metrics/MultiDimensionalMetricsSystem.java b/dora/core/common/src/main/java/alluxio/metrics/MultiDimensionalMetricsSystem.java index 109f713b0c6b..70b2989eb050 100644 --- a/dora/core/common/src/main/java/alluxio/metrics/MultiDimensionalMetricsSystem.java +++ b/dora/core/common/src/main/java/alluxio/metrics/MultiDimensionalMetricsSystem.java @@ -13,12 +13,14 @@ import alluxio.conf.Configuration; import alluxio.conf.PropertyKey; +import alluxio.util.CommonUtils; import alluxio.util.FormatUtils; import io.prometheus.metrics.config.PrometheusProperties; import io.prometheus.metrics.core.metrics.Counter; import io.prometheus.metrics.core.metrics.Gauge; import io.prometheus.metrics.core.metrics.GaugeWithCallback; +import io.prometheus.metrics.core.metrics.Histogram; import io.prometheus.metrics.core.metrics.Summary; import io.prometheus.metrics.exporter.common.PrometheusHttpRequest; import io.prometheus.metrics.exporter.common.PrometheusHttpResponse; @@ -43,37 +45,57 @@ * and expose all the metrics to the web server. */ public final class MultiDimensionalMetricsSystem { - public static final Summary DATA_ACCESS = Summary.builder() + public static final Histogram DATA_ACCESS = Histogram.builder() .name("alluxio_data_access") + .help("aggregated throughput of all the data access") + .unit(Unit.BYTES) + .labelNames("method") + .build(); + + public static final Summary DATA_ACCESS_LATENCY = Summary.builder() + .name("alluxio_data_access_latency") .help("aggregated latency of all the data access") .unit(Unit.SECONDS) .labelNames("method") - .register(); + .build(); - public static final Summary UFS_DATA_ACCESS = Summary.builder() + public static final Histogram UFS_DATA_ACCESS = Histogram.builder() .name("alluxio_ufs_data_access") + .help("aggregated throughput of ufs access") + .unit(Unit.BYTES) + .labelNames("method") + .build(); + + public static final Summary UFS_DATA_ACCESS_LATENCY = Summary.builder() + .name("alluxio_ufs_data_access_latency") .help("aggregated latency of ufs access") .unit(Unit.SECONDS) .labelNames("method") - .register(); + .build(); + + public static final Counter META_OPERATION = Counter.builder() + .name("alluxio_meta_operation") + .help("counter of rpc calls of the meta operations") + .labelNames("op") + .build(); public static final Counter CACHED_DATA_READ = Counter.builder() .name("alluxio_cached_data_read") .help("amount of the read cached data") .unit(Unit.BYTES) - .register(); + .build(); - public static final Counter META_OPERATION = Counter.builder() - .name("alluxio_meta_operation") - .help("counter of rpc calls of the meta operations") - .labelNames("op") - .register(); + public static final Counter CACHED_EVICTED_DATA = Counter.builder() + .name("alluxio_cached_evicted_data") + .help("amount of the evicted data") + .unit(Unit.BYTES) + .build(); public static final Gauge CACHED_STORAGE = Gauge.builder() .name("alluxio_cached_storage") .help("amount of the cached data") .unit(Unit.BYTES) - .register(); + .build(); public static final GaugeWithCallback CACHED_CAPACITY = GaugeWithCallback.builder() .name("alluxio_cached_capacity") @@ -84,25 +106,40 @@ public final class MultiDimensionalMetricsSystem { long sum = sizes.stream().map(FormatUtils::parseSpaceSize).reduce(0L, Long::sum); callback.call(sum); }) - .register(); - - public static final Counter CACHED_EVICTED_DATA = Counter.builder() - .name("alluxio_cached_evicted_data") - .help("amount of the evicted data") - .unit(Unit.BYTES) - .register(); - - public static final Gauge CACHED_READABLE_STORAGE = Gauge.builder() - .name("alluxio_cached_readable_storage") - .help("amount of readable cached data") - .unit(Unit.BYTES) - .register(); + .build(); /** * Initialize all the metrics. */ public static void initMetrics() { JvmMetrics.builder().register(); + switch (CommonUtils.PROCESS_TYPE.get()) { + case MASTER: + // No essential metrics for the master for now. + break; + case WORKER: + PrometheusRegistry.defaultRegistry.register(DATA_ACCESS); + PrometheusRegistry.defaultRegistry.register(DATA_ACCESS_LATENCY); + PrometheusRegistry.defaultRegistry.register(UFS_DATA_ACCESS); + PrometheusRegistry.defaultRegistry.register(UFS_DATA_ACCESS_LATENCY); + PrometheusRegistry.defaultRegistry.register(META_OPERATION); + PrometheusRegistry.defaultRegistry.register(CACHED_DATA_READ); + PrometheusRegistry.defaultRegistry.register(CACHED_EVICTED_DATA); + PrometheusRegistry.defaultRegistry.register(CACHED_STORAGE); + PrometheusRegistry.defaultRegistry.register(CACHED_CAPACITY); + break; + case CLIENT: + PrometheusRegistry.defaultRegistry.register(DATA_ACCESS); + PrometheusRegistry.defaultRegistry.register(DATA_ACCESS_LATENCY); + PrometheusRegistry.defaultRegistry.register(META_OPERATION); + PrometheusRegistry.defaultRegistry.register(CACHED_DATA_READ); + PrometheusRegistry.defaultRegistry.register(CACHED_EVICTED_DATA); + PrometheusRegistry.defaultRegistry.register(CACHED_STORAGE); + PrometheusRegistry.defaultRegistry.register(CACHED_CAPACITY); + break; + default: + // Ignore and only expose JVM-related metrics + } } /**