diff --git a/RELEASE_NOTES b/RELEASE_NOTES index f6be5ff057..182d6b2a27 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -6,7 +6,6 @@ Bug Fixes: * [Logs/Windows Event] Add windows event log service restart detection and resubscribe * [Metrics/JMX, Metrics/Net, Metrics/DiskIO] Change cumulative to delta conversion to drop initial value * [Metrics/JMX] Suppress sessions unit warning -* [ContainerInsights] Filtering out redundant attributes for Neuron metrics Enhancements: * [Metrics/JMX] Add cumulative to delta conversion for JMX metrics diff --git a/internal/containerinsightscommon/k8sconst.go b/internal/containerinsightscommon/k8sconst.go index 309c0ea9ef..c8423665fa 100644 --- a/internal/containerinsightscommon/k8sconst.go +++ b/internal/containerinsightscommon/k8sconst.go @@ -20,7 +20,6 @@ const ( PodOwnersKey = "PodOwners" HostKey = "host" K8sKey = "kubernetes" - K8sLabelsKey = "labels" RunningPodCount = "number_of_running_pods" RunningContainerCount = "number_of_running_containers" diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_checker.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_checker.go deleted file mode 100644 index 6718bd623e..0000000000 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_checker.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -package internal - -import ( - "regexp" -) - -const ( - PROCESSED_NEURON_METRIC_PATTERN = "^(container|node|pod)_(neuroncore_|neurondevice_).*|^node_neuron_.*" -) - -type AwsNeuronMetricChecker struct { -} - -func NewAwsNeuronMetricChecker() *AwsNeuronMetricChecker { - return &AwsNeuronMetricChecker{} -} - -func (md *AwsNeuronMetricChecker) IsProcessedNeuronMetric(name string) bool { - matched, err := regexp.MatchString(PROCESSED_NEURON_METRIC_PATTERN, name) - if err != nil { - print(err) - return false - } - return matched -} diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_checker_test.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_checker_test.go deleted file mode 100644 index c9eb5aacf1..0000000000 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_checker_test.go +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -package internal - -import ( - "testing" -) - -func TestAwsNeuronMetricModifier_IsProcessedNeuronMetric(t *testing.T) { - tests := []struct { - name string - input string - expected bool - }{ - { - name: "container_neuroncore_prefix", - input: "container_neuroncore_metric", - expected: true, - }, - { - name: "pod_neuroncore_prefix", - input: "pod_neuroncore_metric", - expected: true, - }, - { - name: "node_neuroncore_prefix", - input: "node_neuroncore_metric", - expected: true, - }, - { - name: "container_neurondevice_prefix", - input: "container_neurondevice_metric", - expected: true, - }, - { - name: "pod_neurondevice_prefix", - input: "pod_neurondevice_metric", - expected: true, - }, - { - name: "node_neurondevice_prefix", - input: "node_neurondevice_metric", - expected: true, - }, - { - name: "node_neuron_prefix", - input: "node_neuron_metric", - expected: true, - }, - { - name: "container_neuron_prefix", - input: "container_neuron_metric", - expected: false, - }, - { - name: "other_prefix", - input: "other_metric", - expected: false, - }, - } - - md := NewAwsNeuronMetricChecker() - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - result := md.IsProcessedNeuronMetric(test.input) - if result != test.expected { - t.Errorf("IsProcessedNeuronMetric(%q) = %v, expected %v", test.input, result, test.expected) - } - }) - } -} diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go index 62375030da..a2a463e83a 100644 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier.go @@ -6,6 +6,7 @@ package internal import ( "strings" + "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.uber.org/zap" @@ -45,7 +46,6 @@ const ( Kubernetes = "kubernetes" Region = "region" SubnetId = "subnet_id" - RuntimeTagOverride = "DEFAULT" NeuronExecutionErrorsAggregatedMetric = containerinsightscommon.NeuronExecutionErrors + "_total" NeuronDeviceHardwareEccEventsAggregatedMetric = containerinsightscommon.NeuronDeviceHardwareEccEvents + "_total" ) @@ -99,6 +99,26 @@ var ( "sram_ecc_corrected": NeuronDeviceHardwareEccEventsAggregatedMetric, "sram_ecc_uncorrected": NeuronDeviceHardwareEccEventsAggregatedMetric}, } + + MetricAttributesToKeep = map[string]struct{}{ + ClusterName: {}, + ContainerName: {}, + FullPodName: {}, + InstanceId: {}, + InstanceType: {}, + K8sPodName: {}, + Namespace: {}, + NeuronDevice: {}, + NodeName: {}, + PodName: {}, + Service: {}, + AvailabilityZone: {}, + Kubernetes: {}, + Region: {}, + RuntimeTag: {}, + SubnetId: {}, + NeuronCore: {}, + } ) func NewMetricModifier(logger *zap.Logger) *AwsNeuronMetricModifier { @@ -122,7 +142,7 @@ func (md *AwsNeuronMetricModifier) ModifyMetric(originalMetric pmetric.Metric, m } // Neuron metrics sent by the neuron monitor don't have any units so we add them in the agent. addUnit(originalMetric) - updateCoreDeviceRuntimeLabels(originalMetric) + prefixCoreAndDeviceLabels(originalMetric) resetStaleDatapoints(originalMetric) originalMetricName := originalMetric.Name() @@ -136,6 +156,7 @@ func (md *AwsNeuronMetricModifier) ModifyMetric(originalMetric pmetric.Metric, m } modifiedMetricSlice := md.extractDatapointsAsMetricsAndAggregate(originalMetric) + filterLabels(modifiedMetricSlice, originalMetricName) md.duplicateMetrics(modifiedMetricSlice, originalMetricName, originalMetric.Sum().DataPoints(), metrics) } @@ -230,6 +251,7 @@ func (md *AwsNeuronMetricModifier) extractDatapointsAsMetricsAndAggregate(origin // Creating body for the aggregated metric and add it to the new newMetricSlice for each runtime for aggregatedMetricMetadata, value := range aggregatedValuesPerRuntimeTag { + // Aggregated metric for neuron device ecc events is not required aggregatedMetric := setMetricMetadata(newMetricSlice.AppendEmpty(), aggregatedMetricMetadata.aggregatedMetricName, originalMetric.Unit()) originalMetricDatapoints.At(0).CopyTo(aggregatedMetric.SetEmptySum().DataPoints().AppendEmpty()) @@ -247,9 +269,33 @@ func (md *AwsNeuronMetricModifier) extractDatapointsAsMetricsAndAggregate(origin return newMetricSlice } +// This method removes the attribute keys which are not required. The removal is necessary so that the metrics are grouped together +func filterLabels(slice pmetric.MetricSlice, originalMetricName string) { + _, exists := metricModificationsMap[originalMetricName] + if !exists { + return + } + + for i := 0; i < slice.Len(); i++ { + m := slice.At(i) + + dps := m.Sum().DataPoints() + for j := 0; j < dps.Len(); j++ { + attributes := dps.At(j).Attributes() + attributes.RemoveIf(func(label string, value pcommon.Value) bool { + _, exists := MetricAttributesToKeep[label] + if !exists { + return true + } + return false + }) + } + } +} + // This method prefixes NeuronCore and NeuronDevice values with `core` and `device` respectively // to make the attribute values more verbose -func updateCoreDeviceRuntimeLabels(originalMetric pmetric.Metric) { +func prefixCoreAndDeviceLabels(originalMetric pmetric.Metric) { dps := originalMetric.Sum().DataPoints() for i := 0; i < dps.Len(); i++ { dp := dps.At(i) @@ -258,7 +304,6 @@ func updateCoreDeviceRuntimeLabels(originalMetric pmetric.Metric) { dp.Attributes().PutStr(attributeKey, attributeValuePrefix+value.Str()) } } - dp.Attributes().PutStr(RuntimeTag, RuntimeTagOverride) } } @@ -315,7 +360,7 @@ func resetStaleDatapoints(originalMetric pmetric.Metric) { dp := dps.At(i) if dp.ValueType() == pmetric.NumberDataPointValueTypeEmpty || dp.Flags().NoRecordedValue() { dp.SetDoubleValue(dp.DoubleValue()) - dp.Attributes().PutStr(RuntimeTag, RuntimeTagOverride) + dp.Attributes().PutStr(RuntimeTag, "default") dp.SetFlags(dp.Flags().WithNoRecordedValue(false)) } } diff --git a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go index c9c0de0bca..b0140b831c 100644 --- a/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go +++ b/plugins/processors/gpuattributes/internal/awsneuron_metric_modifier_test.go @@ -21,7 +21,7 @@ var staticAttributes = map[string]any{ NodeName: "dummyAttribute", AvailabilityZone: "dummyAttribute", Kubernetes: "dummyAttribute", - RuntimeTag: RuntimeTagOverride, + RuntimeTag: "dummyAttribute", SubnetId: "dummyAttribute", } var staticTimestamp = pcommon.NewTimestampFromTime(time.Date(2023, time.March, 12, 11, 0, 0, 0, time.UTC)) @@ -75,7 +75,7 @@ func TestMetricModifierForExecutionLatencyMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionLatency: metricsList.At(0), - "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron, Percentile: "p50"}}, []float64{1}, pmetric.MetricTypeSum, Seconds), + "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{1}, pmetric.MetricTypeSum, Seconds), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -88,13 +88,13 @@ func TestMetricModifierForExecutionErrorMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionErrors: metricsList.At(0), - "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "generic"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "numerical"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "transient"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "model"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "runtime"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "hardware"}}, []float64{6}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "generic"}}, []float64{21}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{21}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -111,12 +111,12 @@ func TestMetricModifierForExecutionStatusMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionStatus: metricsList.At(0), - "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed_with_err"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed_with_num_err"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "timed_out"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "incorrect_input"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "failed_to_queue"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -130,9 +130,9 @@ func TestMetricModifierForNeuronCoreMemoryUsageMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronCoreMemoryUsageModelSharedScratchpad: metricsList.At(0), - "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -146,7 +146,7 @@ func TestMetricModifierForNeuronCoreMemoryUsageMetric_PodNameMissing(t *testing. expectedMetrics := map[string]pmetric.Metric{ NeuronCoreMemoryUsageModelSharedScratchpad: metricsList.At(0), - "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -160,7 +160,7 @@ func TestMetricModifierForNeuronDeviceRuntimeMemoryUsageMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceRuntimeMemoryUsedBytes: metricsList.At(0), - "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron, MemoryLocation: "neuron_device"}}, []float64{2}, pmetric.MetricTypeSum, Bytes), + "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{2}, pmetric.MetricTypeSum, Bytes), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -174,21 +174,21 @@ func TestMetricModifierForNeuronDeviceEccEventMetric(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceHwEccEvents: metricsList.At(0), - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -202,11 +202,11 @@ func TestMetricModifierForNeuronDeviceEccEventMetric_PodNameMissing(t *testing.T expectedMetrics := map[string]pmetric.Metric{ NeuronDeviceHwEccEvents: metricsList.At(0), - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) @@ -249,44 +249,44 @@ func TestListWithMultipleMetrics(t *testing.T) { NeuronDeviceHwEccEvents: metricsList.At(5), NonNeuronMetric: metricsList.At(6), - "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron, Percentile: "p50"}}, []float64{1}, pmetric.MetricTypeSum, Seconds), - - "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "generic"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "numerical"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "transient"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "model"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "runtime"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "hardware"}}, []float64{6}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, ErrorType: "generic"}}, []float64{21}, pmetric.MetricTypeSum, Count), - - "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed_with_err"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "completed_with_num_err"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "timed_out"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "incorrect_input"}}, []float64{5}, pmetric.MetricTypeSum, Count), - "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: RuntimeTagOverride, StatusType: "failed_to_queue"}}, []float64{6}, pmetric.MetricTypeSum, Count), - - "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None"}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), - - "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron, MemoryLocation: "neuron_device"}}, []float64{2}, pmetric.MetricTypeSum, Bytes), - - "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{1}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_uncorrected"}}, []float64{2}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_corrected"}}, []float64{3}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "sram_ecc_uncorrected"}}, []float64{4}, pmetric.MetricTypeSum, Count), - "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDeviceIndex: "1", NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: RuntimeTagOverride, EventType: "mem_ecc_corrected"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{1}, pmetric.MetricTypeSum, Seconds), + + "node_neuron_execution_errors_generic": createExpectedMetric("node_neuron_execution_errors_generic", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_numerical": createExpectedMetric("node_neuron_execution_errors_numerical", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_transient": createExpectedMetric("node_neuron_execution_errors_transient", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_model": createExpectedMetric("node_neuron_execution_errors_model", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_runtime": createExpectedMetric("node_neuron_execution_errors_runtime", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_hardware": createExpectedMetric("node_neuron_execution_errors_hardware", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_errors_total": createExpectedMetric("node_neuron_execution_errors_total", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{21}, pmetric.MetricTypeSum, Count), + + "node_neuron_execution_status_completed": createExpectedMetric("node_neuron_execution_status_completed", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_err": createExpectedMetric("node_neuron_execution_status_completed_with_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_completed_with_num_err": createExpectedMetric("node_neuron_execution_status_completed_with_num_err", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_timed_out": createExpectedMetric("node_neuron_execution_status_timed_out", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_incorrect_input": createExpectedMetric("node_neuron_execution_status_incorrect_input", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{5}, pmetric.MetricTypeSum, Count), + "node_neuron_execution_status_failed_to_queue": createExpectedMetric("node_neuron_execution_status_failed_to_queue", true, []map[string]string{{Type: NodeAWSNeuron, RuntimeTag: "1"}}, []float64{6}, pmetric.MetricTypeSum, Count), + + "node_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("node_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: NodeAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: NodeAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "pod_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("pod_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: PodAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: PodAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + "container_neuroncore_memory_usage_model_shared_scratchpad": createExpectedMetric("container_neuroncore_memory_usage_model_shared_scratchpad", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core1", NeuronDevice: "device0", Type: ContainerAWSNeuronCore, PodName: DummyPod}, {NeuronCore: "core2", NeuronDevice: "device1", Type: ContainerAWSNeuronCore, PodName: DummyPod}}, []float64{1, 2, 3}, pmetric.MetricTypeSum, Bytes), + + "node_neurondevice_runtime_memory_used_bytes": createExpectedMetric("node_neurondevice_runtime_memory_used_bytes", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{2}, pmetric.MetricTypeSum, Bytes), + + "node_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("node_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "node_neurondevice_hw_ecc_events_total": createExpectedMetric("node_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: NodeAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("pod_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "pod_neurondevice_hw_ecc_events_total": createExpectedMetric("pod_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: PodAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{1}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_mem_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_mem_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{2}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_corrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_corrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{3}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_sram_ecc_uncorrected": createExpectedMetric("container_neurondevice_hw_ecc_events_sram_ecc_uncorrected", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{4}, pmetric.MetricTypeSum, Count), + "container_neurondevice_hw_ecc_events_total": createExpectedMetric("container_neurondevice_hw_ecc_events_total", false, []map[string]string{{NeuronDevice: "device1", PodName: DummyPod, Type: ContainerAWSNeuronDevice, RuntimeTag: "1"}}, []float64{10}, pmetric.MetricTypeSum, Count), } assertModifiedMetric(t, metricsList, expectedMetrics) } @@ -303,7 +303,7 @@ func TestMetricWithStaleDatapoint(t *testing.T) { expectedMetrics := map[string]pmetric.Metric{ NeuronExecutionLatency: metricsList.At(0), - "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron, Percentile: "p50"}}, []float64{1}, pmetric.MetricTypeSum, Seconds), + "node_neuron_execution_latency": createExpectedMetric("node_neuron_execution_latency", false, []map[string]string{{Type: NodeAWSNeuron}}, []float64{1}, pmetric.MetricTypeSum, Seconds), } assertModifiedMetric(t, metricsList, expectedMetrics) diff --git a/plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go b/plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go deleted file mode 100644 index dedc259432..0000000000 --- a/plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -package metricFilters - -import ( - "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" - "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpuattributes/internal" -) - -// This class contains the attribute filters which are applied to the metric datapoints of GPU and Neuron metrics. -// If the datapoint contains metrics apart from the ones mentioned in the filter, then they'll be dropped. - -const ( - containerd = "containerd" - pod_id = "pod_id" - pod_name = "pod_name" - pod_owners = "pod_owners" - namespace = "namespace" - container_name = "container_name" -) - -var ContainerGpuLabelFilter = map[string]map[string]interface{}{ - containerinsightscommon.ClusterNameKey: nil, - containerinsightscommon.InstanceIdKey: nil, - containerinsightscommon.GpuDeviceKey: nil, - containerinsightscommon.MetricType: nil, - containerinsightscommon.NodeNameKey: nil, - containerinsightscommon.K8sNamespace: nil, - containerinsightscommon.FullPodNameKey: nil, - containerinsightscommon.PodNameKey: nil, - containerinsightscommon.TypeService: nil, - containerinsightscommon.GpuUniqueId: nil, - containerinsightscommon.ContainerNamekey: nil, - containerinsightscommon.InstanceTypeKey: nil, - containerinsightscommon.VersionKey: nil, - containerinsightscommon.SourcesKey: nil, - containerinsightscommon.Timestamp: nil, - containerinsightscommon.K8sKey: { - containerinsightscommon.HostKey: nil, - containerinsightscommon.K8sLabelsKey: nil, - pod_id: nil, - pod_name: nil, - pod_owners: nil, - namespace: nil, - container_name: nil, - containerd: nil, - }, -} -var PodGpuLabelFilter = map[string]map[string]interface{}{ - containerinsightscommon.ClusterNameKey: nil, - containerinsightscommon.InstanceIdKey: nil, - containerinsightscommon.GpuDeviceKey: nil, - containerinsightscommon.MetricType: nil, - containerinsightscommon.NodeNameKey: nil, - containerinsightscommon.K8sNamespace: nil, - containerinsightscommon.FullPodNameKey: nil, - containerinsightscommon.PodNameKey: nil, - containerinsightscommon.TypeService: nil, - containerinsightscommon.GpuUniqueId: nil, - containerinsightscommon.InstanceTypeKey: nil, - containerinsightscommon.VersionKey: nil, - containerinsightscommon.SourcesKey: nil, - containerinsightscommon.Timestamp: nil, - containerinsightscommon.K8sKey: { - containerinsightscommon.HostKey: nil, - containerinsightscommon.K8sLabelsKey: nil, - pod_id: nil, - pod_name: nil, - pod_owners: nil, - namespace: nil, - }, -} -var NodeGpuLabelFilter = map[string]map[string]interface{}{ - containerinsightscommon.ClusterNameKey: nil, - containerinsightscommon.InstanceIdKey: nil, - containerinsightscommon.GpuDeviceKey: nil, - containerinsightscommon.MetricType: nil, - containerinsightscommon.NodeNameKey: nil, - containerinsightscommon.InstanceTypeKey: nil, - containerinsightscommon.VersionKey: nil, - containerinsightscommon.SourcesKey: nil, - containerinsightscommon.Timestamp: nil, - containerinsightscommon.K8sKey: { - containerinsightscommon.HostKey: nil, - }, -} - -var PodNeuronLabelFilter = map[string]map[string]interface{}{ - containerinsightscommon.ClusterNameKey: nil, - containerinsightscommon.FullPodNameKey: nil, - containerinsightscommon.InstanceIdKey: nil, - containerinsightscommon.InstanceTypeKey: nil, - containerinsightscommon.K8sPodNameKey: nil, - containerinsightscommon.K8sNamespace: nil, - internal.NeuronDevice: nil, - containerinsightscommon.NodeNameKey: nil, - containerinsightscommon.PodNameKey: nil, - containerinsightscommon.TypeService: nil, - internal.AvailabilityZone: nil, - containerinsightscommon.K8sKey: { - containerinsightscommon.HostKey: nil, - pod_id: nil, - pod_owners: nil, - containerinsightscommon.K8sLabelsKey: nil, - }, - internal.Region: nil, - internal.SubnetId: nil, - internal.NeuronCore: nil, - containerinsightscommon.MetricType: nil, -} - -var ContainerNeuronLabelFilter = map[string]map[string]interface{}{ - containerinsightscommon.ClusterNameKey: nil, - containerinsightscommon.ContainerNamekey: nil, - containerinsightscommon.FullPodNameKey: nil, - containerinsightscommon.InstanceIdKey: nil, - containerinsightscommon.InstanceTypeKey: nil, - containerinsightscommon.K8sPodNameKey: nil, - containerinsightscommon.K8sNamespace: nil, - internal.NeuronDevice: nil, - containerinsightscommon.NodeNameKey: nil, - containerinsightscommon.PodNameKey: nil, - containerinsightscommon.TypeService: nil, - internal.AvailabilityZone: nil, - containerinsightscommon.Kubernetes: { - containerinsightscommon.HostKey: nil, - "containerd": nil, - pod_id: nil, - pod_owners: nil, - containerinsightscommon.K8sLabelsKey: nil, - }, - internal.Region: nil, - internal.SubnetId: nil, - internal.NeuronCore: nil, - containerinsightscommon.MetricType: nil, -} - -var NodeNeuronLabelFilter = map[string]map[string]interface{}{ - containerinsightscommon.ClusterNameKey: nil, - containerinsightscommon.InstanceIdKey: nil, - containerinsightscommon.InstanceTypeKey: nil, - containerinsightscommon.K8sNamespace: nil, - internal.NeuronDevice: nil, - containerinsightscommon.NodeNameKey: nil, - containerinsightscommon.TypeService: nil, - internal.AvailabilityZone: nil, - containerinsightscommon.Kubernetes: { - containerinsightscommon.HostKey: nil, - containerinsightscommon.K8sLabelsKey: nil, - }, - internal.Region: nil, - internal.SubnetId: nil, - internal.NeuronCore: nil, - containerinsightscommon.MetricType: nil, -} diff --git a/plugins/processors/gpuattributes/processor.go b/plugins/processors/gpuattributes/processor.go index d37ff0298a..94fb411a53 100644 --- a/plugins/processors/gpuattributes/processor.go +++ b/plugins/processors/gpuattributes/processor.go @@ -11,18 +11,16 @@ import ( "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.uber.org/zap" - "golang.org/x/exp/maps" "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpuattributes/internal" - "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpuattributes/internal/metricFilters" ) const ( - gpuMetricIdentifier = "_gpu_" - containerMetricPrefix = "container_" - podMetricPrefix = "pod_" - nodeMetricPrefix = "node_" + gpuMetricIdentifier = "_gpu_" + gpuContainerMetricPrefix = "container_" + gpuPodMetricPrefix = "pod_" + gpuNodeMetricPrefix = "node_" ) // schemas at each resource level @@ -44,12 +42,77 @@ const ( // - ClusterName // - ClusterName, InstanceIdKey, NodeName // - ClusterName, InstanceIdKey, NodeName, GpuDevice +var containerLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.GpuDeviceKey: nil, + containerinsightscommon.MetricType: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.K8sNamespace: nil, + containerinsightscommon.FullPodNameKey: nil, + containerinsightscommon.PodNameKey: nil, + containerinsightscommon.TypeService: nil, + containerinsightscommon.GpuUniqueId: nil, + containerinsightscommon.ContainerNamekey: nil, + containerinsightscommon.InstanceTypeKey: nil, + containerinsightscommon.VersionKey: nil, + containerinsightscommon.SourcesKey: nil, + containerinsightscommon.Timestamp: nil, + containerinsightscommon.K8sKey: { + containerinsightscommon.HostKey: nil, + "labels": nil, + "pod_id": nil, + "pod_name": nil, + "pod_owners": nil, + "namespace": nil, + "container_name": nil, + "containerd": nil, + }, +} +var podLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.GpuDeviceKey: nil, + containerinsightscommon.MetricType: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.K8sNamespace: nil, + containerinsightscommon.FullPodNameKey: nil, + containerinsightscommon.PodNameKey: nil, + containerinsightscommon.TypeService: nil, + containerinsightscommon.GpuUniqueId: nil, + containerinsightscommon.InstanceTypeKey: nil, + containerinsightscommon.VersionKey: nil, + containerinsightscommon.SourcesKey: nil, + containerinsightscommon.Timestamp: nil, + containerinsightscommon.K8sKey: { + containerinsightscommon.HostKey: nil, + "labels": nil, + "pod_id": nil, + "pod_name": nil, + "pod_owners": nil, + "namespace": nil, + }, +} +var nodeLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.GpuDeviceKey: nil, + containerinsightscommon.MetricType: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.InstanceTypeKey: nil, + containerinsightscommon.VersionKey: nil, + containerinsightscommon.SourcesKey: nil, + containerinsightscommon.Timestamp: nil, + containerinsightscommon.K8sKey: { + containerinsightscommon.HostKey: nil, + }, +} + type gpuAttributesProcessor struct { *Config logger *zap.Logger awsNeuronMetricModifier *internal.AwsNeuronMetricModifier awsNeuronMemoryMetricAggregator *internal.AwsNeuronMemoryMetricsAggregator - awsNeuronMetricChecker *internal.AwsNeuronMetricChecker } func newGpuAttributesProcessor(config *Config, logger *zap.Logger) *gpuAttributesProcessor { @@ -58,7 +121,6 @@ func newGpuAttributesProcessor(config *Config, logger *zap.Logger) *gpuAttribute logger: logger, awsNeuronMetricModifier: internal.NewMetricModifier(logger), awsNeuronMemoryMetricAggregator: internal.NewMemoryMemoryAggregator(), - awsNeuronMetricChecker: internal.NewAwsNeuronMetricChecker(), } return d } @@ -77,6 +139,7 @@ func (d *gpuAttributesProcessor) processMetrics(_ context.Context, md pmetric.Me metricsLength := metrics.Len() for k := 0; k < metricsLength; k++ { m := metrics.At(k) + d.processGPUMetricAttributes(m) d.awsNeuronMemoryMetricAggregator.AggregateMemoryMetric(m) // non neuron metric is returned as a singleton list d.awsNeuronMetricModifier.ModifyMetric(m, metrics) @@ -85,54 +148,24 @@ func (d *gpuAttributesProcessor) processMetrics(_ context.Context, md pmetric.Me aggregatedMemoryMetric := d.awsNeuronMemoryMetricAggregator.FlushAggregatedMemoryMetric() d.awsNeuronMetricModifier.ModifyMetric(aggregatedMemoryMetric, metrics) } - - //loop over all metrics and filter labels - for k := 0; k < metrics.Len(); k++ { - m := metrics.At(k) - d.processMetricAttributes(m) - } } - - dropResourceMetricAttributes(rs) } return md, nil } -func (d *gpuAttributesProcessor) processMetricAttributes(m pmetric.Metric) { +func (d *gpuAttributesProcessor) processGPUMetricAttributes(m pmetric.Metric) { // only decorate GPU metrics - isGpuMetric := strings.Contains(m.Name(), gpuMetricIdentifier) - isNeuronMetric := d.awsNeuronMetricChecker.IsProcessedNeuronMetric(m.Name()) - if !isNeuronMetric && !isGpuMetric { + if !strings.Contains(m.Name(), gpuMetricIdentifier) { return } labelFilter := map[string]map[string]interface{}{} - if isGpuMetric { - if strings.HasPrefix(m.Name(), containerMetricPrefix) { - labelFilter = metricFilters.ContainerGpuLabelFilter - } else if strings.HasPrefix(m.Name(), podMetricPrefix) { - labelFilter = metricFilters.PodGpuLabelFilter - } else if strings.HasPrefix(m.Name(), nodeMetricPrefix) { - labelFilter = metricFilters.NodeGpuLabelFilter - } - } else if isNeuronMetric { - if strings.HasPrefix(m.Name(), containerMetricPrefix) { - labelFilter = metricFilters.ContainerNeuronLabelFilter - } else if strings.HasPrefix(m.Name(), podMetricPrefix) { - labelFilter = metricFilters.PodNeuronLabelFilter - } else if strings.HasPrefix(m.Name(), nodeMetricPrefix) { - labelFilter = metricFilters.NodeNeuronLabelFilter - } - - if strings.Contains(m.Name(), "_neurondevice_hw") { - if kubernetesMap, ok := labelFilter[internal.Kubernetes]; ok { - // cloning is done to avoid modifying the original label filters - labelFilter = maps.Clone(labelFilter) - kubernetesMap := maps.Clone(kubernetesMap) - delete(kubernetesMap, "labels") - labelFilter[internal.Kubernetes] = kubernetesMap - } - } + if strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) { + labelFilter = containerLabelFilter + } else if strings.HasPrefix(m.Name(), gpuPodMetricPrefix) { + labelFilter = podLabelFilter + } else if strings.HasPrefix(m.Name(), gpuNodeMetricPrefix) { + labelFilter = nodeLabelFilter } var dps pmetric.NumberDataPointSlice @@ -197,7 +230,7 @@ func (d *gpuAttributesProcessor) filterAttributes(attributes pcommon.Map, labels func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric.MetricSlice, resourceAttributes pcommon.Map) { metrics.RemoveIf(func(m pmetric.Metric) bool { isGpu := strings.Contains(m.Name(), gpuMetricIdentifier) - isContainerOrPod := strings.HasPrefix(m.Name(), containerMetricPrefix) || strings.HasPrefix(m.Name(), podMetricPrefix) + isContainerOrPod := strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) || strings.HasPrefix(m.Name(), gpuPodMetricPrefix) if !isGpu || !isContainerOrPod { return false } @@ -220,13 +253,3 @@ func (d *gpuAttributesProcessor) filterGpuMetricsWithoutPodName(metrics pmetric. return dps.Len() == 0 }) } - -func dropResourceMetricAttributes(resourceMetric pmetric.ResourceMetrics) { - serviceNameKey := "service.name" - attributes := resourceMetric.Resource().Attributes() - serviceName, exists := attributes.Get(serviceNameKey) - - if exists && (serviceName.Str() == "containerInsightsNeuronMonitorScraper" || serviceName.Str() == "containerInsightsDCGMExporterScraper") { - resourceMetric.Resource().Attributes().Clear() - } -} diff --git a/plugins/processors/gpuattributes/processor_test.go b/plugins/processors/gpuattributes/processor_test.go index d60a409504..a625945eda 100644 --- a/plugins/processors/gpuattributes/processor_test.go +++ b/plugins/processors/gpuattributes/processor_test.go @@ -13,7 +13,7 @@ import ( "go.uber.org/zap" ) -func TestProcessMetricsForGPUMetrics(t *testing.T) { +func TestProcessMetrics(t *testing.T) { logger, _ := zap.NewDevelopment() gp := newGpuAttributesProcessor(createDefaultConfig().(*Config), logger) ctx := context.Background() @@ -25,7 +25,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { want []map[string]string }{ "nonNode": { - metrics: generateGPUMetrics("prefix", []map[string]string{ + metrics: generateMetrics("prefix", []map[string]string{ { "ClusterName": "cluster", }, @@ -38,7 +38,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { }, }, "nodeDropSimple": { - metrics: generateGPUMetrics("node", []map[string]string{ + metrics: generateMetrics("node", []map[string]string{ { "ClusterName": "cluster", "Drop": "val", @@ -52,7 +52,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { }, }, "nodeDropJson": { - metrics: generateGPUMetrics("node", []map[string]string{ + metrics: generateMetrics("node", []map[string]string{ { "ClusterName": "cluster", "kubernetes": "{\"host\":\"test\"}", @@ -67,7 +67,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { }, }, "nodeDropMixed": { - metrics: generateGPUMetrics("node", []map[string]string{ + metrics: generateMetrics("node", []map[string]string{ { "ClusterName": "cluster", "Drop": "val", @@ -83,7 +83,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { }, }, "dropPodWithoutPodName": { - metrics: generateGPUMetrics("pod", []map[string]string{ + metrics: generateMetrics("pod", []map[string]string{ { "ClusterName": "cluster", "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", @@ -93,7 +93,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { want: []map[string]string{}, }, "keepPodWithPodName": { - metrics: generateGPUMetrics("pod", []map[string]string{ + metrics: generateMetrics("pod", []map[string]string{ { "ClusterName": "cluster", "PodName": "pod", @@ -110,7 +110,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { }, }, "dropContainerWithoutPodName": { - metrics: generateGPUMetrics("container", []map[string]string{ + metrics: generateMetrics("container", []map[string]string{ { "ClusterName": "cluster", "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", @@ -120,7 +120,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { want: []map[string]string{}, }, "keepContainerWithPodName": { - metrics: generateGPUMetrics("container", []map[string]string{ + metrics: generateMetrics("container", []map[string]string{ { "ClusterName": "cluster", "PodName": "pod", @@ -137,7 +137,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { }, }, "dropSingleDatapointWithoutPodName": { - metrics: generateGPUMetrics("container", []map[string]string{ + metrics: generateMetrics("container", []map[string]string{ { "ClusterName": "cluster", "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", @@ -158,7 +158,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { }, }, "keepAllDatapoints": { - metrics: generateGPUMetrics("container", []map[string]string{ + metrics: generateMetrics("container", []map[string]string{ { "ClusterName": "cluster", "PodName": "pod1", @@ -206,273 +206,7 @@ func TestProcessMetricsForGPUMetrics(t *testing.T) { } } -func TestProcessMetricsForNeuronMetrics(t *testing.T) { - logger, _ := zap.NewDevelopment() - gp := newGpuAttributesProcessor(createDefaultConfig().(*Config), logger) - ctx := context.Background() - - testcases := map[string]struct { - resource string - metrics pmetric.Metrics - wantMetricCnt int - want []map[string]string - }{ - "neuronMetricsProcessedWithNoPodCorrelation": { - metrics: generateNeuronMetrics("neuron_execution_latency", []map[string]string{ - { - "ClusterName": "cluster", - "Drop": "val", - "percentile": "p50", - "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", - }, - }), - wantMetricCnt: 2, - want: []map[string]string{ - // neuron_execution_latency - { - "ClusterName": "cluster", - "Drop": "val", - "percentile": "p50", - "runtime_tag": "DEFAULT", - "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", - }, - // node_neuron_execution_latency - { - "ClusterName": "cluster", - "Type": "NodeAWSNeuron", - "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", - }, - }, - }, - "neuronMetricsProcessedWithPodCorrelation": { - metrics: generateNeuronMetrics("neuroncore_memory_usage_constants", []map[string]string{ - { - "ClusterName": "cluster", - "Drop": "val", - "runtime_tag": "10", - "NeuronCore": "0", - "NeuronDevice": "0", - "PodName": "testPod", - "ContainerName": "testContainer", - "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", - }, - }), - wantMetricCnt: 7, - want: []map[string]string{ - // neuroncore_memory_usage_constants - { - "ClusterName": "cluster", - "Drop": "val", - "runtime_tag": "DEFAULT", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "PodName": "testPod", - "ContainerName": "testContainer", - "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", - }, - // container_neuroncore_memory_usage_constants - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "ContainerAWSNeuronCore", - "PodName": "testPod", - "ContainerName": "testContainer", - "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", - }, - // pod_neuroncore_memory_usage_constants - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "PodAWSNeuronCore", - "PodName": "testPod", - "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", - }, - // node_neuroncore_memory_usage_constants - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "NodeAWSNeuronCore", - "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", - }, - // container_neuroncore_memory_usage_total - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "ContainerAWSNeuronCore", - "PodName": "testPod", - "ContainerName": "testContainer", - "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", - }, - // pod_neuroncore_memory_usage_total - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "PodAWSNeuronCore", - "PodName": "testPod", - "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", - }, - // node_neuroncore_memory_usage_total - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "NodeAWSNeuronCore", - "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", - }, - }, - }, - "neuronMemoryMetricsAggregated": { - metrics: generateNeuronMetrics("neuroncore_memory_usage_constants", []map[string]string{ - { - "ClusterName": "cluster", - "Drop": "val", - "runtime_tag": "10", - "NeuronCore": "0", - "NeuronDevice": "0", - "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", - }, - }), - wantMetricCnt: 3, - want: []map[string]string{ - // neuroncore_memory_usage_constants - { - "ClusterName": "cluster", - "Drop": "val", - "runtime_tag": "DEFAULT", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", - }, - // node_neuroncore_memory_usage_constants - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "NodeAWSNeuronCore", - "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", - }, - // node_neuroncore_memory_usage_total - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "NodeAWSNeuronCore", - "kubernetes": "{\"host\":\"test\",\"labels\":\"label\"}", - }, - }, - }, - "neuronDeviceHardwareMetrics_labelsAreDropped": { - metrics: generateNeuronMetrics("neurondevice_hw_ecc_events", []map[string]string{ - { - "ClusterName": "cluster", - "Drop": "val", - "runtime_tag": "10", - "NeuronCore": "0", - "NeuronDevice": "0", - "event_type": "mem_ecc_corrected", - "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", - "PodName": "testPod", - "ContainerName": "testContainer", - }, - }), - wantMetricCnt: 7, - want: []map[string]string{ - // neurondevice_hw_ecc_events - { - "ClusterName": "cluster", - "Drop": "val", - "runtime_tag": "DEFAULT", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "event_type": "mem_ecc_corrected", - "kubernetes": "{\"host\":\"test\",\"drop\":\"2\",\"labels\":\"label\"}", - "PodName": "testPod", - "ContainerName": "testContainer", - }, - // container_neurondevice_hw_ecc_events_mem_ecc_corrected - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "ContainerAWSNeuronDevice", - "kubernetes": "{\"host\":\"test\"}", - "PodName": "testPod", - "ContainerName": "testContainer", - }, - // pod_neurondevice_hw_ecc_events_mem_ecc_corrected - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "PodAWSNeuronDevice", - "kubernetes": "{\"host\":\"test\"}", - "PodName": "testPod", - }, - // node_neurondevice_hw_ecc_events_mem_ecc_corrected - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "NodeAWSNeuronDevice", - "kubernetes": "{\"host\":\"test\"}", - }, - // container_neurondevice_hw_ecc_events_total - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "ContainerAWSNeuronDevice", - "kubernetes": "{\"host\":\"test\"}", - "PodName": "testPod", - "ContainerName": "testContainer", - }, - // pod_neurondevice_hw_ecc_events_total - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "PodAWSNeuronDevice", - "kubernetes": "{\"host\":\"test\"}", - "PodName": "testPod", - }, - // node_neurondevice_hw_ecc_events_total - { - "ClusterName": "cluster", - "NeuronCore": "core0", - "NeuronDevice": "device0", - "Type": "NodeAWSNeuronDevice", - "kubernetes": "{\"host\":\"test\"}", - }, - }, - }, - } - - for tname, tc := range testcases { - fmt.Printf("running %s\n", tname) - ms, _ := gp.processMetrics(ctx, tc.metrics) - assert.Equal(t, tc.wantMetricCnt, ms.MetricCount()) - if tc.wantMetricCnt > 0 { - resourceMetricsAttributes := ms.ResourceMetrics().At(0).Resource().Attributes() - assert.Equal(t, 0, resourceMetricsAttributes.Len()) - for i, dim := range tc.want { - dpAttr := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(i).Sum().DataPoints().At(0).Attributes() - assert.Equal(t, len(dim), dpAttr.Len()) - for k, v := range dim { - got, ok := dpAttr.Get(k) - assert.True(t, ok) - assert.Equal(t, v, got.Str()) - } - } - } - } -} - -func generateGPUMetrics(prefix string, dimensions []map[string]string) pmetric.Metrics { +func generateMetrics(prefix string, dimensions []map[string]string) pmetric.Metrics { md := pmetric.NewMetrics() ms := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() ms.SetName(prefix + gpuMetricIdentifier) @@ -486,19 +220,3 @@ func generateGPUMetrics(prefix string, dimensions []map[string]string) pmetric.M } return md } - -func generateNeuronMetrics(prefix string, dimensions []map[string]string) pmetric.Metrics { - md := pmetric.NewMetrics() - ms := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() - md.ResourceMetrics().At(0).Resource().Attributes().PutStr("service.name", "containerInsightsNeuronMonitorScraper") - ms.SetName(prefix) - dps := ms.SetEmptyGauge().DataPoints() - for _, dim := range dimensions { - dp := dps.AppendEmpty() - dp.SetIntValue(10) - for k, v := range dim { - dp.Attributes().PutStr(k, v) - } - } - return md -}