From dc8f84bee3d044f993585e57cc65a27dd5226b72 Mon Sep 17 00:00:00 2001
From: Eero Tamminen
Date: Fri, 3 Jan 2025 13:55:34 +0200
Subject: [PATCH] Generalize BaseStatistics code a bit + document it

Signed-off-by: Eero Tamminen
---
 comps/cores/mega/base_statistics.py | 62 +++++++++--------------------
 comps/cores/telemetry/README.md     | 22 +++++++++-
 2 files changed, 38 insertions(+), 46 deletions(-)

diff --git a/comps/cores/mega/base_statistics.py b/comps/cores/mega/base_statistics.py
index b0285e73f3..62b8292c66 100644
--- a/comps/cores/mega/base_statistics.py
+++ b/comps/cores/mega/base_statistics.py
@@ -21,47 +21,23 @@ def append_latency(self, latency, first_token_latency=None):
         if first_token_latency:
             self.first_token_latencies.append(first_token_latency)
 
-    def calculate_statistics(self):
-        if not self.response_times:
-            return {
-                "p50_latency": None,
-                "p99_latency": None,
-                "average_latency": None,
-            }
-        # Calculate the P50 (median)
-        p50 = np.percentile(self.response_times, 50)
-
-        # Calculate the P99
-        p99 = np.percentile(self.response_times, 99)
-
-        avg = np.average(self.response_times)
-
-        return {
-            "p50_latency": p50,
-            "p99_latency": p99,
-            "average_latency": avg,
-        }
-
-    def calculate_first_token_statistics(self):
-        if not self.first_token_latencies:
-            return {
-                "p50_latency_first_token": None,
-                "p99_latency_first_token": None,
-                "average_latency_first_token": None,
-            }
-        # Calculate the P50 (median)
-        p50 = np.percentile(self.first_token_latencies, 50)
-
-        # Calculate the P99
-        p99 = np.percentile(self.first_token_latencies, 99)
-
-        avg = np.average(self.first_token_latencies)
-
-        return {
-            "p50_latency_first_token": p50,
-            "p99_latency_first_token": p99,
-            "average_latency_first_token": avg,
-        }
+    def _add_statistics(self, result, stats, suffix):
+        "add P50 (median), P99 and average values for 'stats' array to 'result' dict"
+        if stats:
+            result[f"p50_{suffix}"] = np.percentile(stats, 50)
+            result[f"p99_{suffix}"] = np.percentile(stats, 99)
+            result[f"average_{suffix}"] = np.average(stats)
+        else:
+            result[f"p50_{suffix}"] = None
+            result[f"p99_{suffix}"] = None
+            result[f"average_{suffix}"] = None
+
+    def get_statistics(self):
+        "return stats dict with P50, P99 and average values for first token and response timings"
+        result = {}
+        self._add_statistics(result, self.response_times, "latency")
+        self._add_statistics(result, self.first_token_latencies, "latency_first_token")
+        return result
 
 
 def register_statistics(
@@ -79,7 +55,5 @@ def collect_all_statistics():
     results = {}
     if statistics_dict:
         for name, statistic in statistics_dict.items():
-            tmp_dict = statistic.calculate_statistics()
-            tmp_dict.update(statistic.calculate_first_token_statistics())
-            results.update({name: tmp_dict})
+            results[name] = statistic.get_statistics()
     return results
diff --git a/comps/cores/telemetry/README.md b/comps/cores/telemetry/README.md
index 9a99e7f000..b644c94373 100644
--- a/comps/cores/telemetry/README.md
+++ b/comps/cores/telemetry/README.md
@@ -4,6 +4,19 @@ OPEA Comps currently provides telemetry functionalities for metrics and tracing
 
 ![opea telemetry](../assets/img/opea_telemetry.jpg)
 
+Contents:
+
+- [Metrics](#metrics)
+  - [HTTP metrics](#http-metrics)
+  - [Megaservice E2E metrics](#megaservice-e2e-metrics)
+  - [Inferencing metrics](#inferencing-metrics)
+  - [Metrics collection](#metrics-collection)
+- [Statistics](#statistics)
+- [Tracing](#tracing)
+- [Visualization](#visualization)
+- [Visualize metrics](#visualize-metrics)
+- [Visualize tracing](#visualize-tracing)
+
 ## Metrics
 
 OPEA microservice metrics are exported in Prometheus format under `/metrics` endpoint.
@@ -20,7 +33,7 @@ They can be fetched e.g. with `curl`:
 curl localhost:{port of your service}/metrics
 ```
 
-### HTTP Metrics
+### HTTP metrics
 
 Metrics output looks following:
 
@@ -54,7 +67,7 @@ Latency ones are histogram metrics i.e. include count, total value and set of va
 
 They are available only for _stream_ requests using LLM. Pending count accounts for all requests.
 
-### Inferencing Metrics
+### Inferencing metrics
 
 For example, you can `curl localhost:6006/metrics` to retrieve the TEI embedding metrics, and the output should look like follows:
 
@@ -95,6 +108,11 @@ Below are some default metrics endpoints for specific microservices:
 | TEI embedding | 6006 | /metrics | [link](https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/metrics) |
 | TEI reranking | 8808 | /metrics | [link](https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/metrics) |
 
+## Statistics
+
+Additionally, GenAIComps microservices provide a separate `/v1/statistics` endpoint, which outputs P50, P99 and average
+values for their response times, and for first token latencies, if the microservice measures them.
+
 ## Tracing
 
 OPEA use OpenTelemetry to trace function call stacks. To trace a function, add the `@opea_telemetry` decorator to either an async or sync function. The call stacks and time span data will be exported by OpenTelemetry. You can use Jaeger UI to visualize this tracing data.
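
For illustration (not part of the diff above), here is a minimal, standalone sketch of how the generalized statistics helpers behave. The `DemoStatistics` class and the sample latency values are hypothetical stand-ins for `BaseStatistics` and real request timings; the `_add_statistics`/`get_statistics` logic and the result key names mirror the new code in `base_statistics.py`:

```python
import numpy as np


class DemoStatistics:
    """Hypothetical stand-in for BaseStatistics, reduced to the statistics logic."""

    def __init__(self):
        self.response_times = []
        self.first_token_latencies = []

    def append_latency(self, latency, first_token_latency=None):
        # Record one request; first token latency is given only for streaming responses.
        self.response_times.append(latency)
        if first_token_latency:
            self.first_token_latencies.append(first_token_latency)

    def _add_statistics(self, result, stats, suffix):
        "add P50 (median), P99 and average values for 'stats' array to 'result' dict"
        if stats:
            result[f"p50_{suffix}"] = np.percentile(stats, 50)
            result[f"p99_{suffix}"] = np.percentile(stats, 99)
            result[f"average_{suffix}"] = np.average(stats)
        else:
            result[f"p50_{suffix}"] = None
            result[f"p99_{suffix}"] = None
            result[f"average_{suffix}"] = None

    def get_statistics(self):
        "return stats dict with P50, P99 and average values for first token and response timings"
        result = {}
        self._add_statistics(result, self.response_times, "latency")
        self._add_statistics(result, self.first_token_latencies, "latency_first_token")
        return result


if __name__ == "__main__":
    stats = DemoStatistics()
    # Made-up latencies in seconds: (total response time, first token latency).
    for total, first_token in [(0.8, 0.10), (1.2, 0.20), (0.9, 0.15)]:
        stats.append_latency(total, first_token)
    # Prints p50_latency, p99_latency, average_latency plus the *_latency_first_token variants.
    print(stats.get_statistics())
```

These are the same per-service dictionaries that `collect_all_statistics()` gathers (and that the `/v1/statistics` endpoint described in the README reports); each value stays `None` until the corresponding timings have been recorded.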