Merge pull request #648 from mendix/develop

Set garbage collector based on container memory or provided env variable
mendix · Jun 20, 2023 · 714b1ad · 714b1ad
2 parents 8be13f0 + fac9c0c
commit 714b1ad
Show file tree

Hide file tree

Showing 9 changed files with 352 additions and 78 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -29,7 +29,7 @@ jobs:
       - name: Get current version
         id: get-current-version
         run: |
-          CURRENT_VERSION_TAG=$(git tag --list --sort=-version:refname "v*" | head -n 1)
+          CURRENT_VERSION_TAG=$(git tag --list --merged HEAD --sort=-version:refname "v*" | head -n 1)
           echo "::set-output name=current_version_tag::${CURRENT_VERSION_TAG}"
       - name: Auto-generate future version
         id: autogenerate-version

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -5,6 +5,7 @@ on:
     types: [closed]
     branches:
       - master
+      - 'releases/**'
 
 jobs:
   pre:

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -8,6 +8,7 @@ on:
     branches:
       - develop
       - master
+      - 'releases/**'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}

diff --git a/README.md b/README.md
@@ -583,7 +583,7 @@ Note that:
 
 ## Telemetry Configuration
 
-The buildpack includes a variety of telemetry agents, and can configure logging for the Mendix Runtime.
+The buildpack includes a variety of telemetry agents that can be configured to collect and forward metrics/logs from the Mendix Runtime.
 
 ### New Relic
 
@@ -829,6 +829,43 @@ Example (1000 loglines/second):
 cf set-env <YOUR_APP> LOG_RATELIMIT '1000'
 ```
 
+### Custom Runtime Metrics filtering
+
+For the third-party integrations explained above, in addition to the metrics collected by the agents, custom runtime metrics are provided via telegraf.
+This configuration also has a filtering mechanism that allows users to specify metrics they allow or deny for the vendor they are using.
+To filter the ingestion of custom runtime metrics to third party APMs, users should provide a list of prefixes of the metrics they want to allow/deny using the environment variables listed below.
+
+Note: Custom database metrics cannot be filtered by name. To turn them off, set the `APPMETRICS_INCLUDE_DB` environment variable to `false`.
+
+#### APM_METRICS_FILTER_ALLOW
+
+Comma-separated list of prefixes for the metrics to be allowed. If this variable is not set, all metrics are allowed. If it is set and `APM_METRICS_FILTER_DENY` is not, all metrics that do not match one of the listed prefixes are denied.
+
+For example, to allow only the session metrics, `APM_METRICS_FILTER_ALLOW` should be set to `mx.runtime.stats.sessions`:
+
+```shell
+cf set-env <YOUR_APP> APM_METRICS_FILTER_ALLOW 'mx.runtime.stats.sessions'
+```
+
+#### APM_METRICS_FILTER_DENY
+
+Comma-separated list of prefixes for the metrics to be denied. 
+
+For example, to deny all metrics starting with jetty or mx.runtime, the environment variable should be set to `jetty,mx.runtime`:
+
+```shell
+cf set-env <YOUR_APP> APM_METRICS_FILTER_DENY 'jetty,mx.runtime'
+```
+
+#### APM_METRICS_FILTER_DENY_ALL
+
+If this environment variable is set to `true`, all metrics will be denied regardless of the values of `APM_METRICS_FILTER_ALLOW`, `APM_METRICS_FILTER_DENY`, and `APPMETRICS_INCLUDE_DB`.
+
+```shell
+cf set-env <YOUR_APP> APM_METRICS_FILTER_DENY_ALL true
+```
+
+
 ## Using the Buildpack without an Internet Connection
 
 If you are running Cloud Foundry without a connection to the Internet, you should specify an on-premises web server that hosts Mendix Runtime files and other buildpack dependencies. You can set the endpoint with the following environment variable:

diff --git a/buildpack/core/java.py b/buildpack/core/java.py
@@ -11,6 +11,8 @@
 
 
 JAVA_VERSION_OVERRIDE_KEY = "JAVA_VERSION"
+DEFAULT_GC_COLLECTOR = "Serial"
+SUPPORTED_GC_COLLECTORS = ["Serial", "G1"]
 
 
 def get_java_major_version(runtime_version):
@@ -282,13 +284,7 @@ def _set_user_provided_java_options(m2ee):
 
 
 def _set_jvm_memory(m2ee, vcap):
-    max_memory = os.environ.get("MEMORY_LIMIT")
-
-    if max_memory:
-        match = re.search("([0-9]+)M", max_memory.upper())
-        limit = int(match.group(1))
-    else:
-        limit = int(vcap["limits"]["mem"])
+    limit = get_memory_limit(vcap)
 
     if limit >= 32768:
         heap_size = limit - 4096
@@ -335,12 +331,48 @@ def _set_jvm_memory(m2ee, vcap):
         )
 
 
+def _set_garbage_collector(m2ee, vcap_data):
+    """Select the JVM garbage collector and add its flag to the Java options.
+
+    The collector starts as DEFAULT_GC_COLLECTOR and is switched to G1 when
+    the memory limit returned by get_memory_limit() is 4096 or more. The
+    JVM_GARBAGE_COLLECTOR environment variable overrides that choice when its
+    value names one of SUPPORTED_GC_COLLECTORS (matched case-insensitively,
+    surrounding white space ignored). An unsupported value is ignored with a
+    warning and the memory-based choice is kept.
+    """
+    limit = get_memory_limit(vcap_data)
+
+    jvm_garbage_collector = DEFAULT_GC_COLLECTOR
+    if limit >= 4096:
+        # override collector if memory >= 4G
+        jvm_garbage_collector = "G1"
+
+    env_jvm_garbage_collector = os.getenv("JVM_GARBAGE_COLLECTOR")
+    if env_jvm_garbage_collector:
+        # The flag built below embeds the collector name verbatim, so map
+        # whatever spelling the user provided back to the canonical one.
+        canonical_names = {
+            name.lower(): name for name in SUPPORTED_GC_COLLECTORS
+        }
+        requested = canonical_names.get(
+            env_jvm_garbage_collector.strip().lower()
+        )
+        if requested:
+            # override from user-provided variable
+            jvm_garbage_collector = requested
+        else:
+            logging.warning("Unsupported jvm garbage collector found. The specified "
+                            "garbage collector [%s] is not supported. JVM garbage "
+                            "collector type falling back to default [%s]",
+                            env_jvm_garbage_collector, jvm_garbage_collector)
+
+    util.upsert_javaopts(m2ee, f"-XX:+Use{jvm_garbage_collector}GC")
+
+    logging.info("JVM garbage collector is set to [%s]", jvm_garbage_collector)
+
+
+def get_memory_limit(vcap):
+    """Return the memory limit as an integer.
+
+    The MEMORY_LIMIT environment variable takes precedence: the digits that
+    directly precede an "M" (case-insensitive) are used, e.g. "512m" -> 512.
+    When the variable is unset, empty, or not in that format, the value of
+    vcap["limits"]["mem"] is used instead.
+
+    Raises KeyError/TypeError/ValueError if the vcap fallback is needed but
+    vcap does not carry an integer-convertible ["limits"]["mem"] entry.
+    """
+    max_memory = os.environ.get("MEMORY_LIMIT")
+
+    if max_memory:
+        match = re.search("([0-9]+)M", max_memory.upper())
+        if match:
+            return int(match.group(1))
+        # re.search returned None: do not crash on match.group(), fall back
+        # to the limit reported in the VCAP data instead.
+        logging.warning(
+            "MEMORY_LIMIT [%s] is not in the expected '<number>M' format, "
+            "using the memory limit from the VCAP data instead",
+            max_memory,
+        )
+
+    return int(vcap["limits"]["mem"])
+
+
 def _set_application_name(m2ee, application_name):
     # Passes the application name to the JVM as the `applicationName`
     # system property (-D flag) through util.upsert_javaopts.
     util.upsert_javaopts(m2ee, f"-DapplicationName={application_name}")
 
 
 def update_config(m2ee, application_name, vcap_data, runtime_version):
     _set_application_name(m2ee, application_name)
     _set_jvm_memory(m2ee, vcap_data)
+    _set_garbage_collector(m2ee, vcap_data)
     _set_jvm_locale(m2ee, get_java_major_version(runtime_version))
     _set_user_provided_java_options(m2ee)
diff --git a/buildpack/telemetry/metrics.py b/buildpack/telemetry/metrics.py
@@ -20,69 +20,6 @@
 
 from . import datadog, appdynamics, dynatrace
 
-# Runtime configuration for influx registry
-# This enables the new stream of metrics coming from micrometer instead
-# of the admin port.
-# https://docs.mendix.com/refguide/metrics#registries-configuration
-# NOTE: Metrics are usually dot separated. But each registry has its
-# own naming format. For instance, a metric like
-# `a.name.like.this` would appear as `a_name_like_this` in
-# influx-formatted metrics output. Hence the filter names uses the
-# dot-separated metric names.
-INFLUX_REGISTRY = {
-    "type": "influx",
-    "settings": {
-        "uri": "http://localhost:8086",
-        "db": "mendix",
-        "step": "10s",
-    },
-    "filters": [
-        # Login metrics needs to be enabled explicitly as it's disabled
-        # by default
-        {
-            "type": "nameStartsWith",
-            "result": "accept",
-            "values": ["mx.runtime.user.login"],
-        },
-        # Filter out irrelevant metrics to reduce
-        # the payload size passed to TSS/TFR
-        # https://docs.mendix.com/refguide/metrics#filters
-        {
-            "type": "nameStartsWith",
-            "result": "deny",
-            "values": ["commons.pool", "jvm.buffer"],
-        },
-    ],
-}
-
-STATSD_REGISTRY = {
-    "type": "statsd",
-    "settings": {"port": datadog.get_statsd_port()},
-}
-
-# For freeapps we push only the session & login metrics
-FREEAPPS_METRICS_REGISTRY = [
-    {
-        "type": "influx",
-        "settings": {
-            "uri": "http://localhost:8086",
-            "db": "mendix",
-            "step": "10s",
-        },
-        "filters": [
-            {
-                "type": "nameStartsWith",
-                "result": "accept",
-                "values": [
-                    "mx.runtime.stats.sessions",
-                    "mx.runtime.user.login",
-                ],
-            },
-            {"type": "nameStartsWith", "result": "deny", "values": [""]},
-        ],
-    }
-]
-
 METRICS_REGISTRIES_KEY = "Metrics.Registries"
 
 # From this MxRuntime version onwards we gather (available) runtime statistics
@@ -186,21 +123,144 @@ def configure_metrics_registry(m2ee):
 
     logging.info("Configuring runtime to push metrics to influx via micrometer")
     if util.is_free_app():
-        return FREEAPPS_METRICS_REGISTRY
+        return [get_freeapps_registry()]
 
-    paidapps_registries = [INFLUX_REGISTRY]
+    paidapps_registries = [get_influx_registry()]
 
     if (
         datadog.is_enabled()
         or get_appmetrics_target()
         or appdynamics.machine_agent_enabled()
         or dynatrace.is_telegraf_enabled()
     ):
-        paidapps_registries.append(STATSD_REGISTRY)
+        allow_list, deny_list = get_apm_filters()
+        paidapps_registries.append(get_statsd_registry(allow_list, deny_list))
 
     return paidapps_registries
 
 
+def get_apm_filters():
+    """Return the (allow_list, deny_list) metric prefix filters for APMs.
+
+    When deny_all_apm_metrics() is truthy nothing is allowed and the deny
+    list holds the empty prefix. Otherwise both lists are derived from the
+    APM_METRICS_FILTER_ALLOW and APM_METRICS_FILTER_DENY environment
+    variables via sanitize_metrics_filter(). The resulting lists are logged
+    before they are returned.
+    """
+    if deny_all_apm_metrics():
+        allow_list, deny_list = [], [""]
+    else:
+        allow_env = os.getenv("APM_METRICS_FILTER_ALLOW")
+        deny_env = os.getenv("APM_METRICS_FILTER_DENY")
+
+        # if only allowed metrics are specified, deny all the others
+        if deny_env is None and allow_env:
+            deny_env = ""
+
+        allow_list, deny_list = (
+            sanitize_metrics_filter(allow_env),
+            sanitize_metrics_filter(deny_env),
+        )
+
+    logging.info(
+        "For APM integrations; allowed metric prefixes are: %s, "
+        "and denied metric prefixes are: %s",
+        allow_list,
+        deny_list,
+    )
+
+    return allow_list, deny_list
+
+
+def deny_all_apm_metrics():
+    """Return strtobool() of APM_METRICS_FILTER_DENY_ALL ("false" if unset)."""
+    raw_flag = os.getenv("APM_METRICS_FILTER_DENY_ALL", default="false")
+    return strtobool(raw_flag)
+
+
+def sanitize_metrics_filter(metric_filter):
+    """
+    Turn a comma-separated string of metric name prefixes into a list.
+
+    If we use empty string ("") in the filters that we use for statsd registry,
+    it accepts/denies every metric since we use type as `nameStartsWith`.
+    To prevent breaking the functionality because of this, we need to make sure
+    that we pass empty string to the registry filters only if it's intentional.
+    So, all white space is removed and the empty entries produced by leading,
+    trailing or repeated commas are dropped.
+
+    Returns [] when metric_filter is None, [""] (the match-everything prefix)
+    when no prefix is left after cleaning, and the list of prefixes otherwise.
+    """
+    if metric_filter is None:
+        return []
+    # str.split() without arguments splits on any run of white space, so
+    # joining the pieces removes spaces, tabs and newlines alike.
+    compact = "".join(metric_filter.split())
+    prefixes = [prefix for prefix in compact.split(",") if prefix]
+    # An empty (or commas-only) value keeps its original meaning: [""]
+    return prefixes or [""]
+
+
+def get_influx_registry():
+    """Return the runtime configuration for the influx micrometer registry.
+
+    This enables the new stream of metrics coming from micrometer instead
+    of the admin port.
+    https://docs.mendix.com/refguide/metrics#registries-configuration
+
+    NOTE: Metrics are usually dot separated, but each registry has its own
+    naming format. For instance, a metric like `a.name.like.this` would
+    appear as `a_name_like_this` in influx-formatted metrics output. Hence
+    the filters use the dot-separated metric names.
+    """
+    connection = {
+        "uri": "http://localhost:8086",
+        "db": "mendix",
+        "step": "10s",
+    }
+
+    # Login metrics need to be enabled explicitly as they are disabled
+    # by default
+    accept_login = {
+        "type": "nameStartsWith",
+        "result": "accept",
+        "values": ["mx.runtime.user.login"],
+    }
+
+    # Filter out irrelevant metrics to reduce
+    # the payload size passed to TSS/TFR
+    # https://docs.mendix.com/refguide/metrics#filters
+    deny_irrelevant = {
+        "type": "nameStartsWith",
+        "result": "deny",
+        "values": ["commons.pool", "jvm.buffer"],
+    }
+
+    return {
+        "type": "influx",
+        "settings": connection,
+        "filters": [accept_login, deny_irrelevant],
+    }
+
+
+def get_statsd_registry(allow_list, deny_list):
+    """Return the statsd registry configuration with the given filters.
+
+    allow_list and deny_list are used as the `values` of a `nameStartsWith`
+    accept filter and deny filter respectively (in that order). The port is
+    taken from datadog.get_statsd_port().
+    """
+    registry = {"type": "statsd"}
+    registry["settings"] = {"port": datadog.get_statsd_port()}
+    registry["filters"] = [
+        {"type": "nameStartsWith", "result": verdict, "values": prefixes}
+        for verdict, prefixes in (("accept", allow_list), ("deny", deny_list))
+    ]
+    return registry
+
+
+def get_freeapps_registry():
+    """Return the influx registry configuration used for free apps.
+
+    For freeapps we push only the session & login metrics: those two
+    prefixes are accepted and the empty prefix denies everything else.
+    """
+    accepted_prefixes = [
+        "mx.runtime.stats.sessions",
+        "mx.runtime.user.login",
+    ]
+    accept_filter = {
+        "type": "nameStartsWith",
+        "result": "accept",
+        "values": accepted_prefixes,
+    }
+    deny_everything_else = {
+        "type": "nameStartsWith",
+        "result": "deny",
+        "values": [""],
+    }
+    return {
+        "type": "influx",
+        "settings": {
+            "uri": "http://localhost:8086",
+            "db": "mendix",
+            "step": "10s",
+        },
+        "filters": [accept_filter, deny_everything_else],
+    }
+
+
 def bypass_loggregator():
     env_var = os.getenv("BYPASS_LOGGREGATOR", "False")
     # Throws a useful message if you put in a nonsensical value.

diff --git a/buildpack/telemetry/telegraf.py b/buildpack/telemetry/telegraf.py
@@ -75,6 +75,8 @@ def _get_config_file_path(version):
 
 
 def include_db_metrics():
+    if metrics.deny_all_apm_metrics():
+        return False
     if util.is_free_app():
         # For free apps we are not interested in database metrics
         return False
@@ -169,7 +171,7 @@ def _get_http_outputs():
 
 
 def _get_db_config():
-    if (include_db_metrics() or datadog.get_api_key()) and util.is_cluster_leader():
+    if include_db_metrics() and util.is_cluster_leader():
         db_config = database.get_config()
         if db_config and db_config["DatabaseType"] == "PostgreSQL":
             return db_config

diff --git a/etc/telegraf/telegraf.toml.j2 b/etc/telegraf/telegraf.toml.j2
@@ -430,7 +430,7 @@
   # Higher flush interval and batch size, so that we
   # don't bombard TFR and DataLake with too many requests, but
   # few requests with bigger payloads
-  metric_batch_size = 3000
+  metric_batch_size = 2500
   flush_interval = "30s"
 
   ## HTTP method, one of: "POST" or "PUT"
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,7 @@ on: @@
         types: [closed]
         branches:
           - master
+          - 'releases/**'
     jobs:
       pre:
@@ Expand Down @@