Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add metric label for scache load #3516

Merged
merged 1 commit into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions docs/references/observability.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,10 @@ The query time in seconds of the last schema cache load.
pgrst_schema_cache_loads_total
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

======== =======
**Type** Counter
======== =======
========== ==========================
**Type** Counter
**Labels** ``status``: SUCCESS | FAIL
========== ==========================

The total number of times the schema cache was loaded.

Expand Down
38 changes: 20 additions & 18 deletions src/PostgREST/Metrics.hs
Original file line number Diff line number Diff line change
Expand Up @@ -8,47 +8,49 @@ module PostgREST.Metrics
import qualified Data.ByteString.Lazy as LBS
import qualified Hasql.Pool.Observation as SQL

import qualified Prometheus as Prom
import Prometheus

import PostgREST.Observation

import Protolude

-- | Handles to the Prometheus metrics this module maintains. Fields are
-- positional, in the order 'init' passes them: pool timeouts, pool available,
-- pool waiting, pool max size, schema cache loads, schema cache query time.
-- NOTE(review): the two constructor lines below are the before/after versions
-- of a single diff line; the second one makes the schema cache loads counter a
-- 'Vector' with one label.
data MetricsState =
MetricsState Prom.Counter Prom.Gauge Prom.Gauge Prom.Gauge Prom.Counter Prom.Gauge
MetricsState Counter Gauge Gauge Gauge (Vector Label1 Counter) Gauge

-- | Register all metrics and return their handles as a 'MetricsState'.
-- The @configDbPoolSize@ argument is written once into the @pgrst_db_pool_max@
-- gauge.
-- NOTE(review): this span is a rendered diff; the @Prom.@-qualified lines are
-- the pre-change versions of the unqualified lines that follow them.
init :: Int -> IO MetricsState
init configDbPoolSize = do
poolTimeouts <- Prom.register $ Prom.counter (Prom.Info "pgrst_db_pool_timeouts_total" "The total number of pool connection timeouts")
poolAvailable <- Prom.register $ Prom.gauge (Prom.Info "pgrst_db_pool_available" "Available connections in the pool")
poolWaiting <- Prom.register $ Prom.gauge (Prom.Info "pgrst_db_pool_waiting" "Requests waiting to acquire a pool connection")
poolMaxSize <- Prom.register $ Prom.gauge (Prom.Info "pgrst_db_pool_max" "Max pool connections")
schemaCacheLoads <- Prom.register $ Prom.counter (Prom.Info "pgrst_schema_cache_loads_total" "The total number of times the schema cache was loaded")
schemaCacheQueryTime <- Prom.register $ Prom.gauge (Prom.Info "pgrst_schema_cache_query_time_seconds" "The query time in seconds of the last schema cache load")
Prom.setGauge poolMaxSize (fromIntegral configDbPoolSize)
poolTimeouts <- register $ counter (Info "pgrst_db_pool_timeouts_total" "The total number of pool connection timeouts")
poolAvailable <- register $ gauge (Info "pgrst_db_pool_available" "Available connections in the pool")
poolWaiting <- register $ gauge (Info "pgrst_db_pool_waiting" "Requests waiting to acquire a pool connection")
poolMaxSize <- register $ gauge (Info "pgrst_db_pool_max" "Max pool connections")
-- The post-change version wraps the counter in a vector with a "status" label.
schemaCacheLoads <- register $ vector "status" $ counter (Info "pgrst_schema_cache_loads_total" "The total number of times the schema cache was loaded")
schemaCacheQueryTime <- register $ gauge (Info "pgrst_schema_cache_query_time_seconds" "The query time in seconds of the last schema cache load")
setGauge poolMaxSize (fromIntegral configDbPoolSize)
pure $ MetricsState poolTimeouts poolAvailable poolWaiting poolMaxSize schemaCacheLoads schemaCacheQueryTime

-- | Update the metrics in a 'MetricsState' in response to one observation.
--
-- * 'PoolAcqTimeoutObs' increments the pool timeouts counter.
-- * 'HasqlPoolObs' connection statuses increment or decrement the
--   pool-available gauge; the connecting status changes nothing.
-- * 'PoolRequest' / 'PoolRequestFullfilled' increment / decrement the
--   pool-waiting gauge.
-- * 'SchemaCacheLoadedObs' counts a schema cache load and records its query
--   time; in the post-change lines the count carries the label @SUCCESS@.
-- * 'SchemaCacheNormalErrorObs' (post-change only) counts a load labelled
--   @FAIL@.
-- * Every other observation is ignored.
--
-- NOTE(review): this span is a rendered diff; each @Prom.@-qualified action is
-- the pre-change version of the unqualified action on the next line.
observationMetrics :: MetricsState -> ObservationHandler
observationMetrics (MetricsState poolTimeouts poolAvailable poolWaiting _ schemaCacheLoads schemaCacheQueryTime) obs = case obs of
(PoolAcqTimeoutObs _) -> do
Prom.incCounter poolTimeouts
incCounter poolTimeouts
(HasqlPoolObs (SQL.ConnectionObservation _ status)) -> case status of
SQL.ReadyForUseConnectionStatus -> do
Prom.incGauge poolAvailable
incGauge poolAvailable
SQL.InUseConnectionStatus -> do
Prom.decGauge poolAvailable
decGauge poolAvailable
SQL.TerminatedConnectionStatus _ -> do
Prom.decGauge poolAvailable
decGauge poolAvailable
SQL.ConnectingConnectionStatus -> pure ()
PoolRequest ->
Prom.incGauge poolWaiting
incGauge poolWaiting
PoolRequestFullfilled ->
Prom.decGauge poolWaiting
decGauge poolWaiting
SchemaCacheLoadedObs resTime -> do
Prom.incCounter schemaCacheLoads
Prom.setGauge schemaCacheQueryTime resTime
withLabel schemaCacheLoads "SUCCESS" incCounter
setGauge schemaCacheQueryTime resTime
SchemaCacheNormalErrorObs _ -> do
withLabel schemaCacheLoads "FAIL" incCounter
_ ->
pure ()

-- | Produce the current metrics as a lazy 'LBS.ByteString' by delegating to
-- @exportMetricsAsText@.
-- NOTE(review): the two definition lines below are the before/after versions
-- of a single diff line (qualified vs. unqualified import).
metricsToText :: IO LBS.ByteString
metricsToText = Prom.exportMetricsAsText
metricsToText = exportMetricsAsText
40 changes: 39 additions & 1 deletion test/io/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,44 @@ def test_admin_ready_includes_schema_cache_state(defaultenv, metapostgrest):
reset_statement_timeout(metapostgrest, role)


def test_metrics_include_schema_cache_fails(defaultenv, metapostgrest):
    "Should get schema cache fails from the metrics endpoint"

    role = "timeout_authenticator"

    env = {
        **defaultenv,
        "PGUSER": role,
        "PGRST_INTERNAL_SCHEMA_CACHE_SLEEP": "50",
    }

    with run(env=env) as postgrest:
        # The schema cache query takes at least 50ms, due to
        # PGRST_INTERNAL_SCHEMA_CACHE_SLEEP above. Make it impossible to load
        # the schema cache by setting the statement timeout to 20ms.
        set_statement_timeout(metapostgrest, role, 20)

        try:
            # force a reconnection so the new role setting is picked up
            postgrest.process.send_signal(signal.SIGUSR1)

            # wait for some schema cache retries
            time.sleep(1)

            response = postgrest.admin.get("/ready", timeout=1)
            assert response.status_code == 503

            response = postgrest.admin.get("/metrics", timeout=1)
            assert response.status_code == 200

            # Look the counter up first so a missing metric line fails with a
            # readable assertion instead of an AttributeError on None.
            match = re.search(
                r'pgrst_schema_cache_loads_total{status="FAIL"} (\d+)', response.text
            )
            assert match is not None, "FAIL-labelled schema cache counter missing"

            # one second of retries should have produced several failed loads
            assert float(match.group(1)) > 3.0
        finally:
            # Undo the role's statement timeout even when an assertion above
            # fails (presumably the setting persists on the role and would
            # otherwise affect later tests; verify against the helper).
            reset_statement_timeout(metapostgrest, role)


def test_admin_not_found(defaultenv):
"Should get a not found from a undefined endpoint on the admin server"

Expand Down Expand Up @@ -1448,7 +1486,7 @@ def test_admin_metrics(defaultenv):
response = postgrest.admin.get("/metrics")
assert response.status_code == 200
assert "pgrst_schema_cache_query_time_seconds" in response.text
assert "pgrst_schema_cache_loads_total" in response.text
assert 'pgrst_schema_cache_loads_total{status="SUCCESS"}' in response.text
assert "pgrst_db_pool_max" in response.text
assert "pgrst_db_pool_waiting" in response.text
assert "pgrst_db_pool_available" in response.text
Expand Down
Loading