Skip to content

Commit

Permalink
docker-compose: switch to using newer HA deployment model
Browse files Browse the repository at this point in the history
It turns out that the HA test script was still working under the old HA
model, so it had to be reworked.
  • Loading branch information
paulfantom authored and JamesGuthrie committed May 18, 2022
1 parent ee7f76e commit 02838a6
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 35 deletions.
4 changes: 3 additions & 1 deletion docker-compose/high-availability/prometheus1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ global:
# external systems (federation, remote storage, Alertmanager).
external_labels:
cluster: 'monitoring-cluster'
__replica__: 'promscale_deploy_ha-promscale-connector1-1'
__replica__: 'prometheus1'

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
Expand All @@ -18,8 +18,10 @@ rule_files:

remote_write:
- url: "http://promscale-connector1:9201/write"
- url: "http://promscale-connector2:9201/write"
remote_read:
- url: "http://promscale-connector1:9201/read"
- url: "http://promscale-connector2:9201/read"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
Expand Down
4 changes: 3 additions & 1 deletion docker-compose/high-availability/prometheus2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,18 @@ global:
# external systems (federation, remote storage, Alertmanager).
external_labels:
cluster: 'monitoring-cluster'
__replica__: 'promscale_deploy_ha-promscale-connector2-1'
__replica__: 'prometheus2'

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first.rules"
# - "second.rules"

remote_write:
- url: "http://promscale-connector1:9201/write"
- url: "http://promscale-connector2:9201/write"
remote_read:
- url: "http://promscale-connector1:9201/read"
- url: "http://promscale-connector2:9201/read"

scrape_configs:
Expand Down
74 changes: 41 additions & 33 deletions docker-compose/high-availability/test.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
#!/usr/bin/env bash

# This test script exercises Promscale's HA leader election and failover code
# paths. In principle the setup is the following:
# - Two Prometheus instances, with `__replica__` labels of `prometheus1` and
# `prometheus2` respectively, _each_ remote-write to _both_ Promscale
# connector instances.
# - The Promscale connector instances should both choose the same "leader"
# prometheus instance. Only metrics from this instance are collected. The
# current leader is exposed through the `promscale_ha_cluster_leader_info`
# metric.
# - By starting and stopping the `prometheus1` and `prometheus2` instances, we
# can trigger a failover from one leader to the other.

set -euf -o pipefail

SELF_DIR=$(cd $(dirname ${0}) && pwd)
Expand All @@ -21,22 +33,23 @@ cleanup() {

trap cleanup EXIT

ha_line() {
LAST=`docker exec $1 wget -O - localhost:9201/metrics-text 2>&1 | grep -i "promscale_ha_cluster_leader_info.*1$" | tail -n 1`
echo "$LAST"
# Print the replica that a Promscale connector currently reports as HA leader.
# Arguments: $1 - name of the connector's docker container
# Outputs:   the `replica` label value of each
#            `promscale_ha_cluster_leader_info` line ending in 1 (one per
#            line); an empty line when no such line is present.
leader_name() {
  local leader
  # wget's stderr is merged in so that only grep decides what is kept.
  # The capture uses [^"]* instead of .* so that labels following `replica`
  # on the same line cannot leak into the result.
  # `|| true`: no matching line (grep exits non-zero) yields empty output.
  leader=$(docker exec "$1" wget -O - localhost:9201/metrics-text 2>&1 \
    | grep -i "promscale_ha_cluster_leader_info.*1$" \
    | sed 's/^.*replica="\([^"]*\)".*$/\1/') || true
  echo "$leader"
}

is_ingesting() {
LAST=$(ha_line $1)
if [[ "$LAST" == *"$1"* ]]; then
# Succeed only when BOTH Promscale connectors report the given replica as the
# current HA leader.
# Arguments: $1 - expected leader's replica name (e.g. prometheus1)
# Returns:   0 if both connectors name $1 as leader, non-zero otherwise
is_leader() {
  local leader1 leader2
  leader1=$(leader_name "promscale_deploy_ha-promscale-connector1-1")
  leader2=$(leader_name "promscale_deploy_ha-promscale-connector2-1")
  # "$1" is quoted on purpose: an unquoted right-hand side of == inside
  # [[ ]] is matched as a glob pattern rather than compared literally.
  [[ "$leader1" == "$1" && "$leader2" == "$1" ]]
}

is_not_ingesting() {
if is_ingesting $1; then
is_not_leader() {
if is_leader $1; then
false
else
true
Expand All @@ -57,48 +70,43 @@ wait_for() {
exit 1
}

wait_for_ingestion() {
echo "waiting for $1 to be in status $2"
wait_for_leader() {
echo "waiting for $1 to be leader"

for i in `seq 10` ; do
if $2 "$1"; then
echo "connector $1 $2"
if is_leader "$1"; then
echo "$1 is leader"
return
fi
sleep 10
done
echo "FAIL waiting for $1 $2"
echo "FAIL waiting for $1 to be leader"
exit 1
}

wait_for "promscale_deploy_ha-promscale-connector1-1"
wait_for "promscale_deploy_ha-promscale-connector2-1"

#make sure inital conditions are what we expect
if is_not_ingesting "promscale_deploy_ha-promscale-connector1-1"; then
docker stop "promscale_deploy_ha-promscale-connector2-1"
wait_for_ingestion "promscale_deploy_ha-promscale-connector1-1" "is_ingesting"
docker start "promscale_deploy_ha-promscale-connector2-1"
wait_for_ingestion "promscale_deploy_ha-promscale-connector2-1" "is_not_ingesting"
#make sure initial conditions are what we expect
if is_not_leader "prometheus1"; then
docker stop "promscale_deploy_ha-prometheus2-1"
wait_for_leader "prometheus1"
docker start "promscale_deploy_ha-prometheus2-1"
fi

echo "check initial condition"
is_ingesting "promscale_deploy_ha-promscale-connector1-1"
is_not_ingesting "promscale_deploy_ha-promscale-connector2-1"
echo "1: check initial condition"
is_leader "prometheus1"
is_not_leader "prometheus2"

echo "kill connector1's prometheus"
echo "2: kill prometheus1, prometheus2 becomes leader"
docker stop promscale_deploy_ha-prometheus1-1
wait_for_ingestion "promscale_deploy_ha-promscale-connector2-1" "is_ingesting"
wait_for_ingestion "promscale_deploy_ha-promscale-connector1-1" "is_not_ingesting"
wait_for_leader "prometheus2"

echo "bring prometheus 1 back and kill the current leader, connector 1 becomes leader again"
echo "3: bring prometheus 1 back and kill the current leader, prometheus 1 becomes leader again"
docker start promscale_deploy_ha-prometheus1-1
docker stop "promscale_deploy_ha-promscale-connector2-1"
wait_for_ingestion "promscale_deploy_ha-promscale-connector1-1" "is_ingesting"

echo "bring connector 2 up, it becomes follower"
docker start "promscale_deploy_ha-promscale-connector2-1"
wait_for "promscale_deploy_ha-promscale-connector2-1"
wait_for_ingestion "promscale_deploy_ha-promscale-connector2-1" "is_not_ingesting"
docker stop "promscale_deploy_ha-prometheus2-1"
wait_for_leader "prometheus1"

echo "SUCCESS"

echo "Stopping containers"

0 comments on commit 02838a6

Please sign in to comment.