Merge branch 'main' into dependabot/npm_and_yarn/rollout-dashboard/frontend/rollup-4.22.4
sasa-tomic authored Oct 25, 2024
2 parents f0b1451 + 770efc6 commit 3908854
Showing 9 changed files with 1,300 additions and 526 deletions.
2 changes: 1 addition & 1 deletion bin/airflow
@@ -7,7 +7,7 @@ export PYTHONPATH=$(dirname "$SCRIPT_DIR")/shared

if [ "$1" == "setup" ]
then
-        python -m venv "$VENV_DIR"
+        /usr/bin/python3 -m venv "$VENV_DIR"
"$VENV_DIR"/bin/pip3 install "apache-airflow[celery]==2.9.1" \
apache-airflow-providers-slack[common.sql] \
apache-airflow-providers-google \
15 changes: 6 additions & 9 deletions plugins/operators/ic_os_rollout.py
@@ -209,8 +209,7 @@ def __init__(
self.source_task_id = source_task_id
dr_dre_slack_id = DR_DRE_SLACK_ID
text = (
-            (
-                """Proposal <{{
+            """Proposal <{{
task_instance.xcom_pull(
task_ids='%(source_task_id)s',
map_indexes=task_instance.map_index,
@@ -221,10 +220,8 @@
map_indexes=task_instance.map_index,
).proposal_id
}}> is now up for voting. <!subteam^%(dr_dre_slack_id)s>"""
""" please vote for the proposal using your HSM."""
)
% locals()
)
""" please vote for the proposal using your HSM."""
) % locals()
slack.SlackAPIPostOperator.__init__(
self,
channel=SLACK_CHANNEL,
@@ -257,10 +254,10 @@ def __init__(
) -> None:
dr_dre_slack_id = DR_DRE_SLACK_ID
text = (
"""Subnet `%(subnet_id)s` has not finished upgrading in over an hour."""
""" <!subteam^%(dr_dre_slack_id)s>"""
f"""Subnet `{subnet_id}` has not finished upgrading in over an hour."""
f""" <!subteam^{dr_dre_slack_id}>"""
""" please investigate *as soon as possible*."""
-        ) % locals()
+        )
slack.SlackAPIPostOperator.__init__(
self,
channel=SLACK_CHANNEL,
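
For context, the last hunk above replaces %(name)s placeholders filled in from locals() with f-strings, so subnet_id and dr_dre_slack_id are interpolated where the message literal is written. A minimal standalone sketch of the two styles (the example values are made up, not taken from the repository):

subnet_id = "tdb26"          # placeholder value for illustration
dr_dre_slack_id = "S123ABC"  # placeholder value for illustration

# Old style: %-placeholders resolved from locals() after the fact.
old_text = (
    """Subnet `%(subnet_id)s` has not finished upgrading in over an hour."""
    """ <!subteam^%(dr_dre_slack_id)s>"""
) % locals()

# New style: f-strings interpolate the names directly.
new_text = (
    f"""Subnet `{subnet_id}` has not finished upgrading in over an hour."""
    f""" <!subteam^{dr_dre_slack_id}>"""
)

assert old_text == new_text  # both render the same Slack message

Both forms rely on implicit concatenation of adjacent string literals; the f-string version simply moves the substitution to the point where the literal is defined.
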
67 changes: 39 additions & 28 deletions plugins/sensors/ic_os_rollout.py
@@ -292,7 +292,9 @@ def execute(self, context: Context, event: Any = None) -> None:
+ f'ic_subnet="{subnet_id}"'
+ "}) by (ic_active_version, ic_subnet)"
)
self.log.info(f"Querying Prometheus servers: {query}")
print("::group::Querying Prometheus servers")
self.log.info(query)
print("::endgroup::")
res = prom.query_prometheus_servers(self.network.prometheus_urls, query)
if len(res) == 1 and res[0]["metric"]["ic_active_version"] == git_revision:
current_replica_count = int(res[0]["value"])
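
For context, this hunk (and the two similar ones below) replaces a single combined log line with ::group:: / ::endgroup:: markers printed around the query, so log viewers that recognize those collapsible-section markers can fold the often long PromQL expression. A small sketch of the same pattern wrapped in a reusable context manager; the log_group helper is an illustrative assumption, the repository simply calls print() inline as shown above:

from contextlib import contextmanager
from typing import Iterator

@contextmanager
def log_group(title: str) -> Iterator[None]:
    # Emit the same collapsible-group markers the hunk above prints inline.
    print(f"::group::{title}")
    try:
        yield
    finally:
        print("::endgroup::")

# Hypothetical usage mirroring the change above:
# with log_group("Querying Prometheus servers"):
#     self.log.info(query)
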
@@ -345,43 +347,48 @@ def send_notification_if_necessary(subnet_id: str) -> None:
# a message to Slack notifying the DRE operator that a subnet
# has not exited the alerts condition in over an hour.
now = time.time()
-            first_alert_check_timestamp = context["task_instance"].xcom_pull(
-                key="first_alert_check_timestamp",
+            key = "alert_check_timestamp"
+            task_id = context["task_instance"].task_id
+            self.log.info(
+                "Pulling alert check timestamp from xcom for %s %s %s",
+                key,
+                task_id,
+                context["task_instance"].map_index,
+            )
+            alert_check_timestamp = context["task_instance"].xcom_pull(
+                key=key,
+                task_ids=task_id,
map_indexes=context["task_instance"].map_index,
)
-            if not first_alert_check_timestamp:
+            self.log.info(
+                "Here is the current alert check timestamp: %r", alert_check_timestamp
+            )
+            if not alert_check_timestamp:
+                # Value is not yet xcommed. Xcom it now.
+                deadline = now + SUBNET_UPDATE_STALL_TIMEOUT_SECONDS
self.log.info(
"Notification routine not yet run; storing timestamp %s",
now,
)
# Value is not yet xcommed.
context["task_instance"].xcom_push(
key="first_alert_check_timestamp",
value=now,
"Notification deadline not initialized, storing %s", deadline
)
context["task_instance"].xcom_push(key=key, value=deadline)
else:
-                self.log.info(
-                    "Notification routine already ran at %r",
-                    first_alert_check_timestamp,
-                )
-                first_alert_check_timestamp = float(first_alert_check_timestamp)
-                if (
-                    first_alert_check_timestamp
-                    > now + SUBNET_UPDATE_STALL_TIMEOUT_SECONDS
-                ):
+                deadline = float(alert_check_timestamp)
+                if now > deadline:
+                    # Value is xcommed and is old enough.
+                    deadline = now + SUBNET_UPDATE_STALL_TIMEOUT_SECONDS
self.log.info(
"Routine ran over %s seconds ago, notifying",
now - first_alert_check_timestamp,
"Notification deadline has been hit, notifying"
" and resetting deadline to %s",
deadline,
)
-                    # Value is xcommed and is old enough.
+                    # Send message here.
NotifyAboutStalledSubnet(
task_id="notify_about_stalled_subnet",
subnet_id=subnet_id,
).execute(context=context)
-                    # send message here, then
+                    # Remember new deadline.
context["task_instance"].xcom_push(
key="first_alert_check_timestamp",
value=now + 3600,
key=key,
value=deadline,
)

subnet_id, git_revision = subnet_id_and_git_revision_from_args(
@@ -401,7 +408,9 @@ def send_notification_if_necessary(subnet_id: str) -> None:
"subnet_id": subnet_id,
}
)
self.log.info(f"Querying Prometheus servers: {query}")
print("::group::Querying Prometheus servers")
self.log.info(query)
print("::endgroup::")
res = prom.query_prometheus_servers(self.network.prometheus_urls, query)
if len(res) > 0:
self.log.info("There are still Prometheus alerts on the subnet:")
@@ -626,7 +635,9 @@ def execute(self, context: Context, event: Any = None) -> None:
f" last 1 day before {subnet_id}"
)
query = "sum(changes(ic_replica_info{" + f'ic_subnet="{other}"' + "}[1d]))"
self.log.info(f"Querying Prometheus servers: {query}")
print("::group::Querying Prometheus servers")
self.log.info(query)
print("::endgroup::")
res = prom.query_prometheus_servers(self.network.prometheus_urls, query)
if not res:
raise RuntimeError(("Prometheus returned no sum of updates: %r" % res,))
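
For context, the largest hunk above reworks the stalled-subnet reminder inside send_notification_if_necessary: instead of recording the time of the first alert check under first_alert_check_timestamp, the sensor now stores a deadline (now + SUBNET_UPDATE_STALL_TIMEOUT_SECONDS) under alert_check_timestamp, fires NotifyAboutStalledSubnet once that deadline has passed, and pushes a fresh deadline so the Slack ping repeats at most once per timeout window. A minimal sketch of that deadline pattern with a plain dict standing in for XCom; the store argument, the notify callback, and the timeout value are illustrative assumptions, not the repository's API:

import time
from typing import Callable

SUBNET_UPDATE_STALL_TIMEOUT_SECONDS = 3600  # timeout value assumed for illustration

def check_and_maybe_notify(store: dict, key: str, notify: Callable[[], None]) -> None:
    # `store` stands in for Airflow XComs: xcom_pull ~ store.get, xcom_push ~ store[key] = ...
    now = time.time()
    deadline = store.get(key)
    if deadline is None:
        # First check: initialize the deadline, do not notify yet.
        store[key] = now + SUBNET_UPDATE_STALL_TIMEOUT_SECONDS
    elif now > float(deadline):
        # Deadline hit: notify, then push a new deadline so the reminder
        # fires at most once per timeout window.
        notify()
        store[key] = now + SUBNET_UPDATE_STALL_TIMEOUT_SECONDS

# Example:
# store = {}
# check_and_maybe_notify(store, "alert_check_timestamp", lambda: print("notify DRE"))
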
2 changes: 1 addition & 1 deletion rollout-dashboard/doc/api.md
@@ -5,7 +5,7 @@
To learn about the API, how to use it, and how to interpret the data
served by API calls, please consult the programming documentation
that accompanies the `rollout_dashboard` crate, available under folder
-[`server/`](server/) by running the `cargo rustdoc` program within that
+[`../server/`](../server/) by running the `cargo rustdoc` program within that
folder, and then launching the Web page it generates for you.

Please do not proceed with creating a client
7 changes: 7 additions & 0 deletions rollout-dashboard/server/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions rollout-dashboard/server/Cargo.toml
@@ -23,6 +23,7 @@ futures = "0.3.30"
indexmap = { version = "2.3.0", features = ["serde"] }
lazy_static = "1.5.0"
log = "0.4.22"
querystring = "1.1.0"
regex = "1.10.5"
reqwest = { version = "0.12.5", features = ["json", "cookies"] }
serde = { version = "1.0.203", features = ["derive", "std"] }