diff --git a/docs/configuration/defining-playbooks/index.rst b/docs/configuration/defining-playbooks/index.rst index 98ed7a421..13a44a063 100644 --- a/docs/configuration/defining-playbooks/index.rst +++ b/docs/configuration/defining-playbooks/index.rst @@ -17,4 +17,4 @@ Learn how to define Robusta playbooks. List of All Triggers and Actions --------------------------------- -Refer to :ref:`Playbook Reference` +Refer to :ref:`Triggers Reference` or :ref:`Actions Reference` diff --git a/docs/configuration/sinks/slack.rst b/docs/configuration/sinks/slack.rst index 5a1eba37e..708b37668 100644 --- a/docs/configuration/sinks/slack.rst +++ b/docs/configuration/sinks/slack.rst @@ -117,10 +117,10 @@ If you can't use the `official Slack app `_ -2. Enable Socket mode in your Slack App and copy the websocket token into the Robusta deployment yaml. -3. Under "OAuth and Permissions" add the following scopes: chat:write, files:write, incoming-webhook, and channels:history -4. Under "Event Subscriptions" add bot user events for message.channels and press "Save Changes" -5. Click "Install into Workspace" +2. Enable Socket mode in your Slack App. +3. Under "OAuth and Permissions" add the following scopes: chat:write, chat:write.public, files:write, incoming-webhook, and channels:history. +4. Under "Event Subscriptions" add bot user events for message.channels and press "Save Changes". +5. Click "Install into Workspace". 6. Copy the ``Bot User OAuth Token`` from "OAuth and Permissions". 7. Add the token to SinksConfig in your `generated_values.yaml` file. 
diff --git a/playbooks/robusta_playbooks/k8s_resource_enrichments.py b/playbooks/robusta_playbooks/k8s_resource_enrichments.py index 5ebfdbaac..c2cf7bffe 100644 --- a/playbooks/robusta_playbooks/k8s_resource_enrichments.py +++ b/playbooks/robusta_playbooks/k8s_resource_enrichments.py @@ -48,6 +48,8 @@ class RelatedContainer(BaseModel): status: Optional[str] = None created: Optional[str] = None ports: List[Any] = [] + statusMessage: Optional[str] = None + statusReason: Optional[str] = None class RelatedPod(BaseModel): @@ -64,6 +66,7 @@ class RelatedPod(BaseModel): addresses: str containers: List[RelatedContainer] status: Optional[str] = None + statusReason: Optional[str] = None supported_resources = ["Deployment", "DaemonSet", "ReplicaSet", "Pod", "StatefulSet", "Job", "Node"] @@ -87,6 +90,7 @@ def to_pod_row(pod: Pod, cluster_name: str) -> List: addresses, len(pod.spec.containers), pod.status.phase, + pod.status.reason, ] @@ -95,7 +99,6 @@ def get_related_pods(resource) -> list[Pod]: if kind not in supported_resources: raise ActionException(ErrorCodes.RESOURCE_NOT_SUPPORTED, f"Related pods is not supported for resource {kind}") - pods = [] if kind == "Job": job_pods = get_job_all_pods(resource) pods = job_pods if job_pods else [] @@ -128,6 +131,7 @@ def to_pod_obj(pod: Pod, cluster: str) -> RelatedPod: addresses=addresses, containers=get_pod_containers(pod), status=pod.status.phase, + statusReason=pod.status.reason, ) @@ -156,6 +160,8 @@ def get_pod_containers(pod: Pod) -> List[RelatedContainer]: memoryRequest=requests.memory, restarts=getattr(containerStatus, "restartCount", 0), status=stateStr, + statusMessage=getattr(state, "message", None) if state else None, + statusReason=getattr(state, "reason", None) if state else None, created=getattr(state, "startedAt", None), ports=[port.to_dict() for port in container.ports] if container.ports else [], ) @@ -197,6 +203,7 @@ def related_pods(event: KubernetesResourceEvent, params: RelatedPodParams): "addresses", 
"containers", "status", + "status reason", ], rows=rows, ) diff --git a/playbooks/robusta_playbooks/krr.py b/playbooks/robusta_playbooks/krr.py index 930b2cde9..e2050155b 100644 --- a/playbooks/robusta_playbooks/krr.py +++ b/playbooks/robusta_playbooks/krr.py @@ -352,15 +352,15 @@ def krr_scan(event: ExecutionBaseEvent, params: KRRParams): except json.JSONDecodeError: logging.error(f"*KRR scan job failed. Expecting json result.*\n\n Result:\n{logs}") return - except ValidationError as e: - logging.error(f"*KRR scan job failed. Result format issue.*\n\n {e}") + except ValidationError: + logging.error("*KRR scan job failed. Result format issue.*\n\n", exc_info=True) logging.error(f"\n {logs}") return except Exception as e: if str(e) == "Failed to reach wait condition": - logging.error(f"*KRR scan job failed. The job wait condition timed out ({params.timeout}s)*") + logging.error(f"*KRR scan job failed. The job wait condition timed out ({params.timeout}s)*", exc_info=True) else: - logging.error(f"*KRR scan job unexpected error.*\n {e}") + logging.error(f"*KRR scan job unexpected error.*\n {e}", exc_info=True) return scan_block = ScanReportBlock( diff --git a/playbooks/robusta_playbooks/pod_investigator_enricher.py b/playbooks/robusta_playbooks/pod_investigator_enricher.py index b96fc219d..ec4516f8f 100755 --- a/playbooks/robusta_playbooks/pod_investigator_enricher.py +++ b/playbooks/robusta_playbooks/pod_investigator_enricher.py @@ -1,7 +1,7 @@ import datetime import logging from enum import Enum -from typing import List, Optional +from typing import List, Optional, Tuple from hikaru.model.rel_1_26 import Pod, PodList from robusta.api import ( @@ -59,8 +59,10 @@ def pod_issue_investigator(event: KubernetesResourceEvent): logging.info(f"No pod issues discovered for {resource.kind} {resource.metadata.name}") return # Investigate first issue found - pod_issue = detect_pod_issue(pods_with_issues[0]) - report_pod_issue(event, pods_with_issues, pod_issue) + first_pod = 
pods_with_issues[0] + pod_issue = detect_pod_issue(first_pod) + message, reason = get_pod_issue_message_and_reason(first_pod) + report_pod_issue(event, pods_with_issues, pod_issue, message, reason) def detect_pod_issue(pod: Pod) -> PodIssue: @@ -75,6 +77,20 @@ def detect_pod_issue(pod: Pod) -> PodIssue: return PodIssue.NoneDetected +def get_pod_issue_message_and_reason(pod: Pod) -> Tuple[Optional[str], Optional[str]]: + # Works/should work only for KubeContainerWaiting and KubePodNotReady + # Note: in line with the old code in pod_issue_investigator, we only get the message for + # the first of possibly many misbehaving containers. + if pod.status.containerStatuses: + if pod.status.containerStatuses[0].state.waiting: + return ( + pod.status.containerStatuses[0].state.waiting.message, + pod.status.containerStatuses[0].state.waiting.reason, + ) + # Function-level fall-through (no container statuses, or first container not waiting): the caller unpacks the result, so always return a 2-tuple. + return None, None + + +def is_pod_pending(pod: Pod) -> bool: return pod.status.phase.lower() == "pending" @@ -124,7 +140,9 @@ def has_image_pull_issue(pod: Pod) -> bool: return len(image_pull_statuses) > 0 -def report_pod_issue(event: KubernetesResourceEvent, pods: List[Pod], issue: PodIssue): +def report_pod_issue( + event: KubernetesResourceEvent, pods: List[Pod], issue: PodIssue, message: Optional[str], reason: Optional[str] +): # find pods with issues pods_with_issue = [pod for pod in pods if detect_pod_issue(pod) == issue] pod_names = [pod.metadata.name for pod in pods_with_issue] @@ -145,6 +163,11 @@ def report_pod_issue(event: KubernetesResourceEvent, pods: List[Pod], issue: Pod blocks.extend(additional_blocks) event.add_enrichment(blocks) + if reason: + if message is None: + message = "unknown" + event.add_enrichment([MarkdownBlock(f"\n\n{reason}: {message}")]) + def get_expected_replicas(event: KubernetesResourceEvent) -> int: resource = event.get_resource() diff --git a/src/robusta/integrations/prometheus/models.py b/src/robusta/integrations/prometheus/models.py index dca78c36b..7739e6ce6 100644 --- 
a/src/robusta/integrations/prometheus/models.py +++ b/src/robusta/integrations/prometheus/models.py @@ -113,7 +113,7 @@ def get_prometheus_query(self) -> str: parsed_url = urlparse(url) query_params = parse_qs(parsed_url.query) - q_expr = query_params.get('g0.expr', []) + q_expr = query_params.get("g0.expr", []) if len(q_expr) < 1 or not q_expr[0]: return ""