#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: set ts=4
#
# Copyright 2024-present Linaro Limited
#
# SPDX-License-Identifier: MIT
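"""Track boot-time duration for SQUAD builds.

Fetch builds from a SQUAD project over a date range, read per-device boot
durations from each test run's metadata, and render the averaged results as
Plotly line graphs combined into a single HTML page (index.html).

Example invocation (the group and project values are illustrative):

    ./squad-track-duration --group lkft --project linux-next-master \
        --start-datetime 2024-01-01 --end-datetime 2024-01-31
"""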
import argparse
import json
import logging
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import plotly.express as px
from squad_client.core.api import SquadApi
from squad_client.core.models import ALL, Squad
squad_host_url = "https://qa-reports.linaro.org/"
SquadApi.configure(cache=3600, url=os.getenv("SQUAD_HOST", squad_host_url))
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
ARTIFACTORIAL_FILENAME = "builds.json"
class MetaFigure:
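    """Bundle a Plotly figure with the title and description shown in the HTML report."""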
def __init__(self, plotly_fig, title, description):
self.plotly_fig = plotly_fig
self.title = title
self.description = description
def parse_datetime_from_string(datetime_string):
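    """Parse a datetime string in one of the accepted formats.

    Example: parse_datetime_from_string("2022-01-01") returns
    datetime(2022, 1, 1, 0, 0).
    """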
accepted_datetime_formats = ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"]
    # Loop through each accepted datetime format and try to parse it
for datetime_format in accepted_datetime_formats:
try:
# If the format parses successfully, return the datetime object
return datetime.strptime(datetime_string, datetime_format)
except ValueError:
pass
# If no format can be parsed, raise an argument error
raise argparse.ArgumentTypeError(
f"Unsupported datetime format {datetime_string}. Accepted formats are {accepted_datetime_formats}"
)
def parse_args():
    parser = argparse.ArgumentParser(description="Track boot-time duration for SQUAD builds")
parser.add_argument(
"--group",
required=True,
help="squad group",
)
parser.add_argument(
"--project",
required=True,
help="squad project",
)
parser.add_argument(
"--start-datetime",
type=parse_datetime_from_string,
required=True,
help="Starting date time. Example: 2022-01-01 or 2022-01-01T00:00:00",
)
parser.add_argument(
"--end-datetime",
type=parse_datetime_from_string,
required=True,
help="Ending date time. Example: 2022-12-31 or 2022-12-31T00:00:00",
)
parser.add_argument(
"--build-name",
required=False,
default="gcc-13-lkftconfig",
help="Build name",
)
parser.add_argument(
"--debug",
action="store_true",
default=False,
help="Display debug messages",
)
return parser.parse_args()
def get_cache_from_artifactorial():
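    """Load the local JSON build cache; return an empty dict if it does not exist."""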
    if not os.path.exists(ARTIFACTORIAL_FILENAME):
        return {}
    with open(ARTIFACTORIAL_FILENAME, "r") as fp:
        return json.load(fp)
def save_build_cache_to_artifactorial(data):
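    """Write the build cache back to the local JSON file."""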
with open(ARTIFACTORIAL_FILENAME, "w") as fp:
json.dump(data, fp)
def get_data(args, build_cache):
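    """Fetch builds from SQUAD one day at a time and collect boot-time records.

    Returns a tuple (records, build_cache): records is a list of dicts, one
    per test run with duration data, and build_cache maps the IDs of finished
    builds to their records so later runs can skip the SQUAD lookup.
    """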
start_datetime = args.start_datetime
end_datetime = args.end_datetime
group = Squad().group(args.group)
project = group.project(args.project)
    environments = project.environments(count=ALL).values()
    # Create a lookup table from environment URL to device name (slug)
    device_dict = {env.url: env.slug for env in environments}
first_start_day = True
final_end_date = False
tmp_data = []
# Set up a delta which determines how many days of data to read from SQUAD
# per loop. Minimum delta is 1 day and delta must be in whole days to keep
# this code easy to read, understand and debug.
delta = timedelta(days=1)
if delta.days < 1:
raise Exception("Minimum delta is 1 day for this code to work.")
if delta.seconds != 0 or delta.microseconds != 0:
raise Exception("Deltas must be whole days only.")
# Loops through each delta until the end date and filters the SQUAD data
# for that delta
while not final_end_date:
# If it is the first date in the range, use the provided start datetime
if first_start_day:
first_start_day = False
# Use the provided start time for the first day
tmp_start_datetime = start_datetime
else:
# For all other days, update the date by the delta then use the
# start of the day by zeroing hours, minutes and seconds
tmp_start_datetime += delta
tmp_start_datetime = tmp_start_datetime.replace(hour=0, minute=0, second=0)
# If the delta for this iteration sends us over the end of the range,
# use the provided end datetime
if tmp_start_datetime + delta >= end_datetime:
# We have reached the last day, so use this as the end date
tmp_end_datetime = end_datetime
final_end_date = True
else:
# Otherwise take the start time (with minutes zeroed) + delta
tmp_end_datetime = (
tmp_start_datetime.replace(hour=0, minute=0, second=0) + delta
)
logger.info(
f"Fetching builds from SQUAD, start_datetime: {tmp_start_datetime}, end_datetime: {tmp_end_datetime}"
)
filters = {
"created_at__lt": tmp_end_datetime.strftime("%Y-%m-%dT%H:%M:%S"),
"created_at__gt": tmp_start_datetime.strftime("%Y-%m-%dT%H:%M:%S"),
"count": ALL,
}
builds = project.builds(**filters)
# Loop through the builds in the specified window and cache their data
# to a file if they are marked as finished. This will mean that we don't
        # have to look them up again in SQUAD if we have already looked them up.
for build_id, build in builds.items():
            if str(build_id) in build_cache:
                logger.debug(f"cached: {build_id}")
                tmp_data += build_cache[str(build_id)]
else:
logger.debug(f"no-cache: {build_id}")
tmp_build_cache = []
testruns = build.testruns(count=ALL, prefetch_metadata=True)
                for testrun in testruns.values():
device = device_dict[testrun.environment]
metadata = testrun.metadata
durations = metadata.durations
# Ignore testruns without duration data
if durations is None:
continue
build_name = metadata.build_name
# Ignore testruns without a build_name
if build_name is None:
continue
                    # Read the boot time from the duration data
                    boottime = durations["tests"].get("boot")
                    # Ignore testruns without a boot duration
                    if boottime is None:
                        continue
tmp = {
"build_id": build_id,
"build_name": build_name,
"git_describe": build.version.strip(),
"device": device,
"boottime": float(boottime),
"finished": build.finished,
"created_at": build.created_at,
}
tmp_data.append(tmp)
tmp_build_cache.append(tmp)
# Cache data for builds that are marked finished
            if build.finished and tmp_build_cache:
build_cache[str(build_id)] = tmp_build_cache
logger.debug(f"finished: {build_id}, {build.finished}")
return tmp_data, build_cache
def combine_plotly_figs_to_html(
figs,
html_fname,
main_title,
main_description,
include_plotlyjs="cdn",
separator=None,
auto_open=False,
):
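    """Write all figures to a single HTML page with a linked table of contents.

    The first figure's to_html() call embeds plotly.js (from the CDN by
    default, via include_plotlyjs); the remaining figures reuse that copy.
    """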
with open(html_fname, "w") as f:
f.write(f"<h1>{main_title}</h1>")
f.write(f"<div>{main_description}</div>")
        f.write("<h2>Page content</h2>")
        f.write("<ul>")
        for index, fig in enumerate(figs[1:], start=1):
            f.write(f'<li><a href="#fig{index}">{fig.title}</a></li>')
f.write("</ul>")
f.write(f'<h2><a id="fig0">{figs[0].title}</a></h2>')
f.write(f"<div>{figs[0].description}</div>")
f.write(figs[0].plotly_fig.to_html(include_plotlyjs=include_plotlyjs))
        for index, fig in enumerate(figs[1:], start=1):
if separator:
f.write(separator)
f.write(f'<h2><a id="fig{index}">{fig.title}</a></h2>')
f.write(f"<div>{fig.description}</div>")
f.write(fig.plotly_fig.to_html(full_html=False, include_plotlyjs=False))
if auto_open:
import webbrowser
uri = Path(html_fname).absolute().as_uri()
webbrowser.open(uri)
def run():
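    """Entry point: fetch the data, build the graphs and write index.html."""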
args = parse_args()
if args.debug:
logger.setLevel(level=logging.DEBUG)
if args.start_datetime > args.end_datetime:
raise Exception("Start time must be earlier than end time.")
build_cache = get_cache_from_artifactorial()
    data, build_cache = get_data(args, build_cache)
save_build_cache_to_artifactorial(build_cache)
# Turn the data (list of dicts) into a pandas DataFrame
df = pd.DataFrame(data)
logger.debug("***********************")
logger.debug(df)
logger.debug(df.info())
logger.debug("***********************")
    # Add a combined build_name_device column to the DataFrame
df["build_name_device"] = df.build_name + "-" + df.device
    figure_collection = []
# Filter the DataFrame by the desired build name(s)
filtered_df1 = df[df["build_name"].isin([args.build_name])]
# Create a DataFrame which groups by type then takes the mean of the boot
# time per type.
df_grouping1 = filtered_df1.groupby(
["created_at", "git_describe", "device", "build_name"]
)
mean_boottimes1 = df_grouping1["boottime"].mean()
# Convert the Series object back to a DataFrame then sort values first by
# device, then by created_at. This will make the graph legend alphabetised
# while also ensuring the dates for each line are ordered by created_at so
# the graph's lines will be drawn correctly.
mean_boottimes1 = mean_boottimes1.reset_index().sort_values(
by=["device", "created_at"]
)
# Calculate how many boottimes we averaged over per device
count_per_device1 = df_grouping1["boottime"].count().groupby("device").sum()
col_name_boottime_count = "Boottimes included in average"
count_per_device1 = count_per_device1.reset_index().rename(
columns={"boottime": col_name_boottime_count}
)
# Create a new column with the name and count, then stick together the
# counts and the averages
count_per_device1["device_count"] = (
count_per_device1.device
+ " ("
+ count_per_device1[col_name_boottime_count].astype(str)
+ ")"
)
mean_boottimes1 = mean_boottimes1.merge(
count_per_device1, on="device", how="inner", suffixes=("_1", "_2")
)
# Create the figure to display this data
    figure_collection.append(
MetaFigure(
px.line(
mean_boottimes1,
x="created_at",
y="boottime",
color="device_count",
markers=True,
labels={"device_count": "Device (number of boots in mean)"},
)
.update_xaxes(
tickvals=mean_boottimes1["created_at"],
ticktext=mean_boottimes1["git_describe"],
)
.update_layout(xaxis_title="Version", yaxis_title="Boot time"),
f"Line graph, {args.build_name}",
f"This line graph is generated from build_name {args.build_name}."
+ " The graph uses the average (mean) over a number of boots for each device. The number of boots included in the average is presented in the 'Device (number of boots in mean)' in the line graph legend.",
)
)
# Filter the DataFrame by the desired build name(s)
filtered_df2 = df[df["build_name"].str.endswith(args.build_name.split("-")[-1])]
    # Group and take the mean of the boot time for the desired type - this time it is
# grouped by build_name_device, too, since we want to look at both the build
# and what device this was run on.
df_grouping2 = filtered_df2.groupby(
["created_at", "git_describe", "device", "build_name_device", "build_name"]
)
mean_boottimes2 = df_grouping2["boottime"].mean()
# Convert the Series object back to a DataFrame then sort values first by
# build_name_device, then by created_at. This will make the graph legend
# alphabetised while also ensuring the dates for each line are ordered by
# created_at so the graph's lines will be drawn correctly.
mean_boottimes2 = mean_boottimes2.reset_index().sort_values(
by=["build_name_device", "created_at"]
)
logger.debug(mean_boottimes2.info())
logger.debug(mean_boottimes2)
# Calculate how many boottimes we averaged over per device
count_per_device2 = (
df_grouping2["boottime"].count().groupby("build_name_device").sum()
)
count_per_device2 = count_per_device2.reset_index().rename(
columns={"boottime": col_name_boottime_count}
)
# Create a new column with the name and count, then stick together the
# counts and the averages
count_per_device2["build_name_device_count"] = (
count_per_device2.build_name_device
+ " ("
+ count_per_device2[col_name_boottime_count].astype(str)
+ ")"
)
mean_boottimes2 = mean_boottimes2.merge(
count_per_device2, on="build_name_device", how="inner", suffixes=("_1", "_2")
)
# Create the figure for this visualisation
    figure_collection.append(
MetaFigure(
px.line(
mean_boottimes2,
x="created_at",
y="boottime",
color="build_name_device_count",
markers=True,
labels={
"build_name_device_count": "Build name - device (number of boots in mean)"
},
)
.update_xaxes(
tickvals=mean_boottimes2["created_at"],
ticktext=mean_boottimes2["git_describe"],
)
.update_layout(xaxis_title="Version", yaxis_title="Boot time"),
f"Line graph, {args.build_name.split('-')[-1]}",
f"This line graph is generated from \"{args.build_name.split('-')[-1]}\"."
+ " The graph uses the average (mean) over a number of boots for each build_name-device combination. The number of boots included in the average is presented in the 'Build name - device (number of boots in mean)' in the line graph legend.",
)
)
combine_plotly_figs_to_html(
        figure_collection,
"index.html",
"This page show some interesting data around LKFT's builds",
f"These graphs is based on LKFT's {args.project} branch",
)
    return 0
if __name__ == "__main__":
sys.exit(run())