# nm-benchmark.yml
name: benchmark
on:
  # makes workflow reusable
  workflow_call:
    inputs:
      label:
        description: "requested runner label (specifies instance)"
        type: string
        required: true
      benchmark_config_list_file:
        description: "path to a file containing the list of benchmark configs to run. For reference, see .github/data/nm_benchmark_configs_list.txt"
        type: string
        required: true
      timeout:
        description: "maximum time (in minutes) the runner will be up"
        type: string
        required: true
      gitref:
        description: "git commit hash or branch name"
        type: string
        required: true
      Gi_per_thread:
        description: 'requested GiB to reserve per thread'
        type: string
        required: true
      nvcc_threads:
        description: "number of nvcc build threads"
        type: string
        required: true
      python:
        description: "python version, e.g. 3.10.12"
        type: string
        required: true
      push_benchmark_results_to_gh_pages:
        description: "When set to true, the workflow pushes all benchmarking results to the gh-pages UI"
        type: string
        required: true

  # makes workflow manually callable
  workflow_dispatch:
    inputs:
      label:
        description: "requested runner label (specifies instance)"
        type: string
        required: true
      benchmark_config_list_file:
        description: "path to a file containing the list of benchmark configs to run. For reference, see .github/data/nm_benchmark_configs_list.txt"
        type: string
        required: true
      timeout:
        description: "maximum time (in minutes) the runner will be up"
        type: string
        required: true
      gitref:
        description: "git commit hash or branch name"
        type: string
        required: true
      Gi_per_thread:
        description: 'requested GiB to reserve per thread'
        type: string
        required: true
      nvcc_threads:
        description: "number of nvcc build threads"
        type: string
        required: true
      python:
        description: "python version, e.g. 3.10.12"
        type: string
        required: true
      push_benchmark_results_to_gh_pages:
        description: "When set to true, the workflow pushes all benchmarking results to the gh-pages UI"
        type: choice
        options:
          - 'true'
          - 'false'
        default: 'false'
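
# Illustrative only: a minimal sketch of how a caller workflow in this repo might
# invoke this reusable workflow via workflow_call. The runner label, timeout, and
# secrets handling below are hypothetical placeholder choices, not values mandated
# by this workflow; adjust them to the runners and secrets available in the repo.
#
#   jobs:
#     benchmark:
#       uses: ./.github/workflows/nm-benchmark.yml
#       with:
#         label: gcp-a100-runner            # hypothetical runner label
#         benchmark_config_list_file: .github/data/nm_benchmark_configs_list.txt
#         timeout: "720"                    # minutes the runner stays up
#         gitref: main
#         Gi_per_thread: "4"
#         nvcc_threads: "8"
#         python: "3.10.12"
#         push_benchmark_results_to_gh_pages: "false"
#       secrets: inherit                    # assumes the caller forwards repo secrets
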
jobs:
  BENCHMARK:
    runs-on: ${{ inputs.label }}
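    # inputs arrive from workflow_call / workflow_dispatch as strings; fromJSON()
    # below converts the timeout string into the number that timeout-minutes expects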
    timeout-minutes: ${{ fromJSON(inputs.timeout) }}
    outputs:
      gh_action_benchmark_input_artifact_name: ${{ steps.set_gh_action_benchmark_input_artifact_name.outputs.gh_action_benchmark_input_artifact_name }}
    steps:

      - name: checkout repository code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ inputs.gitref }}
          submodules: recursive

      - name: setenv
        id: setenv
        uses: ./.github/actions/nm-set-env/
        with:
          hf_token: ${{ secrets.NM_HF_TOKEN }}
          Gi_per_thread: ${{ inputs.Gi_per_thread }}
          nvcc_threads: ${{ inputs.nvcc_threads }}

      - name: set python
        id: set_python
        uses: ./.github/actions/nm-set-python/
        with:
          python: ${{ inputs.python }}
          venv: TEST

      - name: hf cache
        id: hf_cache
        uses: ./.github/actions/nm-hf-cache/
        with:
          fs_cache: ${{ secrets.HF_FS_CACHE }}

      - name: build
        id: build
        uses: ./.github/actions/nm-build-vllm/
        with:
          python: ${{ inputs.python }}
          venv: TEST
          pypi: ${{ secrets.NM_PRIVATE_PYPI_LOCATION }}

      - name: run benchmarks
        uses: ./.github/actions/nm-benchmark/
        with:
          benchmark_config_list_file: ${{ inputs.benchmark_config_list_file }}
          output_directory: benchmark-results
          python: ${{ inputs.python }}
          venv: TEST

      - name: store benchmark result artifacts
        if: success()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ github.run_id }}-${{ inputs.label }}
          path: benchmark-results
          retention-days: 2

      - name: copy benchmark results to EFS store
        if: success()
        uses: ./.github/actions/nm-copy-benchmark-data-to-efs
        with:
          label: ${{ inputs.label }}
          src: benchmark-results
          efs_dst: /EFS/benchmark_results

      # Produce GHA benchmark JSONs
      - name: make github-action-benchmark JSONs
        uses: ./.github/actions/nm-produce-gha-benchmark-json
        with:
          vllm_benchmark_jsons_path: benchmark-results
          # Metrics that are "better" when the value is greater are stored here
          bigger_is_better_output_file_path: gh-action-benchmark-jsons/bigger_is_better.json
          # Metrics that are "better" when the value is smaller are stored here
          smaller_is_better_output_file_path: gh-action-benchmark-jsons/smaller_is_better.json
          # Metrics that we only want to observe are stored here
          observation_metrics_output_file_path: gh-action-benchmark-jsons/observation_metrics.json
          python: ${{ inputs.python }}
          venv: TEST
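
      # The files produced above feed github-action-benchmark's "custom" tools.
      # As a rough sketch (not verified against the local action), each JSON is
      # expected to be an array of entries such as:
      #   [{ "name": "request_throughput", "unit": "req/s", "value": 12.3 }]
      # following the customBiggerIsBetter / customSmallerIsBetter schema; the
      # metric name and unit shown here are hypothetical examples.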
      - name: set gh action benchmark input artifact name
        id: set_gh_action_benchmark_input_artifact_name
        run: |
          GH_ACTION_BENCHMARK_INPUT_ARTIFACT_NAME="gh_action_benchmark_jsons-${{ github.run_id }}-${{ inputs.label }}"
          echo "gh_action_benchmark_input_artifact_name=$GH_ACTION_BENCHMARK_INPUT_ARTIFACT_NAME" >> "$GITHUB_OUTPUT"
      - name: store gh action benchmark input artifacts
        if: success()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.set_gh_action_benchmark_input_artifact_name.outputs.gh_action_benchmark_input_artifact_name }}
          path: gh-action-benchmark-jsons
          retention-days: 1

      - name: copy gh action benchmark JSONs to EFS store
        if: success()
        uses: ./.github/actions/nm-copy-benchmark-data-to-efs
        with:
          label: ${{ inputs.label }}
          src: gh-action-benchmark-jsons
          efs_dst: /EFS/benchmark_results
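
  # Publishes the JSONs produced by the BENCHMARK job: each file is fed to
  # github-action-benchmark and, when push_benchmark_results_to_gh_pages is
  # 'true', pushed to the nm-gh-pages branch.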
  NM_GH_ACTION_BENCHMARK:
    needs: BENCHMARK
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      # Permissions required to be able to push to the nm-gh-pages branch
      contents: write
    steps:

      - name: checkout repository code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ inputs.gitref }}
          submodules: recursive

      - name: download benchmark results artifact
        uses: actions/download-artifact@v4
        with:
          name: ${{ needs.BENCHMARK.outputs.gh_action_benchmark_input_artifact_name }}
          path: downloads

      - name: display structure of downloaded files
        run: ls -R ./downloads
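
      # hashFiles() returns an empty string when no matching file exists, so each
      # reporting step below runs only if its JSON was actually produced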
      - name: nm-github-action-benchmark(bigger_is_better.json)
        # Absence of the file indicates that there were no "bigger_is_better" metrics
        if: (success() || failure()) && (hashFiles('downloads/bigger_is_better.json') != '')
        uses: ./.github/actions/nm-github-action-benchmark
        with:
          gh_action_benchmark_name: "bigger_is_better"
          gh_action_benchmark_json_file_path: "downloads/bigger_is_better.json"
          gh_action_benchmark_tool: "customBiggerIsBetter"
          gh_pages_branch: "nm-gh-pages"
          auto_push: ${{ inputs.push_benchmark_results_to_gh_pages }}
          reporting_enabled: "true"
          github_token: ${{ secrets.GITHUB_TOKEN }}

      - name: nm-github-action-benchmark(smaller_is_better.json)
        # Absence of the file indicates that there were no "smaller_is_better" metrics
        if: (success() || failure()) && (hashFiles('downloads/smaller_is_better.json') != '')
        uses: ./.github/actions/nm-github-action-benchmark
        with:
          gh_action_benchmark_name: "smaller_is_better"
          gh_action_benchmark_json_file_path: "downloads/smaller_is_better.json"
          gh_action_benchmark_tool: "customSmallerIsBetter"
          gh_pages_branch: "nm-gh-pages"
          auto_push: ${{ inputs.push_benchmark_results_to_gh_pages }}
          reporting_enabled: "true"
          github_token: ${{ secrets.GITHUB_TOKEN }}

      - name: nm-github-action-benchmark(observation_metrics.json)
        # Absence of the file indicates that there were no "observation" metrics
        if: (success() || failure()) && (hashFiles('downloads/observation_metrics.json') != '')
        uses: ./.github/actions/nm-github-action-benchmark
        with:
          gh_action_benchmark_name: "observation_metrics"
          gh_action_benchmark_json_file_path: "downloads/observation_metrics.json"
          # `github-action-benchmark` expects a tool name that is either
          # "customBiggerIsBetter" or "customSmallerIsBetter", so we pass the former
          # as a workaround. Since the action is configured not to report failures
          # (reporting_enabled: "false"), this is fine.
          gh_action_benchmark_tool: "customBiggerIsBetter"
          gh_pages_branch: "nm-gh-pages"
          auto_push: ${{ inputs.push_benchmark_results_to_gh_pages }}
          reporting_enabled: "false"
          github_token: ${{ secrets.GITHUB_TOKEN }}