Capture additional runner metadata dev branch (#65)

Co-authored-by: Adnan Khan <[email protected]>
praetorian-inc · Jan 29, 2024 · 8561d76 · 8561d76
1 parent 4b37e19
commit 8561d76
Show file tree

Hide file tree

Showing 8 changed files with 143 additions and 66 deletions.
diff --git a/gato/enumerate/recommender.py b/gato/enumerate/recommender.py
@@ -155,6 +155,11 @@ def print_repo_runner_info(repository: Repository):
                 f"{Output.bright(repository.accessible_runners[0].runner_name)}"
                 f" and the machine name was "
                 f"{Output.bright(repository.accessible_runners[0].machine_name)}"
+                f" and the runner type was "
+                f"{Output.bright(repository.accessible_runners[0].runner_type)}"
+                f" in the {Output.bright(repository.accessible_runners[0].runner_group)} group"
+                f" with the following labels: "
+                f"{Output.bright(', '.join(repository.accessible_runners[0].labels))}"
             )
 
             for runner in repository.accessible_runners:

diff --git a/gato/enumerate/repository.py b/gato/enumerate/repository.py
@@ -43,7 +43,13 @@ def __perform_runlog_enumeration(self, repository: Repository):
         if wf_runs:
             for wf_run in wf_runs:
                 runner = Runner(
-                    wf_run['runner_name'], wf_run['machine_name'], non_ephemeral=wf_run['non_ephemeral']
+                    wf_run['runner_name'],
+                    wf_run['runner_type'],
+                    wf_run['token_permissions'],
+                    runner_group=wf_run['runner_group'],
+                    machine_name=wf_run['machine_name'],
+                    labels=wf_run['requested_labels'],
+                    non_ephemeral=wf_run['non_ephemeral']
                 )
 
                 repository.add_accessible_runner(runner)
@@ -139,9 +145,7 @@ def enumerate_repository(self, repository: Repository, large_org_enum=False):
 
             # If we are doing internal enum, get the logs, because coverage is
             # more important here and it's ok if it takes time.
-            elif not repository.is_public() and self.__perform_runlog_enumeration(repository):
-                runner_detected = True
-            else:
+            elif not repository.is_public() or not large_org_enum:
                 runner_detected = self.__perform_runlog_enumeration(repository)
 
         if runner_detected:

diff --git a/gato/github/api.py b/gato/github/api.py
@@ -12,7 +12,6 @@
 
 logger = logging.getLogger(__name__)
 
-
 class Api():
     """Class to serve as an abstraction layer to interact with the GitHub API.
     It handles utilizing proxies, along with passing the PAT and handling any
@@ -21,6 +20,9 @@ class Api():
 
     RUNNER_RE = re.compile(r'Runner name: \'([\w+-.]+)\'')
     MACHINE_RE = re.compile(r'Machine name: \'([\w+-.]+)\'')
+    RUNNERGROUP_RE = re.compile(r'Runner group name: \'([\w+-.]+)\'')
+    RUNNERTYPE_RE = re.compile(r'([\w+-.]+)')
+
     RUN_THRESHOLD = 90
 
     def __init__(self, pat: str, version: str = "2022-11-28",
@@ -111,49 +113,77 @@ def __process_run_log(self, log_content: bytes, run_info: dict):
         Returns:
             dict: metadata about the run execution.
         """
-        log_package = None
+        log_package = dict()
+        token_permissions = dict()
+        runner_type = None
         non_ephemeral = False
+        labels = None
+        runner_name = None
+        machine_name = None
+        runner_group = None
 
         with zipfile.ZipFile(io.BytesIO(log_content)) as runres:
             for zipinfo in runres.infolist():
-                # TODO use a lambda for this messy logic
-                if "checkout" in zipinfo.filename or "Checkout" in zipinfo.filename:
+                if zipinfo.filename.startswith('0_'):
                     with runres.open(zipinfo) as run_setup:
                         content = run_setup.read().decode()
-                        if "Cleaning the repository" in content:
-                            non_ephemeral = True
-
-                        if log_package:
-                            log_package['non_ephemeral'] = non_ephemeral
+                        content_lines = content.split('\n')
 
-                if "Set up job" in zipinfo.filename:
-                    with runres.open(zipinfo) as run_setup:
-                        content = run_setup.read().decode()
-                        if "Image Release: https://github.com/actions/runner-images" in content:
+                        if "Image Release: https://github.com/actions/runner-images" in content or \
+                            "Job is about to start running on the hosted runner: GitHub Actions" in content:
                             # Larger runners will appear to be self-hosted, but
                             # they will have the image name. Skip if we see this.
+                            # If the log contains "Job is about to start running on the hosted runner",
+                            # the runner is a GitHub-hosted runner, so we can skip it.
                             continue
-
-                        if "Runner name" in content or \
-                                "Machine name" in content:
-
-                            # Need to replace windows style line
-                            # return with linux..
-                            matches = Api.RUNNER_RE.search(content)
-                            runner_name = matches.group(1) if matches else None
-
-                            matches = Api.MACHINE_RE.search(content)
-                            hostname = matches.group(1) if matches else None
-
-                            log_package = {
-                                "setup_log": content,
-                                "runner_name": runner_name,
-                                "machine_name": hostname,
-                                "run_id": run_info["id"],
-                                "run_attempt": run_info["run_attempt"],
-                                "non_ephemeral": non_ephemeral
-                            }
-        return log_package
+                        index = 0
+                        while index < len(content_lines) and content_lines[index]: 
+                            line = content_lines[index]
+
+                            if "Requested labels: " in line: 
+                                labels = line.split("Requested labels: ")[1].split(', ')
+
+                            if "Runner name: " in line:
+                                runner_name = line.split("Runner name: ")[1].replace("'", "")
+
+                            if "Machine name: " in line:
+                                machine_name = line.split("Machine name: ")[1].replace("'", "")
+
+                            if "Runner group name:" in line: 
+                                runner_group = line.split("Runner group name: ")[1].replace("'", "")
+
+                            if "Job is about to start running on" in line:
+                                runner_type = line.split()[-1]
+                                matches = Api.RUNNERTYPE_RE.search(runner_type)
+                                runner_type = matches.group(1)
+
+                            if "GITHUB_TOKEN Permission" in line:
+                                while "##[endgroup]" not in content_lines[index+1]:
+                                    index += 1
+                                    scope = content_lines[index].split()[1].replace(':', '')
+                                    permission = content_lines[index].split()[2]
+                                    token_permissions[scope] = permission
+                                log_package["token_permissions"] = token_permissions
+
+                            if "Cleaning the repository" in line:
+                                non_ephemeral = True
+                            log_package["non_ephemeral"] = non_ephemeral
+
+                            index += 1
+
+                        log_package = {
+                            "requested_labels": labels,
+                            "runner_name": runner_name,
+                            "machine_name": machine_name,
+                            "runner_group": runner_group,
+                            "runner_type": runner_type,
+                            "run_id": run_info["id"],
+                            "run_attempt": run_info["run_attempt"],
+                            "non_ephemeral": non_ephemeral,
+                            "token_permissions": token_permissions
+                        }
+
+                    return log_package
 
     def __get_full_runlog(self, log_content: bytes, run_name: str):
         """Gets the full text of the runlog from the zip file by matching the
@@ -682,13 +712,12 @@ def retrieve_run_logs(self, repo_name: str, short_circuit: str = True):
         if runs.status_code == 200:
             logger.debug(f'Enumerating runs within {repo_name}')
             for run in runs.json()['workflow_runs']:
-
                 # We are only interested in runs that actually executed.
                 if run['conclusion'] != 'success' and \
                     run['conclusion'] != 'failure':
                     continue
 
-                if short_circuit:                
+                if short_circuit:
                     # If we are only looking for the presence of SH runners and
                     # not trying to determine ephemeral vs not from repeats, then
                     # we just need to look at each branch + wf combination once.
@@ -700,6 +729,7 @@ def retrieve_run_logs(self, repo_name: str, short_circuit: str = True):
                 run_log = self.call_get(
                     f'/repos/{repo_name}/actions/runs/{run["id"]}/'
                     f'attempts/{run["run_attempt"]}/logs')
+
                 if run_log.status_code == 200:
                     run_log = self.__process_run_log(run_log.content, run)
                     if run_log:
@@ -1175,4 +1205,4 @@ def commit_workflow(self, repo_name: str,
         if self.__verify_result(r, 201) is False:
             return None
 
-        return new_commit_sha
+        return new_commit_sha
diff --git a/gato/models/runner.py b/gato/models/runner.py
@@ -8,6 +8,9 @@ class Runner:
     def __init__(
             self,
             runner_name,
+            runner_type=None,
+            token_permissions=None,
+            runner_group=None,
             machine_name=None,
             os=None,
             status=None,
@@ -25,6 +28,9 @@ def __init__(
         """
         self.runner_name = runner_name
         self.machine_name = machine_name
+        self.runner_group = runner_group
+        self.runner_type = runner_type
+        self.token_permissions = token_permissions
         self.os = os
         self.status = status
         self.labels = labels
@@ -37,10 +43,13 @@ def toJSON(self):
             "name": self.runner_name,
             "machine_name": self.machine_name if self.machine_name
             else "Unknown",
+            "runner_type": self.runner_type if self.runner_type else "Unknown",
+            "runner_group_name": self.runner_group if self.runner_group else "Unknown",
+            "token_permissions": self.token_permissions,
             "os": self.os if self.os else "Unknown",
             "status": self.status if self.status else "Unknown",
             "labels": [label for label in self.labels],
             "non_ephemeral": self.non_ephemeral
         }
 
-        return representation
+        return representation
diff --git a/unit_test/files/run_log.zip b/unit_test/files/run_log.zip
diff --git a/unit_test/test_api.py b/unit_test/test_api.py
@@ -465,18 +465,18 @@ def test_retrieve_run_logs(mock_get):
         zip_bytes = run_log.read()
         mock_get.return_value.content = zip_bytes
 
-    abstraction_layer = Api( test_pat, "2022-11-28")
+    abstraction_layer = Api(test_pat, "2022-11-28")
     logs = abstraction_layer.retrieve_run_logs("testOrg/testRepo")
 
     assert len(logs) == 1
-    assert list(logs)[0]['runner_name'] == 'ghrunner-test'
+    assert list(logs)[0]['runner_name'] == 'runner-30'
 
     logs = abstraction_layer.retrieve_run_logs(
         "testOrg/testRepo", short_circuit=False
     )
 
     assert len(logs) == 1
-    assert list(logs)[0]['runner_name'] == 'ghrunner-test'
+    assert list(logs)[0]['runner_name'] == 'runner-30'
 
 
 @patch("gato.github.api.requests.get")

diff --git a/unit_test/test_enumerate.py b/unit_test/test_enumerate.py
@@ -18,6 +18,21 @@
 
 Output(False, True)
 
+BASE_MOCK_RUNNER = [{
+        "machine_name": "unittest1",
+        "runner_name": "much_unit_such_test",
+        "runner_type": "organization",
+        "non_ephemeral": False,
+        "token_permissions": {
+            "Actions": "write"
+        },
+        "runner_group": "Default",
+        "requested_labels": [
+            "self-hosted",
+            "Linux",
+            "X64"
+        ]
+}]
 
 @pytest.fixture(scope="session", autouse=True)
 def load_test_files(request):
@@ -102,9 +117,7 @@ def test_enumerate_repo_admin(mock_api, capsys):
         "scopes": ['repo', 'workflow']
     }
 
-    mock_api.return_value.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
-    ]
+    mock_api.return_value.retrieve_run_logs.return_value = BASE_MOCK_RUNNER
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
     repo_data['permissions']['admin'] = True
@@ -142,9 +155,7 @@ def test_enumerate_repo_admin_no_wf(mock_api, capsys):
         "scopes": ['repo']
     }
 
-    mock_api.return_value.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
-    ]
+    mock_api.return_value.retrieve_run_logs.return_value = BASE_MOCK_RUNNER
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
     repo_data['permissions']['admin'] = True
@@ -182,9 +193,7 @@ def test_enumerate_repo_no_wf_no_admin(mock_api, capsys):
         "scopes": ['repo']
     }
 
-    mock_api.return_value.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
-    ]
+    mock_api.return_value.retrieve_run_logs.return_value = BASE_MOCK_RUNNER
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
     repo_data['permissions']['admin'] = False
@@ -221,9 +230,7 @@ def test_enumerate_repo_no_wf_maintain(mock_api, capsys):
         "scopes": ['repo', 'workflow']
     }
 
-    mock_api.return_value.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
-    ]
+    mock_api.return_value.retrieve_run_logs.return_value = BASE_MOCK_RUNNER
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
 
@@ -261,9 +268,7 @@ def test_enumerate_repo_only(mock_api, capsys):
         "scopes": ['repo', 'workflow']
     }
 
-    mock_api.return_value.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
-    ]
+    mock_api.return_value.retrieve_run_logs.return_value = BASE_MOCK_RUNNER
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
 

diff --git a/unit_test/test_repo_enumerate.py b/unit_test/test_repo_enumerate.py
@@ -44,9 +44,21 @@ def test_enumerate_repo():
         "scopes": ['repo', 'workflow']
     }
 
-    mock_api.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
-    ]
+    mock_api.retrieve_run_logs.return_value = [{
+        "machine_name": "unittest1",
+        "runner_name": "much_unit_such_test",
+        "runner_type": "organization",
+        "non_ephemeral": False,
+        "token_permissions": {
+            "Actions": "write"
+        },
+        "runner_group": "Default",
+        "requested_labels": [
+            "self-hosted",
+            "Linux",
+            "X64"
+        ]
+    }]
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
     test_repo = Repository(repo_data)
@@ -74,9 +86,21 @@ def test_enumerate_repo_admin():
         "scopes": ['repo', 'workflow']
     }
 
-    mock_api.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
-    ]
+    mock_api.retrieve_run_logs.return_value = [{
+        "machine_name": "unittest1",
+        "runner_name": "much_unit_such_test",
+        "runner_type": "organization",
+        "non_ephemeral": False,
+        "token_permissions": {
+            "Actions": "write"
+        },
+        "runner_group": "Default",
+        "requested_labels": [
+            "self-hosted",
+            "Linux",
+            "X64"
+        ]
+    }]
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
     repo_data['permissions']['admin'] = True