Skip to content

Commit

Permalink
Enable Client Logs with "client-logs" (#83)
Browse files Browse the repository at this point in the history
Co-authored-by: github-actions <[email protected]>
  • Loading branch information
ric-evans and github-actions authored Oct 17, 2023
1 parent a849202 commit 4757568
Show file tree
Hide file tree
Showing 14 changed files with 109 additions and 68 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ _Launch a new scan of an event_
| `"max_pixel_reco_time"` | int | *[REQUIRED]* | the maximum amount of time each pixel's reco should take (accurate values will evict pixels from slow workers, thereby re-delivering them to faster workers -- slow workers are unavoidable due to non-deterministic errors)
| `"scanner_server_memory"` | str | default: `1024M` | how much memory for the scanner server to request
| `"memory"` | str | default: `8G` | how much memory per client worker to request
| `"debug_mode"` | str or list | default: None | what debug mode(s) to use: `"logs-dump"` redirects each reco's logs/prints to its parent scanner client's stderr/stdout
| `"debug_mode"` | str or list | default: None | what debug mode(s) to use: `"client-logs"` collects the scanner clients' stderr/stdout, including icetray logs (scans using this mode are limited in the number of workers)
| `"predictive_scanning_threshold"` | float | default: `1.0` | the predictive scanning threshold [0.1, 1.0] (see [Skymap Scanner](https://github.com/icecube/skymap_scanner))
| `"classifiers"` | <code>dict[str, str &#124; bool &#124; float &#124; int]</code> | default: `{}` | a user-defined collection of labels, attributes, etc. -- this is constrained in size and is intended for user-defined metadata only
| `"manifest_projection"` | list | default: all fields but [these](#manifest-fields-excluded-by-default-in-response) | which `Manifest` fields to include in the response (include `*` to include all fields)
Expand Down Expand Up @@ -286,6 +286,7 @@ Pseudo-code:
},
cluster_id: int,
n_workers: int,
starter_info: dict,
},
...
{
Expand All @@ -296,6 +297,7 @@ Pseudo-code:
},
cluster_id: int,
n_workers: int,
starter_info: dict,
},
...
],
Expand Down
4 changes: 2 additions & 2 deletions clientmanager/clientmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ def wait_for_file(waitee: Path, wait_time: int) -> Path:
help="does everything except submitting the worker(s)",
)
sub_parser.add_argument(
"--logs-directory",
"--spool-logs-directory",
default=None,
type=Path,
help="where to save logs (if not given, logs are not saved)",
help="where to spool (persist) logs -- if not given, logs are not kept",
)

# worker args
Expand Down
7 changes: 5 additions & 2 deletions clientmanager/condor/act.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,13 @@ def _act(args: argparse.Namespace, schedd_obj: htcondor.Schedd) -> None:
# make connections -- do now so we don't have any surprises downstream
skydriver_rc = utils.connect_to_skydriver()
# start
submit_result_obj = starter.start(
submit_dict, submit_result_obj = starter.start(
schedd_obj=schedd_obj,
# starter CL args -- helper
dryrun=args.dryrun,
logs_directory=args.logs_directory if args.logs_directory else None,
spool_logs_directory=args.spool_logs_directory
if args.spool_logs_directory
else None,
# starter CL args -- worker
memory=args.memory,
n_cores=args.n_cores,
Expand All @@ -58,6 +60,7 @@ def _act(args: argparse.Namespace, schedd_obj: htcondor.Schedd) -> None:
},
cluster_id=submit_result_obj.cluster(),
n_workers=submit_result_obj.num_procs(),
starter_info=submit_dict,
)
LOGGER.info("Sent cluster info to SkyDriver")
case "stop":
Expand Down
43 changes: 27 additions & 16 deletions clientmanager/condor/starter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import datetime as dt
from pathlib import Path
from typing import Any

import htcondor # type: ignore[import]

Expand All @@ -20,10 +21,6 @@ def make_condor_logs_subdir(directory: Path) -> Path:
return subdir


def _get_log_fpath(logs_subdir: Path) -> Path:
return logs_subdir / "clientmanager.log"


def make_condor_job_description( # pylint: disable=too-many-arguments
logs_subdir: Path | None,
# condor args
Expand All @@ -33,8 +30,8 @@ def make_condor_job_description( # pylint: disable=too-many-arguments
image: str,
client_startup_json_s3: S3File,
client_args_string: str,
) -> htcondor.Submit:
"""Make the condor job description (submit object)."""
) -> dict[str, Any]:
"""Make the condor job description (dict)."""

# NOTE:
# In the newest version of condor we could use:
Expand Down Expand Up @@ -71,7 +68,7 @@ def make_condor_job_description( # pylint: disable=too-many-arguments
#
"should_transfer_files": "YES",
"transfer_input_files": client_startup_json_s3.url,
"transfer_output_files": '""', # must be quoted
"transfer_output_files": '""', # must be quoted for "none"
#
"request_cpus": str(n_cores),
"request_memory": memory,
Expand All @@ -84,21 +81,34 @@ def make_condor_job_description( # pylint: disable=too-many-arguments
{
"output": str(logs_subdir / "client-$(ProcId).out"),
"error": str(logs_subdir / "client-$(ProcId).err"),
"log": str(_get_log_fpath(logs_subdir)),
"log": str(logs_subdir / "clientmanager.log"),
}
)
# https://htcondor.readthedocs.io/en/latest/users-manual/file-transfer.html#specifying-if-and-when-to-transfer-files
submit_dict.update(
{
"transfer_output_files": ",".join(
[
submit_dict["output"],
submit_dict["error"],
submit_dict["log"],
]
),
"when_to_transfer_output": "ON_EXIT_OR_EVICT",
}
)
else:
# NOTE: this needs to be removed if we ARE transferring files
submit_dict["initialdir"] = "/tmp"

return htcondor.Submit(submit_dict)
return submit_dict


def start(
schedd_obj: htcondor.Schedd,
# starter CL args -- helper
dryrun: bool,
logs_directory: Path | None,
spool_logs_directory: Path | None,
# starter CL args -- worker
memory: str,
n_cores: int,
Expand All @@ -107,10 +117,10 @@ def start(
client_args: list[tuple[str, str]],
client_startup_json_s3: S3File,
image: str,
) -> htcondor.SubmitResult:
) -> tuple[dict[str, Any], htcondor.SubmitResult]:
"""Main logic."""
if logs_directory:
logs_subdir = make_condor_logs_subdir(logs_directory)
if spool_logs_directory:
logs_subdir = make_condor_logs_subdir(spool_logs_directory)
spool = True
else:
logs_subdir = None
Expand All @@ -131,7 +141,7 @@ def start(
)

# make condor job description
submit_obj = make_condor_job_description(
submit_dict = make_condor_job_description(
logs_subdir,
# condor args
memory,
Expand All @@ -141,12 +151,13 @@ def start(
client_startup_json_s3,
client_args_string,
)
submit_obj = htcondor.Submit(submit_dict)
LOGGER.info(submit_obj)

# dryrun?
if dryrun:
LOGGER.error("Script Aborted: Condor job not submitted")
return
raise RuntimeError("Dry run completed successfully")

# submit
submit_result_obj = schedd_obj.submit(
Expand All @@ -163,4 +174,4 @@ def start(
)
schedd_obj.spool(jobs)

return submit_result_obj
return submit_dict, submit_result_obj
3 changes: 2 additions & 1 deletion clientmanager/k8s/act.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def _act(args: argparse.Namespace, k8s_api: kubernetes.client.ApiClient) -> None
# make connections -- do now so we don't have any surprises downstream
skydriver_rc = utils.connect_to_skydriver()
# start
starter.start(
k8s_job_dict = starter.start(
k8s_api=k8s_api,
cluster_id=cluster_id,
# k8s CL args
Expand Down Expand Up @@ -106,6 +106,7 @@ def _act(args: argparse.Namespace, k8s_api: kubernetes.client.ApiClient) -> None
},
cluster_id=cluster_id,
n_workers=args.n_workers,
starter_info=k8s_job_dict,
)
LOGGER.info("Sent cluster info to SkyDriver")
case "stop":
Expand Down
3 changes: 2 additions & 1 deletion clientmanager/k8s/starter.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ def start(
SECRET_FORWARDED_ENV_VARS,
)
try:
# must be natively json-encodable
LOGGER.info(json.dumps(k8s_job_dict, indent=4))
except json.decoder.JSONDecodeError:
LOGGER.info(pprint.pformat(k8s_job_dict, indent=4))
Expand All @@ -211,7 +212,7 @@ def start(
# dryrun?
if dryrun:
LOGGER.error("Script Aborted: K8s job not submitted")
return k8s_job_dict
raise RuntimeError("Dry run completed successfully")

# create namespace
# kubernetes.client.CoreV1Api(k8s_api).create_namespace(
Expand Down
3 changes: 3 additions & 0 deletions clientmanager/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import dataclasses as dc
from pathlib import Path
from typing import Any

import boto3 # type: ignore[import]
import requests
Expand Down Expand Up @@ -33,6 +34,7 @@ def update_skydriver(
location: dict[str, str],
cluster_id: str | int,
n_workers: int,
starter_info: dict[str, Any],
) -> None:
"""Send SkyDriver updates from the `submit_result`."""
skydriver_rc.request_seq(
Expand All @@ -44,6 +46,7 @@ def update_skydriver(
"location": location,
"cluster_id": str(cluster_id),
"n_workers": n_workers,
"starter_info": starter_info,
}
},
)
Expand Down
36 changes: 18 additions & 18 deletions dependencies-from-Dockerfile.log
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
# pip freeze
########################################################################
backoff==2.2.1
boto3==1.28.63
botocore==1.31.63
boto3==1.28.65
botocore==1.31.65
cachetools==5.3.1
certifi==2023.7.22
cffi==1.16.0
Expand Down Expand Up @@ -56,7 +56,7 @@ thrift==0.16.0
tornado==6.3.3
typeguard==4.1.5
typing_extensions==4.8.0
urllib3==1.26.17
urllib3==1.26.18
websocket-client==1.6.4
wipac-dev-tools==1.7.0
wipac-rest-tools==1.5.2
Expand All @@ -73,19 +73,19 @@ pip==23.2.1
pipdeptree==2.13.0
setuptools==65.5.1
skydriver-clientmanager
├── boto3 [required: Any, installed: 1.28.63]
│ ├── botocore [required: >=1.31.63,<1.32.0, installed: 1.31.63]
├── boto3 [required: Any, installed: 1.28.65]
│ ├── botocore [required: >=1.31.65,<1.32.0, installed: 1.31.65]
│ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1]
│ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.8.2]
│ │ │ └── six [required: >=1.5, installed: 1.16.0]
│ │ └── urllib3 [required: >=1.25.4,<2.1, installed: 1.26.17]
│ │ └── urllib3 [required: >=1.25.4,<2.1, installed: 1.26.18]
│ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1]
│ └── s3transfer [required: >=0.7.0,<0.8.0, installed: 0.7.0]
│ └── botocore [required: >=1.12.36,<2.0a.0, installed: 1.31.63]
│ └── botocore [required: >=1.12.36,<2.0a.0, installed: 1.31.65]
│ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1]
│ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.8.2]
│ │ └── six [required: >=1.5, installed: 1.16.0]
│ └── urllib3 [required: >=1.25.4,<2.1, installed: 1.26.17]
│ └── urllib3 [required: >=1.25.4,<2.1, installed: 1.26.18]
├── coloredlogs [required: Any, installed: 15.0.1]
│ └── humanfriendly [required: >=9.1, installed: 10.0]
├── dacite [required: Any, installed: 1.8.1]
Expand All @@ -106,16 +106,16 @@ skydriver-clientmanager
│ │ ├── certifi [required: >=2017.4.17, installed: 2023.7.22]
│ │ ├── charset-normalizer [required: >=2,<4, installed: 3.3.0]
│ │ ├── idna [required: >=2.5,<4, installed: 3.4]
│ │ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.17]
│ │ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.18]
│ ├── requests-oauthlib [required: Any, installed: 1.3.1]
│ │ ├── oauthlib [required: >=3.0.0, installed: 3.2.2]
│ │ └── requests [required: >=2.0.0, installed: 2.31.0]
│ │ ├── certifi [required: >=2017.4.17, installed: 2023.7.22]
│ │ ├── charset-normalizer [required: >=2,<4, installed: 3.3.0]
│ │ ├── idna [required: >=2.5,<4, installed: 3.4]
│ │ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.17]
│ │ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.18]
│ ├── six [required: >=1.9.0, installed: 1.16.0]
│ ├── urllib3 [required: >=1.24.2,<2.0, installed: 1.26.17]
│ ├── urllib3 [required: >=1.24.2,<2.0, installed: 1.26.18]
│ └── websocket-client [required: >=0.32.0,!=0.42.*,!=0.41.*,!=0.40.0, installed: 1.6.4]
├── motor [required: Any, installed: 3.3.1]
│ └── pymongo [required: >=4.5,<5, installed: 4.5.0]
Expand All @@ -126,7 +126,7 @@ skydriver-clientmanager
│ ├── certifi [required: >=2017.4.17, installed: 2023.7.22]
│ ├── charset-normalizer [required: >=2,<4, installed: 3.3.0]
│ ├── idna [required: >=2.5,<4, installed: 3.4]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.17]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.18]
├── tornado [required: Any, installed: 6.3.3]
├── typeguard [required: Any, installed: 4.1.5]
│ └── typing-extensions [required: >=4.7.0, installed: 4.8.0]
Expand All @@ -135,7 +135,7 @@ skydriver-clientmanager
│ │ ├── certifi [required: >=2017.4.17, installed: 2023.7.22]
│ │ ├── charset-normalizer [required: >=2,<4, installed: 3.3.0]
│ │ ├── idna [required: >=2.5,<4, installed: 3.4]
│ │ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.17]
│ │ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.18]
│ └── typing-extensions [required: Any, installed: 4.8.0]
└── wipac-rest-tools [required: Any, installed: 1.5.2]
├── cachetools [required: Any, installed: 5.3.1]
Expand All @@ -147,20 +147,20 @@ skydriver-clientmanager
│ ├── certifi [required: >=2017.4.17, installed: 2023.7.22]
│ ├── charset-normalizer [required: >=2,<4, installed: 3.3.0]
│ ├── idna [required: >=2.5,<4, installed: 3.4]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.17]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.18]
├── requests-futures [required: Any, installed: 1.0.1]
│ └── requests [required: >=1.2.0, installed: 2.31.0]
│ ├── certifi [required: >=2017.4.17, installed: 2023.7.22]
│ ├── charset-normalizer [required: >=2,<4, installed: 3.3.0]
│ ├── idna [required: >=2.5,<4, installed: 3.4]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.17]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.18]
├── tornado [required: Any, installed: 6.3.3]
└── wipac-dev-tools [required: Any, installed: 1.7.0]
├── requests [required: Any, installed: 2.31.0]
│ ├── certifi [required: >=2017.4.17, installed: 2023.7.22]
│ ├── charset-normalizer [required: >=2,<4, installed: 3.3.0]
│ ├── idna [required: >=2.5,<4, installed: 3.4]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.17]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.18]
└── typing-extensions [required: Any, installed: 4.8.0]
wheel==0.41.2
wipac-telemetry==0.3.0
Expand Down Expand Up @@ -234,7 +234,7 @@ wipac-telemetry==0.3.0
│ ├── certifi [required: >=2017.4.17, installed: 2023.7.22]
│ ├── charset-normalizer [required: >=2,<4, installed: 3.3.0]
│ ├── idna [required: >=2.5,<4, installed: 3.4]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.17]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.18]
├── opentelemetry-sdk [required: Any, installed: 1.20.0]
│ ├── opentelemetry-api [required: ==1.20.0, installed: 1.20.0]
│ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.14]
Expand All @@ -250,5 +250,5 @@ wipac-telemetry==0.3.0
│ ├── certifi [required: >=2017.4.17, installed: 2023.7.22]
│ ├── charset-normalizer [required: >=2,<4, installed: 3.3.0]
│ ├── idna [required: >=2.5,<4, installed: 3.4]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.17]
│ └── urllib3 [required: >=1.21.1,<3, installed: 1.26.18]
└── typing-extensions [required: Any, installed: 4.8.0]
8 changes: 4 additions & 4 deletions skydriver/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import dataclasses as dc
import enum
import logging
from typing import Optional
from typing import Any, Optional

import coloredlogs # type: ignore[import]
import kubernetes.client # type: ignore[import]
Expand All @@ -23,8 +23,7 @@
class DebugMode(enum.Enum):
"""Various debug modes."""

LOGS_DUMP = "logs-dump"
LOGS_DIRECTORY = "logs-directory-admin-only" # if used w/ condor, limited to one scan at a time (spool)
CLIENT_LOGS = "client-logs"


@dc.dataclass(frozen=True)
Expand Down Expand Up @@ -107,7 +106,7 @@ def __post_init__(self) -> None:
LOCAL_K8S_HOST = "local"

# known cluster locations
KNOWN_CLUSTERS = {
KNOWN_CLUSTERS: dict[str, dict[str, Any]] = {
"sub-2": {
"orchestrator": "condor",
"location": {
Expand All @@ -125,6 +124,7 @@ def __post_init__(self) -> None:
),
)
],
"max_n_clients_during_debug_mode": 10,
},
LOCAL_K8S_HOST: {
"orchestrator": "k8s",
Expand Down
Loading

0 comments on commit 4757568

Please sign in to comment.