diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 51f0a3f..41a9416 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -31,6 +31,9 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
         run: cat /etc/os-release
+      # set via GITHUB_ENV so the variable persists into later steps; a plain `export` only lives for this step's shell
+      - name: Telemetry off
+        run: echo "TRAINER_TELEMETRY=0" >> $GITHUB_ENV
       - name: Install dependencies
         run: |
           sudo apt-get update
diff --git a/trainer/VERSION b/trainer/VERSION
index 9e2dabb..4974c8e 100644
--- a/trainer/VERSION
+++ b/trainer/VERSION
@@ -1 +1 @@
-v0.0.17
+v0.0.18
diff --git a/trainer/analytics.py b/trainer/analytics.py
new file mode 100644
index 0000000..6291c6b
--- /dev/null
+++ b/trainer/analytics.py
@@ -0,0 +1,18 @@
+import os
+
+import requests
+
+# read once at import time; set TRAINER_TELEMETRY before importing trainer
+telemetry = os.environ.get("TRAINER_TELEMETRY")
+
+
+def ping_training_run():
+    if telemetry == "0":
+        return
+    URL = "https://coqui.gateway.scarf.sh/trainer/training_run"
+    try:
+        # a bounded timeout and a broad catch keep telemetry from ever
+        # hanging or crashing a training run
+        _ = requests.get(URL, timeout=5)
+    except requests.RequestException:
+        pass
diff --git a/trainer/callbacks.py b/trainer/callbacks.py
index ead5f8c..5da4100 100644
--- a/trainer/callbacks.py
+++ b/trainer/callbacks.py
@@ -1,4 +1,4 @@
-from typing import Dict, Callable
+from typing import Callable, Dict
 
 
 class TrainerCallback:
diff --git a/trainer/trainer.py b/trainer/trainer.py
index 9d0ca6b..f3e6320 100644
--- a/trainer/trainer.py
+++ b/trainer/trainer.py
@@ -19,6 +19,7 @@
 from torch.nn.parallel import DistributedDataParallel as DDP_th
 from torch.utils.data import DataLoader
 
+from trainer.analytics import ping_training_run
 from trainer.callbacks import TrainerCallback
 from trainer.generic_utils import (
     KeepAverage,
@@ -241,6 +242,10 @@ class TrainerArgs(Coqpit):
         default=False,
         metadata={"help": "Skip training and only run evaluation and test."},
     )
+    start_with_eval: bool = field(
+        default=False,
+        metadata={"help": "Start with evaluation and test."},
+    )
     small_run: int = field(
         default=None,
         metadata={
@@ -388,6 +393,7 @@ def __init__(  # pylint: disable=dangerous-default-value
         self.grad_accum_steps = args.grad_accum_steps
         self.overfit_batch = args.overfit_batch
         self.skip_train_epoch = args.skip_train_epoch
+        self.start_with_eval = args.start_with_eval
 
         assert self.grad_accum_steps > 0, " [!] grad_accum_steps must be greater than 0."
@@ -519,6 +525,7 @@ def __init__(  # pylint: disable=dangerous-default-value
         self.callbacks.on_init_end(self)
         self.dashboard_logger.add_config(config)
         self.save_training_script()
+        ping_training_run()
 
     def save_training_script(self):
         """Save the training script to tracking dashboard and output path."""
@@ -1519,7 +1526,7 @@ def _fit(self) -> None:
             self.keep_avg_eval = KeepAverage() if self.config.run_eval else None
             self.epochs_done = epoch
             self.c_logger.print_epoch_start(epoch, self.config.epochs, self.output_path)
-            if not self.skip_train_epoch:
+            if not self.skip_train_epoch and not self.start_with_eval:
                 self.train_epoch()
             if self.config.run_eval:
                 self.eval_epoch()
@@ -1532,6 +1539,7 @@
             if self.args.rank in [None, 0]:
                 self.save_best_model()
             self.callbacks.on_epoch_end(self)
+            self.start_with_eval = False
 
     def fit_with_largest_batch_size(self, starting_batch_size=2048) -> None:
         cuda_meminfo()
@@ -1552,7 +1560,7 @@ def fit_with_largest_batch_size(self, starting_batch_size=2048) -> None:
                     torch.cuda.empty_cache()
                 else:
                     raise
-            except Exception as exception: #pylint: disable=broad-except
+            except Exception as exception:  # pylint: disable=broad-except
                 # catches the torch.cuda.OutOfMemoryError
                 if bs > 1 and should_reduce_batch_size(exception):
                     bs //= 2
diff --git a/trainer/utils/cpu_memory.py b/trainer/utils/cpu_memory.py
index 3ec788e..2f4ff97 100644
--- a/trainer/utils/cpu_memory.py
+++ b/trainer/utils/cpu_memory.py
@@ -8,8 +8,9 @@
 def get_available_cpu_memory():
     available_memory = psutil.virtual_memory().available
     try:
-        import resource # pylint: disable=import-outside-toplevel
-        _, hard_mem_limit = resource.getrlimit(resource.RLIMIT_AS) #pylint: disable=unused-variable
+        import resource  # pylint: disable=import-outside-toplevel
+
+        _, hard_mem_limit = resource.getrlimit(resource.RLIMIT_AS)  # pylint: disable=unused-variable
         if hard_mem_limit != resource.RLIM_INFINITY:
             used_memory = this_process.memory_info().vms
             available_memory = min(hard_mem_limit - used_memory, available_memory)
@@ -21,9 +22,9 @@
 
 def set_cpu_memory_limit(num_gigabytes):
     try:
-        import resource # pylint: disable=import-outside-toplevel
+        import resource  # pylint: disable=import-outside-toplevel
 
-        num_bytes = int(num_gigabytes * 2 ** 30)
+        num_bytes = int(num_gigabytes * 2**30)
         _, hard_limit = resource.getrlimit(resource.RLIMIT_AS)
         if hard_limit != resource.RLIM_INFINITY:
             hard_limit = min(num_bytes, hard_limit)
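Note on the telemetry opt-out: `trainer/analytics.py` reads `TRAINER_TELEMETRY` into a module-level variable once, at import time, so the variable must be set before anything from `trainer` is imported. A minimal sketch of opting out from inside a training script, assuming the package's usual top-level exports:

```python
import os

# Must run before any `trainer` import: trainer/analytics.py captures
# TRAINER_TELEMETRY at module import time, not at call time.
os.environ["TRAINER_TELEMETRY"] = "0"

from trainer import Trainer, TrainerArgs  # noqa: E402  (import after env setup)
```

Setting the variable in the shell (`TRAINER_TELEMETRY=0 python train.py`) works for the same reason: it is in the environment before the import happens.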
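Usage of the new `start_with_eval` flag: `_fit()` skips the training epoch while the flag is set and clears it at the end of the first loop iteration, so exactly one evaluation/test pass runs before normal training resumes. A small sketch, assuming only an installed `trainer` package:

```python
from trainer import TrainerArgs

# Run one evaluation/test pass before the first training epoch; _fit()
# resets the flag afterwards, so later epochs train as usual.
args = TrainerArgs(start_with_eval=True)
assert args.start_with_eval
```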