Skip to content

Commit

Permalink
Merge pull request #215 from InnopolisUni/system_monitoring
Browse files Browse the repository at this point in the history
System monitoring
  • Loading branch information
InnopolisUni authored Dec 8, 2023
2 parents e9e6c04 + aa30bee commit 088167f
Show file tree
Hide file tree
Showing 18 changed files with 482 additions and 20 deletions.
2 changes: 1 addition & 1 deletion config/augmentations_train/osl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ defaults:
- random_horizontal_flip
- /augmentations/color:
- /augmentations/postprocessing:
- to_tensor
- to_tensor_torchvision
- normalize
# task:
# - one-shot-learning
2 changes: 1 addition & 1 deletion config/augmentations_val/osl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ defaults:
- random_horizontal_flip
- /augmentations/color:
- /augmentations/postprocessing:
- to_tensor
- to_tensor_torchvision
- normalize
# task:
# - one-shot-learning
2 changes: 1 addition & 1 deletion config/datasets/detection/detection_lungs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ description: Набор данных содержит РГ снимки легк
markup_info: Набор данных содержит разметку bounding box легких и патологий.
date_time: 18.07.2022

_target_: innofw.core.datamodules.lightning_datamodules.detection.YOLOv5DataModule
_target_: innofw.core.integrations.ultralytics.datamodule.UltralyticsDataModuleAdapter

train:
source: https://api.blackhole.ai.innopolis.university/public-datasets/lungs_detection/train.zip
Expand Down
24 changes: 9 additions & 15 deletions config/experiments/classification/KG_090323_wpng92_mnist.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,19 @@
defaults:
- override /models: classification/resnet.yaml
- override /datasets: classification/classification_mnist.yaml
- override /augmentations_train: none
- override /augmentations_train: none #classification.yaml
- override /augmentations_val: none
- override /augmentations_test: none
- override /losses: log_loss.yaml
- override /optimizers: lion
# - override /callbacks: classification.yaml

- override /optimizers: adam
- override /schedulers:

project: "mnist_classification"
task: "image-classification"
random_seed: 42
original_work_dir: ${hydra:runtime.cwd}
weights_freq: 1
batch_size: 32
stop_param: 1
epochs: 10

#wandb:
# enable: True
# project: mnist
# group: 160323_just_testing_wandb
# job_type: training
batch_size: 128
epochs: 5
accelerator: "gpu"
devices: 2
num_workers: 1
weights_freq: 7
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,7 @@ project: "faces"
task: "one-shot-learning"
random_seed: 42
weights_freq: 1
epochs: 20
epochs: 50
accelerator: "gpu"
devices: 1
batch_size: 64
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# @package _global_
# Experiment config. The defaults list below selects the unet_smp model, the
# segmentation_arable dataset, the adam optimizer and segmentation_losses.
defaults:
- override /models: semantic-segmentation/unet_smp.yaml
- override /datasets: semantic-segmentation/segmentation_arable.yaml
- override /optimizers: adam.yaml
- override /losses: segmentation_losses

# Trainer options.
# NOTE(review): key nesting cannot be confirmed from this view; presumably
# overfit_batches is a child of trainers -- verify, and check whether
# accelerator is meant to be nested as well.
trainers:
overfit_batches: 20 # fit only 20 val batches and 40 train batches

accelerator: gpu

# Model options: in_channels is set to 4.
models:
in_channels: 4


# Run-level settings: project name, task type, seed, checkpoint frequency
# and batch size.
project: "segmentation"
task: "image-segmentation"
random_seed: 42
weights_freq: 1
batch_size: 10
39 changes: 39 additions & 0 deletions examples/measure_anomaly_detection.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Runs train.py with the anomaly-detection (time series, LSTM) experiment in
# the background and samples CPU, RAM and GPU usage once per second until the
# training process exits.
#
# Run like this (from the repository root or from examples/):
#   sudo -E env "PATH=$PATH" bash examples/measure_anomaly_detection.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Writes nohup.out, cpu_log, mem_log and nvidiasmi_log into the working
# directory, then runs examples/measurements_compaction.py.

# NOTE(review): iotop is installed here but not used by this script; confirm
# whether it is still required.
sudo apt-get install -y iotop > /dev/null

# All relative paths below assume the repository root, so step up when the
# script was started from inside examples/.
if [ "${PWD##*/}" = "examples" ]; then
  cd .. || exit 1
fi

# Start every run with fresh log files.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Only nvidia-smi rows that contain this bus id are kept in nvidiasmi_log.
gpu_bus_id="${GPU_BUS_ID:-00000000:25:00.0}"
gpu_fields="timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used"

nohup time sudo -E env "PATH=$PATH" python train.py \
  experiments=anomaly-detection/IM_040822_rbe23lls_anomaly_detection_timeseries_lstm.yaml \
  epochs=100 optimizers=adam accelerator=gpu &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

# Sample for as long as the background job is alive. kill -0 only probes the
# PID and sends no signal. Probing the job directly does not depend on
# nohup.out being opened: counting lsof entries can miss a job that has not
# opened the file yet, and nohup only creates nohup.out when stdout is a
# terminal.
while kill -0 "$PID" 2> /dev/null; do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  sudo nvidia-smi --query-gpu="$gpu_fields" --format=csv \
    | grep -F -- "$gpu_bus_id" >> nvidiasmi_log
  sleep 1
done

# Reap the finished job and surface a failed run. No kill is needed: the loop
# ends only after the job has exited, and signalling a stale PID could hit an
# unrelated process that reused it.
wait "$PID"
status=$?
if [ "$status" -ne 0 ]; then
  echo "measured command exited with status $status" >&2
fi

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py
42 changes: 42 additions & 0 deletions examples/measure_complexing.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
# Runs the band_composer.py preprocessing step on the sentinel2 test image in
# the background and samples CPU, RAM and GPU usage once per second until the
# process exits.
#
# Run like this (from the repository root or from examples/):
#   sudo -E env "PATH=$PATH" bash examples/measure_complexing.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Writes nohup.out, cpu_log, mem_log and nvidiasmi_log into the working
# directory, then runs examples/measurements_compaction.py.

# NOTE(review): iotop is installed here but not used by this script; confirm
# whether it is still required.
sudo apt-get install -y iotop > /dev/null

# All relative paths below assume the repository root, so step up when the
# script was started from inside examples/.
if [ "${PWD##*/}" = "examples" ]; then
  cd .. || exit 1
fi

# Start every run with fresh log files.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Only nvidia-smi rows that contain this bus id are kept in nvidiasmi_log.
gpu_bus_id="${GPU_BUS_ID:-00000000:25:00.0}"
gpu_fields="timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used"

# Each continuation keeps a space before the backslash: backslash-newline is
# removed entirely by the shell, so without it adjacent arguments would fuse.
nohup time sudo -E env "PATH=$PATH" python innofw/utils/data_utils/preprocessing/band_composer.py \
  --src_type sentinel2 \
  --src_path tests/data/images/other/satellite_cropped/sentinel2/one \
  --channels "[\"RED\", \"GRN\", \"BLU\", \"NIR\"]" &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

# Sample for as long as the background job is alive. kill -0 only probes the
# PID and sends no signal. Probing the job directly does not depend on
# nohup.out being opened: counting lsof entries can miss a job that has not
# opened the file yet, and nohup only creates nohup.out when stdout is a
# terminal.
while kill -0 "$PID" 2> /dev/null; do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  sudo nvidia-smi --query-gpu="$gpu_fields" --format=csv \
    | grep -F -- "$gpu_bus_id" >> nvidiasmi_log
  sleep 1
done

# Reap the finished job and surface a failed run. No kill is needed: the loop
# ends only after the job has exited, and signalling a stale PID could hit an
# unrelated process that reused it.
wait "$PID"
status=$?
if [ "$status" -ne 0 ]; then
  echo "measured command exited with status $status" >&2
fi

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py
32 changes: 32 additions & 0 deletions examples/measure_detection_yolov5.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Runs train.py with the YOLOv5 detection experiment in the background and
# samples CPU, RAM and GPU usage once per second until the training process
# exits.
#
# Run like this (from the repository root or from examples/):
#   sudo -E env "PATH=$PATH" bash examples/measure_detection_yolov5.sh
#
# Writes nohup.out, cpu_log, mem_log and nvidiasmi_log into the working
# directory. Unlike the sibling measure_*.sh scripts, the nvidia-smi output is
# logged unfiltered and no compaction step is run afterwards.

# NOTE(review): iotop is installed here but not used by this script; confirm
# whether it is still required.
sudo apt-get install -y iotop > /dev/null

# All relative paths below assume the repository root, so step up when the
# script was started from inside examples/.
if [ "${PWD##*/}" = "examples" ]; then
  cd .. || exit 1
fi

# Start every run with fresh log files.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

gpu_fields="timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used"

nohup time sudo -E env "PATH=$PATH" python train.py \
  experiments=detection/KA_120722_8adfcdaa_yolov5.yaml \
  epochs=100 optimizers=adam accelerator=gpu &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log+mem_log and nvidiasmi_log"
echo "$PID"

# Sample only while the background job is alive (kill -0 probes the PID and
# sends no signal). An unconditional loop here would never terminate, leaving
# the monitor running after training and the cleanup below unreachable.
while kill -0 "$PID" 2> /dev/null; do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  sudo nvidia-smi --query-gpu="$gpu_fields" --format=csv >> nvidiasmi_log
  sleep 1
done

# Reap the finished job and surface a failed run. The job has already exited
# at this point, so there is nothing left to kill.
wait "$PID"
status=$?
if [ "$status" -ne 0 ]; then
  echo "measured command exited with status $status" >&2
fi
39 changes: 39 additions & 0 deletions examples/measure_image_classification.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Runs train.py with the MNIST image-classification experiment in the
# background and samples CPU, RAM and GPU usage once per second until the
# training process exits.
#
# Run like this (from the repository root or from examples/):
#   sudo -E env "PATH=$PATH" bash examples/measure_image_classification.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Writes nohup.out, cpu_log, mem_log and nvidiasmi_log into the working
# directory, then runs examples/measurements_compaction.py.

# NOTE(review): iotop is installed here but not used by this script; confirm
# whether it is still required.
sudo apt-get install -y iotop > /dev/null

# All relative paths below assume the repository root, so step up when the
# script was started from inside examples/.
if [ "${PWD##*/}" = "examples" ]; then
  cd .. || exit 1
fi

# Start every run with fresh log files.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Only nvidia-smi rows that contain this bus id are kept in nvidiasmi_log.
gpu_bus_id="${GPU_BUS_ID:-00000000:25:00.0}"
gpu_fields="timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used"

nohup time sudo -E env "PATH=$PATH" python train.py \
  experiments=classification/KG_090323_wpng92_mnist.yaml epochs=100 &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

# Sample for as long as the background job is alive. kill -0 only probes the
# PID and sends no signal. Probing the job directly does not depend on
# nohup.out being opened: counting lsof entries can miss a job that has not
# opened the file yet, and nohup only creates nohup.out when stdout is a
# terminal.
while kill -0 "$PID" 2> /dev/null; do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  sudo nvidia-smi --query-gpu="$gpu_fields" --format=csv \
    | grep -F -- "$gpu_bus_id" >> nvidiasmi_log
  sleep 1
done

# Reap the finished job and surface a failed run. No kill is needed: the loop
# ends only after the job has exited, and signalling a stale PID could hit an
# unrelated process that reused it.
wait "$PID"
status=$?
if [ "$status" -ne 0 ]; then
  echo "measured command exited with status $status" >&2
fi

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py
39 changes: 39 additions & 0 deletions examples/measure_lin_regression.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Runs train.py with the linear-regression experiment in the background and
# samples CPU, RAM and GPU usage once per second until the training process
# exits.
#
# Run like this (from the repository root or from examples/):
#   sudo -E env "PATH=$PATH" bash examples/measure_lin_regression.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Writes nohup.out, cpu_log, mem_log and nvidiasmi_log into the working
# directory, then runs examples/measurements_compaction.py.

# NOTE(review): iotop is installed here but not used by this script; confirm
# whether it is still required.
sudo apt-get install -y iotop > /dev/null

# All relative paths below assume the repository root, so step up when the
# script was started from inside examples/.
if [ "${PWD##*/}" = "examples" ]; then
  cd .. || exit 1
fi

# Start every run with fresh log files.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Only nvidia-smi rows that contain this bus id are kept in nvidiasmi_log.
gpu_bus_id="${GPU_BUS_ID:-00000000:25:00.0}"
gpu_fields="timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used"

nohup time sudo -E env "PATH=$PATH" python train.py \
  experiments=regression/KA_130722_9f7134db_linear_regression.yaml &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

# Sample for as long as the background job is alive. kill -0 only probes the
# PID and sends no signal. Probing the job directly does not depend on
# nohup.out being opened: counting lsof entries can miss a job that has not
# opened the file yet, and nohup only creates nohup.out when stdout is a
# terminal.
while kill -0 "$PID" 2> /dev/null; do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  sudo nvidia-smi --query-gpu="$gpu_fields" --format=csv \
    | grep -F -- "$gpu_bus_id" >> nvidiasmi_log
  sleep 1
done

# Reap the finished job and surface a failed run. No kill is needed: the loop
# ends only after the job has exited, and signalling a stale PID could hit an
# unrelated process that reused it.
wait "$PID"
status=$?
if [ "$status" -ne 0 ]; then
  echo "measured command exited with status $status" >&2
fi

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py
39 changes: 39 additions & 0 deletions examples/measure_oneshot_learning.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Runs train.py with the one-shot-learning experiment in the background and
# samples CPU, RAM and GPU usage once per second until the training process
# exits.
#
# Run like this (from the repository root or from examples/):
#   sudo -E env "PATH=$PATH" bash examples/measure_oneshot_learning.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Writes nohup.out, cpu_log, mem_log and nvidiasmi_log into the working
# directory, then runs examples/measurements_compaction.py.

# NOTE(review): iotop is installed here but not used by this script; confirm
# whether it is still required.
sudo apt-get install -y iotop > /dev/null

# All relative paths below assume the repository root, so step up when the
# script was started from inside examples/.
if [ "${PWD##*/}" = "examples" ]; then
  cd .. || exit 1
fi

# Start every run with fresh log files.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Only nvidia-smi rows that contain this bus id are kept in nvidiasmi_log.
gpu_bus_id="${GPU_BUS_ID:-00000000:25:00.0}"
gpu_fields="timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used"

nohup time sudo -E env "PATH=$PATH" python train.py \
  experiments=one-shot-learning/IM_190722_vwer3f23_oneshotlearning.yaml &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

# Sample for as long as the background job is alive. kill -0 only probes the
# PID and sends no signal. Probing the job directly does not depend on
# nohup.out being opened: counting lsof entries can miss a job that has not
# opened the file yet, and nohup only creates nohup.out when stdout is a
# terminal.
while kill -0 "$PID" 2> /dev/null; do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  sudo nvidia-smi --query-gpu="$gpu_fields" --format=csv \
    | grep -F -- "$gpu_bus_id" >> nvidiasmi_log
  sleep 1
done

# Reap the finished job and surface a failed run. No kill is needed: the loop
# ends only after the job has exited, and signalling a stale PID could hit an
# unrelated process that reused it.
wait "$PID"
status=$?
if [ "$status" -ne 0 ]; then
  echo "measured command exited with status $status" >&2
fi

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py
39 changes: 39 additions & 0 deletions examples/measure_segmentation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# Runs train.py with the semantic-segmentation (unet) experiment in the
# background and samples CPU, RAM and GPU usage once per second until the
# training process exits.
#
# Run like this (from the repository root or from examples/):
#   sudo -E env "PATH=$PATH" bash examples/measure_segmentation.sh
#
# Environment:
#   GPU_BUS_ID  PCI bus id of the GPU whose nvidia-smi rows are logged
#               (default: 00000000:25:00.0).
#
# Writes nohup.out, cpu_log, mem_log and nvidiasmi_log into the working
# directory, then runs examples/measurements_compaction.py.

# NOTE(review): iotop is installed here but not used by this script; confirm
# whether it is still required.
sudo apt-get install -y iotop > /dev/null

# All relative paths below assume the repository root, so step up when the
# script was started from inside examples/.
if [ "${PWD##*/}" = "examples" ]; then
  cd .. || exit 1
fi

# Start every run with fresh log files.
echo "" > nohup.out
echo "" > nvidiasmi_log
echo "" > mem_log
echo "" > cpu_log

export NO_CLI=True

# Only nvidia-smi rows that contain this bus id are kept in nvidiasmi_log.
gpu_bus_id="${GPU_BUS_ID:-00000000:25:00.0}"
gpu_fields="timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used"

nohup time sudo -E env "PATH=$PATH" python train.py \
  experiments=semantic-segmentation/KG_041223_ingieq921_unet.yaml \
  epochs=100 optimizers=adam accelerator=gpu &
PID=$!
echo "Saving cpu+ram info and nvidia-smi to cpu_log + mem_log and nvidiasmi_log"
echo "$PID"

# Sample for as long as the background job is alive. kill -0 only probes the
# PID and sends no signal. Probing the job directly does not depend on
# nohup.out being opened: counting lsof entries can miss a job that has not
# opened the file yet, and nohup only creates nohup.out when stdout is a
# terminal.
while kill -0 "$PID" 2> /dev/null; do
  top -b -n 1 | grep -E 'Cpu' >> cpu_log
  top -b -n 1 | grep -E 'MiB Mem' >> mem_log
  sudo nvidia-smi --query-gpu="$gpu_fields" --format=csv \
    | grep -F -- "$gpu_bus_id" >> nvidiasmi_log
  sleep 1
done

# Reap the finished job and surface a failed run. No kill is needed: the loop
# ends only after the job has exited, and signalling a stale PID could hit an
# unrelated process that reused it.
wait "$PID"
status=$?
if [ "$status" -ne 0 ]; then
  echo "measured command exited with status $status" >&2
fi

sudo -E env "PATH=$PATH" python examples/measurements_compaction.py
Loading

0 comments on commit 088167f

Please sign in to comment.