-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmcheck-service
executable file
·135 lines (101 loc) · 2.87 KB
/
mcheck-service
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/bin/bash
# Single-instance guard.
# LOCKFILE holds the PID of the instance that currently owns the lock. The lock
# is honoured only while that PID is still alive (kill -0); a stale or empty
# lock file is taken over silently.
LOCKFILE=/tmp/mining.lock
if [ -e "${LOCKFILE}" ]; then
  LOCK_PID=$(cat "${LOCKFILE}" 2> /dev/null)
  # An empty file would make kill run without a PID (usage error), and a stale
  # PID would print "No such process"; neither is worth reporting.
  if [ -n "${LOCK_PID}" ] && kill -0 "${LOCK_PID}" 2> /dev/null; then
    echo "already running"
    exit
  fi
fi
# make sure the lockfile is removed when we exit and then claim it
# (double quotes on purpose: the path is expanded now, when the trap is set)
trap "rm -f ${LOCKFILE}; exit" INT TERM EXIT
echo $$ > "${LOCKFILE}"
################################################################################
### SCRIPT_DIR is the absolute, normalized location of the script
# dirname gives the (possibly relative) directory of $0; entering it in a
# subshell and printing pwd turns that into an absolute path without changing
# the working directory of this script.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
if [[ -z "$SCRIPT_DIR" ]]; then
  # The directory could not be entered, e.g. because its permissions are
  # evaluated differently after suid; nothing below can work without it.
  exit 1
fi
################################################################################
# Pull in the helper library that lives next to this script. The path is quoted
# so an install directory containing whitespace does not break the source.
# shellcheck source=/dev/null
source "$SCRIPT_DIR/FUNCTIONS"
# settings_load is not defined in this file; presumably FUNCTIONS provides it
# and it sets the variables read below (RESTART_IF_IDLE, NB_GPUS, IDLE_GPU,
# EMAIL_NOTIF) -- TODO confirm against FUNCTIONS.
settings_load
# RESTART_IF_IDLE=0 switches this whole check off.
if [[ "$RESTART_IF_IDLE" == "0" ]]; then
  exit 0
fi
# reboot_check
# Reboots the machine through sudo. Takes no arguments.
# NOTE(review): a guard on REBOOT_IF_IDLE = "1" once wrapped the reboot but is
# disabled, so every call reboots unconditionally:
#   if [ "$REBOOT_IF_IDLE" = "1" ]; then ... fi
reboot_check() {
  sudo reboot
}
# Locate the miner: every process whose `ps -ef` line contains "miner". The
# [m] bracket keeps the awk command itself from matching its own pattern.
PID=$(ps -ef | awk '/[m]iner/{print $2}')
if [ -z "$PID" ]; then
  # Nothing is mining: restart the mining service and stop here.
  echo "no mining detected, restarting system"
  systemctl restart mining
  exit 0
fi

# Elapsed running time of the miner, leading padding removed.
# PID is left unquoted on purpose: it may hold several PIDs, one per word.
# shellcheck disable=SC2086
UP=$(ps -o etime= -p $PID)
UP=$(printf '%s\n' "$UP" | sed -e 's/^[[:space:]]*//')

# ps prints etime as [[DD-]hh:]mm:ss, so a value of at most 5 characters is a
# bare mm:ss (uptime below one hour). Minutes 00, 01 and 02 all mean "started
# less than 3 minutes ago"; the miner is still warming up, so skip the check.
if [ "${#UP}" -le 5 ] && [[ "$UP" == 0[0-2]:* ]]; then
  echo "miner started less than 3 mn ago, skipping check"
  exit 0
fi
echo "Mining uptime: $UP"
# Counters used by the per-GPU loop that follows: FAIL_COUNT tracks idle GPUs,
# GPU is the index of the card being inspected.
FAIL_COUNT=0
GPU=0
# NB_GPUS is not set in this file (presumably by settings_load -- TODO confirm).
# Zero cards means the driver sees no GPU at all: notify by mail and reboot.
if [[ "$NB_GPUS" == 0 ]]; then
  MSG="No Cards detected, rebooting"
  echo "$MSG"
  # Mail body: the message on its own line, then the last 40 log lines (the log
  # may be missing, hence the silenced stderr).
  state="${MSG}"$'\n'"$(tail -n 40 /var/log/mining.log 2> /dev/null)"
  # Quoted here-string: unquoted, older bash versions word-split it and fold
  # the log onto a single line.
  mail "$EMAIL_NOTIF" -s "$HOSTNAME : $MSG" <<< "$state"
  reboot_check
  exit 1
fi
# _gpu_util_percent CSV_LINE
# Prints the utilization number found in the third comma-separated field of
# CSV_LINE (the utilization.gpu column of the query below), with whitespace
# and the "%" sign removed.
# Returns 1 and prints nothing when that field is missing or not an integer.
_gpu_util_percent() {
  local util
  IFS=',' read -r _ _ util _ <<< "$1"
  util=${util//[[:space:]%]/}
  [[ "$util" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$util"
}

# Inspect every GPU index from $GPU up to NB_GPUS - 1.
while [ "$GPU" -lt "$NB_GPUS" ]; do
  REBOOT=0
  MSG=""
  # timeout bounds the query to 1.5s so a hung nvidia-smi cannot block the
  # check; any non-zero status (timeout included) is treated as a stall.
  if ! GPUINFO=$(timeout 1.5s nvidia-smi -i "$GPU" --query-gpu=index,name,utilization.gpu,temperature.gpu --format=csv,noheader); then
    REBOOT=1
    MSG="system stalled, rebooting"
  elif [[ "$GPUINFO" == *"Unknown"* ]]; then
    REBOOT=1
    MSG="Rebooting due to unknown GPU state $GPUINFO"
  elif [[ "$GPUINFO" == *"ERR"* ]]; then
    REBOOT=1
    MSG="Rebooting due to ERROR in GPU state $GPUINFO"
  fi

  if [ "$REBOOT" = "1" ]; then
    echo "$MSG"
    # Mail body: the message on its own line, then the last 40 log lines.
    state="${MSG}"$'\n'"$(tail -n 40 /var/log/mining.log 2> /dev/null)"
    mail "$EMAIL_NOTIF" -s "$HOSTNAME : $MSG" <<< "$state"
    reboot_check
    exit 1
  fi

  if ! UTIL=$(_gpu_util_percent "$GPUINFO"); then
    # No usable utilization figure: report it and move on to the next card
    # instead of feeding an empty value to the arithmetic comparison (which
    # is a syntax error). FAIL_COUNT is left untouched for this card.
    echo "Issue checking "
    GPU=$((GPU + 1))
    continue
  fi

  # 10# forces base 10 so a zero-padded value is never read as octal.
  if ((10#$UTIL < IDLE_GPU)); then
    FAIL_COUNT=$((FAIL_COUNT + 1))
    echo "GPU$GPU IDLE $GPUINFO"
  else
    # NOTE(review): a busy GPU resets the counter, so FAIL_COUNT ends up as the
    # number of consecutive idle GPUs at the end of the list, not the total
    # number of idle GPUs -- confirm this is intended.
    FAIL_COUNT=0
    # echo "GPU$GPU OK"
  fi
  GPU=$((GPU + 1))
done
# If FAIL_COUNT reached 3 or more, mail a status report.
if ((FAIL_COUNT >= 3)); then
  # Report body, one section after another on separate lines: output of the
  # sibling "ms" script, the last 40 log lines, and the full nvidia-smi output.
  state="$("$SCRIPT_DIR/ms" -nofancy)"$'\n'"$(tail -n 40 /var/log/mining.log 2> /dev/null)"$'\n'"$(nvidia-smi)"
  # Quoted here-string so the report keeps its line breaks on every bash version.
  mail "$EMAIL_NOTIF" -s "$HOSTNAME : ISSUE ($FAIL_COUNT/$NB_GPUS), restarting miner" <<< "$state"
  ### restart ccminer
  echo "restarting mining, fail=$FAIL_COUNT"
  # NOTE(review): both recovery actions below are disabled, so despite the
  # messages above nothing is restarted here -- confirm this is intended.
  #reboot_check
  # systemctl restart mining
fi
# Release the lock explicitly (the EXIT trap would remove it as well).
rm -f "${LOCKFILE}"