Skip to content

Commit

Permalink
Merge pull request #20 from cms-DQM/dev128
Browse files Browse the repository at this point in the history
  • Loading branch information
nothingface0 authored Feb 23, 2024
2 parents 8592761 + 706003d commit 9e19b2d
Show file tree
Hide file tree
Showing 31 changed files with 258 additions and 150 deletions.
2 changes: 1 addition & 1 deletion bin/visDQMCreateInfoDaemon
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
import os, re, hashlib, time, sys
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme

# Command line arguments
BASE_DIR = sys.argv[1] # "/dqmdata/offline/repository/data/OnlineData"
Expand Down
2 changes: 1 addition & 1 deletion bin/visDQMDeleteDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import re, os, time, sys
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from urllib import request
from fcntl import lockf, LOCK_EX, LOCK_UN

Expand Down
2 changes: 1 addition & 1 deletion bin/visDQMExportDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os, os.path, sys
from time import strftime, localtime, sleep, time
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from tempfile import mkstemp
from glob import glob
from threading import Thread, Lock, active_count
Expand Down
12 changes: 6 additions & 6 deletions bin/visDQMImportDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os, os.path, time, sys, argparse, subprocess, functools
from subprocess import Popen, PIPE
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from datetime import datetime, timedelta
from glob import glob
from fcntl import lockf, LOCK_EX, LOCK_UN
Expand Down Expand Up @@ -240,11 +240,11 @@ def sendIndexIntegrityErrorMessage():
"Sending email to %s to notify about failed index integrity check." % args.email
)
process = Popen("/usr/sbin/sendmail -t", shell=True, stdin=PIPE)
process.stdin.write("To: %s\n" % args.email)
process.stdin.write("Subject: Problem when checking the index integrity\n")
process.stdin.write("\n") # blank line separating headers from body
process.stdin.write("Problem when checking the index integrity\n")
process.stdin.write("Please check logs! This is serious!\n")
process.stdin.write(f"To: {args.email}\n".encode())
process.stdin.write("Subject: Problem when checking the index integrity\n".encode())
process.stdin.write("\n".encode()) # blank line separating headers from body
process.stdin.write("Problem when checking the index integrity\n".encode())
process.stdin.write("Please check logs! This is serious!\n".encode())
process.stdin.close()
returncode = process.wait()
if returncode != 0:
Expand Down
3 changes: 2 additions & 1 deletion bin/visDQMIndexCastorStager
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os, time, sys, argparse
from subprocess import Popen, PIPE
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from datetime import datetime, timedelta
from glob import glob
from socket import gethostname
Expand Down Expand Up @@ -293,6 +293,7 @@ def executeCommand(command):
# Alert email addresses given as parameter about a failure of the process.
def alertBySendingEmail(errorText):
process = Popen("/usr/sbin/sendmail -t", shell=True, stdin=PIPE)
# TODO: encode() the strings for python3, when Offline GUI upgrades
process.stdin.write("To: %s\n" % args.EMAIL)
process.stdin.write("Subject: Problem sending DQM GUI index backup to EOS\n")
process.stdin.write("\n") # blank line separating headers from body
Expand Down
2 changes: 1 addition & 1 deletion bin/visDQMIndexMergeDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os, time, sys
from glob import glob
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from traceback import print_exc
from subprocess import Popen, PIPE
from fcntl import lockf, LOCK_EX, LOCK_UN
Expand Down
2 changes: 1 addition & 1 deletion bin/visDQMMergeDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os, os.path, time, sys
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from tempfile import mkstemp
from glob import glob

Expand Down
2 changes: 1 addition & 1 deletion bin/visDQMOnlineSyncDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ from time import time, strptime, sleep
from optparse import OptionParser
from urllib import parse
from calendar import timegm
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from traceback import print_exc
from tempfile import mkstemp
from stat import *
Expand Down
19 changes: 10 additions & 9 deletions bin/visDQMReceiveDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os, os.path, time, sys, re, hashlib, functools
import logging
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from tempfile import mkstemp
from stat import *
from Monitoring.DQM import visDQMUtils
Expand Down Expand Up @@ -303,14 +303,15 @@ def finaliseOneFile(info):
info["runnr"],
info["dataset"].replace("/", "__"),
)
info[
"zippat"
] = "OfflineData/%s/%s/DQM_Offline_%s_%s_R%07dxx_S%%04d.zip" % (
info["era"],
info["primds"],
info["era"],
info["primds"],
info["runnr"] / 100,
info["zippat"] = (
"OfflineData/%s/%s/DQM_Offline_%s_%s_R%07dxx_S%%04d.zip"
% (
info["era"],
info["primds"],
info["era"],
info["primds"],
info["runnr"] / 100,
)
)
assignUniqueVersion(info)
ok = True
Expand Down
2 changes: 1 addition & 1 deletion bin/visDQMRootFileQuotaControlDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os, time, re, sys, errno
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from glob import glob


Expand Down
2 changes: 1 addition & 1 deletion bin/visDQMRootFileVersionControlDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os, time, re, sys
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from glob import glob


Expand Down
129 changes: 24 additions & 105 deletions bin/visDQMSoundAlarmDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ import time
import sys
import json
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from Monitoring.Core.Utils.SoundAlarm import send_sound_msg, send_email_msg, run_test

from urllib import parse, request, error
from socket import socket, AF_INET, SOCK_STREAM, gethostname
from subprocess import Popen, PIPE


# IMPORTANT: If you want to play a test sound, just start the program with
# all the usual parameters, but add "test" as last parameter.
Expand Down Expand Up @@ -73,76 +74,12 @@ else:
ERROR_FOLDER = parse.quote("00 Shift/Errors", ":/-_")
DATA_LOCATION = parse.quote("data/json/live/1/Global/Online/ALL", ":/-_")

# Body of the XML message that is sent to CMS-WOW
MSGBODY = (
'<CommandSequence><alarm sender="DQM" sound="DQM_1.wav" talk="%s">'
"%s Check plots in the DQM Error folder.</alarm></CommandSequence>"
)

WAITTIME = 30


# --------------------------------------------------------------------


# Short hand to send XML message to CMS-WOW
def send_sound_msg(msg, spoken_msg):
    """Send an alarm request to the CMS-WOW sound server and mail a report.

    MSGBODY is filled in with the two messages, sent over TCP to
    (SOUNDSERVER, PORT), and up to 1024 bytes of reply are read back.

    Args:
        msg: Text embedded in the alarm element and in the report email.
        spoken_msg: Text placed in the ``talk`` attribute of the alarm.

    Returns:
        True if the server answered with the expected "All ok" line,
        False otherwise. An email is sent through send_email_msg() in
        both cases.
    """
    # Sockets carry bytes under Python 3: the request has to be encoded, and
    # the reply has to be compared against a bytes literal. Sending a str
    # raises TypeError, and bytes never compare equal to a str.
    payload = (MSGBODY % (spoken_msg, msg)).encode()

    # The context manager closes the socket even if connect/send/recv raises.
    with socket(AF_INET, SOCK_STREAM) as s:
        s.connect((SOUNDSERVER, PORT))
        # sendall() keeps writing until the whole request has gone out.
        s.sendall(payload)
        data = s.recv(1024)

    if data == b"All ok\n":
        logme("INFO: Broadcasted message: %s ", msg)
        send_email_msg(
            "We (DQM) just played a sound in the control room.\n"
            'The message we played was: "%s"\n\n--\n%s' % (spoken_msg, msg)
        )
        return True

    error_msg = "ERROR: Unexpected answer from CMS-WOW: %s" % repr(data)
    logme(error_msg)
    send_email_msg(error_msg)
    return False


# Short hand to send email message
def send_email_msg(msg):
    """Mail *msg* to EMAIL_ADDRESSES by piping it to ``sendmail -t``.

    When EMAIL_ADDRESSES is empty nothing is sent and only a log line is
    written. A non-zero sendmail exit status is logged, not raised.

    Args:
        msg: Body text of the email.
    """
    if not EMAIL_ADDRESSES:
        logme("Not sending email, since no emailaddresses were set.")
        return

    logme("Sending email to %s with following message:\n%s" % (EMAIL_ADDRESSES, msg))
    sendmail = Popen("/usr/sbin/sendmail -t", shell=True, stdin=PIPE)
    pieces = (
        f"To: {EMAIL_ADDRESSES}\n",
        f"Subject: Message from the visDQMSoundAlarmDaemon on {gethostname()} at P5\n",
        "\n",  # blank line separating headers from body
        f"{msg}\n\n",
        "The logs should be here: /data/srv/logs/dqmgui/online/\n",
    )
    for piece in pieces:
        sendmail.stdin.write(piece.encode())
    sendmail.stdin.close()

    status = sendmail.wait()
    if status != 0:
        logme("ERROR: Sendmail exit with status %s", status)


# Short hand to extract GUI information, it fails if the retrieved
# data is not python format.
def getGuiData(opener, url):
    """Fetch *url* through *opener* and evaluate the body as a Python expression.

    Args:
        opener: Object whose ``open(url)`` returns a response offering
            ``read()`` and ``close()``.
        url: Address to fetch.

    Returns:
        Whatever object the response body evaluates to.

    Raises:
        Exception: Any error raised while opening, reading or evaluating
            the response propagates unchanged to the caller.
    """
    page1 = opener.open(url)
    try:
        data = page1.read()
    finally:
        # Release the connection on the failure paths too, not only on success.
        page1.close()
    # SECURITY NOTE(review): eval() executes whatever the server sent back.
    # Kept so existing callers see no change; consider ast.literal_eval or
    # json.loads if the payload is known to be a plain literal.
    return eval(data)


# Construct URL pointing to the Error folder
sr = re.search(r"dqm/(online)", BASEURL)
if not sr:
Expand All @@ -152,28 +89,6 @@ if not sr:
BASEURL = "%s/%s/%s/" % (BASEURL, DATA_LOCATION, ERROR_FOLDER)


# This method is just to test the sound infrastructure.
# It will try to play a test message, send a test email and then exit.
def run_test():
    """Exercise the sound and email channels once with a fixed test message.

    Each channel is tried independently: an exception raised by one is
    logged together with its traceback and does not stop the other.
    """
    logme("Running in test mode.")
    msg = "This is a test"
    spoken_msg = "This is a test"

    # (announcement, action) pairs, executed in order: sound first, then email.
    checks = (
        ("Trying to send a sound message.", lambda: send_sound_msg(msg, spoken_msg)),
        ("Trying to send an email message.", lambda: send_email_msg(msg)),
    )
    for announcement, action in checks:
        logme(announcement)
        try:
            action()
        except Exception as exc:
            logme("ERROR: %s", exc)
            print_exc()


# This method will start the actual daemon, checking the GUI every WAITTIME
# seconds
def run_daemon():
Expand Down Expand Up @@ -234,12 +149,11 @@ def run_daemon():
if histo["properties"]["report"]["alarm"] == 1:
if name in disabledAlarms:
logme(
"Histo caused an alarm but it was disabled in the alarm manager: %s"
% name
f"Histo caused an alarm but it was disabled in the alarm manager: {name}"
)
else:
knownNewAlarms.add(name)
logme("Info from the DQM GUI: %s" % str(histo))
logme(f"Info from the DQM GUI: {str(histo)}")

# should alarm be triggered
alarmsNew = knownNewAlarms.difference(knownAlarms)
Expand Down Expand Up @@ -286,14 +200,19 @@ def run_daemon():
spoken_msgs = msgs[:2] + [" Check plots in the DQM Error folder."]
spoken_msg = "".join(spoken_msgs).replace("DQM", "D Q M")

send_sound_msg(msg, spoken_msg)
send_sound_msg(
msg=msg,
spoken_msg=spoken_msg,
soundserver=SOUNDSERVER,
port=PORT,
email_addresses=EMAIL_ADDRESSES,
)

knownAlarms = knownNewAlarms

# Do some heartbeat logging:
logme(
"Daemon woke up. A run was going on. %d active alarm(s)."
% len(knownAlarms)
f"Daemon woke up. A run was going on. {len(knownAlarms)} active alarm(s)."
)

except KeyboardInterrupt as e:
Expand All @@ -306,36 +225,36 @@ def run_daemon():
# there is no point in alerting people.
if activeURLErrors <= 3:
logme(
"Daemon woke up. Couldn't connect to GUI (%d). Going back to "
"sleep." % activeURLErrors
f"Daemon woke up. Couldn't connect to GUI ({activeURLErrors}). Going back to "
"sleep."
)
# However, when we see more than 3 times in a row that the network
# connection is failing, then we start sending emails:
else:
logme(
"Daemon woke up. Couldn't connect to GUI for the %dth time! "
"Sending mail and going back to sleep." % activeURLErrors
f"Daemon woke up. Couldn't connect to GUI for the {activeURLErrors}th time! "
"Sending mail and going back to sleep."
)
logme("BASEURL: %s", BASEURL)
logme("ERROR: %s", e)
logme(f"BASEURL: {BASEURL}")
logme(f"ERROR: {e}")
print_exc()
send_email_msg("ERROR: %s\n\n" % e)
send_email_msg(msg=f"ERROR: {e}\n\n", email_addresses=EMAIL_ADDRESSES)
# Reset counter to -20. Otherwise you would start receiving a message
# every 30 seconds now.
# At least this way, the next email will only be in >10 minutes.
activeURLErrors = -20

except Exception as e:
logme("ERROR: %s", e)
logme(f"ERROR: {e}")
print_exc()
send_email_msg("ERROR: %s\n\n" % e)
send_email_msg(msg=f"ERROR: {e}\n\n", email_addresses=EMAIL_ADDRESSES)

time.sleep(WAITTIME)


if IS_TEST:
# If we're in test mode
run_test()
run_test(soundserver=SOUNDSERVER, port=PORT, email_addresses=EMAIL_ADDRESSES)
else:
# After starting the script, we wait a while before we try, because maybe the
# GUI isn't started yet.
Expand Down
2 changes: 1 addition & 1 deletion bin/visDQMZipCastorStager
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from traceback import print_exc
from tempfile import mkstemp
from math import sqrt
from glob import glob
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme


DROPBOX = sys.argv[1] # Directory where we receive input ("drop box").
Expand Down
3 changes: 2 additions & 1 deletion bin/visDQMZipCastorVerifier
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3

import os, time, sys, pickle
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from glob import glob
from math import sqrt
from tempfile import mkstemp
Expand Down Expand Up @@ -57,6 +57,7 @@ def runme(cmd, *args, **keys):


def sendmail(body="Hello from visDQMZipCastorVerifier"):
# TODO: encode() when Offline DQMGUI updates to python3
scall = Popen("%s -t" % SENDMAIL, shell=True, stdin=PIPE)
scall.stdin.write("To: %s\n" % EMAIL)
scall.stdin.write("Subject: Problem verifying file transfer to EOS\n")
Expand Down
2 changes: 1 addition & 1 deletion bin/visDQMZipDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os, os.path, time, sys
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from tempfile import mkstemp
from glob import glob
from stat import *
Expand Down
2 changes: 1 addition & 1 deletion bin/visDQMZipFreezeDaemon
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os, time, sys, re
from traceback import print_exc
from Monitoring.Core.Utils import logme
from Monitoring.Core.Utils.Common import logme
from tempfile import mkstemp
from glob import glob

Expand Down
Loading

0 comments on commit 9e19b2d

Please sign in to comment.