forked from CMSLQ/submitJobsWithCrabV2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpostProcessWithCrab3.py
359 lines (316 loc) · 15 KB
/
postProcessWithCrab3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
#!/usr/bin/env python2
from __future__ import print_function
import subprocess
import os
import sys
import string
from optparse import OptionParser
from datetime import datetime
import shutil
from multiprocessing import Process, Queue
# The CRAB3 client helpers are only importable after the crab3 setup script
# has been sourced, so fail early with instructions instead of a traceback.
try:
    from CRABClient.UserUtilities import config, getUsernameFromSiteDB
except ImportError:
    # print() and not a bare `print`: with print_function imported, the bare
    # name is just an expression and no blank line is emitted
    print()
    print('ERROR: Could not load CRABClient.UserUtilities. Please source the crab3 setup:')
    print('source /cvmfs/cms.cern.ch/crab3/crab.sh')
    sys.exit(-1)
# CMSSW_BASE is used further down to locate haddnano.py for the job sandbox.
try:
    cmsswBaseDir = os.environ['CMSSW_BASE']
except KeyError:
    print('Could not find CMSSW_BASE env var; have you set up the CMSSW environment?')
    sys.exit(-1)
# now we should be able to import all the crab stuff
from CRABAPI.RawCommand import crabCommand
from httplib import HTTPException
import utils
def crabSubmit(config):
    """Submit one CRAB task and report the outcome on the module-level queue ``q``.

    Meant to be the target of a child process, so that the parent can read
    the status back from ``q``.

    config -- CRAB configuration object passed to crabCommand('submit', ...).

    Puts -1 on ``q`` when crabCommand raised an HTTPException (after dumping
    the configuration to failedCrabCfg.py), and 0 otherwise. Any other
    exception propagates and nothing is put on the queue.
    """
    try:
        crabCommand('submit', config=config)
        #crabCommand('submit', 'dryrun', config = config)
    except HTTPException as hte:  # `as` form is valid on Python 2.6+ and 3
        sys.stdout.write("\033[1;31m") # red
        print('-----> there was a problem. see below.')
        print(hte.headers)
        print('quit here')
        # keep the failing configuration around for inspection
        fileName = 'failedCrabCfg.py'
        with open(fileName, 'w') as f:
            print(config, file=f)
        print('configuration saved to', fileName)
        sys.stdout.write("\033[0;0m") # regular color
        q.put(-1)
        # report exactly one status per submission
        return
    q.put(0)
def validateOptions(options):
    """Abort the script when the parsed command-line options are unusable.

    options -- parsed options object; the attributes localStorageDir,
               inputList, jsonFile and prevJsonFile are read.

    Exits with status -1 (SystemExit) when localStorageDir or inputList is
    missing, after printing the help of the module-level ``parser``, or when
    a previous JSON file is given without a new JSON file.
    Returns None when the options are acceptable.
    """
    if options.localStorageDir is None or options.inputList is None:
        print('You are missing one or more required options: d, i')
        parser.print_help()
        sys.exit(-1)
    if options.prevJsonFile is not None and options.jsonFile is None:
        print('It does not make sense to specify a previously used/analyzed JSON file without specifying a new JSON file, since with this option specified, the difference between the new and old JSON is taken as the lumi mask.')
        sys.exit(-1)
def makeDirAndCheck(dir):
    """Create directory ``dir`` (including parents), refusing to reuse an existing path.

    dir -- path of the directory to create.

    Exits with status -2 (SystemExit) when the path already exists.
    Any other OSError raised by os.makedirs propagates unchanged.
    """
    import errno  # only needed here, to tell "already exists" from other failures
    try:
        # create first and handle the failure, rather than exists()-then-create,
        # so a path appearing between check and creation is still reported
        os.makedirs(dir)
    except OSError as err:
        if err.errno != errno.EEXIST:
            raise
        # in practice, this doesn't happen because of the seconds in the name, but always good to check
        print('ERROR: directory %s already exists. Not going to overwrite it.' % dir)
        sys.exit(-2)
def CheckProxy():
    """Make sure a VOMS proxy exists, requesting a new one when needed.

    Runs ``voms-proxy-info --all``. When its stderr contains
    'Proxy not found' or its stdout contains 'timeleft : 00:00:00',
    ``voms-proxy-init --voms cms --valid 168:00`` is run.
    The exit status of voms-proxy-init is not checked.
    """
    proc = subprocess.Popen(['voms-proxy-info', '--all'],
                            stderr=subprocess.PIPE, stdout=subprocess.PIPE,
                            # text-mode pipes, so the substring checks below do
                            # not depend on the interpreter's default pipe type
                            universal_newlines=True)
    out, err = proc.communicate()
    # NOTE(review): voms-proxy-info may pad the 'timeleft' label with more than
    # one space -- confirm this literal matches the real output
    if 'Proxy not found' in err or 'timeleft : 00:00:00' in out:
        # get a proxy
        print('you have no valid proxy; let\'s get one via voms-proxy-init:')
        # stderr of voms-proxy-init is deliberately not redirected here
        subprocess.call(['voms-proxy-init', '--voms', 'cms', '--valid', '168:00'])
def checkStoragePath(storagePath):
    """Print an example output path and abort when it is longer than 255 characters.

    storagePath -- example of a complete output file path.

    Exits with status -3 (SystemExit) when len(storagePath) > 255;
    returns None otherwise.
    """
    print('will store (example):', storagePath)
    if len(storagePath) <= 255:
        return
    # print() and not a bare `print`, which emits nothing with print_function
    print()
    print('we might have a problem with output path lengths too long (if we want to run crab over these).')
    print('example output will look like:')
    print(storagePath)
    print('which has length:', len(storagePath))
    print('cowardly refusing to submit the jobs; exiting')
    sys.exit(-3)
# Extra files to ship in the crab sandbox next to the job script.
# Further files can be appended the same way, e.g. (both fed in, even though
# only one is needed at a time):
#rootTupleTestDir = os.getenv('CMSSW_BASE')+'/src/Leptoquarks/RootTupleMakerV2/test/'
#additionalInputFiles.append(rootTupleTestDir+'Summer16_23Sep2016V4_MC.db')
#additionalInputFiles.append(rootTupleTestDir+'Summer16_23Sep2016AllV4_DATA.db')
additionalInputFiles = [
    'keepAndDrop.txt',
    'utils.py',
    'doSkim_stockNanoV5.py',
    # hadd nano will not be needed once nano tools are in cmssw
    cmsswBaseDir+'/src/PhysicsTools/NanoAODTools/scripts/haddnano.py',
]
##############################################################
# RUN
##############################################################
#---Option Parser
#--- TODO: WHY PARSER DOES NOT WORK IN CMSSW ENVIRONMENT? ---#
#XXX TODO FIX/UPDATE THIS MESSAGE
# Help text shown by optparse; the adjacent literals are concatenated as-is.
usage = (
    "Usage: %prog [options] "
    "\nSee https://twiki.cern.ch/twiki/bin/view/CMS/ExoticaLeptoquarkShiftMakeRootTuplesV22012 for more details "
    "\nExample1 (NORMAL MODE): %prog -d `pwd`/RootNtuple -i inputList.txt"
    "\nExample2 (NORMAL MODE + RUN SELECTION): %prog -d `pwd`/RootNtuple -i inputList.txt -r 132440-200000 "
    "\nExample3 (JSON MODE): %prog -d `pwd`/RootNtuple -i inputList.txt -j [JSON.txt or URL, https://cms-service-dqm.web.cern.ch/cms-service-dqm/CAF/certification/Collisions12/8TeV/Prompt/Cert_190456-208686_8TeV_PromptReco_Collisions12_JSON.txt]"
    "\nExample4 (PREV JSON MODE): %prog -d `pwd`/RootNtuple -i inputList.txt -j [JSON.txt or URL, https://cms-service-dqm.web.cern.ch/cms-service-dqm/CAF/certification/Collisions12/8TeV/Prompt/Cert_190456-208686_8TeV_PromptReco_Collisions12_JSON.txt] -p [lumiSummary.json from crab report from previous processing of same dataset]"
)
parser = OptionParser(usage=usage)
# One entry per command-line option, registered in help-output order:
# (short flag, long flag, keyword arguments for add_option).
_optionTable = (
    ("-d", "--localStorageDir",
     dict(dest="localStorageDir",
          help="the directory localStorageDir is where the local job info is kept",
          metavar="INDIR")),
    ("-v", "--tagName",
     dict(dest="tagName",
          help="tagName of postproc package",
          metavar="TAGNAME",
          default="")),
    ("-i", "--inputList",
     dict(dest="inputList",
          help="list of all datasets to be used (full path required)",
          metavar="LIST")),
    ("-e", "--eosDir",
     dict(dest="eosDir",
          help="EOS directory (start with /store...) to store files (used for Data.outLFNDirBase); otherwise EXO LJ group dir used with userName",
          metavar="EOSDIR")),
    ("-j", "--json",
     dict(dest="jsonFile",
          help="JSON file with selected lumi sections",
          metavar="JSONFILE")),
    ("-r", "--run range",
     dict(dest="runRange",
          help="selected run range",
          metavar="RUNRANGE")),
    ("-p", "--previousJSON json",
     dict(dest="prevJsonFile",
          help="previous lumiSummary.json from crab",
          metavar="PREVJSON")),
    ("-s", "--site siteName",
     dict(dest="storageSite",
          help="storage site",
          metavar="STORAGESITE",
          default="T2_CH_CERN")),
)
for _shortFlag, _longFlag, _keywords in _optionTable:
    parser.add_option(_shortFlag, _longFlag, **_keywords)
(options, args) = parser.parse_args()
# validate options
validateOptions(options)
# time stamp for the production directory name: year, abbreviated month name,
# day, then _HHMMSS (format "%Y%b%d_%H%M%S")
date = datetime.now()
#dateString = date.strftime("%Y%m%d_%H%M%S")
# I like this better, but does it break anything?
dateString = date.strftime("%Y%b%d_%H%M%S")
if options.tagName:
    topDirName = 'lqNanoPostProc_'+options.tagName+'_'+dateString
else:
    topDirName = 'lqNanoPostProc_'+dateString
# local bookkeeping area: <localStorageDir>/<topDirName>/{cfgfiles,output,workdir}
productionDir = options.localStorageDir+'/'+topDirName
cfgFilesDir = productionDir+'/cfgfiles'
outputDir = productionDir+'/output'
workDir = productionDir+'/workdir'
localDirs = [productionDir, cfgFilesDir, outputDir, workDir]
print('Making local directories:')
# loop variable is not called `dir`, which would shadow the builtin at module scope
for localDir in localDirs:
    print('\t', localDir)
    makeDirAndCheck(localDir)
# print() and not a bare `print`, which emits nothing with print_function
print()
# keep a copy of the input list next to the job bookkeeping
localInputListFile = productionDir+'/inputList.txt'
shutil.copy2(options.inputList, localInputListFile)
# check if we have a proxy
CheckProxy()
# setup general crab settings
# from https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRABClientLibraryAPI
#TODO: this will work for MC. Need to update to run over data.
# notes on how the output will be stored: see https://twiki.cern.ch/twiki/bin/view/CMSPublic/Crab3DataHandling
#  <lfn-prefix>/<primary-dataset>/<publication-name>/<time-stamp>/<counter>[/log]/<file-name>
#   LFNDirBase /                 / datasetTagName   / stuff automatically done   / from outputFile defined below
config = config()
config.General.requestName = topDirName # overridden per dataset
config.General.transferOutputs = True
config.General.transferLogs = False
# We want to put all the CRAB project directories from the tasks we submit here into one common directory.
# That's why we need to set this parameter (here or above in the configuration file, it does not matter, we will not overwrite it).
config.General.workArea = productionDir
#
config.JobType.pluginName = 'Analysis'
#config.JobType.maxMemoryMB = 3000
# this will make sure jobs only run on sites which host the data.
# See: https://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ#What_is_glideinWms_Overflow_and
# postprocessing jobs take forever (and can exceed max wall clock time) otherwise
config.Debug.extraJDL = ['+CMS_ALLOW_OVERFLOW=False']
# feed in any additional input files
if additionalInputFiles:
    config.JobType.inputFiles = list(additionalInputFiles)
config.JobType.psetName = 'PSet.py'
config.JobType.scriptExe = 'crab_script.sh'
config.JobType.sendPythonFolder = True
config.Data.inputDataset = '' # overridden per dataset
config.Data.inputDBS = 'global'
config.Data.splitting = 'Automatic' # below this is set to LumiBased for data, FileBased for MC
config.Data.totalUnits = -1 # overridden per dataset, but doesn't matter for Automatic splitting
# no publishing
config.Data.publication = False
config.Data.outputDatasetTag = 'LQ' #overridden for data
# look the user name up once and reuse it for every path built below
userName = getUsernameFromSiteDB()
#This is for EXO group space
defaultLFNDirBase = '/store/group/phys_exotica/leptonsPlusJets/LQ/%s/nanoPostProc' % userName
if options.tagName:
    defaultLFNDirBase += options.tagName
config.Data.outLFNDirBase = defaultLFNDirBase + '/'
#This is for Higgs group space
#config.Data.outLFNDirBase = '/store/group/phys_higgs/HiggsExo/HH_bbZZ_bbllqq/%s/' % (getUsernameFromSiteDB()) + options.tagName + '/'
#This is for personal user space (beware quotas)
#config.Data.outLFNDirBase = '/store/user/%s/' % (getUsernameFromSiteDB()) + topDirName + '/'
if options.eosDir is not None:
    # split off /eos/cms if it is there
    if options.eosDir.startswith('/eos/cms'):
        options.eosDir = options.eosDir.split('/eos/cms')[-1]
    # require /store unless it's CERNBOX
    if not options.eosDir.startswith('/store'):
        print('eosDir must start with /eos/cms/store or /store and you specified:', options.eosDir)
        print('quit')
        sys.exit(-1)
    outputLFN = options.eosDir
    if not outputLFN.endswith('/'):
        outputLFN += '/'
    if options.tagName:
        outputLFN += options.tagName+'/'
    if userName not in outputLFN:
        # str.rstrip returns a new string; the result must be kept, otherwise
        # the '/%s/' below yields a doubled '//' in the LFN
        outputLFN = outputLFN.rstrip('/')
        #config.Data.outLFNDirBase = outputLFN+'/%s/' % (getUsernameFromSiteDB()) + topDirName + '/'
        # make the LFN shorter, and in any case, the timestamp is put in by crab
        # NOTE(review): tagName was already appended to outputLFN above, so it
        # shows up twice in this path -- confirm that this is intended
        if options.tagName:
            config.Data.outLFNDirBase = outputLFN+'/%s/' % userName + options.tagName + '/'
        else:
            # a single trailing slash
            config.Data.outLFNDirBase = outputLFN+'/%s/' % userName
    else:
        config.Data.outLFNDirBase = outputLFN
print('Using outLFNDirBase:', config.Data.outLFNDirBase)
config.Site.storageSite = options.storageSite
# look at the input list
# use DAS to find the dataset names.
# Example:
#   das_client.py --query="dataset=/LQToUE_M-*_BetaOne_TuneCUETP8M1_13TeV-pythia8/*/MINIAODSIM"
#
# Each non-comment line of the input list holds at least three
# whitespace-separated fields: dataset, number of units, units per job.
# One crab task is configured and submitted per line.
with open(localInputListFile, 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue
        if '#' in fields[0]: # skip comments
            #print 'found comment:',line
            continue
        if len(fields) < 3:
            print('inputList line is not properly formatted:', line)
            sys.exit(-3)
        dataset = fields[0]
        nUnits = int(fields[1]) #also used for total lumis for data
        nUnitsPerJob = int(fields[2]) # used for files/dataset for MC and LS per data
        datasetTag, datasetName, primaryDatasetName, secondaryDatasetName, isData = utils.GetOutputDatasetTagAndModifiedDatasetName(dataset)
        outputFile = utils.GetOutputFilename(dataset, not isData)
        config.Data.outputDatasetTag = datasetTag
        config.Data.inputDataset = dataset
        # print() and not a bare `print`, which emits nothing with print_function
        print()
        print('Consider dataset {0}'.format(dataset))
        if not isData:
            config.Data.splitting = 'FileBased'
        else:
            config.Data.splitting = 'LumiBased'
        # get era
        # see, for example: https://twiki.cern.ch/twiki/bin/viewauth/CMS/PdmVAnalysisSummaryTable
        # secondaryDatasetName looks like 'Run2015D-PromptReco-v3'
        if 'Summer16' in secondaryDatasetName or 'Run2016' in secondaryDatasetName:
            year = 2016
        elif 'Fall17' in secondaryDatasetName or 'Run2017' in secondaryDatasetName:
            year = 2017
        elif 'Autumn18' in secondaryDatasetName or 'Run2018' in secondaryDatasetName:
            year = 2018
        else:
            print('ERROR: could not determine year from secondaryDatasetName "{0}" from datasetName "{1}"'.format(secondaryDatasetName, datasetName))
            sys.exit(-4)
        # get dataRun: the single character 7 positions after the start of 'Run'
        # (e.g. the 'D' of 'Run2015D'); 'X' for MC
        dataRun = 'X'
        if isData:
            runIndex = secondaryDatasetName.find('Run')
            dataRun = secondaryDatasetName[runIndex+7:runIndex+8]
        config.JobType.scriptArgs = ['dataset='+config.Data.inputDataset, 'ismc='+str(not isData), 'era='+str(year), 'dataRun='+dataRun]
        config.JobType.outputFiles = [outputFile]
        config.Data.unitsPerJob = nUnitsPerJob
        thisWorkDir = workDir+'/'+datasetName
        # example of a full output path, only used for the length check
        storagePath = config.Data.outLFNDirBase+primaryDatasetName+'/'+config.Data.outputDatasetTag+'/'+'YYMMDD_hhmmss/0000/'+outputFile.replace('.root', '_9999.root')
        #print 'make dir:',thisWorkDir
        makeDirAndCheck(thisWorkDir)
        checkStoragePath(storagePath)
        config.General.requestName = datasetName
        config.Data.totalUnits = nUnits
        # computing JSON mask
        if options.jsonFile is not None:
            if options.prevJsonFile is not None:
                print('Using the subtraction between previous json and new json; WARNING: if lumis changed from good in previous to bad in new json, this will not remove them')
                from WMCore.DataStructs.LumiList import LumiList
                # treat both http and https locations as URLs (the usage text
                # advertises https JSON URLs); everything else is a local file
                urlPrefixes = ('http://', 'https://')
                if options.prevJsonFile.startswith(urlPrefixes):
                    prevJsonLumiList = LumiList(url=options.prevJsonFile)
                else:
                    prevJsonLumiList = LumiList(filename=options.prevJsonFile)
                if options.jsonFile.startswith(urlPrefixes):
                    currentJsonLumiList = LumiList(url=options.jsonFile)
                else:
                    currentJsonLumiList = LumiList(filename=options.jsonFile)
                newLumiList = currentJsonLumiList - prevJsonLumiList
                newLumiList.writeJSON('newJSON_minus_oldJSON.json')
                config.Data.lumiMask = 'newJSON_minus_oldJSON.json'
            else:
                config.Data.lumiMask = options.jsonFile
        if options.runRange is not None:
            # the value comes from the -r option; a bare `runRange` is undefined
            config.Data.runRange = options.runRange
        # and submit
        print(config.JobType.scriptArgs)
        print('submit to crab. output from crab submit follows:')
        sys.stdout.write("\033[1;34m") # blue
        #crabSubmit(config)
        # workaround for cmssw multiple-loading problem
        # submit in subprocess
        q = Queue()
        p = Process(target=crabSubmit, args=(config,))
        p.start()
        p.join()
        # A child that died from an unexpected exception never puts a status on
        # q; check its exit code first so that q.get() cannot block forever.
        if p.exitcode != 0 or q.get() == -1:
            sys.stdout.write("\033[0;0m") # do not leave the terminal blue
            sys.exit(-1)
        sys.stdout.write("\033[0;0m")
        print('Done with this dataset.')
print('Done!')
sys.exit(0)