read_merged.py
import ROOT
import sys
import os
import glob
import subprocess
from numpy import ceil
from optparse import OptionParser

# make the batch-system helper classes importable
classdir = os.path.join(os.getcwd(), "base", "classes")
if not classdir in sys.path:
    sys.path.append(classdir)
from batchConfig_base import batchConfig_base

def parse_args():
    usage = '%prog [options] path/to/ntuples\n'
    parser = OptionParser(usage = usage)
    parser.add_option("-p", "--parallel",
                      help = "do calculations of the histograms in parallel on the batch system (default: False)",
                      action = "store_true",
                      default = False,
                      dest = "run_on_batch"
                      )
    options, args = parser.parse_args()
    return options, args

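# Example invocation (the path is illustrative, not part of this repository):
#
#   python read_merged.py -p /path/to/ntuples/*.root
#
# The shell expands the glob, so every matching .root file ends up in `args`.
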
def main():
    '''
    read files and write ttbb event data into histograms,
    using the data generated by the different generators respectively
    execute with "python read_merged.py <PATH/TO/NTUPLES/*.root>"
    '''
    options, infiles = parse_args()
    # make the directory structure
    workdir = os.getcwd()
    # directory for the finished histograms
    histdir = os.path.join(workdir, 'Merged_Histograms')
    if not os.path.exists(histdir):
        os.mkdir(histdir)
    # directory for the finished batches
    batchdir = os.path.join(histdir, 'batches')
    if not os.path.exists(batchdir):
        os.mkdir(batchdir)
    # directory for the runscripts
    scriptdir = os.path.join(batchdir, 'runscripts')
    if not os.path.exists(scriptdir):
        os.mkdir(scriptdir)
    # resolve the input files to absolute paths
    infiles = [os.path.abspath(infile) for infile in infiles]
    chain = getChain(infiles = infiles)
    pathtoreffiles = os.path.abspath('/nfs/dust/cms/user/mhorzela/ttbb_data/ntuples/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8_new_pmx/')
    reffiles = glob.glob(pathtoreffiles + '/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8_new_pmx*.root')
    reffiles = [os.path.abspath(reffile) for reffile in reffiles]
    chain_ref = getChain(infiles = reffiles)
    low_edges, up_edges = getListOfEdges(chain = chain, reference_chain = chain_ref)
    infiledir = os.path.dirname(infiles[0])
    scriptdir, scriptfiles = makeShellScripts(low_edges = low_edges, up_edges = up_edges, infiledir = infiledir)
    run_ShellScripts(scriptfiles = scriptfiles)

def getChain(infiles):
    '''
    read all the .root files and add the corresponding tree (here: MVATree) into a TChain
    '''
    chain = ROOT.TChain("MVATree")
    for inpath in infiles:
        inpath = os.path.abspath(inpath)
        # print "checking file", inpath
        f = ROOT.TFile(inpath)
        if f.IsOpen():
            # only use files that are neither zombies nor had to be recovered
            if not f.IsZombie() and not f.TestBit(ROOT.TFile.kRecovered):
                chain.Add(inpath)
                f.Close()
                continue
            f.Close()
        print "file '%s' is broken!" % inpath
    print "Read in " + str(chain.GetNtrees()) + " files."
    return chain

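# Minimal usage sketch (hypothetical path): the returned TChain behaves like a
# single TTree, so the merged entries can be iterated over directly:
#
#   chain = getChain(glob.glob('/path/to/ntuples/*.root'))
#   for event in chain:
#       pass  # access MVATree branches as attributes of `event`
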
def getListOfEdges(chain, reference_chain, batchsize = 50000):
    '''
    Take the chain with the higher statistics for the merging of the two samples.
    Cut the chain into smaller batches, which can be submitted afterwards and run in parallel.
    '''
    nentries = chain.GetEntries()
    if nentries < reference_chain.GetEntries():
        nentries = reference_chain.GetEntries()
    # round up so that the last, possibly shorter, batch is covered as well
    ncuts = int(ceil(nentries / float(batchsize)))
    print "Cutting the TChain in " + str(ncuts) + " batches"
    chain_cuts = []
    for i in range(ncuts):
        chain_cuts.append(i * batchsize)
    chain_cuts.append(nentries)
    low_edges = chain_cuts[:-1]
    up_edges = chain_cuts[1:]
    return low_edges, up_edges

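# Worked example: for nentries = 120000 and the default batchsize = 50000,
# ncuts = ceil(120000 / 50000) = 3, giving chain_cuts = [0, 50000, 100000, 120000]
# and therefore
#   low_edges = [0, 50000, 100000]
#   up_edges  = [50000, 100000, 120000]
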
def makeShellScripts(low_edges, up_edges, infiledir):
    '''
    write one shell script per batch that sets up CMSSW and runs fill_histo.py
    on its entry range
    '''
    if len(low_edges) != len(up_edges):
        print "Error: The numbers of lower and upper batch boundaries don't match! Abort!"
        sys.exit(1)
    edges = zip(low_edges, up_edges)
    datadir = os.path.abspath(infiledir)
    workdir = os.getcwd()
    histdir = os.path.join(workdir, 'Merged_Histograms')
    batchdir = os.path.join(histdir, 'batches')
    scriptdir = os.path.join(batchdir, 'runscripts')
    files = []
    for iedge, edge in enumerate(edges):
        batch = str(iedge + 1)
        filename = os.path.join(scriptdir, os.path.basename(datadir) + '_batch' + batch + '.sh')
        with open(filename, 'w') as scriptfile:
            # set up the CMSSW environment inside the job
            scriptfile.writelines(['# set the CMSSW environment and load all needed modules\n',
                                   '#module use -a /afs/desy.de/group/cms/modulefiles/\n',
                                   'export VO_CMS_SW_DIR=/cvmfs/cms.cern.ch\n',
                                   'source $VO_CMS_SW_DIR/cmsset_default.sh\n',
                                   'export CMSSW_GIT_REFERENCE=/nfs/dust/cms/user/mhorzela/.cmsgit-cache\n',
                                   'alias cd=\'cd -P\'\n',
                                   'myvarcwd=$PWD\n',
                                   'cd /nfs/dust/cms/user/mhorzela/CMSSW_9_4_9/src\n',
                                   'eval `scramv1 runtime -sh`\n',
                                   'cd ~\n',
                                   'echo "setup CMSSW_949 and stuff"\n',
                                   'cd $myvarcwd\n\n',
                                   ])
            # run the histogram filling for this batch's entry range
            scriptfile.writelines(['# fill histo with batch number ' + batch + '\n',
                                   'cd ' + str(batchdir) + '\n'])
            scriptfile.writelines(['python ../../fill_histo.py -n ' + batch + ' -l ' + str(edge[0]) + ' -u ' + str(edge[1]) + ' ' + datadir + "/" + os.path.basename(datadir) + '*.root\n'])
            scriptfile.write('cd ' + str(workdir))
        files.append(filename)
    os.chdir(workdir)
    return scriptdir, files

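# For illustration, a generated runscript for batch 1 of a data directory
# named "ntuples" would roughly read (environment setup lines elided):
#
#   # set the CMSSW environment and load all needed modules
#   ...
#   # fill histo with batch number 1
#   cd <workdir>/Merged_Histograms/batches
#   python ../../fill_histo.py -n 1 -l 0 -u 50000 /path/to/ntuples/ntuples*.root
#   cd <workdir>
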
def run_ShellScripts(scriptfiles):
    workdir = os.getcwd()
    histdir = os.path.join(workdir, 'Merged_Histograms')
    batchdir = os.path.join(histdir, 'batches')
    scriptdir = os.path.join(batchdir, 'runscripts')
    # check if the scripts are in the right directory
    if scriptdir != os.path.dirname(scriptfiles[0]):
        print "Error: The scriptfiles are in the wrong directory! They should be in ./Merged_Histograms/batches/runscripts ! Abort!"
        sys.exit(1)
    # set the job properties
    jobids = []
    print 'Setting job properties ... \n'
    bc = batchConfig_base()
    bc.diskspace = 2000000
    bc.runtime = 3600 * 2
    # submit the batches of the current stage as an array job to the cluster
    print 'Submitting jobs ... \n'
    arrayscriptpath = os.path.join(scriptdir, os.path.basename(scriptfiles[0]).rsplit('_', 1)[0] + '_array.sh')
    jobids += bc.submitArrayToBatch(scripts = scriptfiles, arrayscriptpath = arrayscriptpath)
    print 'Jobs submitted!\n'
    # wait until the jobs are finished before the next stage is started
    print 'Waiting for jobs to finish ... \n'
    bc.do_qstat(jobids)
    print 'Reading step finished.'
    os.chdir(workdir)

def haddBatches(histfiles):
    workdir = os.getcwd()
    histdir = os.path.join(workdir, 'Merged_Histograms')
    histname = os.path.basename(histfiles[0]).split('.')[0].rsplit('_', 1)[0]
    # build the command as an argument list so subprocess handles the file
    # names correctly; '.root' restores the extension stripped from histname
    haddprocess = ['hadd', '-k', os.path.join(histdir, histname + '.root')] + list(histfiles)
    subprocess.check_call(haddprocess)

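# Example (hypothetical file names): for
#   histfiles = ['histo_batch1.root', 'histo_batch2.root']
# histname becomes 'histo' and the resulting call is equivalent to
#   hadd -k ./Merged_Histograms/histo.root histo_batch1.root histo_batch2.root
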
if __name__ == '__main__':
    main()