PublicAnalysisDatabase: exclusion_CLs.py

File exclusion_CLs.py, 16.5 KB (added by Benjamin Fuks, 9 years ago)

Statistics module for testing exclusion

Line 
1#! /usr/bin/env python
2
3# derive exclusions for the tested scenario from the signal regions of an
4# analysis implemented using MadAnalysis 5 (MA5), using a simplified procedure
5# and the CLs prescription
6
7# version 1.1 (July 9, 2014)
8# made by Beranger Dumont
9# based on toy MC code by Benjamin Fuks, Chris Wymant and Sam Bein
10# v1.1: BF added an efficiency map calculator
11
12# takes as input:
13# -- an XML analysis_name.info file that should be present in the same directory
14# as the analysis code, i.e. Build/SampleAnalyzer/User/Analyzer/
15# -- the SAF files for the cutflows of the tested scenario of interest in all
16# signal regions, where the information on acceptance*efficiency can be found
17# -- the cross section for the tested scenario (if not given as argument in
18# command-line, it is looked for in the SAF file analysis_name.saf
19
20# returns the output on the screen and print basic results
21# in the file analysis_name_[run number].out in
22# Output/benchmark_point/
23
24
25# path to the directory where the analysis code and output is present
26# (this is the directory created when running MA5 in expert mode,
27# containing the "Build", "Input" and "Output" directories)
28analysis_path = "./"
29
30# The number of Poisson distributions we consider (each one effectively being
31# a toy experiment with its own certain prediction for the background):
32numtoyexperiments = 100000
33
34#
35# the user is not supposed to modify the code below this line
36#
37
38import os, sys
39try:
40 from lxml import ET
41except:
42 import xml.etree.ElementTree as ET
43import scipy.stats
44
45def usage():
46 print 'Usage: ./exclusion_CLs.py analysis_name benchmark_point ' + \
47 '[run_number] [cross section in pb]'
48 print 'Example: ./exclusion_CLs.py cms_sus_13_011 T2tt_650_50.txt ' + \
49 '0 0.014'
50 print 'Default value of run number is 0'
51 print 'If the cross section is not given, it is taken from the MA5 output'
52
53def listdirectory(path):
54 lfile=[]
55 for root, dirs, files in os.walk(path):
56 for i in files:
57 lfile.append(os.path.join(root, i))
58 return lfile
59
60bestSR = ""
61
62def clean_name(str):
63 # based on CleanName() from
64 # ./tools/SampleAnalyzer/Process/Core/SampleAnalyzer.cpp
65 # in MA5 v1.1.11beta4
66 str = str.replace("/", "_slash_")
67 str = str.replace("->", "_to_")
68 str = str.replace(">=", "_greater_than_or_equal_to_")
69 str = str.replace(">", "_greater_than_")
70 str = str.replace("<=", "_smaller_than_or_equal_to_")
71 str = str.replace("<", "_smaller_than_")
72 str = str.replace(" ", "_")
73 str = str.replace(",", "_")
74 str = str.replace("+", "_")
75 str = str.replace("-", "_")
76 str = str.replace("(", "_lp_")
77 str = str.replace(")", "_rp_")
78 return str
79
80def CLs(NumObserved, ExpectedBG, BGError, SigHypothesis, NumToyExperiments):
81 # generate a set of expected-number-of-background-events, one for each toy
82 # experiment, distributed according to a Gaussian with the specified mean
83 # and uncertainty
84 ExpectedBGs = scipy.stats.norm.rvs(loc=ExpectedBG, \
85 scale=BGError, size=NumToyExperiments)
86
87 # Ignore values in the tail of the Gaussian extending to negative numbers
88 ExpectedBGs = [value for value in ExpectedBGs if value > 0]
89
90 # For each toy experiment, get the actual number of background events by
91 # taking one value from a Poisson distribution created using the expected
92 # number of events.
93 ToyBGs = scipy.stats.poisson.rvs(ExpectedBGs)
94 ToyBGs = map(float, ToyBGs)
95
96 # The probability for the background alone to fluctutate as LOW as
97 # observed = the fraction of the toy experiments with backgrounds as low as
98 # observed = p_b.
99 # NB (1 - this p_b) corresponds to what is usually called p_b for CLs.
100 p_b = scipy.stats.percentileofscore(ToyBGs, NumObserved, kind='weak')*.01
101
102 # Toy MC for background+signal
103 ExpectedBGandS = [expectedbg + SigHypothesis for expectedbg in ExpectedBGs]
104 ToyBplusS = scipy.stats.poisson.rvs(ExpectedBGandS)
105 ToyBplusS = map(float, ToyBplusS)
106
107 # Calculate the fraction of these that are >= the number observed,
108 # giving p_(S+B). Divide by (1 - p_b) a la the CLs prescription.
109 p_SplusB = scipy.stats.percentileofscore(ToyBplusS, NumObserved, kind='weak')*.01
110
111 return 1.-(p_SplusB / p_b) # 1 - CLs
112
113def exclusion_check(crosssection):
114 global bestSR
115
116 if len(signalregions) > 1:
117 # if more than one signal region, decide which signal regions
118 # yields the best expected limit
119 limit = -1.
120 for SR in signalregions:
121 nsignal = crosssection * lumi * 1000. * signalregions[SR]["acceff"]
122 nb = signalregions[SR]["nb"]
123 deltanb = signalregions[SR]["deltanb"]
124
125 limitSR = CLs(nb, nb, deltanb, nsignal, numtoyexperiments)
126 if limitSR > limit:
127 bestSR = SR
128 limit = limitSR
129 else:
130 bestSR = signalregions.keys()[0]
131
132 nsignal = crosssection * lumi * 1000. * signalregions[bestSR]["acceff"]
133 nobs = int(signalregions[bestSR]["nobs"])
134 nb = signalregions[bestSR]["nb"]
135 deltanb = signalregions[bestSR]["deltanb"]
136 bestCLs=CLs(nobs, nb, deltanb, nsignal, numtoyexperiments)
137
138 print 'The best expected signal region is "' + bestSR + '".'
139 print 'It has: nobs = ' + str(nobs) + ', nb = ' + str(nb) + ' \pm ' + \
140 str(deltanb) + ', nsignal = ' + str(round(nsignal,2)) + '.'
141
142 return bestCLs
143
144def exclusion_check95(crosssection):
145 return exclusion_check(crosssection)-0.95
146
147if analysis_path[-1] == "/":
148 analysis_path = analysis_path[:-1]
149
150# at least one argument, check if asking for help
151if len(sys.argv) < 3 or (sys.argv[1] == "-h" or sys.argv[1] == "--help"):
152 usage()
153 sys.exit()
154
155analysis_name = sys.argv[1]
156bench_name = sys.argv[2]
157if len(sys.argv) > 3:
158 run_number = sys.argv[3]
159else:
160 run_number = "0"
161if len(sys.argv) > 4:
162 try:
163 xsection = float(sys.argv[4])
164 except ValueError:
165 print 'Invalid cross section given as command-line argument: "' + \
166 sys.argv[4]+ '".'
167 sys.exit()
168else:
169 xsection = 0
170
171
172
173######################################
174# first, read the XML .info file
175# and fill the variable lumi
176# and the dictionary signalregions
177######################################
178
179lumi = 0 # integrated luminosity, in fb^-1
180signalregions = {}
181
182analysisinfo_path = analysis_path + "/Build/SampleAnalyzer/User/Analyzer/" + \
183 analysis_name + ".info"
184
185try:
186 info_input = open(analysisinfo_path)
187except IOError as e:
188 print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
189 print 'Cannot open the XML info file "' + analysisinfo_path + '".'
190 sys.exit()
191
192info_tree = ET.parse(info_input)
193info_input.close()
194
195info_root = info_tree.getroot()
196if info_root.tag != "analysis":
197 print 'Invalid XML info file "' + analysisinfo_path + '".'
198 print 'Root tag should be <analysis>, not <' + info_root.tag + '>.'
199 sys.exit()
200if info_root.attrib["id"] != analysis_name:
201 print 'Invalid XML info file "' + analysisinfo_path + '".'
202 print 'Analysis id in root tag <analysis> should be "' + analysis_name + \
203 '", not "' + info_root.attrib["id"] + '".'
204 sys.exit()
205
206
207for child in info_root:
208 # for <lumi> tag
209 if child.tag == "lumi":
210 if lumi != 0:
211 print 'Warning: redefinition of the luminosity in the ' + \
212 'XML info file "' + analysisinfo_path + '".'
213
214 try:
215 lumi = float(child.text)
216 except TypeError: # empty tag is of type NULL
217 lumi = 0
218 except ValueError:
219 print 'Invalid XML info file "' + analysisinfo_path + '".'
220 print 'The value of the <lumi> tag is not a number.'
221 sys.exit()
222
223 # for the <region> tags
224 # if no type is specified, assumed to be a signal region
225 if child.tag == "region" and \
226 ("type" not in child.attrib or child.attrib["type"] == "signal"):
227 if "id" not in child.attrib:
228 print 'Invalid XML info file "' + analysisinfo_path + '".'
229 print 'Presence of <region> tags with no id attribute.'
230 sys.exit()
231
232 regionid = child.attrib["id"]
233
234 if regionid in signalregions:
235 # a <region> tag with the same id has already been defined
236 print 'Invalid XML info file "' + analysisinfo_path + '".'
237 print 'A region with id="' + regionid + ' is defined ' + \
238 'multiple times.'
239 sys.exit()
240
241 signalregions[regionid] = {"acceff": 0, "syst":0, "stat":0} # initialize efficiency to 0
242 for rchild in child:
243 if rchild.tag in ["nobs", "nb", "deltanb"]:
244 ntag = rchild.tag
245 if ntag in signalregions[regionid]:
246 print 'Warning: redefinition of <' + ntag + '> in the ' + \
247 'region "' + \
248 regionid + '" of the XML info file "' + \
249 analysisinfo_path + '".'
250
251 try:
252 signalregions[regionid][ntag] = float(rchild.text)
253 except TypeError: # empty tag is of type NULL
254 signalregions[regionid][ntag] = 0
255 except ValueError:
256 print 'Invalid XML info file "' + analysisinfo_path + '".'
257 print 'The value of the <' + ntag + '> tag in region "' + \
258 regionid + '" is not a number.'
259 sys.exit()
260
261######################################
262# then, read the SAF files
263# for the cutflows
264# generated by MA5
265######################################
266
267analysisinfo_path = analysis_path + "/Output/" + \
268 bench_name + "/" + analysis_name + "_" + run_number + "/Cutflows"
269
270listdir = listdirectory(analysisinfo_path)
271
272if not listdir:
273 print 'The directory "' + analysisinfo_path + '" containing the SAF' + \
274 ' files for the cutflows cannot be listed.'
275 sys.exit()
276
277for file in listdir:
278 # signal region (as defined in the XML info file)
279 # associated with the cutflow file
280 assoc_SR = ""
281
282 if file[-4:] != ".saf":
283 continue
284 SRname = file.split("/")[-1][:-4]
285
286 for regionid in signalregions:
287 for sr in regionid.split(";"):
288 if clean_name(sr) == SRname:
289 assoc_SR = sr
290
291 # if there is no signal region found associated with the SAF file
292 if assoc_SR == "":
293 print 'Warning: no region found associated with the SAF file "' + \
294 file + '"; will be skipped.'
295 continue
296
297 # otherwise, read the acceptance times efficiency from the SAF file
298 try:
299 SAF_cutflow = open(file)
300 except IOError as e:
301 print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
302 print 'Cannot open the XML info file "' + file + '".'
303 sys.exit()
304
305 in_initialcounter = False
306 in_counter = False
307 initialnum = 0
308 finalnum = 0
309 for line in SAF_cutflow:
310 line = line.rstrip("\n").strip()
311 if line[:16] == "<InitialCounter>":
312 in_initialcounter = True
313 continue
314 if line[:17] == "</InitialCounter>":
315 in_initialcounter = False
316 continue
317 if line[:9] == "<Counter>":
318 in_counter = True
319 continue
320 if line[:10] == "</Counter>":
321 in_counter = False
322 continue
323
324 if in_initialcounter and line[-14:] == "sum of weights":
325 try:
326 initialnum = float(line.split()[0])
327 except:
328 print 'Invalid SAF file "' + file + '".'
329 print 'The initial number of events cannot be read.'
330 sys.exit()
331
332 if in_counter and line[-14:] == "sum of weights":
333 try:
334 finalnum = float(line.split()[0])
335 except:
336 print 'Invalid SAF file "' + file + '".'
337 print 'The number of events cannot be read.'
338 sys.exit()
339
340 if initialnum == 0:
341 print 'Invalid SAF file "' + file + '".'
342 print 'The number of events cannot be read.'
343 sys.exit()
344
345 for regionid in signalregions:
346 # for each SR id as defined in the XML info file...
347 for individualSR in regionid.split(";"):
348 # ...look for each individual SR...
349 if individualSR == assoc_SR:
350 # ... if it matches with the SAF file read, add the
351 # acceptance*efficiency to existing value
352 signalregions[regionid]["acceff"] += finalnum / initialnum
353 signalregions[regionid]["stat"] += finalnum
354
355######################################
356# then, get the cross section info
357# if not given as command-line
358# argument, look into SAF file
359######################################
360
361if xsection == 0: # not given as command-line argument
362 mainSAF_path = analysis_path + "/Output/" + \
363 bench_name + "/" + bench_name + ".saf"
364
365 try:
366 mainSAF = open(mainSAF_path)
367 except IOError as e:
368 print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
369 print 'Cannot open the XML info file "' + mainSAF_path + '".'
370 sys.exit()
371
372 nline = 0
373 for line in mainSAF:
374 line = line.rstrip("\n").strip()
375 if line[:18] == "<SampleGlobalInfo>":
376 nline = 1
377 continue
378 if nline > 0:
379 nline += 1
380 if nline == 3:
381 try:
382 xsection = float(line.split()[0])
383 except:
384 print 'Invalid SAF file "' + mainSAF_path + '".'
385 print 'The cross section cannot be read.'
386 sys.exit()
387
388 if xsection <= 0.:
389 print 'Invalid cross section of ' + str(xsection) + ' pb.'
390 print 'The cross section cannot be read from the SAF file "' + \
391 mainSAF_path + '".'
392 sys.exit()
393 break
394
395######################################
396# now, we have the information on the
397# cross section (variable xsection),
398# the luminosity (variable lumi),
399# the number of background events,
400# observed events, and the acceptance
401# times efficiency (in the dictionary
402# signalregions)
403#
404# we can proceed with the exclusion
405######################################
406
407if xsection > 0:
408 # first, decide which signal regions yields the best expected limit
409 final_limit = exclusion_check(xsection)
410
411 print '\nThis signal is excluded at the ' + str(round(final_limit*100.,1)) + \
412 '% CL (CLs=' + str(round(1-final_limit,3)) + ').'
413else:
414 # if a negative cross section is given as input,
415 # the code is looking for the cross section that is excluded at 95% CL
416 # using a root-finding algorithm
417
418 print 'Negative cross section is given.'
419 print 'Will look for the cross section that is excluded at 95% CL'
420 print 'using a root-finding algorithm.\n'
421
422 # need to tune the lower and upper bound (corresponding to a cross section
423 # that we know is not excluded or excluded, respectively)
424 lowerb = 1. # 1 pb
425 upperb = 1. # 1 pb
426 while exclusion_check95(lowerb) > 0.:
427 lowerb *= 0.1
428 while exclusion_check95(upperb) < 0.:
429 upperb *= 10.
430
431 print '\nlower and upper bounds for the root-finding algorithm' + \
432 ' have been found: [%.2e %.2e] pb\n' % (lowerb, upperb)
433
434 final_limit = scipy.optimize.brentq(exclusion_check95, lowerb, upperb)
435
436 print '\nThe excluded cross section at 2 sigma is %.5E pb.' % final_limit
437
438# finally, write the results in an output file
439output_path = analysis_path + "/Output/" + \
440 bench_name + "/" + analysis_name + "_" + run_number + ".out"
441
442try:
443 output = open(output_path, "w")
444except IOError as e:
445 print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
446 print 'Cannot create the output file "' + output_path + '".'
447 sys.exit()
448
449output.write(bestSR+'\n'+str(final_limit))
450output.close()
451
452# Efficiencies
453import math
454out = open('efficiencies_'+analysis_name+'.dat','w')
455Title=['Signal region', 'Efficiency', 'Stat. error', 'Syst. error', 'Total error', 'Best region']
456out.write(Title[0].center(50) + Title[1].center(16) + Title[2].center(21) + Title[3].center(16) +\
457 Title[4].center(18)+ Title[5].rjust(10)+'\n')
458for region in signalregions:
459 if signalregions[region]["stat"]!=0:
460 signalregions[region]["stat"] =1./math.sqrt(float(signalregions[region]["stat"]))
461 signalregions[region]["syst"] = float(signalregions[region]["syst"])
462 IsBest=False;
463 if region is bestSR:
464 IsBest=True
465 out.write(region.replace(' ','').ljust(50) + "%1.5f".center(15) %(signalregions[region]["acceff"]) +\
466 "%1.6f".center(15) %(signalregions[region]["stat"])+ \
467 "%1.6f".center(15) %(signalregions[region]["syst"])+ \
468 "%1.6f".center(15) %(math.sqrt(signalregions[region]["syst"]**2+signalregions[region]["stat"]**2))+\
469 str(int(IsBest)).rjust(8) +\
470 '\n')
471out.close()