Context Navigation

Back to PublicAnalysisDatabase

PublicAnalysisDatabase: exclusion_CLs.py

File exclusion_CLs.py, 16.5 KB (added by Benjamin Fuks, 10 years ago)
Statistics module for testing exclusion

Line
1	#! /usr/bin/env python
2
3	# derive exclusions for the tested scenario from the signal regions of an
4	# analysis implemented using MadAnalysis 5 (MA5), using a simplified procedure
5	# and the CLs prescription
6
7	# version 1.1 (July 9, 2014)
8	# made by Beranger Dumont
9	# based on toy MC code by Benjamin Fuks, Chris Wymant and Sam Bein
10	# v1.1: BF added an efficiency map calculator
11
12	# takes as input:
13	# -- an XML analysis_name.info file that should be present in the same directory
14	# as the analysis code, i.e. Build/SampleAnalyzer/User/Analyzer/
15	# -- the SAF files for the cutflows of the tested scenario of interest in all
16	# signal regions, where the information on acceptance*efficiency can be found
17	# -- the cross section for the tested scenario (if not given as argument in
18	# command-line, it is looked for in the SAF file analysis_name.saf
19
20	# returns the output on the screen and print basic results
21	# in the file analysis_name_[run number].out in
22	# Output/benchmark_point/
23
24
25	# path to the directory where the analysis code and output is present
26	# (this is the directory created when running MA5 in expert mode,
27	# containing the "Build", "Input" and "Output" directories)
28	analysis_path = "./"
29
30	# The number of Poisson distributions we consider (each one effectively being
31	# a toy experiment with its own certain prediction for the background):
32	numtoyexperiments = 100000
33
34	#
35	# the user is not supposed to modify the code below this line
36	#
37
38	import os, sys
39	try:
40	from lxml import ET
41	except:
42	import xml.etree.ElementTree as ET
43	import scipy.stats
44
45	def usage():
46	print 'Usage: ./exclusion_CLs.py analysis_name benchmark_point ' + \
47	'[run_number] [cross section in pb]'
48	print 'Example: ./exclusion_CLs.py cms_sus_13_011 T2tt_650_50.txt ' + \
49	'0 0.014'
50	print 'Default value of run number is 0'
51	print 'If the cross section is not given, it is taken from the MA5 output'
52
53	def listdirectory(path):
54	lfile=[]
55	for root, dirs, files in os.walk(path):
56	for i in files:
57	lfile.append(os.path.join(root, i))
58	return lfile
59
60	bestSR = ""
61
62	def clean_name(str):
63	# based on CleanName() from
64	# ./tools/SampleAnalyzer/Process/Core/SampleAnalyzer.cpp
65	# in MA5 v1.1.11beta4
66	str = str.replace("/", "_slash_")
67	str = str.replace("->", "_to_")
68	str = str.replace(">=", "_greater_than_or_equal_to_")
69	str = str.replace(">", "_greater_than_")
70	str = str.replace("<=", "_smaller_than_or_equal_to_")
71	str = str.replace("<", "_smaller_than_")
72	str = str.replace(" ", "_")
73	str = str.replace(",", "_")
74	str = str.replace("+", "_")
75	str = str.replace("-", "_")
76	str = str.replace("(", "_lp_")
77	str = str.replace(")", "_rp_")
78	return str
79
80	def CLs(NumObserved, ExpectedBG, BGError, SigHypothesis, NumToyExperiments):
81	# generate a set of expected-number-of-background-events, one for each toy
82	# experiment, distributed according to a Gaussian with the specified mean
83	# and uncertainty
84	ExpectedBGs = scipy.stats.norm.rvs(loc=ExpectedBG, \
85	scale=BGError, size=NumToyExperiments)
86
87	# Ignore values in the tail of the Gaussian extending to negative numbers
88	ExpectedBGs = [value for value in ExpectedBGs if value > 0]
89
90	# For each toy experiment, get the actual number of background events by
91	# taking one value from a Poisson distribution created using the expected
92	# number of events.
93	ToyBGs = scipy.stats.poisson.rvs(ExpectedBGs)
94	ToyBGs = map(float, ToyBGs)
95
96	# The probability for the background alone to fluctutate as LOW as
97	# observed = the fraction of the toy experiments with backgrounds as low as
98	# observed = p_b.
99	# NB (1 - this p_b) corresponds to what is usually called p_b for CLs.
100	p_b = scipy.stats.percentileofscore(ToyBGs, NumObserved, kind='weak')*.01
101
102	# Toy MC for background+signal
103	ExpectedBGandS = [expectedbg + SigHypothesis for expectedbg in ExpectedBGs]
104	ToyBplusS = scipy.stats.poisson.rvs(ExpectedBGandS)
105	ToyBplusS = map(float, ToyBplusS)
106
107	# Calculate the fraction of these that are >= the number observed,
108	# giving p_(S+B). Divide by (1 - p_b) a la the CLs prescription.
109	p_SplusB = scipy.stats.percentileofscore(ToyBplusS, NumObserved, kind='weak')*.01
110
111	return 1.-(p_SplusB / p_b) # 1 - CLs
112
113	def exclusion_check(crosssection):
114	global bestSR
115
116	if len(signalregions) > 1:
117	# if more than one signal region, decide which signal regions
118	# yields the best expected limit
119	limit = -1.
120	for SR in signalregions:
121	nsignal = crosssection * lumi * 1000. * signalregions[SR]["acceff"]
122	nb = signalregions[SR]["nb"]
123	deltanb = signalregions[SR]["deltanb"]
124
125	limitSR = CLs(nb, nb, deltanb, nsignal, numtoyexperiments)
126	if limitSR > limit:
127	bestSR = SR
128	limit = limitSR
129	else:
130	bestSR = signalregions.keys()[0]
131
132	nsignal = crosssection * lumi * 1000. * signalregions[bestSR]["acceff"]
133	nobs = int(signalregions[bestSR]["nobs"])
134	nb = signalregions[bestSR]["nb"]
135	deltanb = signalregions[bestSR]["deltanb"]
136	bestCLs=CLs(nobs, nb, deltanb, nsignal, numtoyexperiments)
137
138	print 'The best expected signal region is "' + bestSR + '".'
139	print 'It has: nobs = ' + str(nobs) + ', nb = ' + str(nb) + ' \pm ' + \
140	str(deltanb) + ', nsignal = ' + str(round(nsignal,2)) + '.'
141
142	return bestCLs
143
144	def exclusion_check95(crosssection):
145	return exclusion_check(crosssection)-0.95
146
147	if analysis_path[-1] == "/":
148	analysis_path = analysis_path[:-1]
149
150	# at least one argument, check if asking for help
151	if len(sys.argv) < 3 or (sys.argv[1] == "-h" or sys.argv[1] == "--help"):
152	usage()
153	sys.exit()
154
155	analysis_name = sys.argv[1]
156	bench_name = sys.argv[2]
157	if len(sys.argv) > 3:
158	run_number = sys.argv[3]
159	else:
160	run_number = "0"
161	if len(sys.argv) > 4:
162	try:
163	xsection = float(sys.argv[4])
164	except ValueError:
165	print 'Invalid cross section given as command-line argument: "' + \
166	sys.argv[4]+ '".'
167	sys.exit()
168	else:
169	xsection = 0
170
171
172
173	######################################
174	# first, read the XML .info file
175	# and fill the variable lumi
176	# and the dictionary signalregions
177	######################################
178
179	lumi = 0 # integrated luminosity, in fb^-1
180	signalregions = {}
181
182	analysisinfo_path = analysis_path + "/Build/SampleAnalyzer/User/Analyzer/" + \
183	analysis_name + ".info"
184
185	try:
186	info_input = open(analysisinfo_path)
187	except IOError as e:
188	print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
189	print 'Cannot open the XML info file "' + analysisinfo_path + '".'
190	sys.exit()
191
192	info_tree = ET.parse(info_input)
193	info_input.close()
194
195	info_root = info_tree.getroot()
196	if info_root.tag != "analysis":
197	print 'Invalid XML info file "' + analysisinfo_path + '".'
198	print 'Root tag should be <analysis>, not <' + info_root.tag + '>.'
199	sys.exit()
200	if info_root.attrib["id"] != analysis_name:
201	print 'Invalid XML info file "' + analysisinfo_path + '".'
202	print 'Analysis id in root tag <analysis> should be "' + analysis_name + \
203	'", not "' + info_root.attrib["id"] + '".'
204	sys.exit()
205
206
207	for child in info_root:
208	# for <lumi> tag
209	if child.tag == "lumi":
210	if lumi != 0:
211	print 'Warning: redefinition of the luminosity in the ' + \
212	'XML info file "' + analysisinfo_path + '".'
213
214	try:
215	lumi = float(child.text)
216	except TypeError: # empty tag is of type NULL
217	lumi = 0
218	except ValueError:
219	print 'Invalid XML info file "' + analysisinfo_path + '".'
220	print 'The value of the <lumi> tag is not a number.'
221	sys.exit()
222
223	# for the <region> tags
224	# if no type is specified, assumed to be a signal region
225	if child.tag == "region" and \
226	("type" not in child.attrib or child.attrib["type"] == "signal"):
227	if "id" not in child.attrib:
228	print 'Invalid XML info file "' + analysisinfo_path + '".'
229	print 'Presence of <region> tags with no id attribute.'
230	sys.exit()
231
232	regionid = child.attrib["id"]
233
234	if regionid in signalregions:
235	# a <region> tag with the same id has already been defined
236	print 'Invalid XML info file "' + analysisinfo_path + '".'
237	print 'A region with id="' + regionid + ' is defined ' + \
238	'multiple times.'
239	sys.exit()
240
241	signalregions[regionid] = {"acceff": 0, "syst":0, "stat":0} # initialize efficiency to 0
242	for rchild in child:
243	if rchild.tag in ["nobs", "nb", "deltanb"]:
244	ntag = rchild.tag
245	if ntag in signalregions[regionid]:
246	print 'Warning: redefinition of <' + ntag + '> in the ' + \
247	'region "' + \
248	regionid + '" of the XML info file "' + \
249	analysisinfo_path + '".'
250
251	try:
252	signalregions[regionid][ntag] = float(rchild.text)
253	except TypeError: # empty tag is of type NULL
254	signalregions[regionid][ntag] = 0
255	except ValueError:
256	print 'Invalid XML info file "' + analysisinfo_path + '".'
257	print 'The value of the <' + ntag + '> tag in region "' + \
258	regionid + '" is not a number.'
259	sys.exit()
260
261	######################################
262	# then, read the SAF files
263	# for the cutflows
264	# generated by MA5
265	######################################
266
267	analysisinfo_path = analysis_path + "/Output/" + \
268	bench_name + "/" + analysis_name + "_" + run_number + "/Cutflows"
269
270	listdir = listdirectory(analysisinfo_path)
271
272	if not listdir:
273	print 'The directory "' + analysisinfo_path + '" containing the SAF' + \
274	' files for the cutflows cannot be listed.'
275	sys.exit()
276
277	for file in listdir:
278	# signal region (as defined in the XML info file)
279	# associated with the cutflow file
280	assoc_SR = ""
281
282	if file[-4:] != ".saf":
283	continue
284	SRname = file.split("/")[-1][:-4]
285
286	for regionid in signalregions:
287	for sr in regionid.split(";"):
288	if clean_name(sr) == SRname:
289	assoc_SR = sr
290
291	# if there is no signal region found associated with the SAF file
292	if assoc_SR == "":
293	print 'Warning: no region found associated with the SAF file "' + \
294	file + '"; will be skipped.'
295	continue
296
297	# otherwise, read the acceptance times efficiency from the SAF file
298	try:
299	SAF_cutflow = open(file)
300	except IOError as e:
301	print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
302	print 'Cannot open the XML info file "' + file + '".'
303	sys.exit()
304
305	in_initialcounter = False
306	in_counter = False
307	initialnum = 0
308	finalnum = 0
309	for line in SAF_cutflow:
310	line = line.rstrip("\n").strip()
311	if line[:16] == "<InitialCounter>":
312	in_initialcounter = True
313	continue
314	if line[:17] == "</InitialCounter>":
315	in_initialcounter = False
316	continue
317	if line[:9] == "<Counter>":
318	in_counter = True
319	continue
320	if line[:10] == "</Counter>":
321	in_counter = False
322	continue
323
324	if in_initialcounter and line[-14:] == "sum of weights":
325	try:
326	initialnum = float(line.split()[0])
327	except:
328	print 'Invalid SAF file "' + file + '".'
329	print 'The initial number of events cannot be read.'
330	sys.exit()
331
332	if in_counter and line[-14:] == "sum of weights":
333	try:
334	finalnum = float(line.split()[0])
335	except:
336	print 'Invalid SAF file "' + file + '".'
337	print 'The number of events cannot be read.'
338	sys.exit()
339
340	if initialnum == 0:
341	print 'Invalid SAF file "' + file + '".'
342	print 'The number of events cannot be read.'
343	sys.exit()
344
345	for regionid in signalregions:
346	# for each SR id as defined in the XML info file...
347	for individualSR in regionid.split(";"):
348	# ...look for each individual SR...
349	if individualSR == assoc_SR:
350	# ... if it matches with the SAF file read, add the
351	# acceptance*efficiency to existing value
352	signalregions[regionid]["acceff"] += finalnum / initialnum
353	signalregions[regionid]["stat"] += finalnum
354
355	######################################
356	# then, get the cross section info
357	# if not given as command-line
358	# argument, look into SAF file
359	######################################
360
361	if xsection == 0: # not given as command-line argument
362	mainSAF_path = analysis_path + "/Output/" + \
363	bench_name + "/" + bench_name + ".saf"
364
365	try:
366	mainSAF = open(mainSAF_path)
367	except IOError as e:
368	print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
369	print 'Cannot open the XML info file "' + mainSAF_path + '".'
370	sys.exit()
371
372	nline = 0
373	for line in mainSAF:
374	line = line.rstrip("\n").strip()
375	if line[:18] == "<SampleGlobalInfo>":
376	nline = 1
377	continue
378	if nline > 0:
379	nline += 1
380	if nline == 3:
381	try:
382	xsection = float(line.split()[0])
383	except:
384	print 'Invalid SAF file "' + mainSAF_path + '".'
385	print 'The cross section cannot be read.'
386	sys.exit()
387
388	if xsection <= 0.:
389	print 'Invalid cross section of ' + str(xsection) + ' pb.'
390	print 'The cross section cannot be read from the SAF file "' + \
391	mainSAF_path + '".'
392	sys.exit()
393	break
394
395	######################################
396	# now, we have the information on the
397	# cross section (variable xsection),
398	# the luminosity (variable lumi),
399	# the number of background events,
400	# observed events, and the acceptance
401	# times efficiency (in the dictionary
402	# signalregions)
403	#
404	# we can proceed with the exclusion
405	######################################
406
407	if xsection > 0:
408	# first, decide which signal regions yields the best expected limit
409	final_limit = exclusion_check(xsection)
410
411	print '\nThis signal is excluded at the ' + str(round(final_limit*100.,1)) + \
412	'% CL (CLs=' + str(round(1-final_limit,3)) + ').'
413	else:
414	# if a negative cross section is given as input,
415	# the code is looking for the cross section that is excluded at 95% CL
416	# using a root-finding algorithm
417
418	print 'Negative cross section is given.'
419	print 'Will look for the cross section that is excluded at 95% CL'
420	print 'using a root-finding algorithm.\n'
421
422	# need to tune the lower and upper bound (corresponding to a cross section
423	# that we know is not excluded or excluded, respectively)
424	lowerb = 1. # 1 pb
425	upperb = 1. # 1 pb
426	while exclusion_check95(lowerb) > 0.:
427	lowerb *= 0.1
428	while exclusion_check95(upperb) < 0.:
429	upperb *= 10.
430
431	print '\nlower and upper bounds for the root-finding algorithm' + \
432	' have been found: [%.2e %.2e] pb\n' % (lowerb, upperb)
433
434	final_limit = scipy.optimize.brentq(exclusion_check95, lowerb, upperb)
435
436	print '\nThe excluded cross section at 2 sigma is %.5E pb.' % final_limit
437
438	# finally, write the results in an output file
439	output_path = analysis_path + "/Output/" + \
440	bench_name + "/" + analysis_name + "_" + run_number + ".out"
441
442	try:
443	output = open(output_path, "w")
444	except IOError as e:
445	print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
446	print 'Cannot create the output file "' + output_path + '".'
447	sys.exit()
448
449	output.write(bestSR+'\n'+str(final_limit))
450	output.close()
451
452	# Efficiencies
453	import math
454	out = open('efficiencies_'+analysis_name+'.dat','w')
455	Title=['Signal region', 'Efficiency', 'Stat. error', 'Syst. error', 'Total error', 'Best region']
456	out.write(Title[0].center(50) + Title[1].center(16) + Title[2].center(21) + Title[3].center(16) +\
457	Title[4].center(18)+ Title[5].rjust(10)+'\n')
458	for region in signalregions:
459	if signalregions[region]["stat"]!=0:
460	signalregions[region]["stat"] =1./math.sqrt(float(signalregions[region]["stat"]))
461	signalregions[region]["syst"] = float(signalregions[region]["syst"])
462	IsBest=False;
463	if region is bestSR:
464	IsBest=True
465	out.write(region.replace(' ','').ljust(50) + "%1.5f".center(15) %(signalregions[region]["acceff"]) +\
466	"%1.6f".center(15) %(signalregions[region]["stat"])+ \
467	"%1.6f".center(15) %(signalregions[region]["syst"])+ \
468	"%1.6f".center(15) %(math.sqrt(signalregions[region]["syst"]2+signalregions[region]["stat"]2))+\
469	str(int(IsBest)).rjust(8) +\
470	'\n')
471	out.close()

Download in other formats:

Original Format