# Assigns an ID to each sample along with different access functions # Author in C++: Matthias Schroeder # matthias.schroeder@AT@desy.de # November 2013 import glob import numpy as np # manipulations and operations on arrays import pandas as pd # easy access to data structures import matplotlib.pyplot as plt # plotting from matplotlib.lines import Line2D import time from scipy.stats import poisson from scipy.stats import norm import ast # useful for converting strings to lists class Sample: #path to data needs to be set manually path = "/Unknown Path/" # check if ID is correct def checkId(id): if( id > 6 or id < 0 ): print("\n\nERROR invalid sample id ",id) # input id output full file name def fileNameFullSample(id): Sample.checkId(id) if id == 0: name = "4VectorData.csv" if id == 7: name = "MuHadUnnested.csv" if id == 1: name = "4VectorWJets.csv" if id == 2: name = "4VectorTTJets.csv" if id == 3: name = "4VectorZJets.csv" if id == 4: name = "4VectorQCD.csv" if id == 5: name = "4VectorLM6.csv" if id == 6: name = "4VectorLM9.csv" name = Sample.path + name #for i in range(len(name)): # name[i] = Sample.path+name[i] return name # input id output name of data (label) for legends in plot def label(id): Sample.checkId(id) label = "" if( id == 0 ): label += "Data" if( id == 1 ): label += "W(l#nu)+Jets" if( id == 2 ): label += "t#bar{t}+Jets" if( id == 3 ): label += "Z(#nu#bar{#nu})+Jets" if( id == 4 ): label += "QCD" if( id == 5 ): label += "LM6" if( id == 6 ): label += "LM9" return label # sample name without spaces for files def toString(id): Sample.checkId(id) string = "" if( id == 0 ): string += "Data" if( id == 1 ): string += "WJets" if( id == 2 ): string += "TTJets" if( id == 3 ): string += "ZJets" if( id == 4 ): string += "QCD" if( id == 5 ): string += "LM6" if( id == 6 ): string += "LM9" return string # assigns color to each sample for plotting def color(id): Sample.checkId(id) color = 0; if( id == 0 ): color = 'k' if( id == 1 ): color = 'g' if( id == 2 ): color = 'r' if( id == 3 ): color = 'y' if( id == 4 ): color = 'm' if( id == 5 ): color = 'b' if( id == 6 ): color = 'c' return color # returns cross section: def CrossSection(ID): if ID == 1: return(30.08) if ID == 2: return(127.06) if ID == 3: return(6.26) if ID == 4: return(8630.0) if ID == 5: return(0.502) if ID == 6: return(9.287) else: return('Wrong ID: cross section not found') # total Number of MC events prior to selection def TotalEvents(ID): if ID == 1: return(6619654.) if ID == 2: return(1925324.) if ID == 3: return(51637.) if ID == 4: return(44443155.) if ID == 5: return(51282.) if ID == 6: return(51282.) else: return('Wrong ID: Total Number of MC Events not found') # takes a dataframe or a list of dataframes and returns the maximum number of jets in the dataframe def MaxNJets(df): if isinstance(df, list): out = [(len(x.columns)-2)//4 for x in df ] elif isinstance(df, pd.DataFrame): out = (len(df.columns)-2)//4 else: print('MaxNJets Error: Input is not a dataframe or a list of dataframes') return(out) # returns list of MaxNJets for the dataframes in input # takes a dataframe or a list of dataframes and returns list of the jets as strings def JetNames(df): if isinstance(df, list): out = [] for element in df: Jets_in_df = ['Jet {}'.format(i) for i in range(Sample.MaxNJets(element))] out.append(Jets_in_df) elif isinstance(df, pd.DataFrame): out = ['Jet {}'.format(i) for i in range(Sample.MaxNJets(df))] else: print('Jets Error: Input is not a dataframe or a list of dataframes') return (out) # takes a dataframe and a Jetnumber and returns needed jet as tuble def Get_Jet(df,Jet_Number): return(df[Sample.JetNames(df)[Jet_Number]]) # subtract arrays while ignoring NaN values def nansubtract(x1,x2): return(np.nansum(np.array([x1,-1*x2]),axis = 0)) def Bin_Values(Data, Bins, Weights): binValues, binEdges = np.histogram( Data, bins = Bins ,weights = Weights) bincenters = 0.5*(binEdges[1:]+binEdges[:-1]) out = [ '{:.10f}'.format(a) for a in binValues] return(out) # Compare data with background and signal distributions def Compare(Background, Signal, Data, Bins, Labels = None, Colors = None): # Function Variables: #Bkg: List of background dataframes #Sgl: List of signal dataframes #Data: Data Dataframe NOT a List #Bins: list of bins for MaxNJets, Ht, and MHt in this order #labels: list of labels in following order: [Data,Background,Signal] #color: list of colors in following order: [Data,Background,Signal] np.set_printoptions(formatter={'float_kind':'{:f}'.format}) fig, axs = plt.subplots(1, 3, figsize = (21.6,4.8)) # Plotting Background Bkg_Weights = [Bkg['Weights'] for Bkg in Background] Bkg_Njets = [Bkg['NJets'] for Bkg in Background] Bkg_Ht = [Bkg['Ht'] for Bkg in Background] Bkg_MHt = [Bkg['MHt']for Bkg in Background] Bkg_labels = [Labels[i+1] for i in range(len(Background))] # +1 to aviod counting data as background Bkg_colors = [Colors[i+1] for i in range(len(Background))] axs[0].hist(Bkg_Njets, bins = Bins[0] , color = Bkg_colors, label = Bkg_labels,stacked=True,weights = Bkg_Weights) axs[1].hist(Bkg_Ht , bins = Bins[1] ,color = Bkg_colors, label = Bkg_labels,stacked=True,weights = Bkg_Weights) axs[2].hist( Bkg_MHt , bins = Bins[2] ,color = Bkg_colors, label = Bkg_labels,stacked=True,weights = Bkg_Weights) # Plotting Signals for signal_index,signal_dataframe in enumerate(Signal): Signal_Weights = signal_dataframe['Weights'] Signal_Njets = signal_dataframe['NJets'] Signal_Ht = signal_dataframe['Ht'] Signal_MHt = signal_dataframe['MHt'] Signal_labels = Labels[len(Background)+1+signal_index] # note that Labels = [Data_label, Background_labels, Signal_labels] Signal_colors = Colors[len(Background)+1+signal_index] # note that Labels = [Data_label, Background_labels, Signal_labels] axs[0].hist(Signal_Njets,bins=Bins[0],color=Signal_colors,label=Signal_labels,stacked=False,weights=Signal_Weights,histtype=u'step') axs[1].hist(Signal_Ht,bins=Bins[1],color=Signal_colors,label=Signal_labels,stacked=False,weights=Signal_Weights,histtype=u'step') axs[2].hist( Signal_MHt,bins=Bins[2],color=Signal_colors,label=Signal_labels,stacked=False,weights=Signal_Weights,histtype=u'step') # Plotting Data for Variable_index,Variable in enumerate(['NJets','Ht','MHt']): binValues, binEdges = np.histogram(Data[Variable], bins =Bins[Variable_index], weights = Data['Weights']) bincenters = 0.5*(binEdges[1:]+binEdges[:-1]) menStd = np.sqrt(binValues) axs[Variable_index].errorbar(bincenters, binValues,color = Colors[0],yerr = np.sqrt(binValues) ,label = Labels[0] ,fmt='o') #----Adjust--Axis--- for N_axs in [0,1,2]: axs[N_axs].set_yscale("log") axs[N_axs].set_ylabel('Events',fontsize=16) axs[N_axs].legend() axs[0].set_xlabel('N(jets)',fontsize=16) axs[1].set_xlabel('$H_{t}$ [GeV]',fontsize=16) axs[2].set_xlabel("$ MH_{t}[GeV] $",fontsize=16) plt.show()