Source code for higgs_dna.scripts.plotting.plotting_utils

# Hist to plot sigma_m
import os 
import awkward as ak
import numpy as np
import matplotlib.pyplot as plt 
import mplhep, hist
plt.style.use([mplhep.style.CMS])
import pandas as pd
import scipy
import mplhep as hep
import glob
import warnings

import json

warnings.filterwarnings("ignore")

[docs]def plotter( mc_hist, mc_hist_up, mc_hist_down , data_hist, output_filename, flow_bins='none' , xlabel = 'myVariable', yscale='linear', IspostEE=False, IsBarrel=False):

    fig, ax = plt.subplots(2, 1, gridspec_kw={'height_ratios': [3, 1], 'hspace': 0.03}, sharex=True)

    stat_errors     = np.sqrt(mc_hist.variances())
    data_stat_error = np.sqrt(data_hist.variances()) 

    # plotting nominal and data histograms
    hep.histplot(
        mc_hist,
        label = r'$Z \rightarrow ee$ - simulation',
        yerr = stat_errors,
        density = True,
        linewidth=3,
        ax=ax[0],
        flow=flow_bins
    )

    hep.histplot(
        data_hist,
        label = "Data",
        yerr= data_stat_error,
        density = True,
        color="black",
        linewidth=3,
        histtype='errorbar',
        markersize=12,
        elinewidth=3,
        alpha=1,
        ax=ax[0],
        flow=flow_bins
    )

    ax[0].set_xlabel('')
    ax[0].margins(y=0.15)
    ax[0].set_ylim(0, 1.1*ax[0].get_ylim()[1])
    ax[0].tick_params(labelsize=22)

    # Ploting the systematic bands around the nominal sigma_m curve
    data_hist_numpy = data_hist.to_numpy()
    mc_hist_numpy = mc_hist.to_numpy()

    # Calculating the integrals to normalize the systematic yields, since it is a density that is being plotted!
    integral_mc_down = mc_hist_down.sum().value * (mc_hist_down.to_numpy()[1][1] - mc_hist_down.to_numpy()[1][0])
    integral_mc_up   = mc_hist_up.sum().value * (mc_hist_up.to_numpy()[1][1] - mc_hist_up.to_numpy()[1][0])

    # Lower and upper bounds of the systematic band - normalized to one
    lower_bound = mc_hist_down.to_numpy()[0]/integral_mc_down 
    upper_bound = mc_hist_up.to_numpy()[0]/integral_mc_up

    # Using the matplotlib .fill_between() to plot the bands around the nominal variation
    ax[0].fill_between(data_hist_numpy[1][:-1],
        lower_bound,
        upper_bound,
        step='post',
        hatch='XXX',
        alpha=0.9,
        facecolor="none",
        edgecolor="tab:purple", 
        linewidth=0)


    # line at 1 in the ratio plot
    ax[1].axhline(1, 0, 1, label=None, linestyle='--', color="black", linewidth=1)

    # Integrals of data and nominal mc for ratio calculation
    integral_data = data_hist.sum().value * (data_hist_numpy[1][1] - data_hist_numpy[1][0])
    integral_mc = mc_hist.sum().value * (mc_hist_numpy[1][1] - mc_hist_numpy[1][0])
    
    # to numpy histos
    mc_hist_numpy      = mc_hist.to_numpy()
    mc_hist_up_numpy   = mc_hist_up.to_numpy()
    mc_hist_down_numpy = mc_hist_down.to_numpy()

    integral_mc_up = mc_hist_up.sum().value * (mc_hist_up_numpy[1][1] - mc_hist_up_numpy[1][0])
    error_up       = np.abs(mc_hist_up_numpy[0]/integral_mc_up - mc_hist_numpy[0]/integral_mc)/(mc_hist_numpy[0]/integral_mc)

    integral_mc_down = mc_hist_down.sum().value * (mc_hist_down_numpy[1][1] - mc_hist_down_numpy[1][0])
    error_down       = np.abs(mc_hist_down_numpy[0]/integral_mc_down - mc_hist_numpy[0]/integral_mc)/(mc_hist_numpy[0]/integral_mc)

    # Plotting the bands around 1
    lower_bound = 1 - error_down
    upper_bound = 1 + error_up

    # Plot the hatched region
    ax[1].fill_between(data_hist_numpy[1][:-1],
        lower_bound,
        upper_bound,
        step='post',
        hatch='XXX',
        alpha=0.9,
        facecolor="none",
        edgecolor="tab:purple", 
        linewidth=0
    )

    # Calculating the ratio betwenn nominal mc and data and the stat error of mc + data
    values1, variances1 = data_hist.values(), data_hist.variances()
    values2, variances2 = mc_hist.values()  , mc_hist.variances()

    # Calculate the ratio 
    ratio = (values1/np.sum(values1)) / (values2/np.sum(values2))

    # Error from both data stat and MC stat
    error_ratio = np.sqrt((np.sqrt(variances2) / values2)**2 + (np.sqrt(variances1) / values1))**2

    error_ratio[error_ratio <= 0] = 0

    hep.histplot(
        ratio[:-1],
        bins=data_hist_numpy[1][:-1],
        label=None,
        color="black",
        histtype='errorbar',
        yerr= error_ratio[:-1],
        markersize=12,
        elinewidth=3,
        alpha=1,
        ax=ax[1]
    )

    # Setting the labels
    ax[0].set_ylabel("Normalised density", fontsize=26)
    ax[1].set_ylabel("Data / MC", fontsize=26)
    ax[1].set_xlabel(xlabel, fontsize=26, usetex=True)
    
    # Some axes configurations
    ax[0].tick_params(labelsize=24)
    ax[0].set_yscale(yscale)
    ax[1].set_ylim(0., 1.1*ax[0].get_ylim()[1])
    ax[1].set_ylim(0.6, 1.4)

    ax[0].legend(loc="upper right", fontsize=20)

    # Text indicating the region
    #if(IsBarrel):
    #    ax[0].text(0.05, 0.95, r'EB-EB', transform=ax[0].transAxes, fontsize=20, verticalalignment='top')
    #else:
    #    ax[0].text(0.05, 0.95, r'Not EB-EB', transform=ax[0].transAxes, fontsize=20, verticalalignment='top')

    # Setting the lumi labels
    if(IspostEE):
        hep.cms.label(data=True, ax=ax[0], loc=0, label="Private Work", com=13.6, lumi=27.0)
    else:
        hep.cms.label(data=True, ax=ax[0], loc=0, label="Private Work", com=13.6, lumi=8.1)

    plt.subplots_adjust(hspace=0.03)
    plt.tight_layout()
    fig.savefig(output_filename, bbox_inches='tight')


[docs]def create_mask(arr):

    data_mass = np.array(arr["mass"])

    #mask data interval 
    mask_mass = np.logical_and(data_mass > 80, data_mass < 100)
    mask_mass = np.logical_and(mask_mass, np.abs(np.array(arr["tag_eta"])) < 2.5 )
    mask_mass = np.logical_and(mask_mass, np.abs(np.array(arr["tag_pt"]))  > 42 )
    
    mask_mass = np.logical_and(mask_mass, np.abs(np.array(arr["probe_pt"]))  > 22 )
    mask_mass = np.logical_and(mask_mass, np.abs(np.array(arr["probe_eta"])) < 2.5 ) 

    #lets make the electron veto cut!
    mask_mass = np.logical_and(mask_mass, np.abs(np.array(arr["tag_electronVeto"]) ) == False )
    mask_mass = np.logical_and(mask_mass, np.abs(np.array(arr["probe_electronVeto"]) ) == False )

    # also, the cutBased ID!
    mask_mass = np.logical_and(mask_mass, np.array(arr["tag_cutBased"]) > 0 )
    mask_mass = np.logical_and(mask_mass, np.array(arr["probe_cutBased"]) > 0 )

    return mask_mass


[docs]def get_sigmaE_over_E(ak_arr, energyErr_field):
    return np.array(ak_arr[energyErr_field]/(ak_arr.probe_pt * np.cosh(ak_arr.probe_eta)))


[docs]def main():

    # Read the plot_settings json to check what we want to plot
    with open('./plot_settings.json', 'r') as f:
        plot_settings = json.load(f)
    
    # We need to read the nominal up and down systs
    energy_Err_systs = ["nominal","energyErrShift_up","energyErrShift_down"]
    store_sigma_over_m_syst = [] 
    store_sigma_over_m_corr_smeared_syst = [] 
    store_sigma_over_m_corr_syst = [] 
    store_sigma_over_m_smeared_syst = [] 
    _weights = [] # for MC
    store_mc_ebeb_masks = []
    store_mc_not_ebeb_masks = []

    for target in plot_settings.keys():
        print('INFO: Making the plot for', target, 'now...')
        target_settings = plot_settings[target] # Just a shortcut to avoid diving into deeply nested dictionaries
        ## Data
        ak_arr = ak.from_parquet(glob.glob(target_settings['files_data']))#, columns=target_settings['variable'])
        ak_arr = ak_arr[create_mask(ak_arr)] # Apply event selection
        if target == 'probe_energyErr_DIVbyE':
            data_arr = get_sigmaE_over_E(ak_arr, 'probe_energyErr')
        elif target == 'probe_corr_energyErr_DIVbyE':
            data_arr = get_sigmaE_over_E(ak_arr, 'probe_energyErr')
        # In Data, we do not have corrected variables
        elif target_settings['variable'] == 'sigma_m_over_m_corr':
            data_arr = ak_arr['sigma_m_over_m']
        elif target_settings['variable'] == 'sigma_m_over_m_Smeared_corrected':
            data_arr = ak_arr['sigma_m_over_m_Smeared']
        elif target_settings['variable'] == 'probe_corr_energyErr':
            data_arr = ak_arr['probe_energyErr']
        else:
            data_arr = ak_arr[target_settings['variable']]
        data_weights = np.ones(len(data_arr))

        MC_arr_list = [] # Needs to be re-initialised in each pass through the loop

        ## MC
        for variation in energy_Err_systs:
            print('Reading variation:', variation) 
            ak_arr = ak.from_parquet(glob.glob(target_settings['files_MC'] + str(variation) + "/*.parquet"))
            ak_arr = ak_arr[create_mask(ak_arr)]
            if target == 'probe_energyErr_DIVbyE':
                MC_arr_list.append(get_sigmaE_over_E(ak_arr, 'probe_energyErr'))
            elif target == 'probe_corr_energyErr_DIVbyE':
                MC_arr_list.append(get_sigmaE_over_E(ak_arr, 'probe_corr_energyErr'))
            else:
                MC_arr_list.append(np.array(ak_arr[target_settings['variable']])) # Maybe a dict would be better suited...
            _weights.append(np.array(ak_arr["weight"]))

        binning = target_settings['binning']
        
        data_hist    = hist.new.Reg(binning[0], binning[1], binning[2]).Weight()
        mc_hist      = hist.new.Reg(binning[0], binning[1], binning[2]).Weight() 
        mc_hist_up   = hist.new.Reg(binning[0], binning[1], binning[2]).Weight() 
        mc_hist_down = hist.new.Reg(binning[0], binning[1], binning[2]).Weight()

        if target_settings['density'] == True:
            data_hist.fill(data_arr, weight = data_weights/np.sum(data_weights))
            mc_hist.fill(MC_arr_list[0], weight = _weights[0]/np.sum(_weights[0]))
            mc_hist_up.fill(MC_arr_list[1], weight = _weights[1]/np.sum(_weights[1]))
            mc_hist_down.fill(MC_arr_list[2], weight = _weights[2]/np.sum(data_weights[2]))
        else:
            data_hist.fill(data_arr, weight = data_weights/np.sum(data_weights))
            mc_hist.fill(MC_arr_list[0], weight = _weights[0])
            mc_hist_up.fill(MC_arr_list[1], weight = _weights[1])
            mc_hist_down.fill(MC_arr_list[2], weight = _weights[2])
        
        plotter(mc_hist, mc_hist_up, mc_hist_down, data_hist, 'plots_smear/test/'+target+'.png', flow_bins = target_settings['flow_bins'], xlabel = target_settings['xlabel'], yscale=target_settings['yscale'])


if __name__ == "__main__":
    main()