Source code for higgs_dna.tools.photonid_mva

import warnings
from typing import List, Optional, Tuple

import awkward as ak
import numpy
import xgboost


[docs]def load_photonid_mva(fname: str) -> Optional[xgboost.Booster]:
    try:
        photonid_mva = xgboost.Booster()
        photonid_mva.load_model(fname)
    except (xgboost.core.XGBoostError, FileNotFoundError) as e:
        warnings.warn(
            f"Could not load photonid MVA model from '{fname}': {e}. Skipping.",
            stacklevel=2
        )
        photonid_mva = None
    return photonid_mva


[docs]def calculate_photonid_mva(
    mva: Tuple[Optional[xgboost.Booster], List[str]],
    photon: ak.Array,
) -> ak.Array:
    """Recompute PhotonIDMVA on-the-fly. This step is necessary considering that the inputs have to be corrected
    with the QRC process. Following is the list of features (barrel has 12, endcap two more):
    EB:
        events.Photon.energyRaw
        events.Photon.r9
        events.Photon.sieie
        events.Photon.etaWidth
        events.Photon.phiWidth
        events.Photon.sieip
        events.Photon.s4
        events.Photon.pfPhoIso03
        events.Photon.pfChargedIsoPFPV
        events.Photon.pfChargedIsoWorstVtx
        events.Photon.eta
        events.fixedGridRhoAll

    EE: EB +
        events.Photon.esEffSigmaRR
        events.Photon.esEnergyOverRawE
    """
    photonid_mva, var_order = mva

    if photonid_mva is None:
        return ak.ones_like(photon.pt)

    bdt_inputs = {}
    bdt_inputs = numpy.column_stack(
        [ak.to_numpy(photon[name]) for name in var_order]
    )
    tempmatrix = xgboost.DMatrix(bdt_inputs, feature_names=var_order)

    mvaID = photonid_mva.predict(tempmatrix)

    # Only needed to compare to TMVA
    # mvaID = 1.0 - 2.0 / (1.0 + numpy.exp(2.0 * mvaID))

    # the previous transformation was not working correctly, peakin at about 0.7
    # since we can't really remember why that functional form was picked in the first place we decided
    # to switch to a simpler stretch of the output that works better, even though not perfectly.
    # Open for changes/ideas
    mvaID = -1 + 2 * mvaID

    return mvaID


[docs]def load_photonid_mva_run3(fname: str) -> Optional[xgboost.Booster]:

    """ Reads and returns both the EB and EE Xgboost run3 mvaID models """

    photonid_mva_EB = xgboost.Booster()
    photonid_mva_EB.load_model(fname + 'Egamma_Run3_photonIDMVA_EB.json')

    photonid_mva_EE = xgboost.Booster()
    photonid_mva_EE.load_model(fname + 'Egamma_Run3_photonIDMVA_EE.json')

    return photonid_mva_EB, photonid_mva_EE


[docs]def calculate_photonid_mva_run3(
    mva: Tuple[Optional[xgboost.Booster], List[str]],
    photon: ak.Array, rho,
) -> ak.Array:

    """Recompute PhotonIDMVA on-the-fly. This step is necessary considering that the inputs have to be corrected
    with the QRC process. Following is the list of features (barrel has 12, endcap two more):
    EB:
        events.Photon.energyRaw
        events.Photon.r9
        events.Photon.sieie
        events.Photon.etaWidth
        events.Photon.phiWidth
        events.Photon.sieip
        events.Photon.s4
        events.photon.hoe
        probe_ecalPFClusterIso
        probe_trkSumPtHollowConeDR03
        probe_trkSumPtSolidConeDR04
        probe_pfChargedIso
        probe_pfChargedIsoWorstVtx
        events.Photon.ScEta
        events.fixedGridRhoAll

    EE: +
        events.Photon.energyRaw
        events.Photon.r9
        events.Photon.sieie
        events.Photon.etaWidth
        events.Photon.phiWidth
        events.Photon.sieip
        events.Photon.s4
        events.photon.hoe
        probe_ecalPFClusterIso
        probe_hcalPFClusterIso
        probe_trkSumPtHollowConeDR03
        probe_trkSumPtSolidConeDR04
        probe_pfChargedIso
        probe_pfChargedIsoWorstVtx
        events.Photon.ScEta
        events.fixedGridRhoAll
        events.Photon.esEffSigmaRR
        events.Photon.esEnergyOverRawE
    """

    photon["fixedGridRhoAll"] = numpy.nan_to_num(rho)
    photon = numpy.nan_to_num(photon)

    photonid_mva, var_order = mva

    if photonid_mva is None:
        return ak.ones_like(photon.pt)

    bdt_inputs = {}
    bdt_inputs = numpy.column_stack(
        [ak.to_numpy(photon[name]) for name in var_order]
    )

    tempmatrix = xgboost.DMatrix(bdt_inputs)

    mvaID = photonid_mva.predict(tempmatrix)

    # Only needed to compare to TMVA
    mvaID = 1.0 - 2.0 / (1.0 + numpy.exp(2.0 * mvaID))

    return mvaID