#!/usr/bin/env python
import argparse
from higgs_dna.utils.logger_utils import setup_logger
import pandas as pd
import uproot
import awkward as ak
import numpy as np
import os
import json
from importlib import resources
[docs]def main():
parser = argparse.ArgumentParser(
description="Simple utility script to convert one parquet file into one ROOT file."
)
parser.add_argument("source", type=str, help="Path to input file.")
parser.add_argument("target", type=str, help="Path to desired output file.")
parser.add_argument("type", type=str, help="Type of dataset (data or mc).")
parser.add_argument(
"--log", dest="log", type=str, default="INFO", help="Logger info level"
)
parser.add_argument("--process", type=str, default="", help="Production mode.")
parser.add_argument(
"--notag",
dest="notag",
action="store_true",
default=False,
help="create NOTAG dataset as well.",
)
parser.add_argument(
"--do-syst",
dest="do_syst",
action="store_true",
default=False,
help="create branches for systematic variations",
)
parser.add_argument(
"--cats",
type=str,
dest="cats_dict",
default="",
help="Dictionary containing category selections.",
)
parser.add_argument(
"--vars",
type=str,
dest="vars_dict",
default="",
help="Dictionary containing variations.",
)
parser.add_argument(
"--abs",
dest="abs",
action="store_true",
default=False,
help="Uses absolute path for the dictionary files.",
)
parser.add_argument(
"--genBinning",
type=str,
dest="genBinning",
default="",
help="Optional: Path to the JSON containing the binning at gen-level.",
)
args = parser.parse_args()
source_path = args.source
target_path = args.target
type = args.type
notag = True if (type == "mc" and args.notag == True) else False
process = args.process if (args.process != "") else "data"
logger = setup_logger(level=args.log)
BASEDIR = resources.files("higgs_dna").joinpath("")
if args.genBinning != "":
if args.abs:
genBinning_path = os.path.realpath(args.genBinning)
else:
genBinning_path = os.path.join(BASEDIR, "scripts/postprocessing/sample_gen_binning.json")
with open(genBinning_path, 'r') as json_file:
gen_binning = json.load(json_file)
else:
gen_binning = None
# Dictionary for renaming variables in ROOT tree output for final fits
rename_dict = {
"mass": "CMS_hgg_mass"
}
# Ensure that the target directory exists
os.makedirs('/'.join(target_path.split("/")[:-1]), exist_ok=True)
df_dict = {}
outfiles = {
"ch": target_path.replace(
"merged.root", "output_cHToGG_M125_13TeV_amcatnloFXFX_pythia8.root"
),
"ggh": target_path.replace(
"merged.root", "output_GluGluHToGG_M125_13TeV_amcatnloFXFX_pythia8.root"
),
"ggh_125": target_path.replace(
"merged.root", "output_GluGluHToGG_M125_13TeV_amcatnloFXFX_pythia8.root"
),
"ggh_120": target_path.replace(
"merged.root", "output_GluGluHToGG_M120_13TeV_amcatnloFXFX_pythia8.root"
),
"ggh_130": target_path.replace(
"merged.root", "output_GluGluHToGG_M130_13TeV_amcatnloFXFX_pythia8.root"
),
"vbf": target_path.replace(
"merged.root", "output_VBFHToGG_M125_13TeV_amcatnlo_pythia8.root"
),
"vbf_125": target_path.replace(
"merged.root", "output_VBFHToGG_M125_13TeV_amcatnlo_pythia8.root"
),
"vbf_120": target_path.replace(
"merged.root", "output_VBFHToGG_M120_13TeV_amcatnlo_pythia8.root"
),
"vbf_130": target_path.replace(
"merged.root", "output_VBFHToGG_M130_13TeV_amcatnlo_pythia8.root"
),
"vh": target_path.replace(
"merged.root", "output_VHToGG_M125_13TeV_amcatnlo_pythia8.root"
),
"vh_125": target_path.replace(
"merged.root", "output_VHToGG_M125_13TeV_amcatnlo_pythia8.root"
),
"vh_120": target_path.replace(
"merged.root", "output_VHToGG_M120_13TeV_amcatnlo_pythia8.root"
),
"vh_130": target_path.replace(
"merged.root", "output_VHToGG_M130_13TeV_amcatnlo_pythia8.root"
),
"tth": target_path.replace(
"merged.root", "output_TTHToGG_M125_13TeV_amcatnlo_pythia8.root"
),
"tth_125": target_path.replace(
"merged.root", "output_TTHToGG_M125_13TeV_amcatnlo_pythia8.root"
),
"tth_120": target_path.replace(
"merged.root", "output_TTHToGG_M120_13TeV_amcatnlo_pythia8.root"
),
"tth_130": target_path.replace(
"merged.root", "output_TTHToGG_M130_13TeV_amcatnlo_pythia8.root"
),
"dy": target_path.replace(
"merged.root", "output_DYto2L.root"
),
"ggbox": target_path.replace(
"merged.root", "output_GG-Box-3Jets_MGG-80.root"
),
"gjet": target_path.replace(
"merged.root", "output_GJet_DoubleEMEnriched_MGG-80.root"
),
"data": target_path.replace("merged.root", "allData_2017.root"),
}
# Loading category informations (used for naming of files to read/write)
if args.cats_dict != "":
if args.abs:
cats_path = os.path.realpath(args.cats_dict)
else:
cats_path = os.path.join(BASEDIR, args.cats_dict)
with open(cats_path) as pf:
# with resources.open_text("higgs_dna", args.cats_dict) as pf:
cat_dict = json.load(pf)
for cat in cat_dict:
logger.debug(f"Found category: {cat}")
else:
logger.info(
"You provided an invalid dictionary containing categories information, have a look at your version of prepare_output_file.py"
)
logger.info(
"An inclusive NOTAG category is used as default"
)
cat_dict = {"NOTAG": {"cat_filter": [("pt", ">", -1.0)]}}
# Loading variation informations (used for naming of files to read/write)
# Active object systematics, weight systematics are just different sets of weights contained in the nominal file
if args.vars_dict != "":
if args.abs:
vars_path = os.path.realpath(args.vars_dict)
else:
vars_path = os.path.join(BASEDIR, args.vars_dict)
# with resources.open_text("higgs_dna", args.vars_dict) as pf:
with open(vars_path) as pf:
variation_dict = json.load(pf)
for var in variation_dict:
logger.debug(f"Found variation: {var}")
else:
if args.do_syst:
raise Exception(
"You provided an invalid dictionary containing systematic variations information, have a look at your version of prepare_output_file.py"
)
if args.do_syst:
# object systematics come from a different file (you are supposed to have merged .parquet with the merge_parquet.py script)
for var in variation_dict:
df_dict[var] = {}
for cat in cat_dict:
var_path = source_path.replace(
"merged.parquet", f"{variation_dict[var]}/{cat}_merged.parquet"
)
logger.info(
f"Starting conversion of one parquet file to ROOT. Attempting to read file {var_path} for category: {cat}."
)
eve = ak.from_parquet(var_path)
logger.info("Successfully read from parquet file with awkward.")
dict = {}
for i in eve.fields:
i_re = rename_dict[i] if i in rename_dict else i
dict[i_re] = eve[i]
df_dict[var][cat] = dict
logger.debug(
f"Successfully created dict from awkward arrays for {var} variation for category: {cat}."
)
logger.info(f"Attempting to write dict to ROOT file {target_path}.")
else:
for cat in cat_dict:
var_path = source_path.replace("merged.parquet", f"{cat}_merged.parquet")
logger.info(
f"Starting conversion of one parquet file to ROOT. Attempting to read file {var_path}."
)
eve = ak.from_parquet(var_path)
logger.info("Successfully read from parquet file with awkward.")
dict = {}
for i in eve.fields:
i_re = rename_dict[i] if i in rename_dict else i
dict[i_re] = eve[i]
df_dict[cat] = dict
logger.debug(
f"Successfully created dict from awkward arrays without variation for category: {cat}."
)
cat_postfix = {"ggh": "GG2H", "vbf": "VBF", "tth": "TTH", "vh": "VH", "dy": "DY"}
# For MC: {inputTreeDir}/{production-mode}_{mass}_{sqrts}_{category}
# For data: {inputTreeDir}/Data_{sqrts}_{category}
labels = {}
names = {}
if type == "mc":
for cat in cat_dict:
if len(process.split("_"))>1:
# If process of the form {process}_{mass}
names[
cat
] = f"DiphotonTree/{process.split('_')[0]}_{process.split('_')[-1]}_13TeV_{cat}" # _"+cat_postfix[process]
else:
names[
cat
] = f"DiphotonTree/{process}_125_13TeV_{cat}" # _"+cat_postfix[process]
labels[cat] = []
if len(process.split("_"))>1:
name_notag = "DiphotonTree/" + process.split('_')[0] + f"_{process.split('_')[-1]}_13TeV_NOTAG"
else:
name_notag = "DiphotonTree/" + process + "_125_13TeV_NOTAG"
# flashggFinalFit needs to have each systematic variation in a different branch
if args.do_syst:
for var in variation_dict:
for cat in cat_dict:
# for object systematics we have different files storing the variated collections with the nominal weights
syst_ = var
logger.info("found syst: %s for category: %s" % (syst_, cat))
if len(process.split("_"))>1:
labels[cat].append(
[
"DiphotonTree/" + process.split('_')[0] + f"_{process.split('_')[-1]}_13TeV_{cat}_" + syst_,
"weight",
syst_,
cat,
]
)
else:
labels[cat].append(
[
"DiphotonTree/" + process + f"_125_13TeV_{cat}_" + syst_,
"weight",
syst_,
cat,
]
)
else:
for cat in cat_dict:
labels[cat] = []
labels[cat].append([f"DiphotonTree/Data_13TeV_{cat}", cat])
names[cat] = f"DiphotonTree/Data_13TeV_{cat}"
# Now we want to write the dictionary to a root file, since object systematics don't come from
# the nominal file we have to separate again the treatment of them from the object ones
with uproot.recreate(outfiles[process]) as file:
logger.debug(outfiles[process])
# Final fit want a separate tree for each category and variation,
# the naming of the branches are quite rigid:
# For MC: {inputTreeDir}/{production-mode}_{mass}_{sqrts}_{category}_{syst}
# For data: {inputTreeDir}/Data_{sqrts}_{category}
for cat in cat_dict:
logger.debug(f"writing category: {cat}")
if args.do_syst:
# check that the category actually contains something, otherwise the flattening step will make the script crash,
# an improvement (not sure if needed) may be to also write an empty TTree to not confuse FinalFit
if len(df_dict["NOMINAL"][cat]["weight"]):
for branch in df_dict["NOMINAL"][cat]:
# here I had to add a flattening step to help uproot with the type of the awkward arrays,
# if you don't flatten (event if you don't have a nested field) you end up having a type like (len_of_array) * ?type, which make uproot very mad apparently
df_dict["NOMINAL"][cat][branch] = ak.flatten(df_dict["NOMINAL"][cat][branch], axis=0)
file[names[cat]] = df_dict["NOMINAL"][cat]
if notag:
file[name_notag] = df_dict["NOMINAL"][cat] # this is wrong, to be fixed
for syst_name, weight, syst_, c in labels[cat]:
# Skip "NOMINAL" as information included in nominal tree
if syst_ == "NOMINAL":
continue
logger.debug(f"{syst_name}, {weight}, {syst_}, {c}")
# If the name is not in the variation dictionary it is assumed to be a weight systematic
var_list = [
["CMS_hgg_mass", "CMS_hgg_mass"],
[weight, "weight"],
["HTXS_Higgs_pt", "HTXS_Higgs_pt"],
["HTXS_Higgs_y", "HTXS_Higgs_y"],
["PTH", "PTH"],
["YH", "YH"],
["fiducialGeometricFlag", "fiducialGeometricFlag"]
]
if gen_binning != None:
for keys in gen_binning:
var_list.append(["diffVariable_" + keys, "diffVariable_" + keys])
if syst_ not in variation_dict:
logger.debug(f"found weight syst {syst_}")
red_dict = {}
for key, new_key in var_list:
if "NOMINAL" in df_dict and cat in df_dict["NOMINAL"] and key in df_dict["NOMINAL"][cat]:
red_dict[new_key] = df_dict["NOMINAL"][cat][key]
logger.info(f"Adding {syst_name}01sigma to out tree...")
file[syst_name + "01sigma"] = red_dict
else:
red_dict = {}
for key, new_key in var_list:
if syst_ in df_dict and cat in df_dict[syst_] and key in df_dict[syst_][cat]:
red_dict[new_key] = ak.flatten(df_dict[syst_][cat][key], 0)
logger.info(f"Adding {syst_name}01sigma to out tree...")
file[syst_name + "01sigma"] = red_dict
else:
logger.info(f"no events survived category selection for cat: {cat}")
else:
# if there are no syst there is no df_dict["NOMINAL"] entry in the dict
if len(df_dict[cat][[*df_dict[cat]][0]]):
# same as before
for branch in df_dict[cat]:
df_dict[cat][branch] = ak.flatten(df_dict[cat][branch], axis=0)
file[names[cat]] = df_dict[cat]
if notag:
file[name_notag] = df_dict[cat] # this is wrong, to be fixed
else:
logger.info(f"no events survived category selection for cat: {cat}")
logger.info(
f"Successfully converted parquet file to ROOT file for process {process}."
)
if __name__ == "__main__":
main()