Skip to content

Commit

Permalink
feat: updated validation script to check partial bin sums (iris-hep#171)
Browse files Browse the repository at this point in the history
* updated validation script to check partial bin sums
* added verbose flag
  • Loading branch information
ekauffma authored and eguiraud committed Jul 19, 2023
1 parent 9a0a3db commit 2a6fa94
Showing 1 changed file with 55 additions and 4 deletions.
59 changes: 55 additions & 4 deletions analyses/cms-open-data-ttbar/validate_histograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def parse_args() -> argparse.Namespace:
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--reference", help="JSON reference against which histogram contents should be compared")
group.add_argument("--dump-json", help="Print JSON representation of histogram contents to screen", action='store_true')
parser.add_argument("--verbose", help="Print extra information about bin mismatches", action='store_true')
return parser.parse_args()

# convert uproot file containing only TH1Ds to a corresponding JSON-compatible dict with structure:
Expand All @@ -31,7 +32,7 @@ def as_dict(f: uproot.ReadOnlyDirectory) -> dict[str, dict]:
histos[name]["contents"] = h.counts(flow=True).tolist()
return histos

def validate(histos: dict, reference: dict) -> dict[str, list[str]]:
def validate(histos: dict, reference: dict, verbose=False) -> dict[str, list[str]]:
errors = defaultdict(list)
for name, ref_h in reference.items():
if name not in histos:
Expand All @@ -42,8 +43,57 @@ def validate(histos: dict, reference: dict) -> dict[str, list[str]]:
if not np.allclose(h['edges'], ref_h['edges']):
errors[name].append(f"Edges do not match:\n\tgot {h['edges']}\n\texpected {ref_h['edges']}")
contents_depend_on_rng = "pt_res_up" in name # skip checking the contents of these histograms as they are not stable
if not contents_depend_on_rng and not np.allclose(h['contents'], ref_h['contents']):
errors[name].append(f"Contents do not match:\n\tgot {h['contents']}\n\texpected {ref_h['contents']}")
is_close = np.isclose(h['contents'], ref_h['contents'])

#### check for bin migration ####
if not contents_depend_on_rng and not all(is_close):

# get the indices of the bins where is_close is False (i.e. the mismatching bins)
where_not_close = np.where(np.invert(is_close))[0]
# get the differences between adjacent mismatching bin indices
diff_values = np.diff(where_not_close)
# get the indices where the above difference is greater than 1 (to form groupings of bins based on location)
split_indices = np.argwhere(np.abs(diff_values)>1)
# np.argwhere adds an extra dimension we don't need, so flatten the result to 1D
split_indices = split_indices.reshape((split_indices.shape[0],))

# if only one grouping detected
if len(split_indices)==0:
split_values = [where_not_close]
# if more than one grouping detected
else:
# shift indices to the point we want to split at
split_indices = split_indices + 1
# split indices into groupings
split_values = np.split(where_not_close, split_indices)

is_error=False
# iterate through groupings and test for bin migration or error
for group in split_values:
h_group = np.array(h['contents'])[group]
ref_group = np.array(ref_h['contents'])[group]
# if any bin in the group differs by more than the tolerance, count it as an error
if not np.allclose(h_group, ref_group, atol=2.0): # atol=2.0 was chosen because it seems to cover the difference expected from one event migrating between bins in histograms built from 1 file per sample; it can be tuned in the future
is_error = True
if verbose:
print(f"In {name}: Not close enough for bin migration")
print("histogram: ", h_group, ", reference: ", ref_group)
print()
# otherwise, check that the partial sums over the group of bins agree
elif not np.allclose(sum(h_group), sum(ref_group)):
is_error = True
if verbose:
print(f"In {name}: Partial sums are unequal")
print("histogram: ", h_group, ", reference: ", ref_group)
print()
else:
if verbose:
print(f"In {name}: Bin migration likely")
print("histogram: ", h_group, ", reference: ", ref_group)
print()
if is_error:
errors[name].append(f"Contents do not match:\n\tgot {h['contents']}\n\texpected {ref_h['contents']}")
print()

return errors

Expand All @@ -60,11 +110,12 @@ def validate(histos: dict, reference: dict) -> dict[str, list[str]]:
ref_histos = json.load(reference)

print(f"Validating '{args.histos}' against reference '{args.reference}'...")
errs = validate(histos=histos, reference=ref_histos)
errs = validate(histos=histos, reference=ref_histos, verbose=args.verbose)
if len(errs) == 0:
print("All good!")
else:
for hist_name, errors in errs.items():
errors = '\n\t'.join(errors)
print(f"{hist_name}\n\t{errors}")
sys.exit(1)

0 comments on commit 2a6fa94

Please sign in to comment.