Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Truth building + LArCV scripts #64

Merged
merged 6 commits into from
Feb 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 109 additions & 0 deletions bin/larcv_find_duplicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""Finds duplicated files."""

import argparse

import numpy as np
from tqdm import tqdm
from ROOT import TFile # pylint: disable=E0611
from larcv import larcv # pylint: disable=W0611


def _read_first_entry_info(file_path, tree_name):
    """Reads the entry count and the ID of the first entry in a file.

    Parameters
    ----------
    file_path : str
        Path to the ROOT file to inspect
    tree_name : str
        Name of the reference tree, without its `_tree` suffix. If `None`,
        the first key found in the file is used.

    Returns
    -------
    List[int]
        Number of entries in the tree, followed by the run, subrun and
        event numbers of its first entry
    """
    root_file = TFile(file_path, 'r')
    try:
        # Get the tree to get the number of entries from
        if tree_name is None:
            key = [k.GetName() for k in root_file.GetListOfKeys()][0]
        else:
            key = f'{tree_name}_tree'
        branch_key = key.replace('_tree', '_branch')

        # Check the number of entries in the file
        tree = getattr(root_file, key)
        num_entries = tree.GetEntries()

        # Get the event information of the first entry in the file
        tree.GetEntry(0)
        branch = getattr(tree, branch_key)

        return [num_entries, branch.run(), branch.subrun(), branch.event()]

    finally:
        # Always release the file handle, even if the tree cannot be read
        root_file.Close()


def main(source, source_list, output, tree_name):
    """Loops over a list of files and identifies files which contain the same
    set of (run, subrun, event) triplets.

    In order to save time, this script only checks if:
    1. The number of entries in the files are the same
    2. The run, subrun and event numbers in the first entry are the same

    Files which agree on all of the above form a class of duplicates. Within
    each class, the file which appears first in the input list is kept and
    every other file is reported as a duplicate.

    Parameters
    ----------
    source : Union[str, List[str]]
        Path or list of paths to the input files
    source_list : str
        Path to a text file containing a list of data file paths
    output : str
        Path to the output text file with the list of duplicates
    tree_name : str
        Name of the tree to use as a reference to count the number of entries.
        If not specified, takes the first tree in the list.
    """
    # If using source list, read it in
    if source_list is not None:
        with open(source_list, 'r', encoding='utf-8') as f:
            source = f.read().splitlines()

    # Initialize the output text file. The context manager guarantees that
    # it is flushed and closed, even if one of the input files is unreadable
    with open(output, 'w', encoding='utf-8') as out_file:
        # Loop over the list of files in the input
        print(f"\nGathering information from {len(source)} files:")
        values = np.empty((len(source), 4), dtype=int)
        for idx, file_path in enumerate(tqdm(source)):
            values[idx] = _read_first_entry_info(file_path, tree_name)

        # Loop over non-unique files
        print(f"\nChecking for duplicates among {len(source)} files:")
        _, inverse, counts = np.unique(
            values, axis=0, return_inverse=True, return_counts=True)
        duplicate_files = []
        for idx in tqdm(np.where(counts > 1)[0]):
            # Build a file mask for this class of duplicates
            index = np.where(inverse == idx)[0]

            # All the files which are not the first in this class are
            # duplicates
            for i in index[1:]:
                file_path = source[i]
                duplicate_files.append(file_path)
                out_file.write(f'{file_path}\n')
                tqdm.write(f"- Duplicate file: {file_path}")

        print(f"\nFound {len(duplicate_files)} duplicate files.")


if __name__ == "__main__":
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(
        description="Find duplicated files in a dataset")

    # The input is either given as explicit paths or as a text file of paths
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--source', '-s',
                       help='Path or list of paths to data files',
                       type=str, nargs="+")
    group.add_argument('--source-list', '-S',
                       help='Path to a text file of data file paths',
                       type=str)

    parser.add_argument('--output', '-o',
                        help='Path to the output text file with the '
                             'duplicate list',
                        type=str, required=True)

    # Accept the hyphenated spelling used by the other options; the original
    # underscore spelling is kept so that existing command lines still work
    parser.add_argument('--tree-name', '--tree_name',
                        help='TTree name used to count the entries.',
                        type=str)

    args = parser.parse_args()

    # Execute the main function
    main(args.source, args.source_list, args.output, args.tree_name)
94 changes: 94 additions & 0 deletions bin/larcv_find_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""Builds a list of file which make a data run."""

import argparse

from tqdm import tqdm
from ROOT import TFile # pylint: disable=E0611
from larcv import larcv # pylint: disable=W0611


def main(source, source_list, output, run_number, tree_name):
    """Loops over a list of files and finds those which belong to a certain run.

    Only the first entry of each file is checked: a file is selected when
    the run number of its first entry matches the requested run number.

    Parameters
    ----------
    source : Union[str, List[str]]
        Path or list of paths to the input files
    source_list : str
        Path to a text file containing a list of data file paths
    output : str
        Path to the output text file with the list of run files
    run_number : int
        Run number to look for
    tree_name : str
        Name of the tree to use as a reference to get the run number from.
        If not specified, takes the first tree in the list.
    """
    # If using source list, read it in
    if source_list is not None:
        with open(source_list, 'r', encoding='utf-8') as f:
            source = f.read().splitlines()

    # Initialize the output text file. The context manager guarantees that
    # it is flushed and closed, even if one of the input files is unreadable
    with open(output, 'w', encoding='utf-8') as out_file:
        # Loop over the list of files in the input
        print(f"\nLooking for run {run_number} in {len(source)} files:")
        run_files = []
        for file_path in tqdm(source):
            root_file = TFile(file_path, 'r')
            try:
                # Get the tree to get the run number from
                if tree_name is None:
                    key = [k.GetName()
                           for k in root_file.GetListOfKeys()][0]
                else:
                    key = f'{tree_name}_tree'
                branch_key = key.replace('_tree', '_branch')

                # Check the run number of the first entry in the file
                tree = getattr(root_file, key)
                tree.GetEntry(0)
                run = getattr(tree, branch_key).run()

            finally:
                # Always release the file handle, even if the read fails
                root_file.Close()

            # If the file contains entries from the correct run, append
            if run == run_number:
                tqdm.write(f"- Good file: {file_path}")
                run_files.append(file_path)
                out_file.write(f'{file_path}\n')

        print(f"\nFound {len(run_files)} run {run_number} files.")


if __name__ == "__main__":
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(
        description="Find the files which belong to a run")

    # The input is either given as explicit paths or as a text file of paths
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--source', '-s',
                       help='Path or list of paths to data files',
                       type=str, nargs="+")
    group.add_argument('--source-list', '-S',
                       help='Path to a text file of data file paths',
                       type=str)

    parser.add_argument('--output', '-o',
                        help='Path to the output text file with the run '
                             'file list',
                        type=str, required=True)

    parser.add_argument('--run-number',
                        help='Run number to look for',
                        type=int, required=True)

    # Accept the hyphenated spelling used by the other options; the original
    # underscore spelling is kept so that existing command lines still work
    parser.add_argument('--tree-name', '--tree_name',
                        help='TTree name used to read the run number.',
                        type=str)

    args = parser.parse_args()

    # Execute the main function
    main(args.source, args.source_list, args.output, args.run_number,
         args.tree_name)
171 changes: 171 additions & 0 deletions bin/larcv_inject_run_number.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
"""Script which injects a run number in every event of every tree in a file or
a list of files.
"""

import os
import argparse
import tempfile

import numpy as np
from tqdm import tqdm
from ROOT import TFile # pylint: disable=E0611
from larcv import larcv # pylint: disable=W0611

# LArCV IO Manager configuration string
#
# This is a template: the INPUT_PATH and OUTPUT_PATH placeholders are
# substituted with actual file paths by `initialize_manager` before the
# configuration is written to a temporary file and handed to the IOManager.
# NOTE(review): IOMode 2 is presumably the read+write mode -- confirm
# against the LArCV IOManager documentation.
CFG = """
IOManager: {
Verbosity : 4
Name : "OutIO"
IOMode : 2
InputFiles : [INPUT_PATH]
OutFileName : OUTPUT_PATH
}
"""


def initialize_manager(file_path, dest, overwrite, suffix):
    """Initialize an IOManager object given a configuration.

    The output file path is derived from the input file path as follows:
    - If `dest` is provided, the output file is placed in that folder
    - If `suffix` is provided, `_{suffix}` is inserted before the `.root`
      extension of the file name
    - Otherwise, if `overwrite` is `True`, `_tmp` is inserted instead. It
      is up to the caller to move the output file over the original one.

    Only a trailing `.root` extension is modified; occurrences of `.root`
    elsewhere in the path (e.g. in a folder name) are left untouched.

    Parameters
    ----------
    file_path : str
        Path to the input file
    dest : str
        Destination folder to write the output file to. If `None`, the
        output file is written alongside the input file.
    overwrite : bool
        If `True`, the original file is meant to be overwritten
    suffix : str
        String to append to the end of the input file name to form the name
        of the output file. Incompatible with `overwrite`.

    Returns
    -------
    larcv.IOManager
        IOManager object
    str
        Path to the output file

    Raises
    ------
    ValueError
        If both `suffix` and `overwrite` are provided or if the output file
        path ends up being identical to the input file path
    """
    # Validate with an exception rather than an assert: asserts are stripped
    # when Python runs with -O, which would silently skip this check
    if suffix is not None and overwrite:
        raise ValueError(
            "No point in providing a suffix if the original file is "
            "overwritten.")

    # If the destination is provided, direct the output file there
    out_path = file_path
    if dest is not None:
        out_path = os.path.join(dest, os.path.basename(file_path))

    # If a suffix is provided, append. If overwriting, use a temporary name
    tag = suffix
    if tag is None and overwrite:
        tag = 'tmp'
    if tag is not None and out_path.endswith('.root'):
        stem, ext = os.path.splitext(out_path)
        out_path = f'{stem}_{tag}{ext}'

    # Check that the output file is not the same as the original file
    if file_path == out_path:
        raise ValueError(
            "The input file name and the output file name are the same. "
            "This is not allowed by the LArCV IOManager.")

    # Update the configuration with the input/output file names
    cfg = CFG
    cfg = cfg.replace('INPUT_PATH', file_path)
    cfg = cfg.replace('OUTPUT_PATH', out_path)

    # Create a temporary text file with the configuration. It only needs to
    # exist while the manager is being set up and is deleted on exit
    with tempfile.NamedTemporaryFile('w') as tmp:
        tmp.write(cfg)
        tmp.flush()

        # Initialize the IOManager
        manager = larcv.IOManager(tmp.name)
        manager.initialize()

    return manager, out_path


def main(source, source_list, dest, overwrite, run_number, suffix):
    """Injects a run number in every entry of a file or a list of files.

    The script loops over the input files, reads every entry of each file
    through a LArCV IOManager, resets its ID and saves it to an output file.
    Each entry is assigned the ID `(run, 0, entry + 1)`, i.e. the subrun
    number is reset to 0 and the event number is the 1-based entry index.

    .. code-block:: bash

        $ python3 bin/larcv_inject_run_number.py -S file_list.txt
              --overwrite --run-number 123

    Parameters
    ----------
    source : List[str]
        List of paths to the input files
    source_list : str
        Path to a text file containing a list of data file paths
    dest : str
        Destination folder to write the files to
    overwrite : bool
        If `True`, overwrite the original files
    run_number : int
        Run number to inject in the input file list. If it is negative
        (e.g. -1), each file is assigned its index in the input list as
        a unique run number
    suffix : str
        String to append to the end of the input file names to form the name
        of the output file with the updated run numbers
    """
    # If using source list, read it in
    if source_list is not None:
        with open(source_list, 'r', encoding='utf-8') as f:
            source = f.read().splitlines()

    # Loop over the list of files in the input
    print("\nUpdating the run numbers of input files.")
    for idx, file_path in enumerate(tqdm(source)):
        # Initialize the input/output processes
        io, out_path = initialize_manager(file_path, dest, overwrite, suffix)

        # Loop over entries, set the run number for every data product
        num_entries = io.get_n_entries()
        run = run_number if run_number > -1 else idx
        for e in range(num_entries):
            # Read existing content
            io.read_entry(e)

            # Update the run number
            io.set_id(run, 0, e + 1)

            # Save
            io.save_entry()

        # Finalize
        io.finalize()

        # If needed, move the output file to where the original file is.
        # os.replace overwrites an existing destination on every platform
        if overwrite:
            os.replace(out_path, file_path)


if __name__ == "__main__":
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(
        description="Inject a run number in every entry of LArCV files")

    # The input is either given as explicit paths or as a text file of paths
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--source', '-s',
                       help='Path or list of paths to data files',
                       type=str, nargs="+")
    group.add_argument('--source-list', '-S',
                       help='Path to a text file of data file paths',
                       type=str)

    # The output either goes to a separate folder or replaces the input
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--dest',
                       help='Destination folder for the output file',
                       type=str)
    group.add_argument('--overwrite',
                       help='Overwrite the input file with the output file',
                       action='store_true')

    parser.add_argument('--run-number',
                        help='Run number to assign to every input file. If '
                             'negative, each file is assigned its index in '
                             'the input list instead',
                        type=int, required=True)

    parser.add_argument('--suffix',
                        help='Suffix to append to the input file names',
                        type=str)

    args = parser.parse_args()

    # Execute the main function
    main(args.source, args.source_list, args.dest, args.overwrite,
         args.run_number, args.suffix)
Loading
Loading