Skip to content

Commit

Permalink
Merge pull request #115 from ikrommyd/add-limit-option
Browse files Browse the repository at this point in the history
feat: add option to limit datasets to first N files
  • Loading branch information
ikrommyd authored Oct 30, 2024
2 parents f77e1d8 + e264188 commit 8a59ae1
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
13 changes: 8 additions & 5 deletions scripts/run_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ def main():
fileset = runner_utils.load_json(args.fileset)
logger.info(f"Loaded fileset from {args.fileset}")

if args.limit == 0:
raise ValueError("Limit cannot be 0")
if args.limit is not None:
logger.info(f"Limiting each dataset of the fileset to the first {args.limit} files")
for dataset in fileset.keys():
files = fileset[dataset]["files"]
fileset[dataset]["files"] = dict(list(files.items())[: args.limit])

if args.executor == "dask/casa" or args.executor.startswith("tls://"):
# use xcache for coffea-casa
xrootd_pfx = "root://"
Expand Down Expand Up @@ -239,11 +247,6 @@ def main():
out = runner_utils.process_out(out, args.output)
logger.info(f"Final output after post-processing:\n{out}")
logger.info("Finished the E/Gamma Tag and Probe workflow")
logger.info("Shutting down the client and cluster if any")
if client:
client.shutdown()
if cluster:
cluster.close()


if __name__ == "__main__":
Expand Down
3 changes: 3 additions & 0 deletions src/egamma_tnp/utils/runner_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,8 @@ def get_main_parser():
Path to the output directory. The default is the current working directory.
--executor: str, optional
The executor to use for the computations. The default is None and lets dask decide.
--limit: int, optional
    Limit to the first N files of each dataset in the fileset. The default is None.
--preprocess: bool, optional
Preprocess the fileset before running the workflow. The default is False.
--skip_report: bool, optional
Expand Down Expand Up @@ -458,6 +460,7 @@ def get_main_parser():
parser.add_argument("--binning", type=str, help="Path to a JSON file specifying the binning. Default is None.")
parser.add_argument("--output", type=str, help="Path to the output directory. Default is None.")
parser.add_argument("--executor", type=str, help="The executor to use for the computations. Default is None and lets dask decide.")
parser.add_argument("--limit", type=int, help="Limit to the first N files of each dataset in the fileset. Default is None.")
parser.add_argument("--preprocess", action="store_true", default=False, help="Preprocess the fileset before running the workflow. Default is False.")
parser.add_argument("--skip_report", action="store_true", default=False, help="Skip computing and saving the report. Default is False.")
parser.add_argument("--repartition_n_to_one", type=int, help="The number of partitions to merge during saving. Default is None.")
Expand Down

0 comments on commit 8a59ae1

Please sign in to comment.