From c5eeaa782cf53bd1c2635eed807591b0c970e3d1 Mon Sep 17 00:00:00 2001 From: Peter Onyisi Date: Thu, 8 May 2025 11:59:06 -0500 Subject: [PATCH 1/3] More general python codegen --- .../templates/transform_single_file.py | 99 ++++++++++--------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/code_generator_python/python_code_generator/templates/transform_single_file.py b/code_generator_python/python_code_generator/templates/transform_single_file.py index 8fd6d2987..dfb34e76e 100644 --- a/code_generator_python/python_code_generator/templates/transform_single_file.py +++ b/code_generator_python/python_code_generator/templates/transform_single_file.py @@ -22,58 +22,69 @@ def transform_single_file(file_path: str, output_path: Path, output_format: str) try: stime = time.time() - output = generated_transformer.run_query(file_path) - - ttime = time.time() - - if output_format == 'root-file': + # We first see if the function has the signature to directly write output + # If it doesn't, then we assume it's giving us back awkward array results + try: + generated_transformer.run_query(file_path, output_path) + if not output_path.exists(): + raise RuntimeError("Transformation did not produce expected output file " + f"{output_path}") + ttime = time.time() etime = time.time() - if isinstance(output, ak.Array): - awkward_arrays = {default_tree_name: output} - elif isinstance(output, dict): - awkward_arrays = output - with open(output_path, 'b+w') as wfile: - with uproot.recreate(wfile) as writer: - for key in awkward_arrays.keys(): - total_events = awkward_arrays[key].__len__() - if awkward_arrays[key].fields and total_events: - o_dict = {field: awkward_arrays[key][field] - for field in awkward_arrays[key].fields} - elif awkward_arrays[key].fields and not total_events: - o_dict = {field: np.array([]) - for field in awkward_arrays[key].fields} - elif not awkward_arrays[key].fields and total_events: - o_dict = {default_branch_name: awkward_arrays[key]} - else: - o_dict = {default_branch_name: np.array([])} - writer[key] = o_dict - wtime = time.time() - elif output_format == 'raw-file': - etime = time.time() total_events = 0 - output_path = output - wtime = time.time() - else: - if isinstance(output, dict): - tree_name = list(output.keys())[0] - awkward_array = output[tree_name] - print(f'Returned type from your Python function is a dictionary - ' - f'Only the first key {tree_name} will be written as parquet files. ' - f'Please use root-file output to write all trees.') + except AttributeError: + output = generated_transformer.run_query(file_path) + + ttime = time.time() + if output_format == 'root-file': + etime = time.time() + if isinstance(output, ak.Array): + awkward_arrays = {default_tree_name: output} + elif isinstance(output, dict): + awkward_arrays = output + with open(output_path, 'b+w') as wfile: + with uproot.recreate(wfile) as writer: + for key in awkward_arrays.keys(): + total_events = awkward_arrays[key].__len__() + if awkward_arrays[key].fields and total_events: + o_dict = {field: awkward_arrays[key][field] + for field in awkward_arrays[key].fields} + elif awkward_arrays[key].fields and not total_events: + o_dict = {field: np.array([]) + for field in awkward_arrays[key].fields} + elif not awkward_arrays[key].fields and total_events: + o_dict = {default_branch_name: awkward_arrays[key]} + else: + o_dict = {default_branch_name: np.array([])} + writer[key] = o_dict + + wtime = time.time() + elif output_format == 'raw-file': + etime = time.time() + total_events = 0 + output_path = output + wtime = time.time() else: - awkward_array = output + if isinstance(output, dict): + tree_name = list(output.keys())[0] + awkward_array = output[tree_name] + print(f'Returned type from your Python function is a dictionary - ' + f'Only the first key {tree_name} will be written as parquet files. ' + f'Please use root-file output to write all trees.') + else: + awkward_array = output - total_events = ak.num(awkward_array, axis=0) - arrow = ak.to_arrow_table(awkward_array) + total_events = ak.num(awkward_array, axis=0) + arrow = ak.to_arrow_table(awkward_array) - etime = time.time() + etime = time.time() - writer = pq.ParquetWriter(output_path, arrow.schema) - writer.write_table(table=arrow) - writer.close() + writer = pq.ParquetWriter(output_path, arrow.schema) + writer.write_table(table=arrow) + writer.close() - wtime = time.time() + wtime = time.time() output_size = os.stat(output_path).st_size print(f'Detailed transformer times. query_time:{round(ttime - stime, 3)} ' From 78c1d76705ae78e1e318529e276c053e1f1be48a Mon Sep 17 00:00:00 2001 From: Peter Onyisi Date: Thu, 8 May 2025 13:46:36 -0500 Subject: [PATCH 2/3] Only import awkward etc. if needed --- .../templates/transform_single_file.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/code_generator_python/python_code_generator/templates/transform_single_file.py b/code_generator_python/python_code_generator/templates/transform_single_file.py index dfb34e76e..f4c9ca5d2 100644 --- a/code_generator_python/python_code_generator/templates/transform_single_file.py +++ b/code_generator_python/python_code_generator/templates/transform_single_file.py @@ -3,10 +3,6 @@ import time from pathlib import Path import generated_transformer -import awkward as ak -import uproot -import pyarrow.parquet as pq -import numpy as np instance = os.environ.get('INSTANCE_NAME', 'Unknown') default_tree_name = "servicex" default_branch_name = "branch" @@ -34,6 +30,11 @@ def transform_single_file(file_path: str, output_path: Path, output_format: str) wtime = time.time() total_events = 0 except AttributeError: + import awkward as ak + import uproot + import pyarrow.parquet as pq + import numpy as np + output = generated_transformer.run_query(file_path) ttime = time.time() From 4f4076c0236947210618d69aa7e06e83b0f39a9b Mon Sep 17 00:00:00 2001 From: Peter Onyisi Date: Fri, 9 May 2025 21:43:55 -0500 Subject: [PATCH 3/3] Call query with string, not Path --- .../python_code_generator/templates/transform_single_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code_generator_python/python_code_generator/templates/transform_single_file.py b/code_generator_python/python_code_generator/templates/transform_single_file.py index f4c9ca5d2..4f2d1ebe3 100644 --- a/code_generator_python/python_code_generator/templates/transform_single_file.py +++ b/code_generator_python/python_code_generator/templates/transform_single_file.py @@ -21,7 +21,7 @@ def transform_single_file(file_path: str, output_path: Path, output_format: str) # We first see if the function has the signature to directly write output # If it doesn't, then we assume it's giving us back awkward array results try: - generated_transformer.run_query(file_path, output_path) + generated_transformer.run_query(file_path, str(output_path)) if not output_path.exists(): raise RuntimeError("Transformation did not produce expected output file " f"{output_path}")