Coverage for rta_reconstruction/dl1_to_dl2.py: 79%
103 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-02 09:59 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-02 09:59 +0000
1"""DL1 to DL2 processing script."""
3import argparse
4import json
5from pathlib import Path
6from typing import Dict
8import astropy.units as u
9import joblib
10import numpy as np
11import pandas as pd
12from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
14from rta_reconstruction.dl1_reader import (
15 convert_az_and_sin_az_angle_to_degree,
16 filter_dl1,
17 interpolate_missing_alt_az,
18 read_dl1,
19 read_telescope_optics,
20)
21from rta_reconstruction.dl2_io import init_dl2_file, write_dl2_df
22from rta_reconstruction.image_displacement import (
23 disp_to_pos,
24 disp_vector_pol2cart,
25 update_disp_with_effective_focal_length,
26)
27from rta_reconstruction.utils.coordinates import camera_to_altaz, camera_to_shower_coordinates
28from rta_reconstruction.utils.logging import init_logging
31# TODO: move somewhere else ?
def reco_source_position_sky(cog_x, cog_y, disp_dx, disp_dy, focal_length, pointing_alt, pointing_az):
    """
    Reconstruct the source position in the sky from the image centre of gravity and its displacement.

    The displacement is first applied to the centre of gravity with `disp_to_pos` to get the
    source position in the camera, which is then handed to `camera_to_altaz` together with the
    focal length and the telescope pointing.

    Parameters
    ----------
    cog_x: `astropy.units.Quantity`
        Centre of gravity of the image, first camera coordinate.
    cog_y: `astropy.units.Quantity`
        Centre of gravity of the image, second camera coordinate.
    disp_dx: `astropy.units.Quantity`
        Displacement vector, first component.
    disp_dy: `astropy.units.Quantity`
        Displacement vector, second component.
    focal_length: `astropy.units.Quantity`
    pointing_alt: `astropy.units.Quantity`
    pointing_az: `astropy.units.Quantity`

    Returns
    -------
    sky frame: `astropy.coordinates.sky_coordinate.SkyCoord`
        Whatever `camera_to_altaz` returns for the reconstructed camera position.
    """
    # Source position in the camera: centre of gravity shifted by the displacement vector.
    source_cam_x, source_cam_y = disp_to_pos(disp_dx, disp_dy, cog_x, cog_y)

    sky_position = camera_to_altaz(source_cam_x, source_cam_y, focal_length, pointing_alt, pointing_az)
    return sky_position
54# TODO: use pydantic configuration instead of json to Dict
55# TODO: use more generic type hints than specific class (sklearn regressor/classifier maybe ?)
56# TODO: refactor in several steps to isolate the "predict" calls, to make it simpler to implement training
def dl1_to_dl2(
    dl1_df: pd.DataFrame,
    dl1_dl2_config: Dict,
    effective_focal_length,
    energy_regressor: RandomForestRegressor,
    gamma_classifier: RandomForestClassifier,
    image_displacement_vector_regressor: RandomForestRegressor | None = None,
    disp_norm_regressor: RandomForestRegressor | None = None,
    disp_sign_classifier: RandomForestClassifier | None = None,
) -> pd.DataFrame:
    """Apply the trained models to DL1 parameters and add the DL2 columns.

    This works IN PLACE: `dl1_df` is not copied, the DL2 columns are added to it and the very
    same object is returned.

    The image displacement is obtained either from `image_displacement_vector_regressor`
    (used whenever it is not None), or from `disp_norm_regressor` together with
    `disp_sign_classifier`.

    Parameters
    ----------
    dl1_df: pd.DataFrame
        DL1 parameters. Besides the feature columns listed in `dl1_dl2_config`, the columns
        "x", "y", "psi", "time_gradient" and "skewness" are read. Telescope pointing is taken
        from "mc_alt_tel"/"mc_az_tel" if present, else from "alt_tel"/"az_tel", else a
        default of -pi/2 is used for both. "x"/"y" are interpreted as meters and the pointing
        as radians.
    dl1_dl2_config: Dict
        Must provide the feature lists "energy_regressor_features", "gamma_classifier_features"
        and, depending on the models passed, "disp_vector_regressor_features" or
        "disp_norm_regressor_features" and "disp_sign_classifier_features".
    effective_focal_length
        Focal length forwarded to `reco_source_position_sky`.
    energy_regressor: RandomForestRegressor
        Predicts the log10 of the energy ("reco_energy" is computed as 10 ** prediction).
    gamma_classifier: RandomForestClassifier
        Its `classes_` must contain the class 0 (gamma).
    image_displacement_vector_regressor: RandomForestRegressor | None
        Predicts the displacement vector as a 2 columns array (dx, dy).
    disp_norm_regressor: RandomForestRegressor | None
        Predicts the norm of the displacement vector.
    disp_sign_classifier: RandomForestClassifier | None
        Its `classes_` must contain the class 1.

    Returns
    -------
    pd.DataFrame
        `dl1_df` itself, with the added columns "log_reco_energy", "reco_energy",
        "reco_disp_dx", "reco_disp_dy", "reco_src_x", "reco_src_y", "signed_time_gradient",
        "signed_skewness", "reco_alt", "reco_az", "gammaness" and "reco_type"; plus
        "reco_disp_norm", "reco_disp_sign" and "reco_disp_sign_proba" when the norm/sign
        models are used.

    Raises
    ------
    ValueError
        If `image_displacement_vector_regressor` is None and `disp_norm_regressor` or
        `disp_sign_classifier` is None as well.
    """
    # Fail before touching the dataframe: since we work in place, failing half-way would
    # leave the caller's dataframe with only part of the DL2 columns.
    if image_displacement_vector_regressor is None and (
        disp_norm_regressor is None or disp_sign_classifier is None
    ):
        raise ValueError(
            "Either `image_displacement_vector_regressor` or both `disp_norm_regressor` and "
            "`disp_sign_classifier` must be provided."
        )

    # (this doesn't copy, simply we will add column to the df)
    dl2_df = dl1_df

    # TODO: refactor model computation to individual functions (if surrounding operations are really required)

    # TODO: is log energy or energy used by disp vector, disp norm or disp sign ? then need concatenate
    # to avoid copy ? Can't use "out" since predict doesn't support that API, could copy only vector of
    # size (#samples) instead of everything though
    # what could do, since fortran indexing may be best (TEST!?)
    # load everything as a fortran order rec-array (pre-allocated with dl2 columns.)
    # that way we can index with column names, and avoid pandas copy
    # pytables read has an "out" parameters, but it doesn't check the types between out and datadisk (only copies)
    dl2_df["log_reco_energy"] = energy_regressor.predict(dl2_df.loc[:, dl1_dl2_config["energy_regressor_features"]])
    dl2_df["reco_energy"] = 10 ** dl2_df["log_reco_energy"]

    if image_displacement_vector_regressor is not None:
        disp_vector = image_displacement_vector_regressor.predict(
            dl2_df.loc[:, dl1_dl2_config["disp_vector_regressor_features"]]
        )
    else:
        dl2_df["reco_disp_norm"] = disp_norm_regressor.predict(
            dl2_df.loc[:, dl1_dl2_config["disp_norm_regressor_features"]]
        )
        # `.loc` is required here: `dl2_df[:, columns]` is not valid DataFrame indexing.
        disp_sign_proba = disp_sign_classifier.predict_proba(
            dl2_df.loc[:, dl1_dl2_config["disp_sign_classifier_features"]]
        )
        # TODO: since we only care about something been gamma or not, 1 probability should be enough
        # we need to change the models for it to predict "gamma" or "not-gamma" with single proba.
        col = list(disp_sign_classifier.classes_).index(1)
        disp_sign = np.where(disp_sign_proba[:, col] > 0.5, 1, -1)
        dl2_df["reco_disp_sign"] = disp_sign
        # NOTE(review): the stored probability is column 0, not column `col` used for the sign
        # above - presumably on purpose to keep the historical output, confirm.
        dl2_df["reco_disp_sign_proba"] = disp_sign_proba[:, 0]

        disp_vector = disp_vector_pol2cart(dl2_df["reco_disp_norm"], dl2_df["psi"], disp_sign)

    dl2_df["reco_disp_dx"] = disp_vector[:, 0]
    dl2_df["reco_disp_dy"] = disp_vector[:, 1]
    dl2_df["reco_src_x"], dl2_df["reco_src_y"] = disp_to_pos(
        dl2_df.reco_disp_dx,
        dl2_df.reco_disp_dy,
        dl2_df.x,
        dl2_df.y,
    )

    # TODO: stack coordinates and project in // as in pyhiperta ?
    longi, _ = camera_to_shower_coordinates(
        dl2_df["reco_src_x"], dl2_df["reco_src_y"], dl2_df["x"], dl2_df["y"], dl2_df["psi"]
    )

    # TODO: check sign of longitudinal coordinate with HiPeRTA
    # TODO: is this required ?
    # Obtain the time gradient with sign relative to the reconstructed shower direction (reco_src_x, reco_src_y)
    # Defined positive if light arrival times increase with distance to it. Negative otherwise:
    dl2_df["signed_time_gradient"] = -1 * np.sign(longi) * dl2_df["time_gradient"]
    # Obtain skewness with sign relative to the reconstructed shower direction (reco_src_x, reco_src_y)
    # Defined on the major image axis; sign is such that it is typically positive for gammas:
    dl2_df["signed_skewness"] = -1 * np.sign(longi) * dl2_df["skewness"]

    # TODO: what if we use simulations but still want to use the reconstructed alt az ?
    # TODO: better default value (nan ?)
    if "mc_alt_tel" in dl2_df.columns:
        alt_tel = dl2_df["mc_alt_tel"].values
        az_tel = dl2_df["mc_az_tel"].values
    elif "alt_tel" in dl2_df.columns:
        alt_tel = dl2_df["alt_tel"].values
        az_tel = dl2_df["az_tel"].values
    else:
        alt_tel = -np.pi / 2.0 * np.ones(len(dl2_df))
        az_tel = -np.pi / 2.0 * np.ones(len(dl2_df))

    # TODO: this calls astropy coordinates changing routines, can it be optimized ?
    src_pos_reco = reco_source_position_sky(
        dl2_df.x.values * u.m,
        dl2_df.y.values * u.m,
        dl2_df.reco_disp_dx.values * u.m,
        dl2_df.reco_disp_dy.values * u.m,
        effective_focal_length,
        alt_tel * u.rad,
        az_tel * u.rad,
    )
    dl2_df["reco_alt"] = src_pos_reco.alt.rad
    dl2_df["reco_az"] = src_pos_reco.az.rad

    gammaness = gamma_classifier.predict_proba(dl2_df.loc[:, dl1_dl2_config["gamma_classifier_features"]])

    # TODO: replace this with a single proba predictor like disp sign, so no hardcoded class values!
    # gammaness is the prediction probability for the class 0 (proton: class 101)
    mc_type_gamma, mc_type_proton = 0, 101
    col = list(gamma_classifier.classes_).index(mc_type_gamma)
    dl2_df["gammaness"] = gammaness[:, col]
    dl2_df["reco_type"] = np.where(gammaness[:, col] > 0.5, mc_type_gamma, mc_type_proton)

    return dl2_df
def main():
    """Command-line entry point: process DL1 file(s) to DL2 file(s).

    Parses the command line, loads the JSON configuration and the models with joblib, then for
    each (input DL1, output DL2) pair: reads and filters the DL1 table, runs `dl1_to_dl2` and
    writes the result to the DL2 file.

    The image displacement must be given either as a single vector regressor
    (`--image_displacement_vector_regressor`), or as a norm regressor plus a sign classifier
    (`--image_displacement_norm_regressor` and `--image_displacement_sign_classifier`).

    Raises
    ------
    argparse.ArgumentTypeError
        If the number of input and output files differ, or if the image displacement model
        arguments are missing, incomplete or mutually exclusive.
    """
    # TODO: init logging with log files passed as argument or in config
    init_logging(log_filename="dl1_to_dl2.log")

    parser = argparse.ArgumentParser(
        description="Stand-alone DL1 to DL2 processing from ctapipe-lstchain for Real Time Analysis",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--config",
        "-c",
        action="store",
        type=Path,
        dest="config_path",
        help="Path to the configuration file",
        required=True,
    )
    parser.add_argument(
        "--input_dl1",
        "-i",
        type=Path,
        nargs="+",
        dest="dl1_file_paths",
        help="Path(s) to DL1 file(s) to process to DL2.",
        required=True,
    )
    parser.add_argument(
        "--energy_regressor",
        "-e",
        action="store",
        type=Path,
        required=True,
        dest="energy_regressor_model_path",
        help="Path to the energy regressor model (.sav)",
    )
    parser.add_argument(
        "--gamma_classifier",
        "-g",
        action="store",
        type=Path,
        required=True,
        dest="gamma_classifier_path",
        help="Path to the gamma/hadron classifier model (.sav)",
    )
    parser.add_argument(
        "--image_displacement_vector_regressor",
        "-v",
        action="store",
        type=Path,
        default=None,
        dest="disp_vector_regressor_model_path",
        help="Path to the image displacement vector regressor model (.sav). "
        "Either this argument must be specified, or `image_displacement_norm_regressor` and "
        "`image_displacement_sign_classifier`",
    )
    parser.add_argument(
        "--image_displacement_norm_regressor",
        "-n",
        action="store",
        type=Path,
        default=None,
        dest="disp_norm_regressor_model_path",
        help="Path to the image displacement norm regressor model (.sav). "
        "Either this argument and `image_displacement_sign_classifier` must be specified, or "
        "`image_displacement_vector_regressor`",
    )
    parser.add_argument(
        "--image_displacement_sign_classifier",
        "-s",
        action="store",
        type=Path,
        default=None,
        dest="disp_sign_classifier_model_path",
        help="Path to the image displacement sign classifier model (.sav). "
        "Either this argument and `image_displacement_norm_regressor` must be specified, or "
        "`image_displacement_vector_regressor`",
    )
    parser.add_argument(
        "--output_dl2",
        "-o",
        type=Path,
        nargs="+",
        dest="dl2_file_paths",
        help="Path(s) to DL2 file(s) to write the output.",
        required=True,
    )

    args = parser.parse_args()

    if len(args.dl1_file_paths) != len(args.dl2_file_paths):
        raise argparse.ArgumentTypeError(
            "The number {} of input dl1 files must match the number {} of output DL2 files.".format(
                len(args.dl1_file_paths), len(args.dl2_file_paths)
            )
        )

    vector_path = args.disp_vector_regressor_model_path
    norm_path = args.disp_norm_regressor_model_path
    sign_path = args.disp_sign_classifier_model_path

    if vector_path is not None and norm_path is not None:
        raise argparse.ArgumentTypeError(
            "`image_displacement_vector_regressor` and `image_displacement_norm_regressor` are mutually exclusive arguments. "
            "Got {} and {}".format(vector_path, norm_path)
        )
    if vector_path is not None and sign_path is not None:
        raise argparse.ArgumentTypeError(
            "`image_displacement_vector_regressor` and `image_displacement_sign_classifier` are mutually exclusive arguments. "
            "Got {} and {}".format(vector_path, sign_path)
        )
    if vector_path is None and norm_path is None and sign_path is None:
        raise argparse.ArgumentTypeError(
            "Either `image_displacement_vector_regressor` or `image_displacement_norm_regressor` and "
            "`image_displacement_sign_classifier` must be specified, got none of them."
        )
    # Exactly one of norm / sign was given: they only make sense as a pair.
    # (argparse.ArgumentError can not be used here: it requires the argument object as first parameter.)
    if (norm_path is None) != (sign_path is None):
        raise argparse.ArgumentTypeError(
            "`image_displacement_norm_regressor` and `image_displacement_sign_classifier` must both be specified if used. Got {} and {}".format(
                norm_path, sign_path
            )
        )

    with open(args.config_path, "r") as config_f:
        config = json.load(config_f)

    energy_regressor = joblib.load(args.energy_regressor_model_path)
    gamma_classifier = joblib.load(args.gamma_classifier_path)
    # `+` builds a new list, so extending `all_features` below never modifies the config lists.
    all_features = config["energy_regressor_features"] + config["gamma_classifier_features"]

    using_disp_vector = vector_path is not None
    if using_disp_vector:
        disp_vector_regressor = joblib.load(vector_path)
        disp_norm_regressor = None
        disp_sign_classifier = None
        all_features += config["disp_vector_regressor_features"]
    else:
        disp_vector_regressor = None
        disp_norm_regressor = joblib.load(norm_path)
        disp_sign_classifier = joblib.load(sign_path)
        all_features += config["disp_norm_regressor_features"] + config["disp_sign_classifier_features"]

    for dl1_file_path, dl2_file_path in zip(args.dl1_file_paths, args.dl2_file_paths):
        # TODO: read dl1 straight in a numpy array (avoid pandas to avoid copy before passing to sklearn)
        # should be Fortran order ? -> benchmark
        dl1_df = read_dl1(dl1_file_path)  # TODO: make table "key" configurable
        interpolate_missing_alt_az(dl1_df)  # TODO: required after dl1 alt-az maker ? what about simple dropna
        tel_optics = read_telescope_optics(dl1_file_path)  # TODO: make sure correct
        convert_az_and_sin_az_angle_to_degree(
            dl1_df
        )  # TODO: is this required ? (angles come from arctan2, and sin is not used ?)
        dl1_df = filter_dl1(
            dl1_df, filters=config["events_filters"], finite_params=all_features
        )  # TODO: config ? (filter events based on column names, filters, ~isnan, etc.)

        # TODO: is this required ?
        # Update parameters related to target direction on camera frame for MC data
        # taking into account of the aberration effect using effective focal length
        if "disp_norm" in dl1_df.columns:
            update_disp_with_effective_focal_length(dl1_df, effective_focal_length=tel_optics.effective_focal_length)

        # TODO: make dl1_to_dl2 not return anything - inplace operations in the df
        dl2_df = dl1_to_dl2(
            dl1_df,
            config,
            tel_optics.effective_focal_length,
            energy_regressor,
            gamma_classifier,
            disp_vector_regressor,
            disp_norm_regressor,
            disp_sign_classifier,
        )

        init_dl2_file(dl2_file_path, dl1_file_path)
        write_dl2_df(
            dl2_file_path,
            dl2_df,
            attributes={"config": config},
        )
# Run the command-line interface only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()