diff --git a/magicctapipe/scripts/lst1_magic/README.md b/magicctapipe/scripts/lst1_magic/README.md index 9da8b49ee..c494d0d72 100644 --- a/magicctapipe/scripts/lst1_magic/README.md +++ b/magicctapipe/scripts/lst1_magic/README.md @@ -29,30 +29,17 @@ During the analysis, some files (i.e., bash scripts, lists of sources and runs) ### DL0 to DL1 -In this step, we will convert the MAGIC Calibrated data to Data Level (DL) 1 (our goal is to reach DL3) and MC DL0 to DL1. +In this step, we will convert the MAGIC Calibrated data to Data Level (DL) 1 (our goal is to reach DL3). -In your working IT Container directory (e.g. /fefs/aswg/workspace/yourname/yourprojectname), open your environment with the command `conda activate {env_name}` and update the file `config_auto_MCP.yaml` according to your analysis. If you need non-standard parameters (e.g., for the cleaning), take care that the `resources/config.yaml` file gets installed when you install the pipeline, so you will have to copy it, e.g. in your workspace, modify it and put the path to this new file in the `config_auto_MCP.yaml` (this way you don't need to install again the pipeline). +In your working IT Container directory (i.e., `workspace_dir`), open your environment with the command `conda activate {env_name}` and update the file `config_auto_MCP.yaml` according to your analysis. If you need non-standard parameters (e.g., for the cleaning), note that the `resources/config.yaml` file is installed together with the pipeline: copy it (e.g., into your workspace), modify the copy, and set the path to this new file in `config_auto_MCP.yaml`, so that you don't need to reinstall the pipeline. -The file `config_auto_MCP.yaml` must contain the telescope IDs, the directories with the MC data (ignored if you set NSB_matching = true), the data selection, and some information on the night sky background (NSB) level and software versions: +The file `config_auto_MCP.yaml` must contain parameters for data selection and some information on the night sky background (NSB) level and software versions: ``` - mc_tel_ids: - LST-1: 1 - LST-2: 0 - LST-3: 0 - LST-4: 0 - MAGIC-I: 2 - MAGIC-II: 3 - directories: workspace_dir : "/fefs/aswg/workspace/elisa.visentin/auto_MCP_PR/" # Output directory where all the data products will be saved. - # MC paths below are ignored if you set NSB_matching = true. - MC_gammas : "/fefs/aswg/data/mc/DL0/LSTProd2/TestDataset/sim_telarray" # set to "" if you don't want to process these Monte Carlo simulations. - MC_electrons : "" - MC_helium : "" - MC_protons : "/fefs/aswg/data/mc/DL0/LSTProd2/TrainingDataset/Protons/dec_2276/sim_telarray" - MC_gammadiff : "/fefs/aswg/data/mc/DL0/LSTProd2/TrainingDataset/GammaDiffuse/dec_2276/sim_telarray/" + data_selection: source_name_database: "CrabNebula" # MUST BE THE SAME AS IN THE DATABASE; Set to null to process all sources in the given time range. @@ -68,17 +55,13 @@ general: base_config_file: '' # path + name to a custom MCP config file. If not provided, the default config.yaml file will be used LST_version : "v0.10" # check the `processed_lstchain_file` version in the LST database!
LST_tailcut : "tailcut84" - focal_length : "effective" simtel_nsb : "/fefs/aswg/data/mc/DL0/LSTProd2/TestDataset/sim_telarray/node_theta_14.984_az_355.158_/output_v1.4/simtel_corsika_theta_14.984_az_355.158_run10.simtel.gz" # simtel file (DL0) to evaluate NSB - lstchain_modified_config : true # use_flatfield_heuristic = True to evaluate NSB - proton_train_fraction : 0.8 # 0.8 means that 80% of the DL1 protons will be used for training the Random Forest. + lstchain_modified_config : true # use_flatfield_heuristic = True to evaluate NSB nsb : [0.5, 1.0, 1.5, 2.0, 2.5, 3.0] env_name : magic-lst # name of the conda environment to be used to process data. cluster : "SLURM" # cluster management system on which data are processed. At the moment we have only SLURM available, in the future maybe also condor (PIC, CNAF). - NSB_matching : true # Set to false to process also the MCs. Set to true if adequate MC productions (DLx) are already available on the IT Container. - NSB_MC : 0.5 # extra noise in dim pixels used to process MCs; e.g., you could put here the average NSB value of the processed LST runs. Ignored if NSB_matching=true. - + ``` WARNING: Only the runs for which the `LST_version` parameter matches the `processed_lstchain_file` version in the LST database (i.e., the version used to evaluate the NSB level; generally the last available and processable version of a run) will be processed. @@ -113,9 +96,9 @@ The command `dl1_production` does a series of things: - Creates a directory with the target name within the directory `yourprojectname/{MCP_version}` and several subdirectories inside it that are necessary for the rest of the data reduction. The main directories are: ``` -/fefs/aswg/workspace/yourname/yourprojectname/VERSION/ -/fefs/aswg/workspace/yourname/yourprojectname/VERSION/{source}/DL1 -/fefs/aswg/workspace/yourname/yourprojectname/VERSION/{source}/DL1/[subdirectories] +workspace_dir/VERSION/ +workspace_dir/VERSION/{source}/DL1 +workspace_dir/VERSION/{source}/DL1/[subdirectories] ``` where [subdirectories] stands for several subdirectories containing the MAGIC subruns in the DL1 format. - Generates a configuration file called `config_DL0_to_DL1.yaml` with telescope ID information and adopted imaging/cleaning cuts, and puts it in the directory `[...]/yourprojectname/VERSION/{source}/` created in the previous step. @@ -131,7 +114,7 @@ or > $ squeue -u your_user_name -Once it is done, all of the subdirectories in `/fefs/aswg/workspace/yourname/yourprojectname/VERSION/{source}/DL1` will be filled with files of the type `dl1_MX.RunXXXXXX.0XX.h5` for each MAGIC subrun. +Once it is done, all of the subdirectories in `workspace_dir/VERSION/{source}/DL1` will be filled with files of the type `dl1_MX.RunXXXXXX.0XX.h5` for each MAGIC subrun. 
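As a quick sanity check of this step, you can count the produced DL1 files and look for suspiciously small ones. This is only a minimal sketch: the `workspace_dir/VERSION/{source}` path follows the placeholder layout shown above, and the few-kB size threshold is purely indicative (see the warning below).

```
# count the DL1 subrun files produced for a given source
find workspace_dir/VERSION/{source}/DL1 -name 'dl1_M*.h5' | wc -l

# list outputs smaller than ~5 kB, which usually come from failed jobs and should be removed
find workspace_dir/VERSION/{source}/DL1 -name 'dl1_M*.h5' -size -5k
```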
WARNING: some of these jobs could fail due to 'broken' input files. Before moving to the next step, check for failed jobs (through `job_accounting` and/or the log files) and remove the output files produced by them (these files are generally very small, below a few kB, and cannot be read in the following steps) diff --git a/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/__init__.py b/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/__init__.py index 22dc6ca2a..d153a2dd2 100644 --- a/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/__init__.py +++ b/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/__init__.py @@ -8,41 +8,34 @@ from .coincident_events import configfile_coincidence, linking_bash_lst from .dl1_production import ( config_file_gen, - directories_generator_MC, directories_generator_real, lists_and_bash_gen_MAGIC, - lists_and_bash_generator, ) from .job_accounting import run_shell from .list_from_h5 import clear_files, list_run, magic_date, split_lst_date from .merge_stereo import MergeStereo -from .merging_runs import merge, mergeMC, split_train_test -from .stereo_events import bash_stereo, bash_stereoMC, configfile_stereo +from .merging_runs import merge +from .stereo_events import bash_stereo, configfile_stereo __all__ = [ "bash_stereo", - "bash_stereoMC", "clear_files", "configfile_coincidence", "configfile_stereo", "config_file_gen", "directories_generator_real", - "directories_generator_MC", "existing_files", "fix_lists_and_convert", "linking_bash_lst", - "lists_and_bash_generator", "lists_and_bash_gen_MAGIC", "list_run", "magic_date", "merge", - "mergeMC", "MergeStereo", "missing_files", "rc_lines", "run_shell", "slurm_lines", "split_lst_date", - "split_train_test", "table_magic_runs", ] diff --git a/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/config_auto_MCP.yaml b/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/config_auto_MCP.yaml index 509585ba3..1ce9c418a 100644 --- a/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/config_auto_MCP.yaml +++ b/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/config_auto_MCP.yaml @@ -1,32 +1,24 @@ directories: - workspace_dir : "/fefs/aswg/workspace/elisa.visentin/auto_MCP_PR/" # Output directory where all the data products will be saved. - # MC paths below are ignored if you set NSB_matching = true. - MC_gammas : "/fefs/aswg/data/mc/DL0/LSTProd2/TestDataset/sim_telarray" # set to "" if you don't want to process these Monte Carlo simulations. - MC_electrons : "" - MC_helium : "" - MC_protons : "/fefs/aswg/data/mc/DL0/LSTProd2/TrainingDataset/Protons/dec_2276/sim_telarray" - MC_gammadiff : "/fefs/aswg/data/mc/DL0/LSTProd2/TrainingDataset/GammaDiffuse/dec_2276/sim_telarray/" + workspace_dir: "/fefs/aswg/workspace/elisa.visentin/auto_MCP_PR/" # Output directory where all the data products will be saved. + data_selection: source_name_database: "CrabNebula" # MUST BE THE SAME AS IN THE DATABASE; Set to null to process all sources in the given time range. source_name_output: 'Crabtest' # Name tag of your target. Used only if source_name_database != null. - time_range : True # Search for all runs in a LST time range (e.g., 2020_01_01 -> 2022_01_01). - min : "2023_11_17" - max : "2024_03_03" - date_list : ['2020_12_15','2021_03_11'] # LST list of days to be processed (only if time_range=False), format: YYYY_MM_DD. + time_range: True # Search for all runs in a LST time range (e.g., 2020_01_01 -> 2022_01_01).
+ min: "2023_11_17" + max: "2024_03_03" + date_list: ['2020_12_15','2021_03_11'] # LST list of days to be processed (only if time_range=False), format: YYYY_MM_DD. skip_LST_runs: [3216,3217] # LST runs to ignore. skip_MAGIC_runs: [5094658] # MAGIC runs to ignore. general: base_config_file: '' # path + name to a custom MCP config file. If not provided, the default config.yaml file will be used - LST_version : "v0.10" # check the `processed_lstchain_file` version in the LST database! - LST_tailcut : "tailcut84" - focal_length : "effective" - simtel_nsb : "/fefs/aswg/data/mc/DL0/LSTProd2/TestDataset/sim_telarray/node_theta_14.984_az_355.158_/output_v1.4/simtel_corsika_theta_14.984_az_355.158_run10.simtel.gz" # simtel file (DL0) to evaluate NSB - lstchain_modified_config : true # use_flatfield_heuristic = True to evaluate NSB - proton_train_fraction : 0.8 # 0.8 means that 80% of the DL1 protons will be used for training the Random Forest. - nsb : [0.5, 1.0, 1.5, 2.0, 2.5, 3.0] - env_name : magic-lst # name of the conda environment to be used to process data. - cluster : "SLURM" # cluster management system on which data are processed. At the moment we have only SLURM available, in the future maybe also condor (PIC, CNAF). - NSB_matching : true # Set to false to process also the MCs. Set to true if adequate MC productions (DLx) are already available on the IT Container. - NSB_MC : 0.5 # extra noise in dim pixels used to process MCs; e.g., you could put here the average NSB value of the processed LST runs. Ignored if NSB_matching=true. + LST_version: "v0.10" # check the `processed_lstchain_file` version in the LST database! + LST_tailcut: "tailcut84" + simtel_nsb: "/fefs/aswg/data/mc/DL0/LSTProd2/TestDataset/sim_telarray/node_theta_14.984_az_355.158_/output_v1.4/simtel_corsika_theta_14.984_az_355.158_run10.simtel.gz" # simtel file (DL0) to evaluate NSB + lstchain_modified_config: true # use_flatfield_heuristic = True to evaluate NSB + nsb: [0.5, 1.0, 1.5, 2.0, 2.5, 3.0] + env_name: magic-lst # name of the conda environment to be used to process data. + cluster: "SLURM" # cluster management system on which data are processed. At the moment we have only SLURM available, in the future maybe also condor (PIC, CNAF). + diff --git a/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/dl1_production.py b/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/dl1_production.py index e8043c0c7..bfd17707e 100644 --- a/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/dl1_production.py +++ b/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/dl1_production.py @@ -11,7 +11,7 @@ No LST data is used here. 
Standard usage: -$ dl1_production (-t analysis_type) (-c config_file.yaml) +$ dl1_production (-c config_file.yaml) """ import argparse import glob @@ -32,10 +32,8 @@ __all__ = [ "config_file_gen", - "lists_and_bash_generator", "lists_and_bash_gen_MAGIC", "directories_generator_real", - "directories_generator_MC", ] logger = logging.getLogger(__name__) @@ -43,7 +41,7 @@ logger.setLevel(logging.INFO) -def config_file_gen(target_dir, noise_value, NSB_match, source_name, config_file): +def config_file_gen(target_dir, source_name, config_file): """ Here we create the configuration file needed for transforming DL0 into DL1 @@ -52,10 +50,6 @@ def config_file_gen(target_dir, noise_value, NSB_match, source_name, config_file ---------- target_dir : path Directory to store the results - noise_value : list - List of the noise correction values for LST - NSB_match : bool - If real data are matched to pre-processed MCs or not source_name : str Name of the target source config_file : str @@ -69,152 +63,17 @@ def config_file_gen(target_dir, noise_value, NSB_match, source_name, config_file ) as fc: # "rb" mode opens the file in binary format for reading config_dict = yaml.safe_load(fc) - LST_config = config_dict["LST"] - MAGIC_config = config_dict["MAGIC"] - - if not NSB_match: - LST_config["increase_nsb"]["extra_noise_in_dim_pixels"] = noise_value[0] - LST_config["increase_nsb"]["extra_bias_in_dim_pixels"] = noise_value[2] - LST_config["increase_nsb"]["extra_noise_in_bright_pixels"] = noise_value[1] - conf = { "mc_tel_ids": config_dict["mc_tel_ids"], - "LST": LST_config, - "MAGIC": MAGIC_config, + "LST": config_dict["LST"], + "MAGIC": config_dict["MAGIC"], } - if source_name == "MC": - file_name = f"{target_dir}/v{__version__}/MC/config_DL0_to_DL1.yaml" - else: - file_name = f"{target_dir}/v{__version__}/{source_name}/config_DL0_to_DL1.yaml" + + file_name = f"{target_dir}/v{__version__}/{source_name}/config_DL0_to_DL1.yaml" with open(file_name, "w") as f: yaml.dump(conf, f, default_flow_style=False) -def lists_and_bash_generator( - particle_type, - target_dir, - MC_path, - focal_length, - env_name, - cluster, -): - - """ - This function creates the lists list_nodes_*_complete.txt and list_folder_*.txt with the MC file paths. - After that, it generates a few bash scripts to link the MC paths to each subdirectory and to process them from DL0 to DL1. - These bash scripts will be called later in the main() function below. This step will be skipped in case the MC path has not been provided (MC_path='') - - Parameters - ---------- - particle_type : str - Particle type (e.g., protons) - target_dir : str - Directory to store the results - MC_path : str - Path to the MCs DL0s - focal_length : str - Focal length to be used to process MCs (e.g., 'nominal') - env_name : str - Name of the environment - cluster : str - Cluster system - """ - - if MC_path == "": - return - print(f"running {particle_type} from {MC_path}") - process_name = "MC" - - list_of_nodes = glob.glob(f"{MC_path}/node*") - dir1 = f"{target_dir}/v{__version__}/MC" - with open( - f"{dir1}/logs/list_nodes_{particle_type}_complete.txt", "w" - ) as f: # creating list_nodes_gammas_complete.txt - for i in list_of_nodes: - out_list = glob.glob(f"{i}/output*") - if len(out_list) == 0: - logger.error( - f"No output file for node {i}, or the directory structure is not the usual one. Skipping..." 
- ) - continue - elif len(out_list) == 1: - f.write(f"{out_list[0]}\n") - else: - output_index = input( - f"The available outputs are {out_list}, please provide the array index of the desired one:" - ) - f.write(f"{out_list[output_index]}\n") - - with open( - f"{dir1}/logs/list_folder_{particle_type}.txt", "w" - ) as f: # creating list_folder_gammas.txt - for i in list_of_nodes: - f.write(f'{i.split("/")[-1]}\n') - - #################################################################################### - # bash scripts that link the MC paths to each subdirectory. - #################################################################################### - if cluster != "SLURM": - logger.warning( - "Automatic processing not implemented for the cluster indicated in the config file" - ) - return - with open(f"linking_MC_{particle_type}_paths.sh", "w") as f: - slurm = slurm_lines( - queue="short", - job_name=process_name, - out_name=f"{dir1}/DL1/{particle_type}/logs/slurm-linkMC-%x.%j", - ) - lines_of_config_file = slurm + [ - "while read -r -u 3 lineA && read -r -u 4 lineB\n", - "do\n", - f" cd {dir1}/DL1/{particle_type}\n", - " mkdir $lineB\n", - " cd $lineA\n", - " ls -lR *.gz |wc -l\n", - f" mkdir -p {dir1}/DL1/{particle_type}/$lineB/logs/\n", - f" ls *.gz > {dir1}/DL1/{particle_type}/$lineB/logs/list_dl0.txt\n", - ' string=$lineA"/"\n', - f" export file={dir1}/DL1/{particle_type}/$lineB/logs/list_dl0.txt\n\n", - " cat $file | while read line; do echo $string${line}" - + f" >>{dir1}/DL1/{particle_type}/$lineB/logs/list_dl0_ok.txt; done\n\n", - ' echo "folder $lineB and node $lineA"\n', - f'done 3<"{dir1}/logs/list_nodes_{particle_type}_complete.txt" 4<"{dir1}/logs/list_folder_{particle_type}.txt"\n', - "", - ] - f.writelines(lines_of_config_file) - - ################################################################################################################ - # bash script that applies lst1_magic_mc_dl0_to_dl1.py to all MC data files. 
- ################################################################################################################ - - number_of_nodes = glob.glob(f"{MC_path}/node*") - number_of_nodes = len(number_of_nodes) - 1 - with open(f"linking_MC_{particle_type}_paths_r.sh", "w") as f: - slurm = slurm_lines( - queue="xxl", - job_name=process_name, - array=number_of_nodes, - mem="10g", - out_name=f"{dir1}/DL1/{particle_type}/logs/slurm-%x.%A_%a", - ) - lines_of_config_file = slurm + [ - f"cd {dir1}/DL1/{particle_type}\n\n", - f"export INF={dir1}/logs\n", - f"SAMPLE_LIST=($(<$INF/list_folder_{particle_type}.txt))\n", - "SAMPLE=${SAMPLE_LIST[${SLURM_ARRAY_TASK_ID}]}\n", - "cd $SAMPLE\n\n", - f"export LOG={dir1}/DL1/{particle_type}/logs/simtel_{{$SAMPLE}}_${{SLURM_ARRAY_JOB_ID}}_${{SLURM_ARRAY_TASK_ID}}_all.log\n", - "cat logs/list_dl0_ok.txt | while read line\n", - "do\n", - f" cd {dir1}/../\n", - f" conda run -n {env_name} lst1_magic_mc_dl0_to_dl1 --input-file $line --output-dir {dir1}/DL1/{particle_type}/$SAMPLE --config-file {dir1}/config_DL0_to_DL1.yaml --focal_length_choice {focal_length}>>$LOG 2>&1\n\n", - "done\n", - "", - ] - f.writelines(lines_of_config_file) - - def lists_and_bash_gen_MAGIC( target_dir, telescope_ids, MAGIC_runs, source, env_name, cluster ): @@ -301,9 +160,7 @@ def lists_and_bash_gen_MAGIC( f.writelines(lines) -def directories_generator_real( - target_dir, telescope_ids, MAGIC_runs, NSB_match, source_name -): +def directories_generator_real(target_dir, telescope_ids, MAGIC_runs, source_name): """ Here we create all subdirectories for a given workspace and target name. @@ -315,37 +172,12 @@ def directories_generator_real( List of the telescope IDs (set by the user) MAGIC_runs : array MAGIC dates and runs to be processed - NSB_match : bool - If real data are matched to pre-processed MCs or not source_name : str Name of the target source """ - if NSB_match: - os.makedirs(f"{target_dir}/v{__version__}/{source_name}/DL1", exist_ok=True) - dl1_dir = str(f"{target_dir}/v{__version__}/{source_name}/DL1") - else: - - dl1_dir = str(f"{target_dir}/v{__version__}/{source_name}/DL1") - if not os.path.exists(f"{target_dir}/v{__version__}/{source_name}"): - os.makedirs( - f"{target_dir}/v{__version__}/{source_name}/DL1", - exist_ok=True, - ) - - else: - overwrite = input( - f'data directory for {target_dir.split("/")[-1]} already exists. Would you like to overwrite it? [only "y" or "n"]: ' - ) - if overwrite == "y": - os.system(f"rm -r {target_dir}/v{__version__}/{source_name}") - os.makedirs( - f"{target_dir}/v{__version__}/{source_name}/DL1", - exist_ok=True, - ) - - else: - print("Directory not modified.") + dl1_dir = str(f"{target_dir}/v{__version__}/{source_name}/DL1") + os.makedirs(dl1_dir, exist_ok=True) ########################################### # MAGIC @@ -356,50 +188,6 @@ def directories_generator_real( os.makedirs(f"{dl1_dir}/M{magic}/{i[0]}/{i[1]}/logs", exist_ok=True) -def directories_generator_MC(target_dir, telescope_ids): - - """ - Here we create all subdirectories for a given workspace and target name. 
- - Parameters - ---------- - target_dir : str - Directory to store the results - telescope_ids : list - List of the telescope IDs (set by the user) - """ - - dir_list = [ - "gammas", - "gammadiffuse", - "electrons", - "protons", - "helium", - ] - if not os.path.exists(f"{target_dir}/v{__version__}/MC"): - os.makedirs(f"{target_dir}/v{__version__}/MC/logs", exist_ok=True) - os.makedirs(f"{target_dir}/v{__version__}/MC/DL1", exist_ok=True) - for dir in dir_list: - os.makedirs( - f"{target_dir}/v{__version__}/MC/DL1/{dir}/logs", - exist_ok=True, - ) - else: - overwrite = input( - 'MC directory already exists. Would you like to overwrite it? [only "y" or "n"]: ' - ) - if overwrite == "y": - os.system(f"rm -r {target_dir}/v{__version__}/MC") - os.makedirs(f"{target_dir}/v{__version__}/MC/logs", exist_ok=True) - for dir in dir_list: - os.makedirs( - f"{target_dir}/v{__version__}/MC/DL1/{dir}/logs", - exist_ok=True, - ) - else: - print("Directory not modified.") - - def main(): """ @@ -409,15 +197,6 @@ def main(): # Here we are simply collecting the parameters from the command line, as input file, output directory, and configuration file parser = argparse.ArgumentParser() - parser.add_argument( - "--analysis-type", - "-t", - choices=["onlyMAGIC", "onlyMC"], - dest="analysis_type", - type=str, - default="doEverything", - help="You can type 'onlyMAGIC' or 'onlyMC' to run this script only on MAGIC or MC data, respectively.", - ) parser.add_argument( "--config-file", @@ -436,15 +215,7 @@ def main(): telescope_ids = list(config["mc_tel_ids"].values()) env_name = config["general"]["env_name"] - NSB_match = config["general"]["NSB_matching"] config_file = config["general"]["base_config_file"] - - MC_gammas = config["directories"]["MC_gammas"] - MC_electrons = config["directories"]["MC_electrons"] - MC_helium = config["directories"]["MC_helium"] - MC_protons = config["directories"]["MC_protons"] - MC_gammadiff = config["directories"]["MC_gammadiff"] - focal_length = config["general"]["focal_length"] source_in = config["data_selection"]["source_name_database"] source = config["data_selection"]["source_name_output"] cluster = config["general"]["cluster"] @@ -458,95 +229,46 @@ def main(): source = source_in source_list = [source] - noise_value = [0, 0, 0] - if not NSB_match: - nsb = config["general"]["NSB_MC"] - - noisebright = 1.15 * pow(nsb, 1.115) - biasdim = 0.358 * pow(nsb, 0.805) - noise_value = [nsb, noisebright, biasdim] - - if not NSB_match: - # Below we run the analysis on the MC data - if (args.analysis_type == "onlyMC") or (args.analysis_type == "doEverything"): - directories_generator_MC( - str(target_dir), telescope_ids - ) # Here we create all the necessary directories in the given workspace and collect the main directory of the target - config_file_gen(target_dir, noise_value, NSB_match, "MC", config_file) - to_process = { - "gammas": MC_gammas, - "electrons": MC_electrons, - "helium": MC_helium, - "protons": MC_protons, - "gammadiffuse": MC_gammadiff, - } - for particle in to_process.keys(): - lists_and_bash_generator( - particle, - target_dir, - to_process[particle], - focal_length, - env_name, - cluster, - ) - list_of_MC = glob.glob(f"linking_MC_{particle}_*.sh") - if len(list_of_MC) < 2: - logger.warning( - f"No bash script has been produced for processing {particle}" - ) - else: - launch_jobs_MC = f"linking=$(sbatch --parsable linking_MC_{particle}_paths.sh) && running=$(sbatch --parsable --dependency=afterany:$linking linking_MC_{particle}_paths_r.sh)" - os.system(launch_jobs_MC) - # 
Here we do the MC DL0 to DL1 conversion: - for source_name in source_list: - if ( - (args.analysis_type == "onlyMAGIC") - or (args.analysis_type == "doEverything") - or (NSB_match) - ): - - MAGIC_runs_and_dates = f"{source_name}_MAGIC_runs.txt" - MAGIC_runs = np.genfromtxt( - MAGIC_runs_and_dates, dtype=str, delimiter=",", ndmin=2 - ) # READ LIST OF DATES AND RUNS: format table where each line is like "2020_11_19,5093174" - - # TODO: fix here above - print("*** Converting Calibrated into DL1 data ***") - print(f"Process name: {source_name}") - print( - f"To check the jobs submitted to the cluster, type: squeue -n {source_name}" - ) - - directories_generator_real( - str(target_dir), telescope_ids, MAGIC_runs, NSB_match, source_name - ) # Here we create all the necessary directories in the given workspace and collect the main directory of the target - config_file_gen( - target_dir, noise_value, NSB_match, source_name, config_file - ) - # Below we run the analysis on the MAGIC data - - lists_and_bash_gen_MAGIC( - target_dir, - telescope_ids, - MAGIC_runs, - source_name, - env_name, - cluster, - ) # MAGIC real data - if (telescope_ids[-2] > 0) or (telescope_ids[-1] > 0): - list_of_MAGIC_runs = glob.glob(f"{source_name}_MAGIC-*.sh") - if len(list_of_MAGIC_runs) < 1: - logger.warning( - "No bash script has been produced. Please check the provided MAGIC_runs.txt and the MAGIC calibrated data" - ) - continue + MAGIC_runs_and_dates = f"{source_name}_MAGIC_runs.txt" + MAGIC_runs = np.genfromtxt( + MAGIC_runs_and_dates, dtype=str, delimiter=",", ndmin=2 + ) # READ LIST OF DATES AND RUNS: format table where each line is like "2020_11_19,5093174" + + print("*** Converting Calibrated into DL1 data ***") + print(f"Process name: {source_name}") + print( + f"To check the jobs submitted to the cluster, type: squeue -n {source_name}" + ) + + directories_generator_real( + str(target_dir), telescope_ids, MAGIC_runs, source_name + ) # Here we create all the necessary directories in the given workspace and collect the main directory of the target + config_file_gen(target_dir, source_name, config_file) + + # Below we run the analysis on the MAGIC data + + lists_and_bash_gen_MAGIC( + target_dir, + telescope_ids, + MAGIC_runs, + source_name, + env_name, + cluster, + ) # MAGIC real data + if (telescope_ids[-2] > 0) or (telescope_ids[-1] > 0): + list_of_MAGIC_runs = glob.glob(f"{source_name}_MAGIC-*.sh") + if len(list_of_MAGIC_runs) < 1: + logger.warning( + "No bash script has been produced. 
Please check the provided MAGIC_runs.txt and the MAGIC calibrated data" + ) + continue - launch_jobs = f"linking=$(sbatch --parsable {source_name}_linking_MAGIC_data_paths.sh)" - for n, run in enumerate(list_of_MAGIC_runs): - launch_jobs = f"{launch_jobs} && RES{n}=$(sbatch --parsable --dependency=afterany:$linking {run})" + launch_jobs = f"linking=$(sbatch --parsable {source_name}_linking_MAGIC_data_paths.sh)" + for n, run in enumerate(list_of_MAGIC_runs): + launch_jobs = f"{launch_jobs} && RES{n}=$(sbatch --parsable --dependency=afterany:$linking {run})" - os.system(launch_jobs) + os.system(launch_jobs) if __name__ == "__main__": diff --git a/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/merging_runs.py b/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/merging_runs.py index b2fb9086a..ca80d7d56 100644 --- a/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/merging_runs.py +++ b/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/merging_runs.py @@ -1,27 +1,13 @@ """ -This script splits the proton MC data sample into "train" -and "test" and generates the bash scripts to merge MC and -real data files by calling the script "merge_hdf_files.py": +This script generates the bash +scripts to merge real data files by calling the script "merge_hdf_files.py": MAGIC: Merge the subruns into runs for M1 and M2 individually. -MC: - -Merges all MC runs in a node - Usage: -$ merging_runs (-c config.yaml) (-t analysis_type) - -If you want to merge only the MAGIC or only the MC data, -you can do as follows: - -Only MAGIC: -$ merging_runs -t onlyMAGIC (-c config.yaml) - -Only MC: -$ merging_runs -t onlyMC (-c config.yaml) +$ merging_runs (-c config.yaml) """ import argparse @@ -33,7 +19,6 @@ import joblib import numpy as np import yaml -from tqdm import tqdm from magicctapipe import __version__ from magicctapipe.scripts.lst1_magic.semi_automatic_scripts.clusters import ( @@ -41,58 +26,13 @@ slurm_lines, ) -__all__ = ["split_train_test", "merge", "mergeMC"] +__all__ = ["merge"] logger = logging.getLogger(__name__) logger.addHandler(logging.StreamHandler()) logger.setLevel(logging.INFO) -def split_train_test(target_dir, train_fraction): - - """ - This function splits the MC proton sample in 2, i.e. the "test" and the "train" subsamples, in case you want to make performance studies on MC. For regular analyses, you can/should use the whole MC sample for training. - It generates 2 subdirectories in the directory .../DL1/MC/protons named "test" and "train" and creates sub-sub-directories with the names of all nodes. - For each node sub-sub-directory we move `train_fraction` of the .h5 files to the "train" subdirectory and `1-train_fraction` of the .h5 files to the "test" subdirectory. 
- - Parameters - ---------- - target_dir : str - Path to the working directory - train_fraction : float - Fraction of proton MC files to be used in the training of RFs - """ - - proton_dir = f"{target_dir}/v{__version__}/MC/DL1/protons" - - list_of_dir = np.sort(glob.glob(f"{proton_dir}/node*{os.path.sep}")) - - for directory in tqdm( - range(len(list_of_dir)) - ): # tqdm allows us to print a progessbar in the terminal - node = list_of_dir[directory].split("/")[-2] - os.makedirs(f"{proton_dir}/train/{node}", exist_ok=True) - os.makedirs( - f'{proton_dir}/../protons_test/{list_of_dir[directory].split("/")[-2]}', - exist_ok=True, - ) - list_of_runs = np.sort( - glob.glob(f'{proton_dir}/{list_of_dir[directory].split("/")[-2]}/*.h5') - ) - number_train_runs = int(len(list_of_runs) * train_fraction) - for j in list_of_runs[0:number_train_runs]: - os.system(f"mv {j} {proton_dir}/train/{node}") - - os.system(f"cp {list_of_dir[directory]}logs/*.txt {proton_dir}/train/{node}") - os.system( - f"mv {list_of_dir[directory]}logs/*.txt {proton_dir}/../protons_test/{node}" - ) - os.system( - f"mv {list_of_dir[directory]}*.h5 {proton_dir}/../protons_test/{node}" - ) - os.system(f"rm -r {list_of_dir[directory]}") - - def merge(target_dir, MAGIC_runs, env_name, source, cluster): """ @@ -152,62 +92,6 @@ def merge(target_dir, MAGIC_runs, env_name, source, cluster): logger.error(f"{indir} does not exist") -def mergeMC(target_dir, identification, env_name, cluster): - - """ - This function creates the bash scripts to run merge_hdf_files.py in all MC nodes. - - Parameters - ---------- - target_dir : str - Path to the working directory - identification : str - Tells which sample to process - env_name : str - Name of the environment - cluster : str - Cluster system - """ - - process_name = "merging_MC" - - MC_DL1_dir = f"{target_dir}/v{__version__}/MC/DL1" - os.makedirs(f"{MC_DL1_dir}/{identification}/Merged/logs", exist_ok=True) - - if identification == "protons": - list_of_nodes = np.sort(glob.glob(f"{MC_DL1_dir}/{identification}/train/node*")) - else: - list_of_nodes = np.sort(glob.glob(f"{MC_DL1_dir}/{identification}/node*")) - - np.savetxt( - f"{MC_DL1_dir}/{identification}/list_of_nodes.txt", list_of_nodes, fmt="%s" - ) - - process_size = len(list_of_nodes) - 1 - - if cluster != "SLURM": - logger.warning( - "Automatic processing not implemented for the cluster indicated in the config file" - ) - return - with open(f"Merge_MC_{identification}.sh", "w") as f: - slurm = slurm_lines( - queue="short", - array=process_size, - mem="7g", - job_name=process_name, - out_name=f"{MC_DL1_dir}/{identification}/Merged/logs/slurm-%x.%A_%a", - ) - lines_bash_file = slurm + [ - f"SAMPLE_LIST=($(<{MC_DL1_dir}/{identification}/list_of_nodes.txt))\n", - "SAMPLE=${SAMPLE_LIST[${SLURM_ARRAY_TASK_ID}]}\n", - f"export LOG={MC_DL1_dir}/{identification}/Merged/logs" - + "/merged_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.log\n", - f"conda run -n {env_name} merge_hdf_files --input-dir $SAMPLE --output-dir {MC_DL1_dir}/{identification}/Merged >$LOG 2>&1\n", - ] - f.writelines(lines_bash_file) - - def main(): """ @@ -224,16 +108,6 @@ def main(): help="Path to a configuration file", ) - parser.add_argument( - "--analysis-type", - "-t", - choices=["onlyMAGIC", "onlyMC"], - dest="analysis_type", - type=str, - default="doEverything", - help="You can type 'onlyMAGIC' or 'onlyMC' to run this script only on MAGIC or MC data, respectively.", - ) - args = parser.parse_args() with open( args.config_file, "rb" @@ -242,9 +116,6 @@ def main(): 
target_dir = Path(config["directories"]["workspace_dir"]) - NSB_match = config["general"]["NSB_matching"] - train_fraction = float(config["general"]["proton_train_fraction"]) - env_name = config["general"]["env_name"] source_in = config["data_selection"]["source_name_database"] source = config["data_selection"]["source_name_output"] @@ -257,41 +128,6 @@ def main(): if source is None: source = source_in source_list = [source] - if not NSB_match: - if (args.analysis_type == "onlyMC") or (args.analysis_type == "doEverything"): - # Here we slice the proton MC data into "train" and "test" (but first we check if the directory already exists): - if not os.path.exists(f"{target_dir}/v{__version__}/MC/DL1/protons_test"): - print("***** Splitting protons into 'train' and 'test' datasets...") - split_train_test(target_dir, train_fraction) - - print("***** Generating merge_MC bashscripts...") - mergeMC( - target_dir, "protons", env_name, cluster - ) # generating the bash script to merge the files - mergeMC( - target_dir, "gammadiffuse", env_name, cluster - ) # generating the bash script to merge the files - mergeMC( - target_dir, "gammas", env_name, cluster - ) # generating the bash script to merge the files - mergeMC(target_dir, "protons_test", env_name, cluster) - mergeMC(target_dir, "helium", env_name, cluster) - mergeMC(target_dir, "electrons", env_name, cluster) - - print("***** Running merge_hdf_files.py on the MC data files...") - - # Below we run the bash scripts to merge the MC files - list_of_merging_scripts = np.sort(glob.glob("Merge_MC_*.sh")) - if len(list_of_merging_scripts) < 1: - logger.warning("No bash script has been produced for MC") - # TODO: check - - else: - launch_jobs = "" - for n, run in enumerate(list_of_merging_scripts): - launch_jobs += (" && " if n > 0 else "") + f"sbatch {run}" - - os.system(launch_jobs) for source_name in source_list: MAGIC_runs_and_dates = f"{source_name}_MAGIC_runs.txt" @@ -300,34 +136,28 @@ def main(): ) # Below we run the analysis on the MAGIC data - if ( - (args.analysis_type == "onlyMAGIC") - or (args.analysis_type == "doEverything") - or (NSB_match) - ): - print("***** Generating merge_MAGIC bashscripts...") - merge( - target_dir, - MAGIC_runs, - env_name, - source_name, - cluster, - ) # generating the bash script to merge the subruns - - print("***** Running merge_hdf_files.py on the MAGIC data files...") - - # Below we run the bash scripts to merge the MAGIC files - list_of_merging_scripts = np.sort( - glob.glob(f"{source_name}_Merge_MAGIC*.sh") - ) - if len(list_of_merging_scripts) < 1: - logger.warning("No bash scripts for real data") - continue - launch_jobs = "" - for n, run in enumerate(list_of_merging_scripts): - launch_jobs += (" && " if n > 0 else "") + f"sbatch {run}" - - os.system(launch_jobs) + + print("***** Generating merge_MAGIC bashscripts...") + merge( + target_dir, + MAGIC_runs, + env_name, + source_name, + cluster, + ) # generating the bash script to merge the subruns + + print("***** Running merge_hdf_files.py on the MAGIC data files...") + + # Below we run the bash scripts to merge the MAGIC files + list_of_merging_scripts = np.sort(glob.glob(f"{source_name}_Merge_MAGIC*.sh")) + if len(list_of_merging_scripts) < 1: + logger.warning("No bash scripts for real data") + continue + launch_jobs = "" + for n, run in enumerate(list_of_merging_scripts): + launch_jobs += (" && " if n > 0 else "") + f"sbatch {run}" + + os.system(launch_jobs) print(f"Process name: merging_{source_name}") print( diff --git 
a/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/stereo_events.py b/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/stereo_events.py index 785b3eae4..a2f8a4eea 100644 --- a/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/stereo_events.py +++ b/magicctapipe/scripts/lst1_magic/semi_automatic_scripts/stereo_events.py @@ -1,19 +1,10 @@ """ This scripts generates and runs the bashscripts -to compute the stereo parameters of DL1 MC and +to compute the stereo parameters of DL1 Coincident MAGIC+LST data files. Usage: -$ stereo_events (-c config.yaml) (-t analysis_type) - -If you want to compute the stereo parameters only the real data or only the MC data, -you can do as follows: - -Only real data: -$ stereo_events -t onlyMAGIC (-c config.yaml) - -Only MC: -$ stereo_events -t onlyMC (-c config.yaml) +$ stereo_events (-c config.yaml) """ import argparse @@ -33,7 +24,7 @@ slurm_lines, ) -__all__ = ["configfile_stereo", "bash_stereo", "bash_stereoMC"] +__all__ = ["configfile_stereo", "bash_stereo"] logger = logging.getLogger(__name__) logger.addHandler(logging.StreamHandler()) @@ -66,10 +57,7 @@ def configfile_stereo(target_dir, source_name, config_file): "mc_tel_ids": config_dict["mc_tel_ids"], "stereo_reco": config_dict["stereo_reco"], } - if source_name == "MC": - file_name = f"{target_dir}/v{__version__}/MC/config_stereo.yaml" - else: - file_name = f"{target_dir}/v{__version__}/{source_name}/config_stereo.yaml" + file_name = f"{target_dir}/v{__version__}/{source_name}/config_stereo.yaml" with open(file_name, "w") as f: yaml.dump(conf, f, default_flow_style=False) @@ -147,57 +135,6 @@ def bash_stereo(target_dir, source, env_name, cluster): f.writelines(lines) -def bash_stereoMC(target_dir, identification, env_name, cluster): - - """ - This function generates the bashscripts for running the stereo analysis. - - Parameters - ---------- - target_dir : str - Path to the working directory - identification : str - Particle name. Options: protons, gammadiffuse, gammas, protons_test - env_name : str - Name of the environment - cluster : str - Cluster system - """ - - process_name = "stereo_MC" - - inputdir = f"{target_dir}/v{__version__}/MC/DL1/{identification}/Merged" - os.makedirs(f"{inputdir}/StereoMerged/logs", exist_ok=True) - - os.system( - f"ls {inputdir}/dl1*.h5 > {inputdir}/list_coin.txt" - ) # generating a list with the DL1 coincident data files. 
- with open(f"{inputdir}/list_coin.txt", "r") as f: - process_size = len(f.readlines()) - 1 - if cluster != "SLURM": - logger.warning( - "Automatic processing not implemented for the cluster indicated in the config file" - ) - return - with open(f"StereoEvents_MC_{identification}.sh", "w") as f: - slurm = slurm_lines( - queue="xxl", - job_name=f"{process_name}_stereo", - array=process_size, - mem="7g", - out_name=f"{inputdir}/StereoMerged/logs/slurm-%x.%A_%a", - ) - lines = slurm + [ - f"export INPUTDIR={inputdir}\n", - f"export OUTPUTDIR={inputdir}/StereoMerged\n", - "SAMPLE_LIST=($(<$INPUTDIR/list_coin.txt))\n", - "SAMPLE=${SAMPLE_LIST[${SLURM_ARRAY_TASK_ID}]}\n", - "export LOG=$OUTPUTDIR/logs/stereo_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.log\n", - f"conda run -n {env_name} lst1_magic_stereo_reco --input-file $SAMPLE --output-dir $OUTPUTDIR --config-file {target_dir}/v{__version__}/MC/config_stereo.yaml >$LOG 2>&1", - ] - f.writelines(lines) - - def main(): """ @@ -214,16 +151,6 @@ def main(): help="Path to a configuration file", ) - parser.add_argument( - "--analysis-type", - "-t", - choices=["onlyMAGIC", "onlyMC"], - dest="analysis_type", - type=str, - default="doEverything", - help="You can type 'onlyMAGIC' or 'onlyMC' to run this script only on real or MC data, respectively.", - ) - args = parser.parse_args() with open( args.config_file, "rb" @@ -235,7 +162,6 @@ def main(): env_name = config["general"]["env_name"] config_file = config["general"]["base_config_file"] - NSB_match = config["general"]["NSB_matching"] source_in = config["data_selection"]["source_name_database"] source = config["data_selection"]["source_name_output"] @@ -247,68 +173,33 @@ def main(): if source is None: source = source_in source_list = [source] - if not NSB_match: - if ( - (args.analysis_type == "onlyMC") - or (args.analysis_type == "doEverything") - and not NSB_match - ): - configfile_stereo(target_dir, "MC", config_file) - print("***** Generating the bashscript for MCs...") - for part in [ - "gammadiffuse", - "gammas", - "protons", - "protons_test", - "helium", - "electrons", - ]: - bash_stereoMC(target_dir, part, env_name, cluster) - - list_of_stereo_scripts = np.sort(glob.glob("StereoEvents_MC_*.sh")) - if len(list_of_stereo_scripts) < 1: - logger.warning("No bash script has been produced for processing MCs") - else: - launch_jobs = "" - # TODO: check on N. 
bash scripts - - for n, run in enumerate(list_of_stereo_scripts): - launch_jobs += (" && " if n > 0 else "") + f"sbatch {run}" - - os.system(launch_jobs) for source_name in source_list: - if ( - (args.analysis_type == "onlyMAGIC") - or (args.analysis_type == "doEverything") - or (NSB_match) - ): - print("***** Generating file config_stereo.yaml...") - configfile_stereo(target_dir, source_name, config_file) - - # Below we run the analysis on the real data - - print("***** Generating the bashscript...") - bash_stereo(target_dir, source_name, env_name, cluster) - - print("***** Submitting processess to the cluster...") - print(f"Process name: {source_name}_stereo") - print( - f"To check the jobs submitted to the cluster, type: squeue -n {source_name}_stereo" - ) - - # Below we run the bash scripts to find the stereo events - list_of_stereo_scripts = np.sort( - glob.glob(f"{source_name}_StereoEvents*.sh") - ) - if len(list_of_stereo_scripts) < 1: - logger.warning("No bash scripts for real data") - continue - launch_jobs = "" - for n, run in enumerate(list_of_stereo_scripts): - launch_jobs += (" && " if n > 0 else "") + f"sbatch {run}" - - os.system(launch_jobs) + + print("***** Generating file config_stereo.yaml...") + configfile_stereo(target_dir, source_name, config_file) + + # Below we run the analysis on the real data + + print("***** Generating the bashscript...") + bash_stereo(target_dir, source_name, env_name, cluster) + + print("***** Submitting processess to the cluster...") + print(f"Process name: {source_name}_stereo") + print( + f"To check the jobs submitted to the cluster, type: squeue -n {source_name}_stereo" + ) + + # Below we run the bash scripts to find the stereo events + list_of_stereo_scripts = np.sort(glob.glob(f"{source_name}_StereoEvents*.sh")) + if len(list_of_stereo_scripts) < 1: + logger.warning("No bash scripts for real data") + continue + launch_jobs = "" + for n, run in enumerate(list_of_stereo_scripts): + launch_jobs += (" && " if n > 0 else "") + f"sbatch {run}" + + os.system(launch_jobs) if __name__ == "__main__":
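With the MC branches removed, the three entry points touched by this patch are driven only by `config_auto_MCP.yaml`. Below is a minimal usage sketch of the resulting real-data chain, assuming the console-script names and the `-c` option shown in the docstrings above, with the environment name taken from the example config; the LST/MAGIC coincidence step, which this patch does not modify, still runs between merging and stereo as described in the README.

```
conda activate magic-lst                 # env_name from config_auto_MCP.yaml
dl1_production -c config_auto_MCP.yaml   # MAGIC Calibrated -> DL1 (no --analysis-type option anymore)
merging_runs -c config_auto_MCP.yaml     # merge MAGIC DL1 subruns into runs
# ... LST/MAGIC coincidence step as described in the README ...
stereo_events -c config_auto_MCP.yaml    # stereo parameters for coincident MAGIC+LST DL1 files
```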