From 074ea09d4035f414b62a03735d993777fbff0694 Mon Sep 17 00:00:00 2001 From: Karl Kosack Date: Thu, 24 Mar 2022 17:44:20 +0100 Subject: [PATCH 1/3] fixed last cell of notebook and reformat - PrecisionRecallDisplay crashed due to bins with no data - re-ran black and isort to reformat all cells --- .../benchmarks_MODELS_classification.ipynb | 221 ++++++++++-------- 1 file changed, 126 insertions(+), 95 deletions(-) diff --git a/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb b/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb index 76b781f5..99ac8f74 100644 --- a/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb +++ b/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb @@ -84,29 +84,35 @@ "metadata": {}, "outputs": [], "source": [ - "import gzip\n", "import glob\n", + "import gzip\n", + "import pickle\n", "from pathlib import Path\n", "\n", - "import pickle\n", "import joblib\n", - "import yaml\n", + "import matplotlib.patches as mpatches\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.style as style\n", "import numpy as np\n", "import pandas as pd\n", + "import yaml\n", + "from cycler import cycler\n", + "from matplotlib.pyplot import rc\n", "from scipy.optimize import curve_fit\n", - "from sklearn.metrics import roc_curve, auc\n", + "from sklearn.metrics import auc, roc_curve\n", "\n", - "import matplotlib.pyplot as plt\n", - "import matplotlib.patches as mpatches\n", - "from matplotlib.pyplot import rc\n", - "import matplotlib.style as style\n", - "from cycler import cycler\n", - "plt.rcParams.update({'figure.max_open_warning': 0})\n", + "plt.rcParams.update({\"figure.max_open_warning\": 0})\n", "\n", - "from protopipe.pipeline.io import load_config, load_obj\n", - "from protopipe.benchmarks.utils import string_to_boolean\n", "from protopipe.benchmarks.operations import get_evt_subarray_model_output\n", - "from protopipe.benchmarks.plot import plot_hist, plot_distributions, plot_roc_curve, ClassifierDiagnostic, BoostedDecisionTreeDiagnostic" + "from protopipe.benchmarks.plot import (\n", + " BoostedDecisionTreeDiagnostic,\n", + " ClassifierDiagnostic,\n", + " plot_distributions,\n", + " plot_hist,\n", + " plot_roc_curve,\n", + ")\n", + "from protopipe.benchmarks.utils import string_to_boolean\n", + "from protopipe.pipeline.io import load_config, load_obj" ] }, { @@ -136,8 +142,8 @@ "analyses_directory = None\n", "analysis_name = None\n", "analysis_name_2 = None\n", - "model_configuration_filename = None # Name of the configuration file of the model\n", - "output_directory = Path.cwd() # default output directory for plots\n", + "model_configuration_filename = None # Name of the configuration file of the model\n", + "output_directory = Path.cwd() # default output directory for plots\n", "use_seaborn = False" ] }, @@ -172,9 +178,17 @@ "metadata": {}, "outputs": [], "source": [ - "analysis_configuration_path = Path(analyses_directory) / analysis_name / Path(\"configs/analysis.yaml\")\n", - "model_configuration_path = Path(analyses_directory) / analysis_name / \"configs\" / model_configuration_filename\n", - "input_directory = Path(analyses_directory) / analysis_name / Path(\"estimators/gamma_hadron_classifier\")" + "analysis_configuration_path = (\n", + " Path(analyses_directory) / analysis_name / Path(\"configs/analysis.yaml\")\n", + ")\n", + "model_configuration_path = (\n", + " Path(analyses_directory) / analysis_name / \"configs\" / model_configuration_filename\n", + ")\n", + "input_directory = (\n", + " Path(analyses_directory)\n", + " / analysis_name\n", + " / Path(\"estimators/gamma_hadron_classifier\")\n", + ")" ] }, { @@ -198,23 +212,37 @@ "metadata": {}, "outputs": [], "source": [ - "cameras = [model.split('/')[-1].split('_')[1] for model in glob.glob(f\"{input_directory}/{model_type}*.pkl.gz\")]\n", - "data = {camera : dict.fromkeys([\"model\", \"data_scikit\", \"data_train\", \"data_test\"]) for camera in cameras} \n", + "cameras = [\n", + " model.split(\"/\")[-1].split(\"_\")[1]\n", + " for model in glob.glob(f\"{input_directory}/{model_type}*.pkl.gz\")\n", + "]\n", + "data = {\n", + " camera: dict.fromkeys([\"model\", \"data_scikit\", \"data_train\", \"data_test\"])\n", + " for camera in cameras\n", + "}\n", "\n", "for camera in cameras:\n", "\n", " data[camera][\"data_scikit\"] = load_obj(\n", - " glob.glob(f\"{input_directory}/data_scikit_{model_type}_{method_name}_{camera}.pkl.gz\")[0]\n", - " )\n", + " glob.glob(\n", + " f\"{input_directory}/data_scikit_{model_type}_{method_name}_{camera}.pkl.gz\"\n", + " )[0]\n", + " )\n", " data[camera][\"data_train\"] = pd.read_pickle(\n", - " glob.glob(f\"{input_directory}/data_train_{model_type}_{method_name}_{camera}.pkl.gz\")[0]\n", - " )\n", + " glob.glob(\n", + " f\"{input_directory}/data_train_{model_type}_{method_name}_{camera}.pkl.gz\"\n", + " )[0]\n", + " )\n", " data[camera][\"data_test\"] = pd.read_pickle(\n", - " glob.glob(f\"{input_directory}/data_test_{model_type}_{method_name}_{camera}.pkl.gz\")[0]\n", + " glob.glob(\n", + " f\"{input_directory}/data_test_{model_type}_{method_name}_{camera}.pkl.gz\"\n", + " )[0]\n", " )\n", - " \n", + "\n", " modelName = f\"{model_type}_{camera}_{method_name}.pkl.gz\"\n", - " data[camera][\"model\"] = joblib.load(glob.glob(f\"{input_directory}/{model_type}_{camera}_{method_name}.pkl.gz\")[0])" + " data[camera][\"model\"] = joblib.load(\n", + " glob.glob(f\"{input_directory}/{model_type}_{camera}_{method_name}.pkl.gz\")[0]\n", + " )" ] }, { @@ -256,11 +284,11 @@ "nbins = cfg[\"Diagnostic\"][\"energy\"][\"nbins\"]\n", "\n", "energy_edges = np.logspace(\n", - " np.log10(cfg[\"Diagnostic\"][\"energy\"][\"min\"]),\n", - " np.log10(cfg[\"Diagnostic\"][\"energy\"][\"max\"]),\n", - " nbins + 1,\n", - " True,\n", - " )" + " np.log10(cfg[\"Diagnostic\"][\"energy\"][\"min\"]),\n", + " np.log10(cfg[\"Diagnostic\"][\"energy\"][\"max\"]),\n", + " nbins + 1,\n", + " True,\n", + ")" ] }, { @@ -297,15 +325,16 @@ "outputs": [], "source": [ "diagnostic = dict.fromkeys(cameras)\n", - "for camera in cameras: \n", + "for camera in cameras:\n", " diagnostic[camera] = ClassifierDiagnostic(\n", - " model=data[camera][\"model\"],\n", - " feature_name_list=features,\n", - " target_name=cfg[\"Method\"][\"target_name\"],\n", - " data_train=data[camera][\"data_train\"],\n", - " data_test=data[camera][\"data_test\"],\n", - " model_output_name=output_model_name,\n", - " is_output_proba=cfg[\"Method\"][\"use_proba\"])" + " model=data[camera][\"model\"],\n", + " feature_name_list=features,\n", + " target_name=cfg[\"Method\"][\"target_name\"],\n", + " data_train=data[camera][\"data_train\"],\n", + " data_test=data[camera][\"data_test\"],\n", + " model_output_name=output_model_name,\n", + " is_output_proba=cfg[\"Method\"][\"use_proba\"],\n", + " )" ] }, { @@ -616,16 +645,12 @@ "n_feature = len(cut_list)\n", "ncols = 2\n", "nrows = (\n", - " int(n_feature / ncols)\n", - " if n_feature % ncols == 0\n", - " else int((n_feature + 1) / ncols)\n", + " int(n_feature / ncols) if n_feature % ncols == 0 else int((n_feature + 1) / ncols)\n", ")\n", "\n", "for camera in cameras:\n", "\n", - " fig, axes = plt.subplots(\n", - " nrows=nrows, ncols=ncols, figsize=(5 * ncols, 5 * nrows)\n", - " )\n", + " fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * ncols, 5 * nrows))\n", " plt.subplots_adjust(hspace=0.5, wspace=0.5)\n", " if nrows == 1 and ncols == 1:\n", " axes = [axes]\n", @@ -661,12 +686,14 @@ " )\n", "\n", " ax.set_xlim(the_range)\n", - " ax.set_ylim(0,1.2)\n", + " ax.set_ylim(0, 1.2)\n", " ax.set_xlabel(output_model_name)\n", " ax.set_ylabel(\"Arbitrary units\")\n", - " #ax.legend(loc=\"best\", fontsize=\"small\")\n", + " # ax.legend(loc=\"best\", fontsize=\"small\")\n", " ax.legend(loc=\"upper center\")\n", - " ax.set_title(f\"{energy_edges[i]:.2f} TeV < E_reco < {energy_edges[i+1]:.2f} TeV\")\n", + " ax.set_title(\n", + " f\"{energy_edges[i]:.2f} TeV < E_reco < {energy_edges[i+1]:.2f} TeV\"\n", + " )\n", " ax.grid()\n", "\n", " plt.suptitle(camera)" @@ -687,8 +714,8 @@ "outputs": [], "source": [ "for camera in cameras:\n", - " \n", - " plt.figure(figsize=(6,6))\n", + "\n", + " plt.figure(figsize=(6, 6))\n", " ax = plt.gca()\n", "\n", " color = 1.0\n", @@ -703,10 +730,15 @@ " opt = dict(\n", " color=str(c),\n", " lw=2,\n", - " #label=\"{}\".format(cut.replace(\"reco_energy\", \"E\")),\n", - " label=f\"{energy_edges[i]:.2f} TeV < E_reco < {energy_edges[i+1]:.2f} TeV\"\n", + " # label=\"{}\".format(cut.replace(\"reco_energy\", \"E\")),\n", + " label=f\"{energy_edges[i]:.2f} TeV < E_reco < {energy_edges[i+1]:.2f} TeV\",\n", + " )\n", + " plot_roc_curve(\n", + " ax,\n", + " test_data[output_model_name],\n", + " test_data[cfg[\"Method\"][\"target_name\"]],\n", + " **opt,\n", " )\n", - " plot_roc_curve(ax, test_data[output_model_name], test_data[cfg[\"Method\"][\"target_name\"]], **opt)\n", " ax.plot([0, 1], [0, 1], color=\"navy\", lw=2, linestyle=\"--\")\n", " ax.set_title(camera)\n", " ax.set_xlabel(\"False Positive Rate\")\n", @@ -730,11 +762,11 @@ "outputs": [], "source": [ "finer_energy_edges = np.logspace(\n", - " np.log10(0.02),\n", - " np.log10(200),\n", - " 21,\n", - " True,\n", - " )\n", + " np.log10(0.02),\n", + " np.log10(200),\n", + " 21,\n", + " True,\n", + ")\n", "\n", "cut_list_with_finer_energy_edges = [\n", " \"reco_energy >= {:.2f} and reco_energy <= {:.2f}\".format(\n", @@ -744,27 +776,36 @@ "]\n", "\n", "for camera in cameras:\n", - " plt.figure(figsize=(8,8))\n", + " plt.figure(figsize=(8, 8))\n", " plt.title(camera)\n", - " \n", + "\n", " aucs = []\n", " reco_energy = []\n", - " \n", + "\n", " for i, cut in enumerate(cut_list_with_finer_energy_edges):\n", - " \n", + "\n", " selected_images = data[camera][\"data_test\"].query(cut)\n", - " if len(selected_images)==0:\n", + " if len(selected_images) == 0:\n", " continue\n", - " \n", - " fpr, tpr, _ = roc_curve(y_score=selected_images[output_model_name], y_true=selected_images[cfg[\"Method\"][\"target_name\"]])\n", + "\n", + " fpr, tpr, _ = roc_curve(\n", + " y_score=selected_images[output_model_name],\n", + " y_true=selected_images[cfg[\"Method\"][\"target_name\"]],\n", + " )\n", " roc_auc = auc(fpr, tpr)\n", - " \n", + "\n", " aucs.append(roc_auc)\n", - " reco_energy.append( 0.5 * (finer_energy_edges[i] + finer_energy_edges[i+1]) )\n", + " reco_energy.append(0.5 * (finer_energy_edges[i] + finer_energy_edges[i + 1]))\n", "\n", " plt.plot(reco_energy, aucs, \"bo\")\n", - " plt.hlines(1, xmin=plt.gca().get_xlim()[0], xmax=plt.gca().get_xlim()[1], linestyles=\"dashed\", color=\"green\")\n", - " plt.ylim(0,1.2)\n", + " plt.hlines(\n", + " 1,\n", + " xmin=plt.gca().get_xlim()[0],\n", + " xmax=plt.gca().get_xlim()[1],\n", + " linestyles=\"dashed\",\n", + " color=\"green\",\n", + " )\n", + " plt.ylim(0, 1.2)\n", " plt.xscale(\"log\")\n", " plt.xlabel(\"log10(Reconstructed energy [TeV])\")\n", " plt.ylabel(\"AUC\")\n", @@ -796,49 +837,39 @@ " response_method = \"predict_proba\"\n", "\n", "for camera in cameras:\n", - " \n", + "\n", " plt.figure(figsize=(7, 5))\n", " plt.grid()\n", " plt.title(camera)\n", - " \n", + "\n", " for i, cut in enumerate(cut_list):\n", " c = color - (i + 1) * step_color\n", "\n", " selected_test_data = diagnostic[camera].data_test.query(cut)\n", "\n", - " if len(test_data) == 0:\n", + " # skip the energy bin if it's not there\n", + " if len(selected_test_data) == 0:\n", " continue\n", "\n", - " opt = dict(\n", - " color=str(c),\n", - " lw=2,\n", - " label=f\"{energy_edges[i]:.2f} TeV < E_reco < {energy_edges[i+1]:.2f} TeV\"\n", + " PrecisionRecallDisplay.from_estimator(\n", + " estimator=diagnostic[camera].model,\n", + " x=selected_test_data[features].to_numpy(),\n", + " y=selected_test_data[cfg[\"Method\"][\"target_name\"]],\n", + " response_method=response_method,\n", + " ax=plt.gca(),\n", + " name=f\"{energy_edges[i]:.2f} TeV < E_reco < {energy_edges[i+1]:.2f} TeV\",\n", + " color=f\"C{i}\",\n", " )\n", "\n", - " PrecisionRecallDisplay.from_estimator(diagnostic[camera].model, \n", - " selected_test_data[features].to_numpy(),\n", - " selected_test_data[cfg[\"Method\"][\"target_name\"]],\n", - " response_method=response_method,\n", - " ax=plt.gca(),\n", - " name=opt[\"label\"])\n", - " \n", - "\n", - " plt.ylim(0,1)" + " plt.ylim(0, 1)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python (Protopipe-CTADIRAC)", "language": "python", - "name": "python3" + "name": "protopipe-ctadirac" }, "language_info": { "codemirror_mode": { From 0f281146cc12598e5682338f721594354389e125 Mon Sep 17 00:00:00 2001 From: Karl Kosack Date: Tue, 29 Mar 2022 15:53:50 +0200 Subject: [PATCH 2/3] added some more plots to show the training distributions --- .../benchmarks_MODELS_classification.ipynb | 121 +++++++++++++++++- 1 file changed, 118 insertions(+), 3 deletions(-) diff --git a/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb b/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb index 99ac8f74..74e5ccc5 100644 --- a/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb +++ b/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb @@ -337,6 +337,15 @@ " )" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LABELS = {0: \"Protons\", 1: \"Gammas\"}" + ] + }, { "cell_type": "markdown", "metadata": { @@ -394,6 +403,112 @@ " font_scale=seaborn_settings[\"theme\"][\"font_scale\"] if \"font_scale\" in seaborn_settings[\"theme\"] else 1.0)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Input Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for impact_dist_range in [[0, 3000], [0, 500], [1000, 3000]]:\n", + " fig, ax = plt.subplots(\n", + " len(cameras), 1, figsize=(12, 4 * len(cameras)), tight_layout=True, sharex=True\n", + " )\n", + " fig.suptitle(f\"Impact distance: {impact_dist_range}\")\n", + " for ii, camera in enumerate(cameras):\n", + " ax[ii].set_ylabel(\"events\")\n", + " ax[ii].set_title(f\"Training statistics: {camera}\")\n", + " d = data[camera][\"data_train\"].query(\n", + " f\"impact_dist > {impact_dist_range[0]} and impact_dist < {impact_dist_range[1]}\"\n", + " )\n", + "\n", + " d[\"log10_reco_energy\"].hist(\n", + " stacked=False, alpha=1, bins=50, range=[-2, 3], label=\"total\", ax=ax[ii]\n", + " )\n", + " d.groupby(\"label\")[\"log10_reco_energy\"].hist(\n", + " stacked=False,\n", + " alpha=1,\n", + " bins=50,\n", + " range=[-2, 3],\n", + " legend=True,\n", + " ax=ax[ii],\n", + " )\n", + " ax[ii].set_xlabel(\"$E^{\\gamma}_{\\mathrm{reco}}$\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "impact_bins = np.sqrt(np.linspace(0, 2000 ** 2, 100))\n", + "\n", + "for logebin in [[-2, 3], [-2, 1], [1.5, 3]]:\n", + "\n", + " fig, ax = plt.subplots(\n", + " len(cameras), 1, figsize=(12, 4 * len(cameras)), tight_layout=True, sharex=True\n", + " )\n", + " fig.suptitle(f\"logE = {logebin} TeV\")\n", + "\n", + " for ii, camera in enumerate(cameras):\n", + " ax[ii].set_ylabel(\"events\")\n", + " ax[ii].set_title(f\"Training statistics: {camera}\")\n", + " ax[ii].set_yscale(\"log\")\n", + " d = data[camera][\"data_train\"].query(\n", + " f\"log10_reco_energy>{logebin[0]} and log10_reco_energy<{logebin[1]}\"\n", + " )\n", + "\n", + " d.impact_dist.hist(\n", + " stacked=False, alpha=1, bins=impact_bins, label=\"total\", ax=ax[ii]\n", + " )\n", + " (\n", + " d.groupby(\"label\").impact_dist.hist(\n", + " stacked=False,\n", + " alpha=1,\n", + " bins=impact_bins,\n", + " legend=True,\n", + " ax=ax[ii],\n", + " )\n", + " )\n", + " ax[ii].set_xlabel(\"Impact Parameter \")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for camera in cameras:\n", + " fig, axs = plt.subplots(1, 2, figsize=(10,4), tight_layout=True)\n", + " fig.suptitle(camera)\n", + " d = data[camera][\"data_train\"].groupby(\"label\")\n", + " i=0\n", + " for name, group in d:\n", + " ax = axs[i]\n", + " ax.grid(False)\n", + " ax.set_aspect(1.0)\n", + " h,x,y,im = ax.hist2d(\n", + " x=group.reco_core_x,\n", + " y=group.reco_core_y,\n", + " bins=[100, 100],\n", + " range=[[-2000, 2000], [-2000, 2000]],\n", + " norm=mpl.colors.LogNorm(),\n", + " )\n", + " fig.colorbar(im, ax=ax)\n", + " ax.set_title(f\"{LABELS[name]}\")\n", + " i+=1" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -852,9 +967,9 @@ " continue\n", "\n", " PrecisionRecallDisplay.from_estimator(\n", - " estimator=diagnostic[camera].model,\n", - " x=selected_test_data[features].to_numpy(),\n", - " y=selected_test_data[cfg[\"Method\"][\"target_name\"]],\n", + " diagnostic[camera].model,\n", + " selected_test_data[features].to_numpy(),\n", + " selected_test_data[cfg[\"Method\"][\"target_name\"]],\n", " response_method=response_method,\n", " ax=plt.gca(),\n", " name=f\"{energy_edges[i]:.2f} TeV < E_reco < {energy_edges[i+1]:.2f} TeV\",\n", From 019314f9baa08934ac3480aad4f1fb06dfce8624 Mon Sep 17 00:00:00 2001 From: Karl Kosack Date: Tue, 29 Mar 2022 16:17:21 +0200 Subject: [PATCH 3/3] fixed missing import --- .../notebooks/MODELS/benchmarks_MODELS_classification.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb b/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb index 74e5ccc5..ec967b35 100644 --- a/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb +++ b/protopipe/benchmarks/notebooks/MODELS/benchmarks_MODELS_classification.ipynb @@ -93,6 +93,7 @@ "import matplotlib.patches as mpatches\n", "import matplotlib.pyplot as plt\n", "import matplotlib.style as style\n", + "from matplotlib.colors import LogNorm\n", "import numpy as np\n", "import pandas as pd\n", "import yaml\n", @@ -502,7 +503,7 @@ " y=group.reco_core_y,\n", " bins=[100, 100],\n", " range=[[-2000, 2000], [-2000, 2000]],\n", - " norm=mpl.colors.LogNorm(),\n", + " norm=LogNorm(),\n", " )\n", " fig.colorbar(im, ax=ax)\n", " ax.set_title(f\"{LABELS[name]}\")\n",