From 47b4e1b52f88436af682ab19f4119512f82ea70e Mon Sep 17 00:00:00 2001 From: vuillaut Date: Fri, 24 Jun 2022 13:51:06 +0200 Subject: [PATCH 1/2] correction of the astro ml --- machine-learning/astro_ml_corr.ipynb | 1381 ++++++++++++++++++++++++++ machine-learning/ml_func.py | 26 +- 2 files changed, 1406 insertions(+), 1 deletion(-) create mode 100644 machine-learning/astro_ml_corr.ipynb diff --git a/machine-learning/astro_ml_corr.ipynb b/machine-learning/astro_ml_corr.ipynb new file mode 100644 index 0000000..0b21eff --- /dev/null +++ b/machine-learning/astro_ml_corr.ipynb @@ -0,0 +1,1381 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ml_func as ml\n", + "\n", + "train_filename = \"escape_school_cta_data_train.h5\"\n", + "ml.download_file(\"https://nextcloud.e5.physik.tu-dortmund.de/index.php/s/Kz3EbfbMAzyRHEa/download?path=%2Fparticipants&files=escape_school_cta_data_train.h5\",\n", + " train_filename\n", + " )\n", + "\n", + "test_filename = \"escape_school_cta_data_test_no_labels.h5\"\n", + "ml.download_file(\"https://nextcloud.e5.physik.tu-dortmund.de/index.php/s/Kz3EbfbMAzyRHEa/download?path=%2Fparticipants&files=escape_school_cta_data_test_no_labels.h5\",\n", + " test_filename,\n", + " )\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CTA data machine learning challenge\n", + "\n", + "\n", + "Notebook stolen and modified from https://github.com/tudo-astroparticlephysics/machine-learning-lecture/blob/main/smd_boosting.ipynb\n", + "\n", + "main author: Maximilian Nöthe\n", + "\n", + "additional authors: Thomas Vuillaume, Michaël Dell'aiera, Martino Sorbaro" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The challenge\n", + "\n", + "In this exercise, you must train two models:\n", + "- one classifier to predict the type of particle\n", + "- one regressor to predict the energy of gamma events\n", + "\n", + "You will apply the methods seen previously during the course and try to find the best model with the best hyperparameters.\n", + "At the end, you can apply your model to a test dataset with unknown labels and send your prediction back to us. 
We will then evaluate it and rank the predictions.\n",
    "\n",
    "\n",
    "Rules:\n",
    "- You can only use models in scikit-learn.\n",
    "- The regression score will be computed using a logarithmic MSE (mean squared logarithmic error).\n",
    "- The classification score will be the accuracy.\n",
    "- You must make your pull request on the repository https://github.com/escape2020/school2022-mlctapred with your prediction before 10:30 today to be ranked.\n",
    "- Tutors can participate, but they can't win.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:05:27.643365Z",
     "start_time": "2018-11-27T15:05:26.177785Z"
    }
   },
   "outputs": [],
   "source": [
    "import ml_func as ml\n",
    "\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from matplotlib.colors import ListedColormap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:05:27.643365Z",
     "start_time": "2018-11-27T15:05:26.177785Z"
    }
   },
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = 10\n",
    "ml.set_plot_style()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# A Complete Example\n",
    "\n",
    "Below we load a dataset containing data from simulated CTA observations.\n",
    "\n",
    "We will perform the typical steps to build and evaluate a classifier:\n",
    "\n",
    "0. Understand where your data comes from\n",
    "\n",
    "1. Preprocessing\n",
    " * Drop constant values\n",
    " * Handle missing data\n",
    " * Feature generation\n",
    "\n",
    "2. Splitting\n",
    " * Split your data into training and evaluation sets\n",
    "\n",
    "3. Training\n",
    " * Train your classifier of choice.\n",
    "\n",
    "4. Evaluation\n",
    " * Evaluate the performance on the test data set.\n",
    " * If it is not good enough, go back to step 1.\n",
    "\n",
    "5. Physics\n",
    " * Check whether your data supports your hypothesis\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Get to know your data\n",
    "\n",
    "Cherenkov telescopes record short flashes of light produced by very-high-energy cosmic rays and photons hitting Earth's atmosphere.\n",
    "\n",
    "![](https://www.cta-observatory.org/wp-content/uploads/2016/05/cta47.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:00.589316Z",
     "start_time": "2018-11-27T15:07:00.584438Z"
    }
   },
   "outputs": [],
   "source": [
    "%%HTML\n",
    "\n",
    ""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will use machine learning for two tasks in this example:\n",
    "\n",
    " * Train a classifier to distinguish events induced by gamma rays from events induced by cosmic rays\n",
    " * Train a regressor to estimate the energy of the incoming primary particle.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Preprocess data\n",
    "\n",
    "A lot of preprocessing has already happened at this point:\n",
    "\n",
    "* Calibration of raw data\n",
    "* Data reduction from a voltage time series per pixel to the number of photons and the mean arrival time in each pixel\n",
    "* Calculation of image features\n",
    "* Geometrical reconstruction of the shower geometry\n",
    "\n",
    "\n",
    "Load the data, remove the unwanted columns, and store the true labels separately."
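,
    "\n",
    "Once the data is loaded, a quick way to eyeball the dataframe after each preprocessing step is plain pandas (shown here only as a hint):\n",
    "\n",
    "```python\n",
    "train_df.head()      # first few rows\n",
    "train_df.info()      # dtypes and non-null counts per column\n",
    "train_df.describe()  # summary statistics, including the std used below\n",
    "```"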
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.read_hdf(train_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(train_df.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check the data types of the columns. We could select the non-numeric types and encode them, but in this case we might as well drop them, as these attributes are not important."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:02.075296Z",
     "start_time": "2018-11-27T15:07:02.057255Z"
    }
   },
   "outputs": [],
   "source": [
    "c = train_df.select_dtypes(exclude=['number']).columns\n",
    "print('Removed columns:', c.values)\n",
    "\n",
    "train_df = train_df.drop(c, axis='columns')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can spot the columns with constant values by looking at the standard deviation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "desc = train_df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:02.547778Z",
     "start_time": "2018-11-27T15:07:02.532415Z"
    }
   },
   "outputs": [],
   "source": [
    "c = desc.columns[desc.loc['std'] == 0]\n",
    "print('Removed columns:', c.values)\n",
    "train_df = train_df.drop(c, axis='columns')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Drop the columns in which all rows are NaN."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = train_df.columns[train_df.count() == 0]\n",
    "print('Removed columns:', c.values)\n",
    "train_df = train_df.drop(c, axis='columns')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check for missing data (here we simply drop the rows that contain it)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:02.594941Z",
     "start_time": "2018-11-27T15:07:02.551401Z"
    }
   },
   "outputs": [],
   "source": [
    "print(len(train_df))\n",
    "train_df = train_df.dropna()\n",
    "print(len(train_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:02.608429Z",
     "start_time": "2018-11-27T15:07:02.597366Z"
    }
   },
   "outputs": [],
   "source": [
    "def preprocess(df):\n",
    "    \"\"\"All the preprocessing steps in one function.\"\"\"\n",
    "    # drop non-numeric columns\n",
    "    c = df.select_dtypes(exclude=['number']).columns\n",
    "    df = df.drop(c, axis='columns')\n",
    "    \n",
    "    # drop columns that contain no data at all\n",
    "    c = df.columns[df.count() == 0]\n",
    "    df = df.drop(c, axis='columns')\n",
    "    \n",
    "    # drop constant columns\n",
    "    desc = df.describe()\n",
    "    c = desc.columns[desc.loc['std'] == 0]\n",
    "    df = df.drop(c, axis='columns')\n",
    "    \n",
    "    # drop rows with missing values\n",
    "    df = df.dropna()\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can perform feature generation. We use our expert knowledge or intuition to create a new feature by combining existing columns into a new variable. For example, gamma-ray showers tend to produce narrower and more regular images than hadronic ones, so combinations of the Hillas width, length and intensity can carry extra separation power."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:03.481076Z",
     "start_time": "2018-11-27T15:07:03.471131Z"
    }
   },
   "outputs": [],
   "source": [
    "def feature_generation(df):\n",
    "    df['awesome_feature'] = df['hillas_intensity'] * (df['hillas_width'] / df['hillas_length'])\n",
    "    \n",
    "    # distance of the impact point to the telescope\n",
    "    df['impact'] = np.sqrt(\n",
    "        (df['HillasReconstructor_core_x'] - df['pos_x'])**2\n",
    "        + (df['HillasReconstructor_core_y'] - df['pos_y'])**2\n",
    "    )\n",
    "    return df\n",
    "\n",
    "train_df = feature_generation(train_df)\n",
    "\n",
    "train_df[['awesome_feature', 'impact']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. A quick look at the data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First we can divide the data into two datasets, based on the type of primary particle: gammas or protons."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gammas = train_df[train_df.true_shower_primary_id==0]\n",
    "protons = train_df[train_df.true_shower_primary_id==101]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:03.829801Z",
     "start_time": "2018-11-27T15:07:03.484278Z"
    }
   },
   "outputs": [],
   "source": [
    "# a few binning options to experiment with; only the last, uncommented one is used\n",
    "# bins = np.linspace(-20, 20, 100)\n",
    "# bins = np.logspace(0, 1, 100)\n",
    "# bins = 100\n",
    "# bins = np.arange(0, 10) - 0.5\n",
    "bins = np.geomspace(1e3, 30e3, 51)\n",
    "\n",
    "col = 'HillasReconstructor_h_max'\n",
    "\n",
    "plt.figure()\n",
    "plt.hist(gammas[col], bins=bins, histtype='step', lw=2, label='Gammas', density=True)\n",
    "plt.hist(protons[col], bins=bins, histtype='step', lw=2, label='Protons', density=True)\n",
    "plt.xscale('log')\n",
    "plt.xlabel(col)\n",
    "plt.legend()\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exercise\n",
    "You may plot other histograms or scatter plots to get a better grasp of the data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Train a regressor to reconstruct the energy of gamma events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we select the training features by removing columns with `true_` values coming from the simulation\n",
    "training_features = [name for name in list(gammas.columns) if not name.startswith('true_')]\n",
    "predict_feature = 'true_energy'  # the simulated energy column\n",
    "\n",
    "# the event identifiers are not physical features, so remove them as well\n",
    "training_features.remove('obs_id')\n",
    "training_features.remove('event_id')\n",
    "training_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Implement the train / test split for X, y\n",
    "\n",
    "X = gammas[training_features]\n",
    "y = gammas[predict_feature]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Divide your dataset into train and validation sets\n",
    "1. Fit a `DecisionTree` to do the regression of the energy on gammas\n",
    "1. Apply the trained model to the validation dataset\n",
    "1. Evaluate the goodness of your prediction using a logarithmic MSE\n",
    "1. Extra: How can we improve that regression? (one idea is sketched right below)"
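,
    "\n",
    "A sketch for the extra question (not the official solution; it assumes the `X_train`, `X_val`, `y_train`, `y_val` split and the `DecisionTreeRegressor` import from the cells just below): since the true energies span several orders of magnitude, regressing $\\log_{10}(E)$ and transforming back is often more stable:\n",
    "\n",
    "```python\n",
    "reg_log = DecisionTreeRegressor(max_depth=10)\n",
    "reg_log.fit(X_train, np.log10(y_train))   # fit the log-energy\n",
    "y_predict = 10 ** reg_log.predict(X_val)  # back to TeV, always positive\n",
    "```"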
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_log_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = gammas[training_features]\n",
    "y = gammas[predict_feature]\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(X, y)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg = DecisionTreeRegressor()\n",
    "reg.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_predict = reg.predict(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_squared_log_error(y_val, y_predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(6,6))\n",
    "plt.hist2d(y_val, y_predict, bins=np.logspace(-2,2,100));\n",
    "plt.plot((0, 100), (0, 100), color='red', ls='--')\n",
    "plt.axis('equal')\n",
    "plt.xscale('log')\n",
    "plt.yscale('log')\n",
    "plt.xlabel('True Energy / TeV')\n",
    "plt.ylabel('Reco Energy / TeV')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Train a classifier to do gamma/hadron separation\n",
    "\n",
    "Data preparation for gamma/hadron classification.\n",
    "\n",
    "Exercise: split the data into train / validation sets using sklearn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "At this point we use the full feature matrix and build a binary label vector $y$ (1 for gammas, 0 for protons)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:03.895879Z",
     "start_time": "2018-11-27T15:07:03.832960Z"
    }
   },
   "outputs": [],
   "source": [
    "X = train_df[training_features]\n",
    "# derive the labels from the particle id itself, so that we do not rely\n",
    "# on all gammas coming before all protons in train_df\n",
    "y = (train_df['true_shower_primary_id'] == 0).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:03.983514Z",
     "start_time": "2018-11-27T15:07:03.898708Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the classifier\n",
    "\n",
    "Now we can train any classifier we want on the prepared data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:13:04.181685Z",
     "start_time": "2018-11-27T15:13:02.875532Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "classifier = DecisionTreeClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Train the `DecisionTreeClassifier`\n",
    "2. 
This time get the prediction **proba** on the validation dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reco_type = classifier.predict(X_val)\n",
    "proba = classifier.predict_proba(X_val)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Classifier Evaluation\n",
    "\n",
    "Check the accuracy of the model, along with other metrics."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "importance = pd.Series(classifier.feature_importances_, index=training_features)\n",
    "\n",
    "# show the 20 most important features\n",
    "plt.figure()\n",
    "importance.sort_values().tail(20).plot.barh()\n",
    "plt.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:13:04.199205Z",
     "start_time": "2018-11-27T15:13:04.183884Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score\n",
    "\n",
    "acc = accuracy_score(y_val, reco_type)\n",
    "auc = roc_auc_score(y_val, proba[:,1])\n",
    "fpr, tpr, thresholds = roc_curve(y_val, proba[:,1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:13:46.845086Z",
     "start_time": "2018-11-27T15:13:46.501506Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "plt.scatter(fpr, tpr, c=thresholds, vmax=1)\n",
    "plt.xlabel('FPR')\n",
    "plt.ylabel('TPR')\n",
    "plt.gca().set_aspect(1)\n",
    "plt.plot(fpr, tpr, '--', color='gray', alpha=0.5)\n",
    "plt.text(0.5, 0.5, f'ROC AUC: {auc:0.03f} \\nAccuracy: {acc:0.03f}')\n",
    "plt.colorbar()\n",
    "None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. 
Redo, using cross-validation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Energy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.metrics import make_scorer\n",
    "\n",
    "max_depth = [5, 10, 20, 40, 100, 500]\n",
    "min_samples_leaf = [2, 20, 40]\n",
    "\n",
    "param_grid = {\"max_depth\": max_depth, \"min_samples_leaf\": min_samples_leaf}\n",
    "\n",
    "# wrap the logarithmic MSE in a scorer; greater_is_better=False negates it,\n",
    "# so that GridSearchCV can keep maximizing the score\n",
    "mse = make_scorer(mean_squared_log_error, greater_is_better=False)\n",
    "\n",
    "reg_grid_search = GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),\n",
    "                               scoring=mse,\n",
    "                               param_grid=param_grid,\n",
    "                               return_train_score=True,\n",
    "                               n_jobs=-1  # use all cores for training in parallel\n",
    "                               )\n",
    "\n",
    "reg_grid_search.fit(gammas[training_features], gammas['true_energy'])\n",
    "\n",
    "print(\"Best parameter set:\", reg_grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# best mean cross-validated score (negative MSLE, because greater_is_better=False)\n",
    "reg_grid_search.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):\n",
    "    # Get the mean and std of the test scores for each grid point.\n",
    "    # cv_results_ varies the second parameter fastest, so reshape to\n",
    "    # (len(grid_param_1), len(grid_param_2)) and transpose.\n",
    "    scores_mean = cv_results['mean_test_score']\n",
    "    scores_mean = np.array(scores_mean).reshape(len(grid_param_1), len(grid_param_2)).T\n",
    "\n",
    "    scores_sd = cv_results['std_test_score']\n",
    "    scores_sd = np.array(scores_sd).reshape(len(grid_param_1), len(grid_param_2)).T\n",
    "\n",
    "    # Plot grid search scores\n",
    "    _, ax = plt.subplots(1, 1, figsize=(8, 5))\n",
    "\n",
    "    # Param 1 is the x-axis, param 2 is represented as a different curve (colored line)\n",
    "    for idx, val in enumerate(grid_param_2):\n",
    "        ax.errorbar(grid_param_1, scores_mean[idx,:], yerr=scores_sd[idx,:],\n",
    "                    fmt='--o', label= name_param_2 + ': ' + str(val), lw=1)\n",
    "\n",
    "    ax.set_title(\"Grid Search Scores\", fontsize=20, fontweight='bold')\n",
    "    ax.set_xlabel(name_param_1, fontsize=16)\n",
    "    ax.set_ylabel('CV Average Score', fontsize=16)\n",
    "    ax.legend(loc=\"best\", fontsize=15)\n",
    "    ax.grid(True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_grid_search(reg_grid_search.cv_results_, max_depth, min_samples_leaf, 'max_depth', 'min_samples_leaf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Particle Type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_depth = [5, 10, 20, 40]\n",
    "min_samples_leaf = [2, 20, 40]\n",
    "\n",
    "param_grid = {\"max_depth\": max_depth, \"min_samples_leaf\": min_samples_leaf}\n",
    "\n",
    "class_grid_search = GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),\n",
    "                                 param_grid=param_grid,\n",
    "                                 scoring=make_scorer(accuracy_score),\n",
    "                                 return_train_score=True,\n",
    "                                 n_jobs=-1\n",
    "                                 )\n",
    "class_grid_search.fit(train_df[training_features], train_df['true_shower_primary_id'])\n",
    "\n",
    "print(\"Best parameter set:\", class_grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# mean cross-validated accuracy of the best parameter combination\n",
    "class_grid_search.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_grid_search(class_grid_search.cv_results_, max_depth, min_samples_leaf, 'max_depth', 'min_samples_leaf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Apply your best model to the test dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Final training on the full training dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg = DecisionTreeRegressor(**reg_grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier = DecisionTreeClassifier(**class_grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the energy regressor is trained on gammas only, consistently with the grid search above\n",
    "reg.fit(gammas[training_features], gammas[predict_feature])\n",
    "classifier.fit(train_df[training_features], train_df['true_shower_primary_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = pd.read_hdf(test_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = preprocess(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = feature_generation(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df['reco_energy'] = reg.predict(test_df[training_features])\n",
    "test_df['reco_type'] = classifier.predict(test_df[training_features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml.save_prediction(test_df, 'thomas')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ⏱ SEND IT ⏱ "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### NOTE: If you have followed this notebook, you have made a \"mono\" prediction (per telescope).\n",
    "Extra exercise: combine the mono predictions with the method of your choice (a sketch is given below)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Physics\n",
    "\n",
    "Now we could test our model and our hypothesis on real observed data. This part of the analysis is generally the most time-consuming. It also requires more data than this notebook can handle.\n",
    "After careful analysis one can produce an image of the gamma-ray sky\n",
    "\n",
    ""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## To go further (see below)\n",
    "\n",
    "- use a more complex model, such as random forests\n",
    "- train to predict the log of the energy (sketched above)\n",
    "- add new features\n",
    "- combine the predictions from each telescope for one event (group by `['obs_id', 'event_id']`, as sketched above)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Extra: Improving Classification\n",
    "\n",
    "\n",
    "### Boosting and AdaBoost\n",
    "\n",
    "Similar to the idea of combining many classifiers through bagging (as in random forests), we now \n",
    "train many estimators in a sequential manner. In each iteration the data gets modified slightly using weights $w$\n",
    "for each sample in the training data. In the first iteration the weights are all set to $w=1$.\n",
    "\n",
    "In each successive iteration the weights are updated. The samples that were incorrectly classified in the previous \n",
    "iteration get a higher weight, while the weights of correctly classified samples are decreased. \n",
    "In other words: we increase the influence/importance of samples that are difficult to classify.\n",
    "\n",
    "Predictions are performed by taking a weighted average of the single predictors.\n",
    "\n",
    "The popular AdaBoost algorithm takes this a step further by optimizing the weight of each separate classifier \n",
    "in the ensemble.\n",
    "The AdaBoost ensemble combines many learners in an iterative way. The learner at iteration $m$ is:\n",
    "\n",
    "$$\n",
    "    F_{m}(x)=F_{m-1}(x)+\\gamma _{m}h_{m}(x)\n",
    "$$\n",
    "\n",
    "The choice of $F_0$ is problem specific.\n",
    "\n",
    "Each weak learner produces a prediction $h_{m}(x_{i})$ for each sample in the training set. At each iteration $m$ a \n",
    "weak learner is fitted and assigned a coefficient $\\gamma_{m}$ which is found by minimizing:\n",
    "\n",
    "$$\n",
    "\\gamma_m = {\\underset {\\gamma }{\\arg \\min }} \\sum_{i}^{N}E\\bigl(F_{m-1}(x_{i})+\\gamma h_{m}(x_{i})\\bigr)\n",
    "$$\n",
    "\n",
    "where $E(F)$ is some error function and $x_i$ is the reweighted data sample.\n",
    "\n",
    "In general this method can work with any base classifier. Traditionally it is used with very small \n",
    "decision trees.\n",
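    "\n",
    "In scikit-learn such a weak learner is simply a *decision stump*, i.e. a tree restricted to a single split (a sketch, reusing the classifier class imported earlier):\n",
    "\n",
    "```python\n",
    "stump = DecisionTreeClassifier(max_depth=1)  # one split per weak learner\n",
    "```\n",
    "\n",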
    "The weights are used to select the split points during the minimization of the loss function in each node:\n",
    "\n",
    "$$\n",
    "    \\underset{(X, s) \\in \\, \\mathbf{X} \\times {S}}{\\arg \\max} IG(X,Y) = \\underset{(X, s) \\in \\, \\mathbf{X} \\times {S}}{\\arg \\max} ( H(Y) - H(Y |\\, X) ).\n",
    "$$\n",
    "\n",
    "Below we try AdaBoost on the CTA data.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:49:09.247110Z",
     "start_time": "2018-11-27T15:48:58.872812Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "\n",
    "ada = AdaBoostClassifier(\n",
    "    base_estimator=DecisionTreeClassifier(max_depth=2),\n",
    "    n_estimators=100,\n",
    "    learning_rate=0.5,\n",
    ")\n",
    "ada.fit(X_train, y_train)\n",
    "\n",
    "# evaluate on the validation split created earlier\n",
    "y_prediction = ada.predict(X_val)\n",
    "y_prediction_proba = ada.predict_proba(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:49:10.072785Z",
     "start_time": "2018-11-27T15:49:09.249195Z"
    }
   },
   "outputs": [],
   "source": [
    "scores = np.array(list(ada.staged_score(X_val, y_val)))\n",
    "\n",
    "plt.figure()\n",
    "plt.plot(scores, '.')\n",
    "plt.ylabel('Accuracy')\n",
    "plt.xlabel('Iteration')\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:49:10.727863Z",
     "start_time": "2018-11-27T15:49:10.075262Z"
    }
   },
   "outputs": [],
   "source": [
    "acc = accuracy_score(y_val, y_prediction)\n",
    "auc = roc_auc_score(y_val, y_prediction_proba[:, 1])\n",
    "fpr, tpr, thresholds = roc_curve(y_val, y_prediction_proba[:, 1])\n",
    "\n",
    "plt.figure()\n",
    "plt.scatter(fpr, tpr, c=thresholds)\n",
    "plt.plot(fpr, tpr, '--', color='gray', alpha=0.5)\n",
    "plt.text(0.5, 0.5, f'ROC AUC: {auc:0.03f} \\nAccuracy: {acc:0.03f}')\n",
    "None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Gradient Boosting\n",
    "\n",
    "Very similar to AdaBoost. 
This time, however, each new estimator is fitted to a modified target rather than to reweighted samples.\n",
    "\n",
    "Formulate the general problem as follows (see Wikipedia):\n",
    "\n",
    "Start with a constant function $F_{0}(x)$ and some differentiable loss function $L$ and incrementally expand it in a greedy fashion:\n",
    "\n",
    "$$\n",
    "F_{0}(x)={\\underset {\\gamma }{\\arg \\min }}{\\sum _{i=1}^{n}{L(y_{i},\\gamma )}}\n",
    "$$\n",
    "\n",
    "$$\n",
    "F_{m}(x)=F_{m-1}(x)+{\\underset {h_{m}\\in {\\mathcal {H}}}{\\operatorname {arg\\,min} }}\\left[{\\sum _{i=1}^{n}{L(y_{i},F_{m-1}(x_{i})+h_{m}(x_{i}))}}\\right]\n",
    "$$\n",
    "\n",
    "Finding the best $ h_{m}\\in {\\mathcal {H}}$ at each step is computationally infeasible in general.\n",
    "If we could find the perfect $h$ however, we know that \n",
    "\n",
    "$$\n",
    "F_{m+1}(x_i)=F_{m}(x_i)+h(x_i)=y_i\n",
    "$$\n",
    "\n",
    "or, equivalently, \n",
    "\n",
    "$$\n",
    "    h(x_i)= y_i - F_{m}(x_i)\n",
    "$$\n",
    "\n",
    "Note that for the mean squared error loss $\\frac{1}{2}(y_i - F(x_i))^2$ this is exactly the negative \n",
    "gradient of the loss with respect to $F(x_i)$.\n",
    "\n",
    "For a general loss function we fit $h_{m}(x)$ to the residuals, or negative gradients \n",
    "$$\n",
    "    r_{i, m}=-\\left[{\\frac {\\partial L(y_{i},F(x_{i}))}{\\partial F(x_{i})}}\\right]_{F(x)=F_{m-1}(x)}\\quad {\\mbox{for }}i=1,\\ldots ,n.\n",
    "$$\n",
    "\n",
    "\n",
    "\n",
    "Below we try it on CTA data again.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T16:56:38.715276Z",
     "start_time": "2018-11-27T16:56:29.657159Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "\n",
    "grb = GradientBoostingClassifier(\n",
    "    verbose=True,\n",
    "    n_estimators=300,\n",
    ")\n",
    "grb.fit(X_train, y_train)\n",
    "\n",
    "# again, evaluate on the validation split\n",
    "y_prediction = grb.predict(X_val)\n",
    "y_prediction_proba = grb.predict_proba(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T16:56:39.343691Z",
     "start_time": "2018-11-27T16:56:38.718176Z"
    }
   },
   "outputs": [],
   "source": [
    "accuracies = [accuracy_score(y_val, y_pred) for y_pred in grb.staged_predict(X_val)]\n",
    "\n",
    "plt.figure()\n",
    "plt.plot(range(len(accuracies)), accuracies, '.')\n",
    "plt.ylabel('Accuracy')\n",
    "plt.xlabel('Iteration')\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T16:56:40.215880Z",
     "start_time": "2018-11-27T16:56:39.346460Z"
    }
   },
   "outputs": [],
   "source": [
    "acc = accuracy_score(y_val, y_prediction)\n",
    "auc = roc_auc_score(y_val, y_prediction_proba[:, 1])\n",
    "fpr, tpr, thresholds = roc_curve(y_val, y_prediction_proba[:, 1])\n",
    "\n",
    "plt.figure()\n",
    "plt.scatter(fpr, tpr, c=thresholds)\n",
    "plt.plot(fpr, tpr, '--', color='gray', alpha=0.5)\n",
    "plt.text(0.5, 0.5, f'ROC AUC: {auc:0.03f} \\nAccuracy: {acc:0.03f}')\n",
    "None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "More on gradient descent algorithms can be found in the Neural Network lecture.\n",
    "\n",
    "Let's now test our all-time favorite classifier: the random forest."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T16:58:57.873659Z",
     "start_time": "2018-11-27T16:58:45.510336Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rf = RandomForestClassifier(n_estimators=150, max_depth=18, criterion='entropy')\n",
    "rf.fit(X_train, y_train)\n",
    "\n",
    "y_prediction = rf.predict(X_val)\n",
    "y_prediction_proba = rf.predict_proba(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T16:58:58.442111Z",
     "start_time": "2018-11-27T16:58:57.875736Z"
    }
   },
   "outputs": [],
   "source": [
    "acc = accuracy_score(y_val, y_prediction)\n",
    "auc = roc_auc_score(y_val, y_prediction_proba[:, 1])\n",
    "fpr, tpr, thresholds = roc_curve(y_val, y_prediction_proba[:, 1])\n",
    "\n",
    "plt.figure()\n",
    "plt.scatter(fpr, tpr, c=thresholds)\n",
    "plt.plot(fpr, tpr, '--', color='gray', alpha=0.5)\n",
    "plt.text(0.5, 0.5, f'ROC AUC: {auc:0.03f} \\nAccuracy: {acc:0.03f}')\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "execution": {
   "allow_errors": true,
   "timeout": 300
  },
  "kernelspec": {
   "display_name": "eschool2022",
   "language": "python",
   "name": "eschool2022"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
diff --git a/machine-learning/ml_func.py b/machine-learning/ml_func.py
index 42d533e..a0d357f 100644
--- a/machine-learning/ml_func.py
+++ b/machine-learning/ml_func.py
@@ -2,7 +2,8 @@
 from pathlib import Path
 import requests
 import sys
-
+from sklearn.metrics import mean_squared_log_error, accuracy_score, mean_squared_error
+import pandas as pd
 
 
 def save_prediction(test_df, yourname):
@@ -40,3 +41,26 @@ def set_plot_style():
     plt.rcParams["axes.spines.right"] = False
+
+
+def scoring(test_file):
+    test_labels = pd.read_hdf('escape_school_cta_data_test.h5')
+    prediction = pd.read_hdf(test_file)
+    df = pd.merge(test_labels, prediction, on=['obs_id', 'event_id'])
+    gammas = df[df['true_shower_primary_id']==0]
+    mse = mean_squared_log_error(gammas['true_energy'], gammas['reco_energy'])
+    # mse = mean_squared_error(df['true_energy'], df['reco_energy'])
+    acc = accuracy_score(df['true_shower_primary_id'], df['reco_type'])
+    return mse, acc
+
+def ranking(directory):
+    # DataFrame.append is deprecated, so collect the rows first and build the table once
+    rows = []
+    for file in Path(directory).glob('*.h5'):
+        participant_name = file.stem.replace('pred_', '')
+        mse, acc = scoring(file)
+        rows.append({'name': participant_name, 'mse': mse, 'acc': acc})
+    scores = pd.DataFrame(rows, columns=['name', 'mse', 'acc'])
+    scores = scores.sort_values(['mse', 'acc'])
+    return scores
+    
\ No newline at end of file

From 527b90f6062985c4d1bb8a53230ec186241e32a8 Mon Sep 17 00:00:00 2001
From: vuillaut
Date: Fri, 24 Jun 2022 13:53:32 +0200
Subject: [PATCH 2/2] add ranking notebook

---
 machine-learning/challenge_ranking.ipynb | 89 ++++++++++++++++++++++++
machine-learning/ml_func.py | 1 - 2 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 machine-learning/challenge_ranking.ipynb diff --git a/machine-learning/challenge_ranking.ipynb b/machine-learning/challenge_ranking.ipynb new file mode 100644 index 0000000..b344d09 --- /dev/null +++ b/machine-learning/challenge_ranking.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "59d7b12e-df49-4398-9597-32dabd4acdde", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce83b3ed-aa69-4680-b53b-cf741d1b3a16", + "metadata": {}, + "outputs": [], + "source": [ + "import ml_func as ml\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4a04bfa-991f-41fa-a6bd-c9c8512f0c2f", + "metadata": {}, + "outputs": [], + "source": [ + "rank = ml.ranking('../school2022-mlctapred/')\n", + "rank" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f636fcb-40d9-4fcb-b723-261a94613110", + "metadata": {}, + "outputs": [], + "source": [ + "# Best at classification\n", + "rank.sort_values(by='acc', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab05b17e-3c6f-4adb-8073-916022d0d83f", + "metadata": {}, + "outputs": [], + "source": [ + "# Best at regression\n", + "rank.sort_values(by='mse', ascending=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "830babb0-6031-45ec-8f63-27b1ff51115d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "eschool2022", + "language": "python", + "name": "eschool2022" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/machine-learning/ml_func.py b/machine-learning/ml_func.py index a0d357f..16c7f3c 100644 --- a/machine-learning/ml_func.py +++ b/machine-learning/ml_func.py @@ -49,7 +49,6 @@ def scoring(test_file): df = pd.merge(test_labels, prediction, on=['obs_id', 'event_id']) gammas = df[df['true_shower_primary_id']==0] mse = mean_squared_log_error(gammas['true_energy'], gammas['reco_energy']) - # mse = mean_squared_error(df['true_energy'], df['reco_energy']) acc = accuracy_score(df['true_shower_primary_id'], df['reco_type']) return mse, acc