From 47b4e1b52f88436af682ab19f4119512f82ea70e Mon Sep 17 00:00:00 2001 From: vuillaut Date: Fri, 24 Jun 2022 13:51:06 +0200 Subject: [PATCH 1/2] correction of the astro ml --- machine-learning/astro_ml_corr.ipynb | 1381 ++++++++++++++++++++++++++ machine-learning/ml_func.py | 26 +- 2 files changed, 1406 insertions(+), 1 deletion(-) create mode 100644 machine-learning/astro_ml_corr.ipynb diff --git a/machine-learning/astro_ml_corr.ipynb b/machine-learning/astro_ml_corr.ipynb new file mode 100644 index 0000000..0b21eff --- /dev/null +++ b/machine-learning/astro_ml_corr.ipynb @@ -0,0 +1,1381 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ml_func as ml\n", + "\n", + "train_filename = \"escape_school_cta_data_train.h5\"\n", + "ml.download_file(\"https://nextcloud.e5.physik.tu-dortmund.de/index.php/s/Kz3EbfbMAzyRHEa/download?path=%2Fparticipants&files=escape_school_cta_data_train.h5\",\n", + " train_filename\n", + " )\n", + "\n", + "test_filename = \"escape_school_cta_data_test_no_labels.h5\"\n", + "ml.download_file(\"https://nextcloud.e5.physik.tu-dortmund.de/index.php/s/Kz3EbfbMAzyRHEa/download?path=%2Fparticipants&files=escape_school_cta_data_test_no_labels.h5\",\n", + " test_filename,\n", + " )\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CTA data machine learning challenge\n", + "\n", + "\n", + "Notebook stolen and modified from https://github.com/tudo-astroparticlephysics/machine-learning-lecture/blob/main/smd_boosting.ipynb\n", + "\n", + "main author: Maximilian Nöthe\n", + "\n", + "additional authors: Thomas Vuillaume, Michaël Dell'aiera, Martino Sorbaro" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The challenge\n", + "\n", + "In this exercise, you must train two models:\n", + "- one classifier to predict the type of particle\n", + "- one regressor to predict the energy of gamma events\n", + "\n", + "You will apply the methods seen previously during the course and try to find the best model with the best hyperparameters.\n", + "At the end, you can apply your model to a test dataset with unknown labels and send your prediction back to us. 
We will then evaluate it and rank the predictions.\n",
    "\n",
    "\n",
    "Rules:\n",
    "- You can only use models in scikit-learn.\n",
    "- The regression score will be computed using a logarithmic MSE (mean squared logarithmic error).\n",
    "- The classification score will be the accuracy.\n",
    "- You must make your pull request on the repository https://github.com/escape2020/school2022-mlctapred with your prediction before 10:30 today to be ranked.\n",
    "- Tutors can participate, but they can't win.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:05:27.643365Z",
     "start_time": "2018-11-27T15:05:26.177785Z"
    }
   },
   "outputs": [],
   "source": [
    "import ml_func as ml\n",
    "\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from matplotlib.colors import ListedColormap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:05:27.643365Z",
     "start_time": "2018-11-27T15:05:26.177785Z"
    }
   },
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = 10\n",
    "ml.set_plot_style()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# A Complete Example\n",
    "\n",
    "Below we load a dataset containing data from simulated CTA observations.\n",
    "\n",
    "We will perform the typical steps to build and evaluate a classifier:\n",
    "\n",
    "0. Understand where your data comes from\n",
    "\n",
    "1. Preprocessing\n",
    " * Drop constant values\n",
    " * Handle missing data\n",
    " * Feature generation\n",
    "\n",
    "2. Splitting\n",
    " * Split your data into training and evaluation sets\n",
    "\n",
    "3. Training\n",
    " * Train your classifier of choice.\n",
    "\n",
    "4. Evaluation\n",
    " * Evaluate the performance on the test data set.\n",
    " * If it is not good enough, go back to step 1.\n",
    "\n",
    "5. Physics\n",
    " * Check whether your data supports your hypothesis\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Get to know your data\n",
    "\n",
    "Cherenkov telescopes record short flashes of light produced by very-high-energy cosmic rays and photons hitting Earth's atmosphere.\n",
    "\n",
    "![](https://www.cta-observatory.org/wp-content/uploads/2016/05/cta47.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:00.589316Z",
     "start_time": "2018-11-27T15:07:00.584438Z"
    }
   },
   "outputs": [],
   "source": [
    "%%HTML\n",
    "\n",
    ""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will use machine learning for two tasks in this example:\n",
    "\n",
    " * Train a classifier to distinguish events induced by gamma rays from events induced by cosmic rays\n",
    " * Train a regressor to estimate the energy of the incoming primary particle.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Preprocess data\n",
    "\n",
    "A lot of preprocessing has already happened at this point:\n",
    "\n",
    "* Calibration of raw data\n",
    "* Data reduction from a voltage time series per pixel to the number of photons and the mean arrival time in each pixel\n",
    "* Calculation of image features\n",
    "* Geometrical reconstruction of the shower geometry\n",
    "\n",
    "\n",
    "Load the data, remove the unwanted columns, and store the true labels separately."
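,
    "\n",
    "Once the data is loaded, a quick way to eyeball the dataframe after each preprocessing step is plain pandas (shown here only as a hint):\n",
    "\n",
    "```python\n",
    "train_df.head()      # first few rows\n",
    "train_df.info()      # dtypes and non-null counts per column\n",
    "train_df.describe()  # summary statistics, including the std used below\n",
    "```"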
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.read_hdf(train_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(train_df.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check the data types of the columns. We could select the non-numeric types and encode them, but in this case we might as well drop them, as these attributes are not important."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:02.075296Z",
     "start_time": "2018-11-27T15:07:02.057255Z"
    }
   },
   "outputs": [],
   "source": [
    "c = train_df.select_dtypes(exclude=['number']).columns\n",
    "print('Removed columns:', c.values)\n",
    "\n",
    "train_df = train_df.drop(c, axis='columns')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can spot the columns with constant values by looking at the standard deviation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "desc = train_df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:02.547778Z",
     "start_time": "2018-11-27T15:07:02.532415Z"
    }
   },
   "outputs": [],
   "source": [
    "c = desc.columns[desc.loc['std'] == 0]\n",
    "print('Removed columns:', c.values)\n",
    "train_df = train_df.drop(c, axis='columns')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Drop the columns in which all rows are NaN."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = train_df.columns[train_df.count() == 0]\n",
    "print('Removed columns:', c.values)\n",
    "train_df = train_df.drop(c, axis='columns')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check for missing data (here we simply drop the rows that contain it)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:02.594941Z",
     "start_time": "2018-11-27T15:07:02.551401Z"
    }
   },
   "outputs": [],
   "source": [
    "print(len(train_df))\n",
    "train_df = train_df.dropna()\n",
    "print(len(train_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:02.608429Z",
     "start_time": "2018-11-27T15:07:02.597366Z"
    }
   },
   "outputs": [],
   "source": [
    "def preprocess(df):\n",
    "    \"\"\"All the preprocessing steps in one function.\"\"\"\n",
    "    # drop non-numeric columns\n",
    "    c = df.select_dtypes(exclude=['number']).columns\n",
    "    df = df.drop(c, axis='columns')\n",
    "    \n",
    "    # drop columns that contain no data at all\n",
    "    c = df.columns[df.count() == 0]\n",
    "    df = df.drop(c, axis='columns')\n",
    "    \n",
    "    # drop constant columns\n",
    "    desc = df.describe()\n",
    "    c = desc.columns[desc.loc['std'] == 0]\n",
    "    df = df.drop(c, axis='columns')\n",
    "    \n",
    "    # drop rows with missing values\n",
    "    df = df.dropna()\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can perform feature generation. We use our expert knowledge or intuition to create a new feature by combining existing columns into a new variable. For example, gamma-ray showers tend to produce narrower and more regular images than hadronic ones, so combinations of the Hillas width, length and intensity can carry extra separation power."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:03.481076Z",
     "start_time": "2018-11-27T15:07:03.471131Z"
    }
   },
   "outputs": [],
   "source": [
    "def feature_generation(df):\n",
    "    df['awesome_feature'] = df['hillas_intensity'] * (df['hillas_width'] / df['hillas_length'])\n",
    "    \n",
    "    # distance of the impact point to the telescope\n",
    "    df['impact'] = np.sqrt(\n",
    "        (df['HillasReconstructor_core_x'] - df['pos_x'])**2\n",
    "        + (df['HillasReconstructor_core_y'] - df['pos_y'])**2\n",
    "    )\n",
    "    return df\n",
    "\n",
    "train_df = feature_generation(train_df)\n",
    "\n",
    "train_df[['awesome_feature', 'impact']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. A quick look at the data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First we can divide the data into two datasets, based on the type of primary particle: gammas or protons."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gammas = train_df[train_df.true_shower_primary_id==0]\n",
    "protons = train_df[train_df.true_shower_primary_id==101]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:03.829801Z",
     "start_time": "2018-11-27T15:07:03.484278Z"
    }
   },
   "outputs": [],
   "source": [
    "# a few binning options to experiment with; only the last, uncommented one is used\n",
    "# bins = np.linspace(-20, 20, 100)\n",
    "# bins = np.logspace(0, 1, 100)\n",
    "# bins = 100\n",
    "# bins = np.arange(0, 10) - 0.5\n",
    "bins = np.geomspace(1e3, 30e3, 51)\n",
    "\n",
    "col = 'HillasReconstructor_h_max'\n",
    "\n",
    "plt.figure()\n",
    "plt.hist(gammas[col], bins=bins, histtype='step', lw=2, label='Gammas', density=True)\n",
    "plt.hist(protons[col], bins=bins, histtype='step', lw=2, label='Protons', density=True)\n",
    "plt.xscale('log')\n",
    "plt.xlabel(col)\n",
    "plt.legend()\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exercise\n",
    "You may plot other histograms or scatter plots to get a better grasp of the data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Train a regressor to reconstruct the energy of gamma events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we select the training features by removing columns with `true_` values coming from the simulation\n",
    "training_features = [name for name in list(gammas.columns) if not name.startswith('true_')]\n",
    "predict_feature = 'true_energy'  # the simulated energy column\n",
    "\n",
    "# the event identifiers are not physical features, so remove them as well\n",
    "training_features.remove('obs_id')\n",
    "training_features.remove('event_id')\n",
    "training_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Implement the train / test split for X, y\n",
    "\n",
    "X = gammas[training_features]\n",
    "y = gammas[predict_feature]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Divide your dataset into train and validation sets\n",
    "1. Fit a `DecisionTree` to do the regression of the energy on gammas\n",
    "1. Apply the trained model to the validation dataset\n",
    "1. Evaluate the goodness of your prediction using a logarithmic MSE\n",
    "1. Extra: How can we improve that regression? (one idea is sketched right below)"
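,
    "\n",
    "A sketch for the extra question (not the official solution; it assumes the `X_train`, `X_val`, `y_train`, `y_val` split and the `DecisionTreeRegressor` import from the cells just below): since the true energies span several orders of magnitude, regressing $\\log_{10}(E)$ and transforming back is often more stable:\n",
    "\n",
    "```python\n",
    "reg_log = DecisionTreeRegressor(max_depth=10)\n",
    "reg_log.fit(X_train, np.log10(y_train))   # fit the log-energy\n",
    "y_predict = 10 ** reg_log.predict(X_val)  # back to TeV, always positive\n",
    "```"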
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_log_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = gammas[training_features]\n",
    "y = gammas[predict_feature]\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(X, y)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg = DecisionTreeRegressor()\n",
    "reg.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_predict = reg.predict(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_squared_log_error(y_val, y_predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(6,6))\n",
    "plt.hist2d(y_val, y_predict, bins=np.logspace(-2,2,100));\n",
    "plt.plot((0, 100), (0, 100), color='red', ls='--')\n",
    "plt.axis('equal')\n",
    "plt.xscale('log')\n",
    "plt.yscale('log')\n",
    "plt.xlabel('True Energy / TeV')\n",
    "plt.ylabel('Reco Energy / TeV')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Train a classifier to do gamma/hadron separation\n",
    "\n",
    "Data preparation for gamma/hadron classification.\n",
    "\n",
    "Exercise: split the data into train / validation sets using sklearn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "At this point we use the full feature matrix and build a binary label vector $y$ (1 for gammas, 0 for protons)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:03.895879Z",
     "start_time": "2018-11-27T15:07:03.832960Z"
    }
   },
   "outputs": [],
   "source": [
    "X = train_df[training_features]\n",
    "# derive the labels from the particle id itself, so that we do not rely\n",
    "# on all gammas coming before all protons in train_df\n",
    "y = (train_df['true_shower_primary_id'] == 0).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:07:03.983514Z",
     "start_time": "2018-11-27T15:07:03.898708Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the classifier\n",
    "\n",
    "Now we can train any classifier we want on the prepared data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:13:04.181685Z",
     "start_time": "2018-11-27T15:13:02.875532Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "classifier = DecisionTreeClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Train the `DecisionTreeClassifier`\n",
    "2. 
This time get the prediction **proba** on the validation dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reco_type = classifier.predict(X_val)\n",
    "proba = classifier.predict_proba(X_val)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Classifier Evaluation\n",
    "\n",
    "Check the accuracy of the model, along with other metrics."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "importance = pd.Series(classifier.feature_importances_, index=training_features)\n",
    "\n",
    "# show the 20 most important features\n",
    "plt.figure()\n",
    "importance.sort_values().tail(20).plot.barh()\n",
    "plt.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:13:04.199205Z",
     "start_time": "2018-11-27T15:13:04.183884Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score\n",
    "\n",
    "acc = accuracy_score(y_val, reco_type)\n",
    "auc = roc_auc_score(y_val, proba[:,1])\n",
    "fpr, tpr, thresholds = roc_curve(y_val, proba[:,1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:13:46.845086Z",
     "start_time": "2018-11-27T15:13:46.501506Z"
    }
   },
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "plt.scatter(fpr, tpr, c=thresholds, vmax=1)\n",
    "plt.xlabel('FPR')\n",
    "plt.ylabel('TPR')\n",
    "plt.gca().set_aspect(1)\n",
    "plt.plot(fpr, tpr, '--', color='gray', alpha=0.5)\n",
    "plt.text(0.5, 0.5, f'ROC AUC: {auc:0.03f} \\nAccuracy: {acc:0.03f}')\n",
    "plt.colorbar()\n",
    "None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. 
Redo, using cross-validation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Energy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.metrics import make_scorer\n",
    "\n",
    "max_depth = [5, 10, 20, 40, 100, 500]\n",
    "min_samples_leaf = [2, 20, 40]\n",
    "\n",
    "param_grid = {\"max_depth\": max_depth, \"min_samples_leaf\": min_samples_leaf}\n",
    "\n",
    "# wrap the logarithmic MSE in a scorer; greater_is_better=False negates it,\n",
    "# so that GridSearchCV can keep maximizing the score\n",
    "mse = make_scorer(mean_squared_log_error, greater_is_better=False)\n",
    "\n",
    "reg_grid_search = GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),\n",
    "                               scoring=mse,\n",
    "                               param_grid=param_grid,\n",
    "                               return_train_score=True,\n",
    "                               n_jobs=-1  # use all cores for training in parallel\n",
    "                               )\n",
    "\n",
    "reg_grid_search.fit(gammas[training_features], gammas['true_energy'])\n",
    "\n",
    "print(\"Best parameter set:\", reg_grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# best mean cross-validated score (negative MSLE, because greater_is_better=False)\n",
    "reg_grid_search.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):\n",
    "    # Get the mean and std of the test scores for each grid point.\n",
    "    # cv_results_ varies the second parameter fastest, so reshape to\n",
    "    # (len(grid_param_1), len(grid_param_2)) and transpose.\n",
    "    scores_mean = cv_results['mean_test_score']\n",
    "    scores_mean = np.array(scores_mean).reshape(len(grid_param_1), len(grid_param_2)).T\n",
    "\n",
    "    scores_sd = cv_results['std_test_score']\n",
    "    scores_sd = np.array(scores_sd).reshape(len(grid_param_1), len(grid_param_2)).T\n",
    "\n",
    "    # Plot grid search scores\n",
    "    _, ax = plt.subplots(1, 1, figsize=(8, 5))\n",
    "\n",
    "    # Param 1 is the x-axis, param 2 is represented as a different curve (colored line)\n",
    "    for idx, val in enumerate(grid_param_2):\n",
    "        ax.errorbar(grid_param_1, scores_mean[idx,:], yerr=scores_sd[idx,:],\n",
    "                    fmt='--o', label= name_param_2 + ': ' + str(val), lw=1)\n",
    "\n",
    "    ax.set_title(\"Grid Search Scores\", fontsize=20, fontweight='bold')\n",
    "    ax.set_xlabel(name_param_1, fontsize=16)\n",
    "    ax.set_ylabel('CV Average Score', fontsize=16)\n",
    "    ax.legend(loc=\"best\", fontsize=15)\n",
    "    ax.grid(True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_grid_search(reg_grid_search.cv_results_, max_depth, min_samples_leaf, 'max_depth', 'min_samples_leaf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Particle Type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_depth = [5, 10, 20, 40]\n",
    "min_samples_leaf = [2, 20, 40]\n",
    "\n",
    "param_grid = {\"max_depth\": max_depth, \"min_samples_leaf\": min_samples_leaf}\n",
    "\n",
    "class_grid_search = GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),\n",
    "                                 param_grid=param_grid,\n",
    "                                 scoring=make_scorer(accuracy_score),\n",
    "                                 return_train_score=True,\n",
    "                                 n_jobs=-1\n",
    "                                 )\n",
    "class_grid_search.fit(train_df[training_features], train_df['true_shower_primary_id'])\n",
    "\n",
    "print(\"Best parameter set:\", class_grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# mean cross-validated accuracy of the best parameter combination\n",
    "class_grid_search.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_grid_search(class_grid_search.cv_results_, max_depth, min_samples_leaf, 'max_depth', 'min_samples_leaf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Apply your best model to the test dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Final training on the full training dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reg = DecisionTreeRegressor(**reg_grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier = DecisionTreeClassifier(**class_grid_search.best_params_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the energy regressor is trained on gammas only, consistently with the grid search above\n",
    "reg.fit(gammas[training_features], gammas[predict_feature])\n",
    "classifier.fit(train_df[training_features], train_df['true_shower_primary_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = pd.read_hdf(test_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = preprocess(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = feature_generation(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df['reco_energy'] = reg.predict(test_df[training_features])\n",
    "test_df['reco_type'] = classifier.predict(test_df[training_features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml.save_prediction(test_df, 'thomas')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ⏱ SEND IT ⏱ "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### NOTE: If you have followed this notebook, you have made a \"mono\" prediction (per telescope).\n",
    "Extra exercise: combine the mono predictions with the method of your choice (a sketch is given below)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Physics\n",
    "\n",
    "Now we could test our model and our hypothesis on real observed data. This part of the analysis is generally the most time-consuming. It also requires more data than this notebook can handle.\n",
    "After careful analysis one can produce an image of the gamma-ray sky\n",
    "\n",
    ""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## To go further (see below)\n",
    "\n",
    "- use a more complex model, such as random forests\n",
    "- train to predict the log of the energy (sketched above)\n",
    "- add new features\n",
    "- combine the predictions from each telescope for one event (group by `['obs_id', 'event_id']`, as sketched above)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Extra: Improving Classification\n",
    "\n",
    "\n",
    "### Boosting and AdaBoost\n",
    "\n",
    "Similar to the idea of combining many classifiers through bagging (as in random forests), we now \n",
    "train many estimators in a sequential manner. In each iteration the data gets modified slightly using weights $w$\n",
    "for each sample in the training data. In the first iteration the weights are all set to $w=1$.\n",
    "\n",
    "In each successive iteration the weights are updated. The samples that were incorrectly classified in the previous \n",
    "iteration get a higher weight, while the weights of correctly classified samples are decreased. \n",
    "In other words: we increase the influence/importance of samples that are difficult to classify.\n",
    "\n",
    "Predictions are performed by taking a weighted average of the single predictors.\n",
    "\n",
    "The popular AdaBoost algorithm takes this a step further by optimizing the weight of each separate classifier \n",
    "in the ensemble.\n",
    "The AdaBoost ensemble combines many learners in an iterative way. The learner at iteration $m$ is:\n",
    "\n",
    "$$\n",
    "    F_{m}(x)=F_{m-1}(x)+\\gamma _{m}h_{m}(x)\n",
    "$$\n",
    "\n",
    "The choice of $F_0$ is problem specific.\n",
    "\n",
    "Each weak learner produces a prediction $h_{m}(x_{i})$ for each sample in the training set. At each iteration $m$ a \n",
    "weak learner is fitted and assigned a coefficient $\\gamma_{m}$ which is found by minimizing:\n",
    "\n",
    "$$\n",
    "\\gamma_m = {\\underset {\\gamma }{\\arg \\min }} \\sum_{i}^{N}E\\bigl(F_{m-1}(x_{i})+\\gamma h_{m}(x_{i})\\bigr)\n",
    "$$\n",
    "\n",
    "where $E(F)$ is some error function and $x_i$ is the reweighted data sample.\n",
    "\n",
    "In general this method can work with any base classifier. Traditionally it is used with very small \n",
    "decision trees.\n",
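    "\n",
    "In scikit-learn such a weak learner is simply a *decision stump*, i.e. a tree restricted to a single split (a sketch, reusing the classifier class imported earlier):\n",
    "\n",
    "```python\n",
    "stump = DecisionTreeClassifier(max_depth=1)  # one split per weak learner\n",
    "```\n",
    "\n",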
    "The weights are used to select the split points during the minimization of the loss function in each node:\n",
    "\n",
    "$$\n",
    "    \\underset{(X, s) \\in \\, \\mathbf{X} \\times {S}}{\\arg \\max} IG(X,Y) = \\underset{(X, s) \\in \\, \\mathbf{X} \\times {S}}{\\arg \\max} ( H(Y) - H(Y |\\, X) ).\n",
    "$$\n",
    "\n",
    "Below we try AdaBoost on the CTA data.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:49:09.247110Z",
     "start_time": "2018-11-27T15:48:58.872812Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "\n",
    "ada = AdaBoostClassifier(\n",
    "    base_estimator=DecisionTreeClassifier(max_depth=2),\n",
    "    n_estimators=100,\n",
    "    learning_rate=0.5,\n",
    ")\n",
    "ada.fit(X_train, y_train)\n",
    "\n",
    "# evaluate on the validation split created earlier\n",
    "y_prediction = ada.predict(X_val)\n",
    "y_prediction_proba = ada.predict_proba(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:49:10.072785Z",
     "start_time": "2018-11-27T15:49:09.249195Z"
    }
   },
   "outputs": [],
   "source": [
    "scores = np.array(list(ada.staged_score(X_val, y_val)))\n",
    "\n",
    "plt.figure()\n",
    "plt.plot(scores, '.')\n",
    "plt.ylabel('Accuracy')\n",
    "plt.xlabel('Iteration')\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T15:49:10.727863Z",
     "start_time": "2018-11-27T15:49:10.075262Z"
    }
   },
   "outputs": [],
   "source": [
    "acc = accuracy_score(y_val, y_prediction)\n",
    "auc = roc_auc_score(y_val, y_prediction_proba[:, 1])\n",
    "fpr, tpr, thresholds = roc_curve(y_val, y_prediction_proba[:, 1])\n",
    "\n",
    "plt.figure()\n",
    "plt.scatter(fpr, tpr, c=thresholds)\n",
    "plt.plot(fpr, tpr, '--', color='gray', alpha=0.5)\n",
    "plt.text(0.5, 0.5, f'ROC AUC: {auc:0.03f} \\nAccuracy: {acc:0.03f}')\n",
    "None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Gradient Boosting\n",
    "\n",
    "Very similar to AdaBoost. 
This time, however, each new estimator is fitted to a modified target rather than to reweighted samples.\n",
    "\n",
    "Formulate the general problem as follows (see Wikipedia):\n",
    "\n",
    "Start with a constant function $F_{0}(x)$ and some differentiable loss function $L$ and incrementally expand it in a greedy fashion:\n",
    "\n",
    "$$\n",
    "F_{0}(x)={\\underset {\\gamma }{\\arg \\min }}{\\sum _{i=1}^{n}{L(y_{i},\\gamma )}}\n",
    "$$\n",
    "\n",
    "$$\n",
    "F_{m}(x)=F_{m-1}(x)+{\\underset {h_{m}\\in {\\mathcal {H}}}{\\operatorname {arg\\,min} }}\\left[{\\sum _{i=1}^{n}{L(y_{i},F_{m-1}(x_{i})+h_{m}(x_{i}))}}\\right]\n",
    "$$\n",
    "\n",
    "Finding the best $ h_{m}\\in {\\mathcal {H}}$ at each step is computationally infeasible in general.\n",
    "If we could find the perfect $h$ however, we know that \n",
    "\n",
    "$$\n",
    "F_{m+1}(x_i)=F_{m}(x_i)+h(x_i)=y_i\n",
    "$$\n",
    "\n",
    "or, equivalently, \n",
    "\n",
    "$$\n",
    "    h(x_i)= y_i - F_{m}(x_i)\n",
    "$$\n",
    "\n",
    "Note that for the mean squared error loss $\\frac{1}{2}(y_i - F(x_i))^2$ this is exactly the negative \n",
    "gradient of the loss with respect to $F(x_i)$.\n",
    "\n",
    "For a general loss function we fit $h_{m}(x)$ to the residuals, or negative gradients \n",
    "$$\n",
    "    r_{i, m}=-\\left[{\\frac {\\partial L(y_{i},F(x_{i}))}{\\partial F(x_{i})}}\\right]_{F(x)=F_{m-1}(x)}\\quad {\\mbox{for }}i=1,\\ldots ,n.\n",
    "$$\n",
    "\n",
    "\n",
    "\n",
    "Below we try it on CTA data again.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T16:56:38.715276Z",
     "start_time": "2018-11-27T16:56:29.657159Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "\n",
    "grb = GradientBoostingClassifier(\n",
    "    verbose=True,\n",
    "    n_estimators=300,\n",
    ")\n",
    "grb.fit(X_train, y_train)\n",
    "\n",
    "# again, evaluate on the validation split\n",
    "y_prediction = grb.predict(X_val)\n",
    "y_prediction_proba = grb.predict_proba(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T16:56:39.343691Z",
     "start_time": "2018-11-27T16:56:38.718176Z"
    }
   },
   "outputs": [],
   "source": [
    "accuracies = [accuracy_score(y_val, y_pred) for y_pred in grb.staged_predict(X_val)]\n",
    "\n",
    "plt.figure()\n",
    "plt.plot(range(len(accuracies)), accuracies, '.')\n",
    "plt.ylabel('Accuracy')\n",
    "plt.xlabel('Iteration')\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T16:56:40.215880Z",
     "start_time": "2018-11-27T16:56:39.346460Z"
    }
   },
   "outputs": [],
   "source": [
    "acc = accuracy_score(y_val, y_prediction)\n",
    "auc = roc_auc_score(y_val, y_prediction_proba[:, 1])\n",
    "fpr, tpr, thresholds = roc_curve(y_val, y_prediction_proba[:, 1])\n",
    "\n",
    "plt.figure()\n",
    "plt.scatter(fpr, tpr, c=thresholds)\n",
    "plt.plot(fpr, tpr, '--', color='gray', alpha=0.5)\n",
    "plt.text(0.5, 0.5, f'ROC AUC: {auc:0.03f} \\nAccuracy: {acc:0.03f}')\n",
    "None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "More on gradient descent algorithms can be found in the Neural Network lecture.\n",
    "\n",
    "Let's now test our all-time favorite classifier: the random forest."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T16:58:57.873659Z",
     "start_time": "2018-11-27T16:58:45.510336Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rf = RandomForestClassifier(n_estimators=150, max_depth=18, criterion='entropy')\n",
    "rf.fit(X_train, y_train)\n",
    "\n",
    "y_prediction = rf.predict(X_val)\n",
    "y_prediction_proba = rf.predict_proba(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T16:58:58.442111Z",
     "start_time": "2018-11-27T16:58:57.875736Z"
    }
   },
   "outputs": [],
   "source": [
    "acc = accuracy_score(y_val, y_prediction)\n",
    "auc = roc_auc_score(y_val, y_prediction_proba[:, 1])\n",
    "fpr, tpr, thresholds = roc_curve(y_val, y_prediction_proba[:, 1])\n",
    "\n",
    "plt.figure()\n",
    "plt.scatter(fpr, tpr, c=thresholds)\n",
    "plt.plot(fpr, tpr, '--', color='gray', alpha=0.5)\n",
    "plt.text(0.5, 0.5, f'ROC AUC: {auc:0.03f} \\nAccuracy: {acc:0.03f}')\n",
    "None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "execution": {
   "allow_errors": true,
   "timeout": 300
  },
  "kernelspec": {
   "display_name": "eschool2022",
   "language": "python",
   "name": "eschool2022"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
diff --git a/machine-learning/ml_func.py b/machine-learning/ml_func.py
index 42d533e..a0d357f 100644
--- a/machine-learning/ml_func.py
+++ b/machine-learning/ml_func.py
@@ -2,7 +2,8 @@
 from pathlib import Path
 import requests
 import sys
-
+from sklearn.metrics import mean_squared_log_error, accuracy_score, mean_squared_error
+import pandas as pd
 
 
 def save_prediction(test_df, yourname):
@@ -40,3 +41,26 @@ def set_plot_style():
     plt.rcParams["axes.spines.right"] = False
+
+
+def scoring(test_file):
+    test_labels = pd.read_hdf('escape_school_cta_data_test.h5')
+    prediction = pd.read_hdf(test_file)
+    df = pd.merge(test_labels, prediction, on=['obs_id', 'event_id'])
+    gammas = df[df['true_shower_primary_id']==0]
+    mse = mean_squared_log_error(gammas['true_energy'], gammas['reco_energy'])
+    # mse = mean_squared_error(df['true_energy'], df['reco_energy'])
+    acc = accuracy_score(df['true_shower_primary_id'], df['reco_type'])
+    return mse, acc
+
+def ranking(directory):
+    # DataFrame.append is deprecated, so collect the rows first and build the table once
+    rows = []
+    for file in Path(directory).glob('*.h5'):
+        participant_name = file.stem.replace('pred_', '')
+        mse, acc = scoring(file)
+        rows.append({'name': participant_name, 'mse': mse, 'acc': acc})
+    scores = pd.DataFrame(rows, columns=['name', 'mse', 'acc'])
+    scores = scores.sort_values(['mse', 'acc'])
+    return scores
+    
\ No newline at end of file

From 527b90f6062985c4d1bb8a53230ec186241e32a8 Mon Sep 17 00:00:00 2001
From: vuillaut
Date: Fri, 24 Jun 2022 13:53:32 +0200
Subject: [PATCH 2/2] add ranking notebook

---
 machine-learning/challenge_ranking.ipynb | 89 ++++++++++++++++++++++++
machine-learning/ml_func.py | 1 - 2 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 machine-learning/challenge_ranking.ipynb diff --git a/machine-learning/challenge_ranking.ipynb b/machine-learning/challenge_ranking.ipynb new file mode 100644 index 0000000..b344d09 --- /dev/null +++ b/machine-learning/challenge_ranking.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "59d7b12e-df49-4398-9597-32dabd4acdde", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce83b3ed-aa69-4680-b53b-cf741d1b3a16", + "metadata": {}, + "outputs": [], + "source": [ + "import ml_func as ml\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4a04bfa-991f-41fa-a6bd-c9c8512f0c2f", + "metadata": {}, + "outputs": [], + "source": [ + "rank = ml.ranking('../school2022-mlctapred/')\n", + "rank" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f636fcb-40d9-4fcb-b723-261a94613110", + "metadata": {}, + "outputs": [], + "source": [ + "# Best at classification\n", + "rank.sort_values(by='acc', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab05b17e-3c6f-4adb-8073-916022d0d83f", + "metadata": {}, + "outputs": [], + "source": [ + "# Best at regression\n", + "rank.sort_values(by='mse', ascending=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "830babb0-6031-45ec-8f63-27b1ff51115d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "eschool2022", + "language": "python", + "name": "eschool2022" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/machine-learning/ml_func.py b/machine-learning/ml_func.py index a0d357f..16c7f3c 100644 --- a/machine-learning/ml_func.py +++ b/machine-learning/ml_func.py @@ -49,7 +49,6 @@ def scoring(test_file): df = pd.merge(test_labels, prediction, on=['obs_id', 'event_id']) gammas = df[df['true_shower_primary_id']==0] mse = mean_squared_log_error(gammas['true_energy'], gammas['reco_energy']) - # mse = mean_squared_error(df['true_energy'], df['reco_energy']) acc = accuracy_score(df['true_shower_primary_id'], df['reco_type']) return mse, acc