Commit 9438ad1e authored by GILSON Matthieu

update sup_lrn

parent 06422a35
%% Cell type:markdown id:25dfd6d1 tags:
# 1) Cross-validation
author: Mat Gilson, https://github.com/MatthieuGilson, https://etulab.univ-amu.fr/gilson.m/compneuro_course/
This notebook shows a key concept at the core of all machine learning procedures. It aims to quantify the generalizability of a trained classifier to unseen data. Here we also see how to get a baseline reference in terms of accuracy.
See also the documentation of scikit-learn library (https://scikit-learn.org/)
%% Cell type:code id:707a0699 tags:
``` python
# import librairies
import numpy as np
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit, LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
font = {'family' : 'DejaVu Sans',
'weight' : 'regular',
'size' : 18}
mpl.rc('font', **font)
```
%% Cell type:markdown id:687e9515-5422-47f0-92d5-358b7d9aeed4 tags:
## Generate samples to classify
We first generate synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds to `m` features.
%% Cell type:code id:219520bc tags:
``` python
# create synthetic dataset where 2 classes of s0+s1 samples of m-dimensional inputs with controlled contrast
def gen_inputs(m, # input dimensionality
s0, # number of samples for class 0
s1, # number of samples for class 1
scaling): # scaling factor to separate classes
# labels
lbl = np.zeros([s0+s1], dtype=int)
# inputs
X = np.zeros([s0+s1,m])
# create s0 and s1 samples for the 2 classes
for i in range(s0+s1):
# label
lbl[i] = int(i<s0)
# inputs are random noise plus a shift
for j in range(m):
# positive/negative shift for 1st/2nd class, only for indices j larger than m/2
if j>=m/2:
if i<s0:
a = -scaling
else:
a = scaling
else:
a = 0.0
# the shift linearly depends on the feature index j
X[i,j] = a + np.random.randn()
return X, lbl
```
%% Cell type:markdown id:506698d5 tags:
Let's have a first look at the data. For features with indices smaller than $m/2$, there is no contrast between the two classes (in red and blue). For indices larger than $m/2$, the contrast is determined by the scaling parameter (separation between the two distributions). Each feature involves independent noise in the generation of the samples.
%% Cell type:code id:1bc5284f tags:
``` python
# input properties
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
scaling = 0.7 # class contrast (try 0.0)
# generate inputs
X, y = gen_inputs(m, s0, s1, scaling)
# bins for histograms
vbins = np.linspace(-3,3,30)
# plot
plt.figure(figsize=[6,7])
plt.subplot(411)
i = 0
plt.hist(X[:s0,i], histtype='step', bins=vbins, color='r')
plt.hist(X[s0:,i], histtype='step', bins=vbins, color='b')
plt.axis(xmin=-3, xmax=3)
plt.legend(['class 0', 'class 1'], fontsize=10)
plt.title('input {}'.format(i), loc='left')
plt.subplot(412)
i = int((m-1)*0.33)
plt.hist(X[:s0,i], histtype='step', bins=vbins, color='r')
plt.hist(X[s0:,i], histtype='step', bins=vbins, color='b')
plt.axis(xmin=-3, xmax=3)
plt.title('input {}'.format(i), loc='left')
plt.subplot(413)
i = int((m-1)*0.66)
plt.hist(X[:s0,i], histtype='step', bins=vbins, color='r')
plt.hist(X[s0:,i], histtype='step', bins=vbins, color='b')
plt.axis(xmin=-3, xmax=3)
plt.title('input {}'.format(i), loc='left')
plt.subplot(414)
i = m-1
plt.hist(X[:s0,i], histtype='step', bins=vbins, color='r')
plt.hist(X[s0:,i], histtype='step', bins=vbins, color='b')
plt.axis(xmin=-3, xmax=3)
plt.title('input {}'.format(i), loc='left')
plt.xlabel('X values')
plt.tight_layout()
plt.savefig('ex_contrast_X')
plt.show()
```
%% Cell type:code id:3b9a9c63 tags:
``` python
# create matrix rgb
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
mat_rgb = np.clip(mat_rgb, 0.0, 1.0)
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:markdown id:46880c12 tags:
Above, the purple pixels indicate that the distributions of the two classes overlap, whereas red and blue pixels indicate the dominance of one class.
Now let's see how to separate the 2 classes using a classifier.
%% Cell type:code id:d25b05e3 tags:
``` python
# Classifiers and learning parameters
clf = LogisticRegression(C=10000.0, penalty='l2', solver='lbfgs', max_iter=500)
```
%% Cell type:markdown id:845c6ded tags:
What is the reference "chance" level: $50\%$ for $2$ classes?
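A quick empirical reference can be obtained with scikit-learn's `DummyClassifier` (see the references at the end of this notebook), which ignores the input features; a minimal sketch on the data generated above, using the `stratified` strategy that predicts labels at random according to the class proportions:
``` python
from sklearn.dummy import DummyClassifier

# baseline that ignores X: predict labels at random following the class proportions
dummy = DummyClassifier(strategy='stratified')
dummy.fit(X, y)
print('dummy accuracy:', dummy.score(X, y)) # close to 0.5 for 2 balanced classes
```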
%% Cell type:code id:832441a2 tags:
``` python
acc = pd.DataFrame()
# repetitions
n_rep = 20
for i_rep in range(n_rep):
# generate data
X, y = gen_inputs(m, s0, s1, scaling)
# Train and test classifiers with subject labels
clf.fit(X, y)
# accuracy on train set
d = {'score': [clf.score(X, y)]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# plot
sb.violinplot(data=acc, y='score', density_norm='width', palette=['brown'])
plt.text(0, 1.05, str(acc['score'].mean())[:4], horizontalalignment='center')
plt.yticks([0,1])
plt.axis(ymax=1.02)
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.show()
```
%% Cell type:code id:d28622c6 tags:
``` python
# theoretical chance level
chance_level = 1.0 / 2
sb.violinplot(data=acc, y='score', density_norm='width', palette=['brown'])
plt.plot([-1,1], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.axis(ymax=1.02)
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.show()
```
%% Cell type:markdown id:ae55e427 tags:
We can play with the contrast between the 2 classes, for example a more difficult classification with lower contrast / separability.
%% Cell type:code id:5452ae09 tags:
``` python
acc = pd.DataFrame()
# change contrast
scaling = 0.0 # try 0.2, 0.1, 0.0
# repetitions
n_rep = 20
for i_rep in range(n_rep):
# generate data
X, y = gen_inputs(m, s0, s1, scaling)
# Train and test classifiers with subject labels
clf.fit(X, y)
# accuracy on train set
d = {'score': [clf.score(X, y)]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# plot
sb.violinplot(data=acc, y='score', density_norm='width', palette=['brown'])
plt.text(0, 1.05, str(acc['score'].mean())[:4], horizontalalignment='center')
plt.plot([-1,1], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.axis(ymax=1.02)
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.show()
```
%% Cell type:markdown id:b9d4a9a2 tags:
Even for `scaling=0.0`, the classification accuracy is above the expected chance level $0.5$...
Let's try with similar (i.e. with the same scaling) but newly generated data.
%% Cell type:code id:65df5e9c tags:
``` python
X_new, y_new = gen_inputs(m, s0, s1, 0.0) # also change the scaling to play with the code
print(clf.score(X_new, y_new))
```
%% Cell type:code id:cddebf1c tags:
``` python
acc = pd.DataFrame()
# input dimensionality (number of features)
m = 10 # try 5, 20
# class contrast
scaling = 0.7 # try 0.5, 0.0
# loop with training on a dataset and testing on a new dataset
for i_rep in range(n_rep):
# generate data
X, y = gen_inputs(m, s0, s1, scaling)
# train and calculate accuracy
clf.fit(X, y)
d = {'type': ['training'],
'score': [clf.score(X, y)]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# generate new data
X_new, y_new = gen_inputs(m, s0, s1, scaling)
# only test classifier that was trained on other data
d = {'type': ['new'],
'score': [clf.score(X_new, y_new)]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
sb.violinplot(data=acc, x='type', y='score', hue='type', density_norm='width', palette=['brown','orange'])
plt.text(0, 1.05, str(acc[acc['type']=='training']['score'].mean())[:4], horizontalalignment='center')
plt.text(1, 1.05, str(acc[acc['type']=='new']['score'].mean())[:4], horizontalalignment='center')
plt.plot([-1,2], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.axis(ymax=1.02)
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.show()
```
%% Cell type:markdown id:154c6878 tags:
The classifier tends to extract specific "information" from the data it is trained with, which corresponds to the notion of overfitting.
The "real" accuracy that should be taken into account is the accuracy for the new data, which quantifies the generalization capability of the classifier to new data from the same class.
%% Cell type:markdown id:3783c050 tags:
## Cross-validation scheme
The idea is to generalize the previous observation by splitting the data into a training set and a testing set, for a number of repetitions. The relevant result to report is the test accuracy.
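As a side note, this repeated train-test evaluation can be written compactly with scikit-learn's `cross_val_score`; a minimal sketch, assuming the classifier `clf` and the data `X`, `y` defined above:
``` python
from sklearn.model_selection import cross_val_score, ShuffleSplit

# test accuracy over 10 random 80%-20% train-test splits
scores = cross_val_score(clf, X, y, cv=ShuffleSplit(n_splits=10, test_size=0.2))
print('test accuracy: mean {:.2f}, std {:.2f}'.format(scores.mean(), scores.std()))
```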
%% Cell type:code id:5895003c tags:
``` python
# number of repetitions and storage of results
n_rep = 10
# Cross-validation scheme
cvs0 = ShuffleSplit(n_splits=n_rep, test_size=0.2)
```
%% Cell type:code id:3d69f322 tags:
``` python
# generate n_rep splits
ind_split = np.zeros([n_rep,s0+s1])
i_rep = 0
for train_ind, test_ind in cvs0.split(X, y):
ind_split[i_rep, test_ind] = 1
i_rep += 1
# calculate the size of the test set for each split
test_size = np.vstack((ind_split[:,:s0].sum(axis=1),
ind_split[:,s0:].sum(axis=1)))
plt.figure()
plt.subplot(121)
plt.imshow(ind_split, cmap='binary', interpolation='nearest', aspect=40)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.subplot(122)
plt.plot(test_size[0,::-1], np.arange(n_rep), 'b')
plt.plot(test_size[1,::-1], np.arange(n_rep), 'r')
plt.xlabel('test size per class')
plt.show()
```
%% Cell type:code id:3bd9e7e2 tags:
``` python
# wrapper to test cross-validation scheme (cvs)
def test_cvs(cvs, cvs_lbl):
acc = pd.DataFrame()
for train_ind, test_ind in cvs.split(X, y):
# train and test classifier
clf.fit(X[train_ind,:], y[train_ind])
# accuracy on train set
d = {'type': ['train'],
'cv': [cvs_lbl],
'score': [clf.score(X[train_ind,:], y[train_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# accuracy on test set
d = {'type': ['test'],
'cv': [cvs_lbl],
'score': [clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
return acc
```
%% Cell type:code id:e594d7e1 tags:
``` python
# generate a dataset of features and labels
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
scaling = 0.5 # class contrast (try 0.7)
X, y = gen_inputs(m, s0, s1, scaling)
# evaluate classification accuracy on train and test sets
acc = test_cvs(cvs0, 'no strat')
# theoretical chance level
chance_level = 0.5
sb.violinplot(data=acc, x='cv', y='score', hue='type', split=True, density_norm='width', palette=['brown','orange'])
plt.plot([-1,2], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.axis(ymax=1.02)
plt.show()
```
%% Cell type:markdown id:c462266b tags:
## Stratification
Let's consider a different splitting scheme that preserves the ratio of classes from the original dataset in each train-test split.
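To see what stratification changes in practice, here is a small sketch comparing the fraction of class-0 samples in the test sets produced by the two splitting schemes (using `np.bincount` on the test labels):
``` python
# fraction of class-0 samples in the test set, for 5 splits of each scheme
for name, splitter in [('no strat', ShuffleSplit(n_splits=5, test_size=0.2)),
                       ('strat', StratifiedShuffleSplit(n_splits=5, test_size=0.2))]:
    ratios = [np.bincount(y[test_ind])[0] / test_ind.size
              for _, test_ind in splitter.split(X, y)]
    print(name, np.round(ratios, 2))
```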
%% Cell type:code id:19850420 tags:
``` python
# stratified shuffle split: preserving ratio of classes in train-test sets
cvs1 = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
# generate n_rep splits
ind_split = np.zeros([n_rep,s0+s1])
i_rep = 0
for train_ind, test_ind in cvs1.split(X, y):
ind_split[i_rep, test_ind] = 1
i_rep += 1
# calculate the size of the test set for each split
test_size = np.vstack((ind_split[:,:s0].sum(axis=1),
ind_split[:,s0:].sum(axis=1)))
plt.figure()
plt.subplot(121)
plt.imshow(ind_split, cmap='binary', interpolation='nearest', aspect=60)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.subplot(122)
plt.plot(test_size[0,::-1], np.arange(n_rep), 'b')
plt.plot(test_size[1,::-1], np.arange(n_rep), 'r')
plt.xlabel('test size per class')
plt.show()
```
%% Cell type:code id:bd7a8cc4 tags:
``` python
# evaluate classification accuracy on train and test sets
acc = pd.concat((acc, test_cvs(cvs1, 'strat')))
sb.violinplot(data=acc, x='cv', y='score', hue='type', split=True, density_norm='width', palette=['brown','orange'])
plt.plot([-1,2], [chance_level]*2, '--k')
plt.legend(loc='lower center')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.axis(ymax=1.02)
plt.show()
```
%% Cell type:markdown id:04873ea0 tags:
# Baseline accuracy as reference
How to interpret the test accuracy?
%% Cell type:code id:c49c4763 tags:
``` python
# wrapper to test cross-validation scheme (cvs)
def test_cvs(cvs, cvs_lbl):
acc = pd.DataFrame()
for train_ind, test_ind in cvs.split(X, y):
# train and test classifiers
clf.fit(X[train_ind,:], y[train_ind])
# accuracy on test set
d = {'type': ['test'],
'cv': [cvs_lbl],
'score': [clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# shuffling
train_ind_rand = np.random.permutation(train_ind)
clf.fit(X[train_ind,:], y[train_ind_rand])
# accuracy on test set
d = {'type': ['shuf'],
'cv': [cvs_lbl],
'score': [clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
return acc
```
%% Cell type:code id:2e0f1248 tags:
``` python
acc = test_cvs(cvs0, 'no strat')
# theoretical chance level
chance_level = 0.5
sb.violinplot(data=acc, x='cv', y='score', hue='type', split=True, density_norm='width', palette=['orange','gray'])
plt.plot([-1,2], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.axis(ymax=1.02)
plt.show()
```
%% Cell type:markdown id:082f8d49 tags:
The accuracies of the shuffling surrogates are distributed around the expected chance level. This looks fine, but is all the variability coming from the classifier?
%% Cell type:markdown id:21edac79 tags:
# Unbalanced data
We now look into a different version of the dataset, a bit less artificial. We consider the case of unbalanced classes, with different numbers of samples in the two classes.
%% Cell type:code id:902d7b27 tags:
``` python
# generate inputs with unbalanced classes
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 300 # number of samples for class 1
scaling = 0.7 # class contrast
X, y = gen_inputs(m, s0, s1, scaling) # also change the scaling to play with the code
```
%% Cell type:code id:1d1629a0 tags:
``` python
acc = test_cvs(cvs0, 'no strat')
# theoretical chance level
naive_chance_level = 1.0 / 2 # equal probability for each category
greedy_chance_level = max(s0,s1) / (s0+s1) # always predict the larger class
sb.violinplot(data=acc, x='cv', y='score', hue='type', split=True, density_norm='width', palette=['orange','gray'])
plt.plot([-1,2], [naive_chance_level]*2, ':k')
plt.plot([-1,2], [greedy_chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.axis(ymax=1.02)
plt.show()
```
%% Cell type:markdown id:4f9a0bd6 tags:
The baseline accuracy has changed in this case... A "stupid" classifier that always predicts the larger class will perform above $50\%$ accuracy for unbalanced classes.
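This "greedy" baseline can be checked empirically with scikit-learn's `DummyClassifier` (a sketch on the unbalanced data generated above):
``` python
from sklearn.dummy import DummyClassifier

# baseline classifier that always predicts the most frequent class of the training data
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X, y)
print('greedy baseline accuracy:', dummy.score(X, y)) # s1/(s0+s1) = 0.75 here
```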
%% Cell type:code id:b662b586 tags:
``` python
# generate n_rep splits
ind_split = np.zeros([n_rep,s0+s1])
i_rep = 0
for train_ind, test_ind in cvs0.split(X, y):
ind_split[i_rep, test_ind] = 1
i_rep += 1
# calculate the size of the test set for each split
test_size = np.vstack((ind_split[:,:s0].sum(axis=1),
ind_split[:,s0:].sum(axis=1)))
plt.figure()
plt.subplot(121)
plt.imshow(ind_split, cmap='binary', interpolation='nearest', aspect=40)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.subplot(122)
plt.plot(test_size[0,::-1], np.arange(n_rep), 'b')
plt.plot(test_size[1,::-1], np.arange(n_rep), 'r')
plt.xlabel('test size per class')
plt.show()
```
%% Cell type:markdown id:0a26b002 tags:
Does the variability of the class ratio in the test set matter?
Let's compare with other possibilities for splitting the data into train-test sets.
%% Cell type:markdown id:9e9b65a9 tags:
## Leave-one-out scheme
Another popular scheme consists in leaving out one sample for testing, especially in the case of small datasets.
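A minimal sketch with `cross_val_score`: each leave-one-out split tests on a single sample, so the individual scores are 0 or 1 and only the average across splits is meaningful.
``` python
from sklearn.model_selection import cross_val_score

# each split tests on a single sample, so individual scores are 0 or 1
loo_scores = cross_val_score(clf, X, y, cv=LeaveOneOut())
print('LOO accuracy (mean over {} splits): {:.2f}'.format(loo_scores.size, loo_scores.mean()))
```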
%% Cell type:code id:005d13cf tags:
``` python
# leave-one-out scheme
cvs2 = LeaveOneOut()
# generate the splits (one per sample)
ind_split = np.zeros([s0+s1,s0+s1])
i_rep = 0
for train_ind, test_ind in cvs2.split(X, y):
ind_split[i_rep, test_ind] = 1
i_rep += 1
plt.figure()
plt.imshow(ind_split, cmap='binary', interpolation='nearest', aspect=1)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.show()
```
%% Cell type:markdown id:8f2ae9a7 tags:
## Comparison of cross-validation schemes
%% Cell type:code id:4790171d tags:
``` python
# list of cross-validation schemes
cvs_list = [cvs0, cvs1, cvs2]
# labels for cvs
cvs_lbl = ['no strat', 'strat', 'loo']
# accuracy
acc = pd.DataFrame()
# loop over cross-validation schemes
for lbl, cvs in zip(cvs_lbl, cvs_list):
acc_tmp = test_cvs(cvs, lbl)
acc = pd.concat((acc, acc_tmp))
# theoretical chance level
naive_chance_level = 1.0 / 2 # equal probability for each category
greedy_chance_level = max(s0,s1) / (s0+s1) # always predict the larger class
# calculate mean accuracy of the shuffled baseline for the leave-one-out scheme
sel_df = np.logical_and(acc['cv']=='loo', acc['type']=='shuf')
mean_small_test = acc[sel_df]['score'].mean()
# plot
sb.violinplot(data=acc, x='cv', y='score', hue='type', split=True, density_norm='width', palette=['orange','gray'], cut=0)
plt.plot([-1,3], [naive_chance_level]*2, ':k')
plt.plot([-1,3], [greedy_chance_level]*2, '--k')
plt.plot([2.5], [mean_small_test], marker='x', markersize=16, color='k', ls='')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.legend(loc='lower left')
plt.tight_layout()
plt.savefig('acc_cv')
plt.show()
```
%% Cell type:markdown id:5e25aa63 tags:
## Unpacking classification accuracy across classes
%% Cell type:code id:6ea00c4e tags:
``` python
# matrix of predicted class versus true class in test set
cm = np.zeros([2,2])
# repeat classification
for train_ind, test_ind in cvs1.split(X, y):
clf.fit(X[train_ind,:], y[train_ind])
cm += confusion_matrix(y_true=y[test_ind],
y_pred=clf.predict(X[test_ind,:]))
plt.figure()
plt.imshow(cm, cmap='jet', vmin=0)
plt.colorbar(label='sample count')
plt.xticks([0,1], ['class 0','class 1'])
plt.yticks([0,1], ['class 0','class 1'])
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.tight_layout()
plt.show()
```
%% Cell type:markdown id:35a5cbd1 tags:
## Conclusion
We see that the leave-one-out (loo) only makes sense when averaging the results (see the cross).
The non-stratified splitting inflates the distribution of chance-level accuracy.
A good default (for sufficient data) is $80\%$-$20\%$ split for the train-test sets, repeated $100$ times.
Importantly, this approach is *conservative*: the risk of finding "information" by chance where there is none (akin to a "false positive") can be controlled by comparing the classifier accuracy with the chance-level accuracy, in particular its spread. In contrast, mistakes in the procedure tend to yield a large variability in the accuracy distributions, with an absence of positive outcome. In other words, when you find an effect, you can trust it, but you will likely have to try different classifiers to get the best accuracy. This is in fact not a problem, as it can inform on the structure of the data (where the information related to the predicted label lies).
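For the chance-level evaluation, scikit-learn also provides `permutation_test_score`, which repeats the label shuffling and returns an empirical p-value; a minimal sketch, assuming the classifier `clf`, the data `X`, `y` and the stratified scheme `cvs1` defined above:
``` python
from sklearn.model_selection import permutation_test_score

# compare the cross-validated score against scores obtained with permuted labels
score, perm_scores, pvalue = permutation_test_score(clf, X, y, cv=cvs1, n_permutations=100)
print('score = {:.2f}, p-value vs shuffled labels = {:.3f}'.format(score, pvalue))
```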
*Refs:*
- cross-validation: https://scikit-learn.org/stable/modules/cross_validation.html
- chance-level evaluation: https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
......
%% Cell type:markdown id:00cde4c7 tags:
author: Mat Gilson, https://etulab.univ-amu.fr/gilson.m/compneuro_course/
# 2) Nested cross-validation
This notebook shows the use of cross-validation to tune hyperparameters for classifiers
See also the documentation of scikit-learn library (https://scikit-learn.org/)
%% Cell type:code id:8c9c8432 tags:
``` python
# import librairies
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
font = {'family' : 'DejaVu Sans',
'weight' : 'regular',
'size' : 18}
mpl.rc('font', **font)
```
%% Cell type:markdown id:0d12a052 tags:
## Generate samples to classify
As before, we use the same synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds to `m` features.
%% Cell type:code id:4673d31d tags:
``` python
# create synthetic dataset where 2 classes of s0+s1 samples of m-dimensional inputs with controlled contrast
def gen_inputs(m, # input dimensionality
s0, # number of samples for class 0
s1, # number of samples for class 1
scaling): # scaling factor to separate classes
# labels
lbl = np.zeros([s0+s1], dtype=int)
# inputs
X = np.zeros([s0+s1,m])
# create s0 and s1 samples for the 2 classes
for i in range(s0+s1):
# label
lbl[i] = int(i<s0)
# inputs are random noise plus a shift
for j in range(m):
# positive/negative shift for 1st/2nd class, only for indices j larger than m/2
if j>=m/2:
if i<s0:
a = -scaling
else:
a = scaling
else:
a = 0.0
# the shift linearly depends on the feature index j
X[i,j] = a*j/m + np.random.randn()
return X, lbl
```
%% Cell type:code id:029ab1fa tags:
``` python
# generate inputs and labels
m = 50 # input dimensionality
s0=100 # number of samples for class 0
s1=100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.2)
```
%% Cell type:markdown id:0024c6aa tags:
## Parameterization of classifier
We then choose a classifier and a cross-validation scheme.
%% Cell type:code id:6f2a4e24 tags:
``` python
# (multinomial) logistic regression, involving regularization controlled by the hyperparameter C
clf_MLR = LogisticRegression(C=10000.0, penalty='l2', solver='lbfgs', max_iter=500)
# repetitions of cross-validation (train-test)
n_rep = 5
# stratified K-fold (similar 80%-20% train-test splitting as StratifiedShuffleSplit before)
cvs = StratifiedKFold(n_splits=n_rep)
```
%% Cell type:markdown id:0c8bb544 tags:
### Role of hyperparameter
What is the effect of the hyperparameter `C` that governs the regularization? (In scikit-learn's `LogisticRegression`, `C` is the inverse of the regularization strength: a smaller `C` means a stronger penalty on the weights.)
%% Cell type:code id:069c8442 tags:
``` python
# grid evaluation (log scale)
Cs = np.array([0.01,0.1,1.0,10.0,100.0])
n_C = Cs.size
acc = pd.DataFrame()
coefs = np.zeros([n_C,m])
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# loop over hyperparameter
for i_C, C in enumerate(Cs):
# set regularization
clf_MLR.set_params(C=C)
# train classifier
clf_MLR.fit(X[train_ind,:], y[train_ind])
d = {'type': ['train'],
'C': [C],
'score': [clf_MLR.score(X[train_ind,:], y[train_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)))
coefs[i_C,:] = clf_MLR.coef_ # store coefficients for this C (kept from the last split)
# test classifier
d = {'type': ['test'],
'C': [C],
'score': [clf_MLR.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)))
# plot
sb.violinplot(data=acc, x='C', y='score', hue='type', split=True,
density_norm='width', palette=['brown','orange'])
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.xlabel('reg hyperparam C')
plt.legend(loc='lower right')
plt.tight_layout()
plt.title('train-test accuracy')
plt.show()
```
%% Cell type:code id:a65b81a7 tags:
``` python
plt.figure()
for i_C in range(n_C):
plt.plot(range(m),coefs[i_C,:], label=Cs[i_C])
plt.legend(fontsize=9)
```
%% Cell type:code id:35706eb0-b2a6-4183-bb8b-dd8cf1ac304d tags:
``` python
plt.figure()
for i_C in range(n_C):
plt.plot(range(m),coefs[i_C,:]/max(coefs[i_C,:]), label=Cs[i_C])
plt.legend(fontsize=9)
```
%% Cell type:markdown id:a21b4c4c tags:
The hyperparameter `C` has opposing effects on the train and test accuracies. This indicates that the unregularized MLR classifier (with large `C`) is overfitting the data here.
How to select the best value for `C`?
%% Cell type:markdown id:bdb04e59 tags:
## Inner cross-validation loop within the training set
We can use cross-validation within the training set to test the effect of `C`. Of course this reduces the amount of data for training within the original training set.
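The principle can be sketched with `cross_val_score` applied within the training set only (a minimal sketch, assuming `X`, `y`, `Cs` and `clf_MLR` defined above; the held-out test set is never used during the selection of `C`):
``` python
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split

# hold out a test set, then score each candidate C by cross-validation inside the training set
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y)
val_acc = []
for C in Cs:
    clf_MLR.set_params(C=C)
    val_acc.append(cross_val_score(clf_MLR, X_tr, y_tr, cv=StratifiedKFold(n_splits=5)).mean())
best_C = Cs[np.argmax(val_acc)]
clf_MLR.set_params(C=best_C)
clf_MLR.fit(X_tr, y_tr)
print('best C:', best_C, '- test accuracy:', clf_MLR.score(X_te, y_te))
```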
%% Cell type:code id:e614b340 tags:
``` python
# nested cross-validation scheme
n_split_nest = 5
cv_nest = StratifiedKFold(n_splits=n_split_nest)
# indices of splits
ind_split_inner = np.zeros([n_rep*n_split_nest,s0+s1])
ind_split_outer = np.zeros([n_rep,s0+s1])
# outer loop
i_inner = 0
i_outer = 0
for tr_ind, test_ind in cvs.split(X, y):
# split data
X_tr, X_test = X[tr_ind,:], X[test_ind,:]
y_tr, y_test = y[tr_ind], y[test_ind]
# 2=train+val, 3=test
ind_split_outer[i_outer, tr_ind] = 2
ind_split_outer[i_outer, test_ind] = 3
# inner loop
for train_ind, val_ind in cv_nest.split(X_tr, y_tr):
# 0=train, 1=val, 3=test
ind_split_inner[i_inner, tr_ind[val_ind]] = 1
ind_split_inner[i_inner, test_ind] = 3
# increase counter
i_inner += 1
# increase counter
i_outer += 1
```
%% Cell type:markdown id:f0173728 tags:
Let's first look at the splits in the outer loop.
%% Cell type:code id:11469825 tags:
``` python
# plot split indices for each
plt.figure(figsize=[10,10])
plt.imshow(ind_split_outer, vmin=0, vmax=7, cmap='Set2', interpolation='nearest', aspect=40)
plt.xlabel('sample index')
plt.ylabel('split index')
cbar = plt.colorbar(ticks=[2,3], shrink=0.7)
cbar.ax.set_yticklabels(['train+val','test'])
plt.show()
```
%% Cell type:markdown id:a52fffac tags:
For each split, we subdivide the outer "training" set into the inner training and validation sets.
%% Cell type:code id:eb71ec5a tags:
``` python
# plot split indices
plt.figure(figsize=[10,10])
plt.imshow(ind_split_inner[:n_split_nest,:], vmax=7, cmap='Set2', interpolation='nearest', aspect=40)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.colorbar(ticks=[0,1,3], shrink=0.7)
plt.show()
```
%% Cell type:markdown id:8df51f10 tags:
Putting everything together, we have the 3 sets (train, validation and test).
%% Cell type:code id:5c81d8eb tags:
``` python
# plot split indices
plt.figure(figsize=[10,10])
plt.imshow(ind_split_inner, vmax=7, cmap='Set2', interpolation='nearest', aspect=8)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.colorbar(ticks=[0,1,3], shrink=0.7)
plt.show()
```
%% Cell type:markdown id:3200c011 tags:
We can do the same with any outer loop, like the `StratifiedShuffleSplit` that we used before.
%% Cell type:code id:f2e30669 tags:
``` python
# change outer cross-validation scheme
n_rep = 20
cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
# indices of splits
ind_split = np.zeros([n_rep*n_split_nest,s0+s1])
# outer loop
i_rep = 0
for train_ind, test_ind in cvs.split(X, y):
# split data
X_train, X_test = X[train_ind,:], X[test_ind,:]
y_train, y_test = y[train_ind], y[test_ind]
# inner loop
for tr_ind, val_ind in cv_nest.split(X_train, y_train):
ind_split[i_rep, train_ind[val_ind]] = 1
ind_split[i_rep, test_ind] = 3
i_rep += 1
# plot split indices
plt.figure(figsize=[10,10])
plt.imshow(ind_split, vmax=7, cmap='Set2', interpolation='nearest', aspect=2)
plt.xlabel('sample index')
plt.ylabel('split index')
cbar = plt.colorbar(ticks=[0,1,3], shrink=0.7)
cbar.ax.set_yticklabels(['train','val','test'])
plt.show()
```
%% Cell type:markdown id:cc9b644c tags:
Now we can use this splitting scheme to test the effect of `C`, selecting the value that gives the best accuracy averaged over the validation sets.
%% Cell type:code id:53b90b92 tags:
``` python
# function to train hyperparameter with nested CV
def nested_CV(clf, X, y):
acc_nest = np.zeros([n_C,n_split_nest])
j = 0
for ind0, ind1 in cv_nest.split(X, y):
# split in train and validation sets
X0, X1 = X[ind0], X[ind1]
y0, y1 = y[ind0], y[ind1]
# train classifier over all reg parameters
for i, C in enumerate(Cs):
# set regularization
clf.set_params(C=C)
# train classifier on X0
clf.fit(X0, y0)
# test classifier on X1
acc_nest[i,j] = clf.score(X1,y1)
j += 1
# return the C value with the best mean validation accuracy
i_best = np.argmax(acc_nest.mean(1))
return Cs[i_best]
```
%% Cell type:code id:5a264a05 tags:
``` python
scalings = [0.1,0.2,0.3,0.4]
acc = pd.DataFrame()
for scaling in scalings:
# generate inputs
X, y = gen_inputs(m, s0, s1, scaling=scaling)
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# select best hyperparameter
best_C = nested_CV(clf_MLR, X[train_ind,:], y[train_ind])
clf_MLR.set_params(C=best_C)
# train and test classifier
clf_MLR.fit(X[train_ind,:], y[train_ind])
d = {'scaling': [scaling],
'type': ['train'],
'log C': [np.log10(best_C)],
'score': [clf_MLR.score(X[train_ind,:], y[train_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
d = {'scaling': [scaling],
'type': ['test'],
'log C': [np.log10(best_C)],
'score': [clf_MLR.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# shuffling
train_ind_rand = np.random.permutation(train_ind)
best_C = nested_CV(clf_MLR, X[train_ind,:], y[train_ind_rand])
clf_MLR.set_params(C=best_C)
clf_MLR.fit(X[train_ind,:], y[train_ind_rand])
d = {'scaling': [scaling],
'type': ['shuf'],
'log C': [np.log10(best_C)],
'score': [clf_MLR.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
```
%% Cell type:code id:020d231f tags:
``` python
# plots
# theoretical chance level
chance_level = 0.5
acc2 = acc[np.logical_or(acc['type']=='train',acc['type']=='test')]
plt.figure()
sb.violinplot(data=acc2, x='scaling', y='score', hue='type', split=True,
palette=['brown','orange'], density_norm='width')
plt.plot([-1,4], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.legend(loc='lower right')
plt.tight_layout()
plt.title('train-test accuracies')
acc2 = acc[np.logical_or(acc['type']=='shuf',acc['type']=='test')]
plt.figure()
sb.violinplot(data=acc2, x='scaling', y='score', hue='type', hue_order=('shuf','test'),
split=True, palette=['gray','orange'], density_norm='width')
plt.plot([-1,4], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.legend(loc='lower right')
plt.tight_layout()
plt.title('test-shuf accuracies')
plt.figure()
sb.violinplot(data=acc2, x='scaling', y='log C', hue='type',
palette=['orange','gray'], density_norm='width')
plt.legend(loc='lower right')
plt.tight_layout()
plt.title('best C (log$_{10}$ scale)')
plt.show()
```
%% Cell type:markdown id:146441cb tags:
We find that the best `C` selected by the nested cross-validation is $0.01$, in line with the comparison of training/testing accuracies at the beginning of the notebook (with the scaling $0.2$ that governs the contrast across the 2 classes).
Here the scaling affects the selected `C` with looser selection for smaller scaling (i.e. when the classification is difficult and the classifier does not perform well).
Note that the classifier trained with shuffled labels for the baseline accuracy does not depend on `C`.
%% Cell type:markdown id:e3f096d7 tags:
## Implementation in scikit-learn
Let's use `GridSearchCV`, which performs the grid search over hyperparameter values using an inner cross-validation.
%% Cell type:code id:07cdf8f2 tags:
``` python
# grid search for hyperparameter C
gscv = GridSearchCV(clf_MLR,
{'C': Cs},
cv=cv_nest)
acc = pd.DataFrame()
for scaling in scalings:
# generate inputs
X, y = gen_inputs(m, s0, s1, scaling=scaling)
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# Grid search to find best classifier
gscv.fit(X[train_ind,:], y[train_ind])
clf = gscv.best_estimator_
# Train and test best classifiers with subject labels
clf.fit(X[train_ind,:], y[train_ind])
d = {'scaling': [scaling],
'type': ['test'],
'log C': [np.log10(gscv.best_params_['C'])],
'score': [clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# shuffling
train_ind_rand = np.random.permutation(train_ind)
clf.fit(X[train_ind,:], y[train_ind_rand])
d = {'scaling': [scaling],
'type': ['shuf'],
'log C': [np.log10(gscv.best_params_['C'])],
'score': [clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
```
%% Cell type:code id:a92c13eb tags:
``` python
plt.figure()
sb.violinplot(data=acc, x='scaling', y='score', hue='type', hue_order=['shuf','test'], split=True,
palette=['gray','orange'], density_norm='width')
plt.plot([-1,4], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.legend(loc='lower right')
plt.tight_layout()
plt.title('test-shuf accuracies')
acc2 = acc[acc['type']=='test']
plt.figure()
sb.violinplot(data=acc2, x='scaling', y='log C', hue='type',
palette=['orange'], density_norm='width')
plt.legend(loc='upper right')
plt.tight_layout()
plt.title('best C (log$_{10}$ scale)')
plt.show()
```
......
%% Cell type:markdown id:2628a7ec tags:
# 3) Comparing classifiers
author: Mat Gilson, https://github.com/MatthieuGilson, https://etulab.univ-amu.fr/gilson.m/compneuro_course/
This notebook compares different classifiers for a synthetic dataset. It includes nested cross-validation to optimize hyperparameters.
See also the documentation of scikit-learn library (https://scikit-learn.org/)
%% Cell type:code id:ec9bb8b2 tags:
``` python
# import librairies
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
font = {'family' : 'DejaVu Sans',
'weight' : 'regular',
'size' : 18}
mpl.rc('font', **font)
```
%% Cell type:markdown id:5e78fcf9 tags:
Let's generate a dataset for classification, including labels.
%% Cell type:code id:aef4882d tags:
``` python
# create synthetic dataset where 2 classes of s0+s1 samples of m-dimensional inputs with controlled contrast
def gen_inputs(m, # input dimensionality
s0, # number of samples for class 0
s1, # number of samples for class 1
scaling): # scaling factor to separate classes
# labels
lbl = np.zeros([s0+s1], dtype=int)
# inputs
X = np.zeros([s0+s1,m])
# create s0 and s1 samples for the 2 classes
for i in range(s0+s1):
# label
lbl[i] = int(i<s0)
# inputs are random noise plus a shift
for j in range(m):
# positive/negative shift for 1st/2nd class, only for indices j larger than m/2
if j>=m/2:
if i<s0:
a = -scaling
else:
a = scaling
else:
a = 0.0
# the shift linearly depends on the feature index j
X[i,j] = a*j/m + np.random.randn()
return X, lbl
```
%% Cell type:code id:4d4bade5 tags:
``` python
# input properties
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
scaling = 0.7 # class contrast (try 1.0, 0.5, 0.1)
# generate inputs
X, y = gen_inputs(m, s0, s1, scaling)
```
%% Cell type:markdown id:45051d6f tags:
## Comparison of classifiers
We want to compare the accuracy for different classifiers:
- MLR is the multinomial logistic regression
- 1NN is the 1-nearest-neighbor
- RF is a random forest
- QDA is a quadratic discriminant analysis
%% Cell type:code id:5ffeb39f tags:
``` python
# classifiers
clf_MLR = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=500)
clf_1NN = KNeighborsClassifier(n_neighbors=1, algorithm='brute', metric='minkowski')
clf_RF = RandomForestClassifier(n_estimators=10)
clf_QDA = QuadraticDiscriminantAnalysis()
dict_clf = {'MLR': clf_MLR,
'1NN': clf_1NN,
'RF': clf_RF,
'QDA': clf_QDA}
# dictionary of hyperparameters and ranges to explore (grid search)
dict_params = {'C': [0.01,0.1,1.0,10.0,100.0], # C for regularization
'n_neighbors': [1,2,3,4,5], # k nearest neighbors
'criterion': ['gini', 'entropy'], # splitting quality eval
'reg_param': [0.01,0.1,1.0]} # regularization param
```
%% Cell type:markdown id:a5ed9e3c tags:
We use cross-validation with the evaluation of test accuracy, as well as nested cross-validation to optimize hyperparameters.
%% Cell type:code id:08581b70 tags:
``` python
# outer cross-validation scheme: 80% for training and 20% for testing
cvs = StratifiedShuffleSplit(n_splits=20, test_size=0.2)
```
%% Cell type:code id:7065006a tags:
``` python
# nested cross-validation scheme
n_split_nest = 5
cv_nest = StratifiedKFold(n_splits=n_split_nest)
```
%% Cell type:code id:cc99137f tags:
``` python
# to store result accuracies
acc = pd.DataFrame()
# loop over classifiers
for i, (k, param_name) in enumerate(zip(dict_clf,dict_params)):
# get classifier and related information
clf = dict_clf[k]
param_vals = dict_params[param_name]
# print(param_name, param_vals)
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# grid search for hyperparameter C
gscv = GridSearchCV(clf,
{param_name: param_vals},
cv=cv_nest)
gscv.fit(X[train_ind,:], y[train_ind])
best_clf = gscv.best_estimator_
print(gscv.best_params_)
# train and test best classifier with subject labels
best_clf.fit(X[train_ind,:], y[train_ind])
# train and test accuracies
d = {'type': ['train'],
'clf': [k],
'score': [best_clf.score(X[train_ind,:], y[train_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
d = {'type': ['test'],
'clf': [k],
'score': [best_clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# shuffling labels and fit again for evaluation of chance level
train_ind_rand = np.random.permutation(train_ind)
best_clf.fit(X[train_ind,:], y[train_ind_rand])
d = {'type': ['shuf'],
'clf': [k],
'score': [best_clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# print table of results
print()
print(acc)
```
%% Cell type:code id:ebeab02a tags:
``` python
# plot: train versus test accuracy
acc2 = acc[np.logical_or(acc['type']=='train',acc['type']=='test')]
# theoretical chance level
chance_level = 0.5
sb.violinplot(data=acc2, x='clf', y='score', hue='type', split=True,
density_norm='width', palette=['brown','orange'])
plt.plot([-1,4], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.legend(loc='lower right', fontsize=11)
plt.tight_layout()
plt.show()
```
%% Cell type:code id:77102885 tags:
``` python
# test versus baseline accuracy
acc2 = acc[np.logical_or(acc['type']=='shuf',acc['type']=='test')]
sb.violinplot(data=acc2, x='clf', y='score', hue='type', hue_order=['shuf','test'],
split=True, density_norm='width', palette=['gray','orange'])
plt.plot([-1,4], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.legend(loc='lower right', fontsize=11)
plt.tight_layout()
plt.show()
```
......
%% Cell type:markdown id:00cde4c7 tags:
author: Mat Gilson, https://etulab.univ-amu.fr/gilson.m/compneuro_course/
# 4) Recursive feature elimination (RFE)
This notebook shows the use of RFE to identify informative features for the classification
See also the documentation of scikit-learn library (https://scikit-learn.org/)
%% Cell type:code id:8c9c8432 tags:
``` python
# import librairies
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import RFE
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
font = {'family' : 'DejaVu Sans',
'weight' : 'regular',
'size' : 18}
mpl.rc('font', **font)
```
%% Cell type:markdown id:0d12a052 tags:
## Generate samples to classify
We generate synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds to `m` features.
%% Cell type:code id:4673d31d tags:
``` python
# create synthetic dataset where 2 classes of s0+s1 samples of m-dimensional inputs with controlled contrast
def gen_inputs(m, # input dimensionality
s0, # number of samples for class 0
s1, # number of samples for class 1
scaling): # scaling factor to separate classes
# labels
lbl = np.zeros([s0+s1], dtype=int)
# inputs
X = np.zeros([s0+s1,m])
# create s0 and s1 samples for the 2 classes
for i in range(s0+s1):
# label
lbl[i] = int(i<s0)
# inputs are random noise plus a shift
for j in range(m):
# positive/negative shift for 1st/2nd class
if i<s0:
a = -scaling
else:
a = scaling
# the shift across classes linearly depends on the feature index j
X[i,j] = a*j/m + np.random.randn()
return X, lbl
```
%% Cell type:markdown id:9d0e394b-eb3e-4140-99e7-9d6123960711 tags:
## Toy example
How to interpret the trained weights?
%% Cell type:code id:029ab1fa tags:
``` python
# generate inputs and labels
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.7)
```
%% Cell type:code id:0a45c553-630d-4d95-94d5-6735e9fe0ee2 tags:
``` python
# create matrix rgb
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
mat_rgb = np.clip(mat_rgb,0.0,1.0)
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:code id:d712a2fb-86b6-46df-a869-bc05475e79d2 tags:
``` python
# train classifier
clf = LogisticRegression()
clf.fit(X,y)
# plot weights
plt.figure()
plt.plot(range(m), clf.coef_.flatten())
plt.xlabel('input index')
plt.ylabel('weight')
plt.show()
```
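Side note: the weights of a linear classifier are only directly comparable across inputs when the inputs share a common scale. A minimal sketch of a common workaround, standardizing the inputs before fitting (this is what the `StandardScaler` step does in the pipelines used below):
``` python
# z-score each input feature, then the fitted weights are comparable across features
X_std = StandardScaler().fit_transform(X)
clf_std = LogisticRegression().fit(X_std, y)
print(np.round(clf_std.coef_.flatten(), 2))
```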
%% Cell type:code id:bc00e090-8aad-4bf5-9e7a-29a92e7e50fe tags:
``` python
# copy X and rescale an input
X1 = np.copy(X)
i = 8
X1[:,i] *= 10
```
%% Cell type:code id:2777f72d-b304-4e12-ae53-5e0df531d85c tags:
``` python
# plot
plt.figure(figsize=[6,7])
plt.subplot(211)
plt.hist(X[:s0,i], histtype='step', bins=40, color='r')
plt.hist(X[s0:,i], histtype='step', bins=40, color='b')
plt.legend(['class 0', 'class 1'], fontsize=10)
plt.title('input {} before'.format(i), loc='left')
plt.subplot(212)
plt.hist(X1[:s0,i], histtype='step', bins=40, color='r')
plt.hist(X1[s0:,i], histtype='step', bins=40, color='b')
plt.legend(['class 0', 'class 1'], fontsize=10)
plt.title('input {} after'.format(i), loc='left')
plt.tight_layout()
```
%% Cell type:code id:56bb23cb-598a-4164-97f9-af9c1300eaf8 tags:
``` python
clf1 = LogisticRegression()
clf1.fit(X1,y)
plt.figure()
plt.plot(range(m), clf.coef_.flatten(), label='orig')
plt.plot(range(m), clf1.coef_.flatten()/np.std(X,axis=0), label='rescaled')
plt.xlabel('input index')
plt.ylabel('weight')
plt.legend(fontsize=10)
plt.show()
```
%% Cell type:markdown id:341418f1-2c2b-4e40-8711-1278e2af2948 tags:
## Using the RFE object
%% Cell type:code id:8d55d9cb-1f2a-4da6-b443-9713f75f6cf5 tags:
``` python
# generate inputs and labels
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.7)
```
%% Cell type:markdown id:56e3cdf7-e3ca-43bb-89a1-e68dae9627dc tags:
Here the inputs are modified compared to the previous notebooks, with a gradient of contrast that increases with the feature index $j$.
%% Cell type:code id:1f5113c4-b6c5-462f-84f0-5ba38872d420 tags:
``` python
# create matrix rgb
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
mat_rgb = np.clip(mat_rgb,0.0,1.0)
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:code id:d355c4af-54a7-4668-8181-f69f9a9b466e tags:
``` python
# classifier in pipeline
clf = Pipeline([('scl',StandardScaler()),
('mlr',LogisticRegression(C=1.0))])
# number of repetitions
n_rep = 1 # change for 10
# outer cross-validation scheme
cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
```
%% Cell type:code id:4fe3c14c-06ae-4cea-91ff-50b56dbe2f07 tags:
``` python
# check names of parameters for pipeline (for grid search)
print(clf.get_params())
# quick fix to get coefficients from mlr estimator in pipeline
def get_coef(clf_pipeline):
return clf_pipeline['mlr'].coef_
```
%% Cell type:code id:876b966b-5222-452e-b643-58d7ab638f87 tags:
``` python
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# wrap classifier to be fitted to data to calculate the ranking
feature_select = RFE(clf, n_features_to_select=1, step=1,
importance_getter=get_coef)
# perform RFE
feature_select.fit(X[train_ind,:], y[train_ind])
print(feature_select.ranking_)
```
%% Cell type:markdown id:e044c2ed-1cf9-4337-ab91-41a6c239b7b0 tags:
Exercises:
- test the variability of the ranking across train-test splits
- change the granularity of the output ranking with `n_features_to_select` and `step` (see the sketch after this list)
- modify the scaling for the input generation to see how the ranking changes.
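For example, a minimal sketch for the second exercise (hypothetical parameter values): with `step=2`, two features are eliminated per iteration, and the elimination stops once `n_features_to_select=5` features remain, so the ranking becomes coarser.
``` python
# coarser RFE: eliminate 2 features per iteration, stop at 5 selected features
feature_select = RFE(clf, n_features_to_select=5, step=2, importance_getter=get_coef)
feature_select.fit(X, y)
print(feature_select.ranking_) # selected features have rank 1, eliminated ones get higher ranks
```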
%% Cell type:markdown id:c26064a9-17a9-44d6-9c66-b85676c94fc8 tags:
## Back to general case and combining all options for the classification
%% Cell type:code id:cc559465-2c51-4e62-9b40-d683b91f1eac tags:
``` python
# generate inputs and labels
m = 50 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.5)
```
%% Cell type:markdown id:6febb8bf-a89c-4c1d-8383-feffd43f98c2 tags:
Here the inputs are modified compared to the previous notebooks, with a gradient of contrast that increases with the feature index $j$.
%% Cell type:code id:edb43b31-62f0-4d0f-98d8-ac3e3c062405 tags:
``` python
# create matrix rgb
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
mat_rgb = np.clip(mat_rgb,0.0,1.0)
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:markdown id:0024c6aa tags:
We then build a pipeline for the classifier. The outer cross-validation corresponds to the train-test splitting as before.
The inner cross-validation corresponds to the optimization of the hyperparameter `C` of the classifier (logistic regression in the pipeline).
%% Cell type:code id:6f2a4e24 tags:
``` python
# hyperparameter for regularization
Cs = [0.01,0.1,1.0,10.0,100.0]
# classifier in pipeline
clf = Pipeline([('scl',StandardScaler()),
('mlr',LogisticRegression())])
# number of repetitions and storage of results
n_rep = 10
# outer cross-validation scheme
cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
# inner cross-validation scheme
cv_nest = StratifiedKFold(n_splits=3)
```
%% Cell type:code id:90de0aa7 tags:
``` python
# quick fix to get coefficients from mlr estimator in pipeline
def get_coef(clf_pipeline):
return clf_pipeline['mlr'].coef_
```
%% Cell type:markdown id:e3f096d7 tags:
We use `GridSearchCV` to optimize the hyperparameter, then use the best classifier pipeline on the test set and perform recursive feature elimination (RFE) to identify informative features that contribute to the correct classification. The latter gives a ranking where low ranks correspond to informative features.
%% Cell type:code id:07cdf8f2 tags:
``` python
# grid search for hyperparameter C
gscv = GridSearchCV(clf,
{'mlr__C': Cs},
cv=cv_nest)
df_rfe = pd.DataFrame()
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# optimize hyperparameter
gscv.fit(X[train_ind,:], y[train_ind])
clf_best = gscv.best_estimator_
best_C = gscv.best_params_['mlr__C']
# wrap classifier to be fitted to data to calculate the ranking
feature_select = RFE(clf_best, n_features_to_select=1, step=1,
importance_getter=get_coef)
# train and test classifier
clf_best.fit(X[train_ind,:], y[train_ind])
score = clf_best.score(X[test_ind,:], y[test_ind])
# perform RFE
feature_select.fit(X[train_ind,:], y[train_ind])
ranking = feature_select.ranking_
# store results
d = {'type': ['test'],
'log C': [np.log10(best_C)],
'score': [score],
'ranking': [ranking]}
df_rfe = pd.concat((df_rfe, pd.DataFrame(data=d)), ignore_index=True)
# shuffling
train_ind_rand = np.random.permutation(train_ind)
clf_best.fit(X[train_ind,:], y[train_ind_rand])
score = clf_best.score(X[test_ind,:], y[test_ind])
# perform RFE
feature_select.fit(X[train_ind,:], y[train_ind_rand])
ranking = feature_select.ranking_
# store results
d = {'type': ['shuf'],
'log C': [np.log10(best_C)],
'score': [score],
'ranking': [ranking]}
df_rfe = pd.concat((df_rfe, pd.DataFrame(data=d)), ignore_index=True)
```
%% Cell type:markdown id:2702e677 tags:
Plot the results for accuracy and best parameters.
%% Cell type:code id:03237341 tags:
``` python
chance_level = 0.5
plt.figure()
sb.violinplot(data=df_rfe, y='score', x='type', hue='type',
palette=['brown','orange'], density_norm='width')
plt.plot([-1,2], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.tight_layout()
plt.title('train-test accuracies')
acc = df_rfe[df_rfe['type']=='test']
plt.figure()
sb.violinplot(data=acc, y='log C', x='type', hue='type',
palette=['orange'], density_norm='width')
plt.tight_layout()
plt.title('best C (log$_{10}$ scale)')
plt.show()
```
%% Cell type:markdown id:8ef0a315 tags:
Plot the ranking obtained from RFE.
Recall that the inputs are generated such that "the shift across classes linearly depends on the feature index". This means that inputs with large index should have low (informative) ranking.
%% Cell type:code id:9b67aa9b-97ee-46bd-aee6-50c1cbdf3a36 tags:
``` python
# reshape dataframe to extract ranking for each input index
rkg = pd.DataFrame()
for i in range(df_rfe.shape[0]):
r = df_rfe.iloc[i]
for j in range(m):
d_tmp = {'type': r['type'],
'input index': [j],
'ranking': r['ranking'][j]}
rkg = pd.concat((rkg, pd.DataFrame(d_tmp)), ignore_index=True)
plt.figure()
sb.lineplot(data=rkg, x='input index', y='ranking', hue='type')
plt.legend(fontsize=12)
plt.tight_layout()
plt.title('ranking of informative features')
plt.show()
```
%% Cell type:markdown id:9468ad24-29f5-4688-92cb-4821854051ef tags:
The ranking estimated from classification can be compared to the "ground truth", which is in theory the order of input indices. Note that the randomness in the generation of the inputs adds some noise that affects the precision of the estimated ranking.
The ranking for the shuffle case also indicates a level of randomness of the ranking, reflecting the number of inputs, the number of samples, etc. From this we can evaluate a baseline variability for the ranking (e.g. via the standard deviation).
%% Cell type:code id:6ee2f8f7-3f55-41bd-b991-fefed23fe20f tags:
``` python
sel_test = df_rfe['type']=='test'
print(np.argsort(df_rfe[sel_test]['ranking'].mean()))
```
......
name: sktime
channels:
- conda-forge
dependencies:
- jupyterlab
- sktime
- matplotlib