%% Cell type:markdown id:00cde4c7 tags:
# 4) Recursive feature elimination (RFE)
This notebook shows the use of RFE to identify the features that are informative for the classification. RFE works by recursively fitting the classifier, ranking the features by the magnitude of their trained weights, and eliminating the weakest ones until the desired number of features remains.
See also the documentation of the scikit-learn library (https://scikit-learn.org/).
%% Cell type:code id:8c9c8432 tags:
``` python
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import RFE
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
font = {'family' : 'DejaVu Sans',
        'weight' : 'regular',
        'size'   : 18}
mpl.rc('font', **font)
```
%% Cell type:markdown id:0d12a052 tags:
## Generate samples to classify
We first generate synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds to `m` features.
%% Cell type:code id:4673d31d tags:
``` python
# create a synthetic dataset with 2 classes (s0 and s1 samples) of m-dimensional inputs with controlled contrast
def gen_inputs(m,        # input dimensionality
               s0,       # number of samples for class 0
               s1,       # number of samples for class 1
               scaling): # scaling factor to separate classes
    # labels
    lbl = np.zeros([s0+s1], dtype=int)
    # inputs
    X = np.zeros([s0+s1,m])
    # create s0 and s1 samples for the 2 classes
    for i in range(s0+s1):
        # label: 0 for the first s0 samples, 1 for the remaining s1 samples
        lbl[i] = int(i>=s0)
        # inputs are random noise plus a shift
        for j in range(m):
            # negative/positive shift for the 1st/2nd class
            if i<s0:
                a = -scaling
            else:
                a = scaling
            # the shift across classes linearly depends on the feature index j
            X[i,j] = a*j/m + np.random.randn()
    return X, lbl
```
%% Cell type:markdown id:9d0e394b-eb3e-4140-99e7-9d6123960711 tags:
## Toy example
How should the trained weights be interpreted? The logistic regression bases its decision on the weighted sum of the inputs, so the magnitude of a weight is only meaningful relative to the scale of the corresponding input, as illustrated below.
%% Cell type:code id:029ab1fa tags:
``` python
# generate inputs and labels
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.7)
```
%% Cell type:code id:0a45c553-630d-4d95-94d5-6735e9fe0ee2 tags:
``` python
# create RGB matrix of per-input histograms (red: class 0, blue: class 1)
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
    mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
    mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:code id:d712a2fb-86b6-46df-a869-bc05475e79d2 tags:
``` python
# train classifier
clf = LogisticRegression()
clf.fit(X,y)
# plot weights
plt.figure()
plt.plot(range(m), clf.coef_.flatten())
plt.xlabel('input index')
plt.ylabel('weight')
plt.show()
```
%% Cell type:code id:bc00e090-8aad-4bf5-9e7a-29a92e7e50fe tags:
``` python
# copy X and rescale an input
X1 = np.copy(X)
i = 8
X1[:,i] *= 10
```
%% Cell type:code id:2777f72d-b304-4e12-ae53-5e0df531d85c tags:
``` python
# plot
plt.figure(figsize=[6,7])
plt.subplot(211)
plt.hist(X[:s0,i], histtype='step', bins=40, color='r')
plt.hist(X[s0:,i], histtype='step', bins=40, color='b')
plt.legend(['class 0', 'class 1'], fontsize=10)
plt.title('input {} before'.format(i), loc='left')
plt.subplot(212)
plt.hist(X1[:s0,i], histtype='step', bins=40, color='r')
plt.hist(X1[s0:,i], histtype='step', bins=40, color='b')
plt.legend(['class 0', 'class 1'], fontsize=10)
plt.title('input {} after'.format(i), loc='left')
plt.tight_layout()
```
%% Cell type:code id:56bb23cb-598a-4164-97f9-af9c1300eaf8 tags:
``` python
clf1 = LogisticRegression()
clf1.fit(X1,y)
plt.figure()
plt.plot(range(m), clf.coef_.flatten(), label='orig')
# multiply the weights by the input standard deviations to make them comparable across scalings
plt.plot(range(m), clf1.coef_.flatten()*np.std(X1,axis=0), label='rescaled')
plt.xlabel('input index')
plt.ylabel('weight')
plt.legend(fontsize=10)
plt.show()
```
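%% Cell type:markdown tags:
A standard remedy is to standardize the inputs before fitting, so that the trained weights become comparable across features regardless of their original scales. Below is a minimal sketch of this check (an addition for illustration, not part of the original notebook; it reuses `make_pipeline` from the imports above): the weights fitted on the rescaled data `X1` should roughly match the original ones.
%% Cell type:code tags:
``` python
# fit the same classifier on the rescaled data, but with standardization first
clf_std = make_pipeline(StandardScaler(), LogisticRegression())
clf_std.fit(X1, y)
plt.figure()
plt.plot(range(m), clf_std[-1].coef_.flatten(), label='scaler + logreg on X1')
plt.xlabel('input index')
plt.ylabel('weight')
plt.legend(fontsize=10)
plt.show()
```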
%% Cell type:markdown id:341418f1-2c2b-4e40-8711-1278e2af2948 tags:
## Using the RFE object
%% Cell type:code id:8d55d9cb-1f2a-4da6-b443-9713f75f6cf5 tags:
``` python
# generate inputs and labels
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.7)
```
%% Cell type:markdown id:56e3cdf7-e3ca-43bb-89a1-e68dae9627dc tags:
Here the inputs are modified compared to the previous notebooks, with a gradient of contrast that increases with the feature index $j$.
%% Cell type:code id:1f5113c4-b6c5-462f-84f0-5ba38872d420 tags:
``` python
# create RGB matrix of per-input histograms (red: class 0, blue: class 1)
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
    mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
    mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:code id:d355c4af-54a7-4668-8181-f69f9a9b466e tags:
``` python
# classifier in pipeline
clf = Pipeline([('scl',StandardScaler()),
                ('mlr',LogisticRegression(C=1.0))])
# number of repetitions
n_rep = 1 # change to 10 to assess the variability across splits
# outer cross-validation scheme
cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
```
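%% Cell type:markdown tags:
As a quick sanity check (an addition for illustration, not in the original notebook), the stratified splits preserve the class proportions in both the train and test sets:
%% Cell type:code tags:
``` python
# verify the class balance of the stratified train-test split(s)
for train_ind, test_ind in cvs.split(X, y):
    print('train:', np.bincount(y[train_ind]), ' test:', np.bincount(y[test_ind]))
```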
%% Cell type:code id:4fe3c14c-06ae-4cea-91ff-50b56dbe2f07 tags:
``` python
# check names of parameters for pipeline (for grid search)
print(clf.get_params())
# quick fix to get coefficients from mlr estimator in pipeline
def get_coef(clf_pipeline):
    return clf_pipeline['mlr'].coef_
```
%% Cell type:code id:876b966b-5222-452e-b643-58d7ab638f87 tags:
``` python
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
    # wrap classifier to be fitted to the train data to calculate the ranking
    feature_select = RFE(clf, n_features_to_select=1, step=1,
                         importance_getter=get_coef)
    # perform RFE
    feature_select.fit(X[train_ind,:], y[train_ind])
    print(feature_select.ranking_)
```
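%% Cell type:markdown tags:
To make explicit what `RFE` computes, here is a minimal sketch of the elimination loop (a hypothetical re-implementation for illustration, not the scikit-learn code): the classifier is refitted repeatedly and, at each step, the feature with the smallest absolute weight is eliminated; features eliminated early receive high (uninformative) ranks. Note that comparing weight magnitudes is fair here because the pipeline standardizes the inputs.
%% Cell type:code tags:
``` python
# minimal sketch of the RFE mechanism (illustration only)
def manual_rfe(estimator, X, y):
    remaining = list(range(X.shape[1]))    # candidate features
    ranking = np.zeros(X.shape[1], dtype=int)
    rank = X.shape[1]                      # the first eliminated feature gets the highest rank
    while len(remaining) > 1:
        estimator.fit(X[:, remaining], y)
        coef = get_coef(estimator).flatten()
        weakest = remaining[np.argmin(np.abs(coef))]
        ranking[weakest] = rank
        remaining.remove(weakest)
        rank -= 1
    ranking[remaining[0]] = 1              # the last surviving feature is the most informative
    return ranking

print(manual_rfe(clf, X[train_ind,:], y[train_ind]))
```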
%% Cell type:markdown id:e044c2ed-1cf9-4337-ab91-41a6c239b7b0 tags:
Exercises:
- test the variability of the ranking across train-test splits (a starting sketch is given after this list)
- change the granularity of the output ranking with `n_features_to_select` and `step`
- modify the scaling for the input generation to see how the ranking changes.
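%% Cell type:markdown tags:
A possible starting point for the first exercise (a sketch, assuming 10 splits are sufficient to see the variability):
%% Cell type:code tags:
``` python
# collect the RFE ranking over several train-test splits and inspect its spread
rankings = []
for train_ind, test_ind in StratifiedShuffleSplit(n_splits=10, test_size=0.2).split(X, y):
    fs = RFE(clf, n_features_to_select=1, step=1, importance_getter=get_coef)
    fs.fit(X[train_ind,:], y[train_ind])
    rankings.append(fs.ranking_)
rankings = np.array(rankings)
print('mean ranking per feature:', rankings.mean(axis=0))
print('std of ranking per feature:', rankings.std(axis=0))
```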
%% Cell type:markdown id:c26064a9-17a9-44d6-9c66-b85676c94fc8 tags:
## Back to general case and combining all options for the classification
%% Cell type:code id:cc559465-2c51-4e62-9b40-d683b91f1eac tags:
``` python
# generate inputs and labels
m = 50 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.5)
```
%% Cell type:markdown id:6febb8bf-a89c-4c1d-8383-feffd43f98c2 tags:
Here the inputs are modified compared to the previous notebooks, with a gradient of contrast that increases with the feature index $j$.
%% Cell type:code id:edb43b31-62f0-4d0f-98d8-ac3e3c062405 tags:
``` python
# create RGB matrix of per-input histograms (red: class 0, blue: class 1)
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
    mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
    mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:markdown id:0024c6aa tags:
We then build a pipeline for the classifier. The outer cross-validation corresponds to the train-test splitting as before.
The inner cross-validation corresponds to the optimization of the hyperparameter `C` of the classifier (the logistic regression in the pipeline).
%% Cell type:code id:6f2a4e24 tags:
``` python
# hyperparameter for regularization
Cs = [0.01,0.1,1.0,10.0,100.0]
# classifier in pipeline
clf = Pipeline([('scl',StandardScaler()),
                ('mlr',LogisticRegression())])
# number of repetitions and storage of results
n_rep = 10
# outer cross-validation scheme
cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
# inner cross-validation scheme
cv_nest = StratifiedKFold(n_splits=3)
```
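%% Cell type:markdown tags:
In `GridSearchCV`, a hyperparameter of an estimator inside a pipeline is addressed as `<step name>__<parameter name>`; the regularization parameter of the logistic regression step `mlr` is thus `mlr__C`. A quick check of the exposed names (an added convenience, equivalent to printing `clf.get_params()`):
%% Cell type:code tags:
``` python
# list the parameter names exposed by the pipeline for the grid search
print([name for name in clf.get_params().keys() if name.startswith('mlr__')])
```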
%% Cell type:code id:90de0aa7 tags:
``` python
# quick fix to get coefficients from mlr estimator in pipeline
def get_coef(clf_pipeline):
    return clf_pipeline['mlr'].coef_
```
%% Cell type:markdown id:e3f096d7 tags:
## Optimization involving the tuning of the hyperparameter
We use `GridSearchCV` to optimize the hyperparameter, then use the best classifier pipeline on the test set and perform recursive feature elimination (RFE) to identify the informative features that contribute to the correct classification. The latter gives a ranking where low ranks correspond to informative features.
%% Cell type:code id:07cdf8f2 tags:
``` python
# grid search for hyperparameter C
gscv = GridSearchCV(clf,
                    {'mlr__C': Cs},
                    cv=cv_nest)
acc = pd.DataFrame(columns=['type', 'log C', 'score', 'ranking'])
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
    # optimize hyperparameter
    gscv.fit(X[train_ind,:], y[train_ind])
    clf_best = gscv.best_estimator_
    best_C = gscv.best_params_['mlr__C']
    # wrap classifier to be fitted to data to calculate the ranking
    feature_select = RFE(clf_best, n_features_to_select=1, step=1,
                         importance_getter=get_coef)
    # train and test classifier
    clf_best.fit(X[train_ind,:], y[train_ind])
    score = clf_best.score(X[test_ind,:], y[test_ind])
    # perform RFE
    feature_select.fit(X[train_ind,:], y[train_ind])
    ranking = feature_select.ranking_
    # store results
    d = {'type': ['test'],
         'log C': [np.log10(best_C)],
         'score': [score],
         'ranking': [ranking]}
    acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
    # shuffled labels as chance-level baseline
    train_ind_rand = np.random.permutation(train_ind)
    clf_best.fit(X[train_ind,:], y[train_ind_rand])
    score = clf_best.score(X[test_ind,:], y[test_ind])
    # perform RFE
    feature_select.fit(X[train_ind,:], y[train_ind_rand])
    ranking = feature_select.ranking_
    # store results
    d = {'type': ['shuf'],
         'log C': [np.log10(best_C)],
         'score': [score],
         'ranking': [ranking]}
    acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
```
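%% Cell type:markdown tags:
Before plotting, a quick numerical summary of the accuracies can be printed (a convenience check added here, not in the original notebook):
%% Cell type:code tags:
``` python
# compare the test accuracy against the shuffled-label baseline
for t in ['test', 'shuf']:
    scores = acc[acc['type']==t]['score'].astype(float)
    print('{}: mean accuracy = {:.3f}, std = {:.3f}'.format(t, scores.mean(), scores.std()))
```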
%% Cell type:markdown id:2702e677 tags:
Plot the results for the accuracy and the best hyperparameter `C`.
%% Cell type:code id:03237341 tags:
``` python
chance_level = 0.5
plt.figure()
sb.violinplot(data=acc, y='score', x='type', hue='type',
              palette=['brown','orange'], density_norm='width')
plt.plot([-1,2], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.tight_layout()
plt.title('train-test accuracies')
acc2 = acc[acc['type']=='test']
plt.figure()
sb.violinplot(data=acc2, y='log C', x='type', hue='type',
              palette=['orange'], density_norm='width')
plt.tight_layout()
plt.title('best C (log$_{10}$ scale)')
plt.show()
```
%% Cell type:markdown id:8ef0a315 tags:
Plot the ranking obtained from RFE.
Recall that the inputs are generated such that "the shift across classes linearly depends on the feature index". This means that inputs with a large index should have a low (informative) ranking.
%% Cell type:code id:e96d2aa6 tags:
``` python
sel_test = acc['type']=='test'
sel_shuf = acc['type']=='shuf'
plt.figure()
plt.plot(acc[sel_test]['ranking'].mean(), label='test')
plt.plot(acc[sel_shuf]['ranking'].mean(), label='shuf')
plt.xlabel('input index')
plt.ylabel('mean ranking across CV splits')
plt.legend(fontsize=10)
plt.tight_layout()
plt.title('ranking of informative features')
plt.show()
```
%% Cell type:markdown id:9468ad24-29f5-4688-92cb-4821854051ef tags:
The ranking estimated from the classification can be compared to the "ground truth", which is in theory the order of the input indices. Note that the randomness in the generation of the inputs adds some noise that affects the precision of the estimated ranking.
%% Cell type:code id:6ee2f8f7-3f55-41bd-b991-fefed23fe20f tags:
``` python
# feature indices sorted from most informative (lowest mean rank) to least informative
print(np.argsort(acc[sel_test]['ranking'].mean()))
```
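%% Cell type:markdown tags:
One way to quantify the match with the ground truth is a rank correlation between the mean RFE ranking and the ideal ranking (the feature with the largest index being the most informative, i.e. rank 1). This is a sketch added for illustration, assuming `scipy` is installed:
%% Cell type:code tags:
``` python
from scipy.stats import spearmanr
# ideal ranking: feature m-1 gets rank 1 (most informative), feature 0 gets rank m
ideal_ranking = np.arange(m, 0, -1)
rho, pval = spearmanr(acc[sel_test]['ranking'].mean(), ideal_ranking)
print('Spearman correlation with ground truth: rho = {:.2f} (p = {:.2g})'.format(rho, pval))
```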