From ca1c83610794d8659d91fc1067f7c9a10daf5896 Mon Sep 17 00:00:00 2001
From: MatthieuGilson <matthieu.gilson@univ-amu.fr>
Date: Mon, 26 Feb 2024 09:53:24 +0100
Subject: [PATCH] update suplearn4

---
 sup_lrn/nb_suplearn4_feature_selection.ipynb | 288 +++++++++++++++++--
 1 file changed, 269 insertions(+), 19 deletions(-)

diff --git a/sup_lrn/nb_suplearn4_feature_selection.ipynb b/sup_lrn/nb_suplearn4_feature_selection.ipynb
index bae4b3c..f106e2a 100644
--- a/sup_lrn/nb_suplearn4_feature_selection.ipynb
+++ b/sup_lrn/nb_suplearn4_feature_selection.ipynb
@@ -87,6 +87,16 @@
     "    return X, lbl"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "9d0e394b-eb3e-4140-99e7-9d6123960711",
+   "metadata": {},
+   "source": [
+    "## Toy example\n",
+    "\n",
+    "How to interpret the trained weights?"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -95,10 +105,126 @@
    "outputs": [],
    "source": [
     "# generate inputs and labels\n",
-    "m = 50 # input dimensionality\n",
+    "m = 10 # input dimensionality\n",
     "s0 = 100 # number of samples for class 0\n",
     "s1 = 100 # number of samples for class 1\n",
-    "X, y = gen_inputs(m, s0, s1, scaling=0.5) # try 0.2"
+    "X, y = gen_inputs(m, s0, s1, scaling=0.7)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a45c553-630d-4d95-94d5-6735e9fe0ee2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create matrix rgb\n",
+    "vbins = np.linspace(-3,3,30) # bins for istograms\n",
+    "mat_rgb = np.zeros([m, vbins.size-1, 3])\n",
+    "for i in range(m):\n",
+    "    mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]\n",
+    "    mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]\n",
+    "mat_rgb /= mat_rgb.max() / 2.0\n",
+    "\n",
+    "plt.figure(figsize=[6,7])\n",
+    "plt.imshow(mat_rgb)\n",
+    "plt.xlabel('values')\n",
+    "plt.ylabel('input index')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d712a2fb-86b6-46df-a869-bc05475e79d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train classifier\n",
+    "clf = LogisticRegression()\n",
+    "clf.fit(X,y)\n",
+    "\n",
+    "# plot weights\n",
+    "plt.figure()\n",
+    "plt.plot(range(m), clf.coef_.flatten())\n",
+    "plt.xlabel('input index')\n",
+    "plt.ylabel('weight')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc00e090-8aad-4bf5-9e7a-29a92e7e50fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# copy X and rescale an input\n",
+    "X1 = np.copy(X)\n",
+    "i = 8\n",
+    "X1[:,i] *= 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2777f72d-b304-4e12-ae53-5e0df531d85c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot\n",
+    "plt.figure(figsize=[6,7])\n",
+    "plt.subplot(211)\n",
+    "plt.hist(X[:s0,i], histtype='step', bins=40, color='r')\n",
+    "plt.hist(X[s0:,i], histtype='step', bins=40, color='b')\n",
+    "plt.legend(['class 0', 'class 1'], fontsize=10)\n",
+    "plt.title('input {} before'.format(i), loc='left')\n",
+    "plt.subplot(212)\n",
+    "plt.hist(X1[:s0,i], histtype='step', bins=40, color='r')\n",
+    "plt.hist(X1[s0:,i], histtype='step', bins=40, color='b')\n",
+    "plt.legend(['class 0', 'class 1'], fontsize=10)\n",
+    "plt.title('input {} after'.format(i), loc='left')\n",
+    "plt.tight_layout()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56bb23cb-598a-4164-97f9-af9c1300eaf8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clf1 = LogisticRegression()\n",
+    "clf1.fit(X1,y)\n",
+    "\n",
+    "plt.figure()\n",
+    "plt.plot(range(m), clf.coef_.flatten(), label='orig')\n",
+    "plt.plot(range(m), clf1.coef_.flatten()/np.std(X,axis=0), label='rescaled')\n",
+    "plt.xlabel('input index')\n",
+    "plt.ylabel('weight')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "341418f1-2c2b-4e40-8711-1278e2af2948",
+   "metadata": {},
+   "source": [
+    "## Using the RFE object"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d55d9cb-1f2a-4da6-b443-9713f75f6cf5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# generate inputs and labels\n",
+    "m = 10 # input dimensionality\n",
+    "s0 = 100 # number of samples for class 0\n",
+    "s1 = 100 # number of samples for class 1\n",
+    "X, y = gen_inputs(m, s0, s1, scaling=0.7)"
    ]
   },
   {
@@ -113,6 +239,123 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "1f5113c4-b6c5-462f-84f0-5ba38872d420",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# create matrix rgb\n",
+    "vbins = np.linspace(-3,3,30) # bins for istograms\n",
+    "mat_rgb = np.zeros([m, vbins.size-1, 3])\n",
+    "for i in range(m):\n",
+    "    mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]\n",
+    "    mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]\n",
+    "mat_rgb /= mat_rgb.max() / 2.0\n",
+    "\n",
+    "plt.figure(figsize=[6,7])\n",
+    "plt.imshow(mat_rgb)\n",
+    "plt.xlabel('values')\n",
+    "plt.ylabel('input index')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d355c4af-54a7-4668-8181-f69f9a9b466e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# classifier in pipeline\n",
+    "clf = Pipeline([('scl',StandardScaler()),\n",
+    "                ('mlr',LogisticRegression(C=1.0))])\n",
+    "\n",
+    "# number of repetitions \n",
+    "n_rep = 1 # change for 10\n",
+    "\n",
+    "# outer cross-validation scheme\n",
+    "cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4fe3c14c-06ae-4cea-91ff-50b56dbe2f07",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# check names of parameters for pipeline (for grid search)\n",
+    "print(clf.get_params())\n",
+    "\n",
+    "# quick fix to get coefficients from mlr estimator in pipeline\n",
+    "def get_coef(clf_pipeline):\n",
+    "    return clf_pipeline['mlr'].coef_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "876b966b-5222-452e-b643-58d7ab638f87",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# repeat classification\n",
+    "for train_ind, test_ind in cvs.split(X, y):\n",
+    "    \n",
+    "    # wrap classifier to be fitted to data to calculate the ranking\n",
+    "    feature_select = RFE(clf, n_features_to_select=1, step=1,\n",
+    "                         importance_getter=get_coef)\n",
+    "    \n",
+    "    # perform RFE\n",
+    "    feature_select.fit(X[train_ind,:], y[train_ind])\n",
+    "    print(feature_select.ranking_)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e044c2ed-1cf9-4337-ab91-41a6c239b7b0",
+   "metadata": {},
+   "source": [
+    "Exercises:\n",
+    "- test the variability of the ranking across train-test splits\n",
+    "- change the granularity of the output ranking with `n_features_to_select` and `step`\n",
+    "- modify the scaling for the input generation to see how the ranking changes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c26064a9-17a9-44d6-9c66-b85676c94fc8",
+   "metadata": {},
+   "source": [
+    "## Back to general case and combining all options for the classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc559465-2c51-4e62-9b40-d683b91f1eac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# generate inputs and labels\n",
+    "m = 50 # input dimensionality\n",
+    "s0 = 100 # number of samples for class 0\n",
+    "s1 = 100 # number of samples for class 1\n",
+    "X, y = gen_inputs(m, s0, s1, scaling=0.5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6febb8bf-a89c-4c1d-8383-feffd43f98c2",
+   "metadata": {},
+   "source": [
+    "Here the inputs are modified compared to the previous notebooks, with a gradient of contrast that increases with the feature index $j$."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "edb43b31-62f0-4d0f-98d8-ac3e3c062405",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -136,8 +379,6 @@
    "id": "0024c6aa",
    "metadata": {},
    "source": [
-    "## Parameterization of classifier\n",
-    "\n",
     "We then build a pipeline for the classifier. The outer cross-validation corresponds to the train-test splitting as before.\n",
     "\n",
     "The inner crosss-validation corresponds to the optimization of the hyperparameter `C` of the classifier (logistic regression in the pipeline)."
@@ -153,7 +394,7 @@
     "# hyperparameter for regularization\n",
     "Cs = [0.01,0.1,1.0,10.0,100.0]\n",
     "\n",
-    "# classifier in pipeline and wrapper for RFE\n",
+    "# classifier in pipeline\n",
     "clf = Pipeline([('scl',StandardScaler()),\n",
     "                ('mlr',LogisticRegression())])\n",
     "\n",
@@ -174,9 +415,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# check names of parameters for pipeline (for grid search)\n",
-    "print(clf.get_params())\n",
-    "\n",
     "# quick fix to get coefficients from mlr estimator in pipeline\n",
     "def get_coef(clf_pipeline):\n",
     "    return clf_pipeline['mlr'].coef_"
@@ -187,8 +425,6 @@
    "id": "e3f096d7",
    "metadata": {},
    "source": [
-    "## Optimization involving the tuning of hyperparameter\n",
-    "\n",
     "We use `GridSearchCV` to optimize the hyperparameter, the use the best classifier pipeline on the test set and perform recursive feature elimination (RFE) to identify informative features that contribute to the correct classification. The latter gives a ranking where low ranks correspond to informative features."
    ]
   },
@@ -230,7 +466,7 @@
     "         'score': [score],\n",
     "         'ranking': [ranking]}\n",
     "    acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)\n",
-    "    \n",
+    "\n",
     "    # shuffling\n",
     "    train_ind_rand = np.random.permutation(train_ind)\n",
     "\n",
@@ -265,8 +501,8 @@
     "chance_level = 0.5\n",
     "\n",
     "plt.figure()\n",
-    "sb.violinplot(data=acc, y='score', x='type', \n",
-    "              palette=['brown','orange'], scale='width')\n",
+    "sb.violinplot(data=acc, y='score', x='type', hue='type',\n",
+    "              palette=['brown','orange'], density_norm='width')\n",
     "plt.plot([-1,2], [chance_level]*2, '--k')\n",
     "plt.yticks([0,1])\n",
     "plt.ylabel('accuracy')\n",
@@ -277,8 +513,8 @@
     "acc2 = acc[acc['type']=='test']\n",
     "\n",
     "plt.figure()\n",
-    "sb.violinplot(data=acc2, y='log C', x='type', \n",
-    "              palette=['orange'], scale='width')\n",
+    "sb.violinplot(data=acc2, y='log C', x='type', hue='type', \n",
+    "              palette=['orange'], density_norm='width')\n",
     "plt.tight_layout()\n",
     "plt.title('best C (log$_{10}$ scale)')\n",
     "\n",
@@ -302,8 +538,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "sel_test = acc['type']=='test'\n",
+    "sel_shuf = acc['type']=='shuf'\n",
+    "\n",
     "plt.figure()\n",
-    "plt.plot(acc['ranking'].mean())\n",
+    "plt.plot(acc[sel_test]['ranking'].mean(), label='test')\n",
+    "plt.plot(acc[sel_shuf]['ranking'].mean(), label='shuf')\n",
     "plt.xlabel('input index')\n",
     "plt.ylabel('mean ranking across CV splits')\n",
     "plt.tight_layout()\n",
@@ -314,10 +554,20 @@
   },
   {
    "cell_type": "markdown",
-   "id": "bcf0b3dc",
+   "id": "9468ad24-29f5-4688-92cb-4821854051ef",
+   "metadata": {},
+   "source": [
+    "The ranking estimated from classification can be compared to the \"ground truth\", which is in theory the order of input indices. Note that the randomness in the generation of the inputs adds some noise that affects the precision of the estimated ranking."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ee2f8f7-3f55-41bd-b991-fefed23fe20f",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "Modify the scaling for the input generation to see how the ranking changes."
+    "print(np.argsort(acc[sel_test]['ranking'].mean()))"
    ]
   }
  ],
@@ -337,7 +587,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.7"
   }
  },
  "nbformat": 4,
-- 
GitLab