From 4087c14b1d28f743c61f4111fc01d611ee8f24a0 Mon Sep 17 00:00:00 2001
From: MatthieuGilson <matthieu.gilson@univ-amu.fr>
Date: Mon, 15 Apr 2024 08:21:30 +0200
Subject: [PATCH] update suplearn nb

---
 .../nb_suplearn_exercise.ipynb        |  0
 sup_lrn/nb3_compare_classifiers.ipynb |  2 +-
 sup_lrn/nb4_feature_selection.ipynb   | 64 ++++++++++++++++---
 3 files changed, 56 insertions(+), 10 deletions(-)
 rename sup_lrn/{ => exercises}/nb_suplearn_exercise.ipynb (100%)

diff --git a/sup_lrn/nb_suplearn_exercise.ipynb b/sup_lrn/exercises/nb_suplearn_exercise.ipynb
similarity index 100%
rename from sup_lrn/nb_suplearn_exercise.ipynb
rename to sup_lrn/exercises/nb_suplearn_exercise.ipynb
diff --git a/sup_lrn/nb3_compare_classifiers.ipynb b/sup_lrn/nb3_compare_classifiers.ipynb
index a95db3d..6f3200e 100644
--- a/sup_lrn/nb3_compare_classifiers.ipynb
+++ b/sup_lrn/nb3_compare_classifiers.ipynb
@@ -51,7 +51,7 @@
    "id": "5e78fcf9",
    "metadata": {},
    "source": [
-    "Let's generate a dataset for classification, including labels."
+    "As in the previous notebook, let's generate a synthetic dataset for classification, including labels."
    ]
   },
   {
diff --git a/sup_lrn/nb4_feature_selection.ipynb b/sup_lrn/nb4_feature_selection.ipynb
index 4529727..dbd80ca 100644
--- a/sup_lrn/nb4_feature_selection.ipynb
+++ b/sup_lrn/nb4_feature_selection.ipynb
@@ -51,7 +51,7 @@
    "source": [
     "## Generate samples to classify\n",
     "\n",
-    "We generate synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds `m` features. "
+    "We generate synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds to `m` features. Compared to the previous notebook, the contrast between the two classes depends linearly on the input index: no contrast for index $j = 0$ up to maximum contrast (as defined by `scaling`) for $j = m-1$. In this way, inputs have various levels of \"information\" (in the general sense) that can be extracted to discriminate the two classes."
    ]
   },
   {
@@ -152,7 +152,33 @@
     "plt.plot(range(m), clf.coef_.flatten())\n",
     "plt.xlabel('input index')\n",
     "plt.ylabel('weight')\n",
-    "plt.show()"
+    "\n",
+    "# check the training accuracy of classification using a single feature as input\n",
+    "clf_indiv = LogisticRegression()\n",
+    "perf_indiv = []\n",
+    "for j in range(m):\n",
+    "    # retain only a single feature\n",
+    "    X_indiv = X[:,j].reshape([s0+s1,1])\n",
+    "    # train the classifier and get the training accuracy\n",
+    "    clf_indiv.fit(X_indiv,y)\n",
+    "    perf_indiv.append(clf_indiv.score(X_indiv,y))\n",
+    "\n",
+    "plt.figure()\n",
+    "plt.scatter(np.abs(clf.coef_).flatten(), perf_indiv)\n",
+    "plt.xlabel('abs weight')\n",
+    "plt.ylabel('perf single feature')\n",
+    "\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6809b05c-0144-4f81-82b0-c3852de682dd",
+   "metadata": {},
+   "source": [
+    "The absolute value of the trained weights seems informative about the importance of each input for the classification: here, a larger index corresponds to more contrast between the two classes (as also shown by the classification using each input dimension individually).\n",
+    "\n",
+    "Now let's transform the input without changing the contrast across the two classes, by simply rescaling one of the features."
    ]
   },
   {
@@ -162,7 +188,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# copy X and rescale an input\n",
+    "# copy X and rescale input 8\n",
     "X1 = np.copy(X)\n",
     "i = 8\n",
     "X1[:,i] *= 10"
    ]
   },
@@ -193,21 +219,41 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "56bb23cb-598a-4164-97f9-af9c1300eaf8",
+   "id": "1568758c-2a45-4ca8-a751-394da7a7f089",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# train a classifier on transformed inputs\n",
     "clf1 = LogisticRegression()\n",
     "clf1.fit(X1,y)\n",
     "\n",
     "plt.figure()\n",
-    "plt.plot(range(m), clf.coef_.flatten(), label='orig')\n",
-    "plt.plot(range(m), clf1.coef_.flatten()/np.std(X,axis=0), label='rescaled')\n",
+    "plt.plot(range(m), np.std(X1,axis=0))\n",
     "plt.xlabel('input index')\n",
-    "plt.ylabel('weight')\n",
+    "plt.ylabel('input std')\n",
+    "\n",
+    "# comparison of the absolute weights, with a correction related to the standard deviation of each feature\n",
+    "plt.figure()\n",
+    "plt.plot(range(m), np.abs(clf.coef_).flatten(), label='orig')\n",
+    "plt.plot(range(m), np.abs(clf1.coef_).flatten(), label='not corrected', ls='-.')\n",
+    "plt.plot(range(m), np.abs(clf1.coef_).flatten()*np.std(X1,axis=0), label='corrected')\n",
+    "plt.xlabel('input index')\n",
+    "plt.ylabel('abs weight')\n",
+    "plt.legend()\n",
+    "\n",
     "plt.show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "181f1dd6-b5e5-4955-9bbf-30baaeac485f",
+   "metadata": {},
+   "source": [
+    "This shows that the trained weights carry information about how informative the corresponding input features are, but the distribution of the input values also matters (here quantified by the standard deviation). This can be understood from the multiplication of the inputs by the weights in the logistic regression, where rescaling an input is exactly compensated by the inverse rescaling of the corresponding weight:\n",
+    "$$ y = \phi[\sum_j w_j x_j] = \phi[\cdots + (\alpha w_j) \frac{x_j}{\alpha} + \cdots ] $$\n",
+    "This also explains why a standard scaler is often used with such linear models, so that all features are in a comparable range."
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "341418f1-2c2b-4e40-8711-1278e2af2948",
@@ -312,7 +358,7 @@
     "    \n",
     "    # perform RFE\n",
     "    feature_select.fit(X[train_ind,:], y[train_ind])\n",
-    "    print(feature_select.ranking_)\n"
+    "    print(feature_select.ranking_)"
    ]
   },
   {
@@ -600,7 +646,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.7"
+   "version": "3.10.9"
   }
  },
  "nbformat": 4,
-- 
GitLab
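
The scale-invariance argument made in the new markdown cell of nb4_feature_selection.ipynb (a rescaling of an input can be absorbed by the corresponding weight) can be checked numerically. The sketch below is illustrative only and not part of the commit; the toy data generator and the chosen values (s0, s1, m, scaling, the rescaled index 8 and factor 10) are assumptions picked to mirror the notebook, not its actual code.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
s0 = s1 = 100      # samples per class (assumed values)
m = 10             # number of input features
scaling = 2.0      # maximum class contrast, reached at feature index m-1

# class contrast grows linearly with the feature index, as described in the notebook
X = rng.normal(size=(s0 + s1, m))
y = np.repeat([0, 1], [s0, s1])
X += np.outer(y, np.linspace(0.0, scaling, m))

# rescale feature i by a factor alpha (the commit uses i = 8 and a factor of 10)
alpha, i = 10.0, 8
X1 = X.copy()
X1[:, i] *= alpha

clf = LogisticRegression(max_iter=1000).fit(X, y)
clf1 = LogisticRegression(max_iter=1000).fit(X1, y)

# the raw |weight| of feature i shrinks by roughly alpha after rescaling, but the
# std-corrected values |w_j| * std(x_j) remain comparable across the two fits
# (the compensation is exact only without regularization; sklearn applies L2 by default)
print(np.round(np.abs(clf.coef_.flatten()) * X.std(axis=0), 2))
print(np.round(np.abs(clf1.coef_.flatten()) * X1.std(axis=0), 2))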