%% Cell type:markdown id:00cde4c7 tags:
# 4) Recursive feature elimination (RFE)
This notebook shows the use of RFE to identify the features that are informative for the classification. RFE works by recursively fitting the classifier, ranking the features by the magnitude of their trained weights, and eliminating the weakest ones until the desired number of features remains.
See also the documentation of the scikit-learn library (https://scikit-learn.org/).
%% Cell type:code id:8c9c8432 tags:
``` python
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import RFE
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
font = {'family' : 'DejaVu Sans',
        'weight' : 'regular',
        'size'   : 18}
mpl.rc('font', **font)
```
%% Cell type:markdown id:0d12a052 tags:
## Generate samples to classify
We first generate synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds to `m` features.
%% Cell type:code id:4673d31d tags:
``` python
# create a synthetic dataset with 2 classes (s0 and s1 samples) of m-dimensional inputs with controlled contrast
def gen_inputs(m,        # input dimensionality
               s0,       # number of samples for class 0
               s1,       # number of samples for class 1
               scaling): # scaling factor to separate classes
    # labels
    lbl = np.zeros([s0+s1], dtype=int)
    # inputs
    X = np.zeros([s0+s1,m])
    # create s0 and s1 samples for the 2 classes
    for i in range(s0+s1):
        # label: 0 for the first s0 samples, 1 for the remaining s1 samples
        lbl[i] = int(i>=s0)
        # inputs are random noise plus a shift
        for j in range(m):
            # negative/positive shift for the 1st/2nd class
            if i<s0:
                a = -scaling
            else:
                a = scaling
            # the shift across classes linearly depends on the feature index j
            X[i,j] = a*j/m + np.random.randn()
    return X, lbl
```
%% Cell type:markdown id:9d0e394b-eb3e-4140-99e7-9d6123960711 tags:
## Toy example
How should the trained weights be interpreted? The logistic regression bases its decision on the weighted sum of the inputs, so the magnitude of a weight is only meaningful relative to the scale of the corresponding input, as illustrated below.
%% Cell type:code id:029ab1fa tags:
``` python
# generate inputs and labels
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.7)
```
%% Cell type:code id:0a45c553-630d-4d95-94d5-6735e9fe0ee2 tags:
``` python
# create RGB matrix of per-input histograms (red: class 0, blue: class 1)
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
    mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
    mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:code id:d712a2fb-86b6-46df-a869-bc05475e79d2 tags:
``` python
# train classifier
clf = LogisticRegression()
clf.fit(X,y)
# plot weights
plt.figure()
plt.plot(range(m), clf.coef_.flatten())
plt.xlabel('input index')
plt.ylabel('weight')
plt.show()
```
%% Cell type:code id:bc00e090-8aad-4bf5-9e7a-29a92e7e50fe tags:
``` python
# copy X and rescale an input
X1 = np.copy(X)
i = 8
X1[:,i] *= 10
```
%% Cell type:code id:2777f72d-b304-4e12-ae53-5e0df531d85c tags:
``` python
# plot
plt.figure(figsize=[6,7])
plt.subplot(211)
plt.hist(X[:s0,i], histtype='step', bins=40, color='r')
plt.hist(X[s0:,i], histtype='step', bins=40, color='b')
plt.legend(['class 0', 'class 1'], fontsize=10)
plt.title('input {} before'.format(i), loc='left')
plt.subplot(212)
plt.hist(X1[:s0,i], histtype='step', bins=40, color='r')
plt.hist(X1[s0:,i], histtype='step', bins=40, color='b')
plt.legend(['class 0', 'class 1'], fontsize=10)
plt.title('input {} after'.format(i), loc='left')
plt.tight_layout()
```
%% Cell type:code id:56bb23cb-598a-4164-97f9-af9c1300eaf8 tags:
``` python
clf1 = LogisticRegression()
clf1.fit(X1,y)
plt.figure()
plt.plot(range(m), clf.coef_.flatten(), label='orig')
# multiply the weights by the input standard deviations to make them comparable across scalings
plt.plot(range(m), clf1.coef_.flatten()*np.std(X1,axis=0), label='rescaled')
plt.xlabel('input index')
plt.ylabel('weight')
plt.legend(fontsize=10)
plt.show()
```
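%% Cell type:markdown tags:
A standard remedy is to standardize the inputs before fitting, so that the trained weights become comparable across features regardless of their original scales. Below is a minimal sketch of this check (an addition for illustration, not part of the original notebook; it reuses `make_pipeline` from the imports above): the weights fitted on the rescaled data `X1` should roughly match the original ones.
%% Cell type:code tags:
``` python
# fit the same classifier on the rescaled data, but with standardization first
clf_std = make_pipeline(StandardScaler(), LogisticRegression())
clf_std.fit(X1, y)
plt.figure()
plt.plot(range(m), clf_std[-1].coef_.flatten(), label='scaler + logreg on X1')
plt.xlabel('input index')
plt.ylabel('weight')
plt.legend(fontsize=10)
plt.show()
```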
%% Cell type:markdown id:341418f1-2c2b-4e40-8711-1278e2af2948 tags:
## Using the RFE object
%% Cell type:code id:8d55d9cb-1f2a-4da6-b443-9713f75f6cf5 tags:
``` python
# generate inputs and labels
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.7)
```
%% Cell type:markdown id:56e3cdf7-e3ca-43bb-89a1-e68dae9627dc tags:
Here the inputs are modified compared to the previous notebooks, with a gradient of contrast that increases with the feature index $j$.
%% Cell type:code id:1f5113c4-b6c5-462f-84f0-5ba38872d420 tags:
``` python
# create RGB matrix of per-input histograms (red: class 0, blue: class 1)
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
    mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
    mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:code id:d355c4af-54a7-4668-8181-f69f9a9b466e tags:
``` python
# classifier in pipeline
clf = Pipeline([('scl',StandardScaler()),
                ('mlr',LogisticRegression(C=1.0))])
# number of repetitions
n_rep = 1 # change to 10 to assess the variability across splits
# outer cross-validation scheme
cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
```
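%% Cell type:markdown tags:
As a quick sanity check (an addition for illustration, not in the original notebook), the stratified splits preserve the class proportions in both the train and test sets:
%% Cell type:code tags:
``` python
# verify the class balance of the stratified train-test split(s)
for train_ind, test_ind in cvs.split(X, y):
    print('train:', np.bincount(y[train_ind]), ' test:', np.bincount(y[test_ind]))
```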
%% Cell type:code id:4fe3c14c-06ae-4cea-91ff-50b56dbe2f07 tags:
``` python
# check names of parameters for pipeline (for grid search)
print(clf.get_params())
# quick fix to get coefficients from mlr estimator in pipeline
def get_coef(clf_pipeline):
    return clf_pipeline['mlr'].coef_
```
%% Cell type:code id:876b966b-5222-452e-b643-58d7ab638f87 tags:
``` python
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
    # wrap classifier to be fitted to the train data to calculate the ranking
    feature_select = RFE(clf, n_features_to_select=1, step=1,
                         importance_getter=get_coef)
    # perform RFE
    feature_select.fit(X[train_ind,:], y[train_ind])
    print(feature_select.ranking_)
```
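%% Cell type:markdown tags:
To make explicit what `RFE` computes, here is a minimal sketch of the elimination loop (a hypothetical re-implementation for illustration, not the scikit-learn code): the classifier is refitted repeatedly and, at each step, the feature with the smallest absolute weight is eliminated; features eliminated early receive high (uninformative) ranks. Note that comparing weight magnitudes is fair here because the pipeline standardizes the inputs.
%% Cell type:code tags:
``` python
# minimal sketch of the RFE mechanism (illustration only)
def manual_rfe(estimator, X, y):
    remaining = list(range(X.shape[1]))    # candidate features
    ranking = np.zeros(X.shape[1], dtype=int)
    rank = X.shape[1]                      # the first eliminated feature gets the highest rank
    while len(remaining) > 1:
        estimator.fit(X[:, remaining], y)
        coef = get_coef(estimator).flatten()
        weakest = remaining[np.argmin(np.abs(coef))]
        ranking[weakest] = rank
        remaining.remove(weakest)
        rank -= 1
    ranking[remaining[0]] = 1              # the last surviving feature is the most informative
    return ranking

print(manual_rfe(clf, X[train_ind,:], y[train_ind]))
```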
%% Cell type:markdown id:e044c2ed-1cf9-4337-ab91-41a6c239b7b0 tags:
Exercises:
- test the variability of the ranking across train-test splits (a starting sketch is given after this list)
- change the granularity of the output ranking with `n_features_to_select` and `step`
- modify the scaling for the input generation to see how the ranking changes.
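%% Cell type:markdown tags:
A possible starting point for the first exercise (a sketch, assuming 10 splits are sufficient to see the variability):
%% Cell type:code tags:
``` python
# collect the RFE ranking over several train-test splits and inspect its spread
rankings = []
for train_ind, test_ind in StratifiedShuffleSplit(n_splits=10, test_size=0.2).split(X, y):
    fs = RFE(clf, n_features_to_select=1, step=1, importance_getter=get_coef)
    fs.fit(X[train_ind,:], y[train_ind])
    rankings.append(fs.ranking_)
rankings = np.array(rankings)
print('mean ranking per feature:', rankings.mean(axis=0))
print('std of ranking per feature:', rankings.std(axis=0))
```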
%% Cell type:markdown id:c26064a9-17a9-44d6-9c66-b85676c94fc8 tags:
## Back to general case and combining all options for the classification
%% Cell type:code id:cc559465-2c51-4e62-9b40-d683b91f1eac tags:
``` python
# generate inputs and labels
m = 50 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.5)
```
%% Cell type:markdown id:6febb8bf-a89c-4c1d-8383-feffd43f98c2 tags:
Here the inputs are modified compared to the previous notebooks, with a gradient of contrast that increases with the feature index $j$.
%% Cell type:code id:edb43b31-62f0-4d0f-98d8-ac3e3c062405 tags:
``` python
# create RGB matrix of per-input histograms (red: class 0, blue: class 1)
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
    mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
    mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:markdown id:0024c6aa tags:
We then build a pipeline for the classifier. The outer cross-validation corresponds to the train-test splitting as before.
The inner cross-validation corresponds to the optimization of the hyperparameter `C` of the classifier (the logistic regression in the pipeline).
%% Cell type:code id:6f2a4e24 tags:
``` python
# hyperparameter for regularization
Cs = [0.01,0.1,1.0,10.0,100.0]
# classifier in pipeline
clf = Pipeline([('scl',StandardScaler()),
                ('mlr',LogisticRegression())])
# number of repetitions and storage of results
n_rep = 10
# outer cross-validation scheme
cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
# inner cross-validation scheme
cv_nest = StratifiedKFold(n_splits=3)
```
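%% Cell type:markdown tags:
In `GridSearchCV`, a hyperparameter of an estimator inside a pipeline is addressed as `<step name>__<parameter name>`; the regularization parameter of the logistic regression step `mlr` is thus `mlr__C`. A quick check of the exposed names (an added convenience, equivalent to printing `clf.get_params()`):
%% Cell type:code tags:
``` python
# list the parameter names exposed by the pipeline for the grid search
print([name for name in clf.get_params().keys() if name.startswith('mlr__')])
```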
%% Cell type:code id:90de0aa7 tags:
``` python
# quick fix to get coefficients from mlr estimator in pipeline
def get_coef(clf_pipeline):
    return clf_pipeline['mlr'].coef_
```
%% Cell type:markdown id:e3f096d7 tags:
## Optimization involving the tuning of the hyperparameter
We use `GridSearchCV` to optimize the hyperparameter, then use the best classifier pipeline on the test set and perform recursive feature elimination (RFE) to identify the informative features that contribute to the correct classification. The latter gives a ranking where low ranks correspond to informative features.
%% Cell type:code id:07cdf8f2 tags:
``` python
# grid search for hyperparameter C
gscv = GridSearchCV(clf,
                    {'mlr__C': Cs},
                    cv=cv_nest)
acc = pd.DataFrame(columns=['type', 'log C', 'score', 'ranking'])
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
    # optimize hyperparameter
    gscv.fit(X[train_ind,:], y[train_ind])
    clf_best = gscv.best_estimator_
    best_C = gscv.best_params_['mlr__C']
    # wrap classifier to be fitted to data to calculate the ranking
    feature_select = RFE(clf_best, n_features_to_select=1, step=1,
                         importance_getter=get_coef)
    # train and test classifier
    clf_best.fit(X[train_ind,:], y[train_ind])
    score = clf_best.score(X[test_ind,:], y[test_ind])
    # perform RFE
    feature_select.fit(X[train_ind,:], y[train_ind])
    ranking = feature_select.ranking_
    # store results
    d = {'type': ['test'],
         'log C': [np.log10(best_C)],
         'score': [score],
         'ranking': [ranking]}
    acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
    # shuffled labels as chance-level baseline
    train_ind_rand = np.random.permutation(train_ind)
    clf_best.fit(X[train_ind,:], y[train_ind_rand])
    score = clf_best.score(X[test_ind,:], y[test_ind])
    # perform RFE
    feature_select.fit(X[train_ind,:], y[train_ind_rand])
    ranking = feature_select.ranking_
    # store results
    d = {'type': ['shuf'],
         'log C': [np.log10(best_C)],
         'score': [score],
         'ranking': [ranking]}
    acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
```
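%% Cell type:markdown tags:
Before plotting, a quick numerical summary of the accuracies can be printed (a convenience check added here, not in the original notebook):
%% Cell type:code tags:
``` python
# compare the test accuracy against the shuffled-label baseline
for t in ['test', 'shuf']:
    scores = acc[acc['type']==t]['score'].astype(float)
    print('{}: mean accuracy = {:.3f}, std = {:.3f}'.format(t, scores.mean(), scores.std()))
```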
%% Cell type:markdown id:2702e677 tags:
Plot the results for the accuracy and the best hyperparameter `C`.
%% Cell type:code id:03237341 tags:
``` python
chance_level = 0.5
plt.figure()
sb.violinplot(data=acc, y='score', x='type', hue='type',
              palette=['brown','orange'], density_norm='width')
plt.plot([-1,2], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.tight_layout()
plt.title('train-test accuracies')
acc2 = acc[acc['type']=='test']
plt.figure()
sb.violinplot(data=acc2, y='log C', x='type', hue='type',
              palette=['orange'], density_norm='width')
plt.tight_layout()
plt.title('best C (log$_{10}$ scale)')
plt.show()
```
%% Cell type:markdown id:8ef0a315 tags:
Plot the ranking obtained from RFE.
Recall that the inputs are generated such that "the shift across classes linearly depends on the feature index". This means that inputs with a large index should have a low (informative) ranking.
%% Cell type:code id:e96d2aa6 tags:
``` python
sel_test = acc['type']=='test'
sel_shuf = acc['type']=='shuf'
plt.figure()
plt.plot(acc[sel_test]['ranking'].mean(), label='test')
plt.plot(acc[sel_shuf]['ranking'].mean(), label='shuf')
plt.xlabel('input index')
plt.ylabel('mean ranking across CV splits')
plt.legend(fontsize=10)
plt.tight_layout()
plt.title('ranking of informative features')
plt.show()
```
%% Cell type:markdown id:9468ad24-29f5-4688-92cb-4821854051ef tags:
The ranking estimated from the classification can be compared to the "ground truth", which is in theory the order of the input indices. Note that the randomness in the generation of the inputs adds some noise that affects the precision of the estimated ranking.
%% Cell type:code id:6ee2f8f7-3f55-41bd-b991-fefed23fe20f tags:
``` python
# feature indices sorted from most informative (lowest mean rank) to least informative
print(np.argsort(acc[sel_test]['ranking'].mean()))
```
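%% Cell type:markdown tags:
One way to quantify the match with the ground truth is a rank correlation between the mean RFE ranking and the ideal ranking (the feature with the largest index being the most informative, i.e. rank 1). This is a sketch added for illustration, assuming `scipy` is installed:
%% Cell type:code tags:
``` python
from scipy.stats import spearmanr
# ideal ranking: feature m-1 gets rank 1 (most informative), feature 0 gets rank m
ideal_ranking = np.arange(m, 0, -1)
rho, pval = spearmanr(acc[sel_test]['ranking'].mean(), ideal_ranking)
print('Spearman correlation with ground truth: rho = {:.2f} (p = {:.2g})'.format(rho, pval))
```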