Commit 9438ad1e authored by GILSON Matthieu

update sup_lrn

parent 06422a35
%% Cell type:markdown id:25dfd6d1 tags:
# 1) Cross-validation
author: Mat Gilson, https://github.com/MatthieuGilson, https://etulab.univ-amu.fr/gilson.m/compneuro_course/
This notebook shows a key concept at the core of all machine learning procedures. It aims to quantify the generalizability of a trained classifier to unseen data. Here we also see how to get a baseline reference in terms of accuracy.
See also the documentation of scikit-learn library (https://scikit-learn.org/)
%% Cell type:code id:707a0699 tags:
``` python
# import librairies
import numpy as np
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit, LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
font = {'family' : 'DejaVu Sans',
'weight' : 'regular',
'size' : 18}
mpl.rc('font', **font)
```
%% Cell type:markdown id:687e9515-5422-47f0-92d5-358b7d9aeed4 tags:
## Generate samples to classify
We first generate synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds to `m` features.
%% Cell type:code id:219520bc tags:
``` python
# create synthetic dataset where 2 classes of s0+s1 samples of m-dimensional inputs with controlled contrast
def gen_inputs(m, # input dimensionality
s0, # number of samples for class 0
s1, # number of samples for class 1
scaling): # scaling factor to separate classes
# labels
lbl = np.zeros([s0+s1], dtype=int)
# inputs
X = np.zeros([s0+s1,m])
# create s0 and s1 samples for the 2 classes
for i in range(s0+s1):
# label
lbl[i] = int(i<s0)
# inputs are random noise plus a shift
for j in range(m):
# positive/negative shift for 1st/2nd class, only for indices j larger than m/2
if j>=m/2:
if i<s0:
a = -scaling
else:
a = scaling
else:
a = 0.0
# the shift linearly depends on the feature index j
X[i,j] = a + np.random.randn()
return X, lbl
```
%% Cell type:markdown id:506698d5 tags:
Let's have a first look at the data. For features with indices smaller than $m/2$, there is no contrast between the two classes (in red and blue). For indices larger than $m/2$, the contrast is determined by the scaling parameter (separation between the two distributions). Each feature involves independent noise in the generation of the samples.
%% Cell type:code id:1bc5284f tags:
``` python
# input properties
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
scaling = 0.7 # class contrast (try 0.0)
# generate inputs
X, y = gen_inputs(m, s0, s1, scaling)
# bins for histograms
vbins = np.linspace(-3,3,30)
# plot
plt.figure(figsize=[6,7])
plt.subplot(411)
i = 0
plt.hist(X[:s0,i], histtype='step', bins=vbins, color='r')
plt.hist(X[s0:,i], histtype='step', bins=vbins, color='b')
plt.axis(xmin=-3, xmax=3)
plt.legend(['class 0', 'class 1'], fontsize=10)
plt.title('input {}'.format(i), loc='left')
plt.subplot(412)
i = int((m-1)*0.33)
plt.hist(X[:s0,i], histtype='step', bins=vbins, color='r')
plt.hist(X[s0:,i], histtype='step', bins=vbins, color='b')
plt.axis(xmin=-3, xmax=3)
plt.title('input {}'.format(i), loc='left')
plt.subplot(413)
i = int((m-1)*0.66)
plt.hist(X[:s0,i], histtype='step', bins=vbins, color='r')
plt.hist(X[s0:,i], histtype='step', bins=vbins, color='b')
plt.axis(xmin=-3, xmax=3)
plt.title('input {}'.format(i), loc='left')
plt.subplot(414)
i = m-1
plt.hist(X[:s0,i], histtype='step', bins=vbins, color='r')
plt.hist(X[s0:,i], histtype='step', bins=vbins, color='b')
plt.axis(xmin=-3, xmax=3)
plt.title('input {}'.format(i), loc='left')
plt.xlabel('X values')
plt.tight_layout()
plt.savefig('ex_contrast_X')
plt.show()
```
%% Cell type:code id:3b9a9c63 tags:
``` python
# create matrix rgb
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
mat_rgb = np.clip(mat_rgb, 0.0, 1.0)
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:markdown id:46880c12 tags:
Above, the purple pixels indicate that the distributions of the two classes overlap, whereas red and blue pixels indicate the dominance of one class.
Now let's see how to separate the 2 classes using a classifier.
%% Cell type:code id:d25b05e3 tags:
``` python
# Classifiers and learning parameters
clf = LogisticRegression(C=10000.0, penalty='l2', solver='lbfgs', max_iter=500)
```
%% Cell type:markdown id:845c6ded tags:
What is the reference "chance" level: $50\%$ for $2$ classes?
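A quick empirical reference can be obtained with scikit-learn's `DummyClassifier` (see the references at the end of this notebook), which ignores the input features; a minimal sketch on the data generated above, using the `stratified` strategy that predicts labels at random according to the class proportions:
``` python
from sklearn.dummy import DummyClassifier

# baseline that ignores X: predict labels at random following the class proportions
dummy = DummyClassifier(strategy='stratified')
dummy.fit(X, y)
print('dummy accuracy:', dummy.score(X, y)) # close to 0.5 for 2 balanced classes
```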
%% Cell type:code id:832441a2 tags:
``` python
acc = pd.DataFrame()
# repetitions
n_rep = 20
for i_rep in range(n_rep):
# generate data
X, y = gen_inputs(m, s0, s1, scaling)
# Train and test classifiers with subject labels
clf.fit(X, y)
# accuracy on train set
d = {'score': [clf.score(X, y)]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# plot
sb.violinplot(data=acc, y='score', density_norm='width', palette=['brown'])
plt.text(0, 1.05, str(acc['score'].mean())[:4], horizontalalignment='center')
plt.yticks([0,1])
plt.axis(ymax=1.02)
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.show()
```
%% Cell type:code id:d28622c6 tags:
``` python
# theoretical chance level
chance_level = 1.0 / 2
sb.violinplot(data=acc, y='score', density_norm='width', palette=['brown'])
plt.plot([-1,1], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.axis(ymax=1.02)
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.show()
```
%% Cell type:markdown id:ae55e427 tags:
We can play with the contrast between the 2 classes, for example a more difficult classification with lower contrast / separability.
%% Cell type:code id:5452ae09 tags:
``` python
acc = pd.DataFrame()
# change contrast
scaling = 0.0 # try 0.2, 0.1, 0.0
# repetitions
n_rep = 20
for i_rep in range(n_rep):
# generate data
X, y = gen_inputs(m, s0, s1, scaling)
# Train and test classifiers with subject labels
clf.fit(X, y)
# accuracy on train set
d = {'score': [clf.score(X, y)]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# plot
sb.violinplot(data=acc, y='score', density_norm='width', palette=['brown'])
plt.text(0, 1.05, str(acc['score'].mean())[:4], horizontalalignment='center')
plt.plot([-1,1], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.axis(ymax=1.02)
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.show()
```
%% Cell type:markdown id:b9d4a9a2 tags:
Even for `scaling=0.0`, the classification accuracy is above the expected chance level $0.5$...
Let's try with similar (i.e. with the same scaling) but newly generated data.
%% Cell type:code id:65df5e9c tags:
``` python
X_new, y_new = gen_inputs(m, s0, s1, 0.0) # also change the scaling to play with the code
print(clf.score(X_new, y_new))
```
%% Cell type:code id:cddebf1c tags:
``` python
acc = pd.DataFrame()
# input dimensionality (number of features)
m = 10 # try 5, 20
# class contrast
scaling = 0.7 # try 0.5, 0.0
# loop with training on a dataset and testing on a new dataset
for i_rep in range(n_rep):
# generate data
X, y = gen_inputs(m, s0, s1, scaling)
# train and calculate accuracy
clf.fit(X, y)
d = {'type': ['training'],
'score': [clf.score(X, y)]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# generate new data
X_new, y_new = gen_inputs(m, s0, s1, scaling)
# only test classifier that was trained on other data
d = {'type': ['new'],
'score': [clf.score(X_new, y_new)]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
sb.violinplot(data=acc, x='type', y='score', hue='type', density_norm='width', palette=['brown','orange'])
plt.text(0, 1.05, str(acc[acc['type']=='training']['score'].mean())[:4], horizontalalignment='center')
plt.text(1, 1.05, str(acc[acc['type']=='new']['score'].mean())[:4], horizontalalignment='center')
plt.plot([-1,2], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.axis(ymax=1.02)
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.show()
```
%% Cell type:markdown id:154c6878 tags:
The classifier tends to extract specific "information" from the data it is trained with, which corresponds to the notion of overfitting.
The "real" accuracy that should be taken into account is the accuracy for the new data, which quantifies the generalization capability of the classifier to new data from the same class.
%% Cell type:markdown id:3783c050 tags:
## Cross-validation scheme
The idea is to generalize the previous observation by splitting the data into a training set and a testing set, for a number of repetitions. The relevant result to report is the test accuracy.
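As a side note, this repeated train-test evaluation can be written compactly with scikit-learn's `cross_val_score`; a minimal sketch, assuming the classifier `clf` and the data `X`, `y` defined above:
``` python
from sklearn.model_selection import cross_val_score, ShuffleSplit

# test accuracy over 10 random 80%-20% train-test splits
scores = cross_val_score(clf, X, y, cv=ShuffleSplit(n_splits=10, test_size=0.2))
print('test accuracy: mean {:.2f}, std {:.2f}'.format(scores.mean(), scores.std()))
```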
%% Cell type:code id:5895003c tags:
``` python
# number of repetitions and storage of results
n_rep = 10
# Cross-validation scheme
cvs0 = ShuffleSplit(n_splits=n_rep, test_size=0.2)
```
%% Cell type:code id:3d69f322 tags:
``` python
# generate n_rep splits
ind_split = np.zeros([n_rep,s0+s1])
i_rep = 0
for train_ind, test_ind in cvs0.split(X, y):
ind_split[i_rep, test_ind] = 1
i_rep += 1
# calculate the size of the test set for each split
test_size = np.vstack((ind_split[:,:s0].sum(axis=1),
ind_split[:,s0:].sum(axis=1)))
plt.figure()
plt.subplot(121)
plt.imshow(ind_split, cmap='binary', interpolation='nearest', aspect=40)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.subplot(122)
plt.plot(test_size[0,::-1], np.arange(n_rep), 'b')
plt.plot(test_size[1,::-1], np.arange(n_rep), 'r')
plt.xlabel('test size per class')
plt.show()
```
%% Cell type:code id:3bd9e7e2 tags:
``` python
# wrapper to test cross-validation scheme (cvs)
def test_cvs(cvs, cvs_lbl):
acc = pd.DataFrame()
for train_ind, test_ind in cvs.split(X, y):
# train and test classifier
clf.fit(X[train_ind,:], y[train_ind])
# accuracy on train set
d = {'type': ['train'],
'cv': [cvs_lbl],
'score': [clf.score(X[train_ind,:], y[train_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# accuracy on test set
d = {'type': ['test'],
'cv': [cvs_lbl],
'score': [clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
return acc
```
%% Cell type:code id:e594d7e1 tags:
``` python
# generate a dataset of features and labels
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
scaling = 0.5 # class contrast (try 0.7)
X, y = gen_inputs(m, s0, s1, scaling)
# evaluate classification accuracy on train and test sets
acc = test_cvs(cvs0, 'no strat')
# theoretical chance level
chance_level = 0.5
sb.violinplot(data=acc, x='cv', y='score', hue='type', split=True, density_norm='width', palette=['brown','orange'])
plt.plot([-1,2], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.axis(ymax=1.02)
plt.show()
```
%% Cell type:markdown id:c462266b tags:
## Stratification
Let's consider a different splitting scheme that preserves the ratio of classes from the original dataset in each train-test split.
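To see what stratification changes in practice, here is a small sketch comparing the fraction of class-0 samples in the test sets produced by the two splitting schemes (using `np.bincount` on the test labels):
``` python
# fraction of class-0 samples in the test set, for 5 splits of each scheme
for name, splitter in [('no strat', ShuffleSplit(n_splits=5, test_size=0.2)),
                       ('strat', StratifiedShuffleSplit(n_splits=5, test_size=0.2))]:
    ratios = [np.bincount(y[test_ind])[0] / test_ind.size
              for _, test_ind in splitter.split(X, y)]
    print(name, np.round(ratios, 2))
```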
%% Cell type:code id:19850420 tags:
``` python
# stratified shuffle split: preserving ratio of classes in train-test sets
cvs1 = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
# generate n_rep splits
ind_split = np.zeros([n_rep,s0+s1])
i_rep = 0
for train_ind, test_ind in cvs1.split(X, y):
ind_split[i_rep, test_ind] = 1
i_rep += 1
# calculate the size of the test set for each split
test_size = np.vstack((ind_split[:,:s0].sum(axis=1),
ind_split[:,s0:].sum(axis=1)))
plt.figure()
plt.subplot(121)
plt.imshow(ind_split, cmap='binary', interpolation='nearest', aspect=60)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.subplot(122)
plt.plot(test_size[0,::-1], np.arange(n_rep), 'b')
plt.plot(test_size[1,::-1], np.arange(n_rep), 'r')
plt.xlabel('test size per class')
plt.show()
```
%% Cell type:code id:bd7a8cc4 tags:
``` python
# evaluate classification accuracy on train and test sets
acc = pd.concat((acc, test_cvs(cvs1, 'strat')))
sb.violinplot(data=acc, x='cv', y='score', hue='type', split=True, density_norm='width', palette=['brown','orange'])
plt.plot([-1,2], [chance_level]*2, '--k')
plt.legend(loc='lower center')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.axis(ymax=1.02)
plt.show()
```
%% Cell type:markdown id:04873ea0 tags:
# Baseline accuracy as reference
How to interpret the test accuracy?
%% Cell type:code id:c49c4763 tags:
``` python
# wrapper to test cross-validation scheme (cvs)
def test_cvs(cvs, cvs_lbl):
acc = pd.DataFrame()
for train_ind, test_ind in cvs.split(X, y):
# train and test classifiers
clf.fit(X[train_ind,:], y[train_ind])
# accuracy on test set
d = {'type': ['test'],
'cv': [cvs_lbl],
'score': [clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# shuffling
train_ind_rand = np.random.permutation(train_ind)
clf.fit(X[train_ind,:], y[train_ind_rand])
# accuracy on test set
d = {'type': ['shuf'],
'cv': [cvs_lbl],
'score': [clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
return acc
```
%% Cell type:code id:2e0f1248 tags:
``` python
acc = test_cvs(cvs0, 'no strat')
# theoretical chance level
chance_level = 0.5
sb.violinplot(data=acc, x='cv', y='score', hue='type', split=True, density_norm='width', palette=['orange','gray'])
plt.plot([-1,2], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.axis(ymax=1.02)
plt.show()
```
%% Cell type:markdown id:082f8d49 tags:
The accuracies of the shuffling surrogates are distributed around the expected chance level. This looks fine, but is all the variability coming from the classifier?
%% Cell type:markdown id:21edac79 tags:
# Unbalanced data
We now look into a different version of the dataset, a bit less artificial. We consider the case of unbalanced classes, with different numbers of samples in the two classes.
%% Cell type:code id:902d7b27 tags:
``` python
# generate inputs with unbalanced classes
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 300 # number of samples for class 1
scaling = 0.7 # class contrast
X, y = gen_inputs(m, s0, s1, scaling) # also change the scaling to play with the code
```
%% Cell type:code id:1d1629a0 tags:
``` python
acc = test_cvs(cvs0, 'no strat')
# theoretical chance level
naive_chance_level = 1.0 / 2 # equal probability for each category
greedy_chance_level = max(s0,s1) / (s0+s1) # always predict the larger class
sb.violinplot(data=acc, x='cv', y='score', hue='type', split=True, density_norm='width', palette=['orange','gray'])
plt.plot([-1,2], [naive_chance_level]*2, ':k')
plt.plot([-1,2], [greedy_chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.axis(ymax=1.02)
plt.show()
```
%% Cell type:markdown id:4f9a0bd6 tags:
The baseline accuracy has changed in this case... A "stupid" classifier that always predicts the larger class will perform above $50\%$ accuracy for unbalanced classes.
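This "greedy" baseline can be checked empirically with scikit-learn's `DummyClassifier` (a sketch on the unbalanced data generated above):
``` python
from sklearn.dummy import DummyClassifier

# baseline classifier that always predicts the most frequent class of the training data
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X, y)
print('greedy baseline accuracy:', dummy.score(X, y)) # s1/(s0+s1) = 0.75 here
```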
%% Cell type:code id:b662b586 tags:
``` python
# generate n_rep splits
ind_split = np.zeros([n_rep,s0+s1])
i_rep = 0
for train_ind, test_ind in cvs0.split(X, y):
ind_split[i_rep, test_ind] = 1
i_rep += 1
# calculate the size of the test set for each split
test_size = np.vstack((ind_split[:,:s0].sum(axis=1),
ind_split[:,s0:].sum(axis=1)))
plt.figure()
plt.subplot(121)
plt.imshow(ind_split, cmap='binary', interpolation='nearest', aspect=40)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.subplot(122)
plt.plot(test_size[0,::-1], np.arange(n_rep), 'b')
plt.plot(test_size[1,::-1], np.arange(n_rep), 'r')
plt.xlabel('test size per class')
plt.show()
```
%% Cell type:markdown id:0a26b002 tags:
Does the variability of the class ratio in the test set matter?
Let's compare with other possibilities for splitting the data into train-test sets.
%% Cell type:markdown id:9e9b65a9 tags:
## Leave-one-out scheme
Another popular scheme consists in leaving out one sample for testing, especially in the case of small datasets.
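A minimal sketch with `cross_val_score`: each leave-one-out split tests on a single sample, so the individual scores are 0 or 1 and only the average across splits is meaningful.
``` python
from sklearn.model_selection import cross_val_score

# each split tests on a single sample, so individual scores are 0 or 1
loo_scores = cross_val_score(clf, X, y, cv=LeaveOneOut())
print('LOO accuracy (mean over {} splits): {:.2f}'.format(loo_scores.size, loo_scores.mean()))
```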
%% Cell type:code id:005d13cf tags:
``` python
# leave-one-out scheme
cvs2 = LeaveOneOut()
# generate the splits (one per sample)
ind_split = np.zeros([s0+s1,s0+s1])
i_rep = 0
for train_ind, test_ind in cvs2.split(X, y):
ind_split[i_rep, test_ind] = 1
i_rep += 1
plt.figure()
plt.imshow(ind_split, cmap='binary', interpolation='nearest', aspect=1)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.show()
```
%% Cell type:markdown id:8f2ae9a7 tags:
## Comparison of cross-validation schemes
%% Cell type:code id:4790171d tags:
``` python
# list of cross-validation schemes
cvs_list = [cvs0, cvs1, cvs2]
# labels for cvs
cvs_lbl = ['no strat', 'strat', 'loo']
# accuracy
acc = pd.DataFrame()
# loop over cross-validation schemes
for lbl, cvs in zip(cvs_lbl, cvs_list):
acc_tmp = test_cvs(cvs, lbl)
acc = pd.concat((acc, acc_tmp))
# theoretical chance level
naive_chance_level = 1.0 / 2 # equal probability for each category
greedy_chance_level = max(s0,s1) / (s0+s1) # always predict the larger class
# calculate mean accuracy of the shuffled baseline for the leave-one-out scheme
sel_df = np.logical_and(acc['cv']=='loo', acc['type']=='shuf')
mean_small_test = acc[sel_df]['score'].mean()
# plot
sb.violinplot(data=acc, x='cv', y='score', hue='type', split=True, density_norm='width', palette=['orange','gray'], cut=0)
plt.plot([-1,3], [naive_chance_level]*2, ':k')
plt.plot([-1,3], [greedy_chance_level]*2, '--k')
plt.plot([2.5], [mean_small_test], marker='x', markersize=16, color='k', ls='')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.legend(loc='lower left')
plt.tight_layout()
plt.savefig('acc_cv')
plt.show()
```
%% Cell type:markdown id:5e25aa63 tags:
## Unpacking classification accuracy across classes
%% Cell type:code id:6ea00c4e tags:
``` python
# matrix of predicted class versus true class in test set
cm = np.zeros([2,2])
# repeat classification
for train_ind, test_ind in cvs1.split(X, y):
clf.fit(X[train_ind,:], y[train_ind])
cm += confusion_matrix(y_true=y[test_ind],
y_pred=clf.predict(X[test_ind,:]))
plt.figure()
plt.imshow(cm, cmap='jet', vmin=0)
plt.colorbar(label='sample count')
plt.xticks([0,1], ['class 0','class 1'])
plt.yticks([0,1], ['class 0','class 1'])
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.tight_layout()
plt.show()
```
%% Cell type:markdown id:35a5cbd1 tags:
## Conclusion
We see that the leave-one-out (loo) only makes sense when averaging the results (see the cross).
The non-stratified splitting inflates the distribution of chance-level accuracy.
A good default (for sufficient data) is $80\%$-$20\%$ split for the train-test sets, repeated $100$ times.
Importantly, this approach is *conservative*: the risk of finding "information" by chance where there is none (akin to a "false positive") can be controlled by comparing the classifier accuracy with the chance-level accuracy, in particular its spread. In contrast, mistakes in the procedure tend to yield a large variability in the accuracy distributions, with an absence of positive outcome. In other words, when you find an effect, you can trust it, but you will likely have to try different classifiers to get the best accuracy. This is in fact not a problem, as it can inform on the structure of the data (where the information related to the predicted label lies).
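For the chance-level evaluation, scikit-learn also provides `permutation_test_score`, which repeats the label shuffling and returns an empirical p-value; a minimal sketch, assuming the classifier `clf`, the data `X`, `y` and the stratified scheme `cvs1` defined above:
``` python
from sklearn.model_selection import permutation_test_score

# compare the cross-validated score against scores obtained with permuted labels
score, perm_scores, pvalue = permutation_test_score(clf, X, y, cv=cvs1, n_permutations=100)
print('score = {:.2f}, p-value vs shuffled labels = {:.3f}'.format(score, pvalue))
```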
*Refs:*
- cross-validation: https://scikit-learn.org/stable/modules/cross_validation.html
- chance-level evaluation: https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
......
%% Cell type:markdown id:00cde4c7 tags:
author: Mat Gilson, https://etulab.univ-amu.fr/gilson.m/compneuro_course/
# 2) Nested cross-validation
This notebook shows the use of cross-validation to tune hyperparameters for classifiers
See also the documentation of scikit-learn library (https://scikit-learn.org/)
%% Cell type:code id:8c9c8432 tags:
``` python
# import librairies
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
font = {'family' : 'DejaVu Sans',
'weight' : 'regular',
'size' : 18}
mpl.rc('font', **font)
```
%% Cell type:markdown id:0d12a052 tags:
## Generate samples to classify
As before, we use the same synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds to `m` features.
%% Cell type:code id:4673d31d tags:
``` python
# create synthetic dataset where 2 classes of s0+s1 samples of m-dimensional inputs with controlled contrast
def gen_inputs(m, # input dimensionality
s0, # number of samples for class 0
s1, # number of samples for class 1
scaling): # scaling factor to separate classes
# labels
lbl = np.zeros([s0+s1], dtype=int)
# inputs
X = np.zeros([s0+s1,m])
# create s0 and s1 samples for the 2 classes
for i in range(s0+s1):
# label
lbl[i] = int(i<s0)
# inputs are random noise plus a shift
for j in range(m):
# positive/negative shift for 1st/2nd class, only for indices j larger than m/2
if j>=m/2:
if i<s0:
a = -scaling
else:
a = scaling
else:
a = 0.0
# the shift linearly depends on the feature index j
X[i,j] = a*j/m + np.random.randn()
return X, lbl
```
%% Cell type:code id:029ab1fa tags:
``` python
# generate inputs and labels
m = 50 # input dimensionality
s0=100 # number of samples for class 0
s1=100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.2)
```
%% Cell type:markdown id:0024c6aa tags:
## Parameterization of classifier
We then choose a classifier and a cross-validation scheme.
%% Cell type:code id:6f2a4e24 tags:
``` python
# (multinomial) logistic regression, involving regularization controlled by the hyperparameter C
clf_MLR = LogisticRegression(C=10000.0, penalty='l2', solver='lbfgs', max_iter=500)
# repetitions of cross-validation (train-test)
n_rep = 5
# stratified K-fold (similar 80%-20% train-test splitting as StratifiedShuffleSplit before)
cvs = StratifiedKFold(n_splits=n_rep)
```
%% Cell type:markdown id:0c8bb544 tags:
### Role of hyperparameter
What is the effect of the hyperparameter `C` that governs the regularization? (In scikit-learn's `LogisticRegression`, `C` is the inverse of the regularization strength: a smaller `C` means a stronger penalty on the weights.)
%% Cell type:code id:069c8442 tags:
``` python
# grid evaluation (log scale)
Cs = np.array([0.01,0.1,1.0,10.0,100.0])
n_C = Cs.size
acc = pd.DataFrame()
coefs = np.zeros([n_C,m])
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# loop over hyperparameter
for i_C, C in enumerate(Cs):
# set regularization
clf_MLR.set_params(C=C)
# train classifier
clf_MLR.fit(X[train_ind,:], y[train_ind])
d = {'type': ['train'],
'C': [C],
'score': [clf_MLR.score(X[train_ind,:], y[train_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)))
coefs[i_C,:] = clf_MLR.coef_ # store coefficients for this C (kept from the last split)
# test classifier
d = {'type': ['test'],
'C': [C],
'score': [clf_MLR.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)))
# plot
sb.violinplot(data=acc, x='C', y='score', hue='type', split=True,
density_norm='width', palette=['brown','orange'])
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.xlabel('reg hyperparam C')
plt.legend(loc='lower right')
plt.tight_layout()
plt.title('train-test accuracy')
plt.show()
```
%% Cell type:code id:a65b81a7 tags:
``` python
plt.figure()
for i_C in range(n_C):
plt.plot(range(m),coefs[i_C,:], label=Cs[i_C])
plt.legend(fontsize=9)
```
%% Cell type:code id:35706eb0-b2a6-4183-bb8b-dd8cf1ac304d tags:
``` python
plt.figure()
for i_C in range(n_C):
plt.plot(range(m),coefs[i_C,:]/max(coefs[i_C,:]), label=Cs[i_C])
plt.legend(fontsize=9)
```
%% Cell type:markdown id:a21b4c4c tags:
The hyperparameter `C` has opposing effects on the train and test accuracies. This indicates that the unregularized MLR classifier (with large `C`) is overfitting the data here.
How to select the best value for `C`?
%% Cell type:markdown id:bdb04e59 tags:
## Inner cross-validation loop within the training set
We can use cross-validation within the training set to test the effect of `C`. Of course this reduces the amount of data for training within the original training set.
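The principle can be sketched with `cross_val_score` applied within the training set only (a minimal sketch, assuming `X`, `y`, `Cs` and `clf_MLR` defined above; the held-out test set is never used during the selection of `C`):
``` python
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split

# hold out a test set, then score each candidate C by cross-validation inside the training set
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y)
val_acc = []
for C in Cs:
    clf_MLR.set_params(C=C)
    val_acc.append(cross_val_score(clf_MLR, X_tr, y_tr, cv=StratifiedKFold(n_splits=5)).mean())
best_C = Cs[np.argmax(val_acc)]
clf_MLR.set_params(C=best_C)
clf_MLR.fit(X_tr, y_tr)
print('best C:', best_C, '- test accuracy:', clf_MLR.score(X_te, y_te))
```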
%% Cell type:code id:e614b340 tags:
``` python
# nested cross-validation scheme
n_split_nest = 5
cv_nest = StratifiedKFold(n_splits=n_split_nest)
# indices of splits
ind_split_inner = np.zeros([n_rep*n_split_nest,s0+s1])
ind_split_outer = np.zeros([n_rep,s0+s1])
# outer loop
i_inner = 0
i_outer = 0
for tr_ind, test_ind in cvs.split(X, y):
# split data
X_tr, X_test = X[tr_ind,:], X[test_ind,:]
y_tr, y_test = y[tr_ind], y[test_ind]
# 2=train+val, 3=test
ind_split_outer[i_outer, tr_ind] = 2
ind_split_outer[i_outer, test_ind] = 3
# inner loop
for train_ind, val_ind in cv_nest.split(X_tr, y_tr):
# 0=train, 1=val, 3=test
ind_split_inner[i_inner, tr_ind[val_ind]] = 1
ind_split_inner[i_inner, test_ind] = 3
# increase counter
i_inner += 1
# increase counter
i_outer += 1
```
%% Cell type:markdown id:f0173728 tags:
Let's first look at the splits in the outer loop.
%% Cell type:code id:11469825 tags:
``` python
# plot split indices for each
plt.figure(figsize=[10,10])
plt.imshow(ind_split_outer, vmin=0, vmax=7, cmap='Set2', interpolation='nearest', aspect=40)
plt.xlabel('sample index')
plt.ylabel('split index')
cbar = plt.colorbar(ticks=[2,3], shrink=0.7)
cbar.ax.set_yticklabels(['train+val','test'])
plt.show()
```
%% Cell type:markdown id:a52fffac tags:
For each split, we subdivide the outer "training" set into the inner training and validation sets.
%% Cell type:code id:eb71ec5a tags:
``` python
# plot split indices
plt.figure(figsize=[10,10])
plt.imshow(ind_split_inner[:n_split_nest,:], vmax=7, cmap='Set2', interpolation='nearest', aspect=40)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.colorbar(ticks=[0,1,3], shrink=0.7)
plt.show()
```
%% Cell type:markdown id:8df51f10 tags:
Putting everything together, we have the 3 sets (train, validation and test).
%% Cell type:code id:5c81d8eb tags:
``` python
# plot split indices
plt.figure(figsize=[10,10])
plt.imshow(ind_split_inner, vmax=7, cmap='Set2', interpolation='nearest', aspect=8)
plt.xlabel('sample index')
plt.ylabel('split index')
plt.colorbar(ticks=[0,1,3], shrink=0.7)
plt.show()
```
%% Cell type:markdown id:3200c011 tags:
We can do the same with any outer loop, like the `StratifiedShuffleSplit` that we used before.
%% Cell type:code id:f2e30669 tags:
``` python
# change outer cross-validation scheme
n_rep = 20
cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
# indices of splits
ind_split = np.zeros([n_rep*n_split_nest,s0+s1])
# outer loop
i_rep = 0
for train_ind, test_ind in cvs.split(X, y):
# split data
X_train, X_test = X[train_ind,:], X[test_ind,:]
y_train, y_test = y[train_ind], y[test_ind]
# inner loop
for tr_ind, val_ind in cv_nest.split(X_train, y_train):
ind_split[i_rep, train_ind[val_ind]] = 1
ind_split[i_rep, test_ind] = 3
i_rep += 1
# plot split indices
plt.figure(figsize=[10,10])
plt.imshow(ind_split, vmax=7, cmap='Set2', interpolation='nearest', aspect=2)
plt.xlabel('sample index')
plt.ylabel('split index')
cbar = plt.colorbar(ticks=[0,1,3], shrink=0.7)
cbar.ax.set_yticklabels(['train','val','test'])
plt.show()
```
%% Cell type:markdown id:cc9b644c tags:
Now we can use this splitting scheme to test the effect of `C`, selecting the value that gives the best accuracy averaged over the validation sets.
%% Cell type:code id:53b90b92 tags:
``` python
# function to train hyperparameter with nested CV
def nested_CV(clf, X, y):
acc_nest = np.zeros([n_C,n_split_nest])
j = 0
for ind0, ind1 in cv_nest.split(X, y):
# split in train and validation sets
X0, X1 = X[ind0], X[ind1]
y0, y1 = y[ind0], y[ind1]
# train classifier over all reg parameters
for i, C in enumerate(Cs):
# set regularization
clf.set_params(C=C)
# train classifier on X0
clf.fit(X0, y0)
# test classifier on X1
acc_nest[i,j] = clf.score(X1,y1)
j += 1
# return the C value with the best mean validation accuracy
i_best = np.argmax(acc_nest.mean(1))
return Cs[i_best]
```
%% Cell type:code id:5a264a05 tags:
``` python
scalings = [0.1,0.2,0.3,0.4]
acc = pd.DataFrame()
for scaling in scalings:
# generate inputs
X, y = gen_inputs(m, s0, s1, scaling=scaling)
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# select best hyperparameter
best_C = nested_CV(clf_MLR, X[train_ind,:], y[train_ind])
clf_MLR.set_params(C=best_C)
# train and test classifier
clf_MLR.fit(X[train_ind,:], y[train_ind])
d = {'scaling': [scaling],
'type': ['train'],
'log C': [np.log10(best_C)],
'score': [clf_MLR.score(X[train_ind,:], y[train_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
d = {'scaling': [scaling],
'type': ['test'],
'log C': [np.log10(best_C)],
'score': [clf_MLR.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# shuffling
train_ind_rand = np.random.permutation(train_ind)
best_C = nested_CV(clf_MLR, X[train_ind,:], y[train_ind_rand])
clf_MLR.set_params(C=best_C)
clf_MLR.fit(X[train_ind,:], y[train_ind_rand])
d = {'scaling': [scaling],
'type': ['shuf'],
'log C': [np.log10(best_C)],
'score': [clf_MLR.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
```
%% Cell type:code id:020d231f tags:
``` python
# plots
# theoretical chance level
chance_level = 0.5
acc2 = acc[np.logical_or(acc['type']=='train',acc['type']=='test')]
plt.figure()
sb.violinplot(data=acc2, x='scaling', y='score', hue='type', split=True,
palette=['brown','orange'], density_norm='width')
plt.plot([-1,4], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.legend(loc='lower right')
plt.tight_layout()
plt.title('train-test accuracies')
acc2 = acc[np.logical_or(acc['type']=='shuf',acc['type']=='test')]
plt.figure()
sb.violinplot(data=acc2, x='scaling', y='score', hue='type', hue_order=('shuf','test'),
split=True, palette=['gray','orange'], density_norm='width')
plt.plot([-1,4], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.legend(loc='lower right')
plt.tight_layout()
plt.title('test-shuf accuracies')
plt.figure()
sb.violinplot(data=acc2, x='scaling', y='log C', hue='type',
palette=['orange','gray'], density_norm='width')
plt.legend(loc='lower right')
plt.tight_layout()
plt.title('best C (log$_{10}$ scale)')
plt.show()
```
%% Cell type:markdown id:146441cb tags:
We find that the best `C` selected by the nested cross-validation is $0.01$, in line with the comparison of training/testing accuracies at the beginning of the notebook (with the scaling $0.2$ that governs the contrast across the 2 classes).
Here the scaling affects the selected `C` with looser selection for smaller scaling (i.e. when the classification is difficult and the classifier does not perform well).
Note that the classifier trained with shuffled labels for the baseline accuracy does not depend on `C`.
%% Cell type:markdown id:e3f096d7 tags:
## Implementation in scikit-learn
Let's use `GridSearchCV`, which performs the grid search over hyperparameter values using an inner cross-validation.
%% Cell type:code id:07cdf8f2 tags:
``` python
# grid search for hyperparameter C
gscv = GridSearchCV(clf_MLR,
{'C': Cs},
cv=cv_nest)
acc = pd.DataFrame()
for scaling in scalings:
# generate inputs
X, y = gen_inputs(m, s0, s1, scaling=scaling)
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# Grid search to find best classifier
gscv.fit(X[train_ind,:], y[train_ind])
clf = gscv.best_estimator_
# Train and test best classifiers with subject labels
clf.fit(X[train_ind,:], y[train_ind])
d = {'scaling': [scaling],
'type': ['test'],
'log C': [np.log10(gscv.best_params_['C'])],
'score': [clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# shuffling
train_ind_rand = np.random.permutation(train_ind)
clf.fit(X[train_ind,:], y[train_ind_rand])
d = {'scaling': [scaling],
'type': ['shuf'],
'log C': [np.log10(gscv.best_params_['C'])],
'score': [clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
```
%% Cell type:code id:a92c13eb tags:
``` python
plt.figure()
sb.violinplot(data=acc, x='scaling', y='score', hue='type', hue_order=['shuf','test'], split=True,
palette=['gray','orange'], density_norm='width')
plt.plot([-1,4], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.legend(loc='lower right')
plt.tight_layout()
plt.title('test-shuf accuracies')
acc2 = acc[acc['type']=='test']
plt.figure()
sb.violinplot(data=acc2, x='scaling', y='log C', hue='type',
palette=['orange'], density_norm='width')
plt.legend(loc='upper right')
plt.tight_layout()
plt.title('best C (log$_{10}$ scale)')
plt.show()
```
......
%% Cell type:markdown id:2628a7ec tags:
# 3) Comparing classifiers
author: Mat Gilson, https://github.com/MatthieuGilson, https://etulab.univ-amu.fr/gilson.m/compneuro_course/
This notebook compares different classifiers for a synthetic dataset. It includes nested cross-validation to optimize hyperparameters.
See also the documentation of scikit-learn library (https://scikit-learn.org/)
%% Cell type:code id:ec9bb8b2 tags:
``` python
# import librairies
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
font = {'family' : 'DejaVu Sans',
'weight' : 'regular',
'size' : 18}
mpl.rc('font', **font)
```
%% Cell type:markdown id:5e78fcf9 tags:
Let's generate a dataset for classification, including labels.
%% Cell type:code id:aef4882d tags:
``` python
# create synthetic dataset where 2 classes of s0+s1 samples of m-dimensional inputs with controlled contrast
def gen_inputs(m, # input dimensionality
s0, # number of samples for class 0
s1, # number of samples for class 1
scaling): # scaling factor to separate classes
# labels
lbl = np.zeros([s0+s1], dtype=int)
# inputs
X = np.zeros([s0+s1,m])
# create s0 and s1 samples for the 2 classes
for i in range(s0+s1):
# label
lbl[i] = int(i<s0)
# inputs are random noise plus a shift
for j in range(m):
# positive/negative shift for 1st/2nd class, only for indices j larger than m/2
if j>=m/2:
if i<s0:
a = -scaling
else:
a = scaling
else:
a = 0.0
# the shift linearly depends on the feature index j
X[i,j] = a*j/m + np.random.randn()
return X, lbl
```
%% Cell type:code id:4d4bade5 tags:
``` python
# input properties
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
scaling = 0.7 # class contrast (try 1.0, 0.5, 0.1)
# generate inputs
X, y = gen_inputs(m, s0, s1, scaling)
```
%% Cell type:markdown id:45051d6f tags:
## Comparison of classifiers
We want to compare the accuracy for different classifiers:
- MLR is the multinomial logistic regression
- 1NN is the 1-nearest-neighbor
- RF is a random forest
- QDA is a quadratic discriminant analysis
%% Cell type:code id:5ffeb39f tags:
``` python
# classifiers
clf_MLR = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=500)
clf_1NN = KNeighborsClassifier(n_neighbors=1, algorithm='brute', metric='minkowski')
clf_RF = RandomForestClassifier(n_estimators=10)
clf_QDA = QuadraticDiscriminantAnalysis()
dict_clf = {'MLR': clf_MLR,
'1NN': clf_1NN,
'RF': clf_RF,
'QDA': clf_QDA}
# dictionary of hyperparameters and ranges to explore (grid search)
dict_params = {'C': [0.01,0.1,1.0,10.0,100.0], # C for regularization
'n_neighbors': [1,2,3,4,5], # k nearest neighbors
'criterion': ['gini', 'entropy'], # splitting quality eval
'reg_param': [0.01,0.1,1.0]} # regularization param
```
%% Cell type:markdown id:a5ed9e3c tags:
We use cross-validation with the evaluation of test accuracy, as well as nested cross-validation to optimize hyperparameters.
%% Cell type:code id:08581b70 tags:
``` python
# outer cross-validation scheme: 80% for training and 20% for testing
cvs = StratifiedShuffleSplit(n_splits=20, test_size=0.2)
```
%% Cell type:code id:7065006a tags:
``` python
# nested cross-validation scheme
n_split_nest = 5
cv_nest = StratifiedKFold(n_splits=n_split_nest)
```
%% Cell type:code id:cc99137f tags:
``` python
# to store result accuracies
acc = pd.DataFrame()
# loop over classifiers
for i, (k, param_name) in enumerate(zip(dict_clf,dict_params)):
# get classifier and related information
clf = dict_clf[k]
param_vals = dict_params[param_name]
# print(param_name, param_vals)
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# grid search for hyperparameter C
gscv = GridSearchCV(clf,
{param_name: param_vals},
cv=cv_nest)
gscv.fit(X[train_ind,:], y[train_ind])
best_clf = gscv.best_estimator_
print(gscv.best_params_)
# train and test best classifier with subject labels
best_clf.fit(X[train_ind,:], y[train_ind])
# train and test accuracies
d = {'type': ['train'],
'clf': [k],
'score': [best_clf.score(X[train_ind,:], y[train_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
d = {'type': ['test'],
'clf': [k],
'score': [best_clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# shuffling labels and fit again for evaluation of chance level
train_ind_rand = np.random.permutation(train_ind)
best_clf.fit(X[train_ind,:], y[train_ind_rand])
d = {'type': ['shuf'],
'clf': [k],
'score': [best_clf.score(X[test_ind,:], y[test_ind])]}
acc = pd.concat((acc, pd.DataFrame(data=d)), ignore_index=True)
# print table of results
print()
print(acc)
```
%% Cell type:code id:ebeab02a tags:
``` python
# plot: train versus test accuracy
acc2 = acc[np.logical_or(acc['type']=='train',acc['type']=='test')]
# theoretical chance level
chance_level = 0.5
sb.violinplot(data=acc2, x='clf', y='score', hue='type', split=True,
density_norm='width', palette=['brown','orange'])
plt.plot([-1,4], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.legend(loc='lower right', fontsize=11)
plt.tight_layout()
plt.show()
```
%% Cell type:code id:77102885 tags:
``` python
# test versus baseline accuracy
acc2 = acc[np.logical_or(acc['type']=='shuf',acc['type']=='test')]
sb.violinplot(data=acc2, x='clf', y='score', hue='type', hue_order=['shuf','test'],
split=True, density_norm='width', palette=['gray','orange'])
plt.plot([-1,4], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.xlabel('classifier')
plt.ylabel('accuracy')
plt.legend(loc='lower right', fontsize=11)
plt.tight_layout()
plt.show()
```
......
%% Cell type:markdown id:00cde4c7 tags:
author: Mat Gilson, https://etulab.univ-amu.fr/gilson.m/compneuro_course/
# 4) Recursive feature elimination (RFE)
This notebook shows the use of RFE to identify informative features for the classification
See also the documentation of scikit-learn library (https://scikit-learn.org/)
%% Cell type:code id:8c9c8432 tags:
``` python
# import librairies
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import RFE
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
font = {'family' : 'DejaVu Sans',
'weight' : 'regular',
'size' : 18}
mpl.rc('font', **font)
```
%% Cell type:markdown id:0d12a052 tags:
## Generate samples to classify
We generate synthetic data with 2 classes to separate (`s0` and `s1` samples, respectively). The input dimensionality corresponds to `m` features.
%% Cell type:code id:4673d31d tags:
``` python
# create synthetic dataset where 2 classes of s0+s1 samples of m-dimensional inputs with controlled contrast
def gen_inputs(m, # input dimensionality
s0, # number of samples for class 0
s1, # number of samples for class 1
scaling): # scaling factor to separate classes
# labels
lbl = np.zeros([s0+s1], dtype=int)
# inputs
X = np.zeros([s0+s1,m])
# create s0 and s1 samples for the 2 classes
for i in range(s0+s1):
# label
lbl[i] = int(i<s0)
# inputs are random noise plus a shift
for j in range(m):
# positive/negative shift for 1st/2nd class
if i<s0:
a = -scaling
else:
a = scaling
# the shift across classes linearly depends on the feature index j
X[i,j] = a*j/m + np.random.randn()
return X, lbl
```
%% Cell type:markdown id:9d0e394b-eb3e-4140-99e7-9d6123960711 tags:
## Toy example
How to interpret the trained weights?
%% Cell type:code id:029ab1fa tags:
``` python
# generate inputs and labels
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.7)
```
%% Cell type:code id:0a45c553-630d-4d95-94d5-6735e9fe0ee2 tags:
``` python
# create matrix rgb
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
mat_rgb = np.clip(mat_rgb,0.0,1.0)
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:code id:d712a2fb-86b6-46df-a869-bc05475e79d2 tags:
``` python
# train classifier
clf = LogisticRegression()
clf.fit(X,y)
# plot weights
plt.figure()
plt.plot(range(m), clf.coef_.flatten())
plt.xlabel('input index')
plt.ylabel('weight')
plt.show()
```
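Side note: the weights of a linear classifier are only directly comparable across inputs when the inputs share a common scale. A minimal sketch of a common workaround, standardizing the inputs before fitting (this is what the `StandardScaler` step does in the pipelines used below):
``` python
# z-score each input feature, then the fitted weights are comparable across features
X_std = StandardScaler().fit_transform(X)
clf_std = LogisticRegression().fit(X_std, y)
print(np.round(clf_std.coef_.flatten(), 2))
```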
%% Cell type:code id:bc00e090-8aad-4bf5-9e7a-29a92e7e50fe tags:
``` python
# copy X and rescale an input
X1 = np.copy(X)
i = 8
X1[:,i] *= 10
```
%% Cell type:code id:2777f72d-b304-4e12-ae53-5e0df531d85c tags:
``` python
# plot
plt.figure(figsize=[6,7])
plt.subplot(211)
plt.hist(X[:s0,i], histtype='step', bins=40, color='r')
plt.hist(X[s0:,i], histtype='step', bins=40, color='b')
plt.legend(['class 0', 'class 1'], fontsize=10)
plt.title('input {} before'.format(i), loc='left')
plt.subplot(212)
plt.hist(X1[:s0,i], histtype='step', bins=40, color='r')
plt.hist(X1[s0:,i], histtype='step', bins=40, color='b')
plt.legend(['class 0', 'class 1'], fontsize=10)
plt.title('input {} after'.format(i), loc='left')
plt.tight_layout()
```
%% Cell type:code id:56bb23cb-598a-4164-97f9-af9c1300eaf8 tags:
``` python
clf1 = LogisticRegression()
clf1.fit(X1,y)
plt.figure()
plt.plot(range(m), clf.coef_.flatten(), label='orig')
plt.plot(range(m), clf1.coef_.flatten()/np.std(X,axis=0), label='rescaled')
plt.xlabel('input index')
plt.ylabel('weight')
plt.legend(fontsize=10)
plt.show()
```
%% Cell type:markdown id:341418f1-2c2b-4e40-8711-1278e2af2948 tags:
## Using the RFE object
%% Cell type:code id:8d55d9cb-1f2a-4da6-b443-9713f75f6cf5 tags:
``` python
# generate inputs and labels
m = 10 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.7)
```
%% Cell type:markdown id:56e3cdf7-e3ca-43bb-89a1-e68dae9627dc tags:
Here the inputs are modified compared to the previous notebooks, with a gradient of contrast that increases with the feature index $j$.
%% Cell type:code id:1f5113c4-b6c5-462f-84f0-5ba38872d420 tags:
``` python
# create matrix rgb
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
mat_rgb = np.clip(mat_rgb,0.0,1.0)
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:code id:d355c4af-54a7-4668-8181-f69f9a9b466e tags:
``` python
# classifier in pipeline
clf = Pipeline([('scl',StandardScaler()),
('mlr',LogisticRegression(C=1.0))])
# number of repetitions
n_rep = 1 # change for 10
# outer cross-validation scheme
cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
```
%% Cell type:code id:4fe3c14c-06ae-4cea-91ff-50b56dbe2f07 tags:
``` python
# check names of parameters for pipeline (for grid search)
print(clf.get_params())
# quick fix to get coefficients from mlr estimator in pipeline
def get_coef(clf_pipeline):
return clf_pipeline['mlr'].coef_
```
%% Cell type:code id:876b966b-5222-452e-b643-58d7ab638f87 tags:
``` python
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# wrap classifier to be fitted to data to calculate the ranking
feature_select = RFE(clf, n_features_to_select=1, step=1,
importance_getter=get_coef)
# perform RFE
feature_select.fit(X[train_ind,:], y[train_ind])
print(feature_select.ranking_)
```
%% Cell type:markdown id:e044c2ed-1cf9-4337-ab91-41a6c239b7b0 tags:
Exercises:
- test the variability of the ranking across train-test splits
- change the granularity of the output ranking with `n_features_to_select` and `step` (see the sketch after this list)
- modify the scaling for the input generation to see how the ranking changes.
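For example, a minimal sketch for the second exercise (hypothetical parameter values): with `step=2`, two features are eliminated per iteration, and the elimination stops once `n_features_to_select=5` features remain, so the ranking becomes coarser.
``` python
# coarser RFE: eliminate 2 features per iteration, stop at 5 selected features
feature_select = RFE(clf, n_features_to_select=5, step=2, importance_getter=get_coef)
feature_select.fit(X, y)
print(feature_select.ranking_) # selected features have rank 1, eliminated ones get higher ranks
```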
%% Cell type:markdown id:c26064a9-17a9-44d6-9c66-b85676c94fc8 tags:
## Back to general case and combining all options for the classification
%% Cell type:code id:cc559465-2c51-4e62-9b40-d683b91f1eac tags:
``` python
# generate inputs and labels
m = 50 # input dimensionality
s0 = 100 # number of samples for class 0
s1 = 100 # number of samples for class 1
X, y = gen_inputs(m, s0, s1, scaling=0.5)
```
%% Cell type:markdown id:6febb8bf-a89c-4c1d-8383-feffd43f98c2 tags:
Here the inputs are modified compared to the previous notebooks, with a gradient of contrast that increases with the feature index $j$.
%% Cell type:code id:edb43b31-62f0-4d0f-98d8-ac3e3c062405 tags:
``` python
# create matrix rgb
vbins = np.linspace(-3,3,30) # bins for histograms
mat_rgb = np.zeros([m, vbins.size-1, 3])
for i in range(m):
mat_rgb[i,:,0] = np.histogram(X[:s0,i], bins=vbins)[0]
mat_rgb[i,:,2] = np.histogram(X[s0:,i], bins=vbins)[0]
mat_rgb /= mat_rgb.max() / 2.0
mat_rgb = np.clip(mat_rgb,0.0,1.0)
plt.figure(figsize=[6,7])
plt.imshow(mat_rgb)
plt.xlabel('values')
plt.ylabel('input index')
plt.show()
```
%% Cell type:markdown id:0024c6aa tags:
We then build a pipeline for the classifier. The outer cross-validation corresponds to the train-test splitting as before.
The inner cross-validation corresponds to the optimization of the hyperparameter `C` of the classifier (logistic regression in the pipeline).
%% Cell type:code id:6f2a4e24 tags:
``` python
# hyperparameter for regularization
Cs = [0.01,0.1,1.0,10.0,100.0]
# classifier in pipeline
clf = Pipeline([('scl',StandardScaler()),
('mlr',LogisticRegression())])
# number of repetitions and storage of results
n_rep = 10
# outer cross-validation scheme
cvs = StratifiedShuffleSplit(n_splits=n_rep, test_size=0.2)
# inner cross-validation scheme
cv_nest = StratifiedKFold(n_splits=3)
```
%% Cell type:code id:90de0aa7 tags:
``` python
# quick fix to get coefficients from mlr estimator in pipeline
def get_coef(clf_pipeline):
return clf_pipeline['mlr'].coef_
```
%% Cell type:markdown id:e3f096d7 tags:
We use `GridSearchCV` to optimize the hyperparameter, then use the best classifier pipeline on the test set and perform recursive feature elimination (RFE) to identify informative features that contribute to the correct classification. The latter gives a ranking where low ranks correspond to informative features.
%% Cell type:code id:07cdf8f2 tags:
``` python
# grid search for hyperparameter C
gscv = GridSearchCV(clf,
{'mlr__C': Cs},
cv=cv_nest)
df_rfe = pd.DataFrame()
# repeat classification
for train_ind, test_ind in cvs.split(X, y):
# optimize hyperparameter
gscv.fit(X[train_ind,:], y[train_ind])
clf_best = gscv.best_estimator_
best_C = gscv.best_params_['mlr__C']
# wrap classifier to be fitted to data to calculate the ranking
feature_select = RFE(clf_best, n_features_to_select=1, step=1,
importance_getter=get_coef)
# train and test classifier
clf_best.fit(X[train_ind,:], y[train_ind])
score = clf_best.score(X[test_ind,:], y[test_ind])
# perform RFE
feature_select.fit(X[train_ind,:], y[train_ind])
ranking = feature_select.ranking_
# store results
d = {'type': ['test'],
'log C': [np.log10(best_C)],
'score': [score],
'ranking': [ranking]}
df_rfe = pd.concat((df_rfe, pd.DataFrame(data=d)), ignore_index=True)
# shuffling
train_ind_rand = np.random.permutation(train_ind)
clf_best.fit(X[train_ind,:], y[train_ind_rand])
score = clf_best.score(X[test_ind,:], y[test_ind])
# perform RFE
feature_select.fit(X[train_ind,:], y[train_ind_rand])
ranking = feature_select.ranking_
# store results
d = {'type': ['shuf'],
'log C': [np.log10(best_C)],
'score': [score],
'ranking': [ranking]}
df_rfe = pd.concat((df_rfe, pd.DataFrame(data=d)), ignore_index=True)
```
%% Cell type:markdown id:2702e677 tags:
Plot the results for accuracy and best parameters.
%% Cell type:code id:03237341 tags:
``` python
chance_level = 0.5
plt.figure()
sb.violinplot(data=df_rfe, y='score', x='type', hue='type',
palette=['brown','orange'], density_norm='width')
plt.plot([-1,2], [chance_level]*2, '--k')
plt.yticks([0,1])
plt.ylabel('accuracy')
plt.tight_layout()
plt.title('train-test accuracies')
acc = df_rfe[df_rfe['type']=='test']
plt.figure()
sb.violinplot(data=acc, y='log C', x='type', hue='type',
palette=['orange'], density_norm='width')
plt.tight_layout()
plt.title('best C (log$_{10}$ scale)')
plt.show()
```
%% Cell type:markdown id:8ef0a315 tags:
Plot the ranking obtained from RFE.
Recall that the inputs are generated such that "the shift across classes linearly depends on the feature index". This means that inputs with large index should have low (informative) ranking.
%% Cell type:code id:9b67aa9b-97ee-46bd-aee6-50c1cbdf3a36 tags:
``` python
# reshape dataframe to extract ranking for each input index
rkg = pd.DataFrame()
for i in range(df_rfe.shape[0]):
r = df_rfe.iloc[i]
for j in range(m):
d_tmp = {'type': r['type'],
'input index': [j],
'ranking': r['ranking'][j]}
rkg = pd.concat((rkg, pd.DataFrame(d_tmp)), ignore_index=True)
plt.figure()
sb.lineplot(data=rkg, x='input index', y='ranking', hue='type')
plt.legend(fontsize=12)
plt.tight_layout()
plt.title('ranking of informative features')
plt.show()
```
%% Cell type:markdown id:9468ad24-29f5-4688-92cb-4821854051ef tags:
The ranking estimated from classification can be compared to the "ground truth", which is in theory the order of input indices. Note that the randomness in the generation of the inputs adds some noise that affects the precision of the estimated ranking.
The ranking for the shuffle case also indicates a level of randomness of the ranking, reflecting the number of inputs, the number of samples, etc. From this we can evaluate a baseline variability for the ranking (e.g. via the standard deviation).
%% Cell type:code id:6ee2f8f7-3f55-41bd-b991-fefed23fe20f tags:
``` python
sel_test = df_rfe['type']=='test'
print(np.argsort(df_rfe[sel_test]['ranking'].mean()))
```
......
name: sktime
channels:
- conda-forge
dependencies:
- jupyterlab
- sktime
- matplotlib