update load data

904a4227 · GILSON Matthieu · ef256cbc · 904a4227
Commit 904a4227 authored Feb 19, 2024 by GILSON Matthieu
--- a/data/data_fMRI_ARCHIsoc/load_data.ipynb
+++ b/data/data_fMRI_ARCHIsoc/load_data.ipynb
@@ -27,15 +27,15 @@
    "base_dir = './'\n",
    "\n",
    "# time series\n",
-    "X_ts = pd.read_hdf(base_dir+'df_ts.hdf')\n",
-    "n_sample, n_ts = X_ts.shape\n",
+    "df_ts = pd.read_hdf(base_dir+'df_ts_dataset0.hdf')\n",
+    "n_sample, N = df_ts.shape\n",
    "\n",
    "# labels\n",
-    "y = np.load(base_dir+'lbl_task.npy')\n",
+    "y_task = np.load(base_dir+'y_task_dataset0.npy')\n",
+    "y_sub = np.load(base_dir+'y_sub_dataset0.npy')\n",
    "\n",
    "# functional connectivity\n",
-    "FC = np.load(base_dir+'FC.npy')\n",
-    "_, N, _ = FC.shape"
+    "FC = np.load(base_dir+'FC_dataset0.npy')"
   ]
  },
  {
@@ -45,9 +45,17 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "print('shape of data frame for time series:', X_ts.shape)\n",
+    "print('shape of data frame for time series:', df_ts.shape)\n",
    "print('shape of numpy array for FC:', FC.shape)\n",
-    "print('shape of labels:', y.shape)"
+    "print('shape of labels:', y_task.shape, y_sub.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8d3fda37-8191-4bec-af12-76a1df6db301",
+   "metadata": {},
+   "source": [
+    "Be careful with the types of the elements of the dataframe: here, for time series, each element is a `pandas.Series`. We can plot the time series for a given sample and a few inputs."
   ]
  },
  {
@@ -59,7 +67,10 @@
   },
   "outputs": [],
   "source": [
-    "type(X['dim_0'][0])"
+    "print(type(df_ts))\n",
+    "print(df_ts.shape)\n",
+    "print(type(df_ts.iloc[0,0]))\n",
+    "print(df_ts.iloc[0,0].shape)"
   ]
  },
  {
@@ -73,7 +84,7 @@
   "source": [
    "plt.figure()\n",
    "for i in range(6):\n",
-    "    plt.plot(X_ts.iloc[0,i])\n",
+    "    plt.plot(df_ts.iloc[0,i])\n",
    "plt.xlabel('time')\n",
    "plt.ylabel('a.u.')\n",
    "plt.title('example trace')\n",
@@ -81,6 +92,55 @@
    "plt.show()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "f5188adb-adec-4f79-ac36-a8b4df2c0f48",
+   "metadata": {},
+   "source": [
+    "To calculate the mean of each time series, we can apply the `numpy.mean` function to every `Series` element of the dataframe."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b819b32-7847-4cb2-b98d-faf27988dd32",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_ts.map(np.mean)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7e9d49c-27af-4faa-8f8a-81486590a4d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_ind = [0,1,3,5]\n",
+    "df_ts.iloc[train_ind]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55367ad3-b7c6-4e17-8f8c-275b51b0ed66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = np.array(df_ts.map(np.mean))\n",
+    "print(X.shape)\n",
+    "print(X.dtype)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88f56f3a-7fab-4b8a-87b6-bb92b929ed4d",
+   "metadata": {},
+   "source": [
+    "The `FC` array contains the functional connectivity matrices. We can recompute them from the time series."
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -97,6 +157,31 @@
    "plt.show()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de2c9861-3296-4015-be79-d813042f4167",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get time series for sample 0 and all inputs\n",
+    "T = df_ts.iloc[0,0].size # number of time points\n",
+    "ts_tmp = np.zeros([T,N])\n",
+    "for i in range(N):\n",
+    "    ts_tmp[:,i] = df_ts.iloc[0,i]\n",
+    "\n",
+    "# calculate correlation across all pairs of regions\n",
+    "FC_tmp = np.corrcoef(ts_tmp, rowvar=False)\n",
+    "\n",
+    "plt.figure()\n",
+    "plt.imshow(FC_tmp)\n",
+    "plt.colorbar()\n",
+    "plt.xlabel('region index')\n",
+    "plt.ylabel('region index')\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -122,7 +207,7 @@
   "outputs": [],
   "source": [
    "# classes to predict (here tasks)\n",
-    "np.unique(y)"
+    "np.unique(y_task)"
   ]
  }
 ],

 %% Cell type:code id:349c8bc6-1a18-4fe2-869c-5581df684b1c tags:

 ``` python
 import numpy as np
 import pandas as pd

 import matplotlib.pyplot as plt
 ```

 %% Cell type:code id:47021efb-21dc-4ed2-8375-de1355344a16 tags:

 ``` python
 base_dir = './'

 # time series: dataframe with one row per sample and one column per input
-X_ts = pd.read_hdf(base_dir+'df_ts.hdf')
-n_sample, n_ts = X_ts.shape
+df_ts = pd.read_hdf(base_dir+'df_ts_dataset0.hdf')
+n_sample, N = df_ts.shape

 # labels: tasks (y_task) and, presumably, subject identities (y_sub) -- TODO confirm
-y = np.load(base_dir+'lbl_task.npy')
+y_task = np.load(base_dir+'y_task_dataset0.npy')
+y_sub = np.load(base_dir+'y_sub_dataset0.npy')

 # functional connectivity (indexed further down as FC[sample, region, region])
-FC = np.load(base_dir+'FC.npy')
-_, N, _ = FC.shape
+FC = np.load(base_dir+'FC_dataset0.npy')
 ```

 %% Cell type:code id:4dd6acc5-40c8-48e2-81c6-562d518b5859 tags:

 ``` python
-print('shape of data frame for time series:', X_ts.shape)
+print('shape of data frame for time series:', df_ts.shape)
 print('shape of numpy array for FC:', FC.shape)
-print('shape of labels:', y.shape)
+print('shape of labels:', y_task.shape, y_sub.shape)
 ```

+%% Cell type:markdown id:8d3fda37-8191-4bec-af12-76a1df6db301 tags:
+
+Be careful with the types of the elements of the dataframe: here, for time series, each element is a `pandas.Series`. We can plot the time series for a given sample and a few inputs.
+
 %% Cell type:code id:a6bc3926-de0a-4967-9b00-b66a5a64bd9f tags:

 ``` python
-type(X['dim_0'][0])
+print(type(df_ts))
+print(df_ts.shape)
+print(type(df_ts.iloc[0,0])) # type of a single element (row 0, column 0)
+print(df_ts.iloc[0,0].shape) # shape of that element
 ```

 %% Cell type:code id:ea059511-2df8-447e-a064-ab23a2d4b6f5 tags:

 ``` python
 plt.figure()
 for i in range(6): # first 6 columns (inputs) of row 0 (sample 0)
-    plt.plot(X_ts.iloc[0,i])
+    plt.plot(df_ts.iloc[0,i])
 plt.xlabel('time')
 plt.ylabel('a.u.')
 plt.title('example trace')

 plt.show()
 ```

+%% Cell type:markdown id:f5188adb-adec-4f79-ac36-a8b4df2c0f48 tags:
+
+To calculate the mean of each time series, we can apply the `numpy.mean` function to every `Series` element of the dataframe.
+
+%% Cell type:code id:8b819b32-7847-4cb2-b98d-faf27988dd32 tags:
+
+``` python
+df_ts.map(np.mean) # element-wise: mean of each time series; DataFrame.map needs pandas >= 2.1 (formerly applymap)
+```
+
+%% Cell type:code id:d7e9d49c-27af-4faa-8f8a-81486590a4d9 tags:
+
+``` python
+train_ind = [0,1,3,5] # example list of row positions
+df_ts.iloc[train_ind] # select the corresponding rows (samples) by position
+```
+
+%% Cell type:code id:55367ad3-b7c6-4e17-8f8c-275b51b0ed66 tags:
+
+``` python
+X = np.array(df_ts.map(np.mean)) # one mean value per (sample, input) pair
+print(X.shape)
+print(X.dtype)
+```
+
+%% Cell type:markdown id:88f56f3a-7fab-4b8a-87b6-bb92b929ed4d tags:
+
+The `FC` array contains the functional connectivity matrices. We can recompute them from the time series.
+
 %% Cell type:code id:5c2b55ce-f85a-4800-aeed-e202bc193688 tags:

 ``` python
 # show the functional connectivity matrix of the first sample (index 0)
 plt.figure()
 plt.imshow(FC[0,:,:])
 plt.colorbar()
 plt.xlabel('region index')
 plt.ylabel('region index')

 plt.show()
 ```

+%% Cell type:code id:de2c9861-3296-4015-be79-d813042f4167 tags:
+
+``` python
+# get the time series of sample 0 for all N inputs, one column per input
+T = df_ts.iloc[0,0].size # number of time points
+ts_tmp = np.zeros([T,N])
+for i in range(N):
+    ts_tmp[:,i] = df_ts.iloc[0,i] # column i = time series of input i (assumes every input has T time points)
+
+# calculate correlation across all pairs of regions (rowvar=False: each column is one variable)
+FC_tmp = np.corrcoef(ts_tmp, rowvar=False)
+
+plt.figure()
+plt.imshow(FC_tmp)
+plt.colorbar()
+plt.xlabel('region index')
+plt.ylabel('region index')
+
+plt.show()
+```
+
 %% Cell type:code id:37e06d59-48b7-4d8d-8de3-c83641da7896 tags:

 ``` python
 # format FC data for scikit learn (sample x feature)

 # boolean mask of the strictly lower triangle (k=-1 excludes the diagonal)
 ind_tri = np.tri(N,N,-1,dtype=bool)
 print(ind_tri.sum()) # number of selected matrix entries, i.e. features per sample

 # X array for scikit learn: keep only the masked entries of each matrix
 # NOTE(review): N is taken from df_ts.shape in the loading cell; this assumes FC has shape (n_sample, N, N) -- confirm
 X_FC = FC[:,ind_tri]
 print(X_FC.shape)
 ```

 %% Cell type:code id:b450da32-1cf9-4628-9422-8e76d2916fa7 tags:

 ``` python
 # classes to predict (here tasks)
-np.unique(y)
+np.unique(y_task)
 ```