commit

60993a46 · FERRAT Samy · d6b4f840 · 60993a46
Commit 60993a46 authored 7 months ago by FERRAT Samy
--- a/notebook.ipynb
+++ b/notebook.ipynb
@@ -15,7 +15,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -33,6 +33,7 @@
    "from src.figures.figures import plot_area_price_relationship, plot_correlation_matrix, plot_air_conditioning_presence, plot_bedrooms_distribution, plot_price_distribution\n",
    "from src.data_science.data import split_data\n",
    "from sklearn.metrics import r2_score\n",
+    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.preprocessing import StandardScaler\n",
@@ -43,7 +44,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
@@ -74,7 +75,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -83,7 +84,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
@@ -214,29 +215,17 @@
       "</div>"
      ],
      "text/plain": [
-       "        price    AreA  bedrooms  BATHROOMS  stories mainroad guestroom  \\\n",
-       "0   4543000.0  4990.0       4.0        2.0      2.0      yes       yes   \n",
-       "1   8080940.0  7000.0       3.0        2.0      4.0      yes        no   \n",
-       "2   8750000.0  4321.0       3.0        2.0      2.0      yes        no   \n",
-       "3   1890000.0  1700.0       3.0        1.0      2.0      yes        no   \n",
-       "4  12215000.0  7500.0       4.0        2.0      2.0      yes        no   \n",
-       "\n",
-       "  basement hotwaterheating air conditioning  parking prefarea  \\\n",
-       "0      yes              no               no      0.0      yes   \n",
-       "1       no              no              yes      2.0       no   \n",
-       "2      yes             yes               no      2.0       no   \n",
-       "3       no              no               no      0.0       no   \n",
-       "4      yes              no              yes      3.0      yes   \n",
-       "\n",
-       "  furnishing STATUS  houSeaGe  \n",
-       "0         furnished      15.0  \n",
-       "1         FURNISHED      11.0  \n",
-       "2         FURNISHED       NaN  \n",
-       "3       unfurnished       NaN  \n",
-       "4         furnished       NaN  "
-      ]
-     },
-     "execution_count": 11,
+       "        price    AreA  bedrooms  ...  prefarea  furnishing STATUS houSeaGe\n",
+       "0   4543000.0  4990.0       4.0  ...       yes          furnished     15.0\n",
+       "1   8080940.0  7000.0       3.0  ...        no          FURNISHED     11.0\n",
+       "2   8750000.0  4321.0       3.0  ...        no          FURNISHED      NaN\n",
+       "3   1890000.0  1700.0       3.0  ...        no        unfurnished      NaN\n",
+       "4  12215000.0  7500.0       4.0  ...       yes          furnished      NaN\n",
+       "\n",
+       "[5 rows x 14 columns]"
+      ]
+     },
+     "execution_count": 146,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -247,7 +236,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -256,14 +245,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "c:\\Users\\samys\\OneDrive\\Bureau\\last_test\\projet-python-m2-ds-2024\\src\\data_processing\\preprocessing.py:30: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
+      "c:\\Users\\samys\\OneDrive\\Bureau\\verylasttest\\projet-python-m2-ds-2024\\src\\data_processing\\preprocessing.py:30: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
      "  data=data.replace({'yes': 1, 'no': 0})\n"
     ]
    }
@@ -281,7 +270,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 149,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -297,7 +286,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
@@ -329,7 +318,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 151,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -346,7 +335,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
@@ -477,29 +466,17 @@
       "</div>"
      ],
      "text/plain": [
-       "      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \\\n",
-       "0   4543000  4990         4          2        2         1          1   \n",
-       "1   8080940  7000         3          2        4         1          0   \n",
-       "2   8750000  4321         3          2        2         1          0   \n",
-       "3   1890000  1700         3          1        2         1          0   \n",
-       "4  12215000  7500         4          2        2         1          0   \n",
-       "\n",
-       "   basement  hotwaterheating  air_conditioning  parking  prefarea  furnished  \\\n",
-       "0         1                0                 0        0         1          1   \n",
-       "1         0                0                 1        2         0          1   \n",
-       "2         1                1                 0        2         0          1   \n",
-       "3         0                0                 0        0         0          0   \n",
-       "4         1                0                 1        3         1          1   \n",
-       "\n",
-       "   semi-furnished  \n",
-       "0               0  \n",
-       "1               0  \n",
-       "2               0  \n",
-       "3               0  \n",
-       "4               0  "
-      ]
-     },
-     "execution_count": 17,
+       "      price  area  bedrooms  ...  prefarea  furnished  semi-furnished\n",
+       "0   4543000  4990         4  ...         1          1               0\n",
+       "1   8080940  7000         3  ...         0          1               0\n",
+       "2   8750000  4321         3  ...         0          1               0\n",
+       "3   1890000  1700         3  ...         0          0               0\n",
+       "4  12215000  7500         4  ...         1          1               0\n",
+       "\n",
+       "[5 rows x 14 columns]"
+      ]
+     },
+     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -517,7 +494,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
@@ -567,7 +544,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
@@ -618,7 +595,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -627,7 +604,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -656,7 +633,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 157,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -682,7 +659,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 158,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -700,16 +677,16 @@
    "    lin_reg_baseline.fit(X_train_subset.reshape(-1,1),y_train_subset)\n",
    "\n",
    "    #calcul de la prédiction\n",
-    "    y_pred = lin_reg_baseline.predict(X_test[:,1].reshape(-1,1))\n",
+    "    y_pred_bed = lin_reg_baseline.predict(X_test[:,1].reshape(-1,1))\n",
    "    \n",
-    "    # calcul de l'erreur R2\n",
-    "    error_mse = np.mean((y_test-y_pred)**2)\n",
+    "    # calcul du MSE\n",
+    "    error_mse = mean_squared_error(y_test,y_pred_bed)\n",
    "    list_error.append(error_mse)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 159,
   "metadata": {},
   "outputs": [
    {
@@ -749,22 +726,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 160,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "-0.18908813220600074\n",
-      "0.49828257362511363\n",
-      "0.6595360217499793\n",
-      "0.6738827057529093\n",
-      "0.6710848125168165\n",
-      "0.6714672798459871\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "list_error=[]\n",
    "# on réalise une boucle sur différentes tailles de l'ensemble d'entrainement\n",
@@ -780,15 +744,14 @@
    "    #calcul de la prédiction\n",
    "    y_pred=lin_reg.predict(X_test)\n",
    "    \n",
-    "    # calcul de l'erreur R2\n",
-    "    error_mse=np.mean((y_test-y_pred)**2)\n",
-    "    print(r2_score(y_test,y_pred))\n",
+    "    # calcul du MSE\n",
+    "    error_mse=mean_squared_error(y_test,y_pred)\n",
    "    list_error.append(error_mse)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
@@ -819,7 +782,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "L'ajout de données améliore la performance du modèle cependant on constate une très faible amélioration du score à partir de n=100, mais qui est très forte entre n=10 et n=100"
+    "L'ajout de données améliore la performance du modèle. Cependant, on constate une très faible amélioration du score à partir de n=100, alors qu'elle est très forte entre n=10 et n=100. Cet exemple met en évidence l'importance de la taille de l'ensemble d'entraînement pour obtenir de meilleurs résultats."
   ]
  },
  {
@@ -831,7 +794,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
@@ -863,10 +826,10 @@
    "        rf.fit(X_train, y_train)\n",
    "\n",
    "        # Prédictions\n",
-    "        y_pred = rf.predict(X_test)\n",
+    "        y_pred_forest = rf.predict(X_test)\n",
    "\n",
    "        # Calcul de l'erreur\n",
-    "        error_mse_forest = np.mean((y_test-y_pred)**2)\n",
+    "        error_mse_forest = mean_squared_error(y_test,y_pred_forest)\n",
    "\n",
    "        test_scores.append(error_mse_forest)\n",
    "\n",
@@ -899,13 +862,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
-       "<style>#sk-container-id-1 {\n",
+       "<style>#sk-container-id-5 {\n",
       "  /* Definition of color scheme common for light and dark mode */\n",
       "  --sklearn-color-text: black;\n",
       "  --sklearn-color-line: gray;\n",
@@ -935,15 +898,15 @@
       "  }\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 {\n",
+       "#sk-container-id-5 {\n",
       "  color: var(--sklearn-color-text);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 pre {\n",
+       "#sk-container-id-5 pre {\n",
       "  padding: 0;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 input.sk-hidden--visually {\n",
+       "#sk-container-id-5 input.sk-hidden--visually {\n",
       "  border: 0;\n",
       "  clip: rect(1px 1px 1px 1px);\n",
       "  clip: rect(1px, 1px, 1px, 1px);\n",
@@ -955,7 +918,7 @@
       "  width: 1px;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-dashed-wrapped {\n",
+       "#sk-container-id-5 div.sk-dashed-wrapped {\n",
       "  border: 1px dashed var(--sklearn-color-line);\n",
       "  margin: 0 0.4em 0.5em 0.4em;\n",
       "  box-sizing: border-box;\n",
@@ -963,7 +926,7 @@
       "  background-color: var(--sklearn-color-background);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-container {\n",
+       "#sk-container-id-5 div.sk-container {\n",
       "  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
       "     but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
       "     so we also need the `!important` here to be able to override the\n",
@@ -973,7 +936,7 @@
       "  position: relative;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-text-repr-fallback {\n",
+       "#sk-container-id-5 div.sk-text-repr-fallback {\n",
       "  display: none;\n",
       "}\n",
       "\n",
@@ -989,14 +952,14 @@
       "\n",
       "/* Parallel-specific style estimator block */\n",
       "\n",
-       "#sk-container-id-1 div.sk-parallel-item::after {\n",
+       "#sk-container-id-5 div.sk-parallel-item::after {\n",
       "  content: \"\";\n",
       "  width: 100%;\n",
       "  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
       "  flex-grow: 1;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-parallel {\n",
+       "#sk-container-id-5 div.sk-parallel {\n",
       "  display: flex;\n",
       "  align-items: stretch;\n",
       "  justify-content: center;\n",
@@ -1004,28 +967,28 @@
       "  position: relative;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-parallel-item {\n",
+       "#sk-container-id-5 div.sk-parallel-item {\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
+       "#sk-container-id-5 div.sk-parallel-item:first-child::after {\n",
       "  align-self: flex-end;\n",
       "  width: 50%;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
+       "#sk-container-id-5 div.sk-parallel-item:last-child::after {\n",
       "  align-self: flex-start;\n",
       "  width: 50%;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
+       "#sk-container-id-5 div.sk-parallel-item:only-child::after {\n",
       "  width: 0;\n",
       "}\n",
       "\n",
       "/* Serial-specific style estimator block */\n",
       "\n",
-       "#sk-container-id-1 div.sk-serial {\n",
+       "#sk-container-id-5 div.sk-serial {\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "  align-items: center;\n",
@@ -1043,14 +1006,14 @@
       "\n",
       "/* Pipeline and ColumnTransformer style (default) */\n",
       "\n",
-       "#sk-container-id-1 div.sk-toggleable {\n",
+       "#sk-container-id-5 div.sk-toggleable {\n",
       "  /* Default theme specific background. It is overwritten whether we have a\n",
       "  specific estimator or a Pipeline/ColumnTransformer */\n",
       "  background-color: var(--sklearn-color-background);\n",
       "}\n",
       "\n",
       "/* Toggleable label */\n",
-       "#sk-container-id-1 label.sk-toggleable__label {\n",
+       "#sk-container-id-5 label.sk-toggleable__label {\n",
       "  cursor: pointer;\n",
       "  display: block;\n",
       "  width: 100%;\n",
@@ -1060,7 +1023,7 @@
       "  text-align: center;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
+       "#sk-container-id-5 label.sk-toggleable__label-arrow:before {\n",
       "  /* Arrow on the left of the label */\n",
       "  content: \"▸\";\n",
       "  float: left;\n",
@@ -1068,13 +1031,13 @@
       "  color: var(--sklearn-color-icon);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
+       "#sk-container-id-5 label.sk-toggleable__label-arrow:hover:before {\n",
       "  color: var(--sklearn-color-text);\n",
       "}\n",
       "\n",
       "/* Toggleable content - dropdown */\n",
       "\n",
-       "#sk-container-id-1 div.sk-toggleable__content {\n",
+       "#sk-container-id-5 div.sk-toggleable__content {\n",
       "  max-height: 0;\n",
       "  max-width: 0;\n",
       "  overflow: hidden;\n",
@@ -1083,12 +1046,12 @@
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
+       "#sk-container-id-5 div.sk-toggleable__content.fitted {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-toggleable__content pre {\n",
+       "#sk-container-id-5 div.sk-toggleable__content pre {\n",
       "  margin: 0.2em;\n",
       "  border-radius: 0.25em;\n",
       "  color: var(--sklearn-color-text);\n",
@@ -1096,79 +1059,79 @@
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
+       "#sk-container-id-5 div.sk-toggleable__content.fitted pre {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
+       "#sk-container-id-5 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
       "  /* Expand drop-down */\n",
       "  max-height: 200px;\n",
       "  max-width: 100%;\n",
       "  overflow: auto;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
+       "#sk-container-id-5 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
       "  content: \"▾\";\n",
       "}\n",
       "\n",
       "/* Pipeline/ColumnTransformer-specific style */\n",
       "\n",
-       "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "#sk-container-id-5 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "#sk-container-id-5 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Estimator-specific style */\n",
       "\n",
       "/* Colorize estimator box */\n",
-       "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "#sk-container-id-5 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
+       "#sk-container-id-5 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
-       "#sk-container-id-1 div.sk-label label {\n",
+       "#sk-container-id-5 div.sk-label label.sk-toggleable__label,\n",
+       "#sk-container-id-5 div.sk-label label {\n",
       "  /* The background is the default theme color */\n",
       "  color: var(--sklearn-color-text-on-default-background);\n",
       "}\n",
       "\n",
       "/* On hover, darken the color of the background */\n",
-       "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
+       "#sk-container-id-5 div.sk-label:hover label.sk-toggleable__label {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "/* Label box, darken color on hover, fitted */\n",
-       "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
+       "#sk-container-id-5 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Estimator label */\n",
       "\n",
-       "#sk-container-id-1 div.sk-label label {\n",
+       "#sk-container-id-5 div.sk-label label {\n",
       "  font-family: monospace;\n",
       "  font-weight: bold;\n",
       "  display: inline-block;\n",
       "  line-height: 1.2em;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-label-container {\n",
+       "#sk-container-id-5 div.sk-label-container {\n",
       "  text-align: center;\n",
       "}\n",
       "\n",
       "/* Estimator-specific */\n",
-       "#sk-container-id-1 div.sk-estimator {\n",
+       "#sk-container-id-5 div.sk-estimator {\n",
       "  font-family: monospace;\n",
       "  border: 1px dotted var(--sklearn-color-border-box);\n",
       "  border-radius: 0.25em;\n",
@@ -1178,18 +1141,18 @@
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-estimator.fitted {\n",
+       "#sk-container-id-5 div.sk-estimator.fitted {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
       "/* on hover */\n",
-       "#sk-container-id-1 div.sk-estimator:hover {\n",
+       "#sk-container-id-5 div.sk-estimator:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
+       "#sk-container-id-5 div.sk-estimator.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
@@ -1276,7 +1239,7 @@
       "\n",
       "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
       "\n",
-       "#sk-container-id-1 a.estimator_doc_link {\n",
+       "#sk-container-id-5 a.estimator_doc_link {\n",
       "  float: right;\n",
       "  font-size: 1rem;\n",
       "  line-height: 1em;\n",
@@ -1291,31 +1254,31 @@
       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
+       "#sk-container-id-5 a.estimator_doc_link.fitted {\n",
       "  /* fitted */\n",
       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
       "  color: var(--sklearn-color-fitted-level-1);\n",
       "}\n",
       "\n",
       "/* On hover */\n",
-       "#sk-container-id-1 a.estimator_doc_link:hover {\n",
+       "#sk-container-id-5 a.estimator_doc_link:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
       "  color: var(--sklearn-color-background);\n",
       "  text-decoration: none;\n",
       "}\n",
       "\n",
-       "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
+       "#sk-container-id-5 a.estimator_doc_link.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-3);\n",
       "}\n",
-       "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestRegressor(max_depth=15, n_estimators=50, random_state=42)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;RandomForestRegressor<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestRegressor.html\">?<span>Documentation for RandomForestRegressor</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestRegressor(max_depth=15, n_estimators=50, random_state=42)</pre></div> </div></div></div></div>"
+       "</style><div id=\"sk-container-id-5\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestRegressor(max_depth=15, n_estimators=50, random_state=42)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" checked><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;RandomForestRegressor<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestRegressor.html\">?<span>Documentation for RandomForestRegressor</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestRegressor(max_depth=15, n_estimators=50, random_state=42)</pre></div> </div></div></div></div>"
      ],
      "text/plain": [
       "RandomForestRegressor(max_depth=15, n_estimators=50, random_state=42)"
      ]
     },
-     "execution_count": 28,
+     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1325,26 +1288,6 @@
    "rf.fit(X_train,y_train)"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0.8074574314651273"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "r2_score(y_test,rf.predict(X_test))"
-   ]
-  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -1361,7 +1304,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
@@ -1397,7 +1340,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 165,
   "metadata": {},
   "outputs": [
    {
@@ -1435,7 +1378,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
@@ -1470,7 +1413,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 167,
   "metadata": {},
   "outputs": [
    {
@@ -1517,7 +1460,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 168,
   "metadata": {},
   "outputs": [
    {
@@ -1525,15 +1468,7 @@
     "output_type": "stream",
     "text": [
      "Meilleurs hyperparamètres (RandomizedSearchCV) : {'n_estimators': 20, 'max_depth': 50}\n",
-      "Score R2 sur l'ensemble de test : 0.17109802066519297\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\samys\\anaconda\\envs\\jpp\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
-      "  _data = np.array(data, dtype=dtype, copy=copy,\n"
+      "Score MSE sur l'ensemble de test : 0.17109802066519297\n"
     ]
    }
   ],
@@ -1565,16 +1500,16 @@
    "\n",
    "y_pred_random = best_model_random.predict(X_test)\n",
    "\n",
-    "# Calcul du R2 sur l'ensemble de test\n",
-    "r2_test_random = np.mean((y_test - y_pred_random)**2)\n",
+    "# Calcul de la MSE sur l'ensemble de test\n",
+    "mse_test_random = mean_squared_error(y_test,y_pred_random)\n",
    "\n",
    "print(\"Meilleurs hyperparamètres (RandomizedSearchCV) :\", best_params_random)\n",
-    "print(\"Score R2 sur l'ensemble de test :\", r2_test_random)"
+    "print(\"Score MSE sur l'ensemble de test :\", mse_test_random)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 169,
   "metadata": {},
   "outputs": [
    {
@@ -1582,15 +1517,7 @@
     "output_type": "stream",
     "text": [
      "Meilleurs hyperparamètres (GridSearchCV) : {'max_depth': 22, 'n_estimators': 20}\n",
-      "Score R2 sur l'ensemble de test : 0.17383407975848472\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\samys\\anaconda\\envs\\jpp\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
-      "  _data = np.array(data, dtype=dtype, copy=copy,\n"
+      "Score MSE sur l'ensemble de test : 0.17383407975848472\n"
     ]
    }
   ],
@@ -1621,18 +1548,18 @@
    "\n",
    "y_pred_grid = best_model_grid.predict(X_test)\n",
    "\n",
-    "# Calcul du R2 sur l'ensemble de test\n",
-    "r2_test_grid = np.mean((y_test - y_pred_grid)**2)\n",
+    "# Calcul de la MSE sur l'ensemble de test\n",
+    "mse_test_grid = mean_squared_error(y_test,y_pred_grid)\n",
    "\n",
    "print(\"Meilleurs hyperparamètres (GridSearchCV) :\", best_params_grid)\n",
-    "print(\"Score R2 sur l'ensemble de test :\", r2_test_grid)"
+    "print(\"Score MSE sur l'ensemble de test :\", mse_test_grid)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "On constate qu'on a pas les memes meilleurs hyperparamètres  entre GridSearchCV et RandomSearchCV, et on obtient une meilleure MSE avec GridSearchCV sur l'ensemble de test ce qui est prévisible car GridSearchCV explore toutes les combinaisons d'hyperparamètres possibles contrairement à RandomSearchCV."
+    "On constate qu'on a pas les memes hyperparamètres optimaux entre GridSearchCV et RandomSearchCV ce qui est normal, et on obtient une meilleure MSE avec RandomSearchCV sur l'ensemble de test, ce qui est possible car les meilleurs hyperparamètres sur l'ensemble d'entrainement ne sont pas forcément meilleurs sur l'ensemble de test"
   ]
  }
 ],

 %% Cell type:markdown id: tags:
  
 <center>
    <h1 style="color:green">
        <b>
            <u> PROJECT </u>
        </b>
    </h1>
 </center>
  
 %% Cell type:code id: tags:
  
 ``` python
 import pandas as pd
 import missingno as msno
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 from src.data_processing.load_data import load_data
 from src.data_processing.preprocessing import rename_data
 from src.data_processing.preprocessing import cat_to_quant
 from src.data_processing.preprocessing import delete_feature
 from src.data_processing.preprocessing import encode_and_bind
 from src.data_processing.preprocessing import impute_mean
 from src.figures.figures import plot_area_price_relationship, plot_correlation_matrix, plot_air_conditioning_presence, plot_bedrooms_distribution, plot_price_distribution
 from src.data_science.data import split_data
 from sklearn.metrics import r2_score
+from sklearn.metrics import mean_squared_error
 from sklearn.linear_model import LinearRegression
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import RandomizedSearchCV
 from sklearn.model_selection import GridSearchCV
 from config.config import TEST_RATIO, SEED, SIZE_TRAIN
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 print(TEST_RATIO, SEED)
 ```
  
 %% Output
  
    0.2 5
  
 %% Cell type:markdown id: tags:
  
 # EDA
  
 %% Cell type:markdown id: tags:
  
 ## Chargement et prétraitement des données
  
 %% Cell type:code id: tags:
  
 ``` python
 df = load_data('data/raw/house_prices.csv')
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 df.head()
 ```
  
 %% Output
  
-            price    AreA  bedrooms  BATHROOMS  stories mainroad guestroom  \
-    0   4543000.0  4990.0       4.0        2.0      2.0      yes       yes
-    1   8080940.0  7000.0       3.0        2.0      4.0      yes        no
-    2   8750000.0  4321.0       3.0        2.0      2.0      yes        no
-    3   1890000.0  1700.0       3.0        1.0      2.0      yes        no
-    4  12215000.0  7500.0       4.0        2.0      2.0      yes        no
+            price    AreA  bedrooms  ...  prefarea  furnishing STATUS houSeaGe
+    0   4543000.0  4990.0       4.0  ...       yes          furnished     15.0
+    1   8080940.0  7000.0       3.0  ...        no          FURNISHED     11.0
+    2   8750000.0  4321.0       3.0  ...        no          FURNISHED      NaN
+    3   1890000.0  1700.0       3.0  ...        no        unfurnished      NaN
+    4  12215000.0  7500.0       4.0  ...       yes          furnished      NaN
    
-      basement hotwaterheating air conditioning  parking prefarea  \
-    0      yes              no               no      0.0      yes
-    1       no              no              yes      2.0       no
-    2      yes             yes               no      2.0       no
-    3       no              no               no      0.0       no
-    4      yes              no              yes      3.0      yes
-    
-      furnishing STATUS  houSeaGe
-    0         furnished      15.0
-    1         FURNISHED      11.0
-    2         FURNISHED       NaN
-    3       unfurnished       NaN
-    4         furnished       NaN
+    [5 rows x 14 columns]
  
 %% Cell type:code id: tags:
  
 ``` python
 df = rename_data(df) # On renomme correctement les noms de variable
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 df = cat_to_quant(df) # On transforme les données catégorielles en données quantitatives
 ```
  
 %% Output
  
-    c:\Users\samys\OneDrive\Bureau\last_test\projet-python-m2-ds-2024\src\data_processing\preprocessing.py:30: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
+    c:\Users\samys\OneDrive\Bureau\verylasttest\projet-python-m2-ds-2024\src\data_processing\preprocessing.py:30: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
      data=data.replace({'yes': 1, 'no': 0})
  
 %% Cell type:markdown id: tags:
  
 On transforme la variable 'furnishing_status' en variable numérique
  
 %% Cell type:code id: tags:
  
 ``` python
 df=encode_and_bind(df,'furnishing_status')
 ```
  
 %% Cell type:markdown id: tags:
  
 Visualisons les données manquantes dans le jeu de données
  
 %% Cell type:code id: tags:
  
 ``` python
 # Boolean mask: True wherever df holds a missing value.
 missing = df.isna()
 
 # Render the mask as a heatmap, without colour bar or row labels.
 plt.figure(figsize=(6, 4))
 sns.heatmap(
    missing,
    cbar=False,
    yticklabels=False,
 )
 plt.title('Visualisation de données manquantes')
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 On décide de supprimer la colonne 'houseage' car elle contient presque exclusivement des données manquantes, et d'imputer les valeurs manquantes des autres variables par leurs moyennes respectives.
  
 %% Cell type:code id: tags:
  
 ``` python
 df = delete_feature(df,'houseage') #Suppresion de la variable 'housage'
 df = impute_mean(df)
 ```
  
 %% Cell type:markdown id: tags:
  
 Voici un aperçu de notre jeu de données après le prétraitement :
  
 %% Cell type:code id: tags:
  
 ``` python
 df.head()
 ```
  
 %% Output
  
-          price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
-    0   4543000  4990         4          2        2         1          1
-    1   8080940  7000         3          2        4         1          0
-    2   8750000  4321         3          2        2         1          0
-    3   1890000  1700         3          1        2         1          0
-    4  12215000  7500         4          2        2         1          0
-    
-       basement  hotwaterheating  air_conditioning  parking  prefarea  furnished  \
-    0         1                0                 0        0         1          1
-    1         0                0                 1        2         0          1
-    2         1                1                 0        2         0          1
-    3         0                0                 0        0         0          0
-    4         1                0                 1        3         1          1
+          price  area  bedrooms  ...  prefarea  furnished  semi-furnished
+    0   4543000  4990         4  ...         1          1               0
+    1   8080940  7000         3  ...         0          1               0
+    2   8750000  4321         3  ...         0          1               0
+    3   1890000  1700         3  ...         0          0               0
+    4  12215000  7500         4  ...         1          1               0
    
-       semi-furnished
-    0               0
-    1               0
-    2               0
-    3               0
-    4               0
+    [5 rows x 14 columns]
  
 %% Cell type:markdown id: tags:
  
 ## Exploration des données
  
 %% Cell type:code id: tags:
  
 ``` python
 # histogramme de la distribution des prix de vente
 plot_price_distribution(df)
  
  
 #diagramme de presence ou non de la climatisation
 plot_air_conditioning_presence(df)
  
  
 # Diagramme en barres pour une variable catégorielle (Nombre de chambres)
 plot_bedrooms_distribution(df)
  
 ```
  
 %% Output
  

  

  

  
 %% Cell type:code id: tags:
  
 ``` python
 # Scatter plot pour les relations entre variables numériques (ex. SalePrice et LotArea)
 plot_area_price_relationship(df)
  
  
  
 # Heatmap pour visualiser la corrélation entre les variables numériques
 plot_correlation_matrix(df)
  
 ```
  
 %% Output
  

  

  
 %% Cell type:markdown id: tags:
  
 On remarque que la plus forte corrélation est entre les variables 'price' et 'bathrooms', ce qui était attendu étant donné qu'avoir plusieurs salles de bain est réservé aux maisons assez luxueuses. La seconde plus forte corrélation est entre 'price' et 'area', ce qui est aussi prévisible.
  
 %% Cell type:markdown id: tags:
  
 # Validation croisée
  
 %% Cell type:code id: tags:
  
 ``` python
 X_train, y_train, X_test, y_test = split_data(df,test_ratio=TEST_RATIO,random_seed=SEED,target_column='price')
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 # Standardise the features and the target with two separate scalers.
 # Each scaler is fitted on the training split only and then reused on the
 # test split, so no test-set statistics enter the fit.
 normalization = StandardScaler()
 normalization_y = StandardScaler()
 
 # Turn the targets into single-column 2-D arrays before scaling.
 y_train = np.array(y_train).reshape(-1, 1)
 y_test = np.array(y_test).reshape(-1, 1)
 
 # Features: fit on train, apply the same transform to test.
 X_train = normalization.fit_transform(X_train)
 X_test = normalization.transform(X_test)
 
 # Target: same procedure with its own scaler.
 y_train = normalization_y.fit_transform(y_train)
 y_test = normalization_y.transform(y_test)
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 #retransformation de y_train et y_test en 1D
  
 y_train = y_train.ravel()
 y_test = y_test.ravel()
 ```
  
 %% Cell type:markdown id: tags:
  
 # Machine Learning
  
 %% Cell type:markdown id: tags:
  
 ## Entrainement sur la variable bedrooms
  
 %% Cell type:code id: tags:
  
 ``` python
 list_error = []
 size_train = SIZE_TRAIN  #différentes tailles d'évaluation du modèle
  
 # On réalise une boucle sur différentes tailles de l'ensemble d'entraînement
 for n in size_train:
    # Sélectionner l'ensemble d'entraînement et de test correspondant
    X_train_subset = X_train[:,1][:n]
    y_train_subset = y_train[:n]
  
    # entrainement du modele
    lin_reg_baseline = LinearRegression()
    lin_reg_baseline.fit(X_train_subset.reshape(-1,1),y_train_subset)
  
    #calcul de la prédiction
-    y_pred = lin_reg_baseline.predict(X_test[:,1].reshape(-1,1))
+    y_pred_bed = lin_reg_baseline.predict(X_test[:,1].reshape(-1,1))
  
-    # calcul de l'erreur R2
-    error_mse = np.mean((y_test-y_pred)**2)
+    # calcul du MSE
+    error_mse = mean_squared_error(y_test,y_pred_bed)
    list_error.append(error_mse)
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 # Learning curve: test MSE for each training-set size.
 plt.plot(size_train, list_error, marker='o')
 plt.gca().set(
    xlabel="Taille de l'ensemble d'entraînement",
    ylabel="Erreur (MSE)",
    title="Erreur en fonction de la taille de l'ensemble d'entraînement",
 )
 
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 On observe que l'erreur est minimale pour n=648 (l'intégralité de l'ensemble d'entraînement). On remarque aussi que la MSE est plus grande pour n=250 et n=450 que pour n=50 et n=100, ce qui montre qu'un ensemble d'entraînement plus grand n'implique pas systématiquement de meilleures performances, même si cela pourrait ici être lié à la simplicité du modèle.
  
 %% Cell type:markdown id: tags:
  
 ## Entrainement d'une regression linéaire sur toutes les données
  
 %% Cell type:code id: tags:
  
 ``` python
 list_error=[]
 # on réalise une boucle sur différentes tailles de l'ensemble d'entrainement
 for n in size_train:
  
    X_train_subset = X_train[:n]
    y_train_subset = y_train[:n]
  
    # entrainement du modele
    lin_reg = LinearRegression()
    lin_reg.fit(X_train_subset,y_train_subset)
  
    #calcul de la prédiction
    y_pred=lin_reg.predict(X_test)
  
-    # calcul de l'erreur R2
-    error_mse=np.mean((y_test-y_pred)**2)
-    print(r2_score(y_test,y_pred))
+    # calcul du MSE
+    error_mse=mean_squared_error(y_test,y_pred)
    list_error.append(error_mse)
 ```
  
-%% Output
-
-    -0.18908813220600074
-    0.49828257362511363
-    0.6595360217499793
-    0.6738827057529093
-    0.6710848125168165
-    0.6714672798459871
-
 %% Cell type:code id: tags:
  
 ``` python
 # Learning curve: test error for each training-set size.
 plt.plot(size_train, list_error, marker='o')
 plt.gca().set(
    xlabel="Taille de l'ensemble d'entraînement",
    ylabel="Erreur",
    title="Erreur en fonction de la taille de l'ensemble d'entraînement",
 )
 
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
-L'ajout de données améliore la performance du modèle cependant on constate une très faible amélioration du score à partir de n=100, mais qui est très forte entre n=10 et n=100
+L'ajout de données améliore la performance du modèle cependant on constate une très faible amélioration du score à partir de n=100, mais qui est très forte entre n=10 et n=100, cet exemple met en évidence l'importance de la taille de l'ensemble d'entrainement pour avoir de meilleurs résultats.
  
 %% Cell type:markdown id: tags:
  
 ## Forets aléatoires
  
 %% Cell type:code id: tags:
  
 ``` python
 n_estimators_list = [5,10,20,50]
 max_depth_list = [5, 15 ,22,30, 40,50,60]
  
 # Stocker les résultats
 results = {}
  
 for n_estimators in n_estimators_list:
    test_scores = []
  
    for max_depth in max_depth_list:
        # Définir le modèle Random Forest
        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
  
        # Entraînement du modèle
        rf.fit(X_train, y_train)
  
        # Prédictions
-        y_pred = rf.predict(X_test)
+        y_pred_forest = rf.predict(X_test)
  
        # Calcul de l'erreur
-        error_mse_forest = np.mean((y_test-y_pred)**2)
+        error_mse_forest = mean_squared_error(y_test,y_pred_forest)
  
        test_scores.append(error_mse_forest)
  
    # Sauvegarder les résultats pour chaque n_estimators
    results[n_estimators] = {
        "max_depth": max_depth_list,
        "test_scores": test_scores
    }
  
 # Plot test MSE against max_depth, one subplot per n_estimators value.
 # The grid is two columns wide with as many rows as needed, so it adapts
 # to the length of n_estimators_list instead of assuming a fixed 3x2 layout.
 n_cols = 2
 n_rows = (len(n_estimators_list) + n_cols - 1) // n_cols
 
 plt.figure(figsize=(15, 10))
 for i, n_estimators in enumerate(n_estimators_list):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.plot(results[n_estimators]["max_depth"], results[n_estimators]["test_scores"], label="Test")
    plt.title(f"n_estimators = {n_estimators}")
    plt.xlabel("max_depth")
    plt.ylabel("MSE")
    plt.legend()
 
 # Adjust spacing once, after every subplot has been drawn.
 plt.tight_layout()
 
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 On voit que la meilleure MSE est obtenue pour 50 estimateurs et une profondeur maximale de 15 (au-delà, l'erreur stagne). On constate aussi que cette MSE est meilleure que celle de la régression linéaire.
  
 %% Cell type:code id: tags:
  
 ``` python
 rf = RandomForestRegressor(n_estimators=50, max_depth=15, random_state=42)
 rf.fit(X_train,y_train)
 ```
  
 %% Output
  
    RandomForestRegressor(max_depth=15, n_estimators=50, random_state=42)
  
-%% Cell type:code id: tags:
-
-``` python
-r2_score(y_test,rf.predict(X_test))
-```
-
-%% Output
-
-    0.8074574314651273
-
 %% Cell type:markdown id: tags:
  
 ## Features Importances
  
 %% Cell type:markdown id: tags:
  
 ### Régression linéaire
  
 %% Cell type:code id: tags:
  
 ``` python
 # Coefficients of the fitted linear regression.
 coefficients = lin_reg.coef_
 
 # Pair each coefficient with a column name; the slice skips the first
 # column of df ('price' in the preview printed earlier in the notebook).
 coef = pd.DataFrame({
    'Coefficient': coefficients,
    'Columns': df.iloc[:, 1:].columns,
 })
 print(coef)
 ```
  
 %% Output
  
        Coefficient           Columns
    0      0.251262              area
    1      0.063651          bedrooms
    2      0.306408         bathrooms
    3      0.152747           stories
    4      0.085228          mainroad
    5      0.063935         guestroom
    6      0.063827          basement
    7      0.125000   hotwaterheating
    8      0.224648  air_conditioning
    9      0.137781           parking
    10     0.134494          prefarea
    11     0.068656         furnished
    12     0.079305    semi-furnished
  
 %% Cell type:code id: tags:
  
 ``` python
 # Horizontal bar chart of the linear-regression coefficients,
 # with one bar per column of df after the first.
 bar_positions = range(len(coefficients))
 feature_names = df.iloc[:, 1:].columns
 
 plt.barh(bar_positions, coefficients)
 plt.yticks(bar_positions, feature_names)
 plt.xlabel('Importance de la caractéristique')
 plt.ylabel('Caractéristiques')
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 Étant donné que le jeu de données a été centré-réduit, nous pouvons directement analyser l'importance de chaque feature par son coefficient associé. On constate que les coefficients les plus forts sont associés d'abord à la variable 'bathrooms' (coefficient de 0.306), puis à la variable 'area' (coefficient de 0.251). Ces résultats concordent avec notre visualisation des corrélations entre variables.
  
 %% Cell type:markdown id: tags:
  
 ### Forets aléatoires
  
 %% Cell type:code id: tags:
  
 ``` python
 # Feature importances reported by the fitted random forest.
 coefficients_rf = rf.feature_importances_
 
 # Pair each importance with a column name; the slice skips the first
 # column of df ('price' in the preview printed earlier in the notebook).
 coef_rf = pd.DataFrame({
    'Coefficient': coefficients_rf,
    'Columns': df.iloc[:, 1:].columns,
 })
 print(coef_rf)
 ```
  
 %% Output
  
        Coefficient           Columns
    0      0.364237              area
    1      0.025185          bedrooms
    2      0.304803         bathrooms
    3      0.049687           stories
    4      0.009976          mainroad
    5      0.020051         guestroom
    6      0.021538          basement
    7      0.028398   hotwaterheating
    8      0.048936  air_conditioning
    9      0.054310           parking
    10     0.031695          prefarea
    11     0.020924         furnished
    12     0.020259    semi-furnished
  
 %% Cell type:code id: tags:
  
 ``` python
 # Horizontal bar chart of the random-forest feature importances,
 # with one bar per column of df after the first.
 bar_positions = range(len(coefficients_rf))
 feature_names = df.iloc[:, 1:].columns
 
 plt.barh(bar_positions, coefficients_rf)
 plt.yticks(bar_positions, feature_names)
 plt.xlabel('Importance de la caractéristique')
 plt.ylabel('Caractéristiques')
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 Ici, encore plus que pour la régression linéaire, les deux variables les plus importantes du modèle sont 'area' et 'bathrooms'.
  
 %% Cell type:markdown id: tags:
  
 ## Bonus: RandomizedSearchCV and GridSearchCV
  
 %% Cell type:markdown id: tags:
  
 GridSearchCV effectue une validation croisée sur toutes les combinaisons d'hyperparamètres qu'on lui fournit, tandis que RandomizedSearchCV effectue une validation croisée sur un nombre de combinaisons tirées aléatoirement, fixé par le paramètre "n_iter".
 ##### Avantages et inconvénients :
 - En testant toutes les combinaisons, GridSearchCV fournit la meilleure combinaison de la grille (au sens du score de validation croisée), mais est plus coûteuse en calcul que RandomizedSearchCV. Inversement, RandomizedSearchCV ne trouve pas forcément la combinaison optimale, mais est moins coûteuse ; en pratique, pour des modèles complexes et de gros jeux de données, RandomizedSearchCV est plus utilisée.
  
 %% Cell type:code id: tags:
  
 ``` python
  
 # Hyperparamètres à explorer
 param_distributions = {
    'n_estimators': n_estimators_list,
    'max_depth': max_depth_list
 }
  
 rf = RandomForestRegressor(random_state=42)
  
  
 random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=8,  # Nombre de combinaisons à tester
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42
 )
  
 random_search.fit(X_train, y_train)
  
  
 best_params_random = random_search.best_params_
 best_model_random = random_search.best_estimator_
  
 y_pred_random = best_model_random.predict(X_test)
  
-# Calcul du R2 sur l'ensemble de test
-r2_test_random = np.mean((y_test - y_pred_random)**2)
+# Calcul de la MSE sur l'ensemble de test
+mse_test_random = mean_squared_error(y_test,y_pred_random)
  
 print("Meilleurs hyperparamètres (RandomizedSearchCV) :", best_params_random)
-print("Score R2 sur l'ensemble de test :", r2_test_random)
+print("Score MSE sur l'ensemble de test :", mse_test_random)
 ```
  
 %% Output
  
    Meilleurs hyperparamètres (RandomizedSearchCV) : {'n_estimators': 20, 'max_depth': 50}
-    Score R2 sur l'ensemble de test : 0.17109802066519297
-
-    c:\Users\samys\anaconda\envs\jpp\Lib\site-packages\numpy\ma\core.py:2881: RuntimeWarning: invalid value encountered in cast
-      _data = np.array(data, dtype=dtype, copy=copy,
+    Score MSE sur l'ensemble de test : 0.17109802066519297
  
 %% Cell type:code id: tags:
  
 ``` python
 from sklearn.model_selection import GridSearchCV
  
 # Hyperparameter grid: every (n_estimators, max_depth) pair is evaluated.
 param_grid = {
    'n_estimators': n_estimators_list,
    'max_depth': max_depth_list
 }
 
 # Base estimator. The seed is fixed, as in the RandomizedSearchCV cell, so
 # that the selected hyperparameters and the reported test MSE are
 # reproducible from one run to the next.
 rf = RandomForestRegressor(random_state=42)
 
 # Exhaustive search over the grid with 5-fold cross-validation,
 # scored by negative mean squared error.
 grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5
 )
  
 grid_search.fit(X_train, y_train)
  
  
 best_params_grid = grid_search.best_params_
 best_model_grid = grid_search.best_estimator_
  
 y_pred_grid = best_model_grid.predict(X_test)
  
-# Calcul du R2 sur l'ensemble de test
-r2_test_grid = np.mean((y_test - y_pred_grid)**2)
+# Calcul de la MSE sur l'ensemble de test
+mse_test_grid = mean_squared_error(y_test,y_pred_grid)
  
 print("Meilleurs hyperparamètres (GridSearchCV) :", best_params_grid)
-print("Score R2 sur l'ensemble de test :", r2_test_grid)
+print("Score MSE sur l'ensemble de test :", mse_test_grid)
 ```
  
 %% Output
  
    Meilleurs hyperparamètres (GridSearchCV) : {'max_depth': 22, 'n_estimators': 20}
-    Score R2 sur l'ensemble de test : 0.17383407975848472
-
-    c:\Users\samys\anaconda\envs\jpp\Lib\site-packages\numpy\ma\core.py:2881: RuntimeWarning: invalid value encountered in cast
-      _data = np.array(data, dtype=dtype, copy=copy,
+    Score MSE sur l'ensemble de test : 0.17383407975848472
  
 %% Cell type:markdown id: tags:
  
-On constate qu'on a pas les memes meilleurs hyperparamètres  entre GridSearchCV et RandomSearchCV, et on obtient une meilleure MSE avec GridSearchCV sur l'ensemble de test ce qui est prévisible car GridSearchCV explore toutes les combinaisons d'hyperparamètres possibles contrairement à RandomSearchCV.
+On constate qu'on a pas les memes hyperparamètres optimaux entre GridSearchCV et RandomSearchCV ce qui est normal, et on obtient une meilleure MSE avec RandomSearchCV sur l'ensemble de test, ce qui est possible car les meilleurs hyperparamètres sur l'ensemble d'entrainement ne sont pas forcément meilleurs sur l'ensemble de test