initial commit

d5f624e6 · FERRAT Samy · 29baedb9 · d5f624e6 · d5f624e6 · d5f624e6
Commit d5f624e6 authored 7 months ago by FERRAT Samy
--- a/config/config.py
+++ b/config/config.py
-<<<<<<< HEAD
-TEST_RATIO = 0.3
-SEED = 13
-=======
 TEST_RATIO = 0.2
 SEED = 5
 SIZE_TRAIN=[20, 50, 100, 250, 450, 648]  
->>>>>>> 324e24a (dépot du projet)
--- a/notebook.ipynb
+++ b/notebook.ipynb
@@ -15,9 +15,21 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "SyntaxError",
+     "evalue": "invalid decimal literal (config.py, line 8)",
+     "output_type": "error",
+     "traceback": [
+      "Traceback \u001b[1;36m(most recent call last)\u001b[0m:\n",
+      "\u001b[0m  File \u001b[0;32mc:\\Users\\samys\\anaconda\\envs\\jpp\\Lib\\site-packages\\IPython\\core\\interactiveshell.py:3577\u001b[0m in \u001b[0;35mrun_code\u001b[0m\n    exec(code_obj, self.user_global_ns, self.user_ns)\u001b[0m\n",
+      "\u001b[1;36m  Cell \u001b[1;32mIn[8], line 19\u001b[1;36m\n\u001b[1;33m    from config.config import TEST_RATIO, SEED, SIZE_TRAIN\u001b[1;36m\n",
+      "\u001b[1;36m  File \u001b[1;32mc:\\Users\\samys\\OneDrive\\Bureau\\test_python_project\\projet-python-m2ds-2024\\config\\config.py:8\u001b[1;36m\u001b[0m\n\u001b[1;33m    >>>>>>> 324e24a (dépot du projet)\u001b[0m\n\u001b[1;37m                 ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid decimal literal\n"
+     ]
+    }
+   ],
   "source": [
    "import pandas as pd\n",
    "import missingno as msno\n",
@@ -42,14 +54,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0.2 5\n"
+     "ename": "NameError",
+     "evalue": "name 'TEST_RATIO' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[5], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mTEST_RATIO\u001b[49m, SEED)\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'TEST_RATIO' is not defined"
     ]
    }
   ],
@@ -1243,7 +1259,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.2"
+   "version": "3.13.0"
  },
  "orig_nbformat": 4
 },

 %% Cell type:markdown id: tags:
  
 <center>
    <h1 style="color:green">
        <b>
            <u> PROJECT </u>
        </b>
    </h1>
 </center>
  
 %% Cell type:code id: tags:
  
 ``` python
 import pandas as pd
 import missingno as msno
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 from src.data_processing.load_data import load_data
 from src.data_processing.preprocessing import rename_data
 from src.data_processing.preprocessing import cat_to_quant
 from src.data_processing.preprocessing import delete_feature
 from src.data_processing.preprocessing import encode_and_bind
 from src.data_processing.preprocessing import impute_mean
 from src.data_science.data import split_data
 from sklearn.metrics import r2_score
 from sklearn.linear_model import LinearRegression
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import RandomizedSearchCV
 from sklearn.model_selection import GridSearchCV
 from config.config import TEST_RATIO, SEED, SIZE_TRAIN
 ```
  
+%% Output
+
+Traceback     (most recent call last):
+      File c:\Users\samys\anaconda\envs\jpp\Lib\site-packages\IPython\core\interactiveshell.py:3577 in run_code
+    exec(code_obj, self.user_global_ns, self.user_ns)
+      Cell In[8], line 19
+        from config.config import TEST_RATIO, SEED, SIZE_TRAIN
+      File c:\Users\samys\OneDrive\Bureau\test_python_project\projet-python-m2ds-2024\config\config.py:8
+        >>>>>>> 324e24a (dépot du projet)
+                     ^
+    SyntaxError: invalid decimal literal
+
 %% Cell type:code id: tags:
  
 ``` python
 print(TEST_RATIO, SEED)
 ```
  
 %% Output
  
-    0.2 5
+    ---------------------------------------------------------------------------
+    NameError                                 Traceback (most recent call last)
+Cell     In[5], line 1
+    ----> 1 print(TEST_RATIO, SEED)
+    NameError: name 'TEST_RATIO' is not defined
  
 %% Cell type:markdown id: tags:
  
 # EDA
  
 %% Cell type:markdown id: tags:
  
 ## Chargement et prétraitement des données
  
 %% Cell type:code id: tags:
  
 ``` python
 # Load the raw house-price dataset from its CSV file through the project loader.
 df = load_data('data/raw/house_prices.csv')
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 df.head()
 ```
  
 %% Output
  
            price    AreA  bedrooms  BATHROOMS  stories mainroad guestroom  \
    0   4543000.0  4990.0       4.0        2.0      2.0      yes       yes
    1   8080940.0  7000.0       3.0        2.0      4.0      yes        no
    2   8750000.0  4321.0       3.0        2.0      2.0      yes        no
    3   1890000.0  1700.0       3.0        1.0      2.0      yes        no
    4  12215000.0  7500.0       4.0        2.0      2.0      yes        no
    
      basement hotwaterheating air conditioning  parking prefarea  \
    0      yes              no               no      0.0      yes
    1       no              no              yes      2.0       no
    2      yes             yes               no      2.0       no
    3       no              no               no      0.0       no
    4      yes              no              yes      3.0      yes
    
      furnishing STATUS  houSeaGe
    0         furnished      15.0
    1         FURNISHED      11.0
    2         FURNISHED       NaN
    3       unfurnished       NaN
    4         furnished       NaN
  
 %% Cell type:code id: tags:
  
 ``` python
 df = rename_data(df) # Clean up the column names
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 df = cat_to_quant(df) # Convert the categorical data into quantitative data
 ```
  
 %% Cell type:markdown id: tags:
  
 On transforme la variable 'furnishing_status' en variable numérique
  
 %% Cell type:code id: tags:
  
 ``` python
 # Encode the categorical 'furnishing_status' column numerically
 # (presumably one-hot, given the 'furnished' / 'semi-furnished' columns
 # visible in the processed frame -- confirm against encode_and_bind).
 df=encode_and_bind(df,'furnishing_status')
 ```
  
 %% Cell type:markdown id: tags:
  
 Visualisons les données manquantes dans le jeu de données
  
 %% Cell type:code id: tags:
  
 ``` python
 # Boolean mask of the frame: True wherever a value is missing.
 missing = df.isna()
 
 # Render the mask as a heatmap so gaps can be spotted column by column.
 plt.figure(figsize=(6, 4))
 sns.heatmap(missing, yticklabels=False, cbar=False)
 plt.title('Visualisation de données manquantes')
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 On décide de supprimer la colonne 'houseage', car elle contient presque exclusivement des données manquantes, et d'imputer les valeurs manquantes des autres variables par leurs moyennes respectives.
  
 %% Cell type:code id: tags:
  
 ``` python
 df = delete_feature(df,'houseage') # Drop the 'houseage' column
 # Fill the remaining missing values with the mean of their column.
 df = impute_mean(df)
 ```
  
 %% Cell type:markdown id: tags:
  
 Voici un aperçu de notre jeu de données après le prétraitement :
  
 %% Cell type:code id: tags:
  
 ``` python
 df.head()
 ```
  
 %% Output
  
          price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
    0   4543000  4990         4          2        2         1          1
    1   8080940  7000         3          2        4         1          0
    2   8750000  4321         3          2        2         1          0
    3   1890000  1700         3          1        2         1          0
    4  12215000  7500         4          2        2         1          0
    
       basement  hotwaterheating  air_conditioning  parking  prefarea  furnished  \
    0         1                0                 0        0         1          1
    1         0                0                 1        2         0          1
    2         1                1                 0        2         0          1
    3         0                0                 0        0         0          0
    4         1                0                 1        3         1          1
    
       semi-furnished
    0               0
    1               0
    2               0
    3               0
    4               0
  
 %% Cell type:markdown id: tags:
  
 ## Exploration des données
  
 %% Cell type:code id: tags:
  
 ``` python
  
 # Distribution of the sale price (histogram with a KDE overlay).
 plt.figure(figsize=(6, 4))
 sns.histplot(df['price'], kde=True)
 plt.title('Distribution des prix de vente')
 plt.xlabel('Prix de vente')
 plt.ylabel('Fréquence')
 plt.show()
 
 # Number of houses with / without air conditioning.
 # Fix: the plot used to read the 'guestroom' column although both the
 # comment and the axis label describe air conditioning.
 plt.figure(figsize=(6, 4))
 sns.countplot(x=df['air_conditioning'])
 plt.xlabel('Climatisation')
 plt.ylabel('Nombre de maisons')
 plt.xticks([0, 1], ['Non', 'Oui'])
 plt.show()
 
 # Horizontal bar chart of the number of houses per bedroom count,
 # most frequent bedroom count first.
 plt.figure(figsize=(6, 4))
 sns.countplot(y='bedrooms', data=df, order=df['bedrooms'].value_counts().index)
 plt.title('Nombre de chambres par maison')
 plt.xlabel('Nombre de maisons')
 # Fix: the y axis carries bedroom counts, not neighbourhoods ('Quartier').
 plt.ylabel('Nombre de chambres')
 plt.show()
 ```
  
 %% Output
  

  

  

  
 %% Cell type:code id: tags:
  
 ``` python
 # Scatter plot of the sale price against the surface area.
 plt.figure(figsize=(6, 4))
 sns.scatterplot(data=df, x='area', y='price')
 plt.title('Relation entre la superficie et le prix de vente')
 plt.xlabel('Superficie')
 plt.ylabel('Prix de vente')
 plt.show()
 
 # Pairwise correlations between all columns of the frame, shown as an
 # annotated heatmap.
 correlation_matrix = df.corr()
 plt.figure(figsize=(7, 5))
 sns.heatmap(correlation_matrix, fmt=".2f", cmap='coolwarm', annot=True)
 plt.title('Matrice de corrélation des variables numériques')
 plt.show()
 ```
  
 %% Output
  

  

  
 %% Cell type:markdown id: tags:
  
 On remarque que la plus forte corrélation est entre la variable 'price' et 'bathrooms', ce qui pouvait être attendu étant donné qu'avoir plusieurs salles de bain est réservé aux maisons assez luxueuses ; la seconde plus forte corrélation est entre la variable 'price' et 'area', ce qui est aussi prévisible.
  
 %% Cell type:markdown id: tags:
  
 # Validation croisée
  
 %% Cell type:code id: tags:
  
 ``` python
 # Split the frame into train/test parts with 'price' as the target, using the
 # ratio and seed imported from config.config.
 # NOTE(review): the result is unpacked as (X_train, y_train, X_test, y_test),
 # which differs from sklearn's train_test_split order -- confirm against split_data.
 X_train, y_train, X_test, y_test = split_data(df,test_ratio=TEST_RATIO,random_seed=SEED,target_column='price')
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 # Standardise the features and the target with two separate scalers.
 normalization = StandardScaler()
 normalization_y = StandardScaler()
 
 # The scalers expect 2-D input, so the targets become column vectors.
 y_train = np.array(y_train).reshape(-1, 1)
 y_test = np.array(y_test).reshape(-1, 1)
 
 # Features: fit on the training split only, then reuse the fitted
 # statistics for the test split.
 X_train = normalization.fit_transform(X_train)
 X_test = normalization.transform(X_test)
 
 # Target: same scheme, with its own scaler.
 y_train = normalization_y.fit_transform(y_train)
 y_test = normalization_y.transform(y_test)
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 # Flatten y_train and y_test back to 1-D arrays
  
 y_train = y_train.ravel()
 y_test = y_test.ravel()
 ```
  
 %% Cell type:markdown id: tags:
  
 # Machine Learning
  
 %% Cell type:markdown id: tags:
  
 ## Entrainement sur la variable bedrooms
  
 %% Cell type:code id: tags:
  
 ``` python
 # Baseline: linear regression on feature column 1 only (the section heading
 # names it 'bedrooms'), fitted on growing prefixes of the training set.
 size_train = SIZE_TRAIN  # training-set sizes to evaluate
 list_error = []
 
 for n in size_train:
     # First n training rows, restricted to feature column 1.
     X_train_subset = X_train[:n, 1]
     y_train_subset = y_train[:n]
 
     # LinearRegression needs a 2-D design matrix, hence the reshape.
     lin_reg_baseline = LinearRegression()
     lin_reg_baseline.fit(X_train_subset.reshape(-1, 1), y_train_subset)
 
     # Predict on the full test set, using the same single feature.
     y_pred = lin_reg_baseline.predict(X_test[:, 1].reshape(-1, 1))
 
     # Mean squared error on the test set.
     error_mse = np.mean((y_test - y_pred) ** 2)
     list_error.append(error_mse)
 ```
  
 %% Cell type:code id: tags:
  
 ``` python
 # Learning curve: test MSE as a function of the training-set size.
 plt.plot(size_train, list_error, marker='o')
 plt.title("Erreur en fonction de la taille de l'ensemble d'entraînement")
 plt.xlabel("Taille de l'ensemble d'entraînement")
 plt.ylabel("Erreur (MSE)")
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 On voit un optimum de la taille de l'ensemble d'entraînement pour n=648 (l'ensemble du jeu d'entraînement). On remarque aussi que l'erreur MSE est plus grande pour n=250 et n=450 que pour n=100 et n=50, ce qui montre qu'un ensemble d'entraînement plus grand n'implique pas systématiquement de meilleures performances, même si ici cela pourrait être lié à la simplicité du modèle.
  
 %% Cell type:markdown id: tags:
  
 ## Entrainement d'une regression linéaire sur toutes les données
  
 %% Cell type:code id: tags:
  
 ``` python
 # Linear regression on all features, fitted on growing prefixes of the
 # training set. After the loop, lin_reg is the model fitted on the largest size.
 list_error = []
 for n in size_train:
     X_train_subset, y_train_subset = X_train[:n], y_train[:n]
 
     lin_reg = LinearRegression()
     lin_reg.fit(X_train_subset, y_train_subset)
 
     y_pred = lin_reg.predict(X_test)
 
     # R2 is printed for reference; the test MSE is what gets stored for the plot.
     print(r2_score(y_test, y_pred))
     error_mse = np.mean((y_test - y_pred) ** 2)
     list_error.append(error_mse)
 ```
  
 %% Output
  
    -0.18908813220600362
    0.4982825736251134
    0.6595360217499794
    0.6738827057529095
    0.6710848125168166
    0.6714672798459871
  
 %% Cell type:code id: tags:
  
 ``` python
 # Learning curve of the full linear model: stored test error against the
 # training-set size.
 plt.plot(size_train, list_error, marker='o')
 plt.title("Erreur en fonction de la taille de l'ensemble d'entraînement")
 plt.xlabel("Taille de l'ensemble d'entraînement")
 plt.ylabel("Erreur")
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 L'ajout de données améliore la performance du modèle ; cependant, on constate une très faible amélioration du score à partir de n=100, alors qu'elle est très forte entre n=20 et n=100.
  
 %% Cell type:markdown id: tags:
  
 ## Forets aléatoires
  
 %% Cell type:code id: tags:
  
 ``` python
 # Hyperparameter values explored for the random forest.
 n_estimators_list = [5, 10, 20, 50]
 max_depth_list = [5, 15, 22, 30, 40, 50, 60]
 
 # Test MSE of every (n_estimators, max_depth) pair, grouped by n_estimators.
 results = {}
 
 for n_estimators in n_estimators_list:
     test_scores = []
 
     for max_depth in max_depth_list:
         # Fixed random_state so the sweep is reproducible.
         rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
         rf.fit(X_train, y_train)
         y_pred = rf.predict(X_test)
 
         # Mean squared error on the test set (this is not an R2 score).
         error_mse_forest = np.mean((y_test - y_pred) ** 2)
         test_scores.append(error_mse_forest)
 
     results[n_estimators] = {
         "max_depth": max_depth_list,
         "test_scores": test_scores,
     }
 
 # One subplot per n_estimators value on a 2-column grid sized to the number
 # of curves (the previous fixed 3x2 grid left an empty row).
 n_cols = 2
 n_rows = -(-len(n_estimators_list) // n_cols)  # ceiling division
 plt.figure(figsize=(15, 10))
 for i, n_estimators in enumerate(n_estimators_list):
     plt.subplot(n_rows, n_cols, i + 1)
     plt.plot(results[n_estimators]["max_depth"], results[n_estimators]["test_scores"], label="Test")
     plt.title(f"n_estimators = {n_estimators}")
     plt.xlabel("max_depth")
     # Fix: the plotted quantity is the test MSE, not an R2 score.
     plt.ylabel("MSE")
     plt.legend()
 
 # Adjust the layout once, after all subplots exist.
 plt.tight_layout()
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 On voit que la meilleure MSE est obtenue pour 50 estimateurs et une profondeur maximale de 15 (au-delà, elle stagne) ; on constate aussi qu'on a une meilleure MSE que pour la régression linéaire.
  
 %% Cell type:code id: tags:
  
 ``` python
 # Refit the forest with the retained hyperparameters (50 trees, max depth 15).
 # NOTE(review): these values were picked by comparing MSE on the test split in
 # the sweep, so the test score reported afterwards is likely optimistic.
 rf = RandomForestRegressor(n_estimators=50, max_depth=15, random_state=42)
 rf.fit(X_train,y_train)
 ```
  
 %% Output
  
    RandomForestRegressor(max_depth=15, n_estimators=50, random_state=42)
  
 %% Cell type:code id: tags:
  
 ``` python
 r2_score(y_test,rf.predict(X_test))
 ```
  
 %% Output
  
    0.8074574314651273
  
 %% Cell type:markdown id: tags:
  
 ## Features Importances
  
 %% Cell type:markdown id: tags:
  
 ### Régression linéaire
  
 %% Cell type:code id: tags:
  
 ``` python
 # Coefficients of the most recently fitted linear model.
 coefficients = lin_reg.coef_
 
 # Pair each coefficient with a feature name: every column of df except the
 # first one ('price', the target).
 coef = pd.DataFrame({'Coefficient': coefficients, 'Columns': df.columns[1:]})
 print(coef)
 ```
  
 %% Output
  
        Coefficient           Columns
    0      0.251262              area
    1      0.063651          bedrooms
    2      0.306408         bathrooms
    3      0.152747           stories
    4      0.085228          mainroad
    5      0.063935         guestroom
    6      0.063827          basement
    7      0.125000   hotwaterheating
    8      0.224648  air_conditioning
    9      0.137781           parking
    10     0.134494          prefarea
    11     0.068656         furnished
    12     0.079305    semi-furnished
  
 %% Cell type:code id: tags:
  
 ``` python
 # Horizontal bar chart of the linear-regression coefficients, one bar per feature.
 positions = range(len(coefficients))
 plt.barh(positions, coefficients)
 plt.yticks(positions, df.columns[1:])
 plt.ylabel('Caractéristiques')
 plt.xlabel('Importance de la caractéristique')
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 Étant donné que le jeu de données a été centré-réduit, nous pouvons directement analyser l'importance de chaque feature par son coefficient associé. On constate donc que les coefficients les plus forts sont associés d'abord à la variable 'bathrooms' (coefficient d'environ 0.306), puis à la variable 'area' (coefficient d'environ 0.251) ; ces résultats concordent avec notre visualisation des corrélations entre variables.
  
 %% Cell type:markdown id: tags:
  
 ### Forets aléatoires
  
 %% Cell type:code id: tags:
  
 ``` python
 # Feature importances of the fitted random forest.
 coefficients_rf = rf.feature_importances_
 
 # Pair each importance with a feature name: every column of df except the
 # first one ('price', the target).
 coef_rf = pd.DataFrame({'Coefficient': coefficients_rf, 'Columns': df.columns[1:]})
 print(coef_rf)
 ```
  
 %% Output
  
        Coefficient           Columns
    0      0.364237              area
    1      0.025185          bedrooms
    2      0.304803         bathrooms
    3      0.049687           stories
    4      0.009976          mainroad
    5      0.020051         guestroom
    6      0.021538          basement
    7      0.028398   hotwaterheating
    8      0.048936  air_conditioning
    9      0.054310           parking
    10     0.031695          prefarea
    11     0.020924         furnished
    12     0.020259    semi-furnished
  
 %% Cell type:code id: tags:
  
 ``` python
 # Horizontal bar chart of the random-forest importances, one bar per feature.
 positions_rf = range(len(coefficients_rf))
 plt.barh(positions_rf, coefficients_rf)
 plt.yticks(positions_rf, df.columns[1:])
 plt.ylabel('Caractéristiques')
 plt.xlabel('Importance de la caractéristique')
 plt.show()
 ```
  
 %% Output
  

  
 %% Cell type:markdown id: tags:
  
 Ici encore plus que pour la régression linéaire les 2 variables les plus importantes dans le modèle sont 'bathrooms' et 'area'
  
 %% Cell type:markdown id: tags:
  
 ## Bonus: RandomizedSearchCV and GridSearchCV
  
 %% Cell type:markdown id: tags:
  
 GridSearchCV fait de la validation croisée sur toutes les combinaisons de paramètres qu'on lui fournit ; RandomizedSearchCV, quant à elle, fait de la validation croisée sur un nombre de combinaisons tirées aléatoirement, spécifié par le paramètre "n_iter".
 ##### Avantages et inconvénients :
 - En testant toutes les combinaisons de paramètres, GridSearchCV fournit la meilleure combinaison de la grille (au sens du score de validation croisée), mais est plus coûteuse en calcul que RandomizedSearchCV. Inversement, RandomizedSearchCV ne trouve pas forcément la combinaison optimale, mais la méthode est moins coûteuse ; en pratique, pour des modèles complexes et sur de gros jeux de données, RandomizedSearchCV est plus utilisée.
  
 %% Cell type:code id: tags:
  
 ``` python
  
 # Search space: the tree counts and depths already used in the manual sweep.
 param_distributions = {
     'n_estimators': n_estimators_list,
     'max_depth': max_depth_list,
 }
 
 rf = RandomForestRegressor(random_state=42)
 
 # Draw 8 parameter combinations at random; each one is scored with
 # 5-fold cross-validated R2.
 random_search = RandomizedSearchCV(
     rf,
     param_distributions,
     n_iter=8,
     scoring='r2',
     cv=5,
     random_state=42,
 )
 random_search.fit(X_train, y_train)
 
 best_model_random = random_search.best_estimator_
 best_params_random = random_search.best_params_
 
 # R2 of the best estimator on the held-out test set.
 y_pred_random = best_model_random.predict(X_test)
 r2_test_random = r2_score(y_test, y_pred_random)
 
 print("Meilleurs hyperparamètres (RandomizedSearchCV) :", best_params_random)
 print("Score R2 sur l'ensemble de test :", r2_test_random)
 ```
  
 %% Output
  
    Meilleurs hyperparamètres (RandomizedSearchCV) : {'n_estimators': 20, 'max_depth': 50}
    Score R2 sur l'ensemble de test : 0.7964509919711419
  
 %% Cell type:code id: tags:
  
 ``` python
 # Already imported at the top of the notebook; kept so this cell can run on its own.
 from sklearn.model_selection import GridSearchCV
 
 # Exhaustive grid: every (n_estimators, max_depth) pair from the manual sweep.
 param_grid = {
     'n_estimators': n_estimators_list,
     'max_depth': max_depth_list
 }
 
 # Fix: seed the forest like every other cell does. Without random_state the
 # selected hyperparameters and the reported score change from run to run.
 rf = RandomForestRegressor(random_state=42)
 
 # Each combination is scored with 5-fold cross-validated R2.
 grid_search = GridSearchCV(
     estimator=rf,
     param_grid=param_grid,
     scoring='r2',
     cv=5
 )
 
 grid_search.fit(X_train, y_train)
 
 best_params_grid = grid_search.best_params_
 best_model_grid = grid_search.best_estimator_
 
 # R2 of the best estimator on the held-out test set.
 y_pred_grid = best_model_grid.predict(X_test)
 r2_test_grid = r2_score(y_test, y_pred_grid)
 
 print("Meilleurs hyperparamètres (GridSearchCV) :", best_params_grid)
 print("Score R2 sur l'ensemble de test :", r2_test_grid)
 ```
  
 %% Output
  
    Meilleurs hyperparamètres (GridSearchCV) : {'max_depth': 40, 'n_estimators': 50}
    Score R2 sur l'ensemble de test : 0.8019025540167176
  
 %% Cell type:markdown id: tags:
  
 On constate qu'on n'a pas les mêmes meilleurs hyperparamètres entre GridSearchCV et RandomizedSearchCV, et on obtient un meilleur score R2 pour GridSearchCV, ce qui est attendu car GridSearchCV explore toutes les combinaisons d'hyperparamètres possibles, contrairement à RandomizedSearchCV qui n'en échantillonne qu'une partie (ici n_iter=8).

--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ numpy>=1.24.3
 matplotlib>=3.7.1
 seaborn>=0.12.2
 scikit-learn>=1.2.2
+missingno>=0.5.2
\ No newline at end of file