diff --git a/src/.Rhistory b/src/.Rhistory
new file mode 100644
index 0000000000000000000000000000000000000000..7de21e43a55a0df46876f9bc7280594f5ac384e4
--- /dev/null
+++ b/src/.Rhistory
@@ -0,0 +1,2 @@
+if (!require("SNPRelate", quietly = TRUE))
+BiocManager::install("SNPRelate")
diff --git a/src/download_data.R b/src/download_data.R
new file mode 100644
index 0000000000000000000000000000000000000000..61e5291ad5d4922b8943bd354266bfbdebf5fbf0
--- /dev/null
+++ b/src/download_data.R
@@ -0,0 +1,54 @@
+options(repos = c(CRAN = "https://cloud.r-project.org"))
+
+# Install 'remotes' if it's not already installed
+if (!require("remotes")) {
+  install.packages("remotes", dependencies = TRUE)
+  library(remotes)
+}
+
+# Load necessary library and install a specific version if not present
+if (!requireNamespace("digest", quietly = TRUE)) {
+  remotes::install_version("digest", version = "0.6.25", repos = "https://cloud.r-project.org")
+}
+# Load necessary library
+if (!require("digest")) install.packages("digest", dependencies = TRUE)
+library(digest)
+
+# Define variables
+# wdir="/shared/projects/2427_data_master/user/agonzalez/m2bsgreprod/src"
+wdir="."
+results_dir <- file.path(wdir, "results")
+url <- "https://d1ypx1ckp5bo16.cloudfront.net/penncath/penncath.tar.gz"
+dest_file <- file.path(results_dir, "penncath.tar.gz")
+expected_md5 <- "5d5f422aeafdd2d725ad93f447d9af4b"
+
+# Create results directory if it doesn't exist
+if (!dir.exists(results_dir)) {
+  dir.create(results_dir, recursive = TRUE)
+}
+
+# Check if the file exists
+if (!file.exists(dest_file)) {
+  message("File does not exist. Downloading...")
+  # Download the file
+  download.file(url, dest_file, method = "auto")
+} else {
+  message("File already exists.")
+}
+
+# Verify the MD5 checksum
+actual_md5 <- digest(dest_file, algo = "md5", file = TRUE)
+
+if (actual_md5 == expected_md5) {
+  message("MD5 checksum matches! Proceeding to extract the file...")
+  
+  # Uncompress the file
+  untar(dest_file, exdir = results_dir)
+  message("File uncompressed successfully!")
+  
+} else {
+  stop("MD5 checksum does not match!")
+}
+
+
+
diff --git a/src/extp2.R b/src/extp2.R
new file mode 100644
index 0000000000000000000000000000000000000000..ed23afb7b725a459958e2815d06f9353bdbbbabe
--- /dev/null
+++ b/src/extp2.R
@@ -0,0 +1,28 @@
+
+wdir ="/amuhome/s23014817/m2bsgreprod/sources"
+dir.create(wdir, showWarnings = F, recursive = T)
+setwd(wdir)
+
+library(devtools)
+
+if (!require("BiocManager", quietly = TRUE)) #boucle qui permet de dire que s'il n'est pas installé il faut l'installer mais s'il est déjà installé pas besoin 
+  install.packages("BiocManager")
+
+if (!require("snpStats", quietly = TRUE))
+  BiocManager::install("snpStats")
+
+if (!require("SNPRelate", quietly = TRUE))
+  BiocManager::install("SNPRelate")
+
+library(snpStats)
+library(SNPRelate)
+
+# Les données analysées nécessitant beaucoup de RAM, nous allons sélectionner aléatoirement 250000 SNPs et réecrire des fichiers bed, bim, penncath
+penncath_bed_path = "/amuhome/s23014817/m2bsgreprod/results/data/penncath.bed"
+penncath_bim_path = "/amuhome/s23014817/m2bsgreprod/results/data/penncath.bim"
+penncath_fam_path = "/amuhome/s23014817/m2bsgreprod/results/data/penncath.fam"
+geno <- read.plink(penncath_bed_path, penncath_bim_path, penncath_fam_path, select.snps=sample(1:861473, 25000, replace = FALSE ), na.strings = ("-9"))
+
+
+
+
diff --git a/src/tp1.R b/src/tp1.R
new file mode 100644
index 0000000000000000000000000000000000000000..44b14f311b1cadf89e0651bbdabe4c9b3cef90dc
--- /dev/null
+++ b/src/tp1.R
@@ -0,0 +1,45 @@
+options(repos = c(CRAN = "https://cloud.r-project.org"))
+output_dir = "results/tp1"
+dir.create(output_dir, showWarnings = F, recursive = T)
+
+# if (!require("BiocManager", quietly = TRUE))
+#  install.packages("BiocManager")
+
+#if (!require("snpStats", quietly = TRUE))
+#  BiocManager::install("snpStats")
+
+#if (!require("SNPRelate", quietly = TRUE))
+#BiocManager::install("SNPRelate")
+
+# Charger les bibliothèques
+#library(snpStats)
+#library(SNPRelate)
+#library(devtools)
+#library(plyr)
+
+# Les données analysées nécessitant beaucoup de RAM, nous allons sélectionner aléatoirement 250000 SNPs et réecrire des fichiers bed, bim, fam
+penncath_bed_path = "results/data/penncath.bed"
+penncath_bim_path = "results/data/penncath.bim"
+penncath_fam_path = "results/data/penncath.fam"
+
+geno <- snpStats::read.plink(penncath_bed_path, penncath_bim_path, penncath_fam_path, select.snps=sample(1:861473, 25000, replace = FALSE ), na.strings = ("-9"))
+
+plink_base=file.path(output_dir, "plink_base")
+snpStats::write.plink(plink_base, snps=geno$genotypes, pedigree=geno$fam[,1], id=geno$fam[,1], mother=geno$fam[,4], sex=geno$fam[,5], phenotype=geno$fam[,6], chromosome = geno$map[,1], genetic.distance = geno$map[,3], position = geno$map[,4], allele.1 = geno$map[,5], allele.2 = geno$map[,6], na.code = ("-9"))
+
+genoBim<-geno$map
+colnames(genoBim)<-c("chr", "SNP", "gen.dist", "position", "A1", "A2")
+#head(genoBim)
+
+genotype<-geno$genotype
+#print(genotype)
+
+genoFam<-geno$fam
+#head(genoFam)
+
+# On commence par libérer de l'espace
+rm(geno)
+
+rdata_path = file.path(output_dir, "TP1_asbvg.RData")
+save.image(rdata_path)
+
diff --git a/src/tp2.R b/src/tp2.R
new file mode 100644
index 0000000000000000000000000000000000000000..1ab01b543ea7efa6d6f0ff9f507543d517bdf0bc
--- /dev/null
+++ b/src/tp2.R
@@ -0,0 +1,20 @@
+#load data
+load("results/tp1/TP1_asbvg.RData")
+
+#fichier de sortie
+output_dir = "results/tp2"
+dir.create(output_dir, showWarnings = F, recursive = T)
+
+#on installe et ouvre les packages/librairies nécessaires
+
+if (!require("BiocManager", quietly=TRUE))
+  install.packages("BiocManager")
+
+if (!require("SNPRelate", quietly=TRUE))
+  BiocManager::install("SNPRelate")
+
+#on enregistre le fichier de sortie 
+
+rdata_path = file.path(output_dir, "TP2_asbvg.RData")
+save.image(rdata_path)
+
diff --git a/src/tp2rmd.R b/src/tp2rmd.R
new file mode 100644
index 0000000000000000000000000000000000000000..dbf0a3a0b69ebe9522be5cfc0d0d1d7154cfde3f
--- /dev/null
+++ b/src/tp2rmd.R
@@ -0,0 +1,123 @@
+#on télécharge le fichier de données 
+
+load("results/tp1/TP1_asbvg.RData")
+
+#on crée un dossier dans lequel se trouveront les fichiers de sortie
+
+output_dir ="results/tp2"
+dir.create(output2_dir, showWarnings = F, recursive = T)
+
+#on installe les librairies nécessaires
+
+if (!require("BiocManager", quietly = TRUE)) # 
+  install.packages("BiocManager")
+
+if (!require("SNPRelate", quietly = TRUE))
+  BiocManager::install("SNPRelate")
+
+#on charge la librairie 
+
+#library(SNPRelate)  
+
+#on ouvre le fichier contenant les données SNPs
+
+#genofile <- SNPRelate::snpgdsOpen(fichier$gds, readonly = TRUE)
+
+# De nouveau on effectue une étape d'élagage
+# Placer le seuil de LD à 0.2
+
+ld.thresh <- 0.2
+
+set.seed(1000)
+geno.sample.ids <- rownames(genotype)
+snpSUB <- SNPRelate::snpgdsLDpruning(genofile, ld.threshold = ld.thresh,
+                          sample.id = geno.sample.ids, # On analyse uniquement les individus filtrés
+                          snp.id = colnames(genotype)) # On analyse uniquement les SNPs filtrés
+
+snpset.pca <- unlist(snpSUB, use.names=FALSE)
+cat(length(snpset.pca),"SNPs will be used in PCA analysis. \n") 
+
+pca <- SNPRelate::snpgdsPCA(genofile, sample.id = geno.sample.ids,  snp.id = snpset.pca, num.thread=1)
+
+# Calcul et sauvegarde des 10 composantes principales en un data frame où chaque colonne est une PC
+
+pcs <- data.frame(FamID = pca$sample.id, pca$eigenvect[,1 : 10], stringsAsFactors = FALSE)
+colnames(pcs)[2:11]<-paste("pc", 1:10, sep = "")
+
+print(head(pcs)) # Exercice 1
+closefn.gds(genofile)
+save(genotype, genoBim, clinical, pcs, file=working.data.fname(5))
+
+#Exercice 1
+
+# Lecture des données de 1000G pour le chromosome 16
+pedfile.bed <- read.pedfile(plink_base.bed$ped, snps = plink_base.bed$info, which=1)
+pedfile.fam <- read.pedfile(plink_base.fam$ped, snps = plink_base.fam$info , which=1)
+pedffile.bim <- read.pedfile(plink_base.bim$ped, snps = plink_base.bim$info, which=1)
+
+
+# Obtenir/stocker les données génotypiques pour le chromosome 16
+genoMatrix <- thougeno$genotypes
+
+# Obtenir/stocker les positions chromosomiques pour chaque SNP
+support <- thougeno$map
+colnames(support)<-c("SNP", "position", "A1", "A2")
+head(support)
+
+# Imputation des SNPs non génotypés issus des données 1000G
+# Générer une "sous"-matrice comportant les SNPs génotypés et filtrés du chromosome 16
+presSnps <- colnames(genotype)
+presDatChr <- genoBim[genoBim$SNP %in% presSnps & genoBim$chr==16, ]
+targetSnps <- presDatChr$SNP
+
+# Générer deux "sous"-matrice de SNPs issus de 1000G : "present" (matrice des SNPs 1000G typés dans nos données) et "missing" (matrice des SNPs 1000G non-typés dans nos données). Ces matrices sont nécessaires à l'établissement des règles d'imputation
+is.present <- colnames(genoMatrix) %in% targetSnps
+missing <- genoMatrix[,!is.present]
+present <- genoMatrix[,is.present]
+print(missing) 
+print(present)    
+
+#Exercice 2
+
+# Obtenir et stocker les positions des SNPs utilisés dans les règles d'imputation
+pos.pres <- support$position[is.present]
+pos.miss <- support$position[!is.present]
+
+# Calcul et sauvegarde des règles d'imputation à l'aide de snp.imputation()
+rules <- snp.imputation(present, missing, pos.pres, pos.miss)
+
+#Exercice 3
+
+# Suppression des imputations ayant échouées
+rules <- rules[can.impute(rules)]
+cat("Imputation rules for", length(rules), "SNPs were estimated\n") 
+
+#Exercice 4
+
+# Contrôles-qualité : incertitude et faible MAF estimée
+# Fixation des seuils
+r2threshold <- 0.7
+minor <- 0.01
+
+# Filtre sur l'incertitude et la faible MAF
+rules <- rules[imputation.r2(rules) >= r2threshold]
+cat(length(rules),"imputation rules remain after imputations with low certainty were removed\n")  
+
+rules <- rules[imputation.maf(rules) >= minor]
+cat(length(rules),"imputation rules remain after MAF filtering\n") 
+
+# Obtenir le génotype des SNPs imputés
+target <- genotype[,targetSnps]
+imputed <- impute.snps(rules, target, as.numeric=FALSE)
+print(imputed)  
+
+
+#on crée le fichier de sortie 
+
+rdata_path=file.path(output_dir, "TP2_asbvg.RData")
+save.image(rdata2_path) 
+
+
+
+
+
diff --git a/src/tp3.R b/src/tp3.R
new file mode 100644
index 0000000000000000000000000000000000000000..58cf65fd4b20eb4a4277e804a02446cdaf4ec60f
--- /dev/null
+++ b/src/tp3.R
@@ -0,0 +1,25 @@
+#load data
+load("results/tp2/TP2_asbvg.RData")
+
+#fichier de sortie
+output_dir = "results/tp3"
+dir.create(output_dir, showWarnings = F, recursive = T)
+
+#library
+if (!require("BiocManager", quietly = TRUE))
+  install.packages("BiocManager")
+
+if (!require("snpStats", quietly = TRUE))
+  BiocManager::install("snpStats")
+
+#if (!require("doParallel", quietly = TRUE))
+  #BiocManager::install("doParallel")
+
+library(snpStats)
+library(plyr)
+library(GenABEL)
+#library(doParallel)
+
+rdata_path = file.path(output_dir,"TP3_asbvg.RData")
+save.image(rdata_path)
+
diff --git a/src/tp4.R b/src/tp4.R
new file mode 100644
index 0000000000000000000000000000000000000000..89340804dbc635b96a5136de52dd128ca6f36946
--- /dev/null
+++ b/src/tp4.R
@@ -0,0 +1,20 @@
+#load data
+load("results/tp3/TP3_asbvg.RData")
+
+#fichier de sortie
+output_dir = "results/tp4"
+dir.create(output_dir, showWarnings = F, recursive = T)
+
+#Library
+if(!require("dplyr",quietly = T))
+  install.packages("dplyr")
+
+if(!require("plyr",quietly = T))
+  install.packages("plyr")
+
+if(!require("GenABEL",quietly = T))
+  install.packages("GenABEL")
+
+rdata_path = file.path(output_dir,"TP4_asbvg.RData")
+save.image(rdata_path)
+
diff --git a/workflows/makefile b/workflows/makefile
new file mode 100644
index 0000000000000000000000000000000000000000..32f14a625289abf5ef5443caf95bb4aa908a693a
--- /dev/null
+++ b/workflows/makefile
@@ -0,0 +1,15 @@
+all:results/tp4/TP4_asbvg.RData
+	Rscript src/tp4.R
+
+results/tp4/TP4_asbvg.RData:results/tp3/TP3_asbvg.RData
+	Rscript src/tp3.R
+
+results/tp3/TP3_asbvg.RData:results/tp2/TP2_asbvg.RData
+	Rscript src/tp2.R
+
+results/tp2/TP2_asbvg.RData:results/tp1/plink_base.bed results/tp1/plink_base.bim results/tp1/plink_base.fam results/tp1/TP1_asbvg.RData
+	Rscript src/tp1.R
+
+results/tp1/plink_base.bed results/tp1/plink_base.bim results/tp1/plink_base.fam results/tp1/TP1_asbvg.RData:results/data/penncath.bed results/data/penncath.bim results/data/penncath.fam
+	Rscript src/download_data.R
+
diff --git a/workflows/makefile_ex b/workflows/makefile_ex
new file mode 100644
index 0000000000000000000000000000000000000000..5032c3baf51bd547e8513796ba0a33c88a220d2a
--- /dev/null
+++ b/workflows/makefile_ex
@@ -0,0 +1,9 @@
+# Cible principale : exécuter le script tp2.R après avoir obtenu les fichiers nécessaires
+.PHONY: run_tp2
+run_tp2: results/data/penncath.bim results/data/penncath.bed results/data/penncath.fam
+	Rscript sources/tp2.R  
+
+# Générer les fichiers penncath.bim, penncath.bed et penncath.fam en exécutant download_data.R
+results/data/penncath.bim results/data/penncath.bed results/data/penncath.fam:
+	Rscript sources/download_data.R  # Ceci aussi doit commencer par une tabulation
+