diff --git a/src/.Rhistory b/src/.Rhistory new file mode 100644 index 0000000000000000000000000000000000000000..7de21e43a55a0df46876f9bc7280594f5ac384e4 --- /dev/null +++ b/src/.Rhistory @@ -0,0 +1,2 @@ +if (!require("SNPRelate", quietly = TRUE)) +BiocManager::install("SNPRelate") diff --git a/src/download_data.R b/src/download_data.R new file mode 100644 index 0000000000000000000000000000000000000000..61e5291ad5d4922b8943bd354266bfbdebf5fbf0 --- /dev/null +++ b/src/download_data.R @@ -0,0 +1,54 @@ +options(repos = c(CRAN = "https://cloud.r-project.org")) + +# Install 'remotes' if it's not already installed +if (!require("remotes")) { + install.packages("remotes", dependencies = TRUE) + library(remotes) +} + +# Load necessary library and install a specific version if not present +if (!requireNamespace("digest", quietly = TRUE)) { + remotes::install_version("digest", version = "0.6.25", repos = "https://cloud.r-project.org") +} +# Load necessary library +if (!require("digest")) install.packages("digest", dependencies = TRUE) +library(digest) + +# Define variables +# wdir="/shared/projects/2427_data_master/user/agonzalez/m2bsgreprod/src" +wdir="." +results_dir <- file.path(wdir, "results") +url <- "https://d1ypx1ckp5bo16.cloudfront.net/penncath/penncath.tar.gz" +dest_file <- file.path(results_dir, "penncath.tar.gz") +expected_md5 <- "5d5f422aeafdd2d725ad93f447d9af4b" + +# Create results directory if it doesn't exist +if (!dir.exists(results_dir)) { + dir.create(results_dir, recursive = TRUE) +} + +# Check if the file exists +if (!file.exists(dest_file)) { + message("File does not exist. Downloading...") + # Download the file + download.file(url, dest_file, method = "auto") +} else { + message("File already exists.") +} + +# Verify the MD5 checksum +actual_md5 <- digest(dest_file, algo = "md5", file = TRUE) + +if (actual_md5 == expected_md5) { + message("MD5 checksum matches! Proceeding to extract the file...") + + # Uncompress the file + untar(dest_file, exdir = results_dir) + message("File uncompressed successfully!") + +} else { + stop("MD5 checksum does not match!") +} + + + diff --git a/src/extp2.R b/src/extp2.R new file mode 100644 index 0000000000000000000000000000000000000000..ed23afb7b725a459958e2815d06f9353bdbbbabe --- /dev/null +++ b/src/extp2.R @@ -0,0 +1,28 @@ + +wdir ="/amuhome/s23014817/m2bsgreprod/sources" +dir.create(wdir, showWarnings = F, recursive = T) +setwd(wdir) + +library(devtools) + +if (!require("BiocManager", quietly = TRUE)) #boucle qui permet de dire que s'il n'est pas installé il faut l'installer mais s'il est déjà installé pas besoin + install.packages("BiocManager") + +if (!require("snpStats", quietly = TRUE)) + BiocManager::install("snpStats") + +if (!require("SNPRelate", quietly = TRUE)) + BiocManager::install("SNPRelate") + +library(snpStats) +library(SNPRelate) + +# Les données analysées nécessitant beaucoup de RAM, nous allons sélectionner aléatoirement 250000 SNPs et réecrire des fichiers bed, bim, penncath +penncath_bed_path = "/amuhome/s23014817/m2bsgreprod/results/data/penncath.bed" +penncath_bim_path = "/amuhome/s23014817/m2bsgreprod/results/data/penncath.bim" +penncath_fam_path = "/amuhome/s23014817/m2bsgreprod/results/data/penncath.fam" +geno <- read.plink(penncath_bed_path, penncath_bim_path, penncath_fam_path, select.snps=sample(1:861473, 25000, replace = FALSE ), na.strings = ("-9")) + + + + diff --git a/src/tp1.R b/src/tp1.R new file mode 100644 index 0000000000000000000000000000000000000000..44b14f311b1cadf89e0651bbdabe4c9b3cef90dc --- /dev/null +++ b/src/tp1.R @@ -0,0 +1,45 @@ +options(repos = c(CRAN = "https://cloud.r-project.org")) +output_dir = "results/tp1" +dir.create(output_dir, showWarnings = F, recursive = T) + +# if (!require("BiocManager", quietly = TRUE)) +# install.packages("BiocManager") + +#if (!require("snpStats", quietly = TRUE)) +# BiocManager::install("snpStats") + +#if (!require("SNPRelate", quietly = TRUE)) +#BiocManager::install("SNPRelate") + +# Charger les bibliothèques +#library(snpStats) +#library(SNPRelate) +#library(devtools) +#library(plyr) + +# Les données analysées nécessitant beaucoup de RAM, nous allons sélectionner aléatoirement 250000 SNPs et réecrire des fichiers bed, bim, fam +penncath_bed_path = "results/data/penncath.bed" +penncath_bim_path = "results/data/penncath.bim" +penncath_fam_path = "results/data/penncath.fam" + +geno <- snpStats::read.plink(penncath_bed_path, penncath_bim_path, penncath_fam_path, select.snps=sample(1:861473, 25000, replace = FALSE ), na.strings = ("-9")) + +plink_base=file.path(output_dir, "plink_base") +snpStats::write.plink(plink_base, snps=geno$genotypes, pedigree=geno$fam[,1], id=geno$fam[,1], mother=geno$fam[,4], sex=geno$fam[,5], phenotype=geno$fam[,6], chromosome = geno$map[,1], genetic.distance = geno$map[,3], position = geno$map[,4], allele.1 = geno$map[,5], allele.2 = geno$map[,6], na.code = ("-9")) + +genoBim<-geno$map +colnames(genoBim)<-c("chr", "SNP", "gen.dist", "position", "A1", "A2") +#head(genoBim) + +genotype<-geno$genotype +#print(genotype) + +genoFam<-geno$fam +#head(genoFam) + +# On commence par libérer de l'espace +rm(geno) + +rdata_path = file.path(output_dir, "TP1_asbvg.RData") +save.image(rdata_path) + diff --git a/src/tp2.R b/src/tp2.R new file mode 100644 index 0000000000000000000000000000000000000000..1ab01b543ea7efa6d6f0ff9f507543d517bdf0bc --- /dev/null +++ b/src/tp2.R @@ -0,0 +1,20 @@ +#load data +load("results/tp1/TP1_asbvg.RData") + +#fichier de sortie +output_dir = "results/tp2" +dir.create(output_dir, showWarnings = F, recursive = T) + +#on installe et ouvre les packages/librairies nécessaires + +if (!require("BiocManager", quietly=TRUE)) + install.packages("BiocManager") + +if (!require("SNPRelate", quietly=TRUE)) + BiocManager::install("SNPRelate") + +#on enregistre le fichier de sortie + +rdata_path = file.path(output_dir, "TP2_asbvg.RData") +save.image(rdata_path) + diff --git a/src/tp2rmd.R b/src/tp2rmd.R new file mode 100644 index 0000000000000000000000000000000000000000..dbf0a3a0b69ebe9522be5cfc0d0d1d7154cfde3f --- /dev/null +++ b/src/tp2rmd.R @@ -0,0 +1,123 @@ +#on télécharge le fichier de données + +load("results/tp1/TP1_asbvg.RData") + +#on crée un dossier dans lequel se trouveront les fichiers de sortie + +output_dir ="results/tp2" +dir.create(output2_dir, showWarnings = F, recursive = T) + +#on installe les librairies nécessaires + +if (!require("BiocManager", quietly = TRUE)) # + install.packages("BiocManager") + +if (!require("SNPRelate", quietly = TRUE)) + BiocManager::install("SNPRelate") + +#on charge la librairie + +#library(SNPRelate) + +#on ouvre le fichier contenant les données SNPs + +#genofile <- SNPRelate::snpgdsOpen(fichier$gds, readonly = TRUE) + +# De nouveau on effectue une étape d'élagage +# Placer le seuil de LD à 0.2 + +ld.thresh <- 0.2 + +set.seed(1000) +geno.sample.ids <- rownames(genotype) +snpSUB <- SNPRelate::snpgdsLDpruning(genofile, ld.threshold = ld.thresh, + sample.id = geno.sample.ids, # On analyse uniquement les individus filtrés + snp.id = colnames(genotype)) # On analyse uniquement les SNPs filtrés + +snpset.pca <- unlist(snpSUB, use.names=FALSE) +cat(length(snpset.pca),"SNPs will be used in PCA analysis. \n") + +pca <- SNPRelate::snpgdsPCA(genofile, sample.id = geno.sample.ids, snp.id = snpset.pca, num.thread=1) + +# Calcul et sauvegarde des 10 composantes principales en un data frame où chaque colonne est une PC + +pcs <- data.frame(FamID = pca$sample.id, pca$eigenvect[,1 : 10], stringsAsFactors = FALSE) +colnames(pcs)[2:11]<-paste("pc", 1:10, sep = "") + +print(head(pcs)) # Exercice 1 +closefn.gds(genofile) +save(genotype, genoBim, clinical, pcs, file=working.data.fname(5)) + +#Exercice 1 + +# Lecture des données de 1000G pour le chromosome 16 +pedfile.bed <- read.pedfile(plink_base.bed$ped, snps = plink_base.bed$info, which=1) +pedfile.fam <- read.pedfile(plink_base.fam$ped, snps = plink_base.fam$info , which=1) +pedffile.bim <- read.pedfile(plink_base.bim$ped, snps = plink_base.bim$info, which=1) + + +# Obtenir/stocker les données génotypiques pour le chromosome 16 +genoMatrix <- thougeno$genotypes + +# Obtenir/stocker les positions chromosomiques pour chaque SNP +support <- thougeno$map +colnames(support)<-c("SNP", "position", "A1", "A2") +head(support) + +# Imputation des SNPs non génotypés issus des données 1000G +# Générer une "sous"-matrice comportant les SNPs génotypés et filtrés du chromosome 16 +presSnps <- colnames(genotype) +presDatChr <- genoBim[genoBim$SNP %in% presSnps & genoBim$chr==16, ] +targetSnps <- presDatChr$SNP + +# Générer deux "sous"-matrice de SNPs issus de 1000G : "present" (matrice des SNPs 1000G typés dans nos données) et "missing" (matrice des SNPs 1000G non-typés dans nos données). Ces matrices sont nécessaires à l'établissement des règles d'imputation +is.present <- colnames(genoMatrix) %in% targetSnps +missing <- genoMatrix[,!is.present] +present <- genoMatrix[,is.present] +print(missing) +print(present) + +#Exercice 2 + +# Obtenir et stocker les positions des SNPs utilisés dans les règles d'imputation +pos.pres <- support$position[is.present] +pos.miss <- support$position[!is.present] + +# Calcul et sauvegarde des règles d'imputation à l'aide de snp.imputation() +rules <- snp.imputation(present, missing, pos.pres, pos.miss) + +#Exercice 3 + +# Suppression des imputations ayant échouées +rules <- rules[can.impute(rules)] +cat("Imputation rules for", length(rules), "SNPs were estimated\n") + +#Exercice 4 + +# Contrôles-qualité : incertitude et faible MAF estimée +# Fixation des seuils +r2threshold <- 0.7 +minor <- 0.01 + +# Filtre sur l'incertitude et la faible MAF +rules <- rules[imputation.r2(rules) >= r2threshold] +cat(length(rules),"imputation rules remain after imputations with low certainty were removed\n") + +rules <- rules[imputation.maf(rules) >= minor] +cat(length(rules),"imputation rules remain after MAF filtering\n") + +# Obtenir le génotype des SNPs imputés +target <- genotype[,targetSnps] +imputed <- impute.snps(rules, target, as.numeric=FALSE) +print(imputed) + + +#on crée le fichier de sortie + +rdata_path=file.path(output_dir, "TP2_asbvg.RData") +save.image(rdata2_path) + + + + + diff --git a/src/tp3.R b/src/tp3.R new file mode 100644 index 0000000000000000000000000000000000000000..58cf65fd4b20eb4a4277e804a02446cdaf4ec60f --- /dev/null +++ b/src/tp3.R @@ -0,0 +1,25 @@ +#load data +load("results/tp2/TP2_asbvg.RData") + +#fichier de sortie +output_dir = "results/tp3" +dir.create(output_dir, showWarnings = F, recursive = T) + +#library +if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +if (!require("snpStats", quietly = TRUE)) + BiocManager::install("snpStats") + +#if (!require("doParallel", quietly = TRUE)) + #BiocManager::install("doParallel") + +library(snpStats) +library(plyr) +library(GenABEL) +#library(doParallel) + +rdata_path = file.path(output_dir,"TP3_asbvg.RData") +save.image(rdata_path) + diff --git a/src/tp4.R b/src/tp4.R new file mode 100644 index 0000000000000000000000000000000000000000..89340804dbc635b96a5136de52dd128ca6f36946 --- /dev/null +++ b/src/tp4.R @@ -0,0 +1,20 @@ +#load data +load("results/tp3/TP3_asbvg.RData") + +#fichier de sortie +output_dir = "results/tp4" +dir.create(output_dir, showWarnings = F, recursive = T) + +#Library +if(!require("dplyr",quietly = T)) + install.packages("dplyr") + +if(!require("plyr",quietly = T)) + install.packages("plyr") + +if(!require("GenABEL",quietly = T)) + install.packages("GenABEL") + +rdata_path = file.path(output_dir,"TP4_asbvg.RData") +save.image(rdata_path) + diff --git a/workflows/makefile b/workflows/makefile new file mode 100644 index 0000000000000000000000000000000000000000..32f14a625289abf5ef5443caf95bb4aa908a693a --- /dev/null +++ b/workflows/makefile @@ -0,0 +1,15 @@ +all:results/tp4/TP4_asbvg.RData + Rscript src/tp4.R + +results/tp4/TP4_asbvg.RData:results/tp3/TP3_asbvg.RData + Rscript src/tp3.R + +results/tp3/TP3_asbvg.RData:results/tp2/TP2_asbvg.RData + Rscript src/tp2.R + +results/tp2/TP2_asbvg.RData:results/tp1/plink_base.bed results/tp1/plink_base.bim results/tp1/plink_base.fam results/tp1/TP1_asbvg.RData + Rscript src/tp1.R + +results/tp1/plink_base.bed results/tp1/plink_base.bim results/tp1/plink_base.fam results/tp1/TP1_asbvg.RData:results/data/penncath.bed results/data/penncath.bim results/data/penncath.fam + Rscript src/download_data.R + diff --git a/workflows/makefile_ex b/workflows/makefile_ex new file mode 100644 index 0000000000000000000000000000000000000000..5032c3baf51bd547e8513796ba0a33c88a220d2a --- /dev/null +++ b/workflows/makefile_ex @@ -0,0 +1,9 @@ +# Cible principale : exécuter le script tp2.R après avoir obtenu les fichiers nécessaires +.PHONY: run_tp2 +run_tp2: results/data/penncath.bim results/data/penncath.bed results/data/penncath.fam + Rscript sources/tp2.R + +# Générer les fichiers penncath.bim, penncath.bed et penncath.fam en exécutant download_data.R +results/data/penncath.bim results/data/penncath.bed results/data/penncath.fam: + Rscript sources/download_data.R # Ceci aussi doit commencer par une tabulation +