Skip to content
Snippets Groups Projects
Commit dbba0f75 authored by SOLER Fiona's avatar SOLER Fiona
Browse files

scripts R

parent f31e9a68
No related branches found
No related tags found
No related merge requests found
if (!require("SNPRelate", quietly = TRUE))
BiocManager::install("SNPRelate")
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install 'remotes' if it's not already installed
if (!require("remotes")) {
install.packages("remotes", dependencies = TRUE)
library(remotes)
}
# Load necessary library and install a specific version if not present
if (!requireNamespace("digest", quietly = TRUE)) {
remotes::install_version("digest", version = "0.6.25", repos = "https://cloud.r-project.org")
}
# Load necessary library
if (!require("digest")) install.packages("digest", dependencies = TRUE)
library(digest)
# Define variables
# wdir="/shared/projects/2427_data_master/user/agonzalez/m2bsgreprod/src"
wdir="."
results_dir <- file.path(wdir, "results")
url <- "https://d1ypx1ckp5bo16.cloudfront.net/penncath/penncath.tar.gz"
dest_file <- file.path(results_dir, "penncath.tar.gz")
expected_md5 <- "5d5f422aeafdd2d725ad93f447d9af4b"
# Create results directory if it doesn't exist
if (!dir.exists(results_dir)) {
dir.create(results_dir, recursive = TRUE)
}
# Check if the file exists
if (!file.exists(dest_file)) {
message("File does not exist. Downloading...")
# Download the file
download.file(url, dest_file, method = "auto")
} else {
message("File already exists.")
}
# Verify the MD5 checksum
actual_md5 <- digest(dest_file, algo = "md5", file = TRUE)
if (actual_md5 == expected_md5) {
message("MD5 checksum matches! Proceeding to extract the file...")
# Uncompress the file
untar(dest_file, exdir = results_dir)
message("File uncompressed successfully!")
} else {
stop("MD5 checksum does not match!")
}
wdir ="/amuhome/s23014817/m2bsgreprod/sources"
dir.create(wdir, showWarnings = F, recursive = T)
setwd(wdir)
library(devtools)
if (!require("BiocManager", quietly = TRUE)) #boucle qui permet de dire que s'il n'est pas installé il faut l'installer mais s'il est déjà installé pas besoin
install.packages("BiocManager")
if (!require("snpStats", quietly = TRUE))
BiocManager::install("snpStats")
if (!require("SNPRelate", quietly = TRUE))
BiocManager::install("SNPRelate")
library(snpStats)
library(SNPRelate)
# Les données analysées nécessitant beaucoup de RAM, nous allons sélectionner aléatoirement 250000 SNPs et réecrire des fichiers bed, bim, penncath
penncath_bed_path = "/amuhome/s23014817/m2bsgreprod/results/data/penncath.bed"
penncath_bim_path = "/amuhome/s23014817/m2bsgreprod/results/data/penncath.bim"
penncath_fam_path = "/amuhome/s23014817/m2bsgreprod/results/data/penncath.fam"
geno <- read.plink(penncath_bed_path, penncath_bim_path, penncath_fam_path, select.snps=sample(1:861473, 25000, replace = FALSE ), na.strings = ("-9"))
options(repos = c(CRAN = "https://cloud.r-project.org"))
output_dir = "results/tp1"
dir.create(output_dir, showWarnings = F, recursive = T)
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#if (!require("snpStats", quietly = TRUE))
# BiocManager::install("snpStats")
#if (!require("SNPRelate", quietly = TRUE))
#BiocManager::install("SNPRelate")
# Charger les bibliothèques
#library(snpStats)
#library(SNPRelate)
#library(devtools)
#library(plyr)
# Les données analysées nécessitant beaucoup de RAM, nous allons sélectionner aléatoirement 250000 SNPs et réecrire des fichiers bed, bim, fam
penncath_bed_path = "results/data/penncath.bed"
penncath_bim_path = "results/data/penncath.bim"
penncath_fam_path = "results/data/penncath.fam"
geno <- snpStats::read.plink(penncath_bed_path, penncath_bim_path, penncath_fam_path, select.snps=sample(1:861473, 25000, replace = FALSE ), na.strings = ("-9"))
plink_base=file.path(output_dir, "plink_base")
snpStats::write.plink(plink_base, snps=geno$genotypes, pedigree=geno$fam[,1], id=geno$fam[,1], mother=geno$fam[,4], sex=geno$fam[,5], phenotype=geno$fam[,6], chromosome = geno$map[,1], genetic.distance = geno$map[,3], position = geno$map[,4], allele.1 = geno$map[,5], allele.2 = geno$map[,6], na.code = ("-9"))
genoBim<-geno$map
colnames(genoBim)<-c("chr", "SNP", "gen.dist", "position", "A1", "A2")
#head(genoBim)
genotype<-geno$genotype
#print(genotype)
genoFam<-geno$fam
#head(genoFam)
# On commence par libérer de l'espace
rm(geno)
rdata_path = file.path(output_dir, "TP1_asbvg.RData")
save.image(rdata_path)
#load data
load("results/tp1/TP1_asbvg.RData")
#fichier de sortie
output_dir = "results/tp2"
dir.create(output_dir, showWarnings = F, recursive = T)
#on installe et ouvre les packages/librairies nécessaires
if (!require("BiocManager", quietly=TRUE))
install.packages("BiocManager")
if (!require("SNPRelate", quietly=TRUE))
BiocManager::install("SNPRelate")
#on enregistre le fichier de sortie
rdata_path = file.path(output_dir, "TP2_asbvg.RData")
save.image(rdata_path)
#on télécharge le fichier de données
load("results/tp1/TP1_asbvg.RData")
#on crée un dossier dans lequel se trouveront les fichiers de sortie
output_dir ="results/tp2"
dir.create(output2_dir, showWarnings = F, recursive = T)
#on installe les librairies nécessaires
if (!require("BiocManager", quietly = TRUE)) #
install.packages("BiocManager")
if (!require("SNPRelate", quietly = TRUE))
BiocManager::install("SNPRelate")
#on charge la librairie
#library(SNPRelate)
#on ouvre le fichier contenant les données SNPs
#genofile <- SNPRelate::snpgdsOpen(fichier$gds, readonly = TRUE)
# De nouveau on effectue une étape d'élagage
# Placer le seuil de LD à 0.2
ld.thresh <- 0.2
set.seed(1000)
geno.sample.ids <- rownames(genotype)
snpSUB <- SNPRelate::snpgdsLDpruning(genofile, ld.threshold = ld.thresh,
sample.id = geno.sample.ids, # On analyse uniquement les individus filtrés
snp.id = colnames(genotype)) # On analyse uniquement les SNPs filtrés
snpset.pca <- unlist(snpSUB, use.names=FALSE)
cat(length(snpset.pca),"SNPs will be used in PCA analysis. \n")
pca <- SNPRelate::snpgdsPCA(genofile, sample.id = geno.sample.ids, snp.id = snpset.pca, num.thread=1)
# Calcul et sauvegarde des 10 composantes principales en un data frame où chaque colonne est une PC
pcs <- data.frame(FamID = pca$sample.id, pca$eigenvect[,1 : 10], stringsAsFactors = FALSE)
colnames(pcs)[2:11]<-paste("pc", 1:10, sep = "")
print(head(pcs)) # Exercice 1
closefn.gds(genofile)
save(genotype, genoBim, clinical, pcs, file=working.data.fname(5))
#Exercice 1
# Lecture des données de 1000G pour le chromosome 16
pedfile.bed <- read.pedfile(plink_base.bed$ped, snps = plink_base.bed$info, which=1)
pedfile.fam <- read.pedfile(plink_base.fam$ped, snps = plink_base.fam$info , which=1)
pedffile.bim <- read.pedfile(plink_base.bim$ped, snps = plink_base.bim$info, which=1)
# Obtenir/stocker les données génotypiques pour le chromosome 16
genoMatrix <- thougeno$genotypes
# Obtenir/stocker les positions chromosomiques pour chaque SNP
support <- thougeno$map
colnames(support)<-c("SNP", "position", "A1", "A2")
head(support)
# Imputation des SNPs non génotypés issus des données 1000G
# Générer une "sous"-matrice comportant les SNPs génotypés et filtrés du chromosome 16
presSnps <- colnames(genotype)
presDatChr <- genoBim[genoBim$SNP %in% presSnps & genoBim$chr==16, ]
targetSnps <- presDatChr$SNP
# Générer deux "sous"-matrice de SNPs issus de 1000G : "present" (matrice des SNPs 1000G typés dans nos données) et "missing" (matrice des SNPs 1000G non-typés dans nos données). Ces matrices sont nécessaires à l'établissement des règles d'imputation
is.present <- colnames(genoMatrix) %in% targetSnps
missing <- genoMatrix[,!is.present]
present <- genoMatrix[,is.present]
print(missing)
print(present)
#Exercice 2
# Obtenir et stocker les positions des SNPs utilisés dans les règles d'imputation
pos.pres <- support$position[is.present]
pos.miss <- support$position[!is.present]
# Calcul et sauvegarde des règles d'imputation à l'aide de snp.imputation()
rules <- snp.imputation(present, missing, pos.pres, pos.miss)
#Exercice 3
# Suppression des imputations ayant échouées
rules <- rules[can.impute(rules)]
cat("Imputation rules for", length(rules), "SNPs were estimated\n")
#Exercice 4
# Contrôles-qualité : incertitude et faible MAF estimée
# Fixation des seuils
r2threshold <- 0.7
minor <- 0.01
# Filtre sur l'incertitude et la faible MAF
rules <- rules[imputation.r2(rules) >= r2threshold]
cat(length(rules),"imputation rules remain after imputations with low certainty were removed\n")
rules <- rules[imputation.maf(rules) >= minor]
cat(length(rules),"imputation rules remain after MAF filtering\n")
# Obtenir le génotype des SNPs imputés
target <- genotype[,targetSnps]
imputed <- impute.snps(rules, target, as.numeric=FALSE)
print(imputed)
#on crée le fichier de sortie
rdata_path=file.path(output_dir, "TP2_asbvg.RData")
save.image(rdata2_path)
#load data
load("results/tp2/TP2_asbvg.RData")
#fichier de sortie
output_dir = "results/tp3"
dir.create(output_dir, showWarnings = F, recursive = T)
#library
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
if (!require("snpStats", quietly = TRUE))
BiocManager::install("snpStats")
#if (!require("doParallel", quietly = TRUE))
#BiocManager::install("doParallel")
library(snpStats)
library(plyr)
library(GenABEL)
#library(doParallel)
rdata_path = file.path(output_dir,"TP3_asbvg.RData")
save.image(rdata_path)
#load data
load("results/tp3/TP3_asbvg.RData")
#fichier de sortie
output_dir = "results/tp4"
dir.create(output_dir, showWarnings = F, recursive = T)
#Library
if(!require("dplyr",quietly = T))
install.packages("dplyr")
if(!require("plyr",quietly = T))
install.packages("plyr")
if(!require("GenABEL",quietly = T))
install.packages("GenABEL")
rdata_path = file.path(output_dir,"TP4_asbvg.RData")
save.image(rdata_path)
all:results/tp4/TP4_asbvg.RData
Rscript src/tp4.R
results/tp4/TP4_asbvg.RData:results/tp3/TP3_asbvg.RData
Rscript src/tp3.R
results/tp3/TP3_asbvg.RData:results/tp2/TP2_asbvg.RData
Rscript src/tp2.R
results/tp2/TP2_asbvg.RData:results/tp1/plink_base.bed results/tp1/plink_base.bim results/tp1/plink_base.fam results/tp1/TP1_asbvg.RData
Rscript src/tp1.R
results/tp1/plink_base.bed results/tp1/plink_base.bim results/tp1/plink_base.fam results/tp1/TP1_asbvg.RData:results/data/penncath.bed results/data/penncath.bim results/data/penncath.fam
Rscript src/download_data.R
# Cible principale : exécuter le script tp2.R après avoir obtenu les fichiers nécessaires
.PHONY: run_tp2
run_tp2: results/data/penncath.bim results/data/penncath.bed results/data/penncath.fam
Rscript sources/tp2.R
# Générer les fichiers penncath.bim, penncath.bed et penncath.fam en exécutant download_data.R
results/data/penncath.bim results/data/penncath.bed results/data/penncath.fam:
Rscript sources/download_data.R # Ceci aussi doit commencer par une tabulation
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment