diff --git a/src/TP1.v2.R b/src/TP1.v2.R index 5bb53cd9367251bb9fae37a49d20b538dc442fa2..0db5778f338ab9aedbca089f7b894ee1a86837c0 100644 --- a/src/TP1.v2.R +++ b/src/TP1.v2.R @@ -8,20 +8,26 @@ output_dir <- file.path(m2bsgreprod, "results", "TP1") # Créer les répertoires si nécessaire dir.create(output_dir, showWarnings = FALSE, recursive = TRUE) +# Les données analysées nécessitant beaucoup de RAM, nous allons sélectionner aléatoirement 250000 SNPs et réecrire des fichiers bed, bim, fam +penncath_bed_path = file.path(data.dir, "penncath.bed") +penncath_bim_path = file.path(data.dir, "penncath.bim") +penncath_fam_path = file.path(data.dir, "penncath.fam") + +clinical_csv_path = "data/GWAStutorial_clinical.csv" + # Fichiers d'entrées gwas.fn<-lapply(c(bed='bed', fam='fam', bim ='bim', gds='gds'), function(n) sprintf("%s/GWAStutorial.%s", data.dir, n)) + gwas.fn.2<-lapply(c(bed='bed', fam='fam', bim ='bim', gds='gds'), function(n) sprintf("%s/data.%s", data.dir, n)) -clinical.fn<-sprintf("%s/GWAStutorial_clinical.csv", data.dir) + +# clinical.fn<-sprintf("%s/GWAStutorial_clinical.csv", data.dir) onethou.fn<-lapply(c(info='info', ped='ped'), function(n) sprintf("%s/chr16_1000g_CEU.%s", data.dir, n)) protein.coding.coords.fname<-sprintf("%s/ProCodgene_coords.csv", data.dir) # Sauvegarde des "modules" working.data.fname <- function(num) { sprintf("%s/working_%s.Rdata", output_dir, num) } -# Les données analysées nécessitant beaucoup de RAM, nous allons sélectionner aléatoirement 250000 SNPs et réecrire des fichiers bed, bim, fam -penncath_bed_path = file.path(data.dir, "penncath.bed") -penncath_bim_path = file.path(data.dir, "penncath.bim") -penncath_fam_path = file.path(data.dir, "penncath.fam") + geno <- snpStats::read.plink(penncath_bed_path, penncath_bim_path, penncath_fam_path, select.snps=sample(1:861473, 25000, replace = FALSE ), na.strings = ("-9")) @@ -39,7 +45,7 @@ genoFam<-geno$fam rm(geno) # On charge le fichier clinique -clinical<- read.csv(clinical.fn, colClasses = c("character", "factor", "factor", rep("numeric", 4))) +clinical<- read.csv(clinical_csv_path, colClasses = c("character", "factor", "factor", rep("numeric", 4))) rownames(clinical)<-clinical$FamID print(head(clinical)) diff --git a/src/TP1.v3.R b/src/TP1.v3.R index 15b1f1a9b55d579c72a69ff487853acd0a9612a9..abefe186331485710056cc4677639ded84e0841f 100644 --- a/src/TP1.v3.R +++ b/src/TP1.v3.R @@ -9,7 +9,6 @@ penncath_fam_path = "results/data/penncath.fam" geno <- snpStats::read.plink(penncath_bed_path, penncath_bim_path, penncath_fam_path, select.snps=sample(1:861473, 25000, replace = FALSE ), na.strings = ("-9")) - plink_base=file.path(output_dir, "plink_base") snpStats::write.plink(plink_base, snps=geno$genotypes, pedigree=geno$fam[,1], id=geno$fam[,1], mother=geno$fam[,4], sex=geno$fam[,5], phenotype=geno$fam[,6], chromosome = geno$map[,1], genetic.distance = geno$map[,3], position = geno$map[,4], allele.1 = geno$map[,5], allele.2 = geno$map[,6], na.code = ("-9")) @@ -26,6 +25,15 @@ genoFam<-geno$fam # On commence par libérer de l'espace rm(geno) +# On charge le fichier clinique +clinical_csv_path = "data/GWAStutorial_clinical.csv" +clinical<- read.csv(clinical_csv_path, colClasses = c("character", "factor", "factor", rep("numeric", 4))) +rownames(clinical)<-clinical$FamID +#print(head(clinical)) + +protein.coding.coords.fname<-"data/ProCodgene_coords.csv" + + rdata_path = file.path(output_dir, "TP1_asbvg.RData") save.image(rdata_path)