From 8482dc2aaad8c3b56753995fd6dd8ba14ff58903 Mon Sep 17 00:00:00 2001
From: THIBERT-RIPOSO Anais <t20006223@V-PP-47-018.salsa.univ-amu.fr>
Date: Mon, 21 Oct 2024 17:03:31 +0200
Subject: [PATCH] scripts tp1 r

---
 src/download_data.R | 74 +++++++++++++++++++++++++++++++++++++++++++++
 src/tp2.R           | 50 ++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 src/download_data.R
 create mode 100644 src/tp2.R

diff --git a/src/download_data.R b/src/download_data.R
new file mode 100644
--- /dev/null
+++ b/src/download_data.R
@@ -0,0 +1,74 @@
+options(repos = c(CRAN = "https://cloud.r-project.org"))
+# The default 60 s timeout is often too short for large downloads.
+options(timeout = max(600, getOption("timeout")))
+
+# 'remotes' is only needed to install a pinned version of 'digest'.
+if (!requireNamespace("remotes", quietly = TRUE)) {
+  install.packages("remotes", dependencies = TRUE)
+}
+
+# Install the pinned version of 'digest' only when the package is missing.
+# There is deliberately no fallback to install.packages(): it would replace
+# the pinned version with the latest release. library() stops the script if
+# the package still cannot be loaded.
+if (!requireNamespace("digest", quietly = TRUE)) {
+  remotes::install_version(
+    "digest",
+    version = "0.6.25",
+    repos = "https://cloud.r-project.org"
+  )
+}
+library(digest)
+
+# Define variables
+# wdir <- "/shared/projects/2427_data_master/user/agonzalez/m2bsgreprod/src"
+wdir <- "."
+results_dir <- file.path(wdir, "results")
+url <- "https://d1ypx1ckp5bo16.cloudfront.net/penncath/penncath.tar.gz"
+dest_file <- file.path(results_dir, "penncath.tar.gz")
+expected_md5 <- "5d5f422aeafdd2d725ad93f447d9af4b"
+
+# Create results directory if it doesn't exist
+if (!dir.exists(results_dir)) {
+  dir.create(results_dir, recursive = TRUE)
+}
+
+# Download the archive unless a copy is already present
+if (!file.exists(dest_file)) {
+  message("File does not exist. Downloading...")
+  # mode = "wb" because the archive is binary. A failed download can leave a
+  # partial file behind, so remove it before stopping: otherwise the next run
+  # would skip the download and fail on the checksum.
+  status <- tryCatch(
+    download.file(url, dest_file, method = "auto", mode = "wb"),
+    error = function(e) {
+      unlink(dest_file)
+      stop("Download failed: ", conditionMessage(e), call. = FALSE)
+    }
+  )
+  if (status != 0) {
+    unlink(dest_file)
+    stop("Download failed with status ", status, call. = FALSE)
+  }
+} else {
+  message("File already exists.")
+}
+
+# Verify the MD5 checksum
+actual_md5 <- digest(dest_file, algo = "md5", file = TRUE)
+
+if (!identical(actual_md5, expected_md5)) {
+  stop(
+    "MD5 checksum does not match! Expected ", expected_md5,
+    " but got ", actual_md5, " for ", dest_file, call. = FALSE
+  )
+}
+
+message("MD5 checksum matches! Proceeding to extract the file...")
+
+# Uncompress the file; untar() returns a non-zero code on failure
+untar_status <- untar(dest_file, exdir = results_dir)
+if (untar_status != 0) {
+  stop("Extraction failed with status ", untar_status, call. = FALSE)
+}
+message("File uncompressed successfully!")
diff --git a/src/tp2.R b/src/tp2.R
new file mode 100644
--- /dev/null
+++ b/src/tp2.R
@@ -0,0 +1,50 @@
+# Project root; every path below is derived from it.
+project_dir <- "/amuhome/t20006223/m2bsgreprod"
+wdir <- file.path(project_dir, "src")
+# recursive: also create missing parents; showWarnings = FALSE: no warning
+# when the directory already exists.
+dir.create(wdir, recursive = TRUE, showWarnings = FALSE)
+setwd(wdir)
+
+library(devtools)
+
+# requireNamespace() checks availability without attaching the package; the
+# packages that are actually used are attached with library() below, which
+# stops the script if an installation failed.
+if (!requireNamespace("BiocManager", quietly = TRUE)) {
+  # Explicit mirror: a non-interactive session may have no CRAN mirror set.
+  install.packages("BiocManager", repos = "https://cloud.r-project.org")
+}
+
+if (!requireNamespace("snpStats", quietly = TRUE)) {
+  BiocManager::install("snpStats")
+}
+
+if (!requireNamespace("SNPRelate", quietly = TRUE)) {
+  BiocManager::install("SNPRelate")
+}
+
+library(snpStats)
+library(SNPRelate)
+
+# The analysed data need a lot of RAM, so a random subset of the SNPs is
+# read; bed, bim and fam files are to be rewritten from it.
+# NOTE(review): the original comment announced 250000 SNPs but the code
+# samples 25000 -- confirm which one is intended.
+data_dir <- file.path(project_dir, "results", "data")
+penncath_bed_path <- file.path(data_dir, "penncath.bed")
+penncath_bim_path <- file.path(data_dir, "penncath.bim")
+penncath_fam_path <- file.path(data_dir, "penncath.fam")
+
+n_snps_total <- 861473L
+n_snps_kept <- 25000L
+
+# Fixed seed so that the random SNP subset is the same on every run.
+set.seed(42)
+geno <- snpStats::read.plink(
+  penncath_bed_path,
+  penncath_bim_path,
+  penncath_fam_path,
+  select.snps = sample(seq_len(n_snps_total), n_snps_kept, replace = FALSE),
+  na.strings = "-9"
+)
-- 
GitLab