From 99ad914389060997854a07f439481fb305a48fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Flutre?= Date: Thu, 9 Nov 2017 11:07:58 +0100 Subject: [PATCH] add pruneSnpsLd() + minor fix in writeSegregJoinMap() --- DESCRIPTION | 3 +- NAMESPACE | 1 + R/quantgen.R | 87 +++++++++++++++++++++++++++++++++++++-- man/pruneSnpsLd.Rd | 50 ++++++++++++++++++++++ man/thinSnps.Rd | 3 ++ man/writeSegregJoinMap.Rd | 8 ++-- 6 files changed, 143 insertions(+), 9 deletions(-) create mode 100644 man/pruneSnpsLd.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 3319052..59d4a56 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rutilstimflutre Title: Timothee Flutre's personal R code -Version: 0.150.0 +Version: 0.151.0 Authors@R: c( person("Timothee", "Flutre", email="timothee.flutre@inra.fr", role=c("aut", "ctb", "cre")), person("Peter", "Carbonetto", role="aut", comment="function from varbvs example"), @@ -59,6 +59,7 @@ Suggests: rstan, S4Vectors, scrm, + SNPRelate, sp, sparseMVN, stats, diff --git a/NAMESPACE b/NAMESPACE index 8f5a800..9cfc928 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -146,6 +146,7 @@ export(plotVcfPercNa) export(plotWithScale) export(precMatAR1) export(prettyPrintSummary) +export(pruneSnpsLd) export(pve2beta) export(qqplotPval) export(qtlrelPerChr) diff --git a/R/quantgen.R b/R/quantgen.R index 055b441..640d5ce 100644 --- a/R/quantgen.R +++ b/R/quantgen.R @@ -1283,11 +1283,11 @@ readSegregJoinMap <- function(file, na.string="--", verbose=1){ ##' Write genotype data in the \href{https://www.kyazma.nl/index.php/JoinMap/}{JoinMap} format into a "loc" file used by this software. ##' @param pop.name name of the population ##' @param pop.type type of the population -##' @param locus vector of locus names (should be shorter or equal to 20 characters, otherwise a warning will be issued) -##' @param segregs vector of segregation types +##' @param locus vector of locus names; should be shorter or equal to 20 characters, otherwise a warning will be issued; should be in the same order as the rows of the "genos" argument +##' @param segregs vector of segregation types; should be in the same order as the rows of the "genos" argument ##' @param genos data frame containing genotypes encoded in the JoinMap format, similar to the output from \code{\link{genoClasses2JoinMap}} -##' @param phases vector of phase types (optional) -##' @param classifs vector of classification types (optional) +##' @param phases vector of phase types (optional; should be in the same order as the rows of the "genos" argument) +##' @param classifs vector of classification types (optional; should be in the same order as the rows of the "genos" argument) ##' @param file name of the file in which the data will be saved (will be compressed if ends with ".gz" and "gzip" is available in the PATH) ##' @param save.ind.names if TRUE, individual names will be saved at the end of the file ##' @param na.string character to replace NA's in "genos" @@ -1372,6 +1372,7 @@ writeSegregJoinMap <- function(pop.name, pop.type="CP", tmp <- cbind(tmp, phases) if(! is.null(classifs)) tmp <- cbind(tmp, classifs) + rownames(tmp) <- NULL genos[is.na(genos)] <- na.string tmp <- cbind(tmp, genos) suppressWarnings(utils::write.table(x=tmp, file=con, append=TRUE, @@ -4924,6 +4925,7 @@ distConsecutiveSnps <- function(snp.coords, only.chr=NULL, nb.cores=1){ ##' @param only.chr identifier of a given chromosome ##' @return vector of SNP identifiers ##' @author Timothee Flutre +##' @seealso \code{\link{pruneSnpsLd}} ##' @export thinSnps <- function(method, threshold, snp.coords, only.chr=NULL){ requireNamespaces("GenomicRanges") @@ -4961,6 +4963,83 @@ thinSnps <- function(method, threshold, snp.coords, only.chr=NULL){ return(out.snp.ids) } +##' Prune SNPs based on LD +##' +##' Prune SNPs based on their pairwise linkage disequilibriums via sliding windows using the "snpgdsLDpruning" function from the "SNPRelate" package.. +##' @param X matrix of bi-allelic SNP genotypes encoded, for each SNP, in number of copies of its second allele, i.e. as allele doses in {0,1,2}, with genotypes in rows and SNPs in columns; the "second" allele is arbitrary, it can correspond to the minor (least frequent) or the major (most frequent) allele; will be transformed into a "gds" object (see next argument) +##' @param snp.coords data frame which row names are SNP identifiers, the first column should contain chromosomes as integers, and the second column should contain positions; compulsory if the X argument is specified +##' @param gds object of class "SNPGDSFileClass" from the SNPRelate package +##' @param ld.threshold the LD threshold below which SNPs are discarded; will be passed to "snpgdsLDpruning" +##' @param remove.monosnp will be passed to "snpgdsLDpruning" +##' @param maf will be passed to "snpgdsLDpruning" +##' @param missing.rate will be passed to "snpgdsLDpruning" +##' @param method will be passed to "snpgdsLDpruning" +##' @param slide.max.bp will be passed to "snpgdsLDpruning" +##' @param slide.max.n will be passed to "snpgdsLDpruning" +##' @param seed seed for the pseudo-random number generator +##' @param nb.cores will be passed to "snpgdsLDpruning" +##' @param verbose verbosity level (0/1) +##' @return vector of SNP identifiers +##' @author Timothee Flutre +##' @seealso \code{\link{thinSnps}}, \code{\link[SNPRelate]{snpgdsLDpruning}} +##' @export +pruneSnpsLd <- function(X=NULL, snp.coords=NULL, gds=NULL, + ld.threshold=0.2, + remove.monosnp=TRUE, maf=NaN, missing.rate=NaN, + method=c("composite", "r", "dprime", "corr"), + slide.max.bp=500000, slide.max.n=NA, + seed=NULL, nb.cores=1, verbose=1){ + requireNamespace("SNPRelate") + stopifnot(xor(is.null(X), is.null(gds))) + if(is.null(gds)){ + stopIfNotValidGenosDose(X=X, check.noNA=FALSE, check.notImputed=TRUE) + stopifnot(! is.null(snp.coords), + is.data.frame(snp.coords), + ! is.null(rownames(snp.coords)), + all(colnames(X) %in% rownames(snp.coords)), + ncol(snp.coords) >= 2) + } + + gds.file <- NULL + if(is.null(gds)){ + gds.file <- tempfile() + snp.coords <- snp.coords[colnames(X),] # re-order the rows + SNPRelate::snpgdsCreateGeno(gds.fn=gds.file, + genmat=X, + sample.id=rownames(X), + snp.id=colnames(X), + snp.chromosome=snp.coords[,1], + snp.position=snp.coords[,2], + snpfirstdim=FALSE) + gds <- SNPRelate::snpgdsOpen(gds.file) + } + + if(verbose > 1) + SNPRelate::snpgdsSummary(gds) + + if(! is.null(seed)) + set.seed(seed) + out.snp.ids <- + SNPRelate::snpgdsLDpruning(gdsobj=gds, + remove.monosnp=remove.monosnp, + maf=maf, + missing.rate=missing.rate, + method=method, + slide.max.bp=slide.max.bp, + slide.max.n=slide.max.n, + ld.threshold=ld.threshold, + num.thread=nb.cores, + verbose=ifelse(verbose > 0, TRUE, FALSE)) + out.snp.ids <- unlist(out.snp.ids) + + if(! is.null(gds.file)){ + SNPRelate::snpgdsClose(gds) + file.remove(gds.file) + } + + return(out.snp.ids) +} + ##' Genotype imputation ##' ##' Impute missing SNP genotypes with the mean of the non-missing. diff --git a/man/pruneSnpsLd.Rd b/man/pruneSnpsLd.Rd new file mode 100644 index 0000000..ce32b9d --- /dev/null +++ b/man/pruneSnpsLd.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/quantgen.R +\name{pruneSnpsLd} +\alias{pruneSnpsLd} +\title{Prune SNPs based on LD} +\usage{ +pruneSnpsLd(X = NULL, snp.coords = NULL, gds = NULL, ld.threshold = 0.2, + remove.monosnp = TRUE, maf = NaN, missing.rate = NaN, + method = c("composite", "r", "dprime", "corr"), slide.max.bp = 5e+05, + slide.max.n = NA, seed = NULL, nb.cores = 1, verbose = 1) +} +\arguments{ +\item{X}{matrix of bi-allelic SNP genotypes encoded, for each SNP, in number of copies of its second allele, i.e. as allele doses in {0,1,2}, with genotypes in rows and SNPs in columns; the "second" allele is arbitrary, it can correspond to the minor (least frequent) or the major (most frequent) allele; will be transformed into a "gds" object (see next argument)} + +\item{snp.coords}{data frame which row names are SNP identifiers, the first column should contain chromosomes as integers, and the second column should contain positions; compulsory if the X argument is specified} + +\item{gds}{object of class "SNPGDSFileClass" from the SNPRelate package} + +\item{ld.threshold}{the LD threshold below which SNPs are discarded; will be passed to "snpgdsLDpruning"} + +\item{remove.monosnp}{will be passed to "snpgdsLDpruning"} + +\item{maf}{will be passed to "snpgdsLDpruning"} + +\item{missing.rate}{will be passed to "snpgdsLDpruning"} + +\item{method}{will be passed to "snpgdsLDpruning"} + +\item{slide.max.bp}{will be passed to "snpgdsLDpruning"} + +\item{slide.max.n}{will be passed to "snpgdsLDpruning"} + +\item{seed}{seed for the pseudo-random number generator} + +\item{nb.cores}{will be passed to "snpgdsLDpruning"} + +\item{verbose}{verbosity level (0/1)} +} +\value{ +vector of SNP identifiers +} +\description{ +Prune SNPs based on their pairwise linkage disequilibriums via sliding windows using the "snpgdsLDpruning" function from the "SNPRelate" package.. +} +\seealso{ +\code{\link{thinSnps}}, \code{\link[SNPRelate]{snpgdsLDpruning}} +} +\author{ +Timothee Flutre +} diff --git a/man/thinSnps.Rd b/man/thinSnps.Rd index eac46be..a9e9c7b 100644 --- a/man/thinSnps.Rd +++ b/man/thinSnps.Rd @@ -21,6 +21,9 @@ vector of SNP identifiers \description{ Thin SNPs according to various methods: based on their index or based on their genomic coordinate. } +\seealso{ +\code{\link{pruneSnpsLd}} +} \author{ Timothee Flutre } diff --git a/man/writeSegregJoinMap.Rd b/man/writeSegregJoinMap.Rd index ebabb15..701c01a 100644 --- a/man/writeSegregJoinMap.Rd +++ b/man/writeSegregJoinMap.Rd @@ -13,15 +13,15 @@ writeSegregJoinMap(pop.name, pop.type = "CP", locus, segregs, genos, \item{pop.type}{type of the population} -\item{locus}{vector of locus names (should be shorter or equal to 20 characters, otherwise a warning will be issued)} +\item{locus}{vector of locus names; should be shorter or equal to 20 characters, otherwise a warning will be issued; should be in the same order as the rows of the "genos" argument} -\item{segregs}{vector of segregation types} +\item{segregs}{vector of segregation types; should be in the same order as the rows of the "genos" argument} \item{genos}{data frame containing genotypes encoded in the JoinMap format, similar to the output from \code{\link{genoClasses2JoinMap}}} -\item{phases}{vector of phase types (optional)} +\item{phases}{vector of phase types (optional; should be in the same order as the rows of the "genos" argument)} -\item{classifs}{vector of classification types (optional)} +\item{classifs}{vector of classification types (optional; should be in the same order as the rows of the "genos" argument)} \item{file}{name of the file in which the data will be saved (will be compressed if ends with ".gz" and "gzip" is available in the PATH)}