From 99ad914389060997854a07f439481fb305a48fe3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timoth=C3=A9e=20Flutre?= <timothee.flutre@inra.fr>
Date: Thu, 9 Nov 2017 11:07:58 +0100
Subject: [PATCH] add pruneSnpsLd() + minor fix in writeSegregJoinMap()

---
 DESCRIPTION               |  3 +-
 NAMESPACE                 |  1 +
 R/quantgen.R              | 87 +++++++++++++++++++++++++++++++++++++--
 man/pruneSnpsLd.Rd        | 50 ++++++++++++++++++++++
 man/thinSnps.Rd           |  3 ++
 man/writeSegregJoinMap.Rd |  8 ++--
 6 files changed, 143 insertions(+), 9 deletions(-)
 create mode 100644 man/pruneSnpsLd.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 3319052..59d4a56 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rutilstimflutre
 Title: Timothee Flutre's personal R code
-Version: 0.150.0
+Version: 0.151.0
 Authors@R: c(
     person("Timothee", "Flutre", email="timothee.flutre@inra.fr", role=c("aut", "ctb", "cre")),
     person("Peter", "Carbonetto", role="aut", comment="function from varbvs example"),
@@ -59,6 +59,7 @@ Suggests:
     rstan,
     S4Vectors,
     scrm,
+    SNPRelate,
     sp,
     sparseMVN,
     stats,
diff --git a/NAMESPACE b/NAMESPACE
index 8f5a800..9cfc928 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -146,6 +146,7 @@ export(plotVcfPercNa)
 export(plotWithScale)
 export(precMatAR1)
 export(prettyPrintSummary)
+export(pruneSnpsLd)
 export(pve2beta)
 export(qqplotPval)
 export(qtlrelPerChr)
diff --git a/R/quantgen.R b/R/quantgen.R
index 055b441..640d5ce 100644
--- a/R/quantgen.R
+++ b/R/quantgen.R
@@ -1283,11 +1283,11 @@ readSegregJoinMap <- function(file, na.string="--", verbose=1){
 ##' Write genotype data in the \href{https://www.kyazma.nl/index.php/JoinMap/}{JoinMap} format into a "loc" file used by this software.
 ##' @param pop.name name of the population
 ##' @param pop.type type of the population
-##' @param locus vector of locus names (should be shorter or equal to 20 characters, otherwise a warning will be issued)
-##' @param segregs vector of segregation types
+##' @param locus vector of locus names; should be shorter or equal to 20 characters, otherwise a warning will be issued; should be in the same order as the rows of the "genos" argument
+##' @param segregs vector of segregation types; should be in the same order as the rows of the "genos" argument
 ##' @param genos data frame containing genotypes encoded in the JoinMap format,  similar to the output from \code{\link{genoClasses2JoinMap}}
-##' @param phases vector of phase types (optional)
-##' @param classifs vector of classification types (optional)
+##' @param phases vector of phase types (optional; should be in the same order as the rows of the "genos" argument)
+##' @param classifs vector of classification types (optional; should be in the same order as the rows of the "genos" argument)
 ##' @param file name of the file in which the data will be saved (will be compressed if ends with ".gz" and "gzip" is available in the PATH)
 ##' @param save.ind.names if TRUE, individual names will be saved at the end of the file
 ##' @param na.string character to replace NA's in "genos"
@@ -1372,6 +1372,7 @@ writeSegregJoinMap <- function(pop.name, pop.type="CP",
     tmp <- cbind(tmp, phases)
   if(! is.null(classifs))
     tmp <- cbind(tmp, classifs)
+  rownames(tmp) <- NULL
   genos[is.na(genos)] <- na.string
   tmp <- cbind(tmp, genos)
   suppressWarnings(utils::write.table(x=tmp, file=con, append=TRUE,
@@ -4924,6 +4925,7 @@ distConsecutiveSnps <- function(snp.coords, only.chr=NULL, nb.cores=1){
 ##' @param only.chr identifier of a given chromosome
 ##' @return vector of SNP identifiers
 ##' @author Timothee Flutre
+##' @seealso \code{\link{pruneSnpsLd}}
 ##' @export
 thinSnps <- function(method, threshold, snp.coords, only.chr=NULL){
   requireNamespaces("GenomicRanges")
@@ -4961,6 +4963,83 @@ thinSnps <- function(method, threshold, snp.coords, only.chr=NULL){
   return(out.snp.ids)
 }
 
+##' Prune SNPs based on LD
+##'
+##' Prune SNPs based on their pairwise linkage disequilibriums via sliding windows using the "snpgdsLDpruning" function from the "SNPRelate" package..
+##' @param X matrix of bi-allelic SNP genotypes encoded, for each SNP, in number of copies of its second allele, i.e. as allele doses in {0,1,2}, with genotypes in rows and SNPs in columns; the "second" allele is arbitrary, it can correspond to the minor (least frequent) or the major (most frequent) allele; will be transformed into a "gds" object (see next argument)
+##' @param snp.coords data frame which row names are SNP identifiers, the first column should contain chromosomes as integers, and the second column should contain positions; compulsory if the X argument is specified
+##' @param gds object of class "SNPGDSFileClass" from the SNPRelate package
+##' @param ld.threshold the LD threshold below which SNPs are discarded; will be passed to "snpgdsLDpruning"
+##' @param remove.monosnp will be passed to "snpgdsLDpruning"
+##' @param maf will be passed to "snpgdsLDpruning"
+##' @param missing.rate will be passed to "snpgdsLDpruning"
+##' @param method will be passed to "snpgdsLDpruning"
+##' @param slide.max.bp will be passed to "snpgdsLDpruning"
+##' @param slide.max.n will be passed to "snpgdsLDpruning"
+##' @param seed seed for the pseudo-random number generator
+##' @param nb.cores will be passed to "snpgdsLDpruning"
+##' @param verbose verbosity level (0/1)
+##' @return vector of SNP identifiers
+##' @author Timothee Flutre
+##' @seealso \code{\link{thinSnps}}, \code{\link[SNPRelate]{snpgdsLDpruning}}
+##' @export
+pruneSnpsLd <- function(X=NULL, snp.coords=NULL, gds=NULL,
+                        ld.threshold=0.2,
+                        remove.monosnp=TRUE, maf=NaN, missing.rate=NaN,
+                        method=c("composite", "r", "dprime", "corr"),
+                        slide.max.bp=500000, slide.max.n=NA,
+                        seed=NULL, nb.cores=1, verbose=1){
+  requireNamespace("SNPRelate")
+  stopifnot(xor(is.null(X), is.null(gds)))
+  if(is.null(gds)){
+    stopIfNotValidGenosDose(X=X, check.noNA=FALSE, check.notImputed=TRUE)
+    stopifnot(! is.null(snp.coords),
+              is.data.frame(snp.coords),
+              ! is.null(rownames(snp.coords)),
+              all(colnames(X) %in% rownames(snp.coords)),
+              ncol(snp.coords) >= 2)
+  }
+
+  gds.file <- NULL
+  if(is.null(gds)){
+    gds.file <- tempfile()
+    snp.coords <- snp.coords[colnames(X),] # re-order the rows
+    SNPRelate::snpgdsCreateGeno(gds.fn=gds.file,
+                                genmat=X,
+                                sample.id=rownames(X),
+                                snp.id=colnames(X),
+                                snp.chromosome=snp.coords[,1],
+                                snp.position=snp.coords[,2],
+                                snpfirstdim=FALSE)
+    gds <- SNPRelate::snpgdsOpen(gds.file)
+  }
+
+  if(verbose > 1)
+    SNPRelate::snpgdsSummary(gds)
+
+  if(! is.null(seed))
+    set.seed(seed)
+  out.snp.ids <-
+    SNPRelate::snpgdsLDpruning(gdsobj=gds,
+                               remove.monosnp=remove.monosnp,
+                               maf=maf,
+                               missing.rate=missing.rate,
+                               method=method,
+                               slide.max.bp=slide.max.bp,
+                               slide.max.n=slide.max.n,
+                               ld.threshold=ld.threshold,
+                               num.thread=nb.cores,
+                               verbose=ifelse(verbose > 0, TRUE, FALSE))
+  out.snp.ids <- unlist(out.snp.ids)
+
+  if(! is.null(gds.file)){
+    SNPRelate::snpgdsClose(gds)
+    file.remove(gds.file)
+  }
+
+  return(out.snp.ids)
+}
+
 ##' Genotype imputation
 ##'
 ##' Impute missing SNP genotypes with the mean of the non-missing.
diff --git a/man/pruneSnpsLd.Rd b/man/pruneSnpsLd.Rd
new file mode 100644
index 0000000..ce32b9d
--- /dev/null
+++ b/man/pruneSnpsLd.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/quantgen.R
+\name{pruneSnpsLd}
+\alias{pruneSnpsLd}
+\title{Prune SNPs based on LD}
+\usage{
+pruneSnpsLd(X = NULL, snp.coords = NULL, gds = NULL, ld.threshold = 0.2,
+  remove.monosnp = TRUE, maf = NaN, missing.rate = NaN,
+  method = c("composite", "r", "dprime", "corr"), slide.max.bp = 5e+05,
+  slide.max.n = NA, seed = NULL, nb.cores = 1, verbose = 1)
+}
+\arguments{
+\item{X}{matrix of bi-allelic SNP genotypes encoded, for each SNP, in number of copies of its second allele, i.e. as allele doses in {0,1,2}, with genotypes in rows and SNPs in columns; the "second" allele is arbitrary, it can correspond to the minor (least frequent) or the major (most frequent) allele; will be transformed into a "gds" object (see next argument)}
+
+\item{snp.coords}{data frame which row names are SNP identifiers, the first column should contain chromosomes as integers, and the second column should contain positions; compulsory if the X argument is specified}
+
+\item{gds}{object of class "SNPGDSFileClass" from the SNPRelate package}
+
+\item{ld.threshold}{the LD threshold below which SNPs are discarded; will be passed to "snpgdsLDpruning"}
+
+\item{remove.monosnp}{will be passed to "snpgdsLDpruning"}
+
+\item{maf}{will be passed to "snpgdsLDpruning"}
+
+\item{missing.rate}{will be passed to "snpgdsLDpruning"}
+
+\item{method}{will be passed to "snpgdsLDpruning"}
+
+\item{slide.max.bp}{will be passed to "snpgdsLDpruning"}
+
+\item{slide.max.n}{will be passed to "snpgdsLDpruning"}
+
+\item{seed}{seed for the pseudo-random number generator}
+
+\item{nb.cores}{will be passed to "snpgdsLDpruning"}
+
+\item{verbose}{verbosity level (0/1)}
+}
+\value{
+vector of SNP identifiers
+}
+\description{
+Prune SNPs based on their pairwise linkage disequilibriums via sliding windows using the "snpgdsLDpruning" function from the "SNPRelate" package..
+}
+\seealso{
+\code{\link{thinSnps}}, \code{\link[SNPRelate]{snpgdsLDpruning}}
+}
+\author{
+Timothee Flutre
+}
diff --git a/man/thinSnps.Rd b/man/thinSnps.Rd
index eac46be..a9e9c7b 100644
--- a/man/thinSnps.Rd
+++ b/man/thinSnps.Rd
@@ -21,6 +21,9 @@ vector of SNP identifiers
 \description{
 Thin SNPs according to various methods: based on their index or based on their genomic coordinate.
 }
+\seealso{
+\code{\link{pruneSnpsLd}}
+}
 \author{
 Timothee Flutre
 }
diff --git a/man/writeSegregJoinMap.Rd b/man/writeSegregJoinMap.Rd
index ebabb15..701c01a 100644
--- a/man/writeSegregJoinMap.Rd
+++ b/man/writeSegregJoinMap.Rd
@@ -13,15 +13,15 @@ writeSegregJoinMap(pop.name, pop.type = "CP", locus, segregs, genos,
 
 \item{pop.type}{type of the population}
 
-\item{locus}{vector of locus names (should be shorter or equal to 20 characters, otherwise a warning will be issued)}
+\item{locus}{vector of locus names; should be shorter or equal to 20 characters, otherwise a warning will be issued; should be in the same order as the rows of the "genos" argument}
 
-\item{segregs}{vector of segregation types}
+\item{segregs}{vector of segregation types; should be in the same order as the rows of the "genos" argument}
 
 \item{genos}{data frame containing genotypes encoded in the JoinMap format,  similar to the output from \code{\link{genoClasses2JoinMap}}}
 
-\item{phases}{vector of phase types (optional)}
+\item{phases}{vector of phase types (optional; should be in the same order as the rows of the "genos" argument)}
 
-\item{classifs}{vector of classification types (optional)}
+\item{classifs}{vector of classification types (optional; should be in the same order as the rows of the "genos" argument)}
 
 \item{file}{name of the file in which the data will be saved (will be compressed if ends with ".gz" and "gzip" is available in the PATH)}