diff --git a/DESCRIPTION b/DESCRIPTION
index 2851b80..40d065e 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -2,8 +2,8 @@ Package: rMVP
Type: Package
Title: Memory-Efficient, Visualize-Enhanced, Parallel-Accelerated GWAS
Tool
-Version: 1.2.0
-Date: 2024-09-18
+Version: 1.3.0
+Date: 2024-12-12
Authors@R: c( person("Lilin", "Yin", role = "aut"),
person("Haohao", "Zhang", role = "aut"),
person("Zhenshuang", "Tang", role = "aut"),
diff --git a/R/MVP.Data.r b/R/MVP.Data.r
index 5aaa77a..2adc7df 100755
--- a/R/MVP.Data.r
+++ b/R/MVP.Data.r
@@ -12,16 +12,13 @@
#' MVP.Data: To prepare data for MVP package
-#' Author: Xiaolei Liu, Lilin Yin and Haohao Zhang
-#' Build date: Aug 30, 2016
-#' Last update: Sep 12, 2018
#'
#' @param fileMVP Genotype in MVP format
#' @param fileVCF Genotype in VCF format
#' @param fileHMP Genotype in hapmap format
#' @param fileBed Genotype in PLINK binary format
#' @param fileInd Individual name file
-#' @param fileNum Genotype in numeric format; pure 0, 1, 2 matrix; m * n, m is marker size, n is sample size
+#' @param fileNum Genotype in numeric format; pure 0, 1, 2 matrix; m * n or n * m, m is marker size, n is sample size
#' @param filePhe Phenotype, the first column is taxa name, the subsequent columns are traits
#' @param fileMap SNP map information, there are three columns, including SNP_ID, Chromosome, and Position
#' @param fileKin Kinship that represents relationship among individuals, n * n matrix, n is sample size
@@ -246,8 +243,8 @@ MVP.Data.VCF2MVP <- function(vcf_file, out='mvp', maxLine = 1e4, type.geno='char
# parse genotype
bigmat <- filebacked.big.matrix(
- nrow = m,
- ncol = n,
+ nrow = n,
+ ncol = m,
type = type.geno,
backingfile = backingfile,
backingpath = dirname(out),
@@ -308,8 +305,8 @@ MVP.Data.Bfile2MVP <- function(bfile, out='mvp', maxLine=1e4, type.geno='char',
# parse genotype
bigmat <- filebacked.big.matrix(
- nrow = m,
- ncol = n,
+ nrow = n,
+ ncol = m,
type = type.geno,
backingfile = backingfile,
backingpath = dirname(out),
@@ -365,8 +362,8 @@ MVP.Data.Hapmap2MVP <- function(hmp_file, out='mvp', maxLine = 1e4, type.geno='c
# parse genotype
bigmat <- filebacked.big.matrix(
- nrow = m,
- ncol = n,
+ nrow = n,
+ ncol = m,
type = type.geno,
backingfile = backingfile,
backingpath = dirname(out),
@@ -432,8 +429,8 @@ MVP.Data.Numeric2MVP <- function(num_file, map_file, out='mvp', maxLine=1e4, row
# define bigmat
bigmat <- filebacked.big.matrix(
- nrow = m,
- ncol = n,
+ nrow = n,
+ ncol = m,
type = type.geno,
backingfile = backingfile,
backingpath = dirname(out),
@@ -476,15 +473,15 @@ MVP.Data.Numeric2MVP <- function(num_file, map_file, out='mvp', maxLine=1e4, row
len <- length(line)
if (len == 0) { break }
- line <- do.call(rbind, strsplit(line, '\\s+'))
- if (row_names) { line <- line[, 2:ncol(line)]}
+ line <- do.call(cbind, strsplit(line, '\\s+'))
+ if (row_names) { line <- line[2:ncol(line), ]}
if (transposed) {
- bigmat[, (i + 1):(i + ncol(line))] <- line
- i <- i + ncol(line)
- percent <- 100 * i / n
- } else {
bigmat[(i + 1):(i + nrow(line)), ] <- line
i <- i + nrow(line)
+ percent <- 100 * i / n
+ } else {
+ bigmat[, (i + 1):(i + ncol(line))] <- line
+ i <- i + ncol(line)
percent <- 100 * i / m
}
@@ -527,8 +524,11 @@ MVP.Data.Numeric2MVP <- function(num_file, map_file, out='mvp', maxLine=1e4, row
#'
MVP.Data.MVP2Bfile <- function(bigmat, map, pheno=NULL, out='mvp.plink', threads=1, verbose=TRUE) {
t1 <- as.numeric(Sys.time())
+
+ if(nrow(map) != nrow(bigmat) && nrow(map) != ncol(bigmat)) stop("mismatched number of SNPs between genotype and map.")
+ mrk_bycol <- nrow(map) == ncol(bigmat)
# write bed file
- write_bfile(bigmat@address, out, threads=threads, verbose = verbose)
+ write_bfile(bigmat@address, out, mrkbycol=mrk_bycol, threads=threads, verbose = verbose)
# write fam
# 1. Family ID ('FID')
@@ -537,13 +537,14 @@ MVP.Data.MVP2Bfile <- function(bigmat, map, pheno=NULL, out='mvp.plink', threads
# 4. Within-family ID of mother ('0' if mother isn't in dataset)
# 5. Sex code ('1' = male, '2' = female, '0' = unknown)
# 6. Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control)
+ n <- ifelse(mrk_bycol, nrow(bigmat), ncol(bigmat))
if (is.null(pheno)) {
- ind <- paste0("ind", 1:ncol(bigmat))
- pheno <- rep(-9, ncol(bigmat))
+ ind <- paste0("ind", 1:n)
+ pheno <- rep(-9, n)
message("pheno is NULL, automatically named individuals.")
} else if (ncol(pheno) == 1) {
ind <- pheno[, 1]
- pheno <- rep(-9, ncol(bigmat))
+ pheno <- rep(-9, n)
} else {
if (ncol(pheno) > 2) {
message("Only the first phenotype is written to the fam file, and the remaining ", ncol(pheno) - 1, " phenotypes are ignored.")
@@ -704,6 +705,7 @@ MVP.Data.Map <- function(map, out='mvp', cols=1:5, header=TRUE, sep='\t', verbos
#' @param out prefix of output file name
#' @param pcs.keep how many PCs to keep
#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost
+#' @param mrk_bycol whether the markers are stored by columns in genotype (i.e. genotype is a n by m matrix)
#' @param sep seperator for PC file.
#' @param cpu the number of cpu
#' @param verbose whether to print detail.
@@ -727,6 +729,7 @@ MVP.Data.PC <- function(
out=NULL,
pcs.keep=5,
maxLine=10000,
+ mrk_bycol=TRUE,
sep='\t',
cpu=1,
verbose=TRUE
@@ -743,7 +746,7 @@ MVP.Data.PC <- function(
} else if (filePC == TRUE) {
if(is.null(K)){
geno <- attach.big.matrix(paste0(mvp_prefix, ".geno.desc"))
- myPC <- MVP.PCA(M=geno, pcs.keep = pcs.keep, maxLine = maxLine, cpu=cpu)
+ myPC <- MVP.PCA(M=geno, mrk_bycol=mrk_bycol, pcs.keep = pcs.keep, maxLine = maxLine, cpu=cpu)
}else{
myPC <- MVP.PCA(K=K, pcs.keep = pcs.keep, maxLine = maxLine, cpu=cpu)
}
@@ -773,6 +776,7 @@ MVP.Data.PC <- function(
#' @param mvp_prefix Prefix for mvp format files
#' @param out prefix of output file name
#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost
+#' @param mrk_bycol whether the markers are stored by columns in genotype (i.e. genotype is a n by m matrix)
#' @param sep seperator for Kinship file.
#' @param cpu the number of cpu
#' @param verbose whether to print detail.
@@ -794,6 +798,7 @@ MVP.Data.Kin <- function(
mvp_prefix='mvp',
out=NULL,
maxLine=10000,
+ mrk_bycol=TRUE,
sep='\t',
cpu=1,
verbose=TRUE
@@ -809,7 +814,7 @@ MVP.Data.Kin <- function(
myKin <- read.big.matrix(fileKin, header = FALSE, type = 'double', sep = sep)
} else if (fileKin == TRUE) {
geno <- attach.big.matrix(paste0(mvp_prefix, ".geno.desc"))
- myKin <- MVP.K.VanRaden(geno, maxLine = maxLine, cpu = cpu, verbose = verbose)
+ myKin <- MVP.K.VanRaden(geno, mrk_bycol = mrk_bycol, maxLine = maxLine, cpu = cpu, verbose = verbose)
} else {
stop("ERROR: The value of fileKin is invalid.")
}
@@ -838,6 +843,7 @@ MVP.Data.Kin <- function(
#'
#' @param mvp_prefix the prefix of mvp file
#' @param out the prefix of output file
+#' @param mrk_bycol whether the markers are stored by columns in genotype (i.e. genotype is a n by m matrix)
#' @param method 'Major', 'Minor', "Middle"
#' @param ncpus number of threads for imputing
#' @param verbose whether to print the reminder
@@ -849,7 +855,7 @@ MVP.Data.Kin <- function(
#'
#' MVP.Data.impute(mvpPath, tempfile("outfile"), ncpus=1)
#'
-MVP.Data.impute <- function(mvp_prefix, out=NULL, method='Major', ncpus=NULL, verbose=TRUE) {
+MVP.Data.impute <- function(mvp_prefix, out=NULL, mrk_bycol=TRUE, method='Major', ncpus=NULL, verbose=TRUE) {
# input
desc <- paste0(mvp_prefix, ".geno.desc")
bigmat <- attach.big.matrix(desc)
@@ -888,7 +894,7 @@ MVP.Data.impute <- function(mvp_prefix, out=NULL, method='Major', ncpus=NULL, ve
file.copy(paste0(mvp_prefix, ".geno.map"), paste0(out, ".geno.map"))
}
- impute_marker(outmat@address, threads = ncpus, verbose = verbose)
+ impute_marker(outmat@address, mrkbycol = mrk_bycol, threads = ncpus, verbose = verbose)
logging.log("Impute Genotype File is done!\n", verbose = verbose)
}
diff --git a/R/MVP.FarmCPU.r b/R/MVP.FarmCPU.r
index cbde427..ba28518 100755
--- a/R/MVP.FarmCPU.r
+++ b/R/MVP.FarmCPU.r
@@ -20,7 +20,7 @@
#' @author Xiaolei Liu and Zhiwu Zhang
#'
#' @param phe phenotype, n by t matrix, n is sample size, t is number of phenotypes
-#' @param geno genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA.
+#' @param geno genotype, either m by n or n by m is supportable, m is marker size, n is population size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA.
#' @param map SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos
#' @param CV covariates, n by c matrix, n is sample size, c is number of covariates
#' @param ind_idx the index of effective genotyped individuals
@@ -35,6 +35,7 @@
#' @param Prior prior information, four columns, which are SNP_ID, Chr, Pos, P-value
#' @param ncpus number of threads used for parallele computation
#' @param maxLoop maximum number of iterations
+#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost
#' @param threshold.output only the GWAS results with p-values lower than threshold.output will be output
#' @param converge a number, 0 to 1, if selected pseudo QTNs in the last and the second last iterations have a certain probality (the probability is converge) of overlap, the loop will stop
#' @param iteration.output whether to output results of all iterations
@@ -55,7 +56,7 @@
#' print(dim(phenotype))
#' genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP")
#' genotype <- attach.big.matrix(genoPath)
-#' genotype <- deepcopy(genotype, cols=idx)
+#' genotype <- deepcopy(genotype, rows=idx)
#' print(dim(genotype))
#' mapPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.map", package = "rMVP")
#' map <- read.table(mapPath , head = TRUE)
@@ -66,15 +67,20 @@
#'
`MVP.FarmCPU` <- function(phe, geno, map, CV=NULL, ind_idx=NULL, mrk_idx=NULL, P=NULL, method.sub="reward", method.sub.final="reward",
method.bin=c("EMMA", "static", "FaST-LMM"), bin.size=c(5e5,5e6,5e7), bin.selection=seq(10,100,10),
- memo="MVP.FarmCPU", Prior=NULL, ncpus=2, maxLoop=10,
+ memo="MVP.FarmCPU", Prior=NULL, ncpus=2, maxLoop=10, maxLine=5000,
threshold.output=.01, converge=1, iteration.output=FALSE, p.threshold=NA,
QTN.threshold=0.01, bound=NULL, verbose=TRUE){
#print("--------------------- Welcome to FarmCPU ----------------------------")
- n <- ifelse(is.null(ind_idx), ncol(geno), length(ind_idx))
if(!is.big.matrix(geno)) stop("genotype should be in 'big.matrix' format.")
if(sum(is.na(phe[, 2])) != 0) stop("NAs are not allowed in phenotype.")
- if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.")
+ if(is.null(ind_idx)){
+ if(nrow(phe) != ncol(geno) && nrow(phe) != nrow(geno)) stop("number of individuals does not match in phenotype and genotype.")
+ n <- ifelse(nrow(phe) == ncol(geno), ncol(geno), nrow(geno))
+ }else{
+ n <- length(ind_idx)
+ if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.")
+ }
echo=TRUE
nm=nrow(map)
@@ -233,14 +239,10 @@
theCV=CV
if(!is.null(myRemove$bin)){
- if(length(myRemove$seqQTN) == 1){
- #myRemove$bin = as.matrix(myRemove$bin)
- myRemove$bin = t(myRemove$bin)
- }
theCV=cbind(CV,myRemove$bin)
}
- myGLM=FarmCPU.LM(y=phe[,2],GDP=geno,GDP_index=ind_idx,w=theCV,ncpus=ncpus,npc=npc, verbose=verbose)
+ myGLM=FarmCPU.LM(y=phe[,2],GDP=geno,GDP_index=ind_idx,w=theCV,maxLine=maxLine,ncpus=ncpus,npc=npc, verbose=verbose)
if(!is.null(seqQTN)){
if(ncol(myGLM$P) != (npc + length(seqQTN) + 1)) stop("wrong dimensions.")
}
@@ -474,7 +476,7 @@
#' @author Xiaolei Liu and Zhiwu Zhang
#'
#' @param Y a n by 2 matrix, the fist column is taxa id and the second is trait
-#' @param GDP genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA
+#' @param GDP genotype
#' @param GDP_index the index of effective genotyped individuals
#' @param GM SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos
#' @param CV covariates, n by c matrix, n is sample size, c is number of covariates
@@ -497,7 +499,12 @@ FarmCPU.BIN <-
if(is.null(P)) return(list(bin=NULL,binmap=NULL,seqQTN=NULL))
- if(is.null(GDP_index)) GDP_index=seq(1, ncol(GDP))
+ if(nrow(Y) == nrow(GDP)){
+ if(is.null(GDP_index)) GDP_index=seq(1, nrow(GDP))
+ }else{
+ if(is.null(GDP_index)) GDP_index=seq(1, ncol(GDP))
+ }
+
#Set upper bound for bin selection to squareroot of sample size
n=nrow(Y)
#bound=round(sqrt(n)/log10(n))
@@ -551,7 +558,11 @@ FarmCPU.BIN <-
GP=cbind(GM,P,NA,NA,NA)
mySpecify=FarmCPU.Specify(GI=GM,GP=GP,bin.size=bin,inclosure.size=inc)
seqQTN=which(mySpecify$index==TRUE)
- GK=t(GDP[seqQTN, GDP_index])
+ if(nrow(Y) == nrow(GDP)){
+ GK=GDP[GDP_index, seqQTN]
+ }else{
+ GK=t(GDP[seqQTN, GDP_index])
+ }
myBurger=FarmCPU.Burger(Y=Y[,1:2], CV=CV, GK=GK, ncpus=ncpus, method=method)
myREML=myBurger$REMLs
myVG=myBurger$vg #it is unused
@@ -595,7 +606,11 @@ FarmCPU.BIN <-
GP=cbind(GM,P,NA,NA,NA)
mySpecify=FarmCPU.Specify(GI=GM,GP=GP,bin.size=bin,inclosure.size=inc)
seqQTN=which(mySpecify$index==TRUE)
- GK=t(GDP[seqQTN,GDP_index])
+ if(nrow(Y) == nrow(GDP)){
+ GK=GDP[GDP_index, seqQTN]
+ }else{
+ GK=t(GDP[seqQTN, GDP_index])
+ }
myBurger=FarmCPU.Burger(Y=Y[,1:2], CV=CV, GK=GK, ncpus=ncpus, method=method)
myREML=myBurger$REMLs
myVG=myBurger$vg #it is unused
@@ -632,7 +647,11 @@ FarmCPU.BIN <-
GP=cbind(GM,P,NA,NA,NA)
mySpecify=FarmCPU.Specify(GI=GM,GP=GP,bin.size=bin[ii],inclosure.size=inc[ii])
seqQTN=which(mySpecify$index==TRUE)
- GK=t(GDP[seqQTN,GDP_index])
+ if(nrow(Y) == nrow(GDP)){
+ GK=GDP[GDP_index, seqQTN]
+ }else{
+ GK=t(GDP[seqQTN, GDP_index])
+ }
myBurger=FarmCPU.Burger(Y=Y[,1:2], CV=CV, GK=GK, ncpus=ncpus, method=method)
myREML=myBurger$REMLs
myVG=myBurger$vg #it is unused
@@ -754,8 +773,9 @@ FarmCPU.Specify <-
#'
#' @param y one column matrix, dependent variable
#' @param w covariates, n by c matrix, n is sample size, c is number of covariates
-#' @param GDP genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA.
+#' @param GDP genotype
#' @param GDP_index index of effective genotyped individuals
+#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost
#' @param ncpus number of threads used for parallele computation
#' @param npc number of covariates without pseudo QTNs
#' @param verbose whether to print detail.
@@ -767,7 +787,7 @@ FarmCPU.Specify <-
#'
#' @keywords internal
FarmCPU.LM <-
- function(y, w=NULL, GDP, GDP_index=NULL, ncpus=2, npc=0, verbose=TRUE){
+ function(y, w=NULL, GDP, GDP_index=NULL, maxLine=5000, ncpus=2, npc=0, verbose=TRUE){
#print("FarmCPU.LM started")
if(is.null(y)) return(NULL)
if(is.null(GDP)) return(NULL)
@@ -843,7 +863,7 @@ FarmCPU.LM <-
# if(ncol(w) == 50) write.csv(P, "P.csv")
mkl_env({
- results <- glm_c(y=y, X=w, iXX = wwi, GDP@address, geno_ind=GDP_index, verbose=verbose, threads=ncpus)
+ results <- glm_c(y=y, X=w, iXX = wwi, GDP@address, geno_ind=GDP_index, step=maxLine, verbose=verbose, threads=ncpus)
})
return(list(P=results[ ,-c(1:3), drop=FALSE], betapred=betapred, sepred=sepred, B=results[ , 1, drop=FALSE], S=results[ , 2, drop=FALSE]))
# return(list(P=P, betapred=betapred, B=as.matrix(B), S=S))
@@ -904,7 +924,7 @@ FarmCPU.Burger <-
}
if(method=="EMMA"){
- K <- MVP.K.VanRaden(M=as.big.matrix(t(theGK)), verbose=FALSE)
+ K <- MVP.K.VanRaden(M=as.big.matrix(theGK), verbose=FALSE)
myEMMAREML <- MVP.EMMA.Vg.Ve(y=matrix(Y[,-1],nrow(Y),1), X=theCV, K=K)
REMLs=-2*myEMMAREML$REML
delta=myEMMAREML$delta
@@ -982,7 +1002,7 @@ FarmCPU.SUB <-
#'
#' @author Xiaolei Liu and Zhiwu Zhang
#'
-#' @param GDP genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA
+#' @param GDP genotype
#' @param GDP_index the index of effective genotyped individuals
#' @param GM SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos
#' @param seqQTN s by 1 vecter for index of QTN on GM
@@ -1001,7 +1021,11 @@ FarmCPU.Remove <-
if(is.null(seqQTN))return(list(bin=NULL,binmap=NULL,seqQTN=NULL))
seqQTN=seqQTN[order(seqQTN.p)]
- if(is.null(GDP_index)) GDP_index=seq(1, ncol(GDP))
+ if(nrow(GM) == ncol(GDP)){
+ if(is.null(GDP_index)) GDP_index=seq(1, nrow(GDP))
+ }else{
+ if(is.null(GDP_index)) GDP_index=seq(1, ncol(GDP))
+ }
hugeNum=10e10
n=length(seqQTN)
@@ -1025,7 +1049,12 @@ FarmCPU.Remove <-
#Set sample
ratio=.1
maxNum=100000
- m=nrow(GDP)
+ if(nrow(GM) == ncol(GDP)){
+ m=ncol(GDP)
+ }else{
+ m=nrow(GDP)
+ }
+
s=length(GDP_index)
sampled=s
@@ -1038,9 +1067,17 @@ FarmCPU.Remove <-
#This section has problem of turning big.matrix to R matrix
#It is OK as x is small
if(is.big.matrix(GDP)){
- x=t(as.matrix(deepcopy(GDP,rows=seqQTN,cols=index)))
+ if(nrow(GM) == ncol(GDP)){
+ x=deepcopy(GDP,cols=seqQTN,rows=index)[,, drop=FALSE]
+ }else{
+ x=t(deepcopy(GDP,rows=seqQTN,cols=index)[,, drop=FALSE])
+ }
}else{
- x=t(GDP[seqQTN,index] )
+ if(nrow(GM) == ncol(GDP)){
+ x=GDP[index, seqQTN, drop=FALSE]
+ }else{
+ x=t(GDP[seqQTN, index, drop=FALSE])
+ }
}
r=cor(as.matrix(x))
@@ -1062,9 +1099,17 @@ FarmCPU.Remove <-
#This section has problem of turning big.matrix to R matrix
if(is.big.matrix(GDP)){
- bin=t(as.matrix(deepcopy(GDP,rows=seqQTN,cols=GDP_index) ))
+ if(nrow(GM) == ncol(GDP)){
+ bin=deepcopy(GDP,cols=seqQTN,rows=GDP_index)[,, drop=FALSE]
+ }else{
+ bin=t(deepcopy(GDP,rows=seqQTN,cols=GDP_index)[,, drop=FALSE])
+ }
}else{
- bin=t(GDP[seqQTN, GDP_index, drop=FALSE] )
+ if(nrow(GM) == ncol(GDP)){
+ bin=GDP[GDP_index, seqQTN, drop=FALSE]
+ }else{
+ bin=t(GDP[seqQTN, GDP_index, drop=FALSE])
+ }
}
binmap=GM[seqQTN, , drop=FALSE]
diff --git a/R/MVP.GLM.r b/R/MVP.GLM.r
index a7d0614..044e6da 100644
--- a/R/MVP.GLM.r
+++ b/R/MVP.GLM.r
@@ -19,10 +19,11 @@
#' @author Lilin Yin and Xiaolei Liu
#'
#' @param phe phenotype, n * 2 matrix
-#' @param geno Genotype in numeric format, pure 0, 1, 2 matrix; m * n, m is marker size, n is population size
+#' @param geno genotype, either m by n or n by m is supportable, m is marker size, n is population size
#' @param CV Covariance, design matrix(n * x) for the fixed effects
#' @param ind_idx the index of effective genotyped individuals
#' @param mrk_idx the index of effective markers used in analysis
+#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost
#' @param cpu number of cpus used for parallel computation
#' @param verbose whether to print detail.
#'
@@ -38,7 +39,7 @@
#' print(dim(phenotype))
#' genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP")
#' genotype <- attach.big.matrix(genoPath)
-#' genotype <- deepcopy(genotype, cols=idx)
+#' genotype <- deepcopy(genotype, rows=idx)
#' print(dim(genotype))
#'
#' glm <- MVP.GLM(phe=phenotype, geno=genotype, cpu=1)
@@ -49,18 +50,24 @@ function(
phe,
geno,
CV=NULL,
- ind_idx = NULL,
+ ind_idx=NULL,
mrk_idx=NULL,
+ maxLine=5000,
cpu=1,
verbose=TRUE
){
- n <- ifelse(is.null(ind_idx), ncol(geno), length(ind_idx))
- ys <- as.numeric(as.matrix(phe[,2]))
-
+ if(is.null(ind_idx)){
+ if(nrow(phe) != ncol(geno) && nrow(phe) != nrow(geno)) stop("number of individuals does not match in phenotype and genotype.")
+ n <- ifelse(nrow(phe) == ncol(geno), ncol(geno), nrow(geno))
+ }else{
+ n <- length(ind_idx)
+ if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.")
+ }
+
if(!is.big.matrix(geno)) stop("genotype should be in 'big.matrix' format.")
+ ys <- as.numeric(as.matrix(phe[,2]))
if(sum(is.na(ys)) != 0) stop("NAs are not allowed in phenotype.")
- if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.")
if(is.null(CV)){
X0 <- matrix(1, n)
@@ -77,7 +84,7 @@ function(
logging.log("scanning...\n", verbose = verbose)
mkl_env({
- results <- glm_c(y = ys, X = X0, iXX = iX0X0, geno@address, geno_ind = ind_idx, marker_ind = mrk_idx, verbose = verbose, threads = cpu)
+ results <- glm_c(y = ys, X = X0, iXX = iX0X0, geno@address, geno_ind = ind_idx, marker_ind = mrk_idx, step = maxLine, verbose = verbose, threads = cpu)
})
return(results[, c(1, 2, ncol(results))])
diff --git a/R/MVP.K.VanRaden.r b/R/MVP.K.VanRaden.r
index 169bac7..fe80911 100755
--- a/R/MVP.K.VanRaden.r
+++ b/R/MVP.K.VanRaden.r
@@ -13,13 +13,12 @@
#' Calculate Kinship matrix by VanRaden method
#'
-#' Build date: Dec 12, 2016
-#' Last update: Dec 12, 2019
-#'
-#' @param M Genotype, m * n, m is marker size, n is population size
+#' @param M genotype, either m by n or n by m is supportable, m is marker size, n is population size
#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost
#' @param ind_idx the index of effective genotyped individuals used in analysis
#' @param mrk_idx the index of effective markers used in analysis
+#' @param mrk_freq the prior calculated major allele frequency (not MAF) for all markers used in analysis
+#' @param mrk_bycol whether the markers are stored by columns in genotype (i.e. M is a n by m matrix)
#' @param cpu the number of cpu
#' @param verbose whether to print detail.
#' @param checkNA whether to check NA in genotype.
@@ -42,6 +41,8 @@ function(
maxLine=5000,
ind_idx=NULL,
mrk_idx=NULL,
+ mrk_freq = NULL,
+ mrk_bycol = TRUE,
cpu=1,
verbose=TRUE,
checkNA=TRUE
@@ -53,19 +54,32 @@ function(
mac <- (!linux) & (!wind)
HPCMathLib <- mac | grepl("mkl", sessionInfo()$LAPACK) | grepl("openblas", sessionInfo()$LAPACK) | eval(parse(text = "!inherits(try(Revo.version,silent=TRUE),'try-error')"))
- if (!is.big.matrix(M)) stop("Format of Genotype Data must be big.matrix")
+ if(!is.big.matrix(M)) stop("Format of Genotype Data must be big.matrix")
if(checkNA){
- if(hasNA(M@address, geno_ind = ind_idx, threads = cpu)) stop("NA is not allowed in genotype, use 'MVP.Data.impute' to impute.")
+ if(hasNA(M@address, mrkbycol = mrk_bycol, geno_ind = ind_idx, marker_ind = mrk_idx, threads = cpu)) stop("NA is not allowed in genotype, use 'MVP.Data.impute' to impute.")
}
- n <- ifelse(is.null(ind_idx), ncol(M), length(ind_idx))
- m <- ifelse(is.null(mrk_idx), nrow(M), length(mrk_idx))
+
+ if(!is.null(mrk_freq)){
+ if(!is.null(mrk_idx)){
+ if(length(mrk_freq) != length(mrk_idx)) stop("mismatched number of markers between provided frequency and index.")
+ }else{
+ if(mrk_bycol){
+ if(length(mrk_freq) != ncol(M)) stop("mismatched number of markers between genotype and provided frequency.")
+ }else{
+ if(length(mrk_freq) != nrow(M)) stop("mismatched number of markers between genotype and provided frequency.")
+ }
+ }
+ }
+
+ n <- ifelse(is.null(ind_idx), ifelse(mrk_bycol, nrow(M), ncol(M)), length(ind_idx))
+ m <- ifelse(is.null(mrk_idx), ifelse(mrk_bycol, ncol(M), nrow(M)), length(mrk_idx))
# logging.log("Relationship matrix mode in", priority[1], "\n", verbose = verbose)
# if(is.null(dim(M))) M <- t(as.matrix(M))
logging.log(paste0("Computing GRM for ", n, " individuals using ", m, " markers with a step of ", maxLine), "\n", verbose = verbose)
- K <- try(kin_cal(M@address, geno_ind = ind_idx, marker_ind = mrk_idx, threads = cpu, step = maxLine, verbose = verbose, mkl = HPCMathLib), silent=TRUE)
+ K <- try(kin_cal(M@address, geno_ind = ind_idx, marker_ind = mrk_idx, marker_freq = mrk_freq, marker_bycol = mrk_bycol, threads = cpu, step = maxLine, verbose = verbose, mkl = HPCMathLib), silent=TRUE)
# K <- try(kin_cal_s(M@address, threads = cpu, verbose = verbose, mkl = HPCMathLib), silent=TRUE)
if(inherits(K,"try-error")){
@@ -177,40 +191,3 @@ function(
logging.log("Deriving relationship matrix successfully", "\n", verbose = verbose); gc()
return(K)
}#end of MVP.k.VanRaden function
-
-# #' Calculate Kinship matrix by Blocking strategy
-# #'
-# #' Build date: Apr 14, 2021
-# #' Last update: Apr 14, 2021
-# #'
-# #' @param M Genotype, m * n, m is marker size, n is population size
-# #' @param step Number of markers processed at one time
-# #'
-# #' @return K, n * n matrix
-# #' @export
-# #'
-# #' @examples
-# #' genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP")
-# #' genotype <- attach.big.matrix(genoPath)
-# #' print(dim(genotype))
-# #'
-# #' K <- MVP.calk(genotype)
-# #'
-# MVP.calk <- function(M, step = 1000) {
-# n_marker = nrow(M)
-# n_sample = ncol(M)
-# if (step > n_marker) { step = n_marker }
-# idx = 0
-# sum = 0
-# K = matrix(0, n_sample, n_sample)
-# while(idx < n_marker){
-# G_buffer = M[idx + seq_len(step), ]
-# means = rowMeans(G_buffer)
-# sum = sum + (0.5 * means) %*% (1 - 0.5 * means)
-# K = K + crossprod(G_buffer - means)
-# idx = idx + step
-# if (idx + step > n_marker) { step = n_marker - idx }
-# }
-# K = K / (2 * drop(sum))
-# return(K)
-# }
diff --git a/R/MVP.MLM.r b/R/MVP.MLM.r
index 820689c..1c2ad02 100644
--- a/R/MVP.MLM.r
+++ b/R/MVP.MLM.r
@@ -12,20 +12,18 @@
#' To perform GWAS with GLM and MLM model and get the P value of SNPs
-#'
-#' Build date: Aug 30, 2016
-#' Last update: Aug 30, 2016
#'
#' @author Lilin Yin and Xiaolei Liu
#'
#' @param phe phenotype, n * 2 matrix
-#' @param geno genotype, m * n, m is marker size, n is population size
+#' @param geno genotype, either m by n or n by m is supportable, m is marker size, n is population size
#' @param K Kinship, Covariance matrix(n * n) for random effects; must be positive semi-definite
#' @param eigenK list of eigen Kinship
#' @param CV covariates
#' @param ind_idx the index of effective genotyped individuals
#' @param mrk_idx the index of effective markers used in analysis
#' @param REML a list that contains ve and vg
+#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost
#' @param cpu number of cpus used for parallel computation
#' @param vc.method the methods for estimating variance component("emma" or "he" or "brent")
#' @param verbose whether to print detail.
@@ -43,7 +41,7 @@
#' print(dim(phenotype))
#' genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP")
#' genotype <- attach.big.matrix(genoPath)
-#' genotype <- deepcopy(genotype, cols=idx)
+#' genotype <- deepcopy(genotype, rows=idx)
#' print(dim(genotype))
#' K <- MVP.K.VanRaden(genotype, cpu=1)
#'
@@ -62,18 +60,24 @@ function(
ind_idx=NULL,
mrk_idx=NULL,
REML=NULL,
+ maxLine=5000,
cpu=1,
vc.method=c("BRENT", "EMMA", "HE"),
verbose=TRUE
){
+ if(is.null(ind_idx)){
+ if(nrow(phe) != ncol(geno) && nrow(phe) != nrow(geno)) stop("number of individuals does not match in phenotype and genotype.")
+ n <- ifelse(nrow(phe) == ncol(geno), ncol(geno), nrow(geno))
+ }else{
+ n <- length(ind_idx)
+ if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.")
+ }
vc.method <- match.arg(vc.method)
- n <- ifelse(is.null(ind_idx), ncol(geno), length(ind_idx))
ys <- as.numeric(as.matrix(phe[,2]))
if(!is.big.matrix(geno)) stop("genotype should be in 'big.matrix' format.")
if(sum(is.na(ys)) != 0) stop("NAs are not allowed in phenotype.")
- if(nrow(phe) != n) stop("number of individuals does not match in phenotype and genotype.")
if(is.null(K)){
if(vc.method == "EMMA" | vc.method == "he") stop("Kinship must be provided!")
@@ -121,7 +125,7 @@ function(
logging.log("scanning...\n", verbose = verbose)
mkl_env({
- results <- mlm_c(y = ys, X = X0, U = U, vgs = vgs, geno@address, ind_idx, mrk_idx, verbose = verbose, threads = cpu)
+ results <- mlm_c(y = ys, X = X0, U = U, vgs = vgs, geno@address, ind_idx, mrk_idx, step = maxLine, verbose = verbose, threads = cpu)
})
return(results)
diff --git a/R/MVP.PCA.r b/R/MVP.PCA.r
index 9485120..2d8261d 100755
--- a/R/MVP.PCA.r
+++ b/R/MVP.PCA.r
@@ -13,15 +13,12 @@
#' Principal Component Analysis
#'
-#' Build date: Dec 14, 2016
-#' Last update: Oct 29, 2018
-#' @author Xiaolei Liu, Lilin Yin and Haohao Zhang
-#'
-#' @param M Genotype in numeric format, pure 0, 1, 2 matrix; m * n, m is marker size, n is population size
+#' @param M genotype, either m by n or n by m is supportable, m is marker size, n is population size
#' @param K kinship matrix
#' @param maxLine the number of markers handled at a time, smaller value would reduce the memory cost
#' @param ind_idx the index of effective genotyped individuals used in analysis
#' @param mrk_idx the index of effective markers used in analysis
+#' @param mrk_bycol whether the markers are stored by columns in genotype (i.e. M is a n by m matrix)
#' @param pcs.keep maximum number of PCs for output
#' @param cpu the number of cpu
#' @param verbose whether to print detail.
@@ -42,7 +39,7 @@
#' }
#'
MVP.PCA <-
-function(M=NULL, K=NULL, maxLine=10000, ind_idx=NULL, mrk_idx=NULL, pcs.keep=5, cpu=1, verbose=TRUE){
+function(M=NULL, K=NULL, maxLine=10000, ind_idx=NULL, mrk_idx=NULL, mrk_bycol=TRUE, pcs.keep=5, cpu=1, verbose=TRUE){
#Data Check
if (is.null(M) & is.null(K)) {
@@ -58,7 +55,7 @@ function(M=NULL, K=NULL, maxLine=10000, ind_idx=NULL, mrk_idx=NULL, pcs.keep=5,
}
if(is.null(K)){
- K <- MVP.K.VanRaden(M=M, ind_idx = ind_idx, mrk_idx = mrk_idx, maxLine = maxLine, cpu = cpu, verbose = verbose)
+ K <- MVP.K.VanRaden(M=M, ind_idx = ind_idx, mrk_idx = mrk_idx, mrk_bycol = mrk_bycol, maxLine = maxLine, cpu = cpu, verbose = verbose)
}else{
if(!is.null(ind_idx)) K <- K[ind_idx, ind_idx]
}
diff --git a/R/MVP.Utility.r b/R/MVP.Utility.r
index 8e4dd72..d959e65 100755
--- a/R/MVP.Utility.r
+++ b/R/MVP.Utility.r
@@ -25,12 +25,12 @@
#'
#' @examples
#' MVP.Version()
-MVP.Version <- function(width=60, verbose=TRUE) {
+MVP.Version <- function(width=65, verbose=TRUE) {
welcome <- "Welcome to MVP"
- title <- "A Memory-efficient, Visualization-enhanced, and Parallel-accelerated Tool For GWAS"
- authors <- c("Designed and Maintained by Lilin Yin, Haohao Zhang, and Xiaolei Liu",
+ title <- "an R package for Memory-efficient, Visualization-enhanced and Parallel-accelerated genome-wide association study"
+ authors <- c("Design & Maintain: Lilin Yin, Haohao Zhang, and Xiaolei Liu",
"Contributors: Zhenshuang Tang, Jingya Xu, Dong Yin, Zhiwu Zhang, Xiaohui Yuan, Mengjin Zhu, Shuhong Zhao, Xinyun Li")
- contact <- "Contact: xiaoleiliu@mail.hzau.edu.cn"
+ contact <- "Mailto: xiaoleiliu@mail.hzau.edu.cn, ylilin@mail.hzau.edu.cn"
logo_s <- c(" __ __ __ __ ___",
"| \\/ | \\ \\ / / | _ \\",
"| |\\/| | \\ V / | _/",
diff --git a/R/MVP.r b/R/MVP.r
index a31f29c..e453aa0 100755
--- a/R/MVP.r
+++ b/R/MVP.r
@@ -18,13 +18,10 @@
#' Object 3: Estimate variance components using EMMA, FaST-LMM, and HE regression
#' Object 4: Generate high-quality figures
#'
-#' Build date: Aug 30, 2017
-#' Last update: Dec 14, 2018
-#'
#' @author Lilin Yin, Haohao Zhang, and Xiaolei Liu
#'
#' @param phe phenotype, n * 2 matrix, n is sample size
-#' @param geno Genotype in bigmatrix format; m * n, m is marker size, n is sample size
+#' @param geno genotype, either m by n or n by m is supportable, m is marker size, n is population size
#' @param map SNP map information, SNP name, Chr, Pos
#' @param K Kinship, Covariance matrix(n * n) for random effects, must be positive semi-definite
#' @param nPC.GLM number of PCs added as fixed effects in GLM
@@ -49,7 +46,7 @@
#' @param permutation.rep number of permutation replicates
#' @param col for color of points in each chromosome on manhattan plot
#' @param memo Character. A text marker on output files
-#' @param tmppath the path of the temporary file
+#' @param outpath the path of the output files
#' @param file.output whether to output files or not
#' @param file.type figure formats, "jpg", "tiff"
#' @param dpi resolution for output figures
@@ -57,7 +54,7 @@
#' @param verbose whether to print detail.
#'
#' @export
-#' @return a m * 2 matrix, the first column is the SNP effect, the second column is the P values
+#' @return
#' Output: MVP.return$map - SNP map information, SNP name, Chr, Pos
#' Output: MVP.return$glm.results - p-values obtained by GLM method
#' Output: MVP.return$mlm.results - p-values obtained by MLM method
@@ -116,14 +113,15 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
}
logging.initialize("MVP", logging.outpath)
- MVP.Version(width = 60, verbose = verbose)
- logging.log("Start:", as.character(Sys.time()), "\n", verbose = verbose)
+ MVP.Version(width = 65, verbose = verbose)
+ time_start <- Sys.time()
+ logging.log("Start:", format(time_start, format = "%F %T %Z"), "\n", verbose = verbose)
if ("log" %in% file.output) {
logging.log("The log has been output to the file:", get("logging.file", envir = package.env), "\n", verbose = verbose)
}
vc.method <- match.arg(vc.method)
- if (nrow(phe) != ncol(geno)) stop("The number of individuals in phenotype and genotype doesn't match!")
- if (nrow(geno) != nrow(map)) stop("The number of markers in genotype and map doesn't match!")
+ if (nrow(phe) != ncol(geno) & nrow(phe) != nrow(geno)) stop("The number of individuals in phenotype and genotype doesn't match!")
+ if (nrow(map) != ncol(geno) & nrow(map) != nrow(geno)) stop("The number of markers in genotype and map doesn't match!")
if (!is.big.matrix(geno)) stop("genotype should be in 'big.matrix' format.")
#list -> matrix
@@ -135,27 +133,39 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
na.index <- NULL
if (!is.null(CV.GLM)) {
CV.GLM <- as.matrix(CV.GLM)
- if (nrow(CV.GLM) != ncol(geno)) stop("The number of individuals in covariates and genotype doesn't match!")
+ if (nrow(CV.GLM) != nrow(phe)) stop("The number of individuals in covariates and phenotype doesn't match!")
na.index <- c(na.index, which(is.na(CV.GLM), arr.ind = TRUE)[, 1])
}
if (!is.null(CV.MLM)) {
CV.MLM <- as.matrix(CV.MLM)
- if (nrow(CV.MLM) != ncol(geno)) stop("The number of individuals in covariates and genotype doesn't match!")
+ if (nrow(CV.MLM) != nrow(phe)) stop("The number of individuals in covariates and phenotype doesn't match!")
na.index <- c(na.index, which(is.na(CV.MLM), arr.ind = TRUE)[, 1])
}
if (!is.null(CV.FarmCPU)) {
CV.FarmCPU <- as.matrix(CV.FarmCPU)
- if (nrow(CV.FarmCPU) != ncol(geno)) stop("The number of individuals in covariates and genotype doesn't match!")
+ if (nrow(CV.FarmCPU) != nrow(phe)) stop("The number of individuals in covariates and phenotype doesn't match!")
na.index <- c(na.index, which(is.na(CV.FarmCPU), arr.ind = TRUE)[, 1])
}
na.index <- unique(na.index)
#Data information
- m <- nrow(geno)
- n <- ncol(geno)
- logging.log(paste("Input data has", n, "individuals,", m, "markers"), "\n", verbose = verbose)
+ MrkByCol <- nrow(phe) == nrow(geno)
+ m <- ifelse(MrkByCol, ncol(geno), nrow(geno))
+ n <- nrow(phe)
+ logging.log(paste("Input data has", n, "individuals and", m, "markers"), "\n", verbose = verbose)
+ logging.log(paste("Markers are detected to be stored by", ifelse(MrkByCol, "column", "row")), "\n", verbose = verbose)
logging.log("Analyzed trait:", colnames(phe)[2], "\n", verbose = verbose)
logging.log("Number of threads used:", ncpus, "\n", verbose = verbose)
+ hpclib <- grepl("mkl", sessionInfo()$LAPACK) | grepl("openblas", sessionInfo()$LAPACK) | !inherits(try(Revo.version, silent=TRUE),"try-error")
+ if(!hpclib){
+ logging.log("No high performance math library detected! The computational efficiency would be greatly reduced\n", verbose = verbose)
+ }else{
+ if(grepl("mkl", sessionInfo()$LAPACK) | !inherits(try(Revo.version, silent=TRUE),"try-error")){
+ logging.log("Math Kernel Library is detected, nice job!\n", verbose = verbose)
+ }else{
+ logging.log("OpenBLAS Library is detected, nice job!\n", verbose = verbose)
+ }
+ }
#remove samples with missing phenotype
seqTaxa <- which(!is.na(phe[,2]))
@@ -169,15 +179,23 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
if (!is.null(CV.GLM)) { CV.GLM = CV.GLM[seqTaxa, , drop = FALSE] }
if (!is.null(CV.MLM)) { CV.MLM = CV.MLM[seqTaxa, , drop = FALSE] }
if (!is.null(CV.FarmCPU)) { CV.FarmCPU = CV.FarmCPU[seqTaxa, , drop = FALSE] }
- if(length(seqTaxa) < n * 0.85){
- geno <- deepcopy(geno, cols = seqTaxa)
+ if(length(seqTaxa) < n * 0.8){
+ logging.log("Re-build memory-mapping file for remaining individuals", "\n", verbose = verbose)
+ if(!MrkByCol){
+ geno <- deepcopy(geno, cols = seqTaxa)
+ }else{
+ geno <- deepcopy(geno, rows = seqTaxa)
+ }
seqTaxa <- NULL
}
}
- logging.log("Check if NAs exist in genotype...", verbose = verbose)
- if(hasNA(geno@address, geno_ind = seqTaxa, threads = ncpus)) stop("NA is not allowed in genotype, use 'MVP.Data.impute' to impute.")
- logging.log("(OK!)", "\n", verbose = verbose)
+ logging.log("Check if NAs exist in genotype...", "\n", verbose = verbose)
+ if(hasNA(geno@address, mrkbycol = MrkByCol, geno_ind = seqTaxa, threads = ncpus)) stop("NA is not allowed in genotype, use 'MVP.Data.impute' to impute.")
+
+ logging.log("Calculate allele frequency...", "\n", verbose = verbose)
+ marker_freq <- BigRowMean(geno@address, MrkByCol, threads = ncpus, geno_ind = seqTaxa) / 2
+ map$MAF <- ifelse(marker_freq > 0.5, 1 - marker_freq, marker_freq)
#remove SNPs with low MAF
geno_marker_index <- NULL
@@ -185,10 +203,7 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
if (!is.null(maf)){
if(length(maf) != 1) stop("maf should be a value")
if(maf <= 0 || maf >= 0.5) stop("maf should be at the range of 0-0.5")
- logging.log("Calculate allele frequency...", "\n", verbose = verbose)
- marker_maf <- BigRowMean(geno@address, threads = ncpus, geno_ind = seqTaxa) / 2
- marker_maf <- ifelse(marker_maf > 0.5, 1 - marker_maf, marker_maf)
- geno_marker_index <- which(marker_maf > maf)
+ geno_marker_index <- which(map$MAF >= maf)
if(length(geno_marker_index) == 0) stop(paste("MAFs of all markers are smaller than the threshold", maf))
if(length(geno_marker_index) == 1) stop(paste("only 1 marker left on the given MAF threshold", maf))
if(length(geno_marker_index) == m) geno_marker_index <- NULL
@@ -197,12 +212,23 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
logging.log("Total", m - length(geno_marker_index), "markers are removed at MAF threshold", maf, "\n", verbose = verbose)
m <- length(geno_marker_index)
map_sub <- map[geno_marker_index, ]
- if(length(geno_marker_index) < m * 0.85){
+ marker_freq <- marker_freq[geno_marker_index]
+ if(length(geno_marker_index) < m * 0.8){
if(!is.null(seqTaxa)){
- geno <- deepcopy(geno, cols = seqTaxa, rows = geno_marker_index)
+ logging.log("Re-build memory-mapping file for remaining individuals and markers", "\n", verbose = verbose)
+ if(MrkByCol){
+ geno <- deepcopy(geno, rows = seqTaxa, cols = geno_marker_index)
+ }else{
+ geno <- deepcopy(geno, cols = seqTaxa, rows = geno_marker_index)
+ }
seqTaxa <- NULL
}else{
- geno <- deepcopy(geno, rows = geno_marker_index)
+ logging.log("Re-build memory-mapping file for remaining markers", "\n", verbose = verbose)
+ if(MrkByCol){
+ geno <- deepcopy(geno, cols = geno_marker_index)
+ }else{
+ geno <- deepcopy(geno, rows = geno_marker_index)
+ }
}
geno_marker_index <- NULL
}
@@ -229,11 +255,13 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
if (!is.null(nPC) | "MLM" %in% method) {
if (is.null(K)) {
K <- MVP.K.VanRaden(
- M = geno,
- ind_idx = seqTaxa,
- mrk_idx = geno_marker_index,
- maxLine = maxLine,
- cpu = ncpus,
+ M = geno,
+ ind_idx = seqTaxa,
+ mrk_idx = geno_marker_index,
+ mrk_freq = marker_freq,
+ mrk_bycol = MrkByCol,
+ maxLine = maxLine,
+ cpu = ncpus,
verbose = verbose,
checkNA = FALSE
)
@@ -312,7 +340,7 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
logging.log("-------------------------GWAS Start-------------------------", "\n", verbose = verbose)
if (glm.run) {
logging.log("General Linear Model (GLM) Start...", "\n", verbose = verbose)
- glm.results <- MVP.GLM(phe=phe, geno=geno, CV=CV.GLM, ind_idx=seqTaxa, mrk_idx=geno_marker_index, cpu=ncpus, verbose = verbose);gc()
+ glm.results <- MVP.GLM(phe=phe, geno=geno, CV=CV.GLM, ind_idx=seqTaxa, mrk_idx=geno_marker_index, maxLine = maxLine, cpu=ncpus, verbose = verbose);gc()
colnames(glm.results) <- c("Effect", "SE", paste(colnames(phe)[2],"GLM",sep="."))
z = glm.results[, 1]/glm.results[, 2]
lambda = median(z^2, na.rm=TRUE)/qchisq(1/2, df = 1,lower.tail=FALSE)
@@ -327,7 +355,7 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
if (mlm.run) {
logging.log("Mixed Linear Model (MLM) Start...", "\n", verbose = verbose)
- mlm.results <- MVP.MLM(phe=phe, geno=geno, K=K, eigenK=eigenK, CV=CV.MLM, ind_idx=seqTaxa, mrk_idx=geno_marker_index, cpu=ncpus, vc.method=vc.method, verbose = verbose);gc()
+ mlm.results <- MVP.MLM(phe=phe, geno=geno, K=K, eigenK=eigenK, CV=CV.MLM, ind_idx=seqTaxa, mrk_idx=geno_marker_index, maxLine = maxLine, cpu=ncpus, vc.method=vc.method, verbose = verbose);gc()
colnames(mlm.results) <- c("Effect", "SE", paste(colnames(phe)[2],"MLM",sep="."))
z = mlm.results[, 1]/mlm.results[, 2]
lambda = median(z^2, na.rm=TRUE)/qchisq(1/2, df = 1,lower.tail=FALSE)
@@ -342,7 +370,7 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
if (farmcpu.run) {
logging.log("FarmCPU Start...", "\n", verbose = verbose)
- farmcpu.results <- MVP.FarmCPU(phe=phe, geno=geno, map=map[,1:3], CV=CV.FarmCPU, ind_idx=seqTaxa, mrk_idx=geno_marker_index, ncpus=ncpus, memo="MVP.FarmCPU", p.threshold=p.threshold, QTN.threshold=QTN.threshold, method.bin=method.bin, bin.size=bin.size, bin.selection=bin.selection, maxLoop=maxLoop, verbose = verbose)
+ farmcpu.results <- MVP.FarmCPU(phe=phe, geno=geno, map=map[,1:3], CV=CV.FarmCPU, ind_idx=seqTaxa, mrk_idx=geno_marker_index, maxLine = maxLine, ncpus=ncpus, memo="MVP.FarmCPU", p.threshold=p.threshold, QTN.threshold=QTN.threshold, method.bin=method.bin, bin.size=bin.size, bin.selection=bin.selection, maxLoop=maxLoop, verbose = verbose)
colnames(farmcpu.results) <- c("Effect", "SE", paste(colnames(phe)[2],"FarmCPU",sep="."))
z = farmcpu.results[, 1]/farmcpu.results[, 2]
lambda = median(z^2, na.rm=TRUE)/qchisq(1/2, df = 1,lower.tail=FALSE)
@@ -366,12 +394,12 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
myY.shuffle = phe
myY.shuffle[,2] = myY.shuffle[index.shuffle,2]
#GWAS using t.test...
- myPermutation = MVP.GLM(phe=myY.shuffle[,c(1,2)], geno=geno, ind_idx=seqTaxa, mrk_idx=geno_marker_index, cpu=ncpus)
+ myPermutation = MVP.GLM(phe=myY.shuffle[,c(1,2)], geno=geno, ind_idx=seqTaxa, mrk_idx=geno_marker_index, maxLine=maxLine, cpu=ncpus)
pvalue = min(myPermutation[,3],na.rm=TRUE)
if(i==1){
- pvalue.final=pvalue
- }else{
- pvalue.final=c(pvalue.final,pvalue)
+ pvalue.final=pvalue
+ }else{
+ pvalue.final=c(pvalue.final,pvalue)
}
}#end of permutation.rep
permutation.cutoff = sort(pvalue.final)[ceiling(permutation.rep*0.05)]
@@ -451,11 +479,20 @@ function(phe, geno, map, K=NULL, nPC.GLM=NULL, nPC.MLM=NULL, nPC.FarmCPU=NULL,
)
}
}
- now <- Sys.time()
+ time_end <- Sys.time()
if (length(file.output) > 0) {
logging.log("Results are stored at Working Directory:", outpath, "\n", verbose = verbose)
}
- logging.log("End:", as.character(now), "\n", verbose = verbose)
+ logging.log("End:", format(time_end, format = "%F %T %Z"), "\n", verbose = verbose)
+ time_diff <- as.numeric(time_end) - as.numeric(time_start)
+ h <- time_diff %/% 3600
+ m <- (time_diff %% 3600) %/% 60
+ s <- ((time_diff %% 3600) %% 60)
+ index <- which(c(h, m, s) != 0)
+ num <- c(h, m, s)[index]
+ num <- round(num, 0)
+ char <- c("h", "m", "s")[index]
+ logging.log("Total running time:", paste(num, char, sep="", collapse=""), "\n", verbose = verbose)
print_accomplished(width = 60, verbose = verbose)
return(invisible(MVP.return))
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 1b2c695..6cda37c 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -5,12 +5,12 @@ getRow <- function(pBigMat, row) {
.Call(`_rMVP_getRow`, pBigMat, row)
}
-glm_c <- function(y, X, iXX, pBigMat, geno_ind = NULL, marker_ind = NULL, verbose = TRUE, threads = 0L) {
- .Call(`_rMVP_glm_c`, y, X, iXX, pBigMat, geno_ind, marker_ind, verbose, threads)
+glm_c <- function(y, X, iXX, pBigMat, geno_ind = NULL, marker_ind = NULL, step = 10000L, verbose = TRUE, threads = 0L) {
+ .Call(`_rMVP_glm_c`, y, X, iXX, pBigMat, geno_ind, marker_ind, step, verbose, threads)
}
-mlm_c <- function(y, X, U, vgs, pBigMat, geno_ind = NULL, marker_ind = NULL, verbose = TRUE, threads = 0L) {
- .Call(`_rMVP_mlm_c`, y, X, U, vgs, pBigMat, geno_ind, marker_ind, verbose, threads)
+mlm_c <- function(y, X, U, vgs, pBigMat, geno_ind = NULL, marker_ind = NULL, step = 10000L, verbose = TRUE, threads = 0L) {
+ .Call(`_rMVP_mlm_c`, y, X, U, vgs, pBigMat, geno_ind, marker_ind, step, verbose, threads)
}
vcf_parser_map <- function(vcf_file, out) {
@@ -33,8 +33,8 @@ numeric_scan <- function(num_file) {
.Call(`_rMVP_numeric_scan`, num_file)
}
-write_bfile <- function(pBigMat, bed_file, threads = 0L, verbose = TRUE) {
- invisible(.Call(`_rMVP_write_bfile`, pBigMat, bed_file, threads, verbose))
+write_bfile <- function(pBigMat, bed_file, mrkbycol = TRUE, threads = 0L, verbose = TRUE) {
+ invisible(.Call(`_rMVP_write_bfile`, pBigMat, bed_file, mrkbycol, threads, verbose))
}
read_bfile <- function(bed_file, pBigMat, maxLine, threads = 0L, verbose = TRUE) {
@@ -53,16 +53,16 @@ geninv <- function(GG) {
.Call(`_rMVP_geninv`, GG)
}
-impute_marker <- function(pBigMat, threads = 0L, verbose = TRUE) {
- invisible(.Call(`_rMVP_impute_marker`, pBigMat, threads, verbose))
+impute_marker <- function(pBigMat, mrkbycol = TRUE, threads = 0L, verbose = TRUE) {
+ invisible(.Call(`_rMVP_impute_marker`, pBigMat, mrkbycol, threads, verbose))
}
-hasNA <- function(pBigMat, geno_ind = NULL, threads = 1L) {
- .Call(`_rMVP_hasNA`, pBigMat, geno_ind, threads)
+hasNA <- function(pBigMat, mrkbycol = TRUE, geno_ind = NULL, marker_ind = NULL, threads = 1L) {
+ .Call(`_rMVP_hasNA`, pBigMat, mrkbycol, geno_ind, marker_ind, threads)
}
-BigRowMean <- function(pBigMat, threads = 0L, geno_ind = NULL) {
- .Call(`_rMVP_BigRowMean`, pBigMat, threads, geno_ind)
+BigRowMean <- function(pBigMat, mrkbycol = TRUE, threads = 0L, geno_ind = NULL) {
+ .Call(`_rMVP_BigRowMean`, pBigMat, mrkbycol, threads, geno_ind)
}
kin_cal_m <- function(pBigMat, threads = 0L, verbose = TRUE) {
@@ -73,7 +73,7 @@ kin_cal_s <- function(pBigMat, threads = 0L, mkl = FALSE, verbose = TRUE) {
.Call(`_rMVP_kin_cal_s`, pBigMat, threads, mkl, verbose)
}
-kin_cal <- function(pBigMat, geno_ind = NULL, marker_ind = NULL, threads = 0L, step = 10000L, mkl = FALSE, verbose = TRUE) {
- .Call(`_rMVP_kin_cal`, pBigMat, geno_ind, marker_ind, threads, step, mkl, verbose)
+kin_cal <- function(pBigMat, geno_ind = NULL, marker_ind = NULL, marker_freq = NULL, marker_bycol = TRUE, threads = 0L, step = 10000L, mkl = FALSE, verbose = TRUE) {
+ .Call(`_rMVP_kin_cal`, pBigMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose)
}
diff --git a/README.md b/README.md
index 515744f..5ddb3fd 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# rMVP [](https://github.com/XiaoleiLiuBio/rMVP/issues/new) [](https://cran.r-project.org/package=rMVP) [](https://CRAN.R-project.org/package=rMVP) []()  [](https://cran.r-project.org/package=rMVP)
+# rMVP [](https://github.com/XiaoleiLiuBio/rMVP/issues/new) [](https://cran.r-project.org/package=rMVP) [](https://CRAN.R-project.org/package=rMVP) []()  [](https://cran.r-project.org/package=rMVP)
## An [r](https://) package for [M](https://)emory-efficient, [V](https://)isualization-enhanced, and [P](https://)arallel-accelerated Genome-Wide Association Study
@@ -227,7 +227,7 @@ MVP.Data(fileHMP=c("hmp.chr1.txt", "hmp.chr2.txt", "hmp.chr3.txt", "hmp.chr4.txt
### Numeric
**[back to top](#contents)**
-If you have genotype data in **Numeric** (m * n, m rows and n columns, m is the number of SNPs, n is the number of individuals) format:
+If you have genotype data in **Numeric** (either m * n or n * m is acceptable, m is the number of SNPs, n is the number of individuals) format:
**fileNum**, name of genotype data in Numeric format
**filePhe**, name of phenotype file
diff --git a/inst/extdata/05_mvp/mvp.geno.bin b/inst/extdata/05_mvp/mvp.geno.bin
index 400f2cb..3dc46b8 100644
Binary files a/inst/extdata/05_mvp/mvp.geno.bin and b/inst/extdata/05_mvp/mvp.geno.bin differ
diff --git a/inst/extdata/05_mvp/mvp.geno.desc b/inst/extdata/05_mvp/mvp.geno.desc
index f3fb5a5..a84afae 100644
--- a/inst/extdata/05_mvp/mvp.geno.desc
+++ b/inst/extdata/05_mvp/mvp.geno.desc
@@ -1,5 +1,5 @@
new("big.matrix.descriptor", description = list(sharedType = "FileBacked",
- filename = "mvp.geno.bin", dirname = "./", totalRows = 15L,
- totalCols = 10L, rowOffset = c(0, 15), colOffset = c(0, 10
- ), nrow = 15, ncol = 10, rowNames = NULL, colNames = NULL,
+ filename = "mvp.geno.bin", dirname = "05_mvp/", totalRows = 10L,
+ totalCols = 15L, rowOffset = c(0, 10), colOffset = c(0, 15
+ ), nrow = 10, ncol = 15, rowNames = NULL, colNames = NULL,
type = "char", separated = FALSE))
diff --git a/inst/extdata/06_mvp-impute/mvp.imp.geno.bin b/inst/extdata/06_mvp-impute/mvp.imp.geno.bin
index 867362e..d64da2e 100644
Binary files a/inst/extdata/06_mvp-impute/mvp.imp.geno.bin and b/inst/extdata/06_mvp-impute/mvp.imp.geno.bin differ
diff --git a/inst/extdata/06_mvp-impute/mvp.imp.geno.desc b/inst/extdata/06_mvp-impute/mvp.imp.geno.desc
index cd066ba..4ced266 100644
--- a/inst/extdata/06_mvp-impute/mvp.imp.geno.desc
+++ b/inst/extdata/06_mvp-impute/mvp.imp.geno.desc
@@ -1,5 +1,5 @@
new("big.matrix.descriptor", description = list(sharedType = "FileBacked",
- filename = "mvp.imp.geno.bin", dirname = "./", totalRows = 15L,
- totalCols = 10L, rowOffset = c(0, 15), colOffset = c(0, 10
- ), nrow = 15, ncol = 10, rowNames = NULL, colNames = NULL,
+ filename = "mvp.imp.geno.bin", dirname = "06_mvp-impute/",
+ totalRows = 10L, totalCols = 15L, rowOffset = c(0, 10), colOffset = c(0,
+ 15), nrow = 10, ncol = 15, rowNames = NULL, colNames = NULL,
type = "char", separated = FALSE))
diff --git a/man/MVP.Data.Kin.Rd b/man/MVP.Data.Kin.Rd
index bb4172b..186cdd7 100644
--- a/man/MVP.Data.Kin.Rd
+++ b/man/MVP.Data.Kin.Rd
@@ -9,6 +9,7 @@ MVP.Data.Kin(
mvp_prefix = "mvp",
out = NULL,
maxLine = 10000,
+ mrk_bycol = TRUE,
sep = "\\t",
cpu = 1,
verbose = TRUE
@@ -23,6 +24,8 @@ MVP.Data.Kin(
\item{maxLine}{the number of markers handled at a time, smaller value would reduce the memory cost}
+\item{mrk_bycol}{whether the markers are stored by columns in genotype (i.e. genotype is a n by m matrix)}
+
\item{sep}{seperator for Kinship file.}
\item{cpu}{the number of cpu}
diff --git a/man/MVP.Data.PC.Rd b/man/MVP.Data.PC.Rd
index 603d7cf..9a156fd 100644
--- a/man/MVP.Data.PC.Rd
+++ b/man/MVP.Data.PC.Rd
@@ -11,6 +11,7 @@ MVP.Data.PC(
out = NULL,
pcs.keep = 5,
maxLine = 10000,
+ mrk_bycol = TRUE,
sep = "\\t",
cpu = 1,
verbose = TRUE
@@ -29,6 +30,8 @@ MVP.Data.PC(
\item{maxLine}{the number of markers handled at a time, smaller value would reduce the memory cost}
+\item{mrk_bycol}{whether the markers are stored by columns in genotype (i.e. genotype is a n by m matrix)}
+
\item{sep}{seperator for PC file.}
\item{cpu}{the number of cpu}
diff --git a/man/MVP.Data.Rd b/man/MVP.Data.Rd
index c2f722c..999972d 100644
--- a/man/MVP.Data.Rd
+++ b/man/MVP.Data.Rd
@@ -2,10 +2,7 @@
% Please edit documentation in R/MVP.Data.r
\name{MVP.Data}
\alias{MVP.Data}
-\title{MVP.Data: To prepare data for MVP package
-Author: Xiaolei Liu, Lilin Yin and Haohao Zhang
-Build date: Aug 30, 2016
-Last update: Sep 12, 2018}
+\title{MVP.Data: To prepare data for MVP package}
\usage{
MVP.Data(
fileMVP = NULL,
@@ -44,7 +41,7 @@ MVP.Data(
\item{fileBed}{Genotype in PLINK binary format}
-\item{fileNum}{Genotype in numeric format; pure 0, 1, 2 matrix; m * n, m is marker size, n is sample size}
+\item{fileNum}{Genotype in numeric format; pure 0, 1, 2 matrix; m * n or n * m, m is marker size, n is sample size}
\item{fileMap}{SNP map information, there are three columns, including SNP_ID, Chromosome, and Position}
@@ -101,9 +98,6 @@ Requirement: fileHMP, fileBed, and fileNum can not input at the same time
}
\description{
MVP.Data: To prepare data for MVP package
-Author: Xiaolei Liu, Lilin Yin and Haohao Zhang
-Build date: Aug 30, 2016
-Last update: Sep 12, 2018
}
\examples{
\donttest{
diff --git a/man/MVP.Data.impute.Rd b/man/MVP.Data.impute.Rd
index 4d1782c..8244916 100644
--- a/man/MVP.Data.impute.Rd
+++ b/man/MVP.Data.impute.Rd
@@ -9,6 +9,7 @@ Build date: Sep 12, 2018}
MVP.Data.impute(
mvp_prefix,
out = NULL,
+ mrk_bycol = TRUE,
method = "Major",
ncpus = NULL,
verbose = TRUE
@@ -19,6 +20,8 @@ MVP.Data.impute(
\item{out}{the prefix of output file}
+\item{mrk_bycol}{whether the markers are stored by columns in genotype (i.e. genotype is a n by m matrix)}
+
\item{method}{'Major', 'Minor', "Middle"}
\item{ncpus}{number of threads for imputing}
diff --git a/man/MVP.FarmCPU.Rd b/man/MVP.FarmCPU.Rd
index bcb18ea..2be28fd 100755
--- a/man/MVP.FarmCPU.Rd
+++ b/man/MVP.FarmCPU.Rd
@@ -21,6 +21,7 @@ MVP.FarmCPU(
Prior = NULL,
ncpus = 2,
maxLoop = 10,
+ maxLine = 5000,
threshold.output = 0.01,
converge = 1,
iteration.output = FALSE,
@@ -33,7 +34,7 @@ MVP.FarmCPU(
\arguments{
\item{phe}{phenotype, n by t matrix, n is sample size, t is number of phenotypes}
-\item{geno}{genotype, m by n matrix, m is marker size, n is sample size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA.}
+\item{geno}{genotype, either m by n or n by m is supportable, m is marker size, n is population size. This is Pure Genotype Data Matrix(GD). THERE IS NO COLUMN FOR TAXA.}
\item{map}{SNP map information, m by 3 matrix, m is marker size, the three columns are SNP_ID, Chr, and Pos}
@@ -63,6 +64,8 @@ MVP.FarmCPU(
\item{maxLoop}{maximum number of iterations}
+\item{maxLine}{the number of markers handled at a time, smaller value would reduce the memory cost}
+
\item{threshold.output}{only the GWAS results with p-values lower than threshold.output will be output}
\item{converge}{a number, 0 to 1, if selected pseudo QTNs in the last and the second last iterations have a certain probality (the probability is converge) of overlap, the loop will stop}
@@ -94,7 +97,7 @@ phenotype <- phenotype[idx, ]
print(dim(phenotype))
genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP")
genotype <- attach.big.matrix(genoPath)
-genotype <- deepcopy(genotype, cols=idx)
+genotype <- deepcopy(genotype, rows=idx)
print(dim(genotype))
mapPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.map", package = "rMVP")
map <- read.table(mapPath , head = TRUE)
diff --git a/man/MVP.GLM.Rd b/man/MVP.GLM.Rd
index d015b43..f09baed 100755
--- a/man/MVP.GLM.Rd
+++ b/man/MVP.GLM.Rd
@@ -10,6 +10,7 @@ MVP.GLM(
CV = NULL,
ind_idx = NULL,
mrk_idx = NULL,
+ maxLine = 5000,
cpu = 1,
verbose = TRUE
)
@@ -17,7 +18,7 @@ MVP.GLM(
\arguments{
\item{phe}{phenotype, n * 2 matrix}
-\item{geno}{Genotype in numeric format, pure 0, 1, 2 matrix; m * n, m is marker size, n is population size}
+\item{geno}{genotype, either m by n or n by m is supportable, m is marker size, n is population size}
\item{CV}{Covariance, design matrix(n * x) for the fixed effects}
@@ -25,6 +26,8 @@ MVP.GLM(
\item{mrk_idx}{the index of effective markers used in analysis}
+\item{maxLine}{the number of markers handled at a time, smaller value would reduce the memory cost}
+
\item{cpu}{number of cpus used for parallel computation}
\item{verbose}{whether to print detail.}
@@ -45,7 +48,7 @@ phenotype <- phenotype[idx, ]
print(dim(phenotype))
genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP")
genotype <- attach.big.matrix(genoPath)
-genotype <- deepcopy(genotype, cols=idx)
+genotype <- deepcopy(genotype, rows=idx)
print(dim(genotype))
glm <- MVP.GLM(phe=phenotype, geno=genotype, cpu=1)
diff --git a/man/MVP.K.VanRaden.Rd b/man/MVP.K.VanRaden.Rd
index 5badf32..1800398 100755
--- a/man/MVP.K.VanRaden.Rd
+++ b/man/MVP.K.VanRaden.Rd
@@ -9,13 +9,15 @@ MVP.K.VanRaden(
maxLine = 5000,
ind_idx = NULL,
mrk_idx = NULL,
+ mrk_freq = NULL,
+ mrk_bycol = TRUE,
cpu = 1,
verbose = TRUE,
checkNA = TRUE
)
}
\arguments{
-\item{M}{Genotype, m * n, m is marker size, n is population size}
+\item{M}{genotype, either m by n or n by m is supportable, m is marker size, n is population size}
\item{maxLine}{the number of markers handled at a time, smaller value would reduce the memory cost}
@@ -23,6 +25,10 @@ MVP.K.VanRaden(
\item{mrk_idx}{the index of effective markers used in analysis}
+\item{mrk_freq}{the prior calculated major allele frequency (not MAF) for all markers used in analysis}
+
+\item{mrk_bycol}{whether the markers are stored by columns in genotype (i.e. M is a n by m matrix)}
+
\item{cpu}{the number of cpu}
\item{verbose}{whether to print detail.}
@@ -33,8 +39,7 @@ MVP.K.VanRaden(
K, n * n matrix
}
\description{
-Build date: Dec 12, 2016
-Last update: Dec 12, 2019
+Calculate Kinship matrix by VanRaden method
}
\examples{
\donttest{
diff --git a/man/MVP.MLM.Rd b/man/MVP.MLM.Rd
index b5b8be3..5943e99 100755
--- a/man/MVP.MLM.Rd
+++ b/man/MVP.MLM.Rd
@@ -13,6 +13,7 @@ MVP.MLM(
ind_idx = NULL,
mrk_idx = NULL,
REML = NULL,
+ maxLine = 5000,
cpu = 1,
vc.method = c("BRENT", "EMMA", "HE"),
verbose = TRUE
@@ -21,7 +22,7 @@ MVP.MLM(
\arguments{
\item{phe}{phenotype, n * 2 matrix}
-\item{geno}{genotype, m * n, m is marker size, n is population size}
+\item{geno}{genotype, either m by n or n by m is supportable, m is marker size, n is population size}
\item{K}{Kinship, Covariance matrix(n * n) for random effects; must be positive semi-definite}
@@ -35,6 +36,8 @@ MVP.MLM(
\item{REML}{a list that contains ve and vg}
+\item{maxLine}{the number of markers handled at a time, smaller value would reduce the memory cost}
+
\item{cpu}{number of cpus used for parallel computation}
\item{vc.method}{the methods for estimating variance component("emma" or "he" or "brent")}
@@ -45,8 +48,7 @@ MVP.MLM(
results: a m * 2 matrix, the first column is the SNP effect, the second column is the P values
}
\description{
-Build date: Aug 30, 2016
-Last update: Aug 30, 2016
+To perform GWAS with GLM and MLM model and get the P value of SNPs
}
\examples{
\donttest{
@@ -57,7 +59,7 @@ phenotype <- phenotype[idx, ]
print(dim(phenotype))
genoPath <- system.file("extdata", "06_mvp-impute", "mvp.imp.geno.desc", package = "rMVP")
genotype <- attach.big.matrix(genoPath)
-genotype <- deepcopy(genotype, cols=idx)
+genotype <- deepcopy(genotype, rows=idx)
print(dim(genotype))
K <- MVP.K.VanRaden(genotype, cpu=1)
diff --git a/man/MVP.PCA.Rd b/man/MVP.PCA.Rd
index 5e28b97..811b342 100755
--- a/man/MVP.PCA.Rd
+++ b/man/MVP.PCA.Rd
@@ -10,13 +10,14 @@ MVP.PCA(
maxLine = 10000,
ind_idx = NULL,
mrk_idx = NULL,
+ mrk_bycol = TRUE,
pcs.keep = 5,
cpu = 1,
verbose = TRUE
)
}
\arguments{
-\item{M}{Genotype in numeric format, pure 0, 1, 2 matrix; m * n, m is marker size, n is population size}
+\item{M}{genotype, either m by n or n by m is supportable, m is marker size, n is population size}
\item{K}{kinship matrix}
@@ -26,6 +27,8 @@ MVP.PCA(
\item{mrk_idx}{the index of effective markers used in analysis}
+\item{mrk_bycol}{whether the markers are stored by columns in genotype (i.e. M is a n by m matrix)}
+
\item{pcs.keep}{maximum number of PCs for output}
\item{cpu}{the number of cpu}
@@ -36,8 +39,7 @@ MVP.PCA(
Output: PCs - a n * npc matrix of top number of PCs, n is population size and npc is @param pcs.keep
}
\description{
-Build date: Dec 14, 2016
-Last update: Oct 29, 2018
+Principal Component Analysis
}
\examples{
\donttest{
@@ -50,6 +52,3 @@ str(pca)
}
}
-\author{
-Xiaolei Liu, Lilin Yin and Haohao Zhang
-}
diff --git a/man/MVP.Rd b/man/MVP.Rd
index b43661e..e0c737f 100755
--- a/man/MVP.Rd
+++ b/man/MVP.Rd
@@ -43,7 +43,7 @@ MVP(
\arguments{
\item{phe}{phenotype, n * 2 matrix, n is sample size}
-\item{geno}{Genotype in bigmatrix format; m * n, m is marker size, n is sample size}
+\item{geno}{genotype, either m by n or n by m is supportable, m is marker size, n is population size}
\item{map}{SNP map information, SNP name, Chr, Pos}
@@ -91,6 +91,8 @@ MVP(
\item{memo}{Character. A text marker on output files}
+\item{outpath}{the path of the output files}
+
\item{col}{for color of points in each chromosome on manhattan plot}
\item{file.output}{whether to output files or not}
@@ -102,11 +104,8 @@ MVP(
\item{threshold}{a cutoff line on manhattan plot, 0.05/marker size}
\item{verbose}{whether to print detail.}
-
-\item{tmppath}{the path of the temporary file}
}
\value{
-a m * 2 matrix, the first column is the SNP effect, the second column is the P values
Output: MVP.return$map - SNP map information, SNP name, Chr, Pos
Output: MVP.return$glm.results - p-values obtained by GLM method
Output: MVP.return$mlm.results - p-values obtained by MLM method
@@ -118,10 +117,6 @@ Object 2: To calculate kinship among individuals using Varaden method
Object 3: Estimate variance components using EMMA, FaST-LMM, and HE regression
Object 4: Generate high-quality figures
}
-\details{
-Build date: Aug 30, 2017
-Last update: Dec 14, 2018
-}
\examples{
\donttest{
phePath <- system.file("extdata", "07_other", "mvp.phe", package = "rMVP")
diff --git a/man/MVP.Version.Rd b/man/MVP.Version.Rd
index dfebdda..d9f453d 100755
--- a/man/MVP.Version.Rd
+++ b/man/MVP.Version.Rd
@@ -4,7 +4,7 @@
\alias{MVP.Version}
\title{Print MVP Banner}
\usage{
-MVP.Version(width = 60, verbose = TRUE)
+MVP.Version(width = 65, verbose = TRUE)
}
\arguments{
\item{width}{the width of the message}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 5ca50b3..a741d05 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -25,8 +25,8 @@ BEGIN_RCPP
END_RCPP
}
// glm_c
-SEXP glm_c(const arma::vec& y, const arma::mat& X, const arma::mat& iXX, SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, const bool verbose, const int threads);
-RcppExport SEXP _rMVP_glm_c(SEXP ySEXP, SEXP XSEXP, SEXP iXXSEXP, SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP verboseSEXP, SEXP threadsSEXP) {
+SEXP glm_c(const arma::vec& y, const arma::mat& X, const arma::mat& iXX, SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, const int step, const bool verbose, const int threads);
+RcppExport SEXP _rMVP_glm_c(SEXP ySEXP, SEXP XSEXP, SEXP iXXSEXP, SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP stepSEXP, SEXP verboseSEXP, SEXP threadsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
@@ -36,15 +36,16 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP);
Rcpp::traits::input_parameter< const Nullable >::type geno_ind(geno_indSEXP);
Rcpp::traits::input_parameter< const Nullable >::type marker_ind(marker_indSEXP);
+ Rcpp::traits::input_parameter< const int >::type step(stepSEXP);
Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP);
Rcpp::traits::input_parameter< const int >::type threads(threadsSEXP);
- rcpp_result_gen = Rcpp::wrap(glm_c(y, X, iXX, pBigMat, geno_ind, marker_ind, verbose, threads));
+ rcpp_result_gen = Rcpp::wrap(glm_c(y, X, iXX, pBigMat, geno_ind, marker_ind, step, verbose, threads));
return rcpp_result_gen;
END_RCPP
}
// mlm_c
-SEXP mlm_c(const arma::vec& y, const arma::mat& X, const arma::mat& U, const double vgs, SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, const bool verbose, const int threads);
-RcppExport SEXP _rMVP_mlm_c(SEXP ySEXP, SEXP XSEXP, SEXP USEXP, SEXP vgsSEXP, SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP verboseSEXP, SEXP threadsSEXP) {
+SEXP mlm_c(const arma::vec& y, const arma::mat& X, const arma::mat& U, const double vgs, SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, const int step, const bool verbose, const int threads);
+RcppExport SEXP _rMVP_mlm_c(SEXP ySEXP, SEXP XSEXP, SEXP USEXP, SEXP vgsSEXP, SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP stepSEXP, SEXP verboseSEXP, SEXP threadsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
@@ -55,9 +56,10 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP);
Rcpp::traits::input_parameter< const Nullable >::type geno_ind(geno_indSEXP);
Rcpp::traits::input_parameter< const Nullable >::type marker_ind(marker_indSEXP);
+ Rcpp::traits::input_parameter< const int >::type step(stepSEXP);
Rcpp::traits::input_parameter< const bool >::type verbose(verboseSEXP);
Rcpp::traits::input_parameter< const int >::type threads(threadsSEXP);
- rcpp_result_gen = Rcpp::wrap(mlm_c(y, X, U, vgs, pBigMat, geno_ind, marker_ind, verbose, threads));
+ rcpp_result_gen = Rcpp::wrap(mlm_c(y, X, U, vgs, pBigMat, geno_ind, marker_ind, step, verbose, threads));
return rcpp_result_gen;
END_RCPP
}
@@ -126,15 +128,16 @@ BEGIN_RCPP
END_RCPP
}
// write_bfile
-void write_bfile(SEXP pBigMat, std::string bed_file, int threads, bool verbose);
-RcppExport SEXP _rMVP_write_bfile(SEXP pBigMatSEXP, SEXP bed_fileSEXP, SEXP threadsSEXP, SEXP verboseSEXP) {
+void write_bfile(SEXP pBigMat, std::string bed_file, bool mrkbycol, int threads, bool verbose);
+RcppExport SEXP _rMVP_write_bfile(SEXP pBigMatSEXP, SEXP bed_fileSEXP, SEXP mrkbycolSEXP, SEXP threadsSEXP, SEXP verboseSEXP) {
BEGIN_RCPP
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP);
Rcpp::traits::input_parameter< std::string >::type bed_file(bed_fileSEXP);
+ Rcpp::traits::input_parameter< bool >::type mrkbycol(mrkbycolSEXP);
Rcpp::traits::input_parameter< int >::type threads(threadsSEXP);
Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
- write_bfile(pBigMat, bed_file, threads, verbose);
+ write_bfile(pBigMat, bed_file, mrkbycol, threads, verbose);
return R_NilValue;
END_RCPP
}
@@ -194,40 +197,44 @@ BEGIN_RCPP
END_RCPP
}
// impute_marker
-void impute_marker(SEXP pBigMat, int threads, bool verbose);
-RcppExport SEXP _rMVP_impute_marker(SEXP pBigMatSEXP, SEXP threadsSEXP, SEXP verboseSEXP) {
+void impute_marker(SEXP pBigMat, bool mrkbycol, int threads, bool verbose);
+RcppExport SEXP _rMVP_impute_marker(SEXP pBigMatSEXP, SEXP mrkbycolSEXP, SEXP threadsSEXP, SEXP verboseSEXP) {
BEGIN_RCPP
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP);
+ Rcpp::traits::input_parameter< bool >::type mrkbycol(mrkbycolSEXP);
Rcpp::traits::input_parameter< int >::type threads(threadsSEXP);
Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
- impute_marker(pBigMat, threads, verbose);
+ impute_marker(pBigMat, mrkbycol, threads, verbose);
return R_NilValue;
END_RCPP
}
// hasNA
-bool hasNA(SEXP pBigMat, const Nullable geno_ind, const int threads);
-RcppExport SEXP _rMVP_hasNA(SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP threadsSEXP) {
+bool hasNA(SEXP pBigMat, bool mrkbycol, const Nullable geno_ind, const Nullable marker_ind, const int threads);
+RcppExport SEXP _rMVP_hasNA(SEXP pBigMatSEXP, SEXP mrkbycolSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP threadsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP);
+ Rcpp::traits::input_parameter< bool >::type mrkbycol(mrkbycolSEXP);
Rcpp::traits::input_parameter< const Nullable >::type geno_ind(geno_indSEXP);
+ Rcpp::traits::input_parameter< const Nullable >::type marker_ind(marker_indSEXP);
Rcpp::traits::input_parameter< const int >::type threads(threadsSEXP);
- rcpp_result_gen = Rcpp::wrap(hasNA(pBigMat, geno_ind, threads));
+ rcpp_result_gen = Rcpp::wrap(hasNA(pBigMat, mrkbycol, geno_ind, marker_ind, threads));
return rcpp_result_gen;
END_RCPP
}
// BigRowMean
-arma::vec BigRowMean(SEXP pBigMat, int threads, const Nullable geno_ind);
-RcppExport SEXP _rMVP_BigRowMean(SEXP pBigMatSEXP, SEXP threadsSEXP, SEXP geno_indSEXP) {
+arma::vec BigRowMean(SEXP pBigMat, bool mrkbycol, int threads, const Nullable geno_ind);
+RcppExport SEXP _rMVP_BigRowMean(SEXP pBigMatSEXP, SEXP mrkbycolSEXP, SEXP threadsSEXP, SEXP geno_indSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP);
+ Rcpp::traits::input_parameter< bool >::type mrkbycol(mrkbycolSEXP);
Rcpp::traits::input_parameter< int >::type threads(threadsSEXP);
Rcpp::traits::input_parameter< const Nullable >::type geno_ind(geno_indSEXP);
- rcpp_result_gen = Rcpp::wrap(BigRowMean(pBigMat, threads, geno_ind));
+ rcpp_result_gen = Rcpp::wrap(BigRowMean(pBigMat, mrkbycol, threads, geno_ind));
return rcpp_result_gen;
END_RCPP
}
@@ -259,43 +266,45 @@ BEGIN_RCPP
END_RCPP
}
// kin_cal
-SEXP kin_cal(SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, int threads, size_t step, bool mkl, bool verbose);
-RcppExport SEXP _rMVP_kin_cal(SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP threadsSEXP, SEXP stepSEXP, SEXP mklSEXP, SEXP verboseSEXP) {
+SEXP kin_cal(SEXP pBigMat, const Nullable geno_ind, const Nullable marker_ind, const Nullable marker_freq, const bool marker_bycol, int threads, size_t step, bool mkl, bool verbose);
+RcppExport SEXP _rMVP_kin_cal(SEXP pBigMatSEXP, SEXP geno_indSEXP, SEXP marker_indSEXP, SEXP marker_freqSEXP, SEXP marker_bycolSEXP, SEXP threadsSEXP, SEXP stepSEXP, SEXP mklSEXP, SEXP verboseSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type pBigMat(pBigMatSEXP);
Rcpp::traits::input_parameter< const Nullable >::type geno_ind(geno_indSEXP);
Rcpp::traits::input_parameter< const Nullable >::type marker_ind(marker_indSEXP);
+ Rcpp::traits::input_parameter< const Nullable >::type marker_freq(marker_freqSEXP);
+ Rcpp::traits::input_parameter< const bool >::type marker_bycol(marker_bycolSEXP);
Rcpp::traits::input_parameter< int >::type threads(threadsSEXP);
Rcpp::traits::input_parameter< size_t >::type step(stepSEXP);
Rcpp::traits::input_parameter< bool >::type mkl(mklSEXP);
Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
- rcpp_result_gen = Rcpp::wrap(kin_cal(pBigMat, geno_ind, marker_ind, threads, step, mkl, verbose));
+ rcpp_result_gen = Rcpp::wrap(kin_cal(pBigMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose));
return rcpp_result_gen;
END_RCPP
}
static const R_CallMethodDef CallEntries[] = {
{"_rMVP_getRow", (DL_FUNC) &_rMVP_getRow, 2},
- {"_rMVP_glm_c", (DL_FUNC) &_rMVP_glm_c, 8},
- {"_rMVP_mlm_c", (DL_FUNC) &_rMVP_mlm_c, 9},
+ {"_rMVP_glm_c", (DL_FUNC) &_rMVP_glm_c, 9},
+ {"_rMVP_mlm_c", (DL_FUNC) &_rMVP_mlm_c, 10},
{"_rMVP_vcf_parser_map", (DL_FUNC) &_rMVP_vcf_parser_map, 2},
{"_rMVP_vcf_parser_genotype", (DL_FUNC) &_rMVP_vcf_parser_genotype, 5},
{"_rMVP_hapmap_parser_map", (DL_FUNC) &_rMVP_hapmap_parser_map, 2},
{"_rMVP_hapmap_parser_genotype", (DL_FUNC) &_rMVP_hapmap_parser_genotype, 6},
{"_rMVP_numeric_scan", (DL_FUNC) &_rMVP_numeric_scan, 1},
- {"_rMVP_write_bfile", (DL_FUNC) &_rMVP_write_bfile, 4},
+ {"_rMVP_write_bfile", (DL_FUNC) &_rMVP_write_bfile, 5},
{"_rMVP_read_bfile", (DL_FUNC) &_rMVP_read_bfile, 5},
{"_rMVP_fit_diago_brent", (DL_FUNC) &_rMVP_fit_diago_brent, 9},
{"_rMVP_crossprodcpp", (DL_FUNC) &_rMVP_crossprodcpp, 1},
{"_rMVP_geninv", (DL_FUNC) &_rMVP_geninv, 1},
- {"_rMVP_impute_marker", (DL_FUNC) &_rMVP_impute_marker, 3},
- {"_rMVP_hasNA", (DL_FUNC) &_rMVP_hasNA, 3},
- {"_rMVP_BigRowMean", (DL_FUNC) &_rMVP_BigRowMean, 3},
+ {"_rMVP_impute_marker", (DL_FUNC) &_rMVP_impute_marker, 4},
+ {"_rMVP_hasNA", (DL_FUNC) &_rMVP_hasNA, 5},
+ {"_rMVP_BigRowMean", (DL_FUNC) &_rMVP_BigRowMean, 4},
{"_rMVP_kin_cal_m", (DL_FUNC) &_rMVP_kin_cal_m, 3},
{"_rMVP_kin_cal_s", (DL_FUNC) &_rMVP_kin_cal_s, 4},
- {"_rMVP_kin_cal", (DL_FUNC) &_rMVP_kin_cal, 7},
+ {"_rMVP_kin_cal", (DL_FUNC) &_rMVP_kin_cal, 9},
{NULL, NULL, 0}
};
diff --git a/src/assoc.cpp b/src/assoc.cpp
index 7d67bdc..eb34db0 100755
--- a/src/assoc.cpp
+++ b/src/assoc.cpp
@@ -67,19 +67,20 @@ NumericVector getRow(SEXP pBigMat, const int row){
}
template
-SEXP glm_c(const arma::vec &y, const arma::mat &X, const arma::mat & iXX, XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const bool verbose = true, const int threads = 0){
+SEXP glm_c(const arma::vec &y, const arma::mat &X, const arma::mat & iXX, XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int step = 10000, const bool verbose = true, const int threads = 0){
omp_setup(threads);
MatrixAccessor genomat = MatrixAccessor(*pMat);
-
+ bool marker_bycol = (y.n_elem == pMat->nrow());
+
int n;
uvec _geno_ind;
if(geno_ind.isNotNull()){
_geno_ind = as(geno_ind) - 1;
n = _geno_ind.n_elem;
}else{
- n = pMat->ncol();
+ n = marker_bycol ? pMat->nrow() : pMat->ncol();
}
int m;
@@ -88,7 +89,7 @@ SEXP glm_c(const arma::vec &y, const arma::mat &X, const arma::mat & iXX, XPtr(marker_ind) - 1;
m = _marker_ind.n_elem;
}else{
- m = pMat->nrow();
+ m = marker_bycol ? pMat->ncol() : pMat->nrow();
}
int q0 = X.n_cols;
@@ -101,111 +102,174 @@ SEXP glm_c(const arma::vec &y, const arma::mat &X, const arma::mat & iXX, XPtr geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const bool verbose = true, const int threads = 0){
+SEXP glm_c(const arma::vec & y, const arma::mat & X, const arma::mat & iXX, SEXP pBigMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int step = 10000, const bool verbose = true, const int threads = 0){
XPtr xpMat(pBigMat);
switch(xpMat->matrix_type()){
case 1:
- return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, verbose, threads);
+ return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, step, verbose, threads);
case 2:
- return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, verbose, threads);
+ return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, step, verbose, threads);
case 4:
- return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, verbose, threads);
+ return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, step, verbose, threads);
case 8:
- return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, verbose, threads);
+ return glm_c(y, X, iXX, xpMat, geno_ind, marker_ind, step, verbose, threads);
default:
throw Rcpp::exception("unknown type detected for big.matrix object!");
}
}
template
-SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const double vgs, XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const bool verbose = true, const int threads = 0){
+SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const double vgs, XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int step = 10000, const bool verbose = true, const int threads = 0){
omp_setup(threads);
MatrixAccessor genomat = MatrixAccessor(*pMat);
+ bool marker_bycol = (y.n_elem == pMat->nrow());
int n;
uvec _geno_ind;
@@ -213,7 +277,7 @@ SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const
_geno_ind = as(geno_ind) - 1;
n = _geno_ind.n_elem;
}else{
- n = pMat->ncol();
+ n = marker_bycol ? pMat->nrow() : pMat->ncol();
}
int m;
@@ -222,7 +286,7 @@ SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const
_marker_ind = as(marker_ind) - 1;
m = _marker_ind.n_elem;
}else{
- m = pMat->nrow();
+ m = marker_bycol ? pMat->ncol() : pMat->nrow();
}
int q0 = X.n_cols;
@@ -237,79 +301,144 @@ SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const
arma::mat iUXUX = GInv(UX.t() * UX);
arma::mat res(m, 3);
- arma::vec snp(n);
arma::mat iXXs(q0 + 1, q0 + 1);
- #pragma omp parallel for schedule(dynamic) firstprivate(snp, iXXs)
- for(int i = 0; i < m; i++){
+
+ arma::mat Z_buffer(n, step, fill::none);
+ int i = 0, j = 0;
+ int i_marker = 0;
+ for (;i < m;) {
+
+ int cnt = 0;
+ for (; j < m && cnt < step; j++)
+ {
+ cnt++;
+ }
+
+ if (cnt != step) {
+ Z_buffer.set_size(n, cnt);
+ }
if(_geno_ind.is_empty()){
if(_marker_ind.is_empty()){
- for(int ii = 0; ii < n; ii++){
- snp[ii] = genomat[ii][i];
+ if(marker_bycol){
+ #pragma omp parallel for
+ for(int l = 0; l < cnt; l++){
+ for(int k = 0; k < n; k++){
+ Z_buffer(k, l) = genomat[(i_marker + l)][k];
+ }
+ }
+ }else{
+ #pragma omp parallel for
+ for(int k = 0; k < n; k++){
+ for(int l = 0; l < cnt; l++){
+ Z_buffer(k, l) = genomat[k][(i_marker + l)];
+ }
+ }
}
}else{
- for(int ii = 0; ii < n; ii++){
- snp[ii] = genomat[ii][_marker_ind[i]];
+ if(marker_bycol){
+ #pragma omp parallel for
+ for(int l = 0; l < cnt; l++){
+ for(int k = 0; k < n; k++){
+ Z_buffer(k, l) = genomat[_marker_ind[(i_marker + l)]][k];
+ }
+ }
+ }else{
+ #pragma omp parallel for
+ for(int k = 0; k < n; k++){
+ for(int l = 0; l < cnt; l++){
+ Z_buffer(k, l) = genomat[k][_marker_ind[(i_marker + l)]];
+ }
+ }
}
}
}else{
if(_marker_ind.is_empty()){
- for(int ii = 0; ii < n; ii++){
- snp[ii] = genomat[_geno_ind[ii]][i];
+ if(marker_bycol){
+ #pragma omp parallel for
+ for(int l = 0; l < cnt; l++){
+ for(int k = 0; k < n; k++){
+ Z_buffer(k, l) = genomat[(i_marker + l)][_geno_ind[k]];
+ }
+ }
+ }else{
+ #pragma omp parallel for
+ for(int k = 0; k < n; k++){
+ for(int l = 0; l < cnt; l++){
+ Z_buffer(k, l) = genomat[_geno_ind[k]][(i_marker + l)];
+ }
+ }
}
}else{
- for(int ii = 0; ii < n; ii++){
- snp[ii] = genomat[_geno_ind[ii]][_marker_ind[i]];
+ if(marker_bycol){
+ #pragma omp parallel for
+ for(int l = 0; l < cnt; l++){
+ for(int k = 0; k < n; k++){
+ Z_buffer(k, l) = genomat[_marker_ind[(i_marker + l)]][_geno_ind[k]];
+ }
+ }
+ }else{
+ #pragma omp parallel for
+ for(int k = 0; k < n; k++){
+ for(int l = 0; l < cnt; l++){
+ Z_buffer(k, l) = genomat[_geno_ind[k]][_marker_ind[(i_marker + l)]];
+ }
+ }
}
}
}
-
- arma::mat Us = U.t() * snp;
- arma::mat UXUs = UX.t() * Us;
-
- double UsUs = as_scalar(Us.t() * Us);
- double UsUy = as_scalar(Us.t() * Uy);
- double B22 = UsUs - as_scalar(UXUs.t() * iUXUX * UXUs);
- double invB22 = 1 / B22;
- arma::mat B21 = UXUs.t() * iUXUX;
- arma::mat NeginvB22B21 = -1 * invB22 * B21;
-
- iXXs(q0, q0)=invB22;
- iXXs.submat(0, 0, q0 - 1, q0 - 1) = iUXUX + invB22 * B21.t() * B21;
- iXXs(q0, span(0, q0 - 1)) = NeginvB22B21;
- iXXs(span(0, q0 - 1), q0) = NeginvB22B21.t();
-
- // statistics
- arma::mat rhs(UXUy.n_rows + 1, 1);
- rhs.rows(0, UXUy.n_rows - 1) = UXUy;
- rhs(UXUy.n_rows, 0) = UsUy;
- arma::mat beta = iXXs * rhs;
- int df = n - q0 - 1;
-
- res(i, 0) = beta(q0, 0);
- res(i, 1) = sqrt(iXXs(q0, q0) * vgs);
- res(i, 2) = 2 * R::pt(abs(res(i, 0) / res(i, 1)), df, false, false);
- progress.increment();
- }
+ #pragma omp parallel for firstprivate(iXXs)
+ for(int l = 0; l < cnt; l++){
+ arma::mat Us = U.t() * Z_buffer.col(l);
+ arma::mat UXUs = UX.t() * Us;
+ double UsUs = as_scalar(Us.t() * Us);
+ double UsUy = as_scalar(Us.t() * Uy);
+ double B22 = UsUs - as_scalar(UXUs.t() * iUXUX * UXUs);
+ double invB22 = 1 / B22;
+ arma::mat B21 = UXUs.t() * iUXUX;
+ arma::mat NeginvB22B21 = -1 * invB22 * B21;
+
+ iXXs(q0, q0)=invB22;
+ iXXs.submat(0, 0, q0 - 1, q0 - 1) = iUXUX + invB22 * B21.t() * B21;
+ iXXs(q0, span(0, q0 - 1)) = NeginvB22B21;
+ iXXs(span(0, q0 - 1), q0) = NeginvB22B21.t();
+
+ // statistics
+ arma::mat rhs(UXUy.n_rows + 1, 1);
+ rhs.rows(0, UXUy.n_rows - 1) = UXUy;
+ rhs(UXUy.n_rows, 0) = UsUy;
+ arma::mat beta = iXXs * rhs;
+ int df = n - q0 - 1;
+
+ res(i_marker + l, 0) = beta(q0, 0);
+ res(i_marker + l, 1) = sqrt(iXXs(q0, q0) * vgs);
+ res(i_marker + l, 2) = 2 * R::pt(abs(res(i_marker + l, 0) / res(i_marker + l, 1)), df, false, false);
+ }
+
+ i = j;
+ i_marker += cnt;
+ if(!Progress::check_abort()) progress.increment(cnt);
+ }
+ Z_buffer.reset();
return wrap(res);
}
// [[Rcpp::export]]
-SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const double vgs, SEXP pBigMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const bool verbose = true, const int threads = 0){
+SEXP mlm_c(const arma::vec & y, const arma::mat & X, const arma::mat & U, const double vgs, SEXP pBigMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int step = 10000, const bool verbose = true, const int threads = 0){
XPtr xpMat(pBigMat);
switch(xpMat->matrix_type()){
case 1:
- return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, verbose, threads);
+ return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, step, verbose, threads);
case 2:
- return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, verbose, threads);
+ return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, step, verbose, threads);
case 4:
- return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, verbose, threads);
+ return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, step, verbose, threads);
case 8:
- return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, verbose, threads);
+ return mlm_c(y, X, U, vgs, xpMat, geno_ind, marker_ind, step, verbose, threads);
default:
throw Rcpp::exception("unknown type detected for big.matrix object!");
}
diff --git a/src/data_converter.cpp b/src/data_converter.cpp
index 18c886d..ad52aec 100755
--- a/src/data_converter.cpp
+++ b/src/data_converter.cpp
@@ -138,7 +138,7 @@ void vcf_parser_genotype(std::string vcf_file, XPtr pMat, long maxLin
// progress bar
MinimalProgressBar_perc pb;
- Progress progress(pMat->nrow(), verbose, pb);
+ Progress progress(pMat->ncol(), verbose, pb);
// Skip Header
string prefix("#CHROM");
@@ -179,7 +179,7 @@ void vcf_parser_genotype(std::string vcf_file, XPtr pMat, long maxLin
// mat[j][m + i] = markers[j];
// }
for(size_t j = 0; j < (l.size() - 9); j++) {
- mat[j][m + i] = static_cast(vcf_marker_parser(l[j + 9], NA_C));
+ mat[m + i][j] = static_cast(vcf_marker_parser(l[j + 9], NA_C));
}
}
progress.increment(buffer.size());
@@ -339,7 +339,7 @@ void hapmap_parser_genotype(std::string hmp_file, std::vector Major
// progress bar
MinimalProgressBar_perc pb;
- Progress progress(pMat->nrow(), verbose, pb);
+ Progress progress(pMat->ncol(), verbose, pb);
// Skip Header
string prefix("rs#");
@@ -380,7 +380,7 @@ void hapmap_parser_genotype(std::string hmp_file, std::vector Major
Rcpp::stop(("line " + to_string(m+i+2) + " does not have " + to_string(n_col) + " elements in HAPMAP file.").c_str());
major = Major[m + i][0];
for(size_t j = 0; j < (l.size() - 11); j++) {
- mat[j][m + i] = static_cast(hapmap_marker_parser(l[j + 11], major, NA_C));
+ mat[m + i][j] = static_cast(hapmap_marker_parser(l[j + 11], major, NA_C));
}
}
progress.increment(n_marker);
@@ -436,7 +436,7 @@ List numeric_scan(std::string num_file) {
// ***** BFILE *****
template
-void write_bfile(XPtr pMat, std::string bed_file, double NA_C, int threads=0, bool verbose=true) {
+void write_bfile(XPtr pMat, std::string bed_file, double NA_C, bool mrkbycol = true, int threads=0, bool verbose=true) {
// check input
string ending = ".bed";
if (bed_file.length() <= ending.length() ||
@@ -447,10 +447,10 @@ void write_bfile(XPtr pMat, std::string bed_file, double NA_C, int th
// define
T c;
omp_setup(threads);
- int m = pMat->nrow();
- int nind = pMat->ncol();
- int n = pMat->ncol() / 4; // 4 individual = 1 bit
- if (pMat->ncol() % 4 != 0)
+ int m = (mrkbycol ? pMat->ncol() : pMat->nrow());
+ int nind = (mrkbycol ? pMat->nrow() : pMat->ncol());
+ int n = nind / 4; // 4 individual = 1 bit
+ if (nind % 4 != 0)
n++;
vector geno(n);
@@ -460,7 +460,7 @@ void write_bfile(XPtr pMat, std::string bed_file, double NA_C, int th
// progress bar
MinimalProgressBar_perc pb;
- Progress progress(pMat->nrow(), verbose, pb);
+ Progress progress(m, verbose, pb);
// magic number of bfile
const unsigned char magic_bytes[] = { 0x6c, 0x1b, 0x01 };
@@ -474,36 +474,52 @@ void write_bfile(XPtr pMat, std::string bed_file, double NA_C, int th
code[static_cast(NA_C)] = 1;
// write bfile
- for (int i = 0; i < m; i++) {
- #pragma omp parallel for private(c)
- for (int j = 0; j < n; j++) {
- uint8_t p = 0;
- for (int x = 0; x < 4 && (4 * j + x) < nind; x++) {
- c = mat[4 * j + x][i];
- p |= code[c] << (x*2);
+ if(mrkbycol){
+ for (int i = 0; i < m; i++) {
+ #pragma omp parallel for private(c)
+ for (int j = 0; j < n; j++) {
+ uint8_t p = 0;
+ for (int x = 0; x < 4 && (4 * j + x) < nind; x++) {
+ c = mat[i][4 * j + x];
+ p |= code[c] << (x*2);
+ }
+ geno[j] = p;
}
- geno[j] = p;
+ fwrite((char*)geno.data(), 1, geno.size(), fout);
+ progress.increment();
+ }
+ }else{
+ for (int i = 0; i < m; i++) {
+ #pragma omp parallel for private(c)
+ for (int j = 0; j < n; j++) {
+ uint8_t p = 0;
+ for (int x = 0; x < 4 && (4 * j + x) < nind; x++) {
+ c = mat[4 * j + x][i];
+ p |= code[c] << (x*2);
+ }
+ geno[j] = p;
+ }
+ fwrite((char*)geno.data(), 1, geno.size(), fout);
+ progress.increment();
}
- fwrite((char*)geno.data(), 1, geno.size(), fout);
- progress.increment();
}
fclose(fout);
return;
}
// [[Rcpp::export]]
-void write_bfile(SEXP pBigMat, std::string bed_file, int threads=0, bool verbose=true) {
+void write_bfile(SEXP pBigMat, std::string bed_file, bool mrkbycol = true, int threads=0, bool verbose=true) {
XPtr xpMat(pBigMat);
switch(xpMat->matrix_type()) {
case 1:
- return write_bfile(xpMat, bed_file, NA_CHAR, threads, verbose);
+ return write_bfile(xpMat, bed_file, NA_CHAR, mrkbycol, threads, verbose);
case 2:
- return write_bfile(xpMat, bed_file, NA_SHORT, threads, verbose);
+ return write_bfile(xpMat, bed_file, NA_SHORT, mrkbycol, threads, verbose);
case 4:
- return write_bfile(xpMat, bed_file, NA_INTEGER, threads, verbose);
+ return write_bfile(xpMat, bed_file, NA_INTEGER, mrkbycol, threads, verbose);
case 8:
- return write_bfile(xpMat, bed_file, NA_REAL, threads, verbose);
+ return write_bfile(xpMat, bed_file, NA_REAL, mrkbycol, threads, verbose);
default:
throw Rcpp::exception("unknown type detected for big.matrix object!");
}
@@ -518,7 +534,7 @@ void read_bfile(std::string bed_file, XPtr pMat, long maxLine, double
// define
omp_setup(threads);
- size_t ind = pMat->ncol();
+ size_t ind = pMat->nrow();
long n = ind / 4; // 4 individual = 1 bit
if (ind % 4 != 0)
n++;
@@ -572,7 +588,7 @@ void read_bfile(std::string bed_file, XPtr pMat, long maxLine, double
uint8_t p = buffer[j];
for (size_t x = 0; x < 4 && (c + x) < ind; x++) {
- mat[c + x][r] = code[(p >> (2*x)) & 0x03];
+ mat[r][c + x] = code[(p >> (2*x)) & 0x03];
}
}
progress.increment();
diff --git a/src/impute.cpp b/src/impute.cpp
index 7b20ca9..edde606 100755
--- a/src/impute.cpp
+++ b/src/impute.cpp
@@ -23,116 +23,200 @@ using namespace Rcpp;
using namespace arma;
template
-void impute_marker(XPtr pMat, int threads=0, bool verbose=true) {
+void impute_marker(XPtr pMat, bool mrkbycol = true, int threads=0, bool verbose=true) {
omp_setup(threads);
- MinimalProgressBar_perc pb;
- Progress progress(pMat->nrow(), verbose, pb);
-
MatrixAccessor mat = MatrixAccessor(*pMat);
- const size_t n = pMat->ncol();
- const size_t m = pMat->nrow();
+ const size_t n = (mrkbycol ? pMat->nrow() : pMat->ncol());
+ const size_t m = (mrkbycol ? pMat->ncol() : pMat->nrow());
+
+ MinimalProgressBar_perc pb;
+ Progress progress(m, verbose, pb);
// loop marker
- #pragma omp parallel for
- for (size_t i = 0; i < m; i++) {
- std::vector na_index = {};;
- size_t counts[3] = { 0 };
-
- // count allele, record missing index
- for (size_t j = 0; j < n; j++) {
- switch(int(mat[j][i])) {
- case 0: counts[0]++; break;
- case 1: counts[1]++; break;
- case 2: counts[2]++; break;
- default: na_index.push_back(j);
+ if(mrkbycol){
+ #pragma omp parallel for
+ for (size_t i = 0; i < m; i++) {
+ std::vector na_index = {};;
+ size_t counts[3] = { 0 };
+
+ // count allele, record missing index
+ for (size_t j = 0; j < n; j++) {
+ switch(int(mat[i][j])) {
+ case 0: counts[0]++; break;
+ case 1: counts[1]++; break;
+ case 2: counts[2]++; break;
+ default: na_index.push_back(j);
+ }
}
+
+ // find major allele
+ T major = counts[2] > counts[1] ? (counts[2] > counts[0] ? 2 : 0) : (counts[1] > counts[0] ? 1 : 0);
+
+ // impute
+ for (auto&& x: na_index) {
+ mat[i][x] = major;
+ }
+ progress.increment();
}
-
- // find major allele
- T major = counts[2] > counts[1] ? (counts[2] > counts[0] ? 2 : 0) : (counts[1] > counts[0] ? 1 : 0);
-
- // impute
- for (auto&& x: na_index) {
- mat[x][i] = major;
+ }else{
+ #pragma omp parallel for
+ for (size_t i = 0; i < m; i++) {
+ std::vector na_index = {};;
+ size_t counts[3] = { 0 };
+
+ // count allele, record missing index
+ for (size_t j = 0; j < n; j++) {
+ switch(int(mat[j][i])) {
+ case 0: counts[0]++; break;
+ case 1: counts[1]++; break;
+ case 2: counts[2]++; break;
+ default: na_index.push_back(j);
+ }
+ }
+
+ // find major allele
+ T major = counts[2] > counts[1] ? (counts[2] > counts[0] ? 2 : 0) : (counts[1] > counts[0] ? 1 : 0);
+
+ // impute
+ for (auto&& x: na_index) {
+ mat[x][i] = major;
+ }
+ progress.increment();
}
- progress.increment();
}
}
// [[Rcpp::export]]
-void impute_marker(SEXP pBigMat, int threads=0, bool verbose=true) {
+void impute_marker(SEXP pBigMat, bool mrkbycol = true, int threads=0, bool verbose=true) {
XPtr xpMat(pBigMat);
switch(xpMat->matrix_type()) {
case 1:
- return impute_marker(xpMat, threads, verbose);
+ return impute_marker(xpMat, mrkbycol, threads, verbose);
case 2:
- return impute_marker(xpMat, threads, verbose);
+ return impute_marker(xpMat, mrkbycol, threads, verbose);
case 4:
- return impute_marker(xpMat, threads, verbose);
+ return impute_marker(xpMat, mrkbycol, threads, verbose);
case 8:
- return impute_marker(xpMat, threads, verbose);
+ return impute_marker(xpMat, mrkbycol, threads, verbose);
default:
throw Rcpp::exception("unknown type detected for big.matrix object!");
}
}
template
-bool hasNA(XPtr pMat, double NA_C, const Nullable geno_ind=R_NilValue, const int threads=1) {
+bool hasNA(XPtr pMat, double NA_C, bool mrkbycol = true, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int threads = 1) {
omp_setup(threads);
- size_t m = pMat->nrow();
- size_t n;
-
- uvec _geno_ind;
- if(geno_ind.isNotNull()){
- _geno_ind = as(geno_ind) - 1;
- n = _geno_ind.n_elem;
- }else{
- n = pMat->ncol();
- }
bool HasNA = false;
MatrixAccessor mat = MatrixAccessor(*pMat);
- if(_geno_ind.is_empty()){
- #pragma omp parallel for shared(HasNA)
- for (size_t j = 0; j < n; j++) {
- if(HasNA) continue;
- for (size_t i = 0; i < m; i++) {
- if (mat[j][i] == NA_C) {
- HasNA = true;
+ if(geno_ind.isNotNull()){
+ uvec _geno_ind = as(geno_ind) - 1;
+ int n = _geno_ind.n_elem;
+ if(marker_ind.isNotNull()){
+ uvec _marker_ind = as(marker_ind) - 1;
+ int m = _marker_ind.n_elem;
+ if(mrkbycol){
+ #pragma omp parallel for shared(HasNA)
+ for (int j = 0; j < m; j++) {
+ if(HasNA) continue;
+ for (int i = 0; i < n; i++) {
+ if (mat[_marker_ind[j]][_geno_ind[i]] == NA_C) {
+ HasNA = true;
+ }
+ }
+ }
+ }else{
+ #pragma omp parallel for shared(HasNA)
+ for (int j = 0; j < n; j++) {
+ if(HasNA) continue;
+ for (int i = 0; i < m; i++) {
+ if (mat[_geno_ind[j]][_marker_ind[i]] == NA_C) {
+ HasNA = true;
+ }
+ }
+ }
+ }
+ }else{
+ if(mrkbycol){
+ #pragma omp parallel for shared(HasNA)
+ for (int j = 0; j < pMat->ncol(); j++) {
+ if(HasNA) continue;
+ for (int i = 0; i < n; i++) {
+ if (mat[j][_geno_ind[i]] == NA_C) {
+ HasNA = true;
+ }
+ }
+ }
+ }else{
+ #pragma omp parallel for shared(HasNA)
+ for (int j = 0; j < n; j++) {
+ if(HasNA) continue;
+ for (int i = 0; i < pMat->nrow(); i++) {
+ if (mat[_geno_ind[j]][i] == NA_C) {
+ HasNA = true;
+ }
+ }
}
}
}
}else{
- #pragma omp parallel for shared(HasNA)
- for (size_t j = 0; j < n; j++) {
- if(HasNA) continue;
- for (size_t i = 0; i < m; i++) {
- if (mat[_geno_ind[j]][i] == NA_C) {
- HasNA = true;
+ if(marker_ind.isNotNull()){
+ uvec _marker_ind = as(marker_ind) - 1;
+ int m = _marker_ind.n_elem;
+ if(mrkbycol){
+ #pragma omp parallel for shared(HasNA)
+ for (int j = 0; j < m; j++) {
+ if(HasNA) continue;
+ for (int i = 0; i < pMat->nrow(); i++) {
+ if (mat[_marker_ind[j]][i] == NA_C) {
+ HasNA = true;
+ }
+ }
+ }
+ }else{
+ #pragma omp parallel for shared(HasNA)
+ for (int j = 0; j < pMat->ncol(); j++) {
+ if(HasNA) continue;
+ for (int i = 0; i < m; i++) {
+ if (mat[j][_marker_ind[i]] == NA_C) {
+ HasNA = true;
+ }
+ }
+ }
+ }
+ }else{
+ #pragma omp parallel for shared(HasNA)
+ for (int j = 0; j < pMat->ncol(); j++) {
+ if(HasNA) continue;
+ for (int i = 0; i < pMat->nrow(); i++) {
+ if (mat[j][i] == NA_C) {
+ HasNA = true;
+ }
}
}
}
}
+
return HasNA;
}
// [[Rcpp::export]]
-bool hasNA(SEXP pBigMat, const Nullable geno_ind = R_NilValue, const int threads=1) {
+bool hasNA(SEXP pBigMat, bool mrkbycol = true, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const int threads=1) {
XPtr xpMat(pBigMat);
switch(xpMat->matrix_type()) {
case 1:
- return hasNA(xpMat, NA_CHAR, geno_ind, threads);
+ return hasNA(xpMat, NA_CHAR, mrkbycol, geno_ind, marker_ind, threads);
case 2:
- return hasNA(xpMat, NA_SHORT, geno_ind, threads);
+ return hasNA(xpMat, NA_SHORT, mrkbycol, geno_ind, marker_ind, threads);
case 4:
- return hasNA(xpMat, NA_INTEGER, geno_ind, threads);
+ return hasNA(xpMat, NA_INTEGER, mrkbycol, geno_ind, marker_ind, threads);
case 8:
- return hasNA(xpMat, NA_REAL, geno_ind, threads);
+ return hasNA(xpMat, NA_REAL, mrkbycol, geno_ind, marker_ind, threads);
default:
throw Rcpp::exception("unknown type detected for big.matrix object!");
}
diff --git a/src/kinship.cpp b/src/kinship.cpp
index 652ebbc..f993274 100755
--- a/src/kinship.cpp
+++ b/src/kinship.cpp
@@ -14,13 +14,13 @@ using namespace Rcpp;
using namespace arma;
template
-arma::vec BigRowMean(XPtr pMat, int threads = 0, const Nullable geno_ind = R_NilValue){
+arma::vec BigRowMean(XPtr pMat, bool mrkbycol = true, int threads = 0, const Nullable geno_ind = R_NilValue){
omp_setup(threads);
MatrixAccessor bigm = MatrixAccessor(*pMat);
int n;
- int m = pMat->nrow();
+ int m = mrkbycol ? pMat->ncol() : pMat->nrow();
arma::vec mean(m);
uvec _geno_ind;
@@ -28,26 +28,48 @@ arma::vec BigRowMean(XPtr pMat, int threads = 0, const Nullable(geno_ind) - 1;
n = _geno_ind.n_elem;
}else{
- n = pMat->ncol();
+ n = mrkbycol ? pMat->nrow() : pMat->ncol();
}
- if(_geno_ind.is_empty()){
- #pragma omp parallel for
- for (int j = 0; j < m; j++){
- double p1 = 0.0;
- for(int k = 0; k < n; k++){
- p1 += bigm[k][j];
+ if(mrkbycol){
+ if(_geno_ind.is_empty()){
+ #pragma omp parallel for
+ for (int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[j][k];
+ }
+ mean[j] = p1 / n;
+ }
+ }else{
+ #pragma omp parallel for
+ for (int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[j][_geno_ind[k]];
+ }
+ mean[j] = p1 / n;
}
- mean[j] = p1 / n;
}
}else{
- #pragma omp parallel for
- for (int j = 0; j < m; j++){
- double p1 = 0.0;
- for(int k = 0; k < n; k++){
- p1 += bigm[_geno_ind[k]][j];
+ if(_geno_ind.is_empty()){
+ #pragma omp parallel for
+ for (int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[k][j];
+ }
+ mean[j] = p1 / n;
+ }
+ }else{
+ #pragma omp parallel for
+ for (int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[_geno_ind[k]][j];
+ }
+ mean[j] = p1 / n;
}
- mean[j] = p1 / n;
}
}
@@ -55,19 +77,19 @@ arma::vec BigRowMean(XPtr pMat, int threads = 0, const Nullable geno_ind = R_NilValue){
+arma::vec BigRowMean(SEXP pBigMat, bool mrkbycol = true, int threads = 0, const Nullable geno_ind = R_NilValue){
XPtr xpMat(pBigMat);
switch(xpMat->matrix_type()) {
case 1:
- return BigRowMean(xpMat, threads, geno_ind);
+ return BigRowMean(xpMat, mrkbycol, threads, geno_ind);
case 2:
- return BigRowMean(xpMat, threads, geno_ind);
+ return BigRowMean(xpMat, mrkbycol, threads, geno_ind);
case 4:
- return BigRowMean(xpMat, threads, geno_ind);
+ return BigRowMean(xpMat, mrkbycol, threads, geno_ind);
case 8:
- return BigRowMean(xpMat, threads, geno_ind);
+ return BigRowMean(xpMat, mrkbycol, threads, geno_ind);
default:
throw Rcpp::exception("unknown type detected for big.matrix object!");
}
@@ -228,7 +250,7 @@ SEXP kin_cal_s(SEXP pBigMat, int threads = 0, bool mkl = false, bool verbose = t
template
-SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, int threads = 0, int step = 5000, bool mkl = false, bool verbose = true){
+SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const Nullable marker_freq = R_NilValue, const bool marker_bycol = true, int threads = 0, int step = 5000, bool mkl = false, bool verbose = true){
omp_setup(threads);
@@ -248,7 +270,7 @@ SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilVa
_geno_ind = as(geno_ind) - 1;
n = _geno_ind.n_elem;
}else{
- n = pMat->ncol();
+ n = marker_bycol ? pMat->nrow() : pMat->ncol();
}
int m;
@@ -257,52 +279,102 @@ SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilVa
_marker_ind = as(marker_ind) - 1;
m = _marker_ind.n_elem;
}else{
- m = pMat->nrow();
+ m = marker_bycol ? pMat->ncol() : pMat->nrow();
}
- arma::vec means(m);
- if(_geno_ind.is_empty()){
- if(_marker_ind.is_empty()){
- #pragma omp parallel for
- for(int j = 0; j < m; j++){
- double p1 = 0.0;
- for(int k = 0; k < n; k++){
- p1 += bigm[k][j];
- }
- means[j] = p1 / n;
- }
- }else{
- #pragma omp parallel for
- for(int j = 0; j < m; j++){
- double p1 = 0.0;
- for(int k = 0; k < n; k++){
- p1 += bigm[k][_marker_ind[j]];
- }
- means[j] = p1 / n;
- }
- }
+ vec means;
+ if(marker_freq.isNotNull()){
+ means = as(marker_freq) * 2;
}else{
- if(_marker_ind.is_empty()){
- #pragma omp parallel for
- for(int j = 0; j < m; j++){
- double p1 = 0.0;
- for(int k = 0; k < n; k++){
- p1 += bigm[_geno_ind[k]][j];
+ means.resize(m);
+ if(_geno_ind.is_empty()){
+ if(_marker_ind.is_empty()){
+ if(marker_bycol){
+ #pragma omp parallel for
+ for(int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[j][k];
+ }
+ means[j] = p1 / n;
+ }
+ }else{
+ #pragma omp parallel for
+ for(int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[k][j];
+ }
+ means[j] = p1 / n;
+ }
+ }
+ }else{
+ if(marker_bycol){
+ #pragma omp parallel for
+ for(int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[_marker_ind[j]][k];
+ }
+ means[j] = p1 / n;
+ }
+ }else{
+ #pragma omp parallel for
+ for(int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[k][_marker_ind[j]];
+ }
+ means[j] = p1 / n;
+ }
}
- means[j] = p1 / n;
}
}else{
- #pragma omp parallel for
- for(int j = 0; j < m; j++){
- double p1 = 0.0;
- for(int k = 0; k < n; k++){
- p1 += bigm[_geno_ind[k]][_marker_ind[j]];
+ if(_marker_ind.is_empty()){
+ if(marker_bycol){
+ #pragma omp parallel for
+ for(int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[j][_geno_ind[k]];
+ }
+ means[j] = p1 / n;
+ }
+ }else{
+ #pragma omp parallel for
+ for(int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[_geno_ind[k]][j];
+ }
+ means[j] = p1 / n;
+ }
+ }
+ }else{
+ if(marker_bycol){
+ #pragma omp parallel for
+ for(int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[_marker_ind[j]][_geno_ind[k]];
+ }
+ means[j] = p1 / n;
+ }
+ }else{
+ #pragma omp parallel for
+ for(int j = 0; j < m; j++){
+ double p1 = 0.0;
+ for(int k = 0; k < n; k++){
+ p1 += bigm[_geno_ind[k]][_marker_ind[j]];
+ }
+ means[j] = p1 / n;
+ }
}
- means[j] = p1 / n;
}
}
+ if(means.has_nan()) throw Rcpp::exception("NA is not allowed in genotype, use 'MVP.Data.impute' to impute!");
}
-
+
arma::mat kin = zeros(n, n);
arma::mat Z_buffer(step, n, fill::none);
@@ -325,33 +397,69 @@ SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilVa
if(_geno_ind.is_empty()){
if(_marker_ind.is_empty()){
- #pragma omp parallel for
- for(int k = 0; k < n; k++){
+ if(marker_bycol){
+ #pragma omp parallel for
for(int l = 0; l < cnt; l++){
- Z_buffer(l, k) = bigm[k][(i_marker + l)] - means[(i_marker + l)];
+ for(int k = 0; k < n; k++){
+ Z_buffer(l, k) = bigm[(i_marker + l)][k] - means[(i_marker + l)];
+ }
+ }
+ }else{
+ #pragma omp parallel for
+ for(int k = 0; k < n; k++){
+ for(int l = 0; l < cnt; l++){
+ Z_buffer(l, k) = bigm[k][(i_marker + l)] - means[(i_marker + l)];
+ }
}
}
}else{
- #pragma omp parallel for
- for(int k = 0; k < n; k++){
+ if(marker_bycol){
+ #pragma omp parallel for
for(int l = 0; l < cnt; l++){
- Z_buffer(l, k) = bigm[k][_marker_ind[(i_marker + l)]] - means[(i_marker + l)];
+ for(int k = 0; k < n; k++){
+ Z_buffer(l, k) = bigm[_marker_ind[(i_marker + l)]][k] - means[(i_marker + l)];
+ }
+ }
+ }else{
+ #pragma omp parallel for
+ for(int k = 0; k < n; k++){
+ for(int l = 0; l < cnt; l++){
+ Z_buffer(l, k) = bigm[k][_marker_ind[(i_marker + l)]] - means[(i_marker + l)];
+ }
}
}
}
}else{
if(_marker_ind.is_empty()){
- #pragma omp parallel for
- for(int k = 0; k < n; k++){
+ if(marker_bycol){
+ #pragma omp parallel for
for(int l = 0; l < cnt; l++){
- Z_buffer(l, k) = bigm[_geno_ind[k]][(i_marker + l)] - means[(i_marker + l)];
+ for(int k = 0; k < n; k++){
+ Z_buffer(l, k) = bigm[(i_marker + l)][_geno_ind[k]] - means[(i_marker + l)];
+ }
+ }
+ }else{
+ #pragma omp parallel for
+ for(int k = 0; k < n; k++){
+ for(int l = 0; l < cnt; l++){
+ Z_buffer(l, k) = bigm[_geno_ind[k]][(i_marker + l)] - means[(i_marker + l)];
+ }
}
}
}else{
- #pragma omp parallel for
- for(int k = 0; k < n; k++){
+ if(marker_bycol){
+ #pragma omp parallel for
for(int l = 0; l < cnt; l++){
- Z_buffer(l, k) = bigm[_geno_ind[k]][_marker_ind[(i_marker + l)]] - means[(i_marker + l)];
+ for(int k = 0; k < n; k++){
+ Z_buffer(l, k) = bigm[_marker_ind[(i_marker + l)]][_geno_ind[k]] - means[(i_marker + l)];
+ }
+ }
+ }else{
+ #pragma omp parallel for
+ for(int k = 0; k < n; k++){
+ for(int l = 0; l < cnt; l++){
+ Z_buffer(l, k) = bigm[_geno_ind[k]][_marker_ind[(i_marker + l)]] - means[(i_marker + l)];
+ }
}
}
}
@@ -391,19 +499,19 @@ SEXP kin_cal(XPtr pMat, const Nullable geno_ind = R_NilVa
}
// [[Rcpp::export]]
-SEXP kin_cal(SEXP pBigMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, int threads = 0, size_t step = 10000, bool mkl = false, bool verbose = true){
+SEXP kin_cal(SEXP pBigMat, const Nullable geno_ind = R_NilValue, const Nullable marker_ind = R_NilValue, const Nullable marker_freq = R_NilValue, const bool marker_bycol = true, int threads = 0, size_t step = 10000, bool mkl = false, bool verbose = true){
XPtr xpMat(pBigMat);
switch(xpMat->matrix_type()){
case 1:
- return kin_cal(xpMat, geno_ind, marker_ind, threads, step, mkl, verbose);
+ return kin_cal(xpMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose);
case 2:
- return kin_cal(xpMat, geno_ind, marker_ind, threads, step, mkl, verbose);
+ return kin_cal(xpMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose);
case 4:
- return kin_cal(xpMat, geno_ind, marker_ind, threads, step, mkl, verbose);
+ return kin_cal(xpMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose);
case 8:
- return kin_cal(xpMat, geno_ind, marker_ind, threads, step, mkl, verbose);
+ return kin_cal(xpMat, geno_ind, marker_ind, marker_freq, marker_bycol, threads, step, mkl, verbose);
default:
throw Rcpp::exception("unknown type detected for big.matrix object!");
}
diff --git a/src/mvp_omp.h b/src/mvp_omp.h
index 8eb9c99..de1d07f 100755
--- a/src/mvp_omp.h
+++ b/src/mvp_omp.h
@@ -17,7 +17,6 @@
#if defined(_OPENMP)
#include
-#else
#endif
#include