Change header to 8x bytes

Such that Mmap will work.
xijiang · Jan 17, 2023 · 4d717bd · 4d717bd
1 parent 94f6708
commit 4d717bd
Show file tree

Hide file tree

Showing 6 changed files with 347 additions and 4 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "GeneMatrix"
 uuid = "b017c6bb-e7c7-4437-8ee6-a27b72ea26ab"
 authors = ["Xijiang Yu <[email protected]> and contributors"]
-version = "0.1.6"
+version = "0.1.7"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"

diff --git a/src/header.jl b/src/header.jl
@@ -18,6 +18,15 @@ struct xyheader
     n::Int64                    # ncol, seek(_, 15)
 end
 
+"""
+For two ID pairs in a pedigree.
+Using `Int32` should be enough for a pedigree.
+"""
+struct iPair
+    i::Int32
+    j::Int32
+end
+
 """
     function mkhdr(nrow::Int64, ncol::Int64; mattp = 'F', trans = 'N', type = Int8)
 Make a header for storage of a matrix in `XY` format.

diff --git a/src/kinship.jl b/src/kinship.jl
@@ -16,7 +16,6 @@ function kinship(ped, i, j)
     return .5(kinship(ped, j, ipa) + kinship(ped, j, ima))
 end
 
-#=
 """
     function kinship(ped, i::Int, j::Int, dic::Dict{Tuple{Int, Int}, Float64})
 Recursive kinship calculation with kinship of ID pair `(i, j)`
@@ -25,7 +24,7 @@ The memory usage may be bigger than Meuwissen and Luo 1992, or Quaas 1995.
 The speed is however standable.
 The recursive algorithm is also easy to understand.
 """
-function kinship(ped, i::Int, j::Int, dic::Dict{Tuple{Int, Int}, Float64})
+function kinship(ped, i::Int, j::Int, dic::Dict{iPair, Float64})
     (i == 0 || j == 0) && return 0
     ip, im = ped[i, :]
     if i == j
@@ -41,6 +40,7 @@ function kinship(ped, i::Int, j::Int, dic::Dict{Tuple{Int, Int}, Float64})
     return .5(kinship(ped, j, ip, dic) + kinship(ped, j, im, dic))
 end
 
+#=
 """
     function ped_F(ped; force = false, mdc = 1_000_000)
 - When column `:F` is not in DataFrame `ped`, this function calculate

diff --git a/src/mate.jl b/src/mate.jl
@@ -0,0 +1,316 @@
+"""
+    function random_mate(nsire::Int, ndam::Int)
+Random pair sire `1:nsire` and dam `nsire+1:nsire+ndam-1`.
+Returns a sorted two-column matrix, `[sires dams]`.
+"""
+function random_mate(nsire::Int, ndam::Int)
+    nf = max(nsire, ndam)
+    ma = shuffle(1:ndam)
+    while length(ma) < nf
+        append!(ma, shuffle(1:ndam))
+    end
+    pa = collect(1:nsire)
+    while length(pa) <nf
+        append!(pa, shuffle(1:nsire))
+    end
+    sortslices([pa[1:nf] ma[1:nf].+nsire], dims=1, by=x->(x[1], x[2]))
+end
+
+"""
+    function random_mate(sires, dams)
+Given names listed in `sires` and `dams`, this function randomly match
+them, such that every ID is used as much as possible.  Returns a
+sorted two columns name matrix.
+"""
+function random_mate(sires, dams)
+    nf = max(length(sires), length(dams))
+    ma = shuffle(dams)
+    while length(ma) < nf
+        append!(ma, shuffle(dams))
+    end
+    pa = shuffle(sires)
+    while length(pa) < nf
+        append!(pa, shuffle(sires))
+    end
+    sortslices([pa[1:nf] ma[1:nf]], dims=1, by=x->(x[1], x[2]))
+end
+
+"""
+    function summap(lmp; cM = 1e6)
+Summary of a linkage map, which has 3 columns, :chr, :pos(in bp), and :frq.
+A DataFrame of 4-column for each chromosome is returned:
+- numbering (1, 2, ...)
+- length in Morgen
+- number of loci
+- beginning number of the first locus
+"""
+function summap(lmp; cM = 1e6)
+    df = DataFrame(chr = Int8[],
+                   len = Float64[], # as λ
+                   nlc = Int[],
+                   bgn = Int[])
+    bgn = 1
+    for grp in groupby(lmp, :chr)
+        chr = first(grp).chr
+        len = (last(grp).pos - first(grp).pos) / cM / 100
+        nlc = nrow(grp)
+        push!(df, (chr, len, nlc, bgn))
+        bgn += nlc
+    end
+    return df
+end
+
+"""
+    function crossovers(lms)
+Give a linkage map summary `lms`, which can be from `summap`, this
+function return a vector of crossover points along the whole genome.
+
+DataFrame `lms` has for columns, chromosome number, its length in
+Morgen, number of loci it contains, and the beginning number of its
+first locus in the whole genome.
+
+The first number is `1`, or `2`, the starting haplotype.  The vector
+is then start from locus `1`, and loci that cross-over happens.  A
+"cross-over" may also happen at the first locus of a chromsome.
+Otherwise, the segment continues with the previous haplotype.  This
+eases the segment copy.  At the beginning of a "cross-over" happens on
+`rand(false:true)`.  The number of cross-over occurred on a chromosome
+follows a Poisson distribution.
+
+The cross-over location is then sampled from a uniform distribution.
+It can can also be U-shaped abouth centromere, which might be
+implemented later.
+
+The chromosome, or linkage group, should be in the order of the
+genotype file.  Or, unpredicted errors can be resulted.
+
+## Caution!
+This is better for uniformly distributed many SNP.  In the future, I
+will take into account the different location of SNP, hotspots etc.
+"""
+function crossovers(lms)
+    pts = [rand(1:2)]           # crossover points
+    for (_, λ, nlc, bgn) in eachrow(lms)
+        bgn > 1 && rand(false:true) && push!(pts, bgn)
+        nc = rand(Poisson(λ))
+        append!(pts, rand(1:nlc, nc) .+ (bgn - 1))
+    end
+    push!(pts, last(lms).nlc - 1 + last(lms.bgn))
+    pts
+end
+
+"""
+    function gamete(prt, hap, lms)
+Generate a gamete `hap`, a **vector** view of a child's haplotype,
+from `prt`, a view of a parents genotypes,
+according crossovers generated from a linkage map summary, `lms`.
+"""
+function gamete(prt, hap, lms)
+    cvs = crossovers(lms)
+    h, l = cvs[1], 1            # starting
+    for cv in cvs[2:end]
+        copyto!(view(hap, l:cv), view(prt, l:cv, h))
+        l = cv + 1
+        h = 3 - h
+    end
+end
+
+"""
+    function drop(pg::Matrix{Int8}, og::Matrix{Int8}, pm, lms)
+Drop haplotypes `pg` of parents into `og`, their offspring genotypes.
+Parents of each offspring are defined in `pm`, which are rows of ``pa ma``.
+Linkage map summary `lms` is from `summap`.
+
+!!! ``Caution``:
+- Merged data matrix from `MaCS` is `n-ID × n-loci`. Treat it with care.
+- It is supposed all the genes to drop from are in `pg`.
+- This function will be removed in the future.
+"""
+function drop(pg::AbstractArray, og::Matrix{Int8}, pm, lms)
+    nf = size(pm)[1]
+    Threads.@threads for id in 1:nf
+        ip = pm[id, 1]
+        pa = view(pg, :, 2ip-1:2ip)
+        zi = vec(view(og, :, 2id - 1))
+        gamete(pa, zi, lms)
+    end
+    Threads.@threads for id in 1:nf
+        im = pm[id, 2]
+        ma = view(pg, :, 2im-1:2im)
+        zi = vec(view(og, :, 2id))
+        gamete(ma, zi, lms)
+    end
+end
+
+"""
+    function drop(pg::AbstractArray, pm, lms)
+Drop genotyeps in `pg` of ``n-loci × n-Haplotypes`` into a new matrix
+`og`, according to sire and dam info in `pm` (``pa ma`` of n-ID rows),
+and linkage summary `lms`.  The new matrix `og` is returned.
+"""
+function drop(pg::AbstractArray, pm, lms)
+    nid, nlc = size(pm, 1), size(pg, 1)
+    og = zeros(Int8, nlc, nid * 2)
+
+    Threads.@threads for id in 1:nid
+        ip = pm[id, 1]
+        pa = view(pg, :, 2ip-1:2ip)
+        zi = vec(view(og, :, 2id - 1))
+        gamete(pa, zi, lms)
+    end
+    Threads.@threads for id in 1:nid
+        im = pm[id, 2]
+        ma = view(pg, :, 2im-1:2im)
+        zi = vec(view(og, :, 2id))
+        gamete(ma, zi, lms)
+    end
+    og
+end
+
+"""
+    function drop_by_chr(fph::String, bar::String, pm, lms; merge = true)
+Drop haplotypes of parents in `fph` into `bar-{1..nchr}.bin`,
+their offspring genotypes.
+Matrix in `fph` should be of dimension `nLoci × nHap`.
+Parents of each offspring are defined in `pm`, which are rows of ``pa ma``.
+Linkage map summary `lms` is from `summap` of module ``Sim``.
+
+When `merge = true`, the dropped haplotypes will be merged into genotypes.
+It is also transposed before written to a file.
+"""
+function drop_by_chr(fph::String, bar::String, pm, lms; merge = false)
+    nlc, nhp = Fio.readdim(fph)
+    nof = size(pm)[1]
+    mph = Mmap.mmap(fph, Matrix{Int8}, (nlc, nhp), 24)
+
+    tprintln("    - Dropping on chromosome:")
+    for (chr, len, cln, fra) in eachrow(lms)
+        tprint(" $chr")
+        til = cln + fra - 1
+        oh = zeros(Int8, cln, 2nof)
+        ph = copy(mph[fra:til, :])
+        drop(ph, oh, pm, DataFrame(chr=chr, len=len, nlc=cln, bgn=1))
+        if merge
+            open("$bar-$chr.bin", "w") do io
+                # write(io, [cln, nof, Fio.typec(Int8)])
+                # for i in 1:nof
+                #     write(io, oh[:, 2i-1] + oh[:, 2i])
+                # end
+                write(io, [nof, cln, Fio.typec(Int8)])
+                for i in 1:cln
+                    write(io, oh[i, 1:2:2nof] + oh[i, 2:2:2nof])
+                end
+            end
+        else
+            Fio.writemat("$bar-$chr.bin", oh)
+        end
+    end
+    println()
+end
+
+"""
+    function reproduce()
+Give a matrix `haps` of haplotypes of nHap (= 2nID) by nLoc, and a
+pedigree `ped`, (`[ID pa ma]`) this function drop the parents
+haplotype according to the crossovers generated on linkage map summary
+`lms`.
+"""
+function reproduce(haps, ped, lms)
+    for (id, ip, im) in eachrow(ped)
+        pa = view(haps, 2ip-1:2ip, :)
+        zi = vec(view(haps, 2id-1, :)) # offspring
+        gamete(pa, zi, lms)
+        ma = view(haps, 2im-1:2im, :)
+        zi = vec(view(haps, 2id, :))
+        gamete(ma, zi, lms)
+    end
+end
+
+"""
+    function collect_gt(bar, lms, loci; rev = false)
+In case when genotypes are stored in different file names started with
+`bar` by chromosomes, This function collect the genotypes on `loci` in
+total genome order.  Linkage map summary `lms` which file to find
+according to number in `loci`.
+
+Note, the genotypes are of `nid × nlc` of each chromosome.  When `rev
+= true`, the genotypes are collected in reverse order of chrosome
+numbers.
+"""
+function collect_gt(bar, lms, loci; rev = false)
+    nid, _ = Fio.readdim("$bar-$(lms.chr[1]).bin")
+    # below commented codes are for matrix of `nlc × nid`.
+    #_, nid = Fio.readdim("$bar-$(lms.chr[1]).bin")
+    gt = zeros(Int8, length(loci), nid)
+    i, _lms = rev ? (length(loci), reverse(lms)) : (0, lms)
+    for (chr, _, nlc, fra) in eachrow(_lms)
+        chunk = intersect(fra:fra+nlc-1, loci) .+ 1 .- fra
+        slc = length(chunk)     # number of selected loci
+        rev && (i -= slc)
+        open("$bar-$chr.bin", "r") do ig
+            cg = Mmap.mmap(ig, Matrix{Int8}, (nid, nlc), 24)
+            copyto!(view(gt, i+1:i+slc, :), view(cg, :, chunk)')
+            # cg = Mmap.mmap(ig, Matrix{Int8}, (nlc, nid), 24)
+            # copyto!(view(gt, i+1:i+slc, :), view(cg, chunk, :))
+        end
+        rev || (i += slc)
+    end
+    gt
+end
+
+"""
+    drop(php::String, gt::String, 
+              pa::AbstractVector{Int}, ma::AbstractVector{Int},
+              nsb::Int, lms, ped, ohp::String)
+Drop parent haplotypes in file `php` into file `gt` according to
+random mating among (unique) sires `pa` and dams `ma`.  Crossovers
+were sampled according to linkage map summary `lms`.  Each full
+sibship size is `nsb`.  When `gt` exists, new genotypes will be
+appended to it, or a new file `gt` is created.
+
+The function will check if 2length of `pa` and `ma` is less or equal
+to number of columns in `php`.  It is assumed haplotypes of `pa` are
+packed in the first half columns of the `php`, and in that order.  The
+function also check if any intersect of `pa` and `ma` exists.
+
+The newly created haplotypes are written in to `ohp`.
+"""
+function drop(php::String, gt::String, 
+              pa::AbstractVector{Int}, ma::AbstractVector{Int},
+              nsb::Int, lms, ped, ohp::String)
+    ig = ped.grt[end] + 1
+    tprintln("  - Dropping into generation $ig")
+    # The drop procedure
+    pg = Fio.readmat(php)
+    nsr, ndm = length(pa), length(ma)
+    2(nsr + ndm) ≤ size(pg)[2] || error("Not enough haplotypes in $hap for list pa and ma")
+    length(intersect(pa, ma)) > 0 && error("Some ID are both sire and dam")
+    pm = repeat(Sim.random_mate(nsr, ndm), inner = (nsb, 1))
+    og = drop(pg, pm, lms)
+    Fio.writemat(ohp, og)
+
+    # update pedigree
+    nlc, nid = size(og)
+    nid ÷= 2
+    ndum = ncol(ped) - 4
+    sex = rand(0:1, nid)
+    for id in 1:nid
+        push!(ped, (ig, pa[pm[id, 1]], ma[pm[id, 2] - nsr], sex[id], zeros(ndum)...))
+    end
+
+    # Create genotypes file, or append new genotypes.
+    isfile(gt) || write(gt, [nlc, 0, Fio.typec(Int8)])
+
+    # update header
+    hd = zeros(Int, 2)
+    read!(gt, hd)
+    open(gt, "a+") do io
+        hd[2] += nid
+        seekstart(io)
+        write(io, hd)
+        seekend(io)
+        for i in 1:nid
+            write(io, og[:, 2i-1] + og[:, 2i])
+        end
+    end
+end
diff --git a/src/pedigree.jl b/src/pedigree.jl
@@ -20,7 +20,7 @@ function codeped(id, pa, ma)
     om = setdiff(setdiff(ma, id), ["0"])
     isempty(intersect(op, om)) || error("ID is/are both sire and dam")
 
-    df = DataFrame(sire = Int[], dam = Int[], name = AbstractString[])
+    df = DataFrame(sire = Int32[], dam = Int32[], name = AbstractString[])
     dc = Dict{AbstractString, Int}()
     dc["0"] = uid = 0
 

diff --git a/src/sim.jl b/src/sim.jl
@@ -92,3 +92,21 @@ function simPtQTL(gt, nqtl; d = MvNormal(zeros(2), I(2)))
     efct = rand(d, nqtl)
     return (locus = loci, e1 = efct[1, :], e2 = efct[2, :])
 end
+
+"""
+    simMap(len, nlc...; cM = 1_000_000)
+
+Simulate a linkage map DataFrame with chromosome number of ``UInt8`` and 
+base pair positions of ``UInt32``.
+This can be used for crossover simulations.
+"""
+function simMap(len, nlc...; cM = 1_000_000)
+    lmp = DataFrame(chr = UInt8[], pos = UInt32[])
+    ich = UInt8(1)
+    for n in nlc
+        pos = sort(rand(1:len, n))
+        append!(lmp, DataFrame(chr=repeat([ich], length(pos)), pos = pos))
+        ich += 1
+    end
+    lmp
+end