Skip to content

Commit

Permalink
Change header to 8x bytes
Browse files Browse the repository at this point in the history
Such that Mmap will work.
  • Loading branch information
xijiang committed Jan 17, 2023
1 parent 94f6708 commit 4d717bd
Show file tree
Hide file tree
Showing 6 changed files with 347 additions and 4 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "GeneMatrix"
uuid = "b017c6bb-e7c7-4437-8ee6-a27b72ea26ab"
authors = ["Xijiang Yu <[email protected]> and contributors"]
version = "0.1.6"
version = "0.1.7"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Expand Down
9 changes: 9 additions & 0 deletions src/header.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,15 @@ struct xyheader
n::Int64 # ncol, seek(_, 15)
end

"""
For two ID pairs in a pedigree.
Using `Int32` should be enough for a pedigree.
"""
struct iPair
i::Int32
j::Int32
end

"""
function mkhdr(nrow::Int64, ncol::Int64; mattp = 'F', trans = 'N', type = Int8)
Make a header for storage of a matrix in `XY` format.
Expand Down
4 changes: 2 additions & 2 deletions src/kinship.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ function kinship(ped, i, j)
return .5(kinship(ped, j, ipa) + kinship(ped, j, ima))
end

#=
"""
function kinship(ped, i::Int, j::Int, dic::Dict{Tuple{Int, Int}, Float64})
Recursive kinship calculation with kinship of ID pair `(i, j)`
Expand All @@ -25,7 +24,7 @@ The memory usage may be bigger than Meuwissen and Luo 1992, or Quaas 1995.
The speed is however standable.
The recursive algorithm is also easy to understand.
"""
function kinship(ped, i::Int, j::Int, dic::Dict{Tuple{Int, Int}, Float64})
function kinship(ped, i::Int, j::Int, dic::Dict{iPair, Float64})
(i == 0 || j == 0) && return 0
ip, im = ped[i, :]
if i == j
Expand All @@ -41,6 +40,7 @@ function kinship(ped, i::Int, j::Int, dic::Dict{Tuple{Int, Int}, Float64})
return .5(kinship(ped, j, ip, dic) + kinship(ped, j, im, dic))
end

#=
"""
function ped_F(ped; force = false, mdc = 1_000_000)
- When column `:F` is not in DataFrame `ped`, this function calculate
Expand Down
316 changes: 316 additions & 0 deletions src/mate.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,316 @@
"""
function random_mate(nsire::Int, ndam::Int)
Random pair sire `1:nsire` and dam `nsire+1:nsire+ndam-1`.
Returns a sorted two-column matrix, `[sires dams]`.
"""
function random_mate(nsire::Int, ndam::Int)
nf = max(nsire, ndam)
ma = shuffle(1:ndam)
while length(ma) < nf
append!(ma, shuffle(1:ndam))
end
pa = collect(1:nsire)
while length(pa) <nf
append!(pa, shuffle(1:nsire))
end
sortslices([pa[1:nf] ma[1:nf].+nsire], dims=1, by=x->(x[1], x[2]))
end

"""
function random_mate(sires, dams)
Given names listed in `sires` and `dams`, this function randomly match
them, such that every ID is used as much as possible. Returns a
sorted two columns name matrix.
"""
function random_mate(sires, dams)
nf = max(length(sires), length(dams))
ma = shuffle(dams)
while length(ma) < nf
append!(ma, shuffle(dams))
end
pa = shuffle(sires)
while length(pa) < nf
append!(pa, shuffle(sires))
end
sortslices([pa[1:nf] ma[1:nf]], dims=1, by=x->(x[1], x[2]))
end

"""
function summap(lmp; cM = 1e6)
Summary of a linkage map, which has 3 columns, :chr, :pos(in bp), and :frq.
A DataFrame of 4-column for each chromosome is returned:
- numbering (1, 2, ...)
- length in Morgen
- number of loci
- beginning number of the first locus
"""
function summap(lmp; cM = 1e6)
df = DataFrame(chr = Int8[],
len = Float64[], # as λ
nlc = Int[],
bgn = Int[])
bgn = 1
for grp in groupby(lmp, :chr)
chr = first(grp).chr
len = (last(grp).pos - first(grp).pos) / cM / 100
nlc = nrow(grp)
push!(df, (chr, len, nlc, bgn))
bgn += nlc
end
return df
end

"""
function crossovers(lms)
Give a linkage map summary `lms`, which can be from `summap`, this
function return a vector of crossover points along the whole genome.
DataFrame `lms` has for columns, chromosome number, its length in
Morgen, number of loci it contains, and the beginning number of its
first locus in the whole genome.
The first number is `1`, or `2`, the starting haplotype. The vector
is then start from locus `1`, and loci that cross-over happens. A
"cross-over" may also happen at the first locus of a chromsome.
Otherwise, the segment continues with the previous haplotype. This
eases the segment copy. At the beginning of a "cross-over" happens on
`rand(false:true)`. The number of cross-over occurred on a chromosome
follows a Poisson distribution.
The cross-over location is then sampled from a uniform distribution.
It can can also be U-shaped abouth centromere, which might be
implemented later.
The chromosome, or linkage group, should be in the order of the
genotype file. Or, unpredicted errors can be resulted.
## Caution!
This is better for uniformly distributed many SNP. In the future, I
will take into account the different location of SNP, hotspots etc.
"""
function crossovers(lms)
pts = [rand(1:2)] # crossover points
for (_, λ, nlc, bgn) in eachrow(lms)
bgn > 1 && rand(false:true) && push!(pts, bgn)
nc = rand(Poisson(λ))
append!(pts, rand(1:nlc, nc) .+ (bgn - 1))
end
push!(pts, last(lms).nlc - 1 + last(lms.bgn))
pts
end

"""
function gamete(prt, hap, lms)
Generate a gamete `hap`, a **vector** view of a child's haplotype,
from `prt`, a view of a parents genotypes,
according crossovers generated from a linkage map summary, `lms`.
"""
function gamete(prt, hap, lms)
cvs = crossovers(lms)
h, l = cvs[1], 1 # starting
for cv in cvs[2:end]
copyto!(view(hap, l:cv), view(prt, l:cv, h))
l = cv + 1
h = 3 - h
end
end

"""
function drop(pg::Matrix{Int8}, og::Matrix{Int8}, pm, lms)
Drop haplotypes `pg` of parents into `og`, their offspring genotypes.
Parents of each offspring are defined in `pm`, which are rows of ``pa ma``.
Linkage map summary `lms` is from `summap`.
!!! ``Caution``:
- Merged data matrix from `MaCS` is `n-ID × n-loci`. Treat it with care.
- It is supposed all the genes to drop from are in `pg`.
- This function will be removed in the future.
"""
function drop(pg::AbstractArray, og::Matrix{Int8}, pm, lms)
nf = size(pm)[1]
Threads.@threads for id in 1:nf
ip = pm[id, 1]
pa = view(pg, :, 2ip-1:2ip)
zi = vec(view(og, :, 2id - 1))
gamete(pa, zi, lms)
end
Threads.@threads for id in 1:nf
im = pm[id, 2]
ma = view(pg, :, 2im-1:2im)
zi = vec(view(og, :, 2id))
gamete(ma, zi, lms)
end
end

"""
function drop(pg::AbstractArray, pm, lms)
Drop genotyeps in `pg` of ``n-loci × n-Haplotypes`` into a new matrix
`og`, according to sire and dam info in `pm` (``pa ma`` of n-ID rows),
and linkage summary `lms`. The new matrix `og` is returned.
"""
function drop(pg::AbstractArray, pm, lms)
nid, nlc = size(pm, 1), size(pg, 1)
og = zeros(Int8, nlc, nid * 2)

Threads.@threads for id in 1:nid
ip = pm[id, 1]
pa = view(pg, :, 2ip-1:2ip)
zi = vec(view(og, :, 2id - 1))
gamete(pa, zi, lms)
end
Threads.@threads for id in 1:nid
im = pm[id, 2]
ma = view(pg, :, 2im-1:2im)
zi = vec(view(og, :, 2id))
gamete(ma, zi, lms)
end
og
end

"""
function drop_by_chr(fph::String, bar::String, pm, lms; merge = true)
Drop haplotypes of parents in `fph` into `bar-{1..nchr}.bin`,
their offspring genotypes.
Matrix in `fph` should be of dimension `nLoci × nHap`.
Parents of each offspring are defined in `pm`, which are rows of ``pa ma``.
Linkage map summary `lms` is from `summap` of module ``Sim``.
When `merge = true`, the dropped haplotypes will be merged into genotypes.
It is also transposed before written to a file.
"""
function drop_by_chr(fph::String, bar::String, pm, lms; merge = false)
nlc, nhp = Fio.readdim(fph)
nof = size(pm)[1]
mph = Mmap.mmap(fph, Matrix{Int8}, (nlc, nhp), 24)

tprintln(" - Dropping on chromosome:")
for (chr, len, cln, fra) in eachrow(lms)
tprint(" $chr")
til = cln + fra - 1
oh = zeros(Int8, cln, 2nof)
ph = copy(mph[fra:til, :])
drop(ph, oh, pm, DataFrame(chr=chr, len=len, nlc=cln, bgn=1))
if merge
open("$bar-$chr.bin", "w") do io
# write(io, [cln, nof, Fio.typec(Int8)])
# for i in 1:nof
# write(io, oh[:, 2i-1] + oh[:, 2i])
# end
write(io, [nof, cln, Fio.typec(Int8)])
for i in 1:cln
write(io, oh[i, 1:2:2nof] + oh[i, 2:2:2nof])
end
end
else
Fio.writemat("$bar-$chr.bin", oh)
end
end
println()
end

"""
function reproduce()
Give a matrix `haps` of haplotypes of nHap (= 2nID) by nLoc, and a
pedigree `ped`, (`[ID pa ma]`) this function drop the parents
haplotype according to the crossovers generated on linkage map summary
`lms`.
"""
function reproduce(haps, ped, lms)
for (id, ip, im) in eachrow(ped)
pa = view(haps, 2ip-1:2ip, :)
zi = vec(view(haps, 2id-1, :)) # offspring
gamete(pa, zi, lms)
ma = view(haps, 2im-1:2im, :)
zi = vec(view(haps, 2id, :))
gamete(ma, zi, lms)
end
end

"""
function collect_gt(bar, lms, loci; rev = false)
In case when genotypes are stored in different file names started with
`bar` by chromosomes, This function collect the genotypes on `loci` in
total genome order. Linkage map summary `lms` which file to find
according to number in `loci`.
Note, the genotypes are of `nid × nlc` of each chromosome. When `rev
= true`, the genotypes are collected in reverse order of chrosome
numbers.
"""
function collect_gt(bar, lms, loci; rev = false)
nid, _ = Fio.readdim("$bar-$(lms.chr[1]).bin")
# below commented codes are for matrix of `nlc × nid`.
#_, nid = Fio.readdim("$bar-$(lms.chr[1]).bin")
gt = zeros(Int8, length(loci), nid)
i, _lms = rev ? (length(loci), reverse(lms)) : (0, lms)
for (chr, _, nlc, fra) in eachrow(_lms)
chunk = intersect(fra:fra+nlc-1, loci) .+ 1 .- fra
slc = length(chunk) # number of selected loci
rev && (i -= slc)
open("$bar-$chr.bin", "r") do ig
cg = Mmap.mmap(ig, Matrix{Int8}, (nid, nlc), 24)
copyto!(view(gt, i+1:i+slc, :), view(cg, :, chunk)')
# cg = Mmap.mmap(ig, Matrix{Int8}, (nlc, nid), 24)
# copyto!(view(gt, i+1:i+slc, :), view(cg, chunk, :))
end
rev || (i += slc)
end
gt
end

"""
drop(php::String, gt::String,
pa::AbstractVector{Int}, ma::AbstractVector{Int},
nsb::Int, lms, ped, ohp::String)
Drop parent haplotypes in file `php` into file `gt` according to
random mating among (unique) sires `pa` and dams `ma`. Crossovers
were sampled according to linkage map summary `lms`. Each full
sibship size is `nsb`. When `gt` exists, new genotypes will be
appended to it, or a new file `gt` is created.
The function will check if 2length of `pa` and `ma` is less or equal
to number of columns in `php`. It is assumed haplotypes of `pa` are
packed in the first half columns of the `php`, and in that order. The
function also check if any intersect of `pa` and `ma` exists.
The newly created haplotypes are written in to `ohp`.
"""
function drop(php::String, gt::String,
pa::AbstractVector{Int}, ma::AbstractVector{Int},
nsb::Int, lms, ped, ohp::String)
ig = ped.grt[end] + 1
tprintln(" - Dropping into generation $ig")
# The drop procedure
pg = Fio.readmat(php)
nsr, ndm = length(pa), length(ma)
2(nsr + ndm) size(pg)[2] || error("Not enough haplotypes in $hap for list pa and ma")
length(intersect(pa, ma)) > 0 && error("Some ID are both sire and dam")
pm = repeat(Sim.random_mate(nsr, ndm), inner = (nsb, 1))
og = drop(pg, pm, lms)
Fio.writemat(ohp, og)

# update pedigree
nlc, nid = size(og)
nid ÷= 2
ndum = ncol(ped) - 4
sex = rand(0:1, nid)
for id in 1:nid
push!(ped, (ig, pa[pm[id, 1]], ma[pm[id, 2] - nsr], sex[id], zeros(ndum)...))
end

# Create genotypes file, or append new genotypes.
isfile(gt) || write(gt, [nlc, 0, Fio.typec(Int8)])

# update header
hd = zeros(Int, 2)
read!(gt, hd)
open(gt, "a+") do io
hd[2] += nid
seekstart(io)
write(io, hd)
seekend(io)
for i in 1:nid
write(io, og[:, 2i-1] + og[:, 2i])
end
end
end
2 changes: 1 addition & 1 deletion src/pedigree.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ function codeped(id, pa, ma)
om = setdiff(setdiff(ma, id), ["0"])
isempty(intersect(op, om)) || error("ID is/are both sire and dam")

df = DataFrame(sire = Int[], dam = Int[], name = AbstractString[])
df = DataFrame(sire = Int32[], dam = Int32[], name = AbstractString[])
dc = Dict{AbstractString, Int}()
dc["0"] = uid = 0

Expand Down
18 changes: 18 additions & 0 deletions src/sim.jl
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,21 @@ function simPtQTL(gt, nqtl; d = MvNormal(zeros(2), I(2)))
efct = rand(d, nqtl)
return (locus = loci, e1 = efct[1, :], e2 = efct[2, :])
end

"""
simMap(len, nlc...; cM = 1_000_000)
Simulate a linkage map DataFrame with chromosome number of ``UInt8`` and
base pair positions of ``UInt32``.
This can be used for crossover simulations.
"""
function simMap(len, nlc...; cM = 1_000_000)
lmp = DataFrame(chr = UInt8[], pos = UInt32[])
ich = UInt8(1)
for n in nlc
pos = sort(rand(1:len, n))
append!(lmp, DataFrame(chr=repeat([ich], length(pos)), pos = pos))
ich += 1
end
lmp
end

0 comments on commit 4d717bd

Please sign in to comment.