Skip to content

Commit

Permalink
codeped, kinship
Browse files Browse the repository at this point in the history
Also add an example from L&W, pp139.
  • Loading branch information
xijiang committed Dec 6, 2022
1 parent 4e8df69 commit 0e5ef2a
Show file tree
Hide file tree
Showing 7 changed files with 343 additions and 5 deletions.
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
name = "GeneMatrix"
uuid = "b017c6bb-e7c7-4437-8ee6-a27b72ea26ab"
authors = ["Xijiang Yu <[email protected]> and contributors"]
version = "0.1.4"
version = "0.1.5"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

[compat]
DataFrames = "1"
CSV = "0"
julia = "1"

[extras]
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ plink bed files.

- `nrm`: numerical relationship matrix
- `nrminv`: the inverse of nrm
- `sortped`, recode pedigree to 1:nID.
- `codeped`, recode pedigree to 1:nID.
- `readmacs`, read simulation results from MaCS.
- `bed2int8`, convert bed file formats to Int8
- tests merging.
18 changes: 18 additions & 0 deletions dat/lw-139.ped
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# From Lynch & Walsh, 1996, pp139
# The fields are separated with &, as ',' is in one name
# To read -->
# using CSV, DataFrames
# df = CSV.read("lw139.ped", DataFrame, header=7, delim='&', stripwhitespace=true)
# <--
ID & Sire & Dam
Lord Raglan & 0 & 0
Mistletoe & Lord Raglan & 0
Champion of England & 0 & 0
Duchess of Gloster, 9th & Lord Raglan & 0
The Czar & Lord Raglan & 0
Mimulus & Champion of England & Mistletoe
Grand Duke of Gloster & Champion of England & Duchess of Gloster, 9th
Carmine & The Czar & 0
Royal Duke of Gloster & Grand Duke of Gloster & Mimulus
Princess Royal & Champion of England & Carmine
Roan Gauntlet & Royal Duke of Gloster & Princess Royal
3 changes: 2 additions & 1 deletion src/GeneMatrix.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ include("nrm.jl") # numerical relationship matrix
include("grm.jl") # genomic relationship matrix
include("graph.jl") # store huge genomic data with graph theory
include("pedigree.jl") # pedigree related operations
include("kinship.jl")

export readxy, writexy, readbed, writebed
export readxy, writexy, readbed, writebed, xyappend
end
263 changes: 263 additions & 0 deletions src/kinship.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
"""
function kinship(ped, i, j)
---
This function is handy if just to calculate relationship of a few (pairs of) ID.
The first 2 columns of ped must be `pa` and `ma`.
It can also speed up by adding `Thread.@threads` before your pair loop.
"""
function kinship(ped, i, j)
(i == 0 || j == 0) && return 0
ipa, ima = ped[i, :] # used both for below and the last
i == j && (return 1 + .5kinship(ped, ipa, ima))
if i < j
jpa, jma = ped[j, :]
return .5(kinship(ped, i, jpa) + kinship(ped, i, jma))
end
return .5(kinship(ped, j, ipa) + kinship(ped, j, ima))
end

#=
"""
function kinship(ped, i::Int, j::Int, dic::Dict{Tuple{Int, Int}, Float64})
Recursive kinship calculation with kinship of ID pair `(i, j)`
stored in dictionary `dic`. The first 2 columns of ped must be `pa` and `ma`.
The memory usage may be bigger than Meuwissen and Luo 1992, or Quaas 1995.
The speed is however standable.
The recursive algorithm is also easy to understand.
"""
function kinship(ped, i::Int, j::Int, dic::Dict{Tuple{Int, Int}, Float64})
(i == 0 || j == 0) && return 0
ip, im = ped[i, :]
if i == j
haskey(dic, (i, i)) || (dic[(i, i)] = 1 + .5kinship(ped, ip, im, dic))
return dic[(i, i)]
end
if i < j
jp, jm = ped[j, :]
haskey(dic, (i, jp)) || (dic[(i, jp)] = kinship(ped, i, jp, dic))
haskey(dic, (i, jm)) || (dic[(i, jm)] = kinship(ped, i, jm, dic))
return .5(dic[(i, jp)] + dic[(i, jm)])
end
return .5(kinship(ped, j, ip, dic) + kinship(ped, j, im, dic))
end
"""
function ped_F(ped; force = false, mdc = 1_000_000)
- When column `:F` is not in DataFrame `ped`, this function calculate
inbreeding coefficients of every ID, and add a `:F` column in `ped`.
- If `:F` exists, and `force = true`, drop `:F` from `ped` and do above.
- Or do nothing.
The relationship dictionary is limited to 1M to save memory
when the pedigree is too large.
"""
function ped_F(ped; force = false, mdc = 1_000_000)
if "F" ∈ names(ped)
force ? select!(ped, Not([:F])) : return
end
N = nrow(ped)
F = zeros(N)
dic = Dict{Tuple{Int, Int}, Float64}()
@showprogress for i in 1:N
F[i] = kinship(ped, i, i, dic) - 1
end
ped.F = F
nothing
end
"""
function ped_D(ped; force = false)
Calculate diagonals of `D` for `A` or `A⁻¹` calculation.
It is computed as a vector and a new column of `ped` of name `:D`.
"""
function ped_D(ped; force = false)
if "D" ∈ names(ped)
force ? select!(ped, Not([:D])) : return
end
"F" ∈ names(ped) || ped_F(ped)
N = nrow(ped)
D = .5ones(N)
@showprogress for i in 1:N
pa, ma = ped[i, :]
vp = (pa == 0) ? -.25 : .25ped.F[pa]
vm = (ma == 0) ? -.25 : .25ped.F[ma]
D[i] -= (vp + vm)
end
ped.D = D
nothing
end
"""
function _D4A_ni(ped; inverse = false)
Construct a `D` diagonal matrix for `A` related calculation.
This one is for test only, and is hence internal.
"""
function _D4A_ni(ped; inverse = false)
D = ones(nrow(ped))
for (i, (pa, ma, ..)) in enumerate(eachrow(ped))
pa > 0 && (D[i] -= .25)
ma > 0 && (D[i] -= .25)
end
inverse && (D = 1 ./ D)
Diagnal(D)
end
"""
function _pushRCV!(R, C, V, r, c, v)
Push a value row, column, and value into vector `R`, `C` and `V`.
The vectors are later to be used to construct a sparse matrix.
"""
function _pushRCV!(R, C, V, r, c, v)
push!(R, r)
push!(C, c)
push!(V, v)
end
"""
function T4AI(ped)
Give a pedigree DataFrame, with its first 2 column as `pa`, and `ma`,
this function return the T matrix used for A⁻¹ calculation.
"""
function T4AI(ped)
N = nrow(ped)
# R, C, V: row, column and value specifying values in a sparse matrix
# 3N are enough, as diagonal → N, all parents known → another 2N.
R, C, V = Int[], Int[], Float64[]
for (id, (pa, ma, ..)) in enumerate(eachrow(ped))
pa > 0 && _pushRCV!(R, C, V, id, pa, -.5)
ma > 0 && _pushRCV!(R, C, V, id, ma, -.5)
_pushRCV!(R, C, V, id, id, 1.)
end
T = sparse(R, C, V)
end
"""
function T4A(ped; m = 1000)
Calculate `T`, which can be used to construct `A` as `TDT'`.
`T` can also be deemed as genetic contribution matrix.
"""
function T4A(ped; m = 1000)
N = nrow(ped)
N > m && error("Pedigree size ($N > $m), too big")
T = zeros(N, N) + I(N)
for (i, (pa, ma, ..)) in enumerate(eachrow(ped))
if pa > 0
for j in 1:i-1
T[i, j] += .5T[pa, j]
end
end
if ma > 0
for j in 1:i-1
T[i, j] += .5T[ma, j]
end
end
end
T
end
"""
function Amat(ped; m = 1000)
Given a pedigree `ped`,
this function returns a full numerical relationship matrix, `A`.
This function is better for small pedigrees, and for demonstration only.
The maximal matrix size is thus limited to 1000.
One can try to set `m` to a bigger value if RAM is enough.
"""
function Amat(ped; m = 1000)
N = nrow(ped)
N > m && error("Pedigree size ($N > $m) too big")
A = zeros(N, N) + I(N)
for (i, (pa, ma, ..)) in enumerate(eachrow(ped))
pa * ma ≠ 0 && (A[i, i] += .5A[pa, ma])
for j in 1:i-1
pa ≠ 0 && (A[i, j] = 0.5A[j, pa])
ma ≠ 0 && (A[i, j] += 0.5A[j, ma])
A[j, i] = A[i, j]
end
end
A
end
"""
function A22(ped, idc, list::Vector{String}, oo::String)
Construct sub matrix of `A` of ID in `list`.
Dictionary `idc` records the coded ID, i.e., row numbe, in `ped`.
It was created when reading `ped`.
To prevent out of memory error, results are written to file `oo`.
"""
function A22(ped, idc, list::Vector{String}, oo::String)
N = length(list)
ilst = zeros(Int, N)
i = 0
for id in list
i += 1
haskey(idc, id) ? (ilst[i] = idc[id]) : error("$id not found in idc")
end
df = DataFrame(x = Int[], y = Int[], v = Float64[])
th = Threads.nthreads() * 10000 # number of pairs to calculate per batch
open(oo, "w+") do io
write(io, [N, N, Fio.typec(Float64)])
rst = Mmap.mmap(io, Matrix{Float64}, (N, N), 24)
@showprogress 1 "Parallel computing A[i, j]" for i in 1:N
for j in 1:i
push!(df, (ilst[i], ilst[j], 0))
if nrow(df) == th
Threads.@threads for k in 1:th
df.v[k] = kinship(ped, df.x[k], df.y[k])
end
for (x, y, v) in eachrow(df)
A[x, y] = v
end
empty!(df)
end
end
end
@info "Finalizing ..."
if !isempty(pairs)
Threads.@threads for k in 1:nrow(df)
df.v[k] = kinship(ped, df.x[k], df.y[k])
end
for (x, y, v) in eachrow(df)
A[x, y] = v
end
end
end
nothing
end
"""
function Ai(ped)
Given a pedigree `ped`, this function return a sparse matrix of `A⁻¹`,
where `A` is the numerical relationship matrix.
"""
function Ai(ped)
T = T4AI(ped)
"D" ∈ names(ped) || ped_D(ped)
D = Diagonal(1. ./ ped.D)
T'D*T
end
"""
"""
function effID(ped, idc, lst)
epp = Set{Int}()
id = Set{Int}()
for s in lst
push!(id, idc[s])
end
ng = 1
while !isempty(id)
union!(epp, unique(id))
pm = Set{Int}()
ng += 1
for i in id
ped.pa[i] == 0 || push!(pm, ped.pa[i])
ped.ma[i] == 0 || push!(pm, ped.ma[i])
end
id = pm
@show ng, length(epp), length(pm)
end
sort(collect(epp))
end
=#
48 changes: 48 additions & 0 deletions src/pedigree.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""
codeped(id, pa, ma)
Code these 3 string vectors into integers starts from 1.
A DataFrame is returned which has 3 columns,
`sire::Int`, `dam::Int`. and `name::AbstractString`.
String "0" and number 0 is deemed as unknown.
Row numbers of the DataFrame are the coded ID number.
The `Name` column contains the original ID names.
The reason to have 3 String vectors here is to make this as general
as possible. One needs to prepare them for this function,
which is easy.
"""
function codeped(id, pa, ma)
# QC
length(id) == length(pa) == length(ma) ||
error("Lengths of id, pa, ma differ")
length(unique(id)) == length(id) || error("Repeated ID")
op = setdiff(setdiff(pa, id), ["0"])
om = setdiff(setdiff(ma, id), ["0"])
isempty(intersect(op, om)) || error("ID is/are both sire and dam")

df = DataFrame(sire = Int[], dam = Int[], name = AbstractString[])
dc = Dict{AbstractString, Int}()
dc["0"] = uid = 0

for x in union(op, om)
uid += 1
dc[x] = uid
push!(df, (0, 0, x))
end

remain = ref = length(id)
while ref > 0
for i in eachindex(id)
haskey(dc, id[i]) && continue
haskey(dc, pa[i]) || continue
haskey(dc, ma[i]) || continue
uid += 1
dc[id[i]] = uid
push!(df, (dc[pa[i]], dc[ma[i]], id[i]))
remain -= 1
end
ref > remain || error("Looped pedigree")
ref = remain
end
df
end
9 changes: 8 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
using GeneMatrix
using Test
using Test, CSV, DataFrames

@testset "I/O" begin
file = tempname()
Expand Down Expand Up @@ -34,3 +34,10 @@ using Test
rm(file)
rm(f2)
end

@testset "A matrix" begin
df = CSV.read("../dat/lw-139.ped", DataFrame, header=7, delim='&', stripwhitespace=true)
@test length(union(df.ID, df.Sire, df.Dam)) == 12
ped = GeneMatrix.codeped(df.ID, df.Sire, df.Dam)
@test GeneMatrix.kinship(ped, 11, 11) == 1.140625
end

0 comments on commit 0e5ef2a

Please sign in to comment.