From 619ff585efc08320c31797c33425edab36d03d7c Mon Sep 17 00:00:00 2001 From: zhanglei Date: Tue, 27 Feb 2024 14:02:28 +0800 Subject: [PATCH 1/6] removing R code, sort eigen values and vectors before optimising --- rrBLUP.py | 85 ++++++++++++++++++------------------------------------- 1 file changed, 28 insertions(+), 57 deletions(-) diff --git a/rrBLUP.py b/rrBLUP.py index f2fcadd..b34a113 100644 --- a/rrBLUP.py +++ b/rrBLUP.py @@ -376,12 +376,11 @@ def tcrossprod(A, B): try: svd_SZBt_u, svd_SZBt_d, v = np.linalg.svd(SZBt, full_matrices = 0) except: - svd_SZBt_u, svd_SZBt_d, v = np.linalg.svd(SZBt + 1e-10, full_matrices = 0) - QR = robjects.r['qr'](np.hstack((X, svd_SZBt_u))) - q = robjects.r['qr.Q'](QR, complete=True) - r = robjects.r['qr.R'](QR) - Q = q[:, p:n] - R = r[p:m, p:m] + svd_SZBt_u, svd_SZBt_d, v = np.linalg.svd(SZBt + 1e-10, full_matrices=0) + matrix_to_decompose = np.hstack((X, svd_SZBt_u)) + Q_numpy, R_numpy = np.linalg.qr(matrix_to_decompose, mode='complete') + Q = Q_numpy[:, p:n] + R = R_numpy[p:m, p:m] try: ans = np.linalg.solve(np.transpose(R * R), svd_SZBt_d * svd_SZBt_d) theta = np.append(ans, np.zeros(n - p - m)) @@ -393,72 +392,44 @@ def tcrossprod(A, B): Hb = tcrossprod(Z, Z) + offset * np.eye(n) else: Hb = tcrossprod(np.dot(Z, K), Z) + offset * np.eye(n) - tmp = robjects.r['eigen'](Hb) - Hb_system_values = list(tmp)[0] - Hb_system_vectors = list(tmp)[1] + + eigenvalues, eigenvectors = np.linalg.eig(Hb) + + # Sort the eigenvalues in descending order, and sort the eigenvectors to match + idx = eigenvalues.argsort()[::-1] + Hb_system_values = eigenvalues[idx] + Hb_system_vectors = eigenvectors[:, idx] + phi = Hb_system_values - offset if np.nanmin(phi) < -1e-06: print("K not positive semi-definite.") return U = Hb_system_vectors SHbS = np.dot(np.dot(S, Hb), S) - tmp = robjects.r['eigen'](SHbS) - SHbS_system_values = list(tmp)[0] - SHbS_system_vectors = list(tmp)[1] + eigenvalues, eigenvectors = np.linalg.eig(SHbS) + + # Sort the eigenvalues in descending order, and sort the eigenvectors to match + idx = eigenvalues.argsort()[::-1] + SHbS_system_values = eigenvalues[idx] + SHbS_system_vectors = eigenvectors[:, idx] + theta = SHbS_system_values[0:(n - p)] - offset Q = SHbS_system_vectors[:, 0:(n - p)] omega = crossprod(Q, y) omega_sq = omega * omega if method == 'ML': - #def f_ML(Lambda): - # return n * np.log(np.nansum(omega_sq.reshape(-1) / (theta + Lambda))) + np.nansum(np.log(phi + Lambda)) - #lambda_opt = fminbound(f_ML, bounds[0], bounds[1]) - #objective = f_ML(lambda_opt) df = n - pd.DataFrame(np.array(bounds)).to_csv('bounds.csv') - pd.DataFrame(np.array([n])).to_csv('df.csv') - pd.DataFrame(phi).to_csv('phi.csv') - pd.DataFrame(theta).to_csv('theta.csv') - pd.DataFrame(omega_sq).to_csv('omega_sq.csv') - rcode = """ - bounds <- c(read.csv('bounds.csv',row.names=1)[1,1], read.csv('bounds.csv',row.names=1)[2,1]) - df <- read.csv('df.csv',row.names=1)[1,1] - phi <- read.csv('phi.csv',row.names=1) - theta <- read.csv('theta.csv',row.names=1) - omega_sq <- read.csv('omega_sq.csv',row.names=1) - f_ML <- function(Lambda) { - df * log(sum(omega.sq/(theta + lambda))) + sum(log(phi + lambda)) - } - optimize(f_ML, interval = bounds) - """ - tmp = robjects.r(rcode) - lambda_opt = list(tmp)[0] - objective = list(tmp)[1] + def f_ML(Lambda): + return n * np.log(np.nansum(omega_sq.reshape(-1) / (theta + Lambda))) + np.nansum(np.log(phi + Lambda)) + lambda_opt = fminbound(f_ML, bounds[0], bounds[1]) + objective = f_ML(lambda_opt) os.system('rm bounds.csv df.csv phi.csv theta.csv omega_sq.csv') else: - #def f_REML(Lambda): - # return (n - p) * np.log(np.nansum(omega_sq.reshape(-1) / (theta + Lambda))) + np.nansum(np.log(theta + Lambda)) - #lambda_opt = fminbound(f_REML, bounds[0], bounds[1]) - #objective = f_REML(lambda_opt) df = n - p - pd.DataFrame(np.array(bounds)).to_csv('bounds.csv') - pd.DataFrame(np.array([n-p])).to_csv('df.csv') - pd.DataFrame(omega_sq).to_csv('omega_sq.csv') - pd.DataFrame(theta).to_csv('theta.csv') - rcode = """ - bounds <- c(read.csv('bounds.csv',row.names=1)[1,1], read.csv('bounds.csv',row.names=1)[2,1]) - df <- read.csv('df.csv',row.names=1)[1,1] - theta <- read.csv('theta.csv',row.names=1) - omega_sq <- read.csv('omega_sq.csv',row.names=1) - f_REML <- function(Lambda) { - df * log(sum(omega_sq/(theta + Lambda))) + sum(log(theta + Lambda)) - } - optimize(f_REML, interval = bounds) - """ - tmp = robjects.r(rcode) - lambda_opt = list(tmp)[0] - objective = list(tmp)[1] - os.system('rm bounds.csv df.csv omega_sq.csv theta.csv') + def f_REML(Lambda): + return (n - p) * np.log(np.nansum(omega_sq.reshape(-1) / (theta + Lambda))) + np.nansum(np.log(theta + Lambda)) + lambda_opt = fminbound(f_REML, bounds[0], bounds[1]) + objective = f_REML(lambda_opt) Vu_opt = np.nansum(omega_sq.reshape(-1) / (theta + lambda_opt)) / df Ve_opt = lambda_opt * Vu_opt Hinv = np.dot(U, (np.transpose(U) / (phi + lambda_opt).reshape(-1,1))) From 3b162513ca520df4301fe66f498e4740637057c2 Mon Sep 17 00:00:00 2001 From: zhanglei Date: Thu, 7 Mar 2024 16:34:47 +0800 Subject: [PATCH 2/6] removing R code, sort eigen values and vectors before optimising --- aMat.py | 43 +++++++++++++++++++++++++++++++++ a_mat_jax.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 aMat.py create mode 100644 a_mat_jax.py diff --git a/aMat.py b/aMat.py new file mode 100644 index 0000000..753b5a6 --- /dev/null +++ b/aMat.py @@ -0,0 +1,43 @@ +import time + +import numpy as np + +# Define the add_mat function as previously described with corrections +def add_mat_numpy(mark): + p = np.mean(0.5 * mark, axis=0) # Ensure p is an ndarray for element-wise operations + t = np.ones((mark.shape[0], 1)) + p_reshape = p.reshape(1,-1) + x = np.dot(t, p.reshape(1, -1)) + m = mark - 2 * np.dot(t, p.reshape(1, -1)) # Corrected matrix multiplication + rel = np.dot(m, m.T) + q = 1 - p + sum_val = np.sum(2 * p * q) # Element-wise multiplication + relf = rel / sum_val # Element-wise division + return relf + +# Create fixed fake data as specified +mark_numpy = np.array([ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # First individual with all 0s + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # Second individual with all 1s + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2], # Third individual with all 2s + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # Fourth individual with all 0s +]) + +start_time = time.time() +relf_numpy = add_mat_numpy(mark_numpy) +print("Additive genetic relationship matrix:\n", relf_numpy) +end_time = time.time() +print("NumPy version execution time:", end_time - start_time, "seconds") + + +# Create large test data for NumPy version +num_individuals = 20000 +num_markers = 32000 +# Each row alternates between 0, 1, and 2 +mark_numpy_large = np.tile(np.array([0, 1, 2]), (num_individuals, num_markers // 3 + 1))[:, :num_markers] + +# Time the execution with large data for NumPy version +start_time = time.time() +relf_numpy_large = add_mat_numpy(mark_numpy_large) +end_time = time.time() +print("NumPy version execution time:", end_time - start_time, "seconds") diff --git a/a_mat_jax.py b/a_mat_jax.py new file mode 100644 index 0000000..e9b29f4 --- /dev/null +++ b/a_mat_jax.py @@ -0,0 +1,67 @@ +import jax.numpy as jnp +from jax import jit +import time +from sklearn.decomposition import PCA +import matplotlib.pyplot as plt +@jit +def add_mat_jax(mark): + p = jnp.mean(0.5 * mark, axis=0) + t = jnp.ones((mark.shape[0], 1)) + m = mark - 2 * jnp.dot(t, p.reshape(1, -1)) + rel = jnp.dot(m, m.T) + q = 1 - p + sum_val = jnp.sum(2 * p * q) + relf = rel / sum_val + return relf + + +def small_test(): + global _, start_time, end_time + # Fixed fake data for JAX + mark_jax = jnp.array([ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ]) + # Warm-up call to include JIT compilation time separately + _ = add_mat_jax(mark_jax).block_until_ready() + # Measure execution time after compilation + start_time = time.time() + relf_jax = add_mat_jax(mark_jax).block_until_ready() + print("Additive genetic relationship matrix:\n", relf_jax) + end_time = time.time() + print("JAX version execution time (after JIT compilation):", end_time - start_time, "seconds") + # Perform PCA + pca = PCA(n_components=3) # Adjust n_components as needed + principal_components = pca.fit_transform(relf_jax) + + # Plot the first two principal components + plt.figure(figsize=(8, 6)) + plt.scatter(principal_components[:, 0], principal_components[:, 1]) + plt.xlabel('Principal Component 1') + plt.ylabel('Principal Component 2') + plt.title('PCA of Relationship Matrix') + plt.show() + + +def large_test(): + global _, start_time, end_time + num_individuals = 20000 + num_markers = 40000 + mark_large = jnp.tile(jnp.array([0, 1, 2]), (num_individuals, num_markers // 3 + 1))[:, :num_markers] + # Warm-up JIT compilation with large data + _ = add_mat_jax(mark_large).block_until_ready() + # Time the execution with large data + start_time = time.time() + relf_large = add_mat_jax(mark_large).block_until_ready() + end_time = time.time() + print("JAX version execution time (after JIT compilation):", end_time - start_time, "seconds") + + pca = PCA(n_components=3) # Adjust n_components as needed + principal_components = pca.fit_transform(relf_large) + +#small_test() + +large_test() + From fe235324b80e5d5b94456f0314d62e7156c63be2 Mon Sep 17 00:00:00 2001 From: zhanglei Date: Sat, 23 Mar 2024 09:17:47 +0800 Subject: [PATCH 3/6] update a kinship --- aMat.py | 45 ++--- a_mat_jax.py | 11 +- geonomic_kinship_test.ipynb | 356 ++++++++++++++++++++++++++++++++++++ 3 files changed, 387 insertions(+), 25 deletions(-) create mode 100644 geonomic_kinship_test.ipynb diff --git a/aMat.py b/aMat.py index 753b5a6..d63a8af 100644 --- a/aMat.py +++ b/aMat.py @@ -15,29 +15,30 @@ def add_mat_numpy(mark): relf = rel / sum_val # Element-wise division return relf -# Create fixed fake data as specified -mark_numpy = np.array([ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # First individual with all 0s - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # Second individual with all 1s - [2, 2, 2, 2, 2, 2, 2, 2, 2, 2], # Third individual with all 2s - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # Fourth individual with all 0s -]) -start_time = time.time() -relf_numpy = add_mat_numpy(mark_numpy) -print("Additive genetic relationship matrix:\n", relf_numpy) -end_time = time.time() -print("NumPy version execution time:", end_time - start_time, "seconds") +if __name__ == '__main__': + # Create fixed fake data as specified + mark_numpy = np.array([ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # First individual with all 0s + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # Second individual with all 1s + [2, 2, 2, 2, 2, 2, 2, 2, 2, 2], # Third individual with all 2s + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # Fourth individual with all 0s + ]) + start_time = time.time() + relf_numpy = add_mat_numpy(mark_numpy) + print("Additive genetic relationship matrix:\n", relf_numpy) + end_time = time.time() + print("NumPy version execution time:", end_time - start_time, "seconds") -# Create large test data for NumPy version -num_individuals = 20000 -num_markers = 32000 -# Each row alternates between 0, 1, and 2 -mark_numpy_large = np.tile(np.array([0, 1, 2]), (num_individuals, num_markers // 3 + 1))[:, :num_markers] + # Create large test data for NumPy version + num_individuals = 20000 + num_markers = 32000 + # Each row alternates between 0, 1, and 2 + mark_numpy_large = np.tile(np.array([0, 1, 2]), (num_individuals, num_markers // 3 + 1))[:, :num_markers] -# Time the execution with large data for NumPy version -start_time = time.time() -relf_numpy_large = add_mat_numpy(mark_numpy_large) -end_time = time.time() -print("NumPy version execution time:", end_time - start_time, "seconds") + # Time the execution with large data for NumPy version + start_time = time.time() + relf_numpy_large = add_mat_numpy(mark_numpy_large) + end_time = time.time() + print("NumPy version execution time:", end_time - start_time, "seconds") diff --git a/a_mat_jax.py b/a_mat_jax.py index e9b29f4..3b86dfe 100644 --- a/a_mat_jax.py +++ b/a_mat_jax.py @@ -5,6 +5,14 @@ import matplotlib.pyplot as plt @jit def add_mat_jax(mark): + """ + mark use 0,1,2 encoding + Args: + mark: shape is n,m + + Returns: + """ + p = jnp.mean(0.5 * mark, axis=0) t = jnp.ones((mark.shape[0], 1)) m = mark - 2 * jnp.dot(t, p.reshape(1, -1)) @@ -61,7 +69,4 @@ def large_test(): pca = PCA(n_components=3) # Adjust n_components as needed principal_components = pca.fit_transform(relf_large) -#small_test() - -large_test() diff --git a/geonomic_kinship_test.ipynb b/geonomic_kinship_test.ipynb new file mode 100644 index 0000000..7dc6cfb --- /dev/null +++ b/geonomic_kinship_test.ipynb @@ -0,0 +1,356 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-03-23T00:51:49.603854400Z", + "start_time": "2024-03-23T00:51:45.861543100Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 2763 entries, 112 to 2396\n", + "Columns: 9642 entries, Genotype to 10_149535062_G\n", + "dtypes: int64(9641), object(1)\n", + "memory usage: 203.3+ MB\n", + "None\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import time\n", + "import rrBLUP\n", + "df_geno = pd.read_csv('data/23TC1_YLD14_SU_geno_final.txt', sep='\\t')\n", + "df_geno = df_geno.sort_values(by=['Genotype'])\n", + "print(df_geno.info())\n", + "df_pheno = pd.read_csv(\"data/23TC1_YLD14_SU_pheno_final.txt\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 2763 entries, 1929 to 179\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Genotype 2763 non-null object \n", + " 1 YLD14_SU 2763 non-null float64\n", + "dtypes: float64(1), object(1)\n", + "memory usage: 64.8+ KB\n", + "None\n", + "(2763, 9641)\n", + "(2763, 1)\n" + ] + } + ], + "source": [ + "\n", + "df_pheno_ordered = df_pheno.sort_values(by=['Genotype'])\n", + "print(df_pheno_ordered.info())\n", + "train_x = np.array(df_geno.drop('Genotype', axis=1))\n", + "train_y = np.array(df_pheno_ordered['YLD14_SU']).reshape(-1, 1)\n", + "train_x = train_x - 1\n", + "print(train_x.shape)\n", + "print(train_y.shape)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T00:52:01.713868700Z", + "start_time": "2024-03-23T00:52:01.567439200Z" + } + }, + "id": "4ea00660ee17a7da", + "execution_count": 2 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "a_1 = rrBLUP.A_mat(train_x)\n", + "a_1" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T00:53:45.474818Z", + "start_time": "2024-03-23T00:52:44.054145500Z" + } + }, + "id": "1f778cf653c205a5", + "execution_count": 3 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "numpy.ndarray" + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(a_1)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:04:41.642221Z", + "start_time": "2024-03-23T01:04:41.624443300Z" + } + }, + "id": "eb6e2174ae759be4", + "execution_count": 22 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import aMat\n", + "a_2 = aMat.add_mat_numpy(train_x+1)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:10:23.791477300Z", + "start_time": "2024-03-23T01:10:22.853119Z" + } + }, + "id": "22084ea87f9364b1", + "execution_count": 27 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "numpy.ndarray" + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import a_mat_jax\n", + "import jax\n", + "a_3 = a_mat_jax.add_mat_jax(train_x+1)\n", + "a_3 = np.asarray(a_3)\n", + "type(a_3)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:16:10.802205200Z", + "start_time": "2024-03-23T01:16:10.233960Z" + } + }, + "id": "1ed66c383a884e0e", + "execution_count": 38 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Arrays are close: True\n" + ] + } + ], + "source": [ + "# Using allclose to compare\n", + "close = np.allclose(a_1, a_2, rtol=0.0001, atol=0.0001)\n", + "\n", + "print(f\"Arrays are close: {close}\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:10:45.248073700Z", + "start_time": "2024-03-23T01:10:45.146571300Z" + } + }, + "id": "bdbcc53bb039085a", + "execution_count": 30 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running Time: 17.74 s\n" + ] + } + ], + "source": [ + "start_time = time.time()\n", + "g_result = rrBLUP.mixed_solve(y = train_y, K = a_1)\n", + "end_time = time.time()\n", + "#print(g_result)\n", + "print('Running Time: '+str(round(end_time-start_time, 2))+' s')\n", + "train_pred = g_result['u'] " + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:01:35.992522700Z", + "start_time": "2024-03-23T01:01:18.250576300Z" + } + }, + "id": "d32dd35631566b26", + "execution_count": 15 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "array([[-17.59836984],\n [-20.95136377],\n [ -0.13633499],\n ...,\n [ -4.65200707],\n [-20.01554517],\n [-20.1281834 ]])" + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_pred" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:01:45.967475Z", + "start_time": "2024-03-23T01:01:45.954682900Z" + } + }, + "id": "3d0c303c9230d183", + "execution_count": 16 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "K not positive semi-definite.\n", + "Running Time: 8.41 s\n" + ] + }, + { + "ename": "TypeError", + "evalue": "'NoneType' object is not subscriptable", + "output_type": "error", + "traceback": [ + "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[1;31mTypeError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[1;32mIn[39], line 6\u001B[0m\n\u001B[0;32m 4\u001B[0m \u001B[38;5;66;03m#print(g_result)\u001B[39;00m\n\u001B[0;32m 5\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRunning Time: \u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m+\u001B[39m\u001B[38;5;28mstr\u001B[39m(\u001B[38;5;28mround\u001B[39m(end_time\u001B[38;5;241m-\u001B[39mstart_time, \u001B[38;5;241m2\u001B[39m))\u001B[38;5;241m+\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m s\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\u001B[1;32m----> 6\u001B[0m train_pred_new \u001B[38;5;241m=\u001B[39m \u001B[43mg_result_new\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mu\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m \n\u001B[0;32m 7\u001B[0m train_pred_new\n", + "\u001B[1;31mTypeError\u001B[0m: 'NoneType' object is not subscriptable" + ] + } + ], + "source": [ + "start_time = time.time()\n", + "g_result_new = rrBLUP.mixed_solve(y = train_y, K = a_3)\n", + "end_time = time.time()\n", + "#print(g_result)\n", + "print('Running Time: '+str(round(end_time-start_time, 2))+' s')\n", + "train_pred_new = g_result_new['u'] \n", + "train_pred_new" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:16:26.084069700Z", + "start_time": "2024-03-23T01:16:17.660955900Z" + } + }, + "id": "557f3d43c301ad45", + "execution_count": 39 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import jax.numpy as jnp\n", + "import numpy as np\n", + "\n", + "# Create a JAX array\n", + "jax_array = jnp.array([1, 2, 3])\n", + "\n", + "# Convert to a NumPy array\n", + "numpy_array = np.asarray(jax_array)\n", + "\n", + "print(type(numpy_array)) # \n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:15:38.449024800Z", + "start_time": "2024-03-23T01:15:38.443083500Z" + } + }, + "id": "2fdea088f03fd77", + "execution_count": 37 + }, + { + "cell_type": "code", + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "79a0fb4f44e8b0df" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 19eb1917ff9a8bfaf012b0592fdc4c9f15eead39 Mon Sep 17 00:00:00 2001 From: zhanglei Date: Mon, 25 Mar 2024 17:22:21 +0800 Subject: [PATCH 4/6] gblup and arseml test using gblup --- arseml_geonomic_kinship_test.ipynb | 608 +++++++++++++++++++++++++++++ blup_animal.ipynb | 289 ++++++++++++++ gblup.py | 36 ++ geonomic_kinship_test.ipynb | 356 ----------------- 4 files changed, 933 insertions(+), 356 deletions(-) create mode 100644 arseml_geonomic_kinship_test.ipynb create mode 100644 blup_animal.ipynb create mode 100644 gblup.py delete mode 100644 geonomic_kinship_test.ipynb diff --git a/arseml_geonomic_kinship_test.ipynb b/arseml_geonomic_kinship_test.ipynb new file mode 100644 index 0000000..b62700a --- /dev/null +++ b/arseml_geonomic_kinship_test.ipynb @@ -0,0 +1,608 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-03-25T00:43:12.613827100Z", + "start_time": "2024-03-25T00:43:00.090048600Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 2763 entries, 112 to 2396\n", + "Columns: 9642 entries, Genotype to 10_149535062_G\n", + "dtypes: int64(9641), object(1)\n", + "memory usage: 203.3+ MB\n", + "None\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import time\n", + "import rrBLUP as p_rrBlup\n", + "from rpy2.robjects.packages import importr\n", + "from rpy2.robjects import Formula as f\n", + "from rpy2.robjects import pandas2ri\n", + "import rpy2.robjects as ro\n", + "pandas2ri.activate()\n", + "rrBLUP = importr('rrBLUP')\n", + "df_geno = pd.read_csv('data/23TC1_YLD14_SU_geno_final.txt', sep='\\t')\n", + "df_geno = df_geno.sort_values(by=['Genotype'])\n", + "print(df_geno.info())\n", + "df_pheno = pd.read_csv(\"data/23TC1_YLD14_SU_pheno_final.txt\", sep='\\t')\n" + ] + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 2763 entries, 1929 to 179\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Genotype 2763 non-null object \n", + " 1 YLD14_SU 2763 non-null float64\n", + "dtypes: float64(1), object(1)\n", + "memory usage: 64.8+ KB\n", + "None\n", + "(2763, 9641)\n", + "(2763, 1)\n" + ] + } + ], + "source": [ + "\n", + "df_pheno_ordered = df_pheno.sort_values(by=['Genotype'])\n", + "print(df_pheno_ordered.info())\n", + "train_x = np.array(df_geno.drop('Genotype', axis=1))\n", + "train_y = np.array(df_pheno_ordered['YLD14_SU']).reshape(-1, 1)\n", + "train_x = train_x - 1\n", + "print(train_x.shape)\n", + "print(train_y.shape)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T00:43:14.833783200Z", + "start_time": "2024-03-25T00:43:14.642438500Z" + } + }, + "id": "4ea00660ee17a7da", + "execution_count": 2 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "array([[ 0.96460593, 0.40092433, 0.25768902, ..., -0.30201703,\n -0.23842051, -0.23847136],\n [ 0.40092433, 0.81633067, 0.42894227, ..., -0.22019376,\n -0.27356791, -0.27299987],\n [ 0.25768902, 0.42894227, 0.75699359, ..., -0.24150724,\n -0.2534156 , -0.25408534],\n ...,\n [-0.30201703, -0.22019376, -0.24150724, ..., 0.70114823,\n 0.31388157, 0.31383073],\n [-0.23842051, -0.27356791, -0.2534156 , ..., 0.31388157,\n 0.89487222, 0.89420248],\n [-0.23847136, -0.27299987, -0.25408534, ..., 0.31383073,\n 0.89420248, 0.89569887]])" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a_1 = rrBLUP.A_mat(train_x)\n", + "a_1" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T00:43:53.731595900Z", + "start_time": "2024-03-25T00:43:16.866956900Z" + } + }, + "id": "1f778cf653c205a5", + "execution_count": 3 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "numpy.ndarray" + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(a_1)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:35:43.596344300Z", + "start_time": "2024-03-23T01:35:43.592192800Z" + } + }, + "id": "eb6e2174ae759be4", + "execution_count": 52 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import aMat\n", + "a_2 = aMat.add_mat_numpy(train_x+1)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T00:44:38.861429500Z", + "start_time": "2024-03-25T00:44:37.795651Z" + } + }, + "id": "22084ea87f9364b1", + "execution_count": 5 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "numpy.ndarray" + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import a_mat_jax\n", + "import jax\n", + "a_3 = a_mat_jax.add_mat_jax(train_x+1)\n", + "a_3 = np.asarray(a_3)\n", + "type(a_3)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:35:48.457885400Z", + "start_time": "2024-03-23T01:35:47.864412600Z" + } + }, + "id": "1ed66c383a884e0e", + "execution_count": 53 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Arrays are close: True\n" + ] + } + ], + "source": [ + "# Using allclose to compare\n", + "close = np.allclose(a_2, a_3, rtol=0.00001, atol=0.00001)\n", + "\n", + "print(f\"Arrays are close: {close}\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-23T01:35:50.365137700Z", + "start_time": "2024-03-23T01:35:50.287741600Z" + } + }, + "id": "bdbcc53bb039085a", + "execution_count": 54 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running Time: 20.03 s\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'dict' object has no attribute 'rx2'", + "output_type": "error", + "traceback": [ + "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[1;31mAttributeError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[1;32mIn[4], line 7\u001B[0m\n\u001B[0;32m 5\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRunning Time: \u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m+\u001B[39m\u001B[38;5;28mstr\u001B[39m(\u001B[38;5;28mround\u001B[39m(end_time\u001B[38;5;241m-\u001B[39mstart_time, \u001B[38;5;241m2\u001B[39m))\u001B[38;5;241m+\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m s\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\u001B[0;32m 6\u001B[0m \u001B[38;5;66;03m#train_pred = g_result['u'] \u001B[39;00m\n\u001B[1;32m----> 7\u001B[0m train_pred \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39marray(\u001B[43mg_result\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrx2\u001B[49m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mu\u001B[39m\u001B[38;5;124m'\u001B[39m))\n", + "\u001B[1;31mAttributeError\u001B[0m: 'dict' object has no attribute 'rx2'" + ] + } + ], + "source": [ + "start_time = time.time()\n", + "g_result = rrBLUP.mixed_solve(y = train_y, K = a_1)\n", + "end_time = time.time()\n", + "#print(g_result)\n", + "print('Running Time: '+str(round(end_time-start_time, 2))+' s')\n", + "#train_pred = g_result['u'] \n", + "train_pred = np.array(g_result.rx2('u'))" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T00:44:26.308267100Z", + "start_time": "2024-03-25T00:44:05.897407300Z" + } + }, + "id": "d32dd35631566b26", + "execution_count": 4 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "array([[689.5701105],\n [615.9386355],\n [661.3197776],\n ...,\n [692.8649345],\n [709.3628695],\n [684.4678623]])" + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_y" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T06:34:24.962192500Z", + "start_time": "2024-03-25T06:34:24.951436400Z" + } + }, + "id": "3d0c303c9230d183", + "execution_count": 39 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'Vu': 462.6130366195777, 'Ve': 969.4607183018168, 'beta': array([[703.72274814]]), 'u': array([[-17.58200532],\n", + " [-21.2687656 ],\n", + " [ -0.33706766],\n", + " ...,\n", + " [ -4.68147966],\n", + " [-19.89370123],\n", + " [-20.12403582]]), 'LL': -13572.19170915865}\n", + "Running Time: 17.25 s\n" + ] + } + ], + "source": [ + "start_time = time.time()\n", + "g_result = p_rrBlup.mixed_solve(y = train_y, K = a_2)\n", + "end_time = time.time()\n", + "print(g_result)\n", + "print('Running Time: '+str(round(end_time-start_time, 2))+' s')\n", + "#train_pred = g_result['u'] \n", + "rrblup_u = g_result['u']" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T06:52:04.948507200Z", + "start_time": "2024-03-25T06:51:47.691501200Z" + } + }, + "id": "557f3d43c301ad45", + "execution_count": 64 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "array([[-17.59836984],\n [-20.95136378],\n [ -0.13633498],\n ...,\n [ -4.65200707],\n [-20.01554518],\n [-20.12818341]])" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rrblup_u" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T00:45:22.463225400Z", + "start_time": "2024-03-25T00:45:22.454325500Z" + } + }, + "id": "2fdea088f03fd77", + "execution_count": 7 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Correlation Coefficient (相关系数): 0.9999505295951803\n" + ] + } + ], + "source": [ + "import gblup\n", + "g_result = gblup.solve_mixed_gblup_model(n_obs=len(train_y), obs_ids=[i for i in range(len(train_y))], y_values=train_y, ainv=np.linalg.inv(a_2))\n", + "gblue_values = g_result[1:,]\n", + "from scipy.stats import pearsonr\n", + "\n", + "rrblup_u_1d = rrblup_u.flatten()\n", + "gblue_values_1d = gblue_values.flatten()\n", + "\n", + "# Now you can calculate the correlation coefficient\n", + "correlation_coef = np.corrcoef(rrblup_u_1d, gblue_values_1d)[0, 1]\n", + "print(\"Correlation Coefficient (相关系数):\", correlation_coef)\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T08:34:01.640160900Z", + "start_time": "2024-03-25T08:34:00.704205500Z" + } + }, + "id": "173e3d457a409f8e", + "execution_count": 76 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Online License checked out Mon Mar 25 13:40:58 2024\n" + ] + } + ], + "source": [ + "asreml = importr('asreml')" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T05:40:58.749306200Z", + "start_time": "2024-03-25T05:40:56.371246400Z" + } + }, + "id": "79a0fb4f44e8b0df", + "execution_count": 8 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "R[write to console]: Reciprocal conditional number for original matrix is: 1.59179018378963e-06\n", + "\n", + "R[write to console]: Reciprocal conditional number for inverted matrix is: 1.24079893155397e-06\n", + "\n", + "R[write to console]: Inverse of matrix G does not appear to be ill-conditioned.\n", + "\n" + ] + }, + { + "data": { + "text/plain": "array([[ 1.00000000e+00, 1.00000000e+00, 1.04168312e+01],\n [ 2.00000000e+00, 1.00000000e+00, 5.02454510e-01],\n [ 2.00000000e+00, 2.00000000e+00, 2.62023822e+01],\n ...,\n [ 2.76300000e+03, 2.76100000e+03, -2.16311000e-03],\n [ 2.76300000e+03, 2.76200000e+03, -2.17669057e+01],\n [ 2.76300000e+03, 2.76300000e+03, 2.63256497e+01]])" + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from rpy2.robjects import r, pandas2ri\n", + "# Transfer G_matrix to R\n", + "r.assign('ID', df_pheno_ordered['Genotype'])\n", + "r.assign('Gmat', a_2)\n", + "r('''\n", + "# Adjust G_matrix and invert\n", + "library(ASRgenomics)\n", + "rownames(Gmat) = ID\n", + "colnames(Gmat) = ID\n", + "diag(Gmat) <- diag(Gmat) + 0.01\n", + "ginv = G.inverse(Gmat,sparseform = T)$Ginv.sparse''')\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T06:48:06.249000300Z", + "start_time": "2024-03-25T06:47:55.518842300Z" + } + }, + "id": "b3e7ef8025b116", + "execution_count": 55 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "\n", + "df_pheno_ordered['Genotype'] = df_pheno_ordered['Genotype'].astype('category')" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T06:17:55.988217Z", + "start_time": "2024-03-25T06:17:55.974047700Z" + } + }, + "id": "acabf46be808b8ab", + "execution_count": 18 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import rpy2.robjects as robjects\n", + "\n", + "modg = asreml.asreml(\n", + "fixed = f('YLD14_SU ~ 1'),\n", + "random = f('~vm(Genotype,ginv)'),\n", + "dense = f('~vm(Genotype,ginv)'),\n", + "residual =f('~idv(units)'), \n", + "workspace = \"2Gb\",\n", + "data = df_pheno_ordered)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T06:49:06.121751300Z", + "start_time": "2024-03-25T06:48:43.097036700Z" + } + }, + "id": "5e0bb540a4a4c72c", + "execution_count": 56 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " component std.error z.ratio bound %ch\n", + "vm(Genotype, G_matrix_inv) 5.972789e-06 NA NA B NA\n", + "units!units 1.170023e+03 31.48448 37.16189 P 0\n", + "units!R 1.000000e+00 NA NA F 0\n" + ] + } + ], + "source": [ + "summ = asreml.summary_asreml(modg)\n", + "print(summ.rx2('varcomp'))" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T06:20:01.616208300Z", + "start_time": "2024-03-25T06:20:01.609791200Z" + } + }, + "id": "305ddc385d71eed4", + "execution_count": 21 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "asreml_u = (modg.rx2['coefficients'].rx2['random'])\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T06:50:25.449294Z", + "start_time": "2024-03-25T06:50:25.444424600Z" + } + }, + "id": "5b5a60ff985757ff", + "execution_count": 58 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "array([[-17.5657198 ],\n [-21.58628211],\n [ -0.53769809],\n ...,\n [ -4.71096706],\n [-19.77202265],\n [-20.12006455]])" + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "close = np.allclose(rrblup_u, asreml_u, rtol=0.00001, atol=0.00001)\n", + "\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T06:51:13.081435600Z", + "start_time": "2024-03-25T06:51:13.077258400Z" + } + }, + "id": "e2d6659412f49f53", + "execution_count": 63 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Correlation Coefficient (相关系数): 0.9999217812834612\n" + ] + } + ], + "source": [ + "from scipy.stats import pearsonr\n", + "\n", + "rrblup_u_1d = rrblup_u.flatten()\n", + "asreml_u_1d = asreml_u.flatten()\n", + "\n", + "# Now you can calculate the correlation coefficient\n", + "correlation_coef = np.corrcoef(rrblup_u_1d, asreml_u_1d)[0, 1]\n", + "print(\"Correlation Coefficient (相关系数):\", correlation_coef)\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T06:55:01.965913100Z", + "start_time": "2024-03-25T06:55:01.955398600Z" + } + }, + "id": "d0553370feb1b95f", + "execution_count": 70 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/blup_animal.ipynb b/blup_animal.ipynb new file mode 100644 index 0000000..680e41c --- /dev/null +++ b/blup_animal.ipynb @@ -0,0 +1,289 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 35, + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-03-25T09:19:22.281546600Z", + "start_time": "2024-03-25T09:19:22.075735900Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1. 0. 0. 0.5 0. 0.5 0.25 0.25 ]\n", + " [0. 1. 0. 0. 0.5 0.5 0.25 0.25 ]\n", + " [0. 0. 1. 0. 0.5 0. 0.25 0.5 ]\n", + " [0.5 0. 0. 1. 0. 0.25 0.5 0.125]\n", + " [0. 0.5 0.5 0. 1. 0.25 0.5 0.375]\n", + " [0.5 0.5 0. 0.25 0.25 1. 0.25 0.5 ]\n", + " [0.25 0.25 0.25 0.5 0.5 0.25 1. 0.25 ]\n", + " [0.25 0.25 0.5 0.125 0.375 0.5 0.25 1. ]]\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from rpy2.robjects import pandas2ri, r\n", + "from rpy2.robjects.packages import importr\n", + "pandas2ri.activate()\n", + "import pandas as pd\n", + "# Load qgenet package in R\n", + "qgenet = importr('qgenet')\n", + "\n", + "# Define your pedigree data\n", + "ped_data = {\n", + " 'ID': [1, 2, 3, 4, 5, 6, 7, 8],\n", + " 'SIRE': [0, 0, 0, 1, 3, 1, 4, 3],\n", + " 'DAM': [0, 0, 0, 0, 2, 2, 5, 6]\n", + "}\n", + "\n", + "# Convert your pedigree data to an R dataframe\n", + "ped_df = pandas2ri.py2rpy_pandasdataframe(pd.DataFrame(ped_data))\n", + "\n", + "# Compute the A matrix using qgenet package in R\n", + "A = qgenet.amatrix(ped_df)\n", + "\n", + "# Invert the A matrix\n", + "#ainv = np.array(r.solve(A))\n", + "ainv = np.linalg.inv(A)\n", + "print(np.array(A))\n", + "#np.allclose(ainv,e)" + ] + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Solution: [[ 3.97256482]\n", + " [ 0.12596923]\n", + " [-0.15757814]\n", + " [-0.024685 ]\n", + " [ 0.14742549]\n", + " [-0.31838503]\n", + " [ 0.05387085]\n", + " [-0.16289678]\n", + " [ 0.21716138]]\n" + ] + } + ], + "source": [ + "# Example fixed effects (mean) and random effects (animal genetic effects) in Python\n", + "X = np.ones((5,1)) # Assuming 5 observations and only a mean effect\n", + "Z = np.array([\n", + " [0, 0, 0, 1, 0, 0, 0, 0], # Observation for individual 4\n", + " [0, 0, 0, 0, 1, 0, 0, 0], # Observation for individual 5\n", + " [0, 0, 0, 0, 0, 1, 0, 0], # Observation for individual 6\n", + " [0, 0, 0, 0, 0, 0, 1, 0], # Observation for individual 7\n", + " [0, 0, 0, 0, 0, 0, 0, 1] # Observation for individual 8\n", + "])\n", + "# Observations\n", + "y = np.array([4.5, 2.9, 3.9, 3.5, 5.0]).reshape(-1, 1)\n", + "\n", + "# Matrix operations\n", + "XX = X.T @ X\n", + "XZ = X.T @ Z\n", + "ZX = Z.T @ X\n", + "ZZ = Z.T @ Z\n", + "ZZA = ZZ + ainv * 2 # Adjust based on your specific model\n", + "Xy = X.T @ y\n", + "Zy = Z.T @ y\n", + "\n", + "# Assemble mixed model equations\n", + "mme = np.vstack([\n", + " np.hstack([XX, XZ]),\n", + " np.hstack([ZX, ZZA])\n", + "])\n", + "\n", + "# Solution vector\n", + "sol = np.linalg.solve(mme, np.vstack([Xy, Zy]))\n", + "\n", + "print(\"Solution:\", sol)\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T08:25:27.920019400Z", + "start_time": "2024-03-25T08:25:27.910221100Z" + } + }, + "id": "2d0bde54f8d10808", + "execution_count": 25 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Solution vector: [[ 3.97256482]\n", + " [ 0.12596923]\n", + " [-0.15757814]\n", + " [-0.024685 ]\n", + " [ 0.14742549]\n", + " [-0.31838503]\n", + " [ 0.05387085]\n", + " [-0.16289678]\n", + " [ 0.21716138]]\n" + ] + } + ], + "source": [ + "import gblup\n", + "# Number of observations\n", + "n_obs = 5\n", + "\n", + "# IDs of individuals with observations (assuming these are 1-indexed and correspond to the order in 'ainv')\n", + "obs_ids = [3,4,5,6,7]\n", + "\n", + "# Phenotype values for these observations\n", + "y_values = [4.5, 2.9, 3.9, 3.5, 5.0]\n", + "\n", + "\n", + "# Solve the mixed model\n", + "solution_vector = gblup.solve_mixed_gblup_model(n_obs, obs_ids, y_values, A)\n", + "\n", + "print(\"Solution vector:\", solution_vector)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T08:35:28.902084100Z", + "start_time": "2024-03-25T08:35:28.849465700Z" + } + }, + "id": "b2cc4cc1ae75c9e2", + "execution_count": 30 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "True" + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.allclose(solution_vector, sol)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T08:35:34.146131Z", + "start_time": "2024-03-25T08:35:34.139378800Z" + } + }, + "id": "a70e3dd6866c165d", + "execution_count": 31 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def generate_synthetic_data(n_individuals, n_observations):\n", + " \"\"\"\n", + " Generate synthetic pedigree, observation IDs, and phenotype values.\n", + " \n", + " Parameters:\n", + " - n_individuals: Number of individuals in the pedigree.\n", + " - n_observations: Number of observations (must be <= n_individuals).\n", + " \n", + " Returns:\n", + " - ainv: Synthetic additive relationship matrix inverse for the pedigree.\n", + " - obs_ids: IDs of individuals with observations.\n", + " - y_values: Synthetic phenotype values for observations.\n", + " \"\"\"\n", + " # Generate a synthetic additive relationship matrix (A matrix) and invert it\n", + " # Here, we'll use an identity matrix for simplicity; in practice, A would be based on pedigree\n", + " A = np.eye(n_individuals)\n", + " ainv = np.linalg.inv(A + np.random.normal(0, 0.05, A.shape)) # Adding noise for a more realistic scenario\n", + " \n", + " # Generate observation IDs (random sample of individuals)\n", + " obs_ids = np.random.choice(range(0, n_individuals), n_observations, replace=False)\n", + " \n", + " # Generate synthetic phenotype values (normal distribution)\n", + " y_values = np.random.normal(10, 2, n_observations)\n", + " \n", + " return ainv, obs_ids, y_values\n", + "\n", + "# Parameters for synthetic data generation\n", + "n_individuals = 6000 # Number of individuals in the pedigree\n", + "n_observations = 3000 # Number of observations\n", + "\n", + "# Generate synthetic data\n", + "ainv, obs_ids, y_values = generate_synthetic_data(n_individuals, n_observations)\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T08:41:54.236514800Z", + "start_time": "2024-03-25T08:41:49.193187800Z" + } + }, + "id": "640368833e3eb822", + "execution_count": 32 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "array([[ 9.65855164],\n [ 26.53936216],\n [ -3.45564058],\n ...,\n [ -1.37455656],\n [ 14.0913003 ],\n [-18.79276052]])" + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "solution_vector = gblup.solve_mixed_gblup_model(n_observations, obs_ids, y_values, ainv)\n", + "solution_vector" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-25T08:42:03.490882200Z", + "start_time": "2024-03-25T08:41:55.848715200Z" + } + }, + "id": "17d9a37deccf8a55", + "execution_count": 33 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/gblup.py b/gblup.py new file mode 100644 index 0000000..0c67316 --- /dev/null +++ b/gblup.py @@ -0,0 +1,36 @@ +def solve_mixed_gblup_model(n_obs, obs_ids, y_values, a): + import numpy as np + ainv = np.linalg.inv(np.array(a)) + + + # Fixed effects: Assuming only a mean effect for simplicity + X = np.ones((n_obs, 1)) + + # Random effects design matrix Z based on observation IDs + Z = np.zeros( + (n_obs, ainv.shape[0])) # Z should have a row for each observation and a column for each individual in ainv + for i, obs_id in enumerate(obs_ids): + Z[i, obs_id] = 1 # Assuming obs_ids are 0-indexed and match the order in ainv + + # Observations + y = np.array(y_values).reshape(-1, 1) + + # Matrix operations + XX = X.T @ X + XZ = X.T @ Z + ZX = Z.T @ X + ZZ = Z.T @ Z + ZZA = ZZ + ainv * 2 # Adjust based on your specific model + Xy = X.T @ y + Zy = Z.T @ y + + # Assemble mixed model equations (MME) + mme = np.vstack([ + np.hstack([XX, XZ]), + np.hstack([ZX, ZZA]) + ]) + + # Solve the MME for solution vector + sol = np.linalg.solve(mme, np.vstack([Xy, Zy])) + + return sol \ No newline at end of file diff --git a/geonomic_kinship_test.ipynb b/geonomic_kinship_test.ipynb deleted file mode 100644 index 7dc6cfb..0000000 --- a/geonomic_kinship_test.ipynb +++ /dev/null @@ -1,356 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "initial_id", - "metadata": { - "collapsed": true, - "ExecuteTime": { - "end_time": "2024-03-23T00:51:49.603854400Z", - "start_time": "2024-03-23T00:51:45.861543100Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Index: 2763 entries, 112 to 2396\n", - "Columns: 9642 entries, Genotype to 10_149535062_G\n", - "dtypes: int64(9641), object(1)\n", - "memory usage: 203.3+ MB\n", - "None\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import time\n", - "import rrBLUP\n", - "df_geno = pd.read_csv('data/23TC1_YLD14_SU_geno_final.txt', sep='\\t')\n", - "df_geno = df_geno.sort_values(by=['Genotype'])\n", - "print(df_geno.info())\n", - "df_pheno = pd.read_csv(\"data/23TC1_YLD14_SU_pheno_final.txt\", sep='\\t')\n" - ] - }, - { - "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Index: 2763 entries, 1929 to 179\n", - "Data columns (total 2 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 Genotype 2763 non-null object \n", - " 1 YLD14_SU 2763 non-null float64\n", - "dtypes: float64(1), object(1)\n", - "memory usage: 64.8+ KB\n", - "None\n", - "(2763, 9641)\n", - "(2763, 1)\n" - ] - } - ], - "source": [ - "\n", - "df_pheno_ordered = df_pheno.sort_values(by=['Genotype'])\n", - "print(df_pheno_ordered.info())\n", - "train_x = np.array(df_geno.drop('Genotype', axis=1))\n", - "train_y = np.array(df_pheno_ordered['YLD14_SU']).reshape(-1, 1)\n", - "train_x = train_x - 1\n", - "print(train_x.shape)\n", - "print(train_y.shape)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-23T00:52:01.713868700Z", - "start_time": "2024-03-23T00:52:01.567439200Z" - } - }, - "id": "4ea00660ee17a7da", - "execution_count": 2 - }, - { - "cell_type": "code", - "outputs": [], - "source": [ - "a_1 = rrBLUP.A_mat(train_x)\n", - "a_1" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-23T00:53:45.474818Z", - "start_time": "2024-03-23T00:52:44.054145500Z" - } - }, - "id": "1f778cf653c205a5", - "execution_count": 3 - }, - { - "cell_type": "code", - "outputs": [ - { - "data": { - "text/plain": "numpy.ndarray" - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(a_1)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-23T01:04:41.642221Z", - "start_time": "2024-03-23T01:04:41.624443300Z" - } - }, - "id": "eb6e2174ae759be4", - "execution_count": 22 - }, - { - "cell_type": "code", - "outputs": [], - "source": [ - "import aMat\n", - "a_2 = aMat.add_mat_numpy(train_x+1)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-23T01:10:23.791477300Z", - "start_time": "2024-03-23T01:10:22.853119Z" - } - }, - "id": "22084ea87f9364b1", - "execution_count": 27 - }, - { - "cell_type": "code", - "outputs": [ - { - "data": { - "text/plain": "numpy.ndarray" - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import a_mat_jax\n", - "import jax\n", - "a_3 = a_mat_jax.add_mat_jax(train_x+1)\n", - "a_3 = np.asarray(a_3)\n", - "type(a_3)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-23T01:16:10.802205200Z", - "start_time": "2024-03-23T01:16:10.233960Z" - } - }, - "id": "1ed66c383a884e0e", - "execution_count": 38 - }, - { - "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Arrays are close: True\n" - ] - } - ], - "source": [ - "# Using allclose to compare\n", - "close = np.allclose(a_1, a_2, rtol=0.0001, atol=0.0001)\n", - "\n", - "print(f\"Arrays are close: {close}\")" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-23T01:10:45.248073700Z", - "start_time": "2024-03-23T01:10:45.146571300Z" - } - }, - "id": "bdbcc53bb039085a", - "execution_count": 30 - }, - { - "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running Time: 17.74 s\n" - ] - } - ], - "source": [ - "start_time = time.time()\n", - "g_result = rrBLUP.mixed_solve(y = train_y, K = a_1)\n", - "end_time = time.time()\n", - "#print(g_result)\n", - "print('Running Time: '+str(round(end_time-start_time, 2))+' s')\n", - "train_pred = g_result['u'] " - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-23T01:01:35.992522700Z", - "start_time": "2024-03-23T01:01:18.250576300Z" - } - }, - "id": "d32dd35631566b26", - "execution_count": 15 - }, - { - "cell_type": "code", - "outputs": [ - { - "data": { - "text/plain": "array([[-17.59836984],\n [-20.95136377],\n [ -0.13633499],\n ...,\n [ -4.65200707],\n [-20.01554517],\n [-20.1281834 ]])" - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_pred" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-23T01:01:45.967475Z", - "start_time": "2024-03-23T01:01:45.954682900Z" - } - }, - "id": "3d0c303c9230d183", - "execution_count": 16 - }, - { - "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "K not positive semi-definite.\n", - "Running Time: 8.41 s\n" - ] - }, - { - "ename": "TypeError", - "evalue": "'NoneType' object is not subscriptable", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mTypeError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[39], line 6\u001B[0m\n\u001B[0;32m 4\u001B[0m \u001B[38;5;66;03m#print(g_result)\u001B[39;00m\n\u001B[0;32m 5\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRunning Time: \u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m+\u001B[39m\u001B[38;5;28mstr\u001B[39m(\u001B[38;5;28mround\u001B[39m(end_time\u001B[38;5;241m-\u001B[39mstart_time, \u001B[38;5;241m2\u001B[39m))\u001B[38;5;241m+\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m s\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\u001B[1;32m----> 6\u001B[0m train_pred_new \u001B[38;5;241m=\u001B[39m \u001B[43mg_result_new\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mu\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m \n\u001B[0;32m 7\u001B[0m train_pred_new\n", - "\u001B[1;31mTypeError\u001B[0m: 'NoneType' object is not subscriptable" - ] - } - ], - "source": [ - "start_time = time.time()\n", - "g_result_new = rrBLUP.mixed_solve(y = train_y, K = a_3)\n", - "end_time = time.time()\n", - "#print(g_result)\n", - "print('Running Time: '+str(round(end_time-start_time, 2))+' s')\n", - "train_pred_new = g_result_new['u'] \n", - "train_pred_new" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-23T01:16:26.084069700Z", - "start_time": "2024-03-23T01:16:17.660955900Z" - } - }, - "id": "557f3d43c301ad45", - "execution_count": 39 - }, - { - "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "import jax.numpy as jnp\n", - "import numpy as np\n", - "\n", - "# Create a JAX array\n", - "jax_array = jnp.array([1, 2, 3])\n", - "\n", - "# Convert to a NumPy array\n", - "numpy_array = np.asarray(jax_array)\n", - "\n", - "print(type(numpy_array)) # \n" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-23T01:15:38.449024800Z", - "start_time": "2024-03-23T01:15:38.443083500Z" - } - }, - "id": "2fdea088f03fd77", - "execution_count": 37 - }, - { - "cell_type": "code", - "outputs": [], - "source": [], - "metadata": { - "collapsed": false - }, - "id": "79a0fb4f44e8b0df" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 1a48a4acc4c115212e90bf9853a75629d025a9e8 Mon Sep 17 00:00:00 2001 From: zhanglei Date: Tue, 26 Mar 2024 16:21:04 +0800 Subject: [PATCH 5/6] gblup and arseml test using gblup --- arseml_geonomic_kinship_test.ipynb | 230 +++++---------- asreml_gblup.ipynb | 455 +++++++++++++++++++++++++++++ blup_animal.ipynb | 28 +- gblup.py | 106 +++++++ 4 files changed, 664 insertions(+), 155 deletions(-) create mode 100644 asreml_gblup.ipynb diff --git a/arseml_geonomic_kinship_test.ipynb b/arseml_geonomic_kinship_test.ipynb index b62700a..cde0c6d 100644 --- a/arseml_geonomic_kinship_test.ipynb +++ b/arseml_geonomic_kinship_test.ipynb @@ -2,13 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-03-25T00:43:12.613827100Z", - "start_time": "2024-03-25T00:43:00.090048600Z" + "end_time": "2024-03-26T01:25:28.301180100Z", + "start_time": "2024-03-26T01:25:25.491502Z" } }, "outputs": [ @@ -77,12 +77,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T00:43:14.833783200Z", - "start_time": "2024-03-25T00:43:14.642438500Z" + "end_time": "2024-03-26T01:25:28.446814900Z", + "start_time": "2024-03-26T01:25:28.298414Z" } }, "id": "4ea00660ee17a7da", - "execution_count": 2 + "execution_count": 6 }, { "cell_type": "code", @@ -91,7 +91,7 @@ "data": { "text/plain": "array([[ 0.96460593, 0.40092433, 0.25768902, ..., -0.30201703,\n -0.23842051, -0.23847136],\n [ 0.40092433, 0.81633067, 0.42894227, ..., -0.22019376,\n -0.27356791, -0.27299987],\n [ 0.25768902, 0.42894227, 0.75699359, ..., -0.24150724,\n -0.2534156 , -0.25408534],\n ...,\n [-0.30201703, -0.22019376, -0.24150724, ..., 0.70114823,\n 0.31388157, 0.31383073],\n [-0.23842051, -0.27356791, -0.2534156 , ..., 0.31388157,\n 0.89487222, 0.89420248],\n [-0.23847136, -0.27299987, -0.25408534, ..., 0.31383073,\n 0.89420248, 0.89569887]])" }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -103,12 +103,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T00:43:53.731595900Z", - "start_time": "2024-03-25T00:43:16.866956900Z" + "end_time": "2024-03-26T01:26:14.165137100Z", + "start_time": "2024-03-26T01:25:28.446814900Z" } }, "id": "1f778cf653c205a5", - "execution_count": 3 + "execution_count": 7 }, { "cell_type": "code", @@ -117,7 +117,7 @@ "data": { "text/plain": "numpy.ndarray" }, - "execution_count": 52, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -128,12 +128,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-23T01:35:43.596344300Z", - "start_time": "2024-03-23T01:35:43.592192800Z" + "end_time": "2024-03-26T01:26:14.178057500Z", + "start_time": "2024-03-26T01:26:14.165137100Z" } }, "id": "eb6e2174ae759be4", - "execution_count": 52 + "execution_count": 8 }, { "cell_type": "code", @@ -145,12 +145,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T00:44:38.861429500Z", - "start_time": "2024-03-25T00:44:37.795651Z" + "end_time": "2024-03-26T01:26:15.714776700Z", + "start_time": "2024-03-26T01:26:14.178057500Z" } }, "id": "22084ea87f9364b1", - "execution_count": 5 + "execution_count": 9 }, { "cell_type": "code", @@ -159,7 +159,7 @@ "data": { "text/plain": "numpy.ndarray" }, - "execution_count": 53, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -174,12 +174,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-23T01:35:48.457885400Z", - "start_time": "2024-03-23T01:35:47.864412600Z" + "end_time": "2024-03-26T01:26:23.285849200Z", + "start_time": "2024-03-26T01:26:15.721070900Z" } }, "id": "1ed66c383a884e0e", - "execution_count": 53 + "execution_count": 10 }, { "cell_type": "code", @@ -201,12 +201,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-23T01:35:50.365137700Z", - "start_time": "2024-03-23T01:35:50.287741600Z" + "end_time": "2024-03-26T01:26:23.456729100Z", + "start_time": "2024-03-26T01:26:23.285849200Z" } }, "id": "bdbcc53bb039085a", - "execution_count": 54 + "execution_count": 11 }, { "cell_type": "code", @@ -215,18 +215,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Running Time: 20.03 s\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'dict' object has no attribute 'rx2'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mAttributeError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[4], line 7\u001B[0m\n\u001B[0;32m 5\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRunning Time: \u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m+\u001B[39m\u001B[38;5;28mstr\u001B[39m(\u001B[38;5;28mround\u001B[39m(end_time\u001B[38;5;241m-\u001B[39mstart_time, \u001B[38;5;241m2\u001B[39m))\u001B[38;5;241m+\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m s\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\u001B[0;32m 6\u001B[0m \u001B[38;5;66;03m#train_pred = g_result['u'] \u001B[39;00m\n\u001B[1;32m----> 7\u001B[0m train_pred \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39marray(\u001B[43mg_result\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrx2\u001B[49m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mu\u001B[39m\u001B[38;5;124m'\u001B[39m))\n", - "\u001B[1;31mAttributeError\u001B[0m: 'dict' object has no attribute 'rx2'" + "Running Time: 232.65 s\n" ] } ], @@ -242,12 +231,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T00:44:26.308267100Z", - "start_time": "2024-03-25T00:44:05.897407300Z" + "end_time": "2024-03-26T01:30:16.134754800Z", + "start_time": "2024-03-26T01:26:23.456729100Z" } }, "id": "d32dd35631566b26", - "execution_count": 4 + "execution_count": 12 }, { "cell_type": "code", @@ -256,7 +245,7 @@ "data": { "text/plain": "array([[689.5701105],\n [615.9386355],\n [661.3197776],\n ...,\n [692.8649345],\n [709.3628695],\n [684.4678623]])" }, - "execution_count": 39, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -267,12 +256,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T06:34:24.962192500Z", - "start_time": "2024-03-25T06:34:24.951436400Z" + "end_time": "2024-03-26T01:30:16.135764Z", + "start_time": "2024-03-26T01:30:16.125907400Z" } }, "id": "3d0c303c9230d183", - "execution_count": 39 + "execution_count": 13 }, { "cell_type": "code", @@ -281,14 +270,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'Vu': 462.6130366195777, 'Ve': 969.4607183018168, 'beta': array([[703.72274814]]), 'u': array([[-17.58200532],\n", - " [-21.2687656 ],\n", - " [ -0.33706766],\n", + "{'Vu': 462.61305045468987, 'Ve': 974.0868459500658, 'beta': array([[703.72274814]]), 'u': array([[-17.59836984],\n", + " [-20.95136378],\n", + " [ -0.13633498],\n", " ...,\n", - " [ -4.68147966],\n", - " [-19.89370123],\n", - " [-20.12403582]]), 'LL': -13572.19170915865}\n", - "Running Time: 17.25 s\n" + " [ -4.65200707],\n", + " [-20.01554518],\n", + " [-20.12818341]]), 'LL': -13572.19170879463}\n", + "Running Time: 37.24 s\n" ] } ], @@ -304,12 +293,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T06:52:04.948507200Z", - "start_time": "2024-03-25T06:51:47.691501200Z" + "end_time": "2024-03-26T01:30:53.386672Z", + "start_time": "2024-03-26T01:30:16.135764Z" } }, "id": "557f3d43c301ad45", - "execution_count": 64 + "execution_count": 14 }, { "cell_type": "code", @@ -318,7 +307,7 @@ "data": { "text/plain": "array([[-17.59836984],\n [-20.95136378],\n [ -0.13633498],\n ...,\n [ -4.65200707],\n [-20.01554518],\n [-20.12818341]])" }, - "execution_count": 7, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -329,21 +318,25 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T00:45:22.463225400Z", - "start_time": "2024-03-25T00:45:22.454325500Z" + "end_time": "2024-03-26T01:30:53.404147700Z", + "start_time": "2024-03-26T01:30:53.386672Z" } }, "id": "2fdea088f03fd77", - "execution_count": 7 + "execution_count": 15 }, { "cell_type": "code", "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Correlation Coefficient (相关系数): 0.9999505295951803\n" + "ename": "TypeError", + "evalue": "solve_mixed_gblup_model() got an unexpected keyword argument 'ainv'", + "output_type": "error", + "traceback": [ + "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[1;31mTypeError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[1;32mIn[16], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mgblup\u001B[39;00m\n\u001B[1;32m----> 2\u001B[0m g_result \u001B[38;5;241m=\u001B[39m \u001B[43mgblup\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msolve_mixed_gblup_model\u001B[49m\u001B[43m(\u001B[49m\u001B[43mn_obs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43mlen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mtrain_y\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mobs_ids\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m[\u001B[49m\u001B[43mi\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mfor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mi\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01min\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;28;43mrange\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mlen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mtrain_y\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43my_values\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtrain_y\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mainv\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mnp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mlinalg\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43minv\u001B[49m\u001B[43m(\u001B[49m\u001B[43ma_2\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 3\u001B[0m gblue_values \u001B[38;5;241m=\u001B[39m g_result[\u001B[38;5;241m1\u001B[39m:,]\n\u001B[0;32m 4\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mscipy\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mstats\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m pearsonr\n", + "\u001B[1;31mTypeError\u001B[0m: solve_mixed_gblup_model() got an unexpected keyword argument 'ainv'" ] } ], @@ -363,61 +356,32 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T08:34:01.640160900Z", - "start_time": "2024-03-25T08:34:00.704205500Z" + "end_time": "2024-03-26T01:31:01.081182100Z", + "start_time": "2024-03-26T01:30:53.400772100Z" } }, "id": "173e3d457a409f8e", - "execution_count": 76 + "execution_count": 16 }, { "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Online License checked out Mon Mar 25 13:40:58 2024\n" - ] - } - ], + "outputs": [], "source": [ "asreml = importr('asreml')" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T05:40:58.749306200Z", - "start_time": "2024-03-25T05:40:56.371246400Z" + "end_time": "2024-03-26T01:31:01.091405100Z", + "start_time": "2024-03-26T01:31:01.086189800Z" } }, "id": "79a0fb4f44e8b0df", - "execution_count": 8 + "execution_count": null }, { "cell_type": "code", - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "R[write to console]: Reciprocal conditional number for original matrix is: 1.59179018378963e-06\n", - "\n", - "R[write to console]: Reciprocal conditional number for inverted matrix is: 1.24079893155397e-06\n", - "\n", - "R[write to console]: Inverse of matrix G does not appear to be ill-conditioned.\n", - "\n" - ] - }, - { - "data": { - "text/plain": "array([[ 1.00000000e+00, 1.00000000e+00, 1.04168312e+01],\n [ 2.00000000e+00, 1.00000000e+00, 5.02454510e-01],\n [ 2.00000000e+00, 2.00000000e+00, 2.62023822e+01],\n ...,\n [ 2.76300000e+03, 2.76100000e+03, -2.16311000e-03],\n [ 2.76300000e+03, 2.76200000e+03, -2.17669057e+01],\n [ 2.76300000e+03, 2.76300000e+03, 2.63256497e+01]])" - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from rpy2.robjects import r, pandas2ri\n", "# Transfer G_matrix to R\n", @@ -434,12 +398,11 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T06:48:06.249000300Z", - "start_time": "2024-03-25T06:47:55.518842300Z" + "start_time": "2024-03-26T01:31:01.089831300Z" } }, "id": "b3e7ef8025b116", - "execution_count": 55 + "execution_count": null }, { "cell_type": "code", @@ -451,12 +414,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T06:17:55.988217Z", - "start_time": "2024-03-25T06:17:55.974047700Z" + "end_time": "2024-03-26T01:31:01.091405100Z", + "start_time": "2024-03-26T01:31:01.091405100Z" } }, "id": "acabf46be808b8ab", - "execution_count": 18 + "execution_count": null }, { "cell_type": "code", @@ -475,27 +438,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T06:49:06.121751300Z", - "start_time": "2024-03-25T06:48:43.097036700Z" + "start_time": "2024-03-26T01:31:01.091405100Z" } }, "id": "5e0bb540a4a4c72c", - "execution_count": 56 + "execution_count": null }, { "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " component std.error z.ratio bound %ch\n", - "vm(Genotype, G_matrix_inv) 5.972789e-06 NA NA B NA\n", - "units!units 1.170023e+03 31.48448 37.16189 P 0\n", - "units!R 1.000000e+00 NA NA F 0\n" - ] - } - ], + "outputs": [], "source": [ "summ = asreml.summary_asreml(modg)\n", "print(summ.rx2('varcomp'))" @@ -503,12 +454,11 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T06:20:01.616208300Z", - "start_time": "2024-03-25T06:20:01.609791200Z" + "start_time": "2024-03-26T01:31:01.091405100Z" } }, "id": "305ddc385d71eed4", - "execution_count": 21 + "execution_count": null }, { "cell_type": "code", @@ -519,25 +469,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T06:50:25.449294Z", - "start_time": "2024-03-25T06:50:25.444424600Z" + "start_time": "2024-03-26T01:31:01.101914600Z" } }, "id": "5b5a60ff985757ff", - "execution_count": 58 + "execution_count": null }, { "cell_type": "code", - "outputs": [ - { - "data": { - "text/plain": "array([[-17.5657198 ],\n [-21.58628211],\n [ -0.53769809],\n ...,\n [ -4.71096706],\n [-19.77202265],\n [-20.12006455]])" - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "close = np.allclose(rrblup_u, asreml_u, rtol=0.00001, atol=0.00001)\n", "\n" @@ -545,24 +485,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T06:51:13.081435600Z", - "start_time": "2024-03-25T06:51:13.077258400Z" + "start_time": "2024-03-26T01:31:01.101914600Z" } }, "id": "e2d6659412f49f53", - "execution_count": 63 + "execution_count": null }, { "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Correlation Coefficient (相关系数): 0.9999217812834612\n" - ] - } - ], + "outputs": [], "source": [ "from scipy.stats import pearsonr\n", "\n", @@ -576,12 +507,11 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T06:55:01.965913100Z", - "start_time": "2024-03-25T06:55:01.955398600Z" + "start_time": "2024-03-26T01:31:01.101914600Z" } }, "id": "d0553370feb1b95f", - "execution_count": 70 + "execution_count": null } ], "metadata": { diff --git a/asreml_gblup.ipynb b/asreml_gblup.ipynb new file mode 100644 index 0000000..065261e --- /dev/null +++ b/asreml_gblup.ipynb @@ -0,0 +1,455 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 29, + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-03-26T06:41:29.984856Z", + "start_time": "2024-03-26T06:41:28.523504300Z" + } + }, + "outputs": [], + "source": [ + "import pyreadr\n", + "import rpy2.robjects as ro\n", + "from rpy2.robjects import pandas2ri\n", + "from rpy2.robjects.packages import importr\n", + "\n", + "# Enable automatic conversion between pandas dataframes and R data frames\n", + "pandas2ri.activate()\n", + "\n", + "# Import the R packages needed\n", + "base = importr('base')\n", + "asreml = importr('asreml')\n", + "\n", + "# Load your data from .rds files using pyreadr\n", + "ped_result = pyreadr.read_r('data/pedigree_19_23P_1031.rds')\n", + "blue_result = pyreadr.read_r('data/blue_yield.rds')\n", + "\n", + "# Extracting the data frames from the results\n", + "ped_df = ped_result[None] # Assuming it's the first and only item in the result\n", + "blue_df = blue_result[None] # Assuming it's the first and only item in the result\n", + "\n", + "# Convert pandas dataframes to R data frames and put them into the R environment\n", + "ro.globalenv['ped'] = pandas2ri.py2rpy(ped_df)\n", + "ro.globalenv['blue'] = pandas2ri.py2rpy(blue_df)\n", + "\n" + ] + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " Name predicted.value std.error status Block DAM \\\n0 23TC2XM3066 817.157834 55.459393 Estimable 2 ZMN00545 \n1 23TC2XM4004 736.076043 55.698624 Estimable 2 20W041 \n2 D178070A 369.039247 46.982307 Estimable 6 ZMN00824 \n3 D178070A 358.845544 39.343070 Estimable 3 ZMN00824 \n4 D178070A 525.068916 54.857062 Estimable 2 ZMN00824 \n... ... ... ... ... ... ... \n2779 ZD958 268.765362 15.973387 Estimable 3 ZMN00113 \n2780 ZD958 510.696306 21.263064 Estimable 1 ZMN00113 \n2781 ZD958 314.059170 13.043818 Estimable 4 ZMN00113 \n2782 ZD958 333.884811 20.252772 Estimable 7 ZMN00113 \n2783 ZD958 300.026914 21.196859 Estimable 5 ZMN00113 \n\n SIRE Fam \n0 ZMN00392 ZMN00392_ZMN00545 \n1 20W065-43-2 20W065-43-2_20W041 \n2 ZMN00547 ZMN00547_ZMN00824 \n3 ZMN00547 ZMN00547_ZMN00824 \n4 ZMN00547 ZMN00547_ZMN00824 \n... ... ... \n2779 ZMN00337 ZMN00337_ZMN00113 \n2780 ZMN00337 ZMN00337_ZMN00113 \n2781 ZMN00337 ZMN00337_ZMN00113 \n2782 ZMN00337 ZMN00337_ZMN00113 \n2783 ZMN00337 ZMN00337_ZMN00113 \n\n[2784 rows x 8 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Namepredicted.valuestd.errorstatusBlockDAMSIREFam
023TC2XM3066817.15783455.459393Estimable2ZMN00545ZMN00392ZMN00392_ZMN00545
123TC2XM4004736.07604355.698624Estimable220W04120W065-43-220W065-43-2_20W041
2D178070A369.03924746.982307Estimable6ZMN00824ZMN00547ZMN00547_ZMN00824
3D178070A358.84554439.343070Estimable3ZMN00824ZMN00547ZMN00547_ZMN00824
4D178070A525.06891654.857062Estimable2ZMN00824ZMN00547ZMN00547_ZMN00824
...........................
2779ZD958268.76536215.973387Estimable3ZMN00113ZMN00337ZMN00337_ZMN00113
2780ZD958510.69630621.263064Estimable1ZMN00113ZMN00337ZMN00337_ZMN00113
2781ZD958314.05917013.043818Estimable4ZMN00113ZMN00337ZMN00337_ZMN00113
2782ZD958333.88481120.252772Estimable7ZMN00113ZMN00337ZMN00337_ZMN00113
2783ZD958300.02691421.196859Estimable5ZMN00113ZMN00337ZMN00337_ZMN00113
\n

2784 rows × 8 columns

\n
" + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "blue_df" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-26T06:41:29.996886300Z", + "start_time": "2024-03-26T06:41:29.985856400Z" + } + }, + "id": "fe6c64dc27d80b60", + "execution_count": 30 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " Name FGenoID MGenoID\nrownames \n2 DMY1F 0 0\n3 DH605F 0 0\n4 PHHJC 0 0\n5 PH4CV 0 0\n6 ZMN01635 0 0\n... ... ... ...\n2537 D2284159 ZMN01648 ZMN00735\n2559 D2284296 ZMN01705 ZMN00735\n2680 D2284308 ZMN01707 ZMN00735\n27721 D2284109 ZMN01638 ZMN00735\n35432 D2284180 ZMN01655 ZMN00735\n\n[10310 rows x 3 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
NameFGenoIDMGenoID
rownames
2DMY1F00
3DH605F00
4PHHJC00
5PH4CV00
6ZMN0163500
............
2537D2284159ZMN01648ZMN00735
2559D2284296ZMN01705ZMN00735
2680D2284308ZMN01707ZMN00735
27721D2284109ZMN01638ZMN00735
35432D2284180ZMN01655ZMN00735
\n

10310 rows × 3 columns

\n
" + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ped_df" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-26T06:41:30.007435900Z", + "start_time": "2024-03-26T06:41:29.995886600Z" + } + }, + "id": "3300b0df5fd114d0", + "execution_count": 31 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pedigree order (P2): Individual \"WG603\" moved to record 1\n", + "Pedigree order (P1): Individual \"PH4CV\" moved to record 1\n", + "Pedigree order (P2): Individual \"WG5603\" moved to record 2\n" + ] + } + ], + "source": [ + "# Convert factors as in your R code\n", + "ro.r('''\n", + "ainv <- asreml::ainverse(ped)\n", + "blue$Name <- as.factor(blue$Name)\n", + "blue <- merge(blue, ped, by = \"Name\")\n", + "asr2 <- asreml::asreml(predicted.value ~ 1,\n", + " random = ~ vm(Name, ainv),\n", + " residual = ~ idv(units),\n", + " data = blue)\n", + "''')\n", + "\n", + "# To extract results from the asr2 model in R to Python\n", + "varcomp = ro.r('summary(asr2)$varcomp')\n", + "g = ro.r('coef(asr2)$random')\n", + "\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-26T06:41:30.341690500Z", + "start_time": "2024-03-26T06:41:30.005436100Z" + } + }, + "id": "47f01cf2234a3eb8", + "execution_count": 32 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " Name Value\nvm(Name, ainv)_PH4CV PH4CV 0.000000\nvm(Name, ainv)_WG5603 WG5603 0.000000\nvm(Name, ainv)_WG603 WG603 0.000000\nvm(Name, ainv)_DMY1F DMY1F 0.000000\nvm(Name, ainv)_DH605F DH605F -23.800054\n... ... ...\nvm(Name, ainv)_D2284159 D2284159 16.686925\nvm(Name, ainv)_D2284296 D2284296 14.130782\nvm(Name, ainv)_D2284308 D2284308 16.686925\nvm(Name, ainv)_D2284109 D2284109 16.201840\nvm(Name, ainv)_D2284180 D2284180 16.686925\n\n[10310 rows x 2 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
NameValue
vm(Name, ainv)_PH4CVPH4CV0.000000
vm(Name, ainv)_WG5603WG56030.000000
vm(Name, ainv)_WG603WG6030.000000
vm(Name, ainv)_DMY1FDMY1F0.000000
vm(Name, ainv)_DH605FDH605F-23.800054
.........
vm(Name, ainv)_D2284159D228415916.686925
vm(Name, ainv)_D2284296D228429614.130782
vm(Name, ainv)_D2284308D228430816.686925
vm(Name, ainv)_D2284109D228410916.201840
vm(Name, ainv)_D2284180D228418016.686925
\n

10310 rows × 2 columns

\n
" + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Converting R objects back to pandas DataFrames or numpy arrays if needed\n", + "import pandas as pd\n", + "# Extract row names from the R object\n", + "g_row_names = ro.r('rownames(coef(asr2)$random)')\n", + "\n", + "\n", + "g_df = pd.DataFrame(g)\n", + "\n", + "# Convert the row names to a Python list\n", + "g_row_names_py = list(g_row_names)\n", + "\n", + "# Assign the row names to the pandas DataFrame index\n", + "g_df.index = g_row_names_py\n", + "g_df['Name'] = [name.split('_')[1] for name in g_df.index]\n", + "\n", + "g_df.columns = ['Value', 'Name']\n", + "g_df = g_df[['Name', 'Value']]\n", + "g_df" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-26T06:41:30.398371500Z", + "start_time": "2024-03-26T06:41:30.340685100Z" + } + }, + "id": "386282876cd5ef83", + "execution_count": 33 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pedigree order (P2): Individual \"WG603\" moved to record 1\n", + "Pedigree order (P1): Individual \"PH4CV\" moved to record 1\n", + "Pedigree order (P2): Individual \"WG5603\" moved to record 2\n" + ] + }, + { + "data": { + "text/plain": " Name Value\nvm(Name, ainv)_PH4CV PH4CV 0.000000\nvm(Name, ainv)_WG5603 WG5603 0.000000\nvm(Name, ainv)_WG603 WG603 0.000000\nvm(Name, ainv)_DMY1F DMY1F 0.000000\nvm(Name, ainv)_DH605F DH605F -23.800054\n... ... ...\nvm(Name, ainv)_D2284159 D2284159 16.686925\nvm(Name, ainv)_D2284296 D2284296 14.130782\nvm(Name, ainv)_D2284308 D2284308 16.686925\nvm(Name, ainv)_D2284109 D2284109 16.201840\nvm(Name, ainv)_D2284180 D2284180 16.686925\n\n[10310 rows x 2 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
NameValue
vm(Name, ainv)_PH4CVPH4CV0.000000
vm(Name, ainv)_WG5603WG56030.000000
vm(Name, ainv)_WG603WG6030.000000
vm(Name, ainv)_DMY1FDMY1F0.000000
vm(Name, ainv)_DH605FDH605F-23.800054
.........
vm(Name, ainv)_D2284159D228415916.686925
vm(Name, ainv)_D2284296D228429614.130782
vm(Name, ainv)_D2284308D228430816.686925
vm(Name, ainv)_D2284109D228410916.201840
vm(Name, ainv)_D2284180D228418016.686925
\n

10310 rows × 2 columns

\n
" + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import gblup\n", + "gblup.run_asreml_ped_gblup(ped_df,blue_df)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-26T06:41:30.876737300Z", + "start_time": "2024-03-26T06:41:30.398371500Z" + } + }, + "id": "d30dc3ae150a2065", + "execution_count": 34 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " marker_1 marker_2 marker_3 marker_4 marker_5 marker_6 \\\nIID_1 1 2 0 0 1 2 \nIID_2 2 0 0 2 0 2 \nIID_3 2 2 1 2 0 0 \nIID_4 0 1 0 0 0 1 \nIID_5 0 2 1 2 1 2 \n... ... ... ... ... ... ... \nIID_5996 1 1 1 1 2 1 \nIID_5997 1 1 2 2 0 1 \nIID_5998 2 2 1 0 2 1 \nIID_5999 2 2 1 2 0 0 \nIID_6000 2 2 1 0 2 2 \n\n marker_7 marker_8 marker_9 marker_10 ... marker_8357 \\\nIID_1 1 2 0 1 ... 0 \nIID_2 1 1 2 1 ... 2 \nIID_3 1 2 1 0 ... 0 \nIID_4 0 1 0 1 ... 2 \nIID_5 0 2 0 2 ... 1 \n... ... ... ... ... ... ... \nIID_5996 0 1 0 2 ... 2 \nIID_5997 1 1 2 0 ... 0 \nIID_5998 1 1 1 1 ... 2 \nIID_5999 1 0 0 0 ... 0 \nIID_6000 0 1 1 0 ... 2 \n\n marker_8358 marker_8359 marker_8360 marker_8361 marker_8362 \\\nIID_1 2 0 0 0 1 \nIID_2 2 2 0 2 1 \nIID_3 2 1 1 0 0 \nIID_4 2 0 0 2 1 \nIID_5 0 2 1 1 2 \n... ... ... ... ... ... \nIID_5996 1 1 0 1 2 \nIID_5997 1 2 0 1 0 \nIID_5998 0 1 2 0 2 \nIID_5999 2 1 0 1 0 \nIID_6000 1 0 1 0 1 \n\n marker_8363 marker_8364 marker_8365 marker_8366 \nIID_1 0 2 1 2 \nIID_2 2 0 2 1 \nIID_3 1 2 0 1 \nIID_4 1 1 1 0 \nIID_5 1 2 1 0 \n... ... ... ... ... \nIID_5996 0 2 1 1 \nIID_5997 2 1 1 2 \nIID_5998 0 2 1 1 \nIID_5999 1 1 2 2 \nIID_6000 2 2 2 0 \n\n[6000 rows x 8366 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
marker_1marker_2marker_3marker_4marker_5marker_6marker_7marker_8marker_9marker_10...marker_8357marker_8358marker_8359marker_8360marker_8361marker_8362marker_8363marker_8364marker_8365marker_8366
IID_11200121201...0200010212
IID_22002021121...2220212021
IID_32212001210...0211001201
IID_40100010101...2200211110
IID_50212120202...1021121210
..................................................................
IID_59961111210102...2110120211
IID_59971122011120...0120102112
IID_59982210211111...2012020211
IID_59992212001000...0210101122
IID_60002210220110...2101012220
\n

6000 rows × 8366 columns

\n
" + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "G_matrix = np.random.randint(0, 3, size=(6000, 8366))\n", + "\n", + "# Convert the numpy matrix to a pandas DataFrame for easier manipulation and visualization\n", + "G_matrix = pd.DataFrame(G_matrix)\n", + "\n", + "# Optionally, name the columns and index\n", + "G_matrix.columns = [f\"marker_{i+1}\" for i in range(G_matrix.shape[1])]\n", + "G_matrix.index = [f\"IID_{i+1}\" for i in range(G_matrix.shape[0])]\n", + "G_matrix" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-26T06:41:31.387574600Z", + "start_time": "2024-03-26T06:41:30.872935500Z" + } + }, + "id": "1c5098ff572f68de", + "execution_count": 35 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " marker_1 marker_2 marker_3 marker_4 marker_5 marker_6 marker_7 \\\nIID_1 1 2 0 0 1 2 1 \nIID_2 2 0 0 2 0 2 1 \nIID_3 2 2 1 2 0 0 1 \nIID_4 0 1 0 0 0 1 0 \nIID_5 0 2 1 2 1 2 0 \nIID_6 0 0 1 0 0 2 0 \n\n marker_8 marker_9 marker_10 ... marker_8357 marker_8358 \\\nIID_1 2 0 1 ... 0 2 \nIID_2 1 2 1 ... 2 2 \nIID_3 2 1 0 ... 0 2 \nIID_4 1 0 1 ... 2 2 \nIID_5 2 0 2 ... 1 0 \nIID_6 0 2 1 ... 2 1 \n\n marker_8359 marker_8360 marker_8361 marker_8362 marker_8363 \\\nIID_1 0 0 0 1 0 \nIID_2 2 0 2 1 2 \nIID_3 1 1 0 0 1 \nIID_4 0 0 2 1 1 \nIID_5 2 1 1 2 1 \nIID_6 0 2 1 0 0 \n\n marker_8364 marker_8365 marker_8366 \nIID_1 2 1 2 \nIID_2 0 2 1 \nIID_3 2 0 1 \nIID_4 1 1 0 \nIID_5 2 1 0 \nIID_6 1 0 2 \n\n[6 rows x 8366 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
marker_1marker_2marker_3marker_4marker_5marker_6marker_7marker_8marker_9marker_10...marker_8357marker_8358marker_8359marker_8360marker_8361marker_8362marker_8363marker_8364marker_8365marker_8366
IID_11200121201...0200010212
IID_22002021121...2220212021
IID_32212001210...0211001201
IID_40100010101...2200211110
IID_50212120202...1021121210
IID_60010020021...2102100102
\n

6 rows × 8366 columns

\n
" + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Pass the adjusted DataFrame into the R environment\n", + "ro.globalenv['G_matrix'] = pandas2ri.py2rpy(G_matrix)\n", + "\n", + "# Verify by printing the first few rows in R\n", + "# Ensure that the dimnames (row and column names) are correctly set in R\n", + "ro.r('''\n", + "dimnames(G_matrix) <- list(rownames(G_matrix), colnames(G_matrix))\n", + "''')\n", + "\n", + "ro.r('head(G_matrix)')" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-26T06:43:09.773274800Z", + "start_time": "2024-03-26T06:41:31.387574600Z" + } + }, + "id": "e025ceac8fe61ecf", + "execution_count": 36 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "blue_df = pd.DataFrame({\n", + " \"Name\": [\"IID_3\", \"IID_4\", \"IID_5\"],\n", + " \"Value\": [2.4, 2.4, 5.3]\n", + "})\n", + "\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-26T06:43:09.775274500Z", + "start_time": "2024-03-26T06:43:09.761671300Z" + } + }, + "id": "8a0429505eafc8d6", + "execution_count": 37 + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial data: \n", + "\tNumber of Individuals: 6000 \n", + "\tNumber of Markers: 8366 \n", + "\n", + "Missing data check: \n", + "\tTotal SNPs: 8366 \n", + "\t 0 SNPs dropped due to missing data threshold of 0.5 \n", + "\tTotal of: 8366 SNPs \n", + "\n", + "MAF check: \n", + "\tNo SNPs with MAF below 0 \n", + "\n", + "Heterozigosity data check: \n", + "\tNo SNPs with heterozygosity, missing threshold of = 0 \n", + "\n", + "Summary check: \n", + "\tInitial: 8366 SNPs \n", + "\tFinal: 8366 SNPs ( 0 SNPs removed) \n", + " \n", + "Completed! Time = 470.51 seconds \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "R[write to console]: Reciprocal conditional number for original matrix is: 5.75725494224797e-05\n", + "\n", + "R[write to console]: Reciprocal conditional number for inverted matrix is: 5.43301590950856e-05\n", + "\n", + "R[write to console]: Inverse of matrix G does not appear to be ill-conditioned.\n", + "\n" + ] + } + ], + "source": [ + "ro.globalenv['G_matrix'] = pandas2ri.py2rpy(G_matrix)\n", + "\n", + "# Verify by printing the first few rows in R\n", + "# Ensure that the dimnames (row and column names) are correctly set in R\n", + "ro.r('''\n", + "dimnames(G_matrix) <- list(rownames(G_matrix), colnames(G_matrix))\n", + "''')\n", + "\n", + "ro.globalenv['blue'] = pandas2ri.py2rpy(blue_df)\n", + "#ro.r('rownames(blue)')\n", + "ro.r('''\n", + "library(asreml)\n", + "library(ASRgenomics)\n", + "library(tidyverse)\n", + "G_matrix <- as.matrix(G_matrix)\n", + "Gmat = G.matrix(G_matrix)$G\n", + "rownames(Gmat) <- rownames(G_matrix)\n", + "colnames(Gmat) <- rownames(G_matrix)\n", + "diag(Gmat) = diag(Gmat) + 0.01\n", + "ginv = G.inverse(Gmat,sparseform = T)\n", + "ginv = ginv$Ginv.sparse\n", + "\n", + "attr(ginv,\"rowNames\") %>% head\n", + "\n", + "blue$Name <- as.factor(blue$Name)\n", + "\n", + "\n", + "asr2 <- asreml(Value ~ 1,\n", + " random = ~ vm(Name,ginv), \n", + " residual = ~idv(units),\n", + " data = blue)\n", + "summary(asr2)$varcomp\n", + "\n", + "''')\n", + "g = ro.r('coef(asr2)$random')\n", + "\n", + "g_row_names = ro.r('rownames(coef(asr2)$random)')\n", + "\n", + "g_df = pd.DataFrame(g)\n", + "\n", + "# Convert the row names to a Python list\n", + "g_row_names_py = list(g_row_names)\n", + "\n", + "# Assign the row names to the pandas DataFrame index\n", + "g_df.index = g_row_names_py\n", + "g_df['Name'] = [name.split('_')[1:] for name in g_df.index]\n", + "g_df.columns = ['Value', 'Name']\n", + "g_df = g_df[['Name', 'Value']]" + ], + "metadata": { + "collapsed": false, + "is_executing": true, + "ExecuteTime": { + "start_time": "2024-03-26T06:43:09.772278600Z" + } + }, + "id": "21d78c7525765a4b", + "execution_count": null + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "g_df" + ], + "metadata": { + "collapsed": false, + "is_executing": true + }, + "id": "7acf3d154cb5052c", + "execution_count": null + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import gblup\n", + "g_gblup = gblup.run_asreml_G_gblup(G_matrix,blue_df)\n", + "g_gblup" + ], + "metadata": { + "collapsed": false, + "is_executing": true + }, + "id": "4a3a8a5a537bdd5f", + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/blup_animal.ipynb b/blup_animal.ipynb index 680e41c..49c3ff9 100644 --- a/blup_animal.ipynb +++ b/blup_animal.ipynb @@ -2,13 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 35, + "execution_count": 46, "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-03-25T09:19:22.281546600Z", - "start_time": "2024-03-25T09:19:22.075735900Z" + "end_time": "2024-03-25T09:48:46.209035Z", + "start_time": "2024-03-25T09:48:46.009607200Z" } }, "outputs": [ @@ -25,6 +25,15 @@ " [0.25 0.25 0.25 0.5 0.5 0.25 1. 0.25 ]\n", " [0.25 0.25 0.5 0.125 0.375 0.5 0.25 1. ]]\n" ] + }, + { + "data": { + "text/plain": " ID SIRE DAM\n0 1 0 0\n1 2 0 0\n2 3 0 0\n3 4 1 0\n4 5 3 2\n5 6 1 2\n6 7 4 5\n7 8 3 6", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
IDSIREDAM
0100
1200
2300
3410
4532
5612
6745
7836
\n
" + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -53,6 +62,7 @@ "#ainv = np.array(r.solve(A))\n", "ainv = np.linalg.inv(A)\n", "print(np.array(A))\n", + "pd.DataFrame(ped_data)\n", "#np.allclose(ainv,e)" ] }, @@ -248,6 +258,14 @@ "execution_count": 33, "metadata": {}, "output_type": "execute_result" + }, + { + "data": { + "text/plain": "array([[ 8.74646688],\n [120.73507016],\n [-90.13470959],\n ...,\n [141.13168562],\n [-44.48005818],\n [ 84.26178548]])" + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -257,12 +275,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-25T08:42:03.490882200Z", + "end_time": "2024-03-25T09:36:13.096766900Z", "start_time": "2024-03-25T08:41:55.848715200Z" } }, "id": "17d9a37deccf8a55", - "execution_count": 33 + "execution_count": 45 } ], "metadata": { diff --git a/gblup.py b/gblup.py index 0c67316..7389eec 100644 --- a/gblup.py +++ b/gblup.py @@ -1,3 +1,109 @@ +import pandas as pd +import rpy2.robjects as ro +from rpy2.robjects import pandas2ri +from rpy2.robjects.conversion import localconverter + +def run_asreml_G_gblup(G_matrix, blue_df): + pandas2ri.activate() + print(f'G_matrix:\n{G_matrix.shape}, blue_df:\n{blue_df.shape}') + ro.r('''Sys.setlocale("LC_CTYPE", "en_US.UTF-8")''') + ro.globalenv['G_matrix'] = pandas2ri.py2rpy(G_matrix) + # Verify by printing the first few rows in R + # Ensure that the dimnames (row and column names) are correctly set in R + ro.r(''' + dimnames(G_matrix) <- list(rownames(G_matrix), colnames(G_matrix)) + ''') + + ro.globalenv['blue'] = pandas2ri.py2rpy(blue_df) + # ro.r('rownames(blue)') + ro.r(''' + library(asreml) + library(ASRgenomics) + library(tidyverse) + G_matrix <- as.matrix(G_matrix) + Gmat = G.matrix(G_matrix)$G + rownames(Gmat) <- rownames(G_matrix) + colnames(Gmat) <- rownames(G_matrix) + diag(Gmat) = diag(Gmat) + 0.01 + ginv = G.inverse(Gmat,sparseform = T) + ginv = ginv$Ginv.sparse + + attr(ginv,"rowNames") %>% head + + blue$Name <- as.factor(blue$Name) + + + asr2 <- asreml(Value ~ 1, + random = ~ vm(Name,ginv), + residual = ~idv(units), + data = blue,workspace = 20480000 * 12) + summary(asr2)$varcomp + + ''') + g = ro.r('coef(asr2)$random') + + g_row_names = ro.r('rownames(coef(asr2)$random)') + + g_df = pd.DataFrame(g) + + # Convert the row names to a Python list + g_row_names_py = list(g_row_names) + + # Assign the row names to the pandas DataFrame index + g_df.index = g_row_names_py + g_df['Name'] = ['_'.join(name.split('_')[1:]) for name in g_df.index] + g_df.columns = ['Value', 'Name'] + g_df = g_df[['Name', 'Value']] + g_df.to_csv('output/gblup.csv',index=True) + print(g_df.head(10)) + return g_df + +def run_asreml_ped_gblup(ped_df, blue_df): + # Ensure automatic conversion between pandas dataframes and R data frames + pandas2ri.activate() + + # Convert pandas dataframes to R data frames and put them into the R environment + ro.globalenv['ped'] = pandas2ri.py2rpy(ped_df) + ro.globalenv['blue'] = pandas2ri.py2rpy(blue_df) + + # Run the ASReml model in R + ro.r(''' + library(asreml) + ainv <- asreml::ainverse(ped) + blue$Name <- as.factor(blue$Name) + blue <- merge(blue, ped, by = "Name") + asr2 <- asreml::asreml(predicted.value ~ 1, + random = ~ vm(Name, ainv), + residual = ~ idv(units), + data = blue) + ''') + + # Extract results from the asr2 model in R to Python + g = ro.r('coef(asr2)$random') + + g_row_names = ro.r('rownames(coef(asr2)$random)') + + g_df = pd.DataFrame(g) + + # Convert the row names to a Python list + g_row_names_py = list(g_row_names) + + # Assign the row names to the pandas DataFrame index + g_df.index = g_row_names_py + g_df['Name'] = [name.split('_')[1] for name in g_df.index] + + g_df.columns = ['Value', 'Name'] + g_df = g_df[['Name', 'Value']] + + return g_df + + +# Example usage +# ped_df and blue_df should be defined as pandas DataFrames before calling this function +# results_df = run_asreml_and_extract_results(ped_df, blue_df) +# print(results + + def solve_mixed_gblup_model(n_obs, obs_ids, y_values, a): import numpy as np ainv = np.linalg.inv(np.array(a)) From fcb0fdde26ffd0068d0f6031e5bd5dc3f98d30bd Mon Sep 17 00:00:00 2001 From: zhanglei Date: Tue, 26 Mar 2024 16:21:21 +0800 Subject: [PATCH 6/6] gblup and arseml test using gblup --- .idea/.gitignore | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .idea/.gitignore diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml