From f6a665b9502d93db67d1a7b3ce6dfa21d62911ca Mon Sep 17 00:00:00 2001 From: Gustavo Jeuken Date: Tue, 25 Feb 2025 15:41:33 +0100 Subject: [PATCH] A memory-efficient implementation of the .mtx reading function (#3389) Co-authored-by: Lukas Heumos Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/scanpy/datasets/_ebi_expression_atlas.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/scanpy/datasets/_ebi_expression_atlas.py b/src/scanpy/datasets/_ebi_expression_atlas.py index b7e1886e71..05d5f88fc6 100644 --- a/src/scanpy/datasets/_ebi_expression_atlas.py +++ b/src/scanpy/datasets/_ebi_expression_atlas.py @@ -67,13 +67,21 @@ def read_mtx_from_stream(stream: BinaryIO) -> sparse.csr_matrix: max_int32 = np.iinfo(np.int32).max coord_dtype = np.int64 if n > max_int32 or m > max_int32 else np.int32 - data = pd.read_csv( + chunks = pd.read_csv( stream, sep=r"\s+", header=None, dtype={0: coord_dtype, 1: coord_dtype, 2: np.float32}, + chunksize=1e7, ) - mtx = sparse.csr_matrix((data[2], (data[1] - 1, data[0] - 1)), shape=(m, n)) + data = np.array([], dtype=np.float64) + i = np.array([], dtype=int) + j = np.array([], dtype=int) + for chunk in chunks: + data = np.append(data, chunk[2]) + i = np.append(i, chunk[1] - 1) + j = np.append(j, chunk[0] - 1) + mtx = sparse.csr_matrix((data, (i, j)), shape=(m, n)) return mtx