From f6a665b9502d93db67d1a7b3ce6dfa21d62911ca Mon Sep 17 00:00:00 2001
From: Gustavo Jeuken <gjeuken@gmail.com>
Date: Tue, 25 Feb 2025 15:41:33 +0100
Subject: [PATCH] A memory-efficient implementation of the .mtx reading
 function (#3389)

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 src/scanpy/datasets/_ebi_expression_atlas.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/scanpy/datasets/_ebi_expression_atlas.py b/src/scanpy/datasets/_ebi_expression_atlas.py
index b7e1886e71..05d5f88fc6 100644
--- a/src/scanpy/datasets/_ebi_expression_atlas.py
+++ b/src/scanpy/datasets/_ebi_expression_atlas.py
@@ -67,13 +67,21 @@ def read_mtx_from_stream(stream: BinaryIO) -> sparse.csr_matrix:
     max_int32 = np.iinfo(np.int32).max
     coord_dtype = np.int64 if n > max_int32 or m > max_int32 else np.int32
 
-    data = pd.read_csv(
+    chunks = pd.read_csv(
         stream,
         sep=r"\s+",
         header=None,
         dtype={0: coord_dtype, 1: coord_dtype, 2: np.float32},
+        chunksize=1e7,
     )
-    mtx = sparse.csr_matrix((data[2], (data[1] - 1, data[0] - 1)), shape=(m, n))
+    data = np.array([], dtype=np.float64)
+    i = np.array([], dtype=int)
+    j = np.array([], dtype=int)
+    for chunk in chunks:
+        data = np.append(data, chunk[2])
+        i = np.append(i, chunk[1] - 1)
+        j = np.append(j, chunk[0] - 1)
+    mtx = sparse.csr_matrix((data, (i, j)), shape=(m, n))
     return mtx