Skip to content

Commit

Permalink
Backport PR scverse#3389: A memory efficient implementation of the .mtx reading function
Browse files Browse the repository at this point in the history
  • Loading branch information
gjeuken authored and meeseeksmachine committed Feb 25, 2025
1 parent 14d58b4 commit 1baada2
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions src/scanpy/datasets/_ebi_expression_atlas.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,21 @@ def read_mtx_from_stream(stream: BinaryIO) -> sparse.csr_matrix:
max_int32 = np.iinfo(np.int32).max
coord_dtype = np.int64 if n > max_int32 or m > max_int32 else np.int32

data = pd.read_csv(
chunks = pd.read_csv(
stream,
sep=r"\s+",
header=None,
dtype={0: coord_dtype, 1: coord_dtype, 2: np.float32},
chunksize=1e7,
)
mtx = sparse.csr_matrix((data[2], (data[1] - 1, data[0] - 1)), shape=(m, n))
data = np.array([], dtype=np.float64)
i = np.array([], dtype=int)
j = np.array([], dtype=int)
for chunk in chunks:
data = np.append(data, chunk[2])
i = np.append(i, chunk[1] - 1)
j = np.append(j, chunk[0] - 1)
mtx = sparse.csr_matrix((data, (i, j)), shape=(m, n))
return mtx


Expand Down

0 comments on commit 1baada2

Please sign in to comment.