Skip to content

Commit

Permalink
A memory-efficient implementation of the .mtx reading function (#3389)
Browse files Browse the repository at this point in the history
Co-authored-by: Lukas Heumos <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Feb 25, 2025
1 parent d2db761 commit f6a665b
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions src/scanpy/datasets/_ebi_expression_atlas.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -67,13 +67,21 @@ def read_mtx_from_stream(stream: BinaryIO) -> sparse.csr_matrix:
max_int32 = np.iinfo(np.int32).max
coord_dtype = np.int64 if n > max_int32 or m > max_int32 else np.int32

data = pd.read_csv(
chunks = pd.read_csv(
stream,
sep=r"\s+",
header=None,
dtype={0: coord_dtype, 1: coord_dtype, 2: np.float32},
chunksize=1e7,
)
mtx = sparse.csr_matrix((data[2], (data[1] - 1, data[0] - 1)), shape=(m, n))
data = np.array([], dtype=np.float64)
i = np.array([], dtype=int)
j = np.array([], dtype=int)
for chunk in chunks:
data = np.append(data, chunk[2])
i = np.append(i, chunk[1] - 1)
j = np.append(j, chunk[0] - 1)
mtx = sparse.csr_matrix((data, (i, j)), shape=(m, n))
return mtx


Expand Down

0 comments on commit f6a665b

Please sign in to comment.