Skip to content

Commit

Permalink
Use os.walk to speedup reading from DirectoryStore
Browse files Browse the repository at this point in the history
Using os.walk to traverse the tree (it relies on os.scandir for each folder) is
faster than os.listdir, as it avoids many stat calls.

This should make the DirectoryStore faster.
  • Loading branch information
Carreau committed Sep 9, 2020
1 parent c18537d commit 5af8f20
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 9 deletions.
2 changes: 2 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ Release notes
Next release
------------

* `DirectoryStore` now uses `os.scandir`, which should make listing large stores
faster, :issue:`563`.
* Fix minor bug in `N5Store`.
By :user:`gsakkis`, :issue:`550`.
* Improve error message in Jupyter when trying to use the ``ipytree`` widget
Expand Down
30 changes: 21 additions & 9 deletions zarr/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -854,15 +854,27 @@ def __eq__(self, other):

def keys(self):
# Generator over the keys of the store rooted at ``self.path``.
# Nothing is yielded when ``self.path`` does not exist (there is no else branch).
#
# NOTE(review): this listing is a rendered diff whose +/- markers and
# indentation were lost, so old and new lines appear back to back. The nine
# lines from ``directories = ...`` through ``directories.append(...)`` appear
# to be the implementation removed by this commit (the diff header reports
# 9 deletions); the final ``yield from`` line appears to be its replacement.
# Confirm against the real file before treating this as sequential code.
if os.path.exists(self.path):
# Presumably the removed implementation: an explicit stack of
# (directory path, key prefix) pairs walked with os.listdir, checking each
# entry with os.path.isfile / os.path.isdir.
directories = [(self.path, '')]
while directories:
dir_name, prefix = directories.pop()
for name in os.listdir(dir_name):
path = os.path.join(dir_name, name)
if os.path.isfile(path):
# Files become keys: the accumulated prefix plus the file name.
yield prefix + name
elif os.path.isdir(path):
# Sub-directories are queued with a '/'-terminated prefix for their children.
directories.append((path, prefix + name + '/'))
# Presumably the added implementation: delegate the whole traversal to
# ``_keys_fast``, passing the store root.
yield from self._keys_fast(self.path)

@staticmethod
def _keys_fast(path, walker=os.walk):
    """Yield every file below ``path`` as a ``/``-separated relative key.

    Uses ``os.walk()`` (or any compatible ``walker``) so that the directory
    tree is listed with far fewer ``stat`` calls than an ``os.listdir``
    based traversal.

    Parameters
    ----------
    path : str
        Root directory to list.
    walker : callable, optional
        Function with the same contract as ``os.walk``: called with
        ``path``, it returns an iterable of ``(dirpath, dirnames,
        filenames)`` tuples whose first item describes the root itself.
        Replaceable so tests can supply a canned directory listing.

    Yields
    ------
    str
        File names directly inside the root, then ``subdir/.../name`` keys
        for files in nested directories. Nothing is yielded when the walk
        is empty.
    """
    walk = iter(walker(path))
    first = next(walk, None)
    if first is None:
        # ``os.walk`` yields nothing when ``path`` is not a readable
        # directory. A bare ``next(walk)`` would then raise StopIteration
        # inside this generator, which Python turns into RuntimeError.
        return
    root, _, filenames = first

    # Number of leading characters to drop from each ``dirpath``: the root
    # plus one separator, unless the root already ends with a separator.
    # Both the native and the alternate separator are recognised so that a
    # root such as ``C:\`` is handled as well as ``root/``.
    separators = tuple(sep for sep in (os.sep, os.altsep) if sep)
    root_len = len(root) if root.endswith(separators) else len(root) + 1

    # Files in the root directory are keys without any prefix.
    yield from filenames

    for dirpath, _, filenames in walk:
        # Compute the key prefix once per directory; store keys always use
        # '/' regardless of the separator the walker produced.
        prefix = dirpath[root_len:].replace('\\', '/') + '/'
        for name in filenames:
            yield prefix + name

def __iter__(self):
    """Iterate over the store's keys by delegating to ``keys()``."""
    key_iterator = self.keys()
    return key_iterator
Expand Down
28 changes: 28 additions & 0 deletions zarr/tests/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,34 @@ def test_normalize_keys(self):
assert 'FOO' in store
assert 'foo' in store

def test_listing_keys_slash(self):
    """Keys are relative to the root when the walked root ends with '/'."""

    # Canned walk output; the root entry carries a trailing slash.
    walk_entries = (
        ('root_with_slash/', ['d1', 'g1'], ['.zgroup']),
        ('root_with_slash/d1', [], ['.zarray']),
        ('root_with_slash/g1', [], ['.zgroup']),
    )

    def mock_walker_slash(_path):
        return iter(walk_entries)

    listed = DirectoryStore._keys_fast('root_with_slash/', walker=mock_walker_slash)
    res = set(listed)
    assert res == {'.zgroup', 'g1/.zgroup', 'd1/.zarray'}

def test_listing_keys_no_slash(self):
    """Keys are relative to the root when the walked root has no trailing '/'."""

    # Canned walk output; the root entry has no trailing slash.
    walk_entries = (
        ('root_with_no_slash', ['d1', 'g1'], ['.zgroup']),
        ('root_with_no_slash/d1', [], ['.zarray']),
        ('root_with_no_slash/g1', [], ['.zgroup']),
    )

    def mock_walker_no_slash(_path):
        return iter(walk_entries)

    listed = DirectoryStore._keys_fast('root_with_no_slash', mock_walker_no_slash)
    res = set(listed)
    assert res == {'.zgroup', 'g1/.zgroup', 'd1/.zarray'}


class TestNestedDirectoryStore(TestDirectoryStore, unittest.TestCase):

Expand Down

0 comments on commit 5af8f20

Please sign in to comment.