Skip to content

Commit

Permalink
cluster/dht: use readdir for fix-layout in rebalance (#2243)
Browse files Browse the repository at this point in the history
Problem:
On a cluster with 15 million files, fix-layout made no visible progress
after it was started. To investigate, we ran an os.walk() + os.stat()
directly on the backend filesystem; it took 2.5 days. We then removed
os.stat() and re-ran the walk on another brick with a similar data set;
it took 15 minutes. We realized that readdirp is extremely costly compared
to readdir when the stat information is not needed. The fix-layout
operation only needs to know whether an entry is a directory so that
fix-layout can be triggered on it. Most modern filesystems provide this
information in the readdir operation, so we don't need readdirp, i.e.
readdir+stat.

Fix:
Use readdir operation in fix-layout. Do readdir+stat/lookup for
filesystems that don't provide d_type in readdir operation.

fixes: #2241
Change-Id: I5fe2ecea25a399ad58e31a2e322caf69fc7f49eb
Signed-off-by: Pranith Kumar K <[email protected]>
  • Loading branch information
pranithk authored Mar 22, 2021
1 parent 1da141a commit ec189a4
Show file tree
Hide file tree
Showing 10 changed files with 131 additions and 98 deletions.
8 changes: 4 additions & 4 deletions cli/src/cli-rpc-ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -906,9 +906,9 @@ gf_cli_get_volume_cbk(struct rpc_req *req, struct iovec *iov, int count,
replica_count, disperse_count,
redundancy_count, arbiter_count);

cli_out("Transport-type: %s",
((transport == 0) ? "tcp"
: (transport == 1) ? "rdma" : "tcp,rdma"));
cli_out("Transport-type: %s", ((transport == 0) ? "tcp"
: (transport == 1) ? "rdma"
: "tcp,rdma"));
j = 1;

GF_FREE(local->get_vol.volname);
Expand Down Expand Up @@ -1576,7 +1576,7 @@ gf_cli_print_rebalance_status(dict_t *dict, enum gf_task_types task_type)
sec = ((uint64_t)elapsed % 3600) % 60;

if (fix_layout) {
cli_out("%35s %50s %8d:%d:%d", node_name, status_str, hrs, min,
cli_out("%35s %50s %8d:%02d:%02d", node_name, status_str, hrs, min,
sec);
} else {
if (size_str) {
Expand Down
28 changes: 26 additions & 2 deletions libglusterfs/src/common-utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -3325,7 +3325,7 @@ gf_process_reserved_ports(unsigned char *ports, uint32_t ceiling)
out:
GF_FREE(ports_info);

#else /* FIXME: Non Linux Host */
#else /* FIXME: Non Linux Host */
ret = 0;
#endif /* GF_LINUX_HOST_OS */

Expand Down Expand Up @@ -5197,7 +5197,7 @@ close_fds_except_custom(int *fdv, size_t count, void *prm,
closer(i, prm);
}
sys_closedir(d);
#else /* !GF_LINUX_HOST_OS */
#else /* !GF_LINUX_HOST_OS */
struct rlimit rl;
int ret = -1;

Expand Down Expand Up @@ -5433,6 +5433,30 @@ gf_d_type_from_ia_type(ia_type_t type)
}
}

/*
 * Translate the file-type bits of a stat st_mode value into the matching
 * dirent d_type constant.
 *
 * @st_mode: mode word; only the bits selected by S_IFMT are examined, so
 *           permission bits do not influence the result.
 *
 * Returns DT_REG, DT_DIR, DT_LNK, DT_BLK, DT_CHR, DT_FIFO or DT_SOCK for the
 * corresponding S_IF* file type, and DT_UNKNOWN for any other value.
 */
int
gf_d_type_from_st_mode(mode_t st_mode)
{
    switch (st_mode & S_IFMT) {
        case S_IFREG:
            return DT_REG;
        case S_IFDIR:
            return DT_DIR;
        case S_IFLNK:
            return DT_LNK;
        case S_IFBLK:
            return DT_BLK;
        case S_IFCHR:
            return DT_CHR;
        case S_IFIFO:
            return DT_FIFO;
        case S_IFSOCK:
            return DT_SOCK;
        default:
            /* Unrecognized type: fall out to the single fallback below. */
            break;
    }

    return DT_UNKNOWN;
}

int
gf_nanosleep(uint64_t nsec)
{
Expand Down
2 changes: 2 additions & 0 deletions libglusterfs/src/glusterfs/common-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -1257,4 +1257,6 @@ gf_tsdiff(struct timespec *start, struct timespec *end)
(int64_t)(end->tv_nsec - start->tv_nsec);
}

/* Map the S_IFMT file-type bits of a stat st_mode value to the matching
 * dirent DT_* constant; returns DT_UNKNOWN for unrecognized types. */
int
gf_d_type_from_st_mode(mode_t st_mode);
#endif /* _COMMON_UTILS_H */
1 change: 1 addition & 0 deletions libglusterfs/src/libglusterfs.sym
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,7 @@ gf_changelog_register_generic
gf_gfid_generate_from_xxh64
find_xlator_option_in_cmd_args_t
gf_d_type_from_ia_type
gf_d_type_from_st_mode
glusterfs_graph_fini
glusterfs_process_svc_attach_volfp
glusterfs_mux_volfile_reconfigure
Expand Down
6 changes: 6 additions & 0 deletions tests/bugs/glusterd/rebalance-operations-in-single-node.t
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,16 @@ done
TEST $CLI volume add-brick $V0 $H0:$B0/${V0}{5,6};

#perform rebalance fix-layout
TEST $CLI volume profile $V0 start
TEST $CLI volume rebalance $V0 fix-layout start

EXPECT_WITHIN $REBALANCE_TIMEOUT "fix-layout completed" fix-layout_status_field $V0;

readdir_count=$($CLI volume profile $V0 info | grep -w READDIR | wc -l)
readdirp_count=$($CLI volume profile $V0 info | grep -w READDIRP | wc -l)
EXPECT_NOT "^0$" echo $readdir_count
EXPECT "^0$" echo $readdirp_count

#bug-1075087 - rebalance post add brick
TEST mkdir $M0/dir{21..30};
TEST touch $M0/dir{21..30}/files{1..10};
Expand Down
2 changes: 2 additions & 0 deletions tests/bugs/glusterd/replace-brick-operations.t
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}2 $H0:$B0/${V0}3 commit force
#bug-1242543-replace-brick validation

TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0;
#Make sure new brick comes online before doing replace-brick on next-brick.
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1

# Replace brick1 without killing
TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}1 $H0:$B0/${V0}1_new commit force
Expand Down
24 changes: 20 additions & 4 deletions xlators/cluster/dht/src/dht-common.c
Original file line number Diff line number Diff line change
Expand Up @@ -7033,6 +7033,8 @@ dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
dht_conf_t *conf = NULL;
dht_methods_t *methods = NULL;
gf_boolean_t skip_hashed_check = _gf_false;
gf_boolean_t readdir_optimize = _gf_false;
gf_boolean_t add = _gf_false;

INIT_LIST_HEAD(&entries.list);

Expand All @@ -7042,6 +7044,7 @@ dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
conf = this->private;
GF_VALIDATE_OR_GOTO(this->name, conf, done);

readdir_optimize = conf->readdir_optimize;
methods = &(conf->methods);

if (op_ret <= 0)
Expand All @@ -7065,12 +7068,25 @@ dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
{
next_offset = orig_entry->d_off;

gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name,
orig_entry->d_name, orig_entry->d_type);

subvol = methods->layout_search(this, layout, orig_entry->d_name);
gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d %p, %p",
prev->name, orig_entry->d_name, orig_entry->d_type, subvol,
prev);

/* a) If rebalance is running, pick from first_up_subvol
*/
if (DT_ISDIR(orig_entry->d_type) && readdir_optimize) {
if (prev == local->first_up_subvol) {
add = _gf_true;
} else {
continue;
}
} else if (!subvol || (subvol == prev)) {
add = _gf_true;
}

if (!subvol || (subvol == prev)) {
if (add) {
add = _gf_false;
entry = gf_dirent_for_name(orig_entry->d_name);
if (!entry) {
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
Expand Down
59 changes: 24 additions & 35 deletions xlators/cluster/dht/src/dht-rebalance.c
Original file line number Diff line number Diff line change
Expand Up @@ -3608,6 +3608,9 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
struct iatt iatt = {
0,
};
struct iatt entry_iatt = {
0,
};
inode_t *linked_inode = NULL, *inode = NULL;
dht_conf_t *conf = NULL;
int perrno = 0;
Expand Down Expand Up @@ -3643,6 +3646,12 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
goto out;
}

linked_inode = inode_link(loc->inode, loc->parent, loc->name, &iatt);

inode = loc->inode;
loc->inode = linked_inode;
inode_unref(inode);

fd = fd_create(loc->inode, defrag->pid);
if (!fd) {
gf_log(this->name, GF_LOG_ERROR, "Failed to create fd");
Expand Down Expand Up @@ -3675,8 +3684,8 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
fd_bind(fd);
INIT_LIST_HEAD(&entries.list);

while ((ret = syncop_readdirp(this, fd, 131072, offset, &entries, NULL,
NULL)) != 0) {
while ((ret = syncop_readdir(this, fd, 131072, offset, &entries, NULL,
NULL)) != 0) {
if (ret < 0) {
if (-ret == ENOENT || -ret == ESTALE) {
if (conf->decommission_subvols_cnt) {
Expand Down Expand Up @@ -3711,9 +3720,11 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,

if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
continue;
if (!IA_ISDIR(entry->d_stat.ia_type)) {

if ((DT_DIR != entry->d_type) && (DT_UNKNOWN != entry->d_type)) {
continue;
}

loc_wipe(&entry_loc);

ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
Expand All @@ -3734,40 +3745,18 @@ gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
}
}

if (gf_uuid_is_null(entry->d_stat.ia_gfid)) {
gf_log(this->name, GF_LOG_ERROR,
"%s/%s"
" gfid not present",
loc->path, entry->d_name);
continue;
}

gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid);

/*In case the gfid stored in the inode by inode_link
* and the gfid obtained in the lookup differs, then
* client3_3_lookup_cbk will return ESTALE and proper
* error will be captured
*/

linked_inode = inode_link(entry_loc.inode, loc->inode,
entry->d_name, &entry->d_stat);

inode = entry_loc.inode;
entry_loc.inode = linked_inode;
inode_unref(inode);

if (gf_uuid_is_null(loc->gfid)) {
gf_log(this->name, GF_LOG_ERROR,
"%s/%s"
" gfid not present",
loc->path, entry->d_name);
defrag->total_failures++;
continue;
if (DT_UNKNOWN == entry->d_type) {
ret = syncop_lookup(this, &entry_loc, &entry_iatt, NULL, NULL,
NULL);
if ((ret == 0) && (entry_iatt.ia_type != IA_IFDIR)) {
continue;
}
/*If it is directory, gf_defrag_fix_layout() call will again do
* one more lookup. Not optimizing this part as all modern
* filesystems populate entry->d_type. We can optimize it when
* such a filesystem is found.*/
}

gf_uuid_copy(entry_loc.pargfid, loc->gfid);

/* A return value of 2 means, either process_dir or
* lookup of a dir failed. Hence, don't commit hash
* for the current directory*/
Expand Down
2 changes: 1 addition & 1 deletion xlators/mgmt/glusterd/src/glusterd-rebalance.c
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ glusterd_handle_defrag_start(glusterd_volinfo_t *volinfo, char *op_errstr,

runner_add_args(
&runner, SBIN_DIR "/glusterfs", "-s", volfileserver, "--volfile-id",
volname, "--xlator-option", "*dht.use-readdirp=yes", "--xlator-option",
volname, "--xlator-option", "*dht.use-readdirp=no", "--xlator-option",
"*dht.lookup-unhashed=yes", "--xlator-option",
"*dht.assert-no-child-down=yes", "--xlator-option",
"*dht.readdir-optimize=on", "--process-name", "rebalance", NULL);
Expand Down
Loading

0 comments on commit ec189a4

Please sign in to comment.