Skip to content

Commit

Permalink
client: satisfy read requests from local extent info if possible
Browse files Browse the repository at this point in the history
This adds a second segment tree to the client metadata of each file
(fid) that tracks all extents written to the file.  We use this to
satisfy later read requests by having the client refer to this
segment tree and then copy data directly from its data logs.  Any
request that cannot be fully satisfied on the client is sent to the
server for processing.

The goal is to make reads as fast as writes in the case that a
client process reads back the same data it wrote.

This optimization cannot be used as currently written for apps that
call truncate, because the server currently has no way to inform
the client that it should truncate its extent list.  To let users
opt in, a new UNIFYFS_CLIENT_LOCAL_EXTENTS=1 option is added to enable
local read back.  This optimization is off by default.

This moves read request splitting from the client to the server.
Splitting the read requests into 1MB chunks was limiting performance,
and this splitting is not necessary when the client is reading its own
data.

This also moves the write index splitting from the client to the server.

TEST_CHECKPATCH_SKIP_FILES=common/src/unifyfs_configurator.h
  • Loading branch information
adammoody committed Jan 22, 2020
1 parent cddc9f9 commit ce78c0d
Show file tree
Hide file tree
Showing 9 changed files with 794 additions and 481 deletions.
199 changes: 25 additions & 174 deletions client/src/unifyfs-fixed.c
Original file line number Diff line number Diff line change
Expand Up @@ -337,151 +337,6 @@ static int unifyfs_coalesce_index(
return UNIFYFS_SUCCESS;
}

/*
 * Split a write index into pieces that never cross a slice boundary.
 *
 * Slices are consecutive regions of slice_range bytes in the file's offset
 * space.  For example, a cur_idx describing a 3.5MB write with a 1MB slice
 * size is split into four entries, and cur_idx->length ends up as zero.
 *
 * At most maxcount entries are produced.  In the example above with
 * maxcount=2, two entries are written and cur_idx is left describing the
 * remaining 1.5MB (file_pos and log_pos advanced, length reduced), so the
 * caller can invoke this function again once it has more room.
 *
 * If maxcount is zero or negative, nothing is written, cur_idx is left
 * untouched, and *used_count is set to 0.
 *
 * @param cur_idx:     The index to split; consumed as entries are produced
 * @param slice_range: The slice size of the key-value store in bytes,
 *                     must be greater than zero (it is used as a divisor)
 * @param index_set:   Output array that receives the split indices
 * @param maxcount:    Number of entries available in the output array
 * @param used_count:  Output, number of entries actually added by the split
 * @return UNIFYFS_SUCCESS
 */
static int unifyfs_split_index(
    unifyfs_index_t* cur_idx,   /* write index to split (offset and length) */
    long slice_range,           /* number of bytes in each slice */
    unifyfs_index_t* index_set, /* output array to store new indexes in */
    off_t maxcount,             /* max number of items in output array */
    off_t* used_count)          /* number of entries we added in split */
{
    /* number of entries written to index_set so far */
    off_t count = 0;

    /* with no room for even a single entry we must not touch index_set,
     * report that nothing was added and leave cur_idx as it is */
    if (maxcount <= 0) {
        *used_count = 0;
        return UNIFYFS_SUCCESS;
    }

    /* first byte offset this write will write to */
    long idx_start = cur_idx->file_pos;

    /* last byte offset this write will write to, this is computed once
     * up front because cur_idx is consumed while we split */
    long idx_end = cur_idx->file_pos + cur_idx->length - 1;

    /* last byte offset of the slice that the first write offset falls in */
    long slice_end = (idx_start / slice_range) * slice_range
                     + slice_range - 1;

    for (;;) {
        /* the rest of the write ends within the current slice
         *
         *   slice_start           slice_end
         *         file_pos   idx_end
         */
        int last_piece = (idx_end <= slice_end);

        /* number of bytes that go into this piece: either everything
         * that is left, or the bytes up to the end of the current slice */
        long piece_start = cur_idx->file_pos;
        long length;
        if (last_piece) {
            length = cur_idx->length;
        } else {
            length = slice_end - piece_start + 1;
        }

        /* copy over all fields of the current index and
         * trim the length to the part that lies in this slice */
        unifyfs_index_t* piece = &index_set[count];
        *piece = *cur_idx;
        piece->length = length;
        count++;

        /* update write index to account for the entry we just added */
        cur_idx->file_pos += length;
        cur_idx->log_pos  += length;
        cur_idx->length   -= length;

        /* stop when the whole write has been covered, or when the output
         * array is full, in which case cur_idx describes what remains */
        if (last_piece || count >= maxcount) {
            break;
        }

        /* advance slice boundary offset to the next slice */
        slice_end += slice_range;
    }

    /* record number of entries we added */
    *used_count = count;

    return UNIFYFS_SUCCESS;
}

/*
* Clear all entries in the log index. This only clears the metadata,
* not the data itself.
Expand Down Expand Up @@ -541,6 +396,13 @@ void unifyfs_add_index_entry_to_seg_tree(
unifyfs_filemeta_t* meta,
unifyfs_index_t* index)
{
/* add index to our local log */
if (unifyfs_local_extents) {
seg_tree_add(&meta->extents, index->file_pos,
index->file_pos + index->length - 1,
index->log_pos);
}

if (!unifyfs_flatten_writes) {
/* We're not flattening writes. Nothing to do */
return;
Expand All @@ -549,13 +411,13 @@ void unifyfs_add_index_entry_to_seg_tree(
/* to update the global running segment count, we need to capture
* the count in this tree before adding and the count after to
* add the difference */
unsigned long count_before = seg_tree_count(&meta->seg_tree);
unsigned long count_before = seg_tree_count(&meta->extents_sync);

/*
* Store the write in our segment tree. We will later use this for
* flattening writes.
*/
seg_tree_add(&meta->seg_tree, index->file_pos,
seg_tree_add(&meta->extents_sync, index->file_pos,
index->file_pos + index->length - 1,
index->log_pos);

Expand All @@ -572,7 +434,7 @@ void unifyfs_add_index_entry_to_seg_tree(
} else {
/* increase the running global segment count by the number of
* new entries we added to this tree */
unsigned long count_after = seg_tree_count(&meta->seg_tree);
unsigned long count_after = seg_tree_count(&meta->extents_sync);
unifyfs_segment_count += (count_after - count_before);
}
}
Expand All @@ -581,8 +443,10 @@ void unifyfs_add_index_entry_to_seg_tree(
static int unifyfs_logio_add_write_meta_to_index(unifyfs_filemeta_t* meta,
off_t file_pos, off_t log_pos, size_t length)
{
/* define an new index entry for this write operation */
/* global file id for this entry */
int gfid = meta->gfid;

/* define an new index entry for this write operation */
unifyfs_index_t cur_idx;
cur_idx.gfid = gfid;
cur_idx.file_pos = file_pos;
Expand Down Expand Up @@ -612,8 +476,8 @@ static int unifyfs_logio_add_write_meta_to_index(unifyfs_filemeta_t* meta,
}
}

/* add new index entries if needed */
while (cur_idx.length > 0) {
/* add new index entry if needed */
if (cur_idx.length > 0) {
/* remaining entries we can fit in the shared memory region */
off_t remaining_entries = unifyfs_max_index_entries - num_entries;

Expand All @@ -628,32 +492,19 @@ static int unifyfs_logio_add_write_meta_to_index(unifyfs_filemeta_t* meta,
}
}

/* split any remaining write index at boundaries of
* unifyfs_key_slice_range */
off_t used_entries = 0;
int split_rc = unifyfs_split_index(&cur_idx,
unifyfs_key_slice_range, &idxs[num_entries],
remaining_entries, &used_entries);
if (split_rc != UNIFYFS_SUCCESS) {
/* in this case, we have copied data to the log,
* but we failed to generate index entries,
* we're returning with an error and leaving the data
* in the log */
LOGERR("failed to split write index");
return UNIFYFS_ERROR_IO;
}
/* copy entry into index buffer */
idxs[num_entries] = cur_idx;

/* Add our split index entries to our seg_tree */
for (int i = 0; i < used_entries; i++) {
unifyfs_add_index_entry_to_seg_tree(meta, &idxs[num_entries + i]);
}
/* Add index entry to our seg_tree */
unifyfs_add_index_entry_to_seg_tree(meta, &idxs[num_entries]);

/* account for entries we just added */
num_entries += used_entries;
num_entries += 1;

/* update number of entries in index array */
(*unifyfs_indices.ptr_num_entries) = num_entries;
}

return UNIFYFS_SUCCESS;
}

Expand Down Expand Up @@ -770,22 +621,22 @@ void unifyfs_rewrite_index_from_seg_tree(void)

int gfid = unifyfs_gfid_from_fid(fid);

seg_tree_rdlock(&meta->seg_tree);
seg_tree_rdlock(&meta->extents_sync);

/* For each write in this file's seg_tree ... */
struct seg_tree_node* node = NULL;
while ((node = seg_tree_iter(&meta->seg_tree, node))) {
while ((node = seg_tree_iter(&meta->extents_sync, node))) {
indexes[idx].file_pos = node->start;
indexes[idx].log_pos = node->ptr;
indexes[idx].length = node->end - node->start + 1;
indexes[idx].gfid = gfid;
idx++;
}

seg_tree_unlock(&meta->seg_tree);
seg_tree_unlock(&meta->extents_sync);

/* All done processing this files writes. Clear its seg_tree */
seg_tree_clear(&meta->seg_tree);
seg_tree_clear(&meta->extents_sync);
}

/* reset our segment count since we just dumped them all */
Expand Down
48 changes: 27 additions & 21 deletions client/src/unifyfs-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,26 +266,27 @@ typedef struct {
} unifyfs_chunkmeta_t;

typedef struct {
off_t global_size; /* Global size of the file */
off_t local_size; /* Local size of the file */
off_t log_size; /* Log size. This is the sum of all the
* write counts. */
pthread_spinlock_t fspinlock; /* file lock variable */
enum flock_enum flock_status; /* file lock status */

int storage; /* FILE_STORAGE type */

int gfid; /* global file id for this file */
int needs_sync; /* have unsynced writes */

off_t chunks; /* number of chunks allocated to file */
off_t chunkmeta_idx; /* starting index in unifyfs_chunkmeta */
int is_laminated; /* Is this file laminated */
uint32_t mode; /* st_mode bits. This has file
* permission info and will tell you if this
* is a regular file or directory. */
struct seg_tree seg_tree; /* Segment tree containing our coalesced
* writes */
off_t global_size; /* Global size of the file */
off_t local_size; /* Local size of the file */
off_t log_size; /* Log size. This is the sum of all the
* write counts. */
pthread_spinlock_t fspinlock; /* file lock variable */
enum flock_enum flock_status; /* file lock status */

int storage; /* FILE_STORAGE type */

int gfid; /* global file id for this file */
int needs_sync; /* have unsynced writes */

off_t chunks; /* number of chunks allocated to file */
off_t chunkmeta_idx; /* starting index in unifyfs_chunkmeta */
int is_laminated; /* Is this file laminated */
uint32_t mode; /* st_mode bits. This has file
* permission info and will tell you if this
* is a regular file or directory. */
struct seg_tree extents_sync; /* Segment tree containing our coalesced
* writes between sync operations */
struct seg_tree extents; /* Segment tree of all local data extents */
} unifyfs_filemeta_t;

/* struct used to map a full path to its local file id,
Expand Down Expand Up @@ -393,7 +394,8 @@ extern int unifyfs_use_memfs;
extern int unifyfs_use_spillover;

extern int unifyfs_max_files; /* maximum number of files to store */
extern bool unifyfs_flatten_writes; /* enable write flattening */
extern bool unifyfs_flatten_writes; /* enable write flattening */
extern bool unifyfs_local_extents; /* enable tracking of local extents */
extern size_t
unifyfs_chunk_mem; /* number of bytes in memory to be used for chunk storage */
extern int unifyfs_chunk_bits; /* we set chunk size = 2^unifyfs_chunk_bits */
Expand Down Expand Up @@ -484,6 +486,10 @@ const char* unifyfs_path_from_fid(int fid);
/* Given a fid, return a gfid */
int unifyfs_gfid_from_fid(const int fid);

/* returns fid for corresponding gfid, if one is active,
* returns -1 otherwise */
int unifyfs_fid_from_gfid(const int gfid);

/* given an UNIFYFS error code, return corresponding errno code */
int unifyfs_err_map_to_errno(int rc);

Expand Down
Loading

0 comments on commit ce78c0d

Please sign in to comment.