Skip to content

Commit

Permalink
Clean up code
Browse files Browse the repository at this point in the history
  • Loading branch information
anuragkh committed Nov 26, 2019
1 parent 6233fd4 commit c182cd1
Show file tree
Hide file tree
Showing 13 changed files with 199 additions and 237 deletions.
8 changes: 4 additions & 4 deletions core/include/succinct_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,8 @@ class SuccinctCore : public SuccinctBase {
Range BwdSearch(std::string mgram);
Range ContinueBwdSearch(std::string mgram, Range range);

Range FwdSearch(std::string mgram);
Range ContinueFwdSearch(std::string mgram, Range range, size_t len);

protected:
Range FwdSearch(const std::string& mgram);
Range ContinueFwdSearch(const std::string& mgram, Range range, size_t len);

// Allocates high level containers
void Allocate(uint32_t sa_sampling_rate, uint32_t isa_sampling_rate,
Expand All @@ -141,6 +139,8 @@ class SuccinctCore : public SuccinctBase {
NPA::NPAEncodingScheme npa_encoding_scheme,
uint32_t sampling_range);

protected:

/* Metadata */
uint64_t input_size_; // Size of input

Expand Down
2 changes: 1 addition & 1 deletion core/include/succinct_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

class SuccinctFile : public SuccinctCore {
public:
SuccinctFile(std::string filename, SuccinctMode s_mode =
SuccinctFile(const std::string& filename, SuccinctMode s_mode =
SuccinctMode::CONSTRUCT_IN_MEMORY,
uint32_t sa_sampling_rate = 32, uint32_t isa_sampling_rate = 32,
uint32_t npa_sampling_rate = 128,
Expand Down
85 changes: 35 additions & 50 deletions core/include/succinct_semistructured_shard.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,19 @@ class SuccinctSemistructuredShard : public SuccinctShard {
typedef std::unordered_map<std::string, uint8_t> FwdMap;
typedef std::unordered_map<uint8_t, std::string> BwdMap;

SuccinctSemistructuredShard(const std::string& filename, SuccinctMode s_mode =
SuccinctMode::CONSTRUCT_IN_MEMORY,
uint32_t sa_sampling_rate = 32,
uint32_t isa_sampling_rate = 32,
uint32_t npa_sampling_rate = 128,
uint32_t context_len = 3,
SamplingScheme sa_sampling_scheme =
SamplingScheme::FLAT_SAMPLE_BY_INDEX,
SamplingScheme isa_sampling_scheme =
SamplingScheme::FLAT_SAMPLE_BY_INDEX,
NPA::NPAEncodingScheme npa_encoding_scheme =
NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED,
uint32_t sampling_range = 1024)
explicit SuccinctSemistructuredShard(const std::string &filename,
SuccinctMode s_mode = SuccinctMode::CONSTRUCT_IN_MEMORY,
uint32_t sa_sampling_rate = 32,
uint32_t isa_sampling_rate = 32,
uint32_t npa_sampling_rate = 128,
uint32_t context_len = 3,
SamplingScheme sa_sampling_scheme =
SamplingScheme::FLAT_SAMPLE_BY_INDEX,
SamplingScheme isa_sampling_scheme =
SamplingScheme::FLAT_SAMPLE_BY_INDEX,
NPA::NPAEncodingScheme npa_encoding_scheme =
NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED,
uint32_t sampling_range = 1024)
: SuccinctShard() {
switch (s_mode) {
case SuccinctMode::CONSTRUCT_IN_MEMORY: {
Expand Down Expand Up @@ -64,8 +64,7 @@ class SuccinctSemistructuredShard : public SuccinctShard {

// Read values
size_t value_offsets_size;
keyval.read(reinterpret_cast<char *>(&value_offsets_size),
sizeof(size_t));
keyval.read(reinterpret_cast<char *>(&value_offsets_size), sizeof(size_t));
value_offsets_.reserve(value_offsets_size);
for (size_t i = 0; i < value_offsets_size; i++) {
uint64_t value_offset;
Expand All @@ -90,9 +89,7 @@ class SuccinctSemistructuredShard : public SuccinctShard {
size_t keys_size = *((size_t *) data);
data += sizeof(size_t);
buf_allocator<int64_t> key_allocator((int64_t *) data);
keys_ = std::vector<int64_t>((int64_t *) data,
(int64_t *) data + keys_size,
key_allocator);
keys_ = std::vector<int64_t>((int64_t *) data,(int64_t *) data + keys_size, key_allocator);
data += (sizeof(int64_t) * keys_size);

// Read values
Expand All @@ -111,8 +108,8 @@ class SuccinctSemistructuredShard : public SuccinctShard {
}
}

int64_t CountAttribute(const std::string& attr_key,
const std::string& attr_val) {
int64_t CountAttribute(const std::string &attr_key,
const std::string &attr_val) {
if (attr_key_to_delimiter_map_.find(attr_key)
== attr_key_to_delimiter_map_.end())
return 0;
Expand All @@ -122,8 +119,8 @@ class SuccinctSemistructuredShard : public SuccinctShard {
return Count(query);
}

void SearchAttribute(std::set<int64_t>& keys, const std::string& attr_key,
const std::string& attr_val) {
void SearchAttribute(std::set<int64_t> &keys, const std::string &attr_key,
const std::string &attr_val) {
if (attr_key_to_delimiter_map_.find(attr_key)
== attr_key_to_delimiter_map_.end())
return;
Expand All @@ -133,7 +130,7 @@ class SuccinctSemistructuredShard : public SuccinctShard {
Search(keys, query);
}

void Get(std::string& result, int64_t key) {
void Get(std::string &result, int64_t key) override {
std::string data;
SuccinctShard::Get(data, key);

Expand All @@ -150,7 +147,7 @@ class SuccinctSemistructuredShard : public SuccinctShard {
}
}

void Get(std::string& result, int64_t key, std::string& attr_key) {
void Get(std::string &result, int64_t key, std::string &attr_key) {
std::string data;
SuccinctShard::Get(data, key);

Expand All @@ -168,17 +165,14 @@ class SuccinctSemistructuredShard : public SuccinctShard {
}
}

private:
std::string ExtractField(std::string& data, size_t start_offset,
uint8_t delim) {
protected:
static std::string ExtractField(std::string &data, size_t start_offset, uint8_t delim) {
size_t i = start_offset;
while (((uint8_t) data[i]) != delim)
i++;

while (((uint8_t) data[i]) != delim) i++;
return data.substr(start_offset, i - start_offset);
}

std::string Format(std::string filename, char delim = ',') {
std::string Format(const std::string &filename, char delim = ',') {
std::string outf = filename + ".tmp.formatted";
std::ifstream infile = std::ifstream(filename);
std::ofstream formatted = std::ofstream(outf);
Expand All @@ -187,8 +181,7 @@ class SuccinctSemistructuredShard : public SuccinctShard {
while (std::getline(infile, line)) {
if (line_no != 0) {
char newline = '\n';
formatted.write(reinterpret_cast<const char *>(&newline),
sizeof(uint8_t));
formatted.write(reinterpret_cast<const char *>(&newline), sizeof(uint8_t));
}
std::stringstream linestream(line);
std::string attr_val_pair;
Expand All @@ -197,34 +190,27 @@ class SuccinctSemistructuredShard : public SuccinctShard {
value_offsets_.push_back(formatted.tellp());
while (std::getline(linestream, attr_val_pair, delim)) {
std::string::size_type pos = attr_val_pair.find('=');
if (pos != attr_val_pair.npos) {
if (pos != std::string::npos) {
std::string attr_key = attr_val_pair.substr(0, pos);
std::string attr_val = attr_val_pair.substr(pos + 1);
if (attr_key_to_delimiter_map_.find(attr_key)
== attr_key_to_delimiter_map_.end()) {
if (cur_delim_ == 255) {
fprintf(stderr,
"Currently support <= 128 unique attribute keys.\n");
fprintf(stderr, "Currently support <= 128 unique attribute keys.\n");
exit(0);
}
// Create new entry
attr_key_to_delimiter_map_.insert(
FwdMap::value_type(attr_key, cur_delim_));
delimiter_to_attr_key_map_.insert(
BwdMap::value_type(cur_delim_, attr_key));
attr_key_to_delimiter_map_.insert(FwdMap::value_type(attr_key, cur_delim_));
delimiter_to_attr_key_map_.insert(BwdMap::value_type(cur_delim_, attr_key));
cur_delim_++;
}
uint8_t attr_delim = attr_key_to_delimiter_map_.at(attr_key);
const char* attr_val_str = attr_val.c_str();
formatted.write(reinterpret_cast<const char *>(&attr_delim),
sizeof(uint8_t));
formatted.write(reinterpret_cast<const char *>(attr_val_str),
sizeof(char) * attr_val.length());
formatted.write(reinterpret_cast<const char *>(&attr_delim),
sizeof(uint8_t));
const char *attr_val_str = attr_val.c_str();
formatted.write(reinterpret_cast<const char *>(&attr_delim), sizeof(uint8_t));
formatted.write(reinterpret_cast<const char *>(attr_val_str), sizeof(char) * attr_val.length());
formatted.write(reinterpret_cast<const char *>(&attr_delim), sizeof(uint8_t));
} else {
fprintf(stderr,
"Invalid attribute-value pair [%s] %lld on line %lld\n",
fprintf(stderr, "Invalid attribute-value pair [%s] %lld on line %lld\n",
attr_val_pair.c_str(), attr_val_no, line_no + 1);
exit(0);
}
Expand All @@ -239,7 +225,6 @@ class SuccinctSemistructuredShard : public SuccinctShard {
FwdMap attr_key_to_delimiter_map_;
BwdMap delimiter_to_attr_key_map_;
uint8_t cur_delim_ = 128;

};

#endif /* SUCCINCT_SEMISTRUCTURED_SHARD_H_ */
54 changes: 24 additions & 30 deletions core/include/succinct_shard.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,69 +24,63 @@ class SuccinctShard : public SuccinctCore {
public:
static const int64_t MAX_KEYS = 1L << 32;

SuccinctShard(uint32_t id, std::string datafile, SuccinctMode s_mode =
SuccinctMode::CONSTRUCT_IN_MEMORY,
uint32_t sa_sampling_rate = 32, uint32_t isa_sampling_rate = 32,
uint32_t npa_sampling_rate = 128,
SamplingScheme sa_sampling_scheme =
SamplingScheme::FLAT_SAMPLE_BY_INDEX,
SamplingScheme isa_sampling_scheme =
SamplingScheme::FLAT_SAMPLE_BY_INDEX,
NPA::NPAEncodingScheme npa_encoding_scheme =
NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED,
SuccinctShard(uint32_t id, std::string datafile, SuccinctMode s_mode = SuccinctMode::CONSTRUCT_IN_MEMORY,
uint32_t sa_sampling_rate = 32, uint32_t isa_sampling_rate = 32, uint32_t npa_sampling_rate = 128,
SamplingScheme sa_sampling_scheme = SamplingScheme::FLAT_SAMPLE_BY_INDEX,
SamplingScheme isa_sampling_scheme = SamplingScheme::FLAT_SAMPLE_BY_INDEX,
NPA::NPAEncodingScheme npa_encoding_scheme = NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED,
uint32_t context_len = 3, uint32_t sampling_range = 1024);

SuccinctShard()
: SuccinctCore() {
id_ = 0;
invalid_offsets_ = NULL;
invalid_offsets_ = nullptr;
}

virtual ~SuccinctShard() {
}
~SuccinctShard() override = default;

uint32_t GetSASamplingRate();

uint32_t GetISASamplngRate();
uint32_t GetISASamplingRate();

uint32_t GetNPASamplingRate();

size_t GetNumKeys();

virtual void Get(std::string& result, int64_t key);
virtual void Get(std::string &result, int64_t key);

void Access(std::string& result, int64_t key, int32_t offset, int32_t len);
void Access(std::string &result, int64_t key, int32_t offset, int32_t len);

int64_t Count(const std::string& str);
int64_t Count(const std::string &str);

void Search(std::set<int64_t>& result, const std::string& str);
void Search(std::set<int64_t> &result, const std::string &str);

int64_t FlatCount(const std::string& str);
int64_t FlatCount(const std::string &str);

void FlatSearch(std::vector<int64_t>& result, const std::string& str);
void FlatSearch(std::vector<int64_t> &result, const std::string &str);

void FlatExtract(std::string& result, int64_t offset, int64_t len);
void FlatExtract(std::string &result, int64_t offset, int64_t len);

void RegexSearch(std::set<std::pair<size_t, size_t>>& result,
const std::string& str, bool opt = true);
void RegexSearch(std::set<std::pair<size_t, size_t>> &result,
const std::string &str, bool opt = true);

void RegexCount(std::vector<size_t>& result, const std::string& str);
void RegexCount(std::vector<size_t> &result, const std::string &str);

// Serialize succinct data structures
virtual size_t Serialize(const std::string& path);
size_t Serialize(const std::string &path) override;

// Deserialize succinct data structures
virtual size_t Deserialize(const std::string& path);
size_t Deserialize(const std::string &path) override;

// Memory map succinct data structures
virtual size_t MemoryMap(const std::string& path);
size_t MemoryMap(const std::string &path) override;

// Get succinct shard size
virtual size_t StorageSize();
size_t StorageSize() override;

protected:
int64_t GetKeyPos(const int64_t value_offset);
int64_t GetValueOffsetPos(const int64_t key);
int64_t GetKeyPos(int64_t value_offset);
int64_t GetValueOffsetPos(int64_t key);

// std::pair<int64_t, int64_t> get_range_slow(const char *str, uint64_t len);
std::pair<int64_t, int64_t> GetRange(const char *str, uint64_t len);
Expand Down
14 changes: 7 additions & 7 deletions core/src/layered_succinct_shard.cc
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
#include "layered_succinct_shard.h"

#include <utility>

LayeredSuccinctShard::LayeredSuccinctShard(
uint32_t id, std::string datafile, SuccinctMode s_mode,
uint32_t sa_sampling_rate, uint32_t isa_sampling_rate,
uint32_t sampling_range, bool opportunistic, uint32_t npa_sampling_rate,
NPA::NPAEncodingScheme npa_encoding_scheme, uint32_t context_len)
: SuccinctShard(
id,
datafile,
std::move(datafile),
s_mode,
sa_sampling_rate,
isa_sampling_rate,
Expand Down Expand Up @@ -50,7 +52,7 @@ size_t LayeredSuccinctShard::reconstruct_layer(uint32_t layer_id) {
void LayeredSuccinctShard::get(std::string& result, int64_t key) {

if (!opportunistic) {
LayeredSampledISA *ISA_lay = (LayeredSampledISA *) isa_;
auto *ISA_lay = (LayeredSampledISA *) isa_;
result = "";
int64_t pos = GetValueOffsetPos(key);
if (pos < 0)
Expand All @@ -74,8 +76,7 @@ void LayeredSuccinctShard::get(std::string& result, int64_t key) {
return;
}

OpportunisticLayeredSampledISA *ISA_opp =
(OpportunisticLayeredSampledISA *) isa_;
auto *ISA_opp = (OpportunisticLayeredSampledISA *) isa_;

result = "";
int64_t pos = GetValueOffsetPos(key);
Expand Down Expand Up @@ -111,7 +112,7 @@ uint64_t LayeredSuccinctShard::num_sampled_values() {
void LayeredSuccinctShard::access(std::string& result, int64_t key,
int32_t offset, int32_t len) {
if (!opportunistic) {
LayeredSampledISA *ISA_lay = (LayeredSampledISA *) isa_;
auto *ISA_lay = (LayeredSampledISA *) isa_;
result = "";
int64_t pos = GetValueOffsetPos(key);
if (pos < 0)
Expand All @@ -131,8 +132,7 @@ void LayeredSuccinctShard::access(std::string& result, int64_t key,
return;
}

OpportunisticLayeredSampledISA *ISA_opp =
(OpportunisticLayeredSampledISA *) isa_;
auto *ISA_opp = (OpportunisticLayeredSampledISA *) isa_;
result = "";
int64_t pos = GetValueOffsetPos(key);
if (pos < 0)
Expand Down
Loading

0 comments on commit c182cd1

Please sign in to comment.