diff --git a/bin/Makefile.am b/bin/Makefile.am index ae4f5f1f2..20559d744 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -9,6 +9,7 @@ leech_SOURCES = main.c \ diff.c diff.h \ rebase.c rebase.h \ patch.c patch.h \ - history.c history.h + history.c history.h \ + purge.c purge.h leech_LDADD = @PSQL_LIBS@ $(top_builddir)/lib/libleech.la leech_CFLAGS = @PSQL_CFLAGS@ diff --git a/bin/main.c b/bin/main.c index 6db34f7a3..549f7ee54 100644 --- a/bin/main.c +++ b/bin/main.c @@ -12,6 +12,7 @@ #include "diff.h" #include "history.h" #include "patch.h" +#include "purge.h" #include "rebase.h" #ifdef HAVE_LIBPQ @@ -54,6 +55,7 @@ static const struct command COMMANDS[] = { {"rebase", "rebase to current table state", Rebase}, {"patch", "apply changes to tables", Patch}, {"history", "get history of a specific record", History}, + {"purge", "delete old/unreachable blocks", Purge}, {NULL, NULL, NULL}, }; diff --git a/bin/purge.c b/bin/purge.c new file mode 100644 index 000000000..2b8976751 --- /dev/null +++ b/bin/purge.c @@ -0,0 +1,52 @@ +#include "purge.h" + +#include + +#include "common.h" + +enum OPTION_VALUE { + OPTION_HELP = 1, +}; + +struct arguments { + const char *arg; + const char *desc; +}; + +static const struct option OPTIONS[] = { + {"help", no_argument, NULL, OPTION_HELP}, + {NULL, 0, NULL, 0}, +}; + +static const char *const DESCRIPTIONS[] = { + "print help message", +}; + +static void PrintHelp(void) { + PrintVersion(); + printf("\n"); + PrintOptions(OPTIONS, DESCRIPTIONS); + printf("\n"); + PrintBugreport(); + printf("\n"); +} + +int Purge(const char *const work_dir, int argc, char *argv[]) { + int opt; + while ((opt = getopt_long(argc, argv, "+", OPTIONS, NULL)) != -1) { + switch (opt) { + case OPTION_HELP: + PrintHelp(); + return EXIT_SUCCESS; + default: + return EXIT_FAILURE; + } + } + + if (!LCH_Purge(work_dir)) { + fprintf(stderr, "Failed to purge blocks"); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/bin/purge.h b/bin/purge.h new file mode 100644 index 000000000..48190c750 --- /dev/null +++ b/bin/purge.h @@ -0,0 +1,6 @@ +#ifndef _LEECH_PURGE_H +#define _LEECH_PURGE_H + +int Purge(const char *work_dir, int argc, char *argv[]); + +#endif // _LEECH_PURGE_H diff --git a/configure.ac b/configure.ac index 6a7140203..af72256bf 100644 --- a/configure.ac +++ b/configure.ac @@ -46,7 +46,7 @@ AM_CONDITIONAL([BUILD_PSQL_MODULE], [test "x$with_psql_module" = "xyes"]) AM_CONDITIONAL([BUILD_CSV_MODULE], [test "x$with_csv_module" = "xyes"]) # Config constants. -AC_DEFINE([LCH_DEFAULT_MAX_CHAIN_LENGTH], 2048, [Default max chain length used in block garbage collector]) +AC_DEFINE([LCH_DEFAULT_PREFERED_CHAIN_LENGTH], 2048, [Default prefered chain length used in block garbage collector]) AC_DEFINE([LCH_JSON_PRETTY_INDENT_SIZE], 2, [Indent size used when composing pretty JSON]) AC_DEFINE([LCH_BUFFER_SIZE], 1024, [Initial buffer size allocated by leech]) AC_DEFINE([LCH_DICT_CAPACITY], 256, [Initial dictionary capacity allocated by leech]) diff --git a/lib/block.c b/lib/block.c index 537e42e4d..96dc2966e 100644 --- a/lib/block.c +++ b/lib/block.c @@ -68,7 +68,7 @@ bool LCH_BlockStore(const LCH_Instance *const instance, assert(block != NULL); const char *const work_dir = LCH_InstanceGetWorkDirectory(instance); - const bool pretty_print = LCH_InstancePrettyPrint(instance); + const bool pretty_print = LCH_InstanceShouldPrettyPrint(instance); LCH_Buffer *const json = LCH_JsonCompose(block, pretty_print); if (json == NULL) { diff --git a/lib/files.c b/lib/files.c index de6e12ee9..863b27aa4 100644 --- a/lib/files.c +++ b/lib/files.c @@ -1,6 +1,7 @@ #include "files.h" #include +#include #include #include #include @@ -9,7 +10,6 @@ #include #include "definitions.h" -#include "list.h" #include "logger.h" #include "string_lib.h" @@ -152,3 +152,49 @@ bool LCH_FileCreateParentDirectories(const char *const filename) { LCH_ListDestroy(dirs); return true; } + +LCH_List *LCH_FileListDirectory(const char *const path, + const bool filter_hidden) { + LCH_List *const filenames = LCH_ListCreate(); + if (filenames == NULL) { + return NULL; + } + + DIR *dir = opendir(path); + if (dir == NULL) { + LCH_LOG_ERROR("Failed to open directory '%s': %s", path, strerror(errno)); + LCH_ListDestroy(filenames); + return NULL; + } + + errno = 0; // Only way to distinguish between error or end-of-directory + struct dirent *entry = NULL; + while ((entry = readdir(dir)) != NULL) { + if (filter_hidden && LCH_StringStartsWith(entry->d_name, ".")) { + continue; + } + + char *const filename = LCH_StringDuplicate(entry->d_name); + if (filename == NULL) { + LCH_ListDestroy(filenames); + closedir(dir); + return NULL; + } + + if (!LCH_ListAppend(filenames, filename, free)) { + LCH_ListDestroy(filenames); + closedir(dir); + return NULL; + } + } + + if (errno != 0) { + LCH_LOG_ERROR("Failed to read directory '%s': %s", path, strerror(errno)); + LCH_ListDestroy(filenames); + closedir(dir); + return NULL; + } + + closedir(dir); + return filenames; +} diff --git a/lib/files.h b/lib/files.h index 1f1777de1..ae65a07ed 100644 --- a/lib/files.h +++ b/lib/files.h @@ -4,6 +4,8 @@ #include #include +#include "list.h" + bool LCH_FileSize(FILE *file, size_t *size); bool LCH_FileExists(const char *path); @@ -23,4 +25,6 @@ bool LCH_FileDelete(const char *filename); bool LCH_FileCreateParentDirectories(const char *filename); +LCH_List *LCH_FileListDirectory(const char *path, bool filter_hidden); + #endif // _LEECH_FILES_H diff --git a/lib/instance.c b/lib/instance.c index 441929ca6..86c037421 100644 --- a/lib/instance.c +++ b/lib/instance.c @@ -17,8 +17,9 @@ struct LCH_Instance { size_t major; size_t minor; size_t patch; - size_t max_chain_length; + size_t chain_length; bool pretty_print; + bool auto_purge; LCH_List *tables; }; @@ -72,8 +73,7 @@ LCH_Instance *LCH_InstanceLoad(const char *const work_dir) { } { - const LCH_Buffer *const key = - LCH_BufferStaticFromString("max_chain_length"); + const LCH_Buffer *const key = LCH_BufferStaticFromString("chain_length"); if (LCH_JsonObjectHasKey(config, key)) { double number; if (!LCH_JsonObjectGetNumber(config, key, &number)) { @@ -81,16 +81,28 @@ LCH_Instance *LCH_InstanceLoad(const char *const work_dir) { LCH_JsonDestroy(config); return NULL; } - if (!LCH_DoubleToSize(number, &(instance->max_chain_length))) { + if (!LCH_DoubleToSize(number, &(instance->chain_length))) { LCH_InstanceDestroy(instance); LCH_JsonDestroy(config); return NULL; } } else { - instance->max_chain_length = LCH_DEFAULT_MAX_CHAIN_LENGTH; + instance->chain_length = LCH_DEFAULT_PREFERED_CHAIN_LENGTH; } - LCH_LOG_DEBUG("config[\"max_chain_length\"] = \"%zu\"", - instance->max_chain_length); + LCH_LOG_DEBUG("config[\"chain_length\"] = %zu", instance->chain_length); + } + + { + instance->auto_purge = false; + const LCH_Buffer *const key = LCH_BufferStaticFromString("auto_purge"); + if (LCH_JsonObjectHasKey(config, key)) { + const LCH_Json *const json = LCH_JsonObjectGet(config, key); + if (LCH_JsonIsTrue(json)) { + instance->auto_purge = true; + } + } + LCH_LOG_DEBUG("config[\"auto_purge\"] = %s", + (instance->auto_purge) ? "true" : "false"); } { @@ -208,12 +220,17 @@ const char *LCH_InstanceGetWorkDirectory(const LCH_Instance *const self) { return self->work_dir; } -size_t LCH_InstanceGetMaxChainLength(const LCH_Instance *const instance) { +size_t LCH_InstanceGetPrefferedChainLength(const LCH_Instance *const instance) { assert(instance != NULL); - return instance->max_chain_length; + return instance->chain_length; } -bool LCH_InstancePrettyPrint(const LCH_Instance *const instance) { +bool LCH_InstanceShouldPrettyPrint(const LCH_Instance *const instance) { assert(instance != NULL); return instance->pretty_print; } + +bool LCH_InstanceShouldAutoPurge(const LCH_Instance *const instance) { + assert(instance != NULL); + return instance->auto_purge; +} diff --git a/lib/instance.h b/lib/instance.h index 200d62007..be52edece 100644 --- a/lib/instance.h +++ b/lib/instance.h @@ -42,8 +42,10 @@ const LCH_List *LCH_InstanceGetTables(const LCH_Instance *instance); */ const char *LCH_InstanceGetWorkDirectory(const LCH_Instance *instance); -size_t LCH_InstanceGetMaxChainLength(const LCH_Instance *instance); +size_t LCH_InstanceGetPrefferedChainLength(const LCH_Instance *instance); -bool LCH_InstancePrettyPrint(const LCH_Instance *instance); +bool LCH_InstanceShouldPrettyPrint(const LCH_Instance *instance); + +bool LCH_InstanceShouldAutoPurge(const LCH_Instance *instance); #endif // _LEECH_INSTANCE_H diff --git a/lib/leech.c b/lib/leech.c index fe7c89dd6..8f8d43d8a 100644 --- a/lib/leech.c +++ b/lib/leech.c @@ -1,7 +1,9 @@ #include "leech.h" #include +#include #include +#include #include #include @@ -9,6 +11,7 @@ #include "csv.h" #include "definitions.h" #include "delta.h" +#include "dict.h" #include "files.h" #include "head.h" #include "instance.h" @@ -20,102 +23,173 @@ const char *LCH_Version(void) { return PACKAGE_VERSION; } -static bool CollectGarbage(const LCH_Instance *const instance) { +static bool Purge(const LCH_Instance *const instance) { const char *const work_dir = LCH_InstanceGetWorkDirectory(instance); - const size_t max_chain_length = LCH_InstanceGetMaxChainLength(instance); + const size_t chain_length = LCH_InstanceGetPrefferedChainLength(instance); - char *block_id = LCH_HeadGet("HEAD", work_dir); - if (block_id == NULL) { - free(block_id); + char *const head = LCH_HeadGet("HEAD", work_dir); + if (head == NULL) { return false; } - // Traverse all the blocks that we want to keep + // We'll use the dict as a map + LCH_Dict *const whitelist = LCH_DictCreate(); + if (whitelist == NULL) { + free(head); + return false; + } + + LCH_Json *child = NULL; + LCH_Json *parent = NULL; + const char *child_id = NULL; + const char *parent_id = head; + char path[PATH_MAX]; - for (size_t i = 0; i < max_chain_length; i++) { - if (!LCH_FilePathJoin(path, PATH_MAX, 3, work_dir, "blocks", block_id)) { + for (size_t i = 0; i < chain_length; i++) { + if (!LCH_FilePathJoin(path, PATH_MAX, 3, work_dir, "blocks", parent_id)) { + LCH_JsonDestroy(child); + LCH_JsonDestroy(parent); + LCH_DictDestroy(whitelist); + free(head); return false; } if (!LCH_FileExists(path)) { - LCH_LOG_DEBUG( - "Block with identifier %s does not exist: " - "End-of-Chain reached at index %zu", - block_id, i); - LCH_LOG_VERBOSE("Garbage collector deleted 0 blocks", i); - free(block_id); - return true; + LCH_LOG_DEBUG("End-of-Chain reached at index %zu", i); + break; } - LCH_Json *const block = LCH_BlockLoad(work_dir, block_id); - free(block_id); - if (block == NULL) { + const LCH_Buffer *const key = LCH_BufferStaticFromString(parent_id); + if (!LCH_DictSet(whitelist, key, NULL, NULL)) { + LCH_JsonDestroy(child); + LCH_JsonDestroy(parent); + LCH_DictDestroy(whitelist); + free(head); return false; } + if (child_id == NULL) { + LCH_LOG_DEBUG("Whitelisted block %.7s, head of chain (index %zu)", + parent_id, i); + } else { + LCH_LOG_DEBUG("Whitelisted block %.7s, parent of %.7s (index %zu)", + parent_id, child_id, i); + } - const char *const parent_id = LCH_BlockGetParentId(block); - if (parent_id == NULL) { - LCH_JsonDestroy(block); + LCH_Json *const block = LCH_BlockLoad(work_dir, parent_id); + if (block == NULL) { + LCH_JsonDestroy(parent); + LCH_JsonDestroy(child); + LCH_DictDestroy(whitelist); + free(head); return false; } - block_id = LCH_StringDuplicate(parent_id); - LCH_JsonDestroy(block); - if (block_id == NULL) { + LCH_JsonDestroy(child); + child = parent; + parent = block; + + child_id = parent_id; + parent_id = LCH_BlockGetParentId(parent); + if (parent_id == NULL) { + LCH_JsonDestroy(parent); + LCH_JsonDestroy(child); + LCH_DictDestroy(whitelist); + free(head); return false; } } - // Now start deleting blocks + LCH_JsonDestroy(parent); + LCH_JsonDestroy(child); + free(head); - if (!LCH_FilePathJoin(path, PATH_MAX, 3, work_dir, "blocks", block_id)) { - return NULL; + if (!LCH_FilePathJoin(path, PATH_MAX, 2, work_dir, "blocks")) { + LCH_DictDestroy(whitelist); + return false; } - size_t i = 0; - while (LCH_FileExists(path)) { - LCH_Json *const block = LCH_BlockLoad(work_dir, block_id); - if (block == NULL) { - free(block_id); + LCH_List *const files = LCH_FileListDirectory(path, true); + if (files == NULL) { + LCH_DictDestroy(whitelist); + return false; + } + + size_t num_deleted = 0; + size_t num_blocks = 0; + const size_t num_files = LCH_ListLength(files); + for (size_t i = 0; i < num_files; i++) { + const char *const filename = (char *)LCH_ListGet(files, i); + if (!LCH_FilePathJoin(path, PATH_MAX, 3, work_dir, "blocks", filename)) { + LCH_ListDestroy(files); + LCH_DictDestroy(whitelist); return false; } - LCH_LOG_DEBUG("Deleting block with identifier %.7s (path='%s')", block_id, - path); - free(block_id); - if (!LCH_FileDelete(path)) { - LCH_JsonDestroy(block); - return false; + bool is_block_id = true; + for (const char *ch = filename; *ch != '\0'; ch++) { + if (isxdigit(*ch) == 0) { + is_block_id = false; + break; + } + } + if (!is_block_id) { + LCH_LOG_DEBUG( + "Skipping deletion of file '%s': " + "Basename contains an invalid block identifier '%s'", + path, filename); + continue; } - const char *const parent_id = LCH_BlockGetParentId(block); - if (parent_id == NULL) { - LCH_JsonDestroy(block); - return false; + if (!LCH_FileIsRegular(path)) { + LCH_LOG_DEBUG("Skipping deletion of file '%s': Not a regular file", path); + continue; } - block_id = LCH_StringDuplicate(parent_id); - LCH_JsonDestroy(block); - if (block_id == NULL) { - return false; + // By now we're pretty certain that it is indeed a block. + num_blocks += 1; + + const LCH_Buffer *const key = LCH_BufferStaticFromString(filename); + if (LCH_DictHasKey(whitelist, key)) { + LCH_LOG_DEBUG("Skipping deletion of file '%s': Block is whitelisted", + path); + continue; } - if (!LCH_FilePathJoin(path, PATH_MAX, 3, work_dir, "blocks", block_id)) { - free(block_id); + if (!LCH_FileDelete(path)) { + LCH_ListDestroy(files); + LCH_DictDestroy(whitelist); return false; } + LCH_LOG_VERBOSE("Deleted file '%s'", path); + num_deleted += 1; + } + + LCH_LOG_INFO("Purged %zu out of %zu blocks", num_deleted, num_blocks); + + LCH_ListDestroy(files); + LCH_DictDestroy(whitelist); + return true; +} - i += 1; +bool LCH_Purge(const char *const work_dir) { + LCH_Instance *const instance = LCH_InstanceLoad(work_dir); + if (instance == NULL) { + LCH_LOG_ERROR("Failed to load instance from configuration file"); + return false; } - free(block_id); - LCH_LOG_VERBOSE("Garbage collector deleted %zu block(s)", i); + if (!Purge(instance)) { + LCH_InstanceDestroy(instance); + return false; + } + + LCH_InstanceDestroy(instance); return true; } static bool Commit(const LCH_Instance *const instance) { const char *const work_dir = LCH_InstanceGetWorkDirectory(instance); - const bool pretty_print = LCH_InstancePrettyPrint(instance); + const bool pretty_print = LCH_InstanceShouldPrettyPrint(instance); const LCH_List *const table_defs = LCH_InstanceGetTables(instance); size_t n_tables = LCH_ListLength(table_defs); @@ -252,12 +326,12 @@ bool LCH_Commit(const char *const work_dir) { return false; } - if (!CollectGarbage(instance)) { - LCH_LOG_ERROR( - "Failed to collect garbage: " - "NB. there may be unreachable blocks"); - LCH_InstanceDestroy(instance); - return false; + if (LCH_InstanceShouldAutoPurge(instance)) { + LCH_LOG_DEBUG("Auto purge is enabled; purging blocks"); + if (!Purge(instance)) { + LCH_InstanceDestroy(instance); + return false; + } } LCH_InstanceDestroy(instance); @@ -414,7 +488,7 @@ LCH_Buffer *LCH_Diff(const char *const work_dir, const char *const final_id) { return NULL; } - const bool pretty_print = LCH_InstancePrettyPrint(instance); + const bool pretty_print = LCH_InstanceShouldPrettyPrint(instance); char *const block_id = LCH_HeadGet("HEAD", work_dir); if (block_id == NULL) { @@ -474,7 +548,7 @@ LCH_Buffer *LCH_Rebase(const char *const work_dir) { return NULL; } - const bool pretty_print = LCH_InstancePrettyPrint(instance); + const bool pretty_print = LCH_InstanceShouldPrettyPrint(instance); const LCH_List *const table_defs = LCH_InstanceGetTables(instance); size_t n_tables = LCH_ListLength(table_defs); @@ -985,7 +1059,7 @@ LCH_Buffer *LCH_History(const char *const work_dir, const char *const table_id, LCH_BufferDestroy(primary); free(block_id); - const bool pretty = LCH_InstancePrettyPrint(instance); + const bool pretty = LCH_InstanceShouldPrettyPrint(instance); LCH_Buffer *const buffer = LCH_JsonCompose(response, pretty); LCH_JsonDestroy(response); if (buffer == NULL) { diff --git a/lib/leech.h b/lib/leech.h index 46cadfa9d..5f6f2f417 100644 --- a/lib/leech.h +++ b/lib/leech.h @@ -203,4 +203,6 @@ LCH_Buffer *LCH_History(const char *work_dir, const char *table_id, bool LCH_Patch(const char *work_dir, const char *uid_field, const char *uid_value, const char *patch, size_t size); +bool LCH_Purge(const char *work_dir); + #endif // _LEECH_LEECH_H diff --git a/tests/check_json.c b/tests/check_json.c index 7238c198c..5b6b110f3 100644 --- a/tests/check_json.c +++ b/tests/check_json.c @@ -942,9 +942,9 @@ START_TEST(test_LCH_JsonCompose) { ck_assert(LCH_JsonObjectSetString( config, LCH_BufferStaticFromString("version"), version)); - const double max_chain_length = 64.0; + const double chain_length = 64.0; ck_assert(LCH_JsonObjectSetNumber( - config, LCH_BufferStaticFromString("max_chain_length"), 64.0)); + config, LCH_BufferStaticFromString("chain_length"), 64.0)); LCH_Json *const pretty_json = LCH_JsonTrueCreate(); ck_assert_ptr_nonnull(pretty_json); @@ -1002,7 +1002,6 @@ START_TEST(test_LCH_JsonCompose) { "{\n" " \"version\": \"" PACKAGE_VERSION "\",\n" - " \"max_chain_length\": 64,\n" " \"compression\": false,\n" " \"tables\": {\n" " \"BTL\": {\n" @@ -1014,6 +1013,7 @@ START_TEST(test_LCH_JsonCompose) { " \"subsidiary_fields\": null\n" " }\n" " },\n" + " \"chain_length\": 64,\n" " \"pretty_json\": true\n" "}\n"; diff --git a/tests/test_leech.py b/tests/test_leech.py index 2d0d9ba1b..1c7a5b676 100644 --- a/tests/test_leech.py +++ b/tests/test_leech.py @@ -732,7 +732,7 @@ def test_leech_csv_binary(tmp_path): assert execute(command, True) == 0 -def test_leech_garbage_collect(tmp_path): +def test_leech_purge(tmp_path): ########################################################################## # Create config ########################################################################## @@ -744,7 +744,7 @@ def test_leech_garbage_collect(tmp_path): config = { "version": "0.1.0", - "max_chain_length": 3, + "chain_length": 3, "tables": { "BTL": { "primary_fields": ["first_name", "last_name"], @@ -774,6 +774,59 @@ def test_leech_garbage_collect(tmp_path): command = [bin_path, "--debug", f"--workdir={tmp_path}", "commit"] assert execute(command, True) == 0 + assert len(os.listdir(os.path.join(tmp_path, "blocks"))) == 5 + + command = [bin_path, "--debug", f"--workdir={tmp_path}", "purge"] + assert execute(command, True) == 0 + + assert len(os.listdir(os.path.join(tmp_path, "blocks"))) == 3 + + +def test_leech_auto_purge(tmp_path): + ########################################################################## + # Create config + ########################################################################## + + bin_path = os.path.join("bin", "leech") + leech_conf_path = os.path.join(tmp_path, "leech.json") + table_src_path = os.path.join(tmp_path, "beatles.src.csv") + table_dst_path = os.path.join(tmp_path, "beatles.dst.csv") + + config = { + "version": "0.1.0", + "auto_purge": True, + "chain_length": 3, + "tables": { + "BTL": { + "primary_fields": ["first_name", "last_name"], + "subsidiary_fields": ["born"], + "source": { + "params": table_src_path, + "schema": "leech", + "table_name": "beatles", + "callbacks": "lib/.libs/leech_csv.so", + }, + "destination": { + "params": table_dst_path, + "schema": "leech", + "table_name": "beatles", + "callbacks": "lib/.libs/leech_csv.so", + }, + } + }, + } + with open(leech_conf_path, "w") as f: + json.dump(config, f, indent=2) + print(f"Created leech config '{leech_conf_path}' with content:") + with open(leech_conf_path, "r") as f: + print(f.read()) + + for _ in range(5): + command = [bin_path, "--debug", f"--workdir={tmp_path}", "commit"] + assert execute(command, True) == 0 + + assert len(os.listdir(os.path.join(tmp_path, "blocks"))) == 3 + def test_leech_churn(tmp_path): ##########################################################################