From 383eb07fc54e0869b685f21656fce94a1b3d90ba Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Mon, 9 Dec 2024 14:53:48 +0100 Subject: [PATCH] Switch exports to Zstd, from bzip2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main benefit of this format for our users is that it decompresses much faster than bzip2, even at high compression levels. At level 19 it compresses even better than bzip2 for our files. Hopefully the compression time is still acceptable; if not, we can reduce the level so as not to overwork the server, at the price of slightly bigger files. On my i7-8700K, unarchiving sentences.tar.bz2 takes 15.5s, compared to 994ms for sentences.csv.zst compressed at level 19. The file is 183 MiB compared to 197 MiB with bzip2. We could go down to 167 MiB with level 22 (which decompresses in 941ms), but compression time starts to get much higher, so I’m not sure this is worth it. The only downside I see to this change is that user automation will have to be changed, so we should perhaps announce it somehow before deploying it. I’ve also removed the tar step, which only added overhead since each archive only ever contained a single file. 
--- .../roles/setup_database/tasks/import_csv.yml | 12 ++-- docs/cron/export.sh | 56 ++++++++++++------- src/Template/Pages/downloads.ctp | 20 +++---- src/View/Helper/DownloadsHelper.php | 4 +- .../View/Helper/DownloadsHelperTest.php | 10 ++-- 5 files changed, 60 insertions(+), 42 deletions(-) diff --git a/ansible/roles/setup_database/tasks/import_csv.yml b/ansible/roles/setup_database/tasks/import_csv.yml index 19d8a5ed79..0c5b89a53e 100644 --- a/ansible/roles/setup_database/tasks/import_csv.yml +++ b/ansible/roles/setup_database/tasks/import_csv.yml @@ -19,10 +19,10 @@ url: "{{ download_url }}{{ item }}" dest: "/tmp/{{ item }}" with_items: - - sentences.tar.bz2 - - links.tar.bz2 + - sentences.csv.zst + - links.csv.zst - tag_metadata.csv - - tags_detailed.tar.bz2 + - tags_detailed.csv.zst when: import_csv == 'download' - name: Unpacking csv's @@ -34,9 +34,9 @@ owner: mysql group: mysql with_items: - - sentences.tar.bz2 - - links.tar.bz2 - - tags_detailed.tar.bz2 + - sentences.csv.zst + - links.csv.zst + - tags_detailed.csv.zst when: import_csv == 'download' - name: Import sentences in the tatoeba database diff --git a/docs/cron/export.sh b/docs/cron/export.sh index 1443df4bb6..360604cc4d 100755 --- a/docs/cron/export.sh +++ b/docs/cron/export.sh @@ -17,28 +17,46 @@ mv /var/tmp/*csv "$DL_DIR" mysql -u "$DB_USER" -p"$DB_PASS" "$DB" < "$ROOT"/docs/database/scripts/wwwjdic.sql mv /var/tmp/*csv "$DL_DIR" -echo "Starting tarring at $(date -Iseconds)" +compress_csv () { + # Usage: compress_csv FILE.csv [LEGACY_BASENAME]. Writes FILE.csv.zst and, + # during the transition, LEGACY_BASENAME.tar.bz2 (default: FILE); -f lets + # the weekly run overwrite last week's output. FILE.csv itself is kept. + # + # TODO: Remove the bzipped tar archive once most users have migrated to + # zstd (watch download stats); it wastes CPU and disk for us and for users. + tar -cjf "${2:-${1%.csv}}.tar.bz2" "$1" + zstd -19 -qf "$1" +} + +compress_tsv () { + # Usage: compress_tsv FILE.tsv. bzip2 -k keeps the input for zstd, whose --rm + # then deletes it. TODO: Same as above, remove the .bz2 once requests drop. 
+ bzip2 -kqf "$1" && + zstd -19 -qf --rm "$1" +} + +echo "Starting compressing at $(date -Iseconds)" cd "$DL_DIR" -tar -cjf sentences_base.tar.bz2 sentences_base.csv -tar -cjf sentences_detailed.tar.bz2 sentences_detailed.csv -tar -cjf links.tar.bz2 links.csv -tar -cjf sentences.tar.bz2 sentences.csv -tar -cjf contributions.tar.bz2 contributions.csv +compress_csv sentences_base.csv +compress_csv sentences_detailed.csv +compress_csv links.csv +compress_csv sentences.csv +compress_csv contributions.csv rm contributions.csv -tar -cjf comments.tar.bz2 sentence_comments.csv +compress_csv sentence_comments.csv comments rm sentence_comments.csv -tar -cjf wall.tar.bz2 wall_posts.csv +compress_csv wall_posts.csv wall rm wall_posts.csv -tar -cjf tags.tar.bz2 tags.csv -tar -cjf user_lists.tar.bz2 user_lists.csv -tar -cjf sentences_in_lists.tar.bz2 sentences_in_lists.csv -tar -cjf jpn_indices.tar.bz2 jpn_indices.csv -tar -cjf sentences_with_audio.tar.bz2 sentences_with_audio.csv -tar -cjf user_languages.tar.bz2 user_languages.csv -tar -cjf tags_detailed.tar.bz2 tags_detailed.csv -tar -cjf sentences_CC0.tar.bz2 sentences_CC0.csv -tar -cjf transcriptions.tar.bz2 transcriptions.csv -tar -cjf sentences_base.tar.bz2 sentences_base.csv +compress_csv tags.csv +compress_csv user_lists.csv +compress_csv sentences_in_lists.csv +compress_csv jpn_indices.csv +compress_csv sentences_with_audio.csv +compress_csv user_languages.csv +compress_csv tags_detailed.csv +compress_csv sentences_CC0.csv +compress_csv transcriptions.csv +compress_csv sentences_base.csv echo "Starting language splitting for sentences at $(date -Iseconds)" # Create per-language files for the different sentences files @@ -173,7 +191,7 @@ mysql --skip-column-names --batch tatoeba -e \ }' echo "Starting cleanup at $(date -Iseconds)" -find $TEMP_DIR -path '*tsv' -exec bzip2 -qf '{}' + +find "$TEMP_DIR" -path '*tsv' | while IFS= read -r tsv; do compress_tsv "$tsv"; done rm -rf $DL_DIR/per_language rm transcriptions.csv mv -f $TEMP_DIR $DL_DIR diff --git 
a/src/Template/Pages/downloads.ctp b/src/Template/Pages/downloads.ctp index 412a202f64..0e8b200f3f 100644 --- a/src/Template/Pages/downloads.ctp +++ b/src/Template/Pages/downloads.ctp @@ -273,7 +273,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- sentences_base.tar.bz2 + sentences_base.csv.zst
@@ -323,7 +323,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- links.tar.bz2 + links.csv.zst
@@ -350,7 +350,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- tags.tar.bz2 + tags.csv.zst
@@ -376,7 +376,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- user_lists.tar.bz2 + user_lists.csv.zst
@@ -400,8 +400,8 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- - sentences_in_lists.tar.bz2 + + sentences_in_lists.csv.zst
@@ -429,7 +429,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- jpn_indices.tar.bz2 + jpn_indices.csv.zst
@@ -460,8 +460,8 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- - sentences_with_audio.tar.bz2 + + sentences_with_audio.csv.zst
@@ -501,7 +501,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- user_languages.tar.bz2 + user_languages.csv.zst
diff --git a/src/View/Helper/DownloadsHelper.php b/src/View/Helper/DownloadsHelper.php index c851665e4d..15056d8e4d 100644 --- a/src/View/Helper/DownloadsHelper.php +++ b/src/View/Helper/DownloadsHelper.php @@ -48,7 +48,7 @@ private function availableFiles($basename) { ); $dir = new Folder($perLanguageDir); - $paths = $dir->findRecursive(".*$basename\.tsv\.bz2$"); + $paths = $dir->findRecursive(".*$basename\.tsv\.zst$"); $map = []; foreach ($paths as $path) { $path = substr($path, strlen($perLanguageDir) + 1); @@ -74,7 +74,7 @@ private function availableFiles($basename) { public function createOptions($basename) { $urlForAll = Folder::addPathElement( Configure::read('Downloads.url'), - "$basename.tar.bz2" + "$basename.csv.zst" ); $options[0] = [ 'language' => __('All languages'), diff --git a/tests/TestCase/View/Helper/DownloadsHelperTest.php b/tests/TestCase/View/Helper/DownloadsHelperTest.php index 77d4716fdc..aa17b89117 100644 --- a/tests/TestCase/View/Helper/DownloadsHelperTest.php +++ b/tests/TestCase/View/Helper/DownloadsHelperTest.php @@ -14,7 +14,7 @@ class DownloadsHelperTest extends TestCase { private static function createTempTree() { $languages = ['eng', 'fra', 'jpn', 'unknown']; - $files = ['sentences.tsv.bz2', 'sentences_detailed.tsv.bz2', 'sentences_CC0.tsv.bz2']; + $files = ['sentences.tsv.zst', 'sentences_detailed.tsv.zst', 'sentences_CC0.tsv.zst']; foreach ($languages as $lang) { $path = Folder::addPathElement(TMP, ['exports', 'per_language', $lang]); $subdir = new Folder($path, true); @@ -54,7 +54,7 @@ public function testCreateOptions_InvalidBasename() { $this->assertEquals(1, count($options)); $this->assertEquals( - Folder::addPathElement(Configure::read('Downloads.url'), "foobar.tar.bz2"), + Folder::addPathElement(Configure::read('Downloads.url'), "foobar.csv.zst"), $options[0]['url'] ); } @@ -77,14 +77,14 @@ public function testCreateOptions_ValidBasename($basename) { $this->assertEquals( Folder::addPathElement( 
Configure::read('Downloads.url'), - "$basename.tar.bz2" + "$basename.csv.zst" ), $options[0]['url'] ); $this->assertEquals( Folder::addPathElement( Configure::read('Downloads.url'), - ['per_language', 'eng', "eng_$basename.tsv.bz2"] + ['per_language', 'eng', "eng_$basename.tsv.zst"] ), $options[1]['url'] ); @@ -103,7 +103,7 @@ public function testCreateOptions_NoPerLanguageFilesAvailable($basename) { $this->assertEquals( Folder::addPathElement( Configure::read('Downloads.url'), - "$basename.tar.bz2" + "$basename.csv.zst" ), $options[0]['url'] );