diff --git a/ansible/roles/setup_database/tasks/import_csv.yml b/ansible/roles/setup_database/tasks/import_csv.yml index 19d8a5ed79..0c5b89a53e 100644 --- a/ansible/roles/setup_database/tasks/import_csv.yml +++ b/ansible/roles/setup_database/tasks/import_csv.yml @@ -19,10 +19,10 @@ url: "{{ download_url }}{{ item }}" dest: "/tmp/{{ item }}" with_items: - - sentences.tar.bz2 - - links.tar.bz2 + - sentences.csv.zst + - links.csv.zst - tag_metadata.csv - - tags_detailed.tar.bz2 + - tags_detailed.csv.zst when: import_csv == 'download' - name: Unpacking csv's @@ -34,9 +34,9 @@ owner: mysql group: mysql with_items: - - sentences.tar.bz2 - - links.tar.bz2 - - tags_detailed.tar.bz2 + - sentences.csv.zst + - links.csv.zst + - tags_detailed.csv.zst when: import_csv == 'download' - name: Import sentences in the tatoeba database
diff --git a/docs/cron/export.sh b/docs/cron/export.sh
index 1443df4bb6..360604cc4d 100755
--- a/docs/cron/export.sh
+++ b/docs/cron/export.sh
@@ -17,28 +17,53 @@ mv /var/tmp/*csv "$DL_DIR"
 mysql -u "$DB_USER" -p"$DB_PASS" "$DB" < "$ROOT"/docs/database/scripts/wwwjdic.sql
 mv /var/tmp/*csv "$DL_DIR"
 
-echo "Starting tarring at $(date -Iseconds)"
+compress_csv () {
+    # Compress the CSV export $1 into two downloadable files:
+    #   - a legacy bzipped tar archive named "$2.tar.bz2" ($2 is optional and
+    #     defaults to $1 without its .csv suffix), so existing URLs keep working;
+    #   - "$1.zst", the new Zstandard file.
+    # $1 itself is kept; callers remove it afterwards where needed.
+    #
+    # TODO: Remove the bzipped tar archive once most users have migrated to the
+    #       zstd file. During the transition period we should monitor which
+    #       files get downloaded to see how the migration is going. Producing
+    #       both formats wastes CPU time and disk space every week.
+    tar -cjf "${2:-${1%.csv}}.tar.bz2" "$1"
+    # -f overwrites the .zst left by the previous run instead of refusing to.
+    zstd -19 -qf "$1"
+}
+
+compress_tsv () {
+    # Compress the per-language TSV file $1 into "$1.bz2" and "$1.zst", then
+    # delete $1.
+    #
+    # TODO: Remove the bzip2 file once we see fewer requests for it.
+    # -k keeps $1 so that zstd can still read it; zstd --rm deletes it at the end.
+    bzip2 -qfk "$1"
+    zstd -19 -qf --rm "$1"
+}
+
+echo "Starting compressing at $(date -Iseconds)"
 cd "$DL_DIR"
-tar -cjf sentences_base.tar.bz2 sentences_base.csv
-tar -cjf sentences_detailed.tar.bz2 sentences_detailed.csv
-tar -cjf links.tar.bz2 links.csv
-tar -cjf sentences.tar.bz2 sentences.csv
-tar -cjf contributions.tar.bz2 contributions.csv
+compress_csv sentences_base.csv
+compress_csv sentences_detailed.csv
+compress_csv links.csv
+compress_csv sentences.csv
+compress_csv contributions.csv
 rm contributions.csv
-tar -cjf comments.tar.bz2 sentence_comments.csv
+compress_csv sentence_comments.csv comments
 rm sentence_comments.csv
-tar -cjf wall.tar.bz2 wall_posts.csv
+compress_csv wall_posts.csv wall
 rm wall_posts.csv
-tar -cjf tags.tar.bz2 tags.csv
-tar -cjf user_lists.tar.bz2 user_lists.csv
-tar -cjf sentences_in_lists.tar.bz2 sentences_in_lists.csv
-tar -cjf jpn_indices.tar.bz2 jpn_indices.csv
-tar -cjf sentences_with_audio.tar.bz2 sentences_with_audio.csv
-tar -cjf user_languages.tar.bz2 user_languages.csv
-tar -cjf tags_detailed.tar.bz2 tags_detailed.csv
-tar -cjf sentences_CC0.tar.bz2 sentences_CC0.csv
-tar -cjf transcriptions.tar.bz2 transcriptions.csv
-tar -cjf sentences_base.tar.bz2 sentences_base.csv
+compress_csv tags.csv
+compress_csv user_lists.csv
+compress_csv sentences_in_lists.csv
+compress_csv jpn_indices.csv
+compress_csv sentences_with_audio.csv
+compress_csv user_languages.csv
+compress_csv tags_detailed.csv
+compress_csv sentences_CC0.csv
+compress_csv transcriptions.csv
 
 echo "Starting language splitting for sentences at $(date -Iseconds)"
 # Create per-language files for the different sentences files
@@ -173,7 +198,8 @@ mysql --skip-column-names --batch tatoeba -e \
 }'
 
 echo "Starting cleanup at $(date -Iseconds)"
-find $TEMP_DIR -path '*tsv' -exec bzip2 -qf '{}' +
+# find -exec can only run executables, not shell functions, so loop over the paths.
+find "$TEMP_DIR" -path '*tsv' | while IFS= read -r tsv; do compress_tsv "$tsv"; done
 rm -rf $DL_DIR/per_language
 rm transcriptions.csv
 mv -f $TEMP_DIR $DL_DIR
diff --git 
a/src/Template/Pages/downloads.ctp b/src/Template/Pages/downloads.ctp index 412a202f64..0e8b200f3f 100644 --- a/src/Template/Pages/downloads.ctp +++ b/src/Template/Pages/downloads.ctp @@ -273,7 +273,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- sentences_base.tar.bz2 + sentences_base.csv.zst
@@ -323,7 +323,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- links.tar.bz2 + links.csv.zst
@@ -350,7 +350,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- tags.tar.bz2 + tags.csv.zst
@@ -376,7 +376,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- user_lists.tar.bz2 + user_lists.csv.zst
@@ -400,8 +400,8 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- - sentences_in_lists.tar.bz2 + + sentences_in_lists.csv.zst
@@ -429,7 +429,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- jpn_indices.tar.bz2 + jpn_indices.csv.zst
@@ -460,8 +460,8 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- - sentences_with_audio.tar.bz2 + + sentences_with_audio.csv.zst
@@ -501,7 +501,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
- user_languages.tar.bz2 + user_languages.csv.zst
diff --git a/src/View/Helper/DownloadsHelper.php b/src/View/Helper/DownloadsHelper.php index c851665e4d..15056d8e4d 100644 --- a/src/View/Helper/DownloadsHelper.php +++ b/src/View/Helper/DownloadsHelper.php @@ -48,7 +48,7 @@ private function availableFiles($basename) { ); $dir = new Folder($perLanguageDir); - $paths = $dir->findRecursive(".*$basename\.tsv\.bz2$"); + $paths = $dir->findRecursive(".*$basename\.tsv\.zst$"); $map = []; foreach ($paths as $path) { $path = substr($path, strlen($perLanguageDir) + 1); @@ -74,7 +74,7 @@ private function availableFiles($basename) { public function createOptions($basename) { $urlForAll = Folder::addPathElement( Configure::read('Downloads.url'), - "$basename.tar.bz2" + "$basename.csv.zst" ); $options[0] = [ 'language' => __('All languages'), diff --git a/tests/TestCase/View/Helper/DownloadsHelperTest.php b/tests/TestCase/View/Helper/DownloadsHelperTest.php index 77d4716fdc..aa17b89117 100644 --- a/tests/TestCase/View/Helper/DownloadsHelperTest.php +++ b/tests/TestCase/View/Helper/DownloadsHelperTest.php @@ -14,7 +14,7 @@ class DownloadsHelperTest extends TestCase { private static function createTempTree() { $languages = ['eng', 'fra', 'jpn', 'unknown']; - $files = ['sentences.tsv.bz2', 'sentences_detailed.tsv.bz2', 'sentences_CC0.tsv.bz2']; + $files = ['sentences.tsv.zst', 'sentences_detailed.tsv.zst', 'sentences_CC0.tsv.zst']; foreach ($languages as $lang) { $path = Folder::addPathElement(TMP, ['exports', 'per_language', $lang]); $subdir = new Folder($path, true); @@ -54,7 +54,7 @@ public function testCreateOptions_InvalidBasename() { $this->assertEquals(1, count($options)); $this->assertEquals( - Folder::addPathElement(Configure::read('Downloads.url'), "foobar.tar.bz2"), + Folder::addPathElement(Configure::read('Downloads.url'), "foobar.csv.zst"), $options[0]['url'] ); } @@ -77,14 +77,14 @@ public function testCreateOptions_ValidBasename($basename) { $this->assertEquals( Folder::addPathElement( 
Configure::read('Downloads.url'), - "$basename.tar.bz2" + "$basename.csv.zst" ), $options[0]['url'] ); $this->assertEquals( Folder::addPathElement( Configure::read('Downloads.url'), - ['per_language', 'eng', "eng_$basename.tsv.bz2"] + ['per_language', 'eng', "eng_$basename.tsv.zst"] ), $options[1]['url'] ); @@ -103,7 +103,7 @@ public function testCreateOptions_NoPerLanguageFilesAvailable($basename) { $this->assertEquals( Folder::addPathElement( Configure::read('Downloads.url'), - "$basename.tar.bz2" + "$basename.csv.zst" ), $options[0]['url'] );