Skip to content

Commit

Permalink
Switch exports to Zstd, from bzip2
Browse files Browse the repository at this point in the history
The main benefit of this format for our users is that it decompresses
much faster than bzip2, even at high compression levels.

At level 19 it compresses even better than bzip2 for our files.
Hopefully the compression time is still acceptable; if not, we can lower
the level so as not to overwork the server, at the price of slightly
bigger files.

On my i7-8700K, unarchiving sentences.tar.bz2 takes 15.5s, compared to
994ms for sentences.csv.zst compressed at level 19.  The file is 183 MiB
compared to 197 MiB with bzip2.  We could go down to 167 MiB with level
22 (which decompresses in 941ms), but compression time starts to get
much higher, so I am not sure this is worth it.

The only downside I see to this change is that user automation will have
to be changed, so we should announce it to users before deploying it.

I’ve also removed the tar step for the new Zstd files; it only added
overhead, since each archive only ever contained a single file.
  • Loading branch information
linkmauve committed Dec 10, 2024
1 parent 3206771 commit 383eb07
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 42 deletions.
12 changes: 6 additions & 6 deletions ansible/roles/setup_database/tasks/import_csv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
url: "{{ download_url }}{{ item }}"
dest: "/tmp/{{ item }}"
with_items:
- sentences.tar.bz2
- links.tar.bz2
- sentences.csv.zst
- links.csv.zst
- tag_metadata.csv
- tags_detailed.tar.bz2
- tags_detailed.csv.zst
when: import_csv == 'download'

- name: Unpacking csv's
Expand All @@ -34,9 +34,9 @@
owner: mysql
group: mysql
with_items:
- sentences.tar.bz2
- links.tar.bz2
- tags_detailed.tar.bz2
- sentences.csv.zst
- links.csv.zst
- tags_detailed.csv.zst
when: import_csv == 'download'

- name: Import sentences in the tatoeba database
Expand Down
56 changes: 37 additions & 19 deletions docs/cron/export.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,28 +17,46 @@ mv /var/tmp/*csv "$DL_DIR"
mysql -u "$DB_USER" -p"$DB_PASS" "$DB" < "$ROOT"/docs/database/scripts/wwwjdic.sql
mv /var/tmp/*csv "$DL_DIR"

echo "Starting tarring at $(date -Iseconds)"
# Compress one exported CSV file in both download formats.
#
# $1: path of the CSV file (must end in "csv").
#
# Produces the legacy bzipped tarball (foo.csv -> foo.tar.bz2) and the new
# Zstandard file (foo.csv -> foo.csv.zst). The CSV itself is left in place;
# callers remove it afterwards when it should not be published.
compress_csv () {
    # TODO: Remove the bzipped tar archive once most users have migrated to
    # the zstd file. During the transition period we should monitor which
    # files get downloaded to see how the migration is going.
    #
    # Producing both formats costs CPU time and disk space on every export,
    # and bzip2 costs users CPU time when decompressing, hence the
    # migration to Zstd.
    tar -cjf "${1%csv}tar.bz2" "$1"

    # -f: overwrite a .zst left over from a previous export (or from an
    #     earlier call on the same file in this run); without it zstd will
    #     not replace an existing output file.
    # -q: keep the output quiet, as compress_tsv does.
    zstd -19 -qf "$1"
}

# Compress TSV files in both download formats, then drop the plain TSV.
#
# $@: one or more paths of TSV files.
#
# For each foo.tsv this produces foo.tsv.bz2 and foo.tsv.zst and removes
# foo.tsv, so only the compressed files remain.
compress_tsv () {
    # TODO: Same as for the CSV files: remove the bzip2 file once we see
    # fewer requests for it.
    for _tsv_file in "$@"; do
        # -k keeps the input file. Plain `bzip2 -qf` deletes it, which would
        # leave nothing for zstd to compress.
        bzip2 -qfk "$_tsv_file"
        # --rm deletes the TSV after a successful compression, preserving
        # the "compressed files only" result bzip2 alone used to give.
        zstd -19 -qf --rm "$_tsv_file"
    done
}

echo "Starting compressing at $(date -Iseconds)"
cd "$DL_DIR"
tar -cjf sentences_base.tar.bz2 sentences_base.csv
tar -cjf sentences_detailed.tar.bz2 sentences_detailed.csv
tar -cjf links.tar.bz2 links.csv
tar -cjf sentences.tar.bz2 sentences.csv
tar -cjf contributions.tar.bz2 contributions.csv
compress_csv sentences_base.csv
compress_csv sentences_detailed.csv
compress_csv links.csv
compress_csv sentences.csv
compress_csv contributions.csv
rm contributions.csv
tar -cjf comments.tar.bz2 sentence_comments.csv
compress_csv sentence_comments.csv
rm sentence_comments.csv
tar -cjf wall.tar.bz2 wall_posts.csv
compress_csv wall_posts.csv
rm wall_posts.csv
tar -cjf tags.tar.bz2 tags.csv
tar -cjf user_lists.tar.bz2 user_lists.csv
tar -cjf sentences_in_lists.tar.bz2 sentences_in_lists.csv
tar -cjf jpn_indices.tar.bz2 jpn_indices.csv
tar -cjf sentences_with_audio.tar.bz2 sentences_with_audio.csv
tar -cjf user_languages.tar.bz2 user_languages.csv
tar -cjf tags_detailed.tar.bz2 tags_detailed.csv
tar -cjf sentences_CC0.tar.bz2 sentences_CC0.csv
tar -cjf transcriptions.tar.bz2 transcriptions.csv
tar -cjf sentences_base.tar.bz2 sentences_base.csv
compress_csv tags.csv
compress_csv user_lists.csv
compress_csv sentences_in_lists.csv
compress_csv jpn_indices.csv
compress_csv sentences_with_audio.csv
compress_csv user_languages.csv
compress_csv tags_detailed.csv
compress_csv sentences_CC0.csv
compress_csv transcriptions.csv
compress_csv sentences_base.csv

echo "Starting language splitting for sentences at $(date -Iseconds)"
# Create per-language files for the different sentences files
Expand Down Expand Up @@ -173,7 +191,7 @@ mysql --skip-column-names --batch tatoeba -e \
}'

echo "Starting cleanup at $(date -Iseconds)"
find $TEMP_DIR -path '*tsv' -exec bzip2 -qf '{}' +
# find's -exec runs an external program and cannot call a shell function,
# so feed each path to compress_tsv from this shell instead.
# Assumes export paths never contain newlines.
find "$TEMP_DIR" -path '*tsv' | while IFS= read -r tsv_path; do
    compress_tsv "$tsv_path"
done
rm -rf $DL_DIR/per_language
rm transcriptions.csv
mv -f $TEMP_DIR $DL_DIR
Expand Down
20 changes: 10 additions & 10 deletions src/Template/Pages/downloads.ctp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>sentences_base.tar.bz2">sentences_base.tar.bz2</a>
<a href="<?= $download_url ?>sentences_base.csv.zst">sentences_base.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand Down Expand Up @@ -323,7 +323,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>links.tar.bz2">links.tar.bz2</a>
<a href="<?= $download_url ?>links.csv.zst">links.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand All @@ -350,7 +350,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>tags.tar.bz2">tags.tar.bz2</a>
<a href="<?= $download_url ?>tags.csv.zst">tags.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand All @@ -376,7 +376,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>user_lists.tar.bz2">user_lists.tar.bz2</a>
<a href="<?= $download_url ?>user_lists.csv.zst">user_lists.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand All @@ -400,8 +400,8 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>sentences_in_lists.tar.bz2">
sentences_in_lists.tar.bz2
<a href="<?= $download_url ?>sentences_in_lists.csv.zst">
sentences_in_lists.csv.zst
</a>
</dd>
<dt><?= $description ?></dt>
Expand Down Expand Up @@ -429,7 +429,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>jpn_indices.tar.bz2">jpn_indices.tar.bz2</a>
<a href="<?= $download_url ?>jpn_indices.csv.zst">jpn_indices.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand Down Expand Up @@ -460,8 +460,8 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>sentences_with_audio.tar.bz2">
sentences_with_audio.tar.bz2
<a href="<?= $download_url ?>sentences_with_audio.csv.zst">
sentences_with_audio.csv.zst
</a>
</dd>
<dt><?= $description ?></dt>
Expand Down Expand Up @@ -501,7 +501,7 @@ $transcriptionsOptions = $this->Downloads->createOptions('transcriptions');
<dl>
<dt><?= $filename ?></dt>
<dd>
<a href="<?= $download_url ?>user_languages.tar.bz2">user_languages.tar.bz2</a>
<a href="<?= $download_url ?>user_languages.csv.zst">user_languages.csv.zst</a>
</dd>
<dt><?= $description ?></dt>
<dd>
Expand Down
4 changes: 2 additions & 2 deletions src/View/Helper/DownloadsHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ private function availableFiles($basename) {
);

$dir = new Folder($perLanguageDir);
$paths = $dir->findRecursive(".*$basename\.tsv\.bz2$");
$paths = $dir->findRecursive(".*$basename\.tsv\.zst$");
$map = [];
foreach ($paths as $path) {
$path = substr($path, strlen($perLanguageDir) + 1);
Expand All @@ -74,7 +74,7 @@ private function availableFiles($basename) {
public function createOptions($basename) {
$urlForAll = Folder::addPathElement(
Configure::read('Downloads.url'),
"$basename.tar.bz2"
"$basename.csv.zst"
);
$options[0] = [
'language' => __('All languages'),
Expand Down
10 changes: 5 additions & 5 deletions tests/TestCase/View/Helper/DownloadsHelperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class DownloadsHelperTest extends TestCase {

private static function createTempTree() {
$languages = ['eng', 'fra', 'jpn', 'unknown'];
$files = ['sentences.tsv.bz2', 'sentences_detailed.tsv.bz2', 'sentences_CC0.tsv.bz2'];
$files = ['sentences.tsv.zst', 'sentences_detailed.tsv.zst', 'sentences_CC0.tsv.zst'];
foreach ($languages as $lang) {
$path = Folder::addPathElement(TMP, ['exports', 'per_language', $lang]);
$subdir = new Folder($path, true);
Expand Down Expand Up @@ -54,7 +54,7 @@ public function testCreateOptions_InvalidBasename() {

$this->assertEquals(1, count($options));
$this->assertEquals(
Folder::addPathElement(Configure::read('Downloads.url'), "foobar.tar.bz2"),
Folder::addPathElement(Configure::read('Downloads.url'), "foobar.csv.zst"),
$options[0]['url']
);
}
Expand All @@ -77,14 +77,14 @@ public function testCreateOptions_ValidBasename($basename) {
$this->assertEquals(
Folder::addPathElement(
Configure::read('Downloads.url'),
"$basename.tar.bz2"
"$basename.csv.zst"
),
$options[0]['url']
);
$this->assertEquals(
Folder::addPathElement(
Configure::read('Downloads.url'),
['per_language', 'eng', "eng_$basename.tsv.bz2"]
['per_language', 'eng', "eng_$basename.tsv.zst"]
),
$options[1]['url']
);
Expand All @@ -103,7 +103,7 @@ public function testCreateOptions_NoPerLanguageFilesAvailable($basename) {
$this->assertEquals(
Folder::addPathElement(
Configure::read('Downloads.url'),
"$basename.tar.bz2"
"$basename.csv.zst"
),
$options[0]['url']
);
Expand Down

0 comments on commit 383eb07

Please sign in to comment.