Skip to content

Commit

Permalink
🐛(back) manage subtitles content starting with a BOM
Browse files Browse the repository at this point in the history
When reading an uploaded subtitle file, the content sometimes starts with
a Byte Order Mark (BOM), and the SRT reader then fails to detect the
content as SRT.
We have to remove the BOM before calling detect_format from the pycaption
library.
  • Loading branch information
lunika committed Jul 31, 2024
1 parent b5ddedf commit 69021c0
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Fixed

- Manage subtitles content starting with a BOM

## [5.0.2] - 2024-07-30

### Changed
Expand Down
2 changes: 1 addition & 1 deletion src/backend/marsha/core/tasks/timed_text_track.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def convert_timed_text_track(timed_text_track_pk, stamp):
prefix_destination = timed_text_track.get_videos_storage_prefix(stamp)

with video_storage.open(source, "rt") as timed_text_file:
timed_text = timed_text_file.read()
timed_text = timed_text_file.read().replace("\ufeff", "")
reader = detect_format(timed_text)
if not reader:
raise ReaderNotImplementedError(f"Reader {reader} not supported")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,19 @@
And this is the third subtitle.
"""

# Three-cue SRT payload whose first three bytes (b"\xef\xbb\xbf") are the
# UTF-8 encoding of the Byte Order Mark (U+FEFF), placed directly before the
# first cue number.
SRT_EXAMPLE_WITH_BOM = b"""\xef\xbb\xbf1
00:00:01,000 --> 00:00:04,000
Hello, this is the first subtitle.
2
00:00:05,000 --> 00:00:08,000
This is the second subtitle.
3
00:00:09,000 --> 00:00:12,000
And this is the third subtitle.
"""

SRT_EXAMPLE_WITH_HTML = b"""1
00:00:01,000 --> 00:00:04,000
<b>Hello</b>, this is the <i>first</i> subtitle.
Expand Down Expand Up @@ -164,6 +177,63 @@ def test_timed_text_track_with_transcript(self):
)
self.assertTrue(video_storage.exists(new_source_file))

def test_timed_text_track_with_bom_transcript(self):
    """
    Test the convert_timed_text_track function. It should create
    a new VTT file based on a srt file containing at the beginning a BOM
    (Byte Order Mark).
    """
    timed_text_track = TimedTextTrackFactory(
        language="fr",
        mode="ts",
    )

    stamp = "1640995201"
    # Store the BOM-prefixed SRT payload under the temporary storage prefix.
    video_storage.save(
        timed_text_track.get_videos_storage_prefix(
            stamp, TMP_VIDEOS_STORAGE_BASE_DIRECTORY
        ),
        ContentFile(SRT_EXAMPLE_WITH_BOM),
    )

    # Precondition: the converted file must not exist yet. Check the very
    # same path that is asserted to exist after the conversion, otherwise
    # this check could never fail.
    self.assertFalse(
        video_storage.exists(
            f"{timed_text_track.get_videos_storage_prefix(stamp)}/{stamp}.vtt"
        )
    )

    convert_timed_text_track(str(timed_text_track.pk), stamp)

    timed_text_track.refresh_from_db()
    self.assertEqual(timed_text_track.upload_state, READY)
    self.assertEqual(timed_text_track.process_pipeline, CELERY_PIPELINE)

    new_vtt_path = (
        f"{timed_text_track.get_videos_storage_prefix(stamp)}/{stamp}.vtt"
    )
    self.assertTrue(video_storage.exists(new_vtt_path))
    # Use a distinct name for the handle so the path string is not shadowed.
    with video_storage.open(new_vtt_path, "rt") as vtt_file:
        content = vtt_file.read()

    # The generated VTT must contain the three cues and no BOM.
    self.assertEqual(
        content,
        (
            "WEBVTT\n\n"
            "00:01.000 --> 00:04.000\n"
            "Hello, this is the first subtitle.\n\n"
            "00:05.000 --> 00:08.000\n"
            "This is the second subtitle.\n\n"
            "00:09.000 --> 00:12.000\n"
            "And this is the third subtitle.\n"
        ),
    )

    # The uploaded source is expected to be kept next to the VTT file.
    new_source_file = (
        f"{timed_text_track.get_videos_storage_prefix(stamp)}/source.srt"
    )
    self.assertTrue(video_storage.exists(new_source_file))

def test_timed_text_track_with_invalid_transcript(self):
"""
Test the the convert_timed_text_track function. It should fail and update
Expand Down

0 comments on commit 69021c0

Please sign in to comment.