🐛(back) manage subtitles content starting with a BOM

When reading an uploaded subtitle file, its content sometimes starts with a Byte Order Mark (BOM), and the SRT reader then fails to detect the content as SRT. We have to remove the BOM before calling detect_format from the pycaption library.
openfun · Jul 31, 2024 · 69021c0 · 69021c0
1 parent b5ddedf
commit 69021c0
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,10 @@ Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+### Fixed
+
+- Manage subtitles content starting with a BOM
+
 ## [5.0.2] - 2024-07-30
 
 ### Changed

diff --git a/src/backend/marsha/core/tasks/timed_text_track.py b/src/backend/marsha/core/tasks/timed_text_track.py
@@ -62,7 +62,7 @@ def convert_timed_text_track(timed_text_track_pk, stamp):
         prefix_destination = timed_text_track.get_videos_storage_prefix(stamp)
 
         with video_storage.open(source, "rt") as timed_text_file:
-            timed_text = timed_text_file.read()
+            timed_text = timed_text_file.read().replace("\ufeff", "")
             reader = detect_format(timed_text)
             if not reader:
                 raise ReaderNotImplementedError(f"Reader {reader} not supported")

diff --git a/...core/tests/tasks/test_timed_test_track.py → ...core/tests/tasks/test_timed_text_track.py b/...core/tests/tasks/test_timed_test_track.py → ...core/tests/tasks/test_timed_text_track.py
@@ -30,6 +30,19 @@
 And this is the third subtitle.
 """
 
+SRT_EXAMPLE_WITH_BOM = b"""\xef\xbb\xbf1
+00:00:01,000 --> 00:00:04,000
+Hello, this is the first subtitle.
+
+2
+00:00:05,000 --> 00:00:08,000
+This is the second subtitle.
+
+3
+00:00:09,000 --> 00:00:12,000
+And this is the third subtitle.
+"""  # SRT fixture whose first three bytes (EF BB BF) are the UTF-8 encoded BOM
+
 SRT_EXAMPLE_WITH_HTML = b"""1
 00:00:01,000 --> 00:00:04,000
 <b>Hello</b>, this is the <i>first</i> subtitle.
@@ -164,6 +177,63 @@ def test_timed_text_track_with_transcript(self):
         )
         self.assertTrue(video_storage.exists(new_source_file))
 
+    def test_timed_text_track_with_bom_transcript(self):
+        """
+        Test the convert_timed_text_track function. It should create
+        a new VTT file based on a srt file containing at the beginning a BOM
+        (Byte Order Mark).
+
+        The BOM must not prevent the content from being detected as SRT, and
+        must not leak into the generated VTT file.
+        """
+        timed_text_track = TimedTextTrackFactory(
+            language="fr",
+            mode="ts",
+        )
+
+        stamp = "1640995201"
+        destination_prefix = timed_text_track.get_videos_storage_prefix(stamp)
+        new_vtt_path = f"{destination_prefix}/{stamp}.vtt"
+        new_source_path = f"{destination_prefix}/source.srt"
+
+        # Create SRT file, stored under the temporary videos storage prefix
+        video_storage.save(
+            timed_text_track.get_videos_storage_prefix(
+                stamp, TMP_VIDEOS_STORAGE_BASE_DIRECTORY
+            ),
+            ContentFile(SRT_EXAMPLE_WITH_BOM),
+        )
+
+        # The precondition targets the very path asserted after the conversion,
+        # so it cannot pass vacuously on an unrelated file name.
+        self.assertFalse(video_storage.exists(new_vtt_path))
+
+        convert_timed_text_track(str(timed_text_track.pk), stamp)
+
+        timed_text_track.refresh_from_db()
+        self.assertEqual(timed_text_track.upload_state, READY)
+        self.assertEqual(timed_text_track.process_pipeline, CELERY_PIPELINE)
+        self.assertTrue(video_storage.exists(new_vtt_path))
+        with video_storage.open(new_vtt_path, "rt") as new_vtt_file:
+            content = new_vtt_file.read()
+
+        # The expected content starts directly with "WEBVTT": no BOM is left.
+        self.assertEqual(
+            content,
+            (
+                "WEBVTT\n\n"
+                "00:01.000 --> 00:04.000\n"
+                "Hello, this is the first subtitle.\n\n"
+                "00:05.000 --> 00:08.000\n"
+                "This is the second subtitle.\n\n"
+                "00:09.000 --> 00:12.000\n"
+                "And this is the third subtitle.\n"
+            ),
+        )
+
+        # The uploaded source is expected alongside the generated VTT file.
+        self.assertTrue(video_storage.exists(new_source_path))
+
     def test_timed_text_track_with_invalid_transcript(self):
         """
         Test the the convert_timed_text_track function. It should fail and update