Remove ffmpeg fallback from sox_io backend

In #2419, we added ffmpeg as a fallback for the sox_io backend. This was a workaround for the issue caused by the removal of libmad. Now that we have introduced the `backend` argument to I/O functions, and libsox integration has moved to dynamic binding, where users can use libsox with libmad integration, we no longer need the workaround. This commit is based on reverting #2416 (fd7ace1).
pytorch · Jul 28, 2023 · b52a3f4 · b52a3f4
1 parent 7368e33
commit b52a3f4
Show file tree

Hide file tree

Showing 12 changed files with 27 additions and 71 deletions.
diff --git a/test/torchaudio_unittest/backend/sox_io/info_test.py b/test/torchaudio_unittest/backend/sox_io/info_test.py
@@ -277,6 +277,7 @@ def test_htk(self):
 
 
 @skipIfNoSox
+@skipIfNoSoxDecoder("opus")
 class TestInfoOpus(PytorchTestCase):
     @parameterized.expand(
         list(
@@ -290,6 +291,8 @@ class TestInfoOpus(PytorchTestCase):
     )
     def test_opus(self, bitrate, num_channels, compression_level):
         """`sox_io_backend.info` can check opus file correcty"""
+        import torchaudio
+        torchaudio.utils.sox_utils.set_verbosity(6)
         path = get_asset_path("io", f"{bitrate}_{compression_level}_{num_channels}ch.opus")
         info = sox_io_backend.info(path)
         assert info.sample_rate == 48000

diff --git a/torchaudio/backend/sox_io_backend.py b/torchaudio/backend/sox_io_backend.py
@@ -7,33 +7,6 @@
 from .common import AudioMetaData
 
 
-# Note: need to comply TorchScript syntax -- need annotation and no f-string
-def _fail_info(filepath: str, format: Optional[str]) -> AudioMetaData:
-    raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
-
-
-# Note: need to comply TorchScript syntax -- need annotation and no f-string
-def _fail_load(
-    filepath: str,
-    frame_offset: int = 0,
-    num_frames: int = -1,
-    normalize: bool = True,
-    channels_first: bool = True,
-    format: Optional[str] = None,
-) -> Tuple[torch.Tensor, int]:
-    raise RuntimeError("Failed to load audio from {}".format(filepath))
-
-
-if torchaudio._extension._FFMPEG_EXT is not None:
-    import torchaudio.io._compat as _compat
-
-    _fallback_info = _compat.info_audio
-    _fallback_load = _compat.load_audio
-else:
-    _fallback_info = _fail_info
-    _fallback_load = _fail_load
-
-
 @torchaudio._extension.fail_if_no_sox
 def info(
     filepath: str,
@@ -58,9 +31,7 @@ def info(
             raise RuntimeError("sox_io backend does not support file-like object.")
         filepath = os.fspath(filepath)
     sinfo = torch.ops.torchaudio.sox_io_get_info(filepath, format)
-    if sinfo is not None:
-        return AudioMetaData(*sinfo)
-    return _fallback_info(filepath, format)
+    return AudioMetaData(*sinfo)
 
 
 @torchaudio._extension.fail_if_no_sox
@@ -153,12 +124,9 @@ def load(
         if hasattr(filepath, "read"):
             raise RuntimeError("sox_io backend does not support file-like object.")
         filepath = os.fspath(filepath)
-    ret = torch.ops.torchaudio.sox_io_load_audio_file(
+    return torch.ops.torchaudio.sox_io_load_audio_file(
         filepath, frame_offset, num_frames, normalize, channels_first, format
     )
-    if ret is not None:
-        return ret
-    return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format)
 
 
 @torchaudio._extension.fail_if_no_sox

diff --git a/torchaudio/csrc/sox/effects.cpp b/torchaudio/csrc/sox/effects.cpp
@@ -89,18 +89,15 @@ auto apply_effects_file(
     c10::optional<bool> normalize,
     c10::optional<bool> channels_first,
     const c10::optional<std::string>& format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>> {
+    -> std::tuple<torch::Tensor, int64_t> {
   // Open input file
   SoxFormat sf(sox_open_read(
       path.c_str(),
       /*signal=*/nullptr,
       /*encoding=*/nullptr,
       /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
 
-  if (static_cast<sox_format_t*>(sf) == nullptr ||
-      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    return {};
-  }
+  validate_input_file(sf, path);
 
   const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
 

diff --git a/torchaudio/csrc/sox/effects.h b/torchaudio/csrc/sox/effects.h
@@ -22,7 +22,7 @@ auto apply_effects_file(
     c10::optional<bool> normalize,
     c10::optional<bool> channels_first,
     const c10::optional<std::string>& format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>>;
+    -> std::tuple<torch::Tensor, int64_t>;
 
 } // namespace torchaudio::sox
 

diff --git a/torchaudio/csrc/sox/io.cpp b/torchaudio/csrc/sox/io.cpp
@@ -8,7 +8,7 @@ using namespace torch::indexing;
 
 namespace torchaudio::sox {
 
-c10::optional<MetaDataTuple> get_info_file(
+std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> get_info_file(
     const std::string& path,
     const c10::optional<std::string>& format) {
   SoxFormat sf(sox_open_read(
@@ -17,12 +17,9 @@ c10::optional<MetaDataTuple> get_info_file(
       /*encoding=*/nullptr,
       /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
 
-  if (static_cast<sox_format_t*>(sf) == nullptr ||
-      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    return {};
-  }
+  validate_input_file(sf, path);
 
-  return std::forward_as_tuple(
+  return std::make_tuple(
       static_cast<int64_t>(sf->signal.rate),
       static_cast<int64_t>(sf->signal.length / sf->signal.channels),
       static_cast<int64_t>(sf->signal.channels),
@@ -58,7 +55,7 @@ std::vector<std::vector<std::string>> get_effects(
   return effects;
 }
 
-c10::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
+std::tuple<torch::Tensor, int64_t> load_audio_file(
     const std::string& path,
     const c10::optional<int64_t>& frame_offset,
     const c10::optional<int64_t>& num_frames,

diff --git a/torchaudio/csrc/sox/io.h b/torchaudio/csrc/sox/io.h
@@ -11,14 +11,11 @@ auto get_effects(
     const c10::optional<int64_t>& num_frames)
     -> std::vector<std::vector<std::string>>;
 
-using MetaDataTuple =
-    std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
-
-c10::optional<MetaDataTuple> get_info_file(
+std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> get_info_file(
     const std::string& path,
     const c10::optional<std::string>& format);
 
-c10::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
+std::tuple<torch::Tensor, int64_t> load_audio_file(
     const std::string& path,
     const c10::optional<int64_t>& frame_offset,
     const c10::optional<int64_t>& num_frames,

diff --git a/torchaudio/csrc/sox/pybind/effects.cpp b/torchaudio/csrc/sox/pybind/effects.cpp
@@ -30,8 +30,7 @@ auto apply_effects_fileobj(
     const std::vector<std::vector<std::string>>& effects,
     c10::optional<bool> normalize,
     c10::optional<bool> channels_first,
-    c10::optional<std::string> format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>> {
+    c10::optional<std::string> format) -> std::tuple<torch::Tensor, int64_t> {
   // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
   //
   // For certain format (such as FLAC), libsox keeps reading the content at
@@ -112,7 +111,7 @@ auto apply_effects_fileobj(
       normalize.value_or(true),
       channels_first_);
 
-  return std::forward_as_tuple(
+  return std::make_tuple(
       tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
 }
 

diff --git a/torchaudio/csrc/sox/pybind/effects.h b/torchaudio/csrc/sox/pybind/effects.h
@@ -10,8 +10,7 @@ auto apply_effects_fileobj(
     const std::vector<std::vector<std::string>>& effects,
     c10::optional<bool> normalize,
     c10::optional<bool> channels_first,
-    c10::optional<std::string> format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>>;
+    c10::optional<std::string> format) -> std::tuple<torch::Tensor, int64_t>;
 
 } // namespace torchaudio::sox
 

diff --git a/torchaudio/csrc/sox/pybind/io.cpp b/torchaudio/csrc/sox/pybind/io.cpp
@@ -10,7 +10,7 @@
 namespace torchaudio::sox {
 
 auto get_info_fileobj(py::object fileobj, c10::optional<std::string> format)
-    -> c10::optional<MetaDataTuple> {
+    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
   // Prepare in-memory file object
   // When libsox opens a file, it also reads the header.
   // When opening a file there are two functions that might touch FILE* (and the
@@ -63,7 +63,7 @@ auto get_info_fileobj(py::object fileobj, c10::optional<std::string> format)
     return c10::optional<MetaDataTuple>{};
   }
 
-  return std::forward_as_tuple(
+  return std::make_tuple(
       static_cast<int64_t>(sf->signal.rate),
       static_cast<int64_t>(sf->signal.length / sf->signal.channels),
       static_cast<int64_t>(sf->signal.channels),
@@ -77,8 +77,7 @@ auto load_audio_fileobj(
     c10::optional<int64_t> num_frames,
     c10::optional<bool> normalize,
     c10::optional<bool> channels_first,
-    c10::optional<std::string> format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>> {
+    c10::optional<std::string> format) -> std::tuple<torch::Tensor, int64_t> {
   auto effects = get_effects(frame_offset, num_frames);
   return apply_effects_fileobj(
       std::move(fileobj),

diff --git a/torchaudio/csrc/sox/pybind/io.h b/torchaudio/csrc/sox/pybind/io.h
@@ -5,20 +5,16 @@
 
 namespace torchaudio::sox {
 
-using MetaDataTuple =
-    std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
-
 auto get_info_fileobj(py::object fileobj, c10::optional<std::string> format)
-    -> c10::optional<MetaDataTuple>;
+    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
 
 auto load_audio_fileobj(
     py::object fileobj,
     c10::optional<int64_t> frame_offset,
     c10::optional<int64_t> num_frames,
     c10::optional<bool> normalize,
     c10::optional<bool> channels_first,
-    c10::optional<std::string> format)
-    -> c10::optional<std::tuple<torch::Tensor, int64_t>>;
+    c10::optional<std::string> format) -> std::tuple<torch::Tensor, int64_t>;
 
 void save_audio_fileobj(
     py::object fileobj,

diff --git a/torchaudio/csrc/sox/utils.h b/torchaudio/csrc/sox/utils.h
@@ -51,6 +51,10 @@ struct SoxFormat {
   sox_format_t* fd_;
 };
 
+///
+/// Verify that input file is found, has known encoding, and not empty
+void validate_input_file(const SoxFormat& sf, const std::string& path);
+
 ///
 /// Verify that input Tensor is 2D, CPU and either uin8, int16, int32 or float32
 void validate_input_tensor(const torch::Tensor&);

diff --git a/torchaudio/sox_effects/sox_effects.py b/torchaudio/sox_effects/sox_effects.py
@@ -269,7 +269,4 @@ def apply_effects_file(
                 "Please use torchaudio.io.AudioEffector."
             )
         path = os.fspath(path)
-    ret = torch.ops.torchaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format)
-    if ret is not None:
-        return ret
-    raise RuntimeError("Failed to load audio from {}".format(path))
+    return torch.ops.torchaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format)