audeering · hagenw · Jan 3, 2025 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
diff --git a/audiofile/core/convert.py b/audiofile/core/convert.py
@@ -11,6 +11,7 @@ def convert(
     outfile: str,
     offset: float = 0,
     duration: float = None,
+    sampling_rate: int = None,
 ):
     """Convert any audio/video file to WAV.
 
@@ -19,16 +20,17 @@ def convert(
         outfile: WAV file name
         duration: return only a specified duration in seconds
         offset: start reading at offset in seconds
+        sampling_rate: sampling rate in Hz
 
     """
     try:
         # Convert to WAV file with sox
-        run_sox(infile, outfile, offset, duration)
+        run_sox(infile, outfile, offset, duration, sampling_rate)
     except (FileNotFoundError, subprocess.CalledProcessError):
         try:
             # Convert to WAV file with ffmpeg
-            run_ffmpeg(infile, outfile, offset, duration)
-        except FileNotFoundError:
-            raise binary_missing_error("ffmpeg")
-        except subprocess.CalledProcessError:
-            raise broken_file_error(infile)
+            run_ffmpeg(infile, outfile, offset, duration, sampling_rate)
+        except FileNotFoundError as e:  # pragma: no cover
+            raise binary_missing_error("ffmpeg") from e
+        except subprocess.CalledProcessError as e:  # pragma: no cover
+            raise broken_file_error(infile) from e
diff --git a/audiofile/core/io.py b/audiofile/core/io.py
@@ -254,6 +254,7 @@ def read(
 
     """  # noqa: E501
     file = audeer.safe_path(file)
+    sampling_rate = None
 
     # Parse offset and duration values
     if (
@@ -384,7 +385,16 @@ def read(
                 offset /= sampling_rate
             if duration is not None and duration != 0:
                 duration /= sampling_rate
-            convert(file, tmpfile, offset, duration)
+            if sampling_rate is None:
+                # Infer sampling rate using mediainfo before conversion,
+                # as ffmpeg ignores the original sampling rate of opus files,
+                # see:
+                # * https://trac.ffmpeg.org/ticket/5240
+                # * https://github.com/audeering/audiofile/issues/157
+                from audiofile.core.info import sampling_rate as get_sampling_rate
+
+                sampling_rate = get_sampling_rate(file)
+            convert(file, tmpfile, offset, duration, sampling_rate)
             signal, sampling_rate = soundfile.read(
                 tmpfile,
                 dtype=dtype,

diff --git a/audiofile/core/utils.py b/audiofile/core/utils.py
@@ -91,19 +91,23 @@ def run(shell_command):
         return ""
 
 
-def run_ffmpeg(infile, outfile, offset, duration):
+def run_ffmpeg(infile, outfile, offset, duration, sampling_rate):
     """Convert audio file to WAV file."""
+    cmd = ["ffmpeg", "-ss", str(offset), "-i", infile, outfile]
     if duration:
-        cmd = ["ffmpeg", "-ss", str(offset), "-i", infile, "-t", str(duration), outfile]
-    else:
-        cmd = ["ffmpeg", "-ss", str(offset), "-i", infile, outfile]
+        cmd.insert(-1, "-t")
+        cmd.insert(-1, str(duration))
+    if sampling_rate:
+        cmd.insert(-1, "-ar")
+        cmd.insert(-1, str(sampling_rate))
     run(cmd)
 
 
-def run_sox(infile, outfile, offset, duration):
+def run_sox(infile, outfile, offset, duration, sampling_rate):
     """Convert audio file to WAV file."""
+    cmd = ["sox", infile, outfile, "trim", str(offset)]
     if duration:
-        cmd = ["sox", infile, outfile, "trim", str(offset), str(duration)]
-    else:
-        cmd = ["sox", infile, outfile, "trim", str(offset)]
+        cmd.append(str(duration))
+    if sampling_rate:
+        cmd += ["rate", str(sampling_rate)]
     run(cmd)
diff --git a/tests/assets/README.md b/tests/assets/README.md
@@ -15,6 +15,13 @@ Kevin MacLeod (incompetech.com),
 licensed under Creative Commons:
 [CC-BY-3.0](http://creativecommons.org/licenses/by/3.0/).
 
+We converted the file `gs-16b-1c-44100hz.opus`
+(which, despite its file name, was actually stored with 48000 Hz)
+to `gs-16b-1c-16000hz.opus` using
+```bash
+ffmpeg -y -i gs-16b-1c-44100hz.opus -ac 1 -ar 16000 gs-16b-1c-16000hz.opus
+```
+
 ## Video test files
 
 The folder contains the video file `video.mp4`,

diff --git a/tests/assets/gs-16b-1c-16000hz.opus b/tests/assets/gs-16b-1c-16000hz.opus
diff --git a/tests/assets/gs-16b-1c-44100hz.opus b/tests/assets/gs-16b-1c-44100hz.opus
diff --git a/tests/test_audiofile.py b/tests/test_audiofile.py
@@ -202,24 +202,24 @@ def test_empty_file(tmpdir, convert, empty_file):
 def test_missing_binaries(tmpdir, hide_system_path, empty_file):
     expected_error = FileNotFoundError
     # Reading file
-    with pytest.raises(expected_error, match="ffmpeg"):
+    with pytest.raises(expected_error, match="mediainfo"):
         signal, sampling_rate = af.read(empty_file)
     # Metadata
     with pytest.raises(expected_error, match="mediainfo"):
         af.channels(empty_file)
-    with pytest.raises(expected_error, match="ffmpeg"):
+    with pytest.raises(expected_error, match="mediainfo"):
         af.duration(empty_file)
     with pytest.raises(expected_error, match="mediainfo"):
         af.duration(empty_file, sloppy=True)
     with pytest.raises(expected_error, match="mediainfo"):
         af.has_video(empty_file)
-    with pytest.raises(expected_error, match="ffmpeg"):
+    with pytest.raises(expected_error, match="mediainfo"):
         af.samples(empty_file)
     with pytest.raises(expected_error, match="mediainfo"):
         af.sampling_rate(empty_file)
 
     # Convert
-    with pytest.raises(expected_error, match="ffmpeg"):
+    with pytest.raises(expected_error, match="mediainfo"):
         converted_file = str(tmpdir.join("signal-converted.wav"))
         af.convert_to_wav(empty_file, converted_file)
 
@@ -512,7 +512,7 @@ def test_file_type(tmpdir, file_type, magnitude, sampling_rate, channels):
 @pytest.mark.parametrize(
     "file, header_duration, audio, video",  # header duration as given by mediainfo
     [
-        ("gs-16b-1c-44100hz.opus", 15.839, True, False),
+        ("gs-16b-1c-16000hz.opus", 15.839, True, False),
         ("gs-16b-1c-8000hz.amr", 15.840000, True, False),
         ("gs-16b-1c-44100hz.m4a", 15.833, True, False),
         ("gs-16b-1c-44100hz.aac", None, True, False),
@@ -1243,7 +1243,7 @@ def test_read_duration_and_offset_rounding(
     # when reading with sox or ffmpeg
 
     # soundfile
-    signal, _ = af.read(audio_file, offset=offset, duration=duration)
+    signal, sampling_rate = af.read(audio_file, offset=offset, duration=duration)
     np.testing.assert_allclose(
         signal,
         np.array(expected, dtype=np.float32),
@@ -1259,7 +1259,7 @@ def test_read_duration_and_offset_rounding(
     # sox
     convert_file = str(tmpdir.join("signal-sox.wav"))
     try:
-        af.core.utils.run_sox(audio_file, convert_file, offset, duration)
+        af.core.utils.run_sox(audio_file, convert_file, offset, duration, sampling_rate)
         signal, _ = af.read(convert_file)
         np.testing.assert_allclose(
             signal,
@@ -1272,7 +1272,7 @@ def test_read_duration_and_offset_rounding(
 
     # ffmpeg
     convert_file = str(tmpdir.join("signal-ffmpeg.wav"))
-    af.core.utils.run_ffmpeg(audio_file, convert_file, offset, duration)
+    af.core.utils.run_ffmpeg(audio_file, convert_file, offset, duration, sampling_rate)
     signal, _ = af.read(convert_file)
     np.testing.assert_allclose(
         signal,