feat: Audio loudness normalization for assets files (#332)

2025-03-13 12:14:17 +08:00
parent 7eb710d404
commit ff7f396f9d
8 changed files with 112 additions and 21 deletions
--- a/main/assets/common/exclamation.p3
+++ b/main/assets/common/exclamation.p3
--- a/main/assets/common/low_battery.p3
+++ b/main/assets/common/low_battery.p3
--- a/main/assets/common/success.p3
+++ b/main/assets/common/success.p3
--- a/main/assets/common/vibration.p3
+++ b/main/assets/common/vibration.p3
--- a/scripts/p3_tools/README.md
+++ b/scripts/p3_tools/README.md
@@ -4,14 +4,21 @@

 ## 1. 音频转换工具 (convert_audio_to_p3.py)

-将普通音频文件转换为P3格式（4字节header + Opus数据包的流式结构）。
+将普通音频文件转换为P3格式（4字节header + Opus数据包的流式结构）并进行响度标准化。

 ### 使用方法

 ```bash
-python convert_audio_to_p3.py <输入音频文件> <输出P3文件>
+python convert_audio_to_p3.py <输入音频文件> <输出P3文件> [-l LUFS] [-d]
 ```

+其中，可选选项 `-l` 用于指定响度标准化的目标响度，默认为 -16 LUFS；可选选项 `-d` 可以禁用响度标准化。
+
+如果输入的音频文件符合下面的任一条件，建议使用 `-d` 禁用响度标准化：
+- 音频过短
+- 音频已经调整过响度
+- 音频来自默认 TTS （小智当前使用的 TTS 的默认响度已是 -16 LUFS）
+
 例如：
 ```bash
 python convert_audio_to_p3.py input.mp3 output.p3
@@ -38,12 +45,29 @@ python play_p3.py <P3文件路径>
 python play_p3.py output.p3
 ```

+## 3. 音频转回工具 (convert_p3_to_audio.py)
+
+将P3格式转换回普通音频文件。
+
+### 使用方法
+
+```bash
+python convert_p3_to_audio.py <输入P3文件> <输出音频文件>
+```
+
+输出音频文件需要有扩展名，输出格式由扩展名决定（例如 `.wav`）。
+
+例如：
+```bash
+python convert_p3_to_audio.py input.p3 output.wav
+```
+
 ## 依赖安装

 在使用这些脚本前，请确保安装了所需的Python库：

 ```bash
-pip install librosa opuslib numpy tqdm sounddevice
+pip install librosa opuslib numpy tqdm sounddevice pyloudnorm soundfile
 ```

 或者使用提供的requirements.txt文件：
--- a/scripts/p3_tools/convert_audio_to_p3.py
+++ b/scripts/p3_tools/convert_audio_to_p3.py
@@ -5,44 +5,58 @@ import struct
 import sys
 import tqdm
 import numpy as np
+import argparse
+import pyloudnorm as pyln

-def encode_audio_to_opus(input_file, output_file):
+def encode_audio_to_opus(input_file, output_file, target_lufs=None):
    # Load audio file using librosa
    audio, sample_rate = librosa.load(input_file, sr=None, mono=False, dtype=np.float32)
    
+    # Convert to mono if stereo
+    if audio.ndim == 2:
+        audio = librosa.to_mono(audio)
+    
+    if target_lufs is not None:
+        print("Note: Automatic loudness adjustment is enabled, which may cause", file=sys.stderr) 
+        print("      audio distortion. If the input audio has already been ", file=sys.stderr)
+        print("      loudness-adjusted or if the input audio is TTS audio, ", file=sys.stderr)
+        print("      please use the `-d` parameter to disable loudness adjustment.", file=sys.stderr)
+        meter = pyln.Meter(sample_rate)
+        current_loudness = meter.integrated_loudness(audio)
+        audio = pyln.normalize.loudness(audio, current_loudness, target_lufs)
+        print(f"Adjusted loudness: {current_loudness:.1f} LUFS -> {target_lufs} LUFS")
+
    # Convert sample rate to 16000Hz if necessary
    target_sample_rate = 16000
    if sample_rate != target_sample_rate:
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sample_rate)
        sample_rate = target_sample_rate
    
-    # Get left channel if stereo
-    if audio.ndim == 2:
-        audio = audio[0]
-    
-    # Convert audio data back to int16 after resampling
+    # Convert audio data back to int16 after processing
    audio = (audio * 32767).astype(np.int16)
    
    # Initialize Opus encoder
    encoder = opuslib.Encoder(sample_rate, 1, opuslib.APPLICATION_AUDIO)

-    # Encode audio data to Opus packets
-    # Save encoded data to file
+    # Encode and save
    with open(output_file, 'wb') as f:
-        duration = 60 # 60ms every frame
+        duration = 60  # 60ms per frame
        frame_size = int(sample_rate * duration / 1000)
        for i in tqdm.tqdm(range(0, len(audio) - frame_size, frame_size)):
            frame = audio[i:i + frame_size]
            opus_data = encoder.encode(frame.tobytes(), frame_size=frame_size)
-            # protocol format, [1u type, 1u reserved, 2u len, data]
            packet = struct.pack('>BBH', 0, 0, len(opus_data)) + opus_data
            f.write(packet)

-# Example usage
-if len(sys.argv) != 3:
-    print('Usage: python convert.py <input_file> <output_file>')
-    sys.exit(1)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Convert audio to Opus with loudness normalization')
+    parser.add_argument('input_file', help='Input audio file')
+    parser.add_argument('output_file', help='Output .opus file')
+    parser.add_argument('-l', '--lufs', type=float, default=-16.0,
+                       help='Target loudness in LUFS (default: -16)')
+    parser.add_argument('-d', '--disable-loudnorm', action='store_true',
+                       help='Disable loudness normalization')
+    args = parser.parse_args()

-input_file = sys.argv[1]
-output_file = sys.argv[2]
-encode_audio_to_opus(input_file, output_file)
+    target_lufs = None if args.disable_loudnorm else args.lufs
+    encode_audio_to_opus(args.input_file, args.output_file, target_lufs)
--- a/scripts/p3_tools/convert_p3_to_audio.py
+++ b/scripts/p3_tools/convert_p3_to_audio.py
@@ -0,0 +1,51 @@
+import struct
+import sys
+import opuslib
+import numpy as np
+from tqdm import tqdm
+import soundfile as sf
+
+
+def decode_p3_to_audio(input_file, output_file):
+    sample_rate = 16000
+    channels = 1
+    decoder = opuslib.Decoder(sample_rate, channels)
+
+    pcm_frames = []
+    frame_size = int(sample_rate * 60 / 1000)
+
+    with open(input_file, "rb") as f:
+        f.seek(0, 2)
+        total_size = f.tell()
+        f.seek(0)
+
+        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
+            while True:
+                header = f.read(4)
+                if not header or len(header) < 4:
+                    break
+
+                pkt_type, reserved, opus_len = struct.unpack(">BBH", header)
+                opus_data = f.read(opus_len)
+                if len(opus_data) != opus_len:
+                    break
+
+                pcm = decoder.decode(opus_data, frame_size)
+                pcm_frames.append(np.frombuffer(pcm, dtype=np.int16))
+
+                pbar.update(4 + opus_len)
+
+    if not pcm_frames:
+        raise ValueError("No valid audio data found")
+
+    pcm_data = np.concatenate(pcm_frames)
+
+    sf.write(output_file, pcm_data, sample_rate, subtype="PCM_16")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage: python convert_p3_to_audio.py <input.p3> <output.wav>")
+        sys.exit(1)
+
+    decode_p3_to_audio(sys.argv[1], sys.argv[2])
--- a/scripts/p3_tools/requirements.txt
+++ b/scripts/p3_tools/requirements.txt
@@ -3,3 +3,5 @@ opuslib>=3.0.1
 numpy>=1.20.0
 tqdm>=4.62.0
 sounddevice>=0.4.4 
+pyloudnorm>=0.1.1
+soundfile>=0.13.1