forked from xiaozhi/xiaozhi-esp32
feat: Audio loudness normalization for assets files (#332)
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -4,14 +4,21 @@
|
|||||||
|
|
||||||
## 1. 音频转换工具 (convert_audio_to_p3.py)
|
## 1. 音频转换工具 (convert_audio_to_p3.py)
|
||||||
|
|
||||||
将普通音频文件转换为P3格式(4字节header + Opus数据包的流式结构)。
|
将普通音频文件转换为P3格式(4字节header + Opus数据包的流式结构)并进行响度标准化。
|
||||||
|
|
||||||
### 使用方法
|
### 使用方法
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python convert_audio_to_p3.py <输入音频文件> <输出P3文件>
|
python convert_audio_to_p3.py <输入音频文件> <输出P3文件> [-l LUFS] [-d]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
其中,可选选项 `-l` 用于指定响度标准化的目标响度,默认为 -16 LUFS;可选选项 `-d` 可以禁用响度标准化。
|
||||||
|
|
||||||
|
如果输入的音频文件符合下面的任一条件,建议使用 `-d` 禁用响度标准化:
|
||||||
|
- 音频过短
|
||||||
|
- 音频已经调整过响度
|
||||||
|
- 音频来自默认 TTS (小智当前使用的 TTS 的默认响度已是 -16 LUFS)
|
||||||
|
|
||||||
例如:
|
例如:
|
||||||
```bash
|
```bash
|
||||||
python convert_audio_to_p3.py input.mp3 output.p3
|
python convert_audio_to_p3.py input.mp3 output.p3
|
||||||
@@ -38,12 +45,29 @@ python play_p3.py <P3文件路径>
|
|||||||
python play_p3.py output.p3
|
python play_p3.py output.p3
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## 3. 音频转回工具 (convert_p3_to_audio.py)
|
||||||
|
|
||||||
|
将P3格式转换回普通音频文件。
|
||||||
|
|
||||||
|
### 使用方法
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python convert_p3_to_audio.py <输入P3文件> <输出音频文件>
|
||||||
|
```
|
||||||
|
|
||||||
|
输出音频文件需要有扩展名,输出格式由扩展名决定(例如 `.wav`)。
|
||||||
|
|
||||||
|
例如:
|
||||||
|
```bash
|
||||||
|
python convert_p3_to_audio.py input.p3 output.wav
|
||||||
|
```
|
||||||
|
|
||||||
## 依赖安装
|
## 依赖安装
|
||||||
|
|
||||||
在使用这些脚本前,请确保安装了所需的Python库:
|
在使用这些脚本前,请确保安装了所需的Python库:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install librosa opuslib numpy tqdm sounddevice
|
pip install librosa opuslib numpy tqdm sounddevice pyloudnorm soundfile
|
||||||
```
|
```
|
||||||
|
|
||||||
或者使用提供的requirements.txt文件:
|
或者使用提供的requirements.txt文件:
|
||||||
|
|||||||
@@ -5,44 +5,58 @@ import struct
|
|||||||
import sys
|
import sys
|
||||||
import tqdm
|
import tqdm
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import argparse
|
||||||
|
import pyloudnorm as pyln
|
||||||
|
|
||||||
def encode_audio_to_opus(input_file, output_file, target_lufs=None):
    """Convert an audio file into a P3 file of length-prefixed Opus packets.

    The audio is loaded with librosa, mixed down to mono, optionally
    loudness-normalized, resampled to 16 kHz, converted to 16-bit PCM and
    encoded in 60 ms Opus frames. Each packet is written as a 4-byte
    big-endian header ``[1u type, 1u reserved, 2u len]`` followed by the
    Opus payload.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path of the P3 file to write.
        target_lufs: Target integrated loudness in LUFS, or ``None`` to
            skip loudness normalization.
    """
    # Load audio file using librosa, keeping the native sample rate and channels
    audio, sample_rate = librosa.load(input_file, sr=None, mono=False, dtype=np.float32)

    # Convert to mono if stereo
    if audio.ndim == 2:
        audio = librosa.to_mono(audio)

    if target_lufs is not None:
        print("Note: Automatic loudness adjustment is enabled, which may cause", file=sys.stderr)
        print(" audio distortion. If the input audio has already been ", file=sys.stderr)
        print(" loudness-adjusted or if the input audio is TTS audio, ", file=sys.stderr)
        print(" please use the `-d` parameter to disable loudness adjustment.", file=sys.stderr)
        meter = pyln.Meter(sample_rate)
        current_loudness = meter.integrated_loudness(audio)
        if np.isfinite(current_loudness):
            audio = pyln.normalize.loudness(audio, current_loudness, target_lufs)
            print(f"Adjusted loudness: {current_loudness:.1f} LUFS -> {target_lufs} LUFS")
        else:
            # A non-finite measurement (e.g. silent input) would turn into an
            # infinite gain and NaN samples, so leave the audio untouched.
            print("Warning: could not measure loudness; skipping loudness adjustment.",
                  file=sys.stderr)

    # Convert sample rate to 16000Hz if necessary
    target_sample_rate = 16000
    if sample_rate != target_sample_rate:
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sample_rate)
        sample_rate = target_sample_rate

    # Convert audio data back to int16 after processing. Loudness gain and
    # resampling can push samples outside [-1, 1]; clip first so the integer
    # conversion saturates instead of overflowing.
    audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

    # Initialize Opus encoder
    encoder = opuslib.Encoder(sample_rate, 1, opuslib.APPLICATION_AUDIO)

    # Encode and save
    with open(output_file, 'wb') as f:
        duration = 60  # 60ms per frame
        frame_size = int(sample_rate * duration / 1000)

        # Zero-pad to a whole number of frames so the tail of the audio is
        # encoded rather than dropped.
        remainder = len(audio) % frame_size
        if remainder:
            audio = np.pad(audio, (0, frame_size - remainder))

        for i in tqdm.tqdm(range(0, len(audio), frame_size)):
            frame = audio[i:i + frame_size]
            opus_data = encoder.encode(frame.tobytes(), frame_size=frame_size)
            # protocol format, [1u type, 1u reserved, 2u len, data]
            packet = struct.pack('>BBH', 0, 0, len(opus_data)) + opus_data
            f.write(packet)
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the conversion.
    parser = argparse.ArgumentParser(
        description='Convert audio to P3 (Opus packets) with loudness normalization')
    parser.add_argument('input_file', help='Input audio file')
    parser.add_argument('output_file', help='Output P3 file')
    parser.add_argument('-l', '--lufs', type=float, default=-16.0,
                        help='Target loudness in LUFS (default: -16)')
    parser.add_argument('-d', '--disable-loudnorm', action='store_true',
                        help='Disable loudness normalization')
    args = parser.parse_args()

    # -d takes precedence over -l: a target of None turns normalization off.
    target_lufs = None if args.disable_loudnorm else args.lufs
    encode_audio_to_opus(args.input_file, args.output_file, target_lufs)
|
|
||||||
51
scripts/p3_tools/convert_p3_to_audio.py
Normal file
51
scripts/p3_tools/convert_p3_to_audio.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
import struct
|
||||||
|
import sys
|
||||||
|
import opuslib
|
||||||
|
import numpy as np
|
||||||
|
from tqdm import tqdm
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
|
||||||
|
def decode_p3_to_audio(input_file, output_file):
    """Decode a P3 file into a 16-bit PCM audio file.

    The input is read as a sequence of packets, each made of a 4-byte
    big-endian header (type, reserved, payload length) followed by an Opus
    payload of that length. Packets are decoded as 16 kHz mono audio and
    the concatenated samples are written with soundfile.

    Args:
        input_file: Path of the P3 file to read.
        output_file: Path of the audio file to write.

    Raises:
        ValueError: If no complete packet could be decoded.
    """
    sample_rate = 16000
    channels = 1
    decoder = opuslib.Decoder(sample_rate, channels)

    pcm_frames = []
    # Samples in a 60 ms frame at 16 kHz, passed to the decoder as frame size.
    frame_size = int(sample_rate * 60 / 1000)
    header_size = 4
    truncated = False

    with open(input_file, "rb") as f:
        # Measure the file size up front so the progress bar has a total.
        f.seek(0, 2)
        total_size = f.tell()
        f.seek(0)

        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
            while True:
                header = f.read(header_size)
                if not header:
                    break  # clean end of file
                if len(header) < header_size:
                    truncated = True
                    break

                pkt_type, reserved, opus_len = struct.unpack(">BBH", header)
                opus_data = f.read(opus_len)
                if len(opus_data) != opus_len:
                    truncated = True
                    break

                pcm = decoder.decode(opus_data, frame_size)
                pcm_frames.append(np.frombuffer(pcm, dtype=np.int16))

                pbar.update(header_size + opus_len)

    if truncated:
        # Keep whatever was decoded, but tell the user the file ended mid-packet.
        print("Warning: input ends with an incomplete packet; trailing data ignored.",
              file=sys.stderr)

    if not pcm_frames:
        raise ValueError("No valid audio data found")

    pcm_data = np.concatenate(pcm_frames)

    sf.write(output_file, pcm_data, sample_rate, subtype="PCM_16")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Exactly two positional arguments are expected: the P3 input path and
    # the audio output path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python convert_p3_to_audio.py <input.p3> <output.wav>")
        sys.exit(1)

    p3_path, audio_path = cli_args
    decode_p3_to_audio(p3_path, audio_path)
|
||||||
@@ -3,3 +3,5 @@ opuslib>=3.0.1
|
|||||||
numpy>=1.20.0
|
numpy>=1.20.0
|
||||||
tqdm>=4.62.0
|
tqdm>=4.62.0
|
||||||
sounddevice>=0.4.4
|
sounddevice>=0.4.4
|
||||||
|
pyloudnorm>=0.1.1
|
||||||
|
soundfile>=0.13.1
|
||||||
|
|||||||
Reference in New Issue
Block a user