diff --git a/main/assets/common/exclamation.p3 b/main/assets/common/exclamation.p3 index 65396687..17e96cf7 100644 Binary files a/main/assets/common/exclamation.p3 and b/main/assets/common/exclamation.p3 differ diff --git a/main/assets/common/low_battery.p3 b/main/assets/common/low_battery.p3 index 31064733..03669ef9 100644 Binary files a/main/assets/common/low_battery.p3 and b/main/assets/common/low_battery.p3 differ diff --git a/main/assets/common/success.p3 b/main/assets/common/success.p3 index 353a1b28..4f1bd1cf 100644 Binary files a/main/assets/common/success.p3 and b/main/assets/common/success.p3 differ diff --git a/main/assets/common/vibration.p3 b/main/assets/common/vibration.p3 index dc876acd..99724de3 100644 Binary files a/main/assets/common/vibration.p3 and b/main/assets/common/vibration.p3 differ diff --git a/scripts/p3_tools/README.md b/scripts/p3_tools/README.md index d760c100..e5a390c2 100644 --- a/scripts/p3_tools/README.md +++ b/scripts/p3_tools/README.md @@ -4,14 +4,21 @@ ## 1. 音频转换工具 (convert_audio_to_p3.py) -将普通音频文件转换为P3格式(4字节header + Opus数据包的流式结构)。 +将普通音频文件转换为P3格式(4字节header + Opus数据包的流式结构)并进行响度标准化。 ### 使用方法 ```bash -python convert_audio_to_p3.py <输入音频文件> <输出P3文件> +python convert_audio_to_p3.py <输入音频文件> <输出P3文件> [-l LUFS] [-d] ``` +其中,可选选项 `-l` 用于指定响度标准化的目标响度,默认为 -16 LUFS;可选选项 `-d` 可以禁用响度标准化。 + +如果输入的音频文件符合下面的任一条件,建议使用 `-d` 禁用响度标准化: +- 音频过短 +- 音频已经调整过响度 +- 音频来自默认 TTS (小智当前使用的 TTS 的默认响度已是 -16 LUFS) + 例如: ```bash python convert_audio_to_p3.py input.mp3 output.p3 @@ -38,12 +45,29 @@ python play_p3.py python play_p3.py output.p3 ``` +## 3. 音频转回工具 (convert_p3_to_audio.py) + +将P3格式转换回普通音频文件。 + +### 使用方法 + +```bash +python convert_p3_to_audio.py <输入P3文件> <输出音频文件> +``` + +输出音频文件需要有扩展名。 + +例如: +```bash +python convert_p3_to_audio.py input.p3 output.wav +``` + ## 依赖安装 在使用这些脚本前,请确保安装了所需的Python库: ```bash -pip install librosa opuslib numpy tqdm sounddevice +pip install librosa opuslib numpy tqdm sounddevice pyloudnorm soundfile ``` 或者使用提供的requirements.txt文件: diff --git a/scripts/p3_tools/convert_audio_to_p3.py b/scripts/p3_tools/convert_audio_to_p3.py index 5a92f829..519d6620 100644 --- a/scripts/p3_tools/convert_audio_to_p3.py +++ b/scripts/p3_tools/convert_audio_to_p3.py @@ -5,44 +5,58 @@ import struct import sys import tqdm import numpy as np +import argparse +import pyloudnorm as pyln -def encode_audio_to_opus(input_file, output_file): +def encode_audio_to_opus(input_file, output_file, target_lufs=None): # Load audio file using librosa audio, sample_rate = librosa.load(input_file, sr=None, mono=False, dtype=np.float32) + # Convert to mono if stereo + if audio.ndim == 2: + audio = librosa.to_mono(audio) + + if target_lufs is not None: + print("Note: Automatic loudness adjustment is enabled, which may cause", file=sys.stderr) + print(" audio distortion. If the input audio has already been ", file=sys.stderr) + print(" loudness-adjusted or if the input audio is TTS audio, ", file=sys.stderr) + print(" please use the `-d` parameter to disable loudness adjustment.", file=sys.stderr) + meter = pyln.Meter(sample_rate) + current_loudness = meter.integrated_loudness(audio) + audio = pyln.normalize.loudness(audio, current_loudness, target_lufs) + print(f"Adjusted loudness: {current_loudness:.1f} LUFS -> {target_lufs} LUFS") + # Convert sample rate to 16000Hz if necessary target_sample_rate = 16000 if sample_rate != target_sample_rate: audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sample_rate) sample_rate = target_sample_rate - # Get left channel if stereo - if audio.ndim == 2: - audio = audio[0] - - # Convert audio data back to int16 after resampling + # Convert audio data back to int16 after processing audio = (audio * 32767).astype(np.int16) # Initialize Opus encoder encoder = opuslib.Encoder(sample_rate, 1, opuslib.APPLICATION_AUDIO) - # Encode audio data to Opus packets - # Save encoded data to file + # Encode and save with open(output_file, 'wb') as f: - duration = 60 # 60ms every frame + duration = 60 # 60ms per frame frame_size = int(sample_rate * duration / 1000) for i in tqdm.tqdm(range(0, len(audio) - frame_size, frame_size)): frame = audio[i:i + frame_size] opus_data = encoder.encode(frame.tobytes(), frame_size=frame_size) - # protocol format, [1u type, 1u reserved, 2u len, data] packet = struct.pack('>BBH', 0, 0, len(opus_data)) + opus_data f.write(packet) -# Example usage -if len(sys.argv) != 3: - print('Usage: python convert.py ') - sys.exit(1) +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Convert audio to Opus with loudness normalization') + parser.add_argument('input_file', help='Input audio file') + parser.add_argument('output_file', help='Output .opus file') + parser.add_argument('-l', '--lufs', type=float, default=-16.0, + help='Target loudness in LUFS (default: -16)') + parser.add_argument('-d', '--disable-loudnorm', action='store_true', + help='Disable loudness normalization') + args = parser.parse_args() -input_file = sys.argv[1] -output_file = sys.argv[2] -encode_audio_to_opus(input_file, output_file) + target_lufs = None if args.disable_loudnorm else args.lufs + encode_audio_to_opus(args.input_file, args.output_file, target_lufs) \ No newline at end of file diff --git a/scripts/p3_tools/convert_p3_to_audio.py b/scripts/p3_tools/convert_p3_to_audio.py new file mode 100644 index 00000000..f870b01c --- /dev/null +++ b/scripts/p3_tools/convert_p3_to_audio.py @@ -0,0 +1,51 @@ +import struct +import sys +import opuslib +import numpy as np +from tqdm import tqdm +import soundfile as sf + + +def decode_p3_to_audio(input_file, output_file): + sample_rate = 16000 + channels = 1 + decoder = opuslib.Decoder(sample_rate, channels) + + pcm_frames = [] + frame_size = int(sample_rate * 60 / 1000) + + with open(input_file, "rb") as f: + f.seek(0, 2) + total_size = f.tell() + f.seek(0) + + with tqdm(total=total_size, unit="B", unit_scale=True) as pbar: + while True: + header = f.read(4) + if not header or len(header) < 4: + break + + pkt_type, reserved, opus_len = struct.unpack(">BBH", header) + opus_data = f.read(opus_len) + if len(opus_data) != opus_len: + break + + pcm = decoder.decode(opus_data, frame_size) + pcm_frames.append(np.frombuffer(pcm, dtype=np.int16)) + + pbar.update(4 + opus_len) + + if not pcm_frames: + raise ValueError("No valid audio data found") + + pcm_data = np.concatenate(pcm_frames) + + sf.write(output_file, pcm_data, sample_rate, subtype="PCM_16") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python convert_p3_to_audio.py ") + sys.exit(1) + + decode_p3_to_audio(sys.argv[1], sys.argv[2]) diff --git a/scripts/p3_tools/requirements.txt b/scripts/p3_tools/requirements.txt index 64d4cc8a..d76d4cd5 100644 --- a/scripts/p3_tools/requirements.txt +++ b/scripts/p3_tools/requirements.txt @@ -2,4 +2,6 @@ librosa>=0.9.2 opuslib>=3.0.1 numpy>=1.20.0 tqdm>=4.62.0 -sounddevice>=0.4.4 \ No newline at end of file +sounddevice>=0.4.4 +pyloudnorm>=0.1.1 +soundfile>=0.13.1