# convert_audio_to_p3.py (2.7 KB)
  1. # convert audio files to protocol v3 stream
  2. import librosa
  3. import opuslib
  4. import struct
  5. import sys
  6. import tqdm
  7. import numpy as np
  8. import argparse
  9. import pyloudnorm as pyln
  10. def encode_audio_to_opus(input_file, output_file, target_lufs=None):
  11. # Load audio file using librosa
  12. audio, sample_rate = librosa.load(input_file, sr=None, mono=False, dtype=np.float32)
  13. # Convert to mono if stereo
  14. if audio.ndim == 2:
  15. audio = librosa.to_mono(audio)
  16. if target_lufs is not None:
  17. print("Note: Automatic loudness adjustment is enabled, which may cause", file=sys.stderr)
  18. print(" audio distortion. If the input audio has already been ", file=sys.stderr)
  19. print(" loudness-adjusted or if the input audio is TTS audio, ", file=sys.stderr)
  20. print(" please use the `-d` parameter to disable loudness adjustment.", file=sys.stderr)
  21. meter = pyln.Meter(sample_rate)
  22. current_loudness = meter.integrated_loudness(audio)
  23. audio = pyln.normalize.loudness(audio, current_loudness, target_lufs)
  24. print(f"Adjusted loudness: {current_loudness:.1f} LUFS -> {target_lufs} LUFS")
  25. # Convert sample rate to 16000Hz if necessary
  26. target_sample_rate = 16000
  27. if sample_rate != target_sample_rate:
  28. audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sample_rate)
  29. sample_rate = target_sample_rate
  30. # Convert audio data back to int16 after processing
  31. audio = (audio * 32767).astype(np.int16)
  32. # Initialize Opus encoder
  33. encoder = opuslib.Encoder(sample_rate, 1, opuslib.APPLICATION_AUDIO)
  34. # Encode and save
  35. with open(output_file, 'wb') as f:
  36. duration = 60 # 60ms per frame
  37. frame_size = int(sample_rate * duration / 1000)
  38. for i in tqdm.tqdm(range(0, len(audio) - frame_size, frame_size)):
  39. frame = audio[i:i + frame_size]
  40. opus_data = encoder.encode(frame.tobytes(), frame_size=frame_size)
  41. packet = struct.pack('>BBH', 0, 0, len(opus_data)) + opus_data
  42. f.write(packet)
  43. if __name__ == "__main__":
  44. parser = argparse.ArgumentParser(description='Convert audio to Opus with loudness normalization')
  45. parser.add_argument('input_file', help='Input audio file')
  46. parser.add_argument('output_file', help='Output .opus file')
  47. parser.add_argument('-l', '--lufs', type=float, default=-16.0,
  48. help='Target loudness in LUFS (default: -16)')
  49. parser.add_argument('-d', '--disable-loudnorm', action='store_true',
  50. help='Disable loudness normalization')
  51. args = parser.parse_args()
  52. target_lufs = None if args.disable_loudnorm else args.lufs
  53. encode_audio_to_opus(args.input_file, args.output_file, target_lufs)