123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573 |
- #include "audio_service.h"
- #include <esp_log.h>
- #if CONFIG_USE_AUDIO_PROCESSOR
- #include "processors/afe_audio_processor.h"
- #else
- #include "processors/no_audio_processor.h"
- #endif
- #if CONFIG_USE_AFE_WAKE_WORD
- #include "wake_words/afe_wake_word.h"
- #elif CONFIG_USE_ESP_WAKE_WORD
- #include "wake_words/esp_wake_word.h"
- #elif CONFIG_USE_CUSTOM_WAKE_WORD
- #include "wake_words/custom_wake_word.h"
- #endif
- #define TAG "AudioService"
- AudioService::AudioService() {
- event_group_ = xEventGroupCreate();
- }
- AudioService::~AudioService() {
- if (event_group_ != nullptr) {
- vEventGroupDelete(event_group_);
- }
- }
- void AudioService::Initialize(AudioCodec* codec) {
- codec_ = codec;
- codec_->Start();
- /* Setup the audio codec */
- opus_decoder_ = std::make_unique<OpusDecoderWrapper>(codec->output_sample_rate(), 1, OPUS_FRAME_DURATION_MS);
- opus_encoder_ = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
- opus_encoder_->SetComplexity(0);
- if (codec->input_sample_rate() != 16000) {
- input_resampler_.Configure(codec->input_sample_rate(), 16000);
- reference_resampler_.Configure(codec->input_sample_rate(), 16000);
- }
- #if CONFIG_USE_AUDIO_PROCESSOR
- audio_processor_ = std::make_unique<AfeAudioProcessor>();
- #else
- audio_processor_ = std::make_unique<NoAudioProcessor>();
- #endif
- #if CONFIG_USE_AFE_WAKE_WORD
- wake_word_ = std::make_unique<AfeWakeWord>();
- #elif CONFIG_USE_ESP_WAKE_WORD
- wake_word_ = std::make_unique<EspWakeWord>();
- #elif CONFIG_USE_CUSTOM_WAKE_WORD
- wake_word_ = std::make_unique<CustomWakeWord>();
- #else
- wake_word_ = nullptr;
- #endif
- audio_processor_->OnOutput([this](std::vector<int16_t>&& data) {
- PushTaskToEncodeQueue(kAudioTaskTypeEncodeToSendQueue, std::move(data));
- });
- audio_processor_->OnVadStateChange([this](bool speaking) {
- voice_detected_ = speaking;
- if (callbacks_.on_vad_change) {
- callbacks_.on_vad_change(speaking);
- }
- });
- if (wake_word_) {
- wake_word_->OnWakeWordDetected([this](const std::string& wake_word) {
- if (callbacks_.on_wake_word_detected) {
- callbacks_.on_wake_word_detected(wake_word);
- }
- });
- }
- esp_timer_create_args_t audio_power_timer_args = {
- .callback = [](void* arg) {
- AudioService* audio_service = (AudioService*)arg;
- audio_service->CheckAndUpdateAudioPowerState();
- },
- .arg = this,
- .dispatch_method = ESP_TIMER_TASK,
- .name = "audio_power_timer",
- .skip_unhandled_events = true,
- };
- esp_timer_create(&audio_power_timer_args, &audio_power_timer_);
- }
- void AudioService::Start() {
- service_stopped_ = false;
- xEventGroupClearBits(event_group_, AS_EVENT_AUDIO_TESTING_RUNNING | AS_EVENT_WAKE_WORD_RUNNING | AS_EVENT_AUDIO_PROCESSOR_RUNNING);
- esp_timer_start_periodic(audio_power_timer_, 1000000);
- /* Start the audio input task */
- #if CONFIG_USE_AUDIO_PROCESSOR
- xTaskCreatePinnedToCore([](void* arg) {
- AudioService* audio_service = (AudioService*)arg;
- audio_service->AudioInputTask();
- vTaskDelete(NULL);
- }, "audio_input", 2048 * 3, this, 8, &audio_input_task_handle_, 1);
- #else
- xTaskCreate([](void* arg) {
- AudioService* audio_service = (AudioService*)arg;
- audio_service->AudioInputTask();
- vTaskDelete(NULL);
- }, "audio_input", 2048 * 3, this, 8, &audio_input_task_handle_);
- #endif
- /* Start the audio output task */
- xTaskCreate([](void* arg) {
- AudioService* audio_service = (AudioService*)arg;
- audio_service->AudioOutputTask();
- vTaskDelete(NULL);
- }, "audio_output", 4096, this, 3, &audio_output_task_handle_);
- /* Start the opus codec task */
- xTaskCreate([](void* arg) {
- AudioService* audio_service = (AudioService*)arg;
- audio_service->OpusCodecTask();
- vTaskDelete(NULL);
- }, "opus_codec", 4096 * 7, this, 2, &opus_codec_task_handle_);
- }
- void AudioService::Stop() {
- esp_timer_stop(audio_power_timer_);
- service_stopped_ = true;
- xEventGroupSetBits(event_group_, AS_EVENT_AUDIO_TESTING_RUNNING |
- AS_EVENT_WAKE_WORD_RUNNING |
- AS_EVENT_AUDIO_PROCESSOR_RUNNING);
- std::lock_guard<std::mutex> lock(audio_queue_mutex_);
- audio_encode_queue_.clear();
- audio_decode_queue_.clear();
- audio_playback_queue_.clear();
- audio_testing_queue_.clear();
- audio_queue_cv_.notify_all();
- }
- bool AudioService::ReadAudioData(std::vector<int16_t>& data, int sample_rate, int samples) {
- if (!codec_->input_enabled()) {
- codec_->EnableInput(true);
- esp_timer_start_periodic(audio_power_timer_, AUDIO_POWER_CHECK_INTERVAL_MS * 1000);
- }
- if (codec_->input_sample_rate() != sample_rate) {
- data.resize(samples * codec_->input_sample_rate() / sample_rate);
- if (!codec_->InputData(data)) {
- return false;
- }
- if (codec_->input_channels() == 2) {
- auto mic_channel = std::vector<int16_t>(data.size() / 2);
- auto reference_channel = std::vector<int16_t>(data.size() / 2);
- for (size_t i = 0, j = 0; i < mic_channel.size(); ++i, j += 2) {
- mic_channel[i] = data[j];
- reference_channel[i] = data[j + 1];
- }
- auto resampled_mic = std::vector<int16_t>(input_resampler_.GetOutputSamples(mic_channel.size()));
- auto resampled_reference = std::vector<int16_t>(reference_resampler_.GetOutputSamples(reference_channel.size()));
- input_resampler_.Process(mic_channel.data(), mic_channel.size(), resampled_mic.data());
- reference_resampler_.Process(reference_channel.data(), reference_channel.size(), resampled_reference.data());
- data.resize(resampled_mic.size() + resampled_reference.size());
- for (size_t i = 0, j = 0; i < resampled_mic.size(); ++i, j += 2) {
- data[j] = resampled_mic[i];
- data[j + 1] = resampled_reference[i];
- }
- } else {
- auto resampled = std::vector<int16_t>(input_resampler_.GetOutputSamples(data.size()));
- input_resampler_.Process(data.data(), data.size(), resampled.data());
- data = std::move(resampled);
- }
- } else {
- data.resize(samples);
- if (!codec_->InputData(data)) {
- return false;
- }
- }
- /* Update the last input time */
- last_input_time_ = std::chrono::steady_clock::now();
- debug_statistics_.input_count++;
- #if CONFIG_USE_AUDIO_DEBUGGER
- // 音频调试:发送原始音频数据
- if (audio_debugger_ == nullptr) {
- audio_debugger_ = std::make_unique<AudioDebugger>();
- }
- audio_debugger_->Feed(data);
- #endif
- return true;
- }
- void AudioService::AudioInputTask() {
- while (true) {
- EventBits_t bits = xEventGroupWaitBits(event_group_, AS_EVENT_AUDIO_TESTING_RUNNING |
- AS_EVENT_WAKE_WORD_RUNNING | AS_EVENT_AUDIO_PROCESSOR_RUNNING,
- pdFALSE, pdFALSE, portMAX_DELAY);
- if (service_stopped_) {
- break;
- }
- if (audio_input_need_warmup_) {
- audio_input_need_warmup_ = false;
- vTaskDelay(pdMS_TO_TICKS(120));
- continue;
- }
- /* Used for audio testing in NetworkConfiguring mode by clicking the BOOT button */
- if (bits & AS_EVENT_AUDIO_TESTING_RUNNING) {
- if (audio_testing_queue_.size() >= AUDIO_TESTING_MAX_DURATION_MS / OPUS_FRAME_DURATION_MS) {
- ESP_LOGW(TAG, "Audio testing queue is full, stopping audio testing");
- EnableAudioTesting(false);
- continue;
- }
- std::vector<int16_t> data;
- int samples = OPUS_FRAME_DURATION_MS * 16000 / 1000;
- if (ReadAudioData(data, 16000, samples)) {
- // If input channels is 2, we need to fetch the left channel data
- if (codec_->input_channels() == 2) {
- auto mono_data = std::vector<int16_t>(data.size() / 2);
- for (size_t i = 0, j = 0; i < mono_data.size(); ++i, j += 2) {
- mono_data[i] = data[j];
- }
- data = std::move(mono_data);
- }
- PushTaskToEncodeQueue(kAudioTaskTypeEncodeToTestingQueue, std::move(data));
- continue;
- }
- }
- /* Feed the wake word */
- if (bits & AS_EVENT_WAKE_WORD_RUNNING) {
- std::vector<int16_t> data;
- int samples = wake_word_->GetFeedSize();
- if (samples > 0) {
- if (ReadAudioData(data, 16000, samples)) {
- wake_word_->Feed(data);
- continue;
- }
- }
- }
- /* Feed the audio processor */
- if (bits & AS_EVENT_AUDIO_PROCESSOR_RUNNING) {
- std::vector<int16_t> data;
- int samples = audio_processor_->GetFeedSize();
- if (samples > 0) {
- if (ReadAudioData(data, 16000, samples)) {
- audio_processor_->Feed(std::move(data));
- continue;
- }
- }
- }
- ESP_LOGE(TAG, "Should not be here, bits: %lx", bits);
- break;
- }
- ESP_LOGW(TAG, "Audio input task stopped");
- }
- void AudioService::AudioOutputTask() {
- while (true) {
- std::unique_lock<std::mutex> lock(audio_queue_mutex_);
- audio_queue_cv_.wait(lock, [this]() { return !audio_playback_queue_.empty() || service_stopped_; });
- if (service_stopped_) {
- break;
- }
- auto task = std::move(audio_playback_queue_.front());
- audio_playback_queue_.pop_front();
- audio_queue_cv_.notify_all();
- lock.unlock();
- if (!codec_->output_enabled()) {
- codec_->EnableOutput(true);
- esp_timer_start_periodic(audio_power_timer_, AUDIO_POWER_CHECK_INTERVAL_MS * 1000);
- }
- codec_->OutputData(task->pcm);
- /* Update the last output time */
- last_output_time_ = std::chrono::steady_clock::now();
- debug_statistics_.playback_count++;
- #if CONFIG_USE_SERVER_AEC
- /* Record the timestamp for server AEC */
- if (task->timestamp > 0) {
- lock.lock();
- timestamp_queue_.push_back(task->timestamp);
- }
- #endif
- }
- ESP_LOGW(TAG, "Audio output task stopped");
- }
- void AudioService::OpusCodecTask() {
- while (true) {
- std::unique_lock<std::mutex> lock(audio_queue_mutex_);
- audio_queue_cv_.wait(lock, [this]() {
- return service_stopped_ ||
- (!audio_encode_queue_.empty() && audio_send_queue_.size() < MAX_SEND_PACKETS_IN_QUEUE) ||
- (!audio_decode_queue_.empty() && audio_playback_queue_.size() < MAX_PLAYBACK_TASKS_IN_QUEUE);
- });
- if (service_stopped_) {
- break;
- }
- /* Decode the audio from decode queue */
- if (!audio_decode_queue_.empty() && audio_playback_queue_.size() < MAX_PLAYBACK_TASKS_IN_QUEUE) {
- auto packet = std::move(audio_decode_queue_.front());
- audio_decode_queue_.pop_front();
- audio_queue_cv_.notify_all();
- lock.unlock();
- auto task = std::make_unique<AudioTask>();
- task->type = kAudioTaskTypeDecodeToPlaybackQueue;
- task->timestamp = packet->timestamp;
- SetDecodeSampleRate(packet->sample_rate, packet->frame_duration);
- if (opus_decoder_->Decode(std::move(packet->payload), task->pcm)) {
- // Resample if the sample rate is different
- if (opus_decoder_->sample_rate() != codec_->output_sample_rate()) {
- int target_size = output_resampler_.GetOutputSamples(task->pcm.size());
- std::vector<int16_t> resampled(target_size);
- output_resampler_.Process(task->pcm.data(), task->pcm.size(), resampled.data());
- task->pcm = std::move(resampled);
- }
- lock.lock();
- audio_playback_queue_.push_back(std::move(task));
- audio_queue_cv_.notify_all();
- } else {
- ESP_LOGE(TAG, "Failed to decode audio");
- lock.lock();
- }
- debug_statistics_.decode_count++;
- }
-
- /* Encode the audio to send queue */
- if (!audio_encode_queue_.empty() && audio_send_queue_.size() < MAX_SEND_PACKETS_IN_QUEUE) {
- auto task = std::move(audio_encode_queue_.front());
- audio_encode_queue_.pop_front();
- audio_queue_cv_.notify_all();
- lock.unlock();
- auto packet = std::make_unique<AudioStreamPacket>();
- packet->frame_duration = OPUS_FRAME_DURATION_MS;
- packet->sample_rate = 16000;
- packet->timestamp = task->timestamp;
- if (!opus_encoder_->Encode(std::move(task->pcm), packet->payload)) {
- ESP_LOGE(TAG, "Failed to encode audio");
- continue;
- }
- if (task->type == kAudioTaskTypeEncodeToSendQueue) {
- {
- std::lock_guard<std::mutex> lock(audio_queue_mutex_);
- audio_send_queue_.push_back(std::move(packet));
- }
- if (callbacks_.on_send_queue_available) {
- callbacks_.on_send_queue_available();
- }
- } else if (task->type == kAudioTaskTypeEncodeToTestingQueue) {
- std::lock_guard<std::mutex> lock(audio_queue_mutex_);
- audio_testing_queue_.push_back(std::move(packet));
- }
- debug_statistics_.encode_count++;
- lock.lock();
- }
- }
- ESP_LOGW(TAG, "Opus codec task stopped");
- }
- void AudioService::SetDecodeSampleRate(int sample_rate, int frame_duration) {
- if (opus_decoder_->sample_rate() == sample_rate && opus_decoder_->duration_ms() == frame_duration) {
- return;
- }
- opus_decoder_.reset();
- opus_decoder_ = std::make_unique<OpusDecoderWrapper>(sample_rate, 1, frame_duration);
- auto codec = Board::GetInstance().GetAudioCodec();
- if (opus_decoder_->sample_rate() != codec->output_sample_rate()) {
- ESP_LOGI(TAG, "Resampling audio from %d to %d", opus_decoder_->sample_rate(), codec->output_sample_rate());
- output_resampler_.Configure(opus_decoder_->sample_rate(), codec->output_sample_rate());
- }
- }
- void AudioService::PushTaskToEncodeQueue(AudioTaskType type, std::vector<int16_t>&& pcm) {
- auto task = std::make_unique<AudioTask>();
- task->type = type;
- task->pcm = std::move(pcm);
-
- /* Push the task to the encode queue */
- std::unique_lock<std::mutex> lock(audio_queue_mutex_);
- /* If the task is to send queue, we need to set the timestamp */
- if (type == kAudioTaskTypeEncodeToSendQueue && !timestamp_queue_.empty()) {
- if (timestamp_queue_.size() <= MAX_TIMESTAMPS_IN_QUEUE) {
- task->timestamp = timestamp_queue_.front();
- } else {
- ESP_LOGW(TAG, "Timestamp queue (%u) is full, dropping timestamp", timestamp_queue_.size());
- }
- timestamp_queue_.pop_front();
- }
- audio_queue_cv_.wait(lock, [this]() { return audio_encode_queue_.size() < MAX_ENCODE_TASKS_IN_QUEUE; });
- audio_encode_queue_.push_back(std::move(task));
- audio_queue_cv_.notify_all();
- }
- bool AudioService::PushPacketToDecodeQueue(std::unique_ptr<AudioStreamPacket> packet, bool wait) {
- std::unique_lock<std::mutex> lock(audio_queue_mutex_);
- if (audio_decode_queue_.size() >= MAX_DECODE_PACKETS_IN_QUEUE) {
- if (wait) {
- audio_queue_cv_.wait(lock, [this]() { return audio_decode_queue_.size() < MAX_DECODE_PACKETS_IN_QUEUE; });
- } else {
- return false;
- }
- }
- audio_decode_queue_.push_back(std::move(packet));
- audio_queue_cv_.notify_all();
- return true;
- }
- std::unique_ptr<AudioStreamPacket> AudioService::PopPacketFromSendQueue() {
- std::lock_guard<std::mutex> lock(audio_queue_mutex_);
- if (audio_send_queue_.empty()) {
- return nullptr;
- }
- auto packet = std::move(audio_send_queue_.front());
- audio_send_queue_.pop_front();
- audio_queue_cv_.notify_all();
- return packet;
- }
- void AudioService::EncodeWakeWord() {
- if (wake_word_) {
- wake_word_->EncodeWakeWordData();
- }
- }
- const std::string& AudioService::GetLastWakeWord() const {
- return wake_word_->GetLastDetectedWakeWord();
- }
- std::unique_ptr<AudioStreamPacket> AudioService::PopWakeWordPacket() {
- auto packet = std::make_unique<AudioStreamPacket>();
- if (wake_word_->GetWakeWordOpus(packet->payload)) {
- return packet;
- }
- return nullptr;
- }
- void AudioService::EnableWakeWordDetection(bool enable) {
- if (!wake_word_) {
- return;
- }
- ESP_LOGD(TAG, "%s wake word detection", enable ? "Enabling" : "Disabling");
- if (enable) {
- if (!wake_word_initialized_) {
- if (!wake_word_->Initialize(codec_)) {
- ESP_LOGE(TAG, "Failed to initialize wake word");
- return;
- }
- wake_word_initialized_ = true;
- }
- wake_word_->Start();
- xEventGroupSetBits(event_group_, AS_EVENT_WAKE_WORD_RUNNING);
- } else {
- wake_word_->Stop();
- xEventGroupClearBits(event_group_, AS_EVENT_WAKE_WORD_RUNNING);
- }
- }
- void AudioService::EnableVoiceProcessing(bool enable) {
- ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling");
- if (enable) {
- if (!audio_processor_initialized_) {
- audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS);
- audio_processor_initialized_ = true;
- }
- /* We should make sure no audio is playing */
- ResetDecoder();
- audio_input_need_warmup_ = true;
- audio_processor_->Start();
- xEventGroupSetBits(event_group_, AS_EVENT_AUDIO_PROCESSOR_RUNNING);
- } else {
- audio_processor_->Stop();
- xEventGroupClearBits(event_group_, AS_EVENT_AUDIO_PROCESSOR_RUNNING);
- }
- }
- void AudioService::EnableAudioTesting(bool enable) {
- ESP_LOGI(TAG, "%s audio testing", enable ? "Enabling" : "Disabling");
- if (enable) {
- xEventGroupSetBits(event_group_, AS_EVENT_AUDIO_TESTING_RUNNING);
- } else {
- xEventGroupClearBits(event_group_, AS_EVENT_AUDIO_TESTING_RUNNING);
- /* Copy audio_testing_queue_ to audio_decode_queue_ */
- std::lock_guard<std::mutex> lock(audio_queue_mutex_);
- audio_decode_queue_ = std::move(audio_testing_queue_);
- audio_queue_cv_.notify_all();
- }
- }
- void AudioService::EnableDeviceAec(bool enable) {
- ESP_LOGI(TAG, "%s device AEC", enable ? "Enabling" : "Disabling");
- audio_processor_->EnableDeviceAec(enable);
- }
- void AudioService::SetCallbacks(AudioServiceCallbacks& callbacks) {
- callbacks_ = callbacks;
- }
- void AudioService::PlaySound(const std::string_view& sound) {
- const char* data = sound.data();
- size_t size = sound.size();
- for (const char* p = data; p < data + size; ) {
- auto p3 = (BinaryProtocol3*)p;
- p += sizeof(BinaryProtocol3);
- auto payload_size = ntohs(p3->payload_size);
- auto packet = std::make_unique<AudioStreamPacket>();
- packet->sample_rate = 16000;
- packet->frame_duration = 60;
- packet->payload.resize(payload_size);
- memcpy(packet->payload.data(), p3->payload, payload_size);
- p += payload_size;
- PushPacketToDecodeQueue(std::move(packet), true);
- }
- }
- bool AudioService::IsIdle() {
- std::lock_guard<std::mutex> lock(audio_queue_mutex_);
- return audio_encode_queue_.empty() && audio_decode_queue_.empty() && audio_playback_queue_.empty() && audio_testing_queue_.empty();
- }
- void AudioService::ResetDecoder() {
- std::lock_guard<std::mutex> lock(audio_queue_mutex_);
- opus_decoder_->ResetState();
- timestamp_queue_.clear();
- audio_decode_queue_.clear();
- audio_playback_queue_.clear();
- audio_testing_queue_.clear();
- audio_queue_cv_.notify_all();
- }
- void AudioService::CheckAndUpdateAudioPowerState() {
- auto now = std::chrono::steady_clock::now();
- auto input_elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - last_input_time_).count();
- auto output_elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - last_output_time_).count();
- if (input_elapsed > AUDIO_POWER_TIMEOUT_MS && codec_->input_enabled()) {
- codec_->EnableInput(false);
- }
- if (output_elapsed > AUDIO_POWER_TIMEOUT_MS && codec_->output_enabled()) {
- codec_->EnableOutput(false);
- }
- if (!codec_->input_enabled() && !codec_->output_enabled()) {
- esp_timer_stop(audio_power_timer_);
- }
- }
|