afe_wake_word.cc 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. #include "afe_wake_word.h"
  2. #include "application.h"
  3. #include <esp_log.h>
  4. #include <model_path.h>
  5. #include <arpa/inet.h>
  6. #include <sstream>
  7. #define DETECTION_RUNNING_EVENT 1
  8. #define TAG "AfeWakeWord"
  9. AfeWakeWord::AfeWakeWord()
  10. : afe_data_(nullptr),
  11. wake_word_pcm_(),
  12. wake_word_opus_() {
  13. event_group_ = xEventGroupCreate();
  14. }
  15. AfeWakeWord::~AfeWakeWord() {
  16. if (afe_data_ != nullptr) {
  17. afe_iface_->destroy(afe_data_);
  18. }
  19. if (wake_word_encode_task_stack_ != nullptr) {
  20. heap_caps_free(wake_word_encode_task_stack_);
  21. }
  22. vEventGroupDelete(event_group_);
  23. }
  24. bool AfeWakeWord::Initialize(AudioCodec* codec) {
  25. codec_ = codec;
  26. int ref_num = codec_->input_reference() ? 1 : 0;
  27. srmodel_list_t *models = esp_srmodel_init("model");
  28. if (models == nullptr || models->num == -1) {
  29. ESP_LOGE(TAG, "Failed to initialize wakenet model");
  30. return false;
  31. }
  32. for (int i = 0; i < models->num; i++) {
  33. ESP_LOGI(TAG, "Model %d: %s", i, models->model_name[i]);
  34. if (strstr(models->model_name[i], ESP_WN_PREFIX) != NULL) {
  35. wakenet_model_ = models->model_name[i];
  36. auto words = esp_srmodel_get_wake_words(models, wakenet_model_);
  37. // split by ";" to get all wake words
  38. std::stringstream ss(words);
  39. std::string word;
  40. while (std::getline(ss, word, ';')) {
  41. wake_words_.push_back(word);
  42. }
  43. }
  44. }
  45. std::string input_format;
  46. for (int i = 0; i < codec_->input_channels() - ref_num; i++) {
  47. input_format.push_back('M');
  48. }
  49. for (int i = 0; i < ref_num; i++) {
  50. input_format.push_back('R');
  51. }
  52. afe_config_t* afe_config = afe_config_init(input_format.c_str(), models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
  53. afe_config->aec_init = codec_->input_reference();
  54. afe_config->aec_mode = AEC_MODE_SR_HIGH_PERF;
  55. afe_config->afe_perferred_core = 1;
  56. afe_config->afe_perferred_priority = 1;
  57. afe_config->memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM;
  58. afe_iface_ = esp_afe_handle_from_config(afe_config);
  59. afe_data_ = afe_iface_->create_from_config(afe_config);
  60. xTaskCreate([](void* arg) {
  61. auto this_ = (AfeWakeWord*)arg;
  62. this_->AudioDetectionTask();
  63. vTaskDelete(NULL);
  64. }, "audio_detection", 4096, this, 3, nullptr);
  65. return true;
  66. }
  67. void AfeWakeWord::OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) {
  68. wake_word_detected_callback_ = callback;
  69. }
  70. void AfeWakeWord::Start() {
  71. xEventGroupSetBits(event_group_, DETECTION_RUNNING_EVENT);
  72. }
  73. void AfeWakeWord::Stop() {
  74. xEventGroupClearBits(event_group_, DETECTION_RUNNING_EVENT);
  75. if (afe_data_ != nullptr) {
  76. afe_iface_->reset_buffer(afe_data_);
  77. }
  78. }
  79. void AfeWakeWord::Feed(const std::vector<int16_t>& data) {
  80. if (afe_data_ == nullptr) {
  81. return;
  82. }
  83. afe_iface_->feed(afe_data_, data.data());
  84. }
  85. size_t AfeWakeWord::GetFeedSize() {
  86. if (afe_data_ == nullptr) {
  87. return 0;
  88. }
  89. return afe_iface_->get_feed_chunksize(afe_data_) * codec_->input_channels();
  90. }
  91. void AfeWakeWord::AudioDetectionTask() {
  92. auto fetch_size = afe_iface_->get_fetch_chunksize(afe_data_);
  93. auto feed_size = afe_iface_->get_feed_chunksize(afe_data_);
  94. ESP_LOGI(TAG, "Audio detection task started, feed size: %d fetch size: %d",
  95. feed_size, fetch_size);
  96. while (true) {
  97. xEventGroupWaitBits(event_group_, DETECTION_RUNNING_EVENT, pdFALSE, pdTRUE, portMAX_DELAY);
  98. auto res = afe_iface_->fetch_with_delay(afe_data_, portMAX_DELAY);
  99. if (res == nullptr || res->ret_value == ESP_FAIL) {
  100. continue;;
  101. }
  102. // Store the wake word data for voice recognition, like who is speaking
  103. StoreWakeWordData(res->data, res->data_size / sizeof(int16_t));
  104. if (res->wakeup_state == WAKENET_DETECTED) {
  105. Stop();
  106. last_detected_wake_word_ = wake_words_[res->wakenet_model_index - 1];
  107. if (wake_word_detected_callback_) {
  108. wake_word_detected_callback_(last_detected_wake_word_);
  109. }
  110. }
  111. }
  112. }
  113. void AfeWakeWord::StoreWakeWordData(const int16_t* data, size_t samples) {
  114. // store audio data to wake_word_pcm_
  115. wake_word_pcm_.emplace_back(std::vector<int16_t>(data, data + samples));
  116. // keep about 2 seconds of data, detect duration is 30ms (sample_rate == 16000, chunksize == 512)
  117. while (wake_word_pcm_.size() > 2000 / 30) {
  118. wake_word_pcm_.pop_front();
  119. }
  120. }
  121. void AfeWakeWord::EncodeWakeWordData() {
  122. wake_word_opus_.clear();
  123. if (wake_word_encode_task_stack_ == nullptr) {
  124. wake_word_encode_task_stack_ = (StackType_t*)heap_caps_malloc(4096 * 8, MALLOC_CAP_SPIRAM);
  125. }
  126. wake_word_encode_task_ = xTaskCreateStatic([](void* arg) {
  127. auto this_ = (AfeWakeWord*)arg;
  128. {
  129. auto start_time = esp_timer_get_time();
  130. auto encoder = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
  131. encoder->SetComplexity(0); // 0 is the fastest
  132. int packets = 0;
  133. for (auto& pcm: this_->wake_word_pcm_) {
  134. encoder->Encode(std::move(pcm), [this_](std::vector<uint8_t>&& opus) {
  135. std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
  136. this_->wake_word_opus_.emplace_back(std::move(opus));
  137. this_->wake_word_cv_.notify_all();
  138. });
  139. packets++;
  140. }
  141. this_->wake_word_pcm_.clear();
  142. auto end_time = esp_timer_get_time();
  143. ESP_LOGI(TAG, "Encode wake word opus %d packets in %ld ms", packets, (long)((end_time - start_time) / 1000));
  144. std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
  145. this_->wake_word_opus_.push_back(std::vector<uint8_t>());
  146. this_->wake_word_cv_.notify_all();
  147. }
  148. vTaskDelete(NULL);
  149. }, "encode_detect_packets", 4096 * 8, this, 2, wake_word_encode_task_stack_, &wake_word_encode_task_buffer_);
  150. }
  151. bool AfeWakeWord::GetWakeWordOpus(std::vector<uint8_t>& opus) {
  152. std::unique_lock<std::mutex> lock(wake_word_mutex_);
  153. wake_word_cv_.wait(lock, [this]() {
  154. return !wake_word_opus_.empty();
  155. });
  156. opus.swap(wake_word_opus_.front());
  157. wake_word_opus_.pop_front();
  158. return !opus.empty();
  159. }