diff --git a/dialogflow_task_executive/node_scripts/dialogflow_client.py b/dialogflow_task_executive/node_scripts/dialogflow_client.py
index 98b99c15e..d8dbeb08a 100644
--- a/dialogflow_task_executive/node_scripts/dialogflow_client.py
+++ b/dialogflow_task_executive/node_scripts/dialogflow_client.py
@@ -271,7 +271,7 @@ def speak_result(self, result):
volume=self.volume)
# for japanese or utf-8 languages
- if self.language == 'ja-JP' and sys.version <= 2:
+ if self.language == 'ja-JP' and sys.version_info.major <= 2:
msg.arg = result.fulfillment_text.encode('utf-8')
else:
msg.arg = result.fulfillment_text
diff --git a/respeaker_ros/CMakeLists.txt b/respeaker_ros/CMakeLists.txt
index 390a82e0f..b4009cd1a 100644
--- a/respeaker_ros/CMakeLists.txt
+++ b/respeaker_ros/CMakeLists.txt
@@ -35,7 +35,9 @@ catkin_install_python(PROGRAMS ${PYTHON_SCRIPTS}
if(CATKIN_ENABLE_TESTING)
find_package(rostest REQUIRED)
+ find_package(roslaunch REQUIRED)
add_rostest(test/sample_respeaker.test
DEPENDENCIES ${PROJECT_NAME}_generate_virtualenv
)
+ roslaunch_add_file_check(launch/sample_respeaker.launch)
endif()
diff --git a/respeaker_ros/README.md b/respeaker_ros/README.md
index e42ba1202..247168ba5 100644
--- a/respeaker_ros/README.md
+++ b/respeaker_ros/README.md
@@ -92,6 +92,151 @@ A ROS Package for Respeaker Mic Array
a: 0.3"
```
+## Topics and parameters for respeaker_node.py
+
+ - ### Publishing topics
+
+ - `audio` (`audio_common_msgs/AudioData`)
+
+ Processed audio for ASR. 1 channel.
+
+ - `audio_info` (`audio_common_msgs/AudioInfo`)
+
+ Audio info for the `audio` topic.
+
+ - `audio_raw` (`audio_common_msgs/AudioData`)
+
+ Raw microphone array audio (4 channels). You may need to update the ReSpeaker firmware to get this topic.
+
+ If the firmware does not support it, this topic is not published.
+
+ - `audio_info_raw` (`audio_common_msgs/AudioInfo`)
+
+ Audio info for the `audio_raw` topic.
+
+ If the firmware does not support it, this topic is not published.
+
+ - `speech_audio` (`audio_common_msgs/AudioData`)
+
+ Audio data captured while someone is speaking, as detected by the VAD function.
+
+ - `speech_audio_raw` (`audio_common_msgs/AudioData`)
+
+ 4-channel raw audio data captured while someone is speaking, as detected by the VAD function.
+
+ If the firmware does not support it, this topic is not published.
+
+ - `audio_merged_playback` (`audio_common_msgs/AudioData`)
+
+ Audio that merges the microphone input with the sound played back by the speaker.
+
+ If the firmware does not support it, this topic is not published.
+
+ For more details, see https://wiki.seeedstudio.com/ReSpeaker_Mic_Array_v2.0/
+
+ - `~is_speeching` (`std_msgs/Bool`)
+
+ Whether someone is currently speaking, as detected by the VAD function.
+
+ - `~sound_direction` (`std_msgs/Int32`)
+
+ Estimated direction of arrival (DoA) of the sound.
+
+ - `~sound_localization` (`geometry_msgs/PoseStamped`)
+
+ Localized sound direction. The pose is placed at distance `~doa_xy_offset` from the sensor frame along the estimated direction (see the sketch below).
+
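+ An example of using this topic (a minimal sketch; it assumes the default topic name and only extracts the yaw of the pose):
+
+ ```python
+ import math
+
+ import rospy
+ from geometry_msgs.msg import PoseStamped
+ from tf.transformations import euler_from_quaternion
+
+ def cb(msg):
+     # the direction is encoded in the pose orientation
+     q = msg.pose.orientation
+     _, _, yaw = euler_from_quaternion([q.x, q.y, q.z, q.w])
+     rospy.loginfo("sound from %.1f deg" % math.degrees(yaw))
+
+ rospy.init_node("doa_listener")
+ rospy.Subscriber("sound_localization", PoseStamped, cb)
+ rospy.spin()
+ ```
+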
+ - ### Parameters
+
+ - `~update_rate` (`Double`, default: `10.0`)
+
+ Rate [Hz] at which the info topics (`~is_speeching`, `~sound_direction`, `~sound_localization`, `~speech_audio` and `~speech_audio_raw`) are published.
+
+ - `~sensor_frame_id` (`String`, default: `respeaker_base`)
+
+ Frame ID of the sensor, used in the headers of published messages.
+
+ - `~doa_xy_offset` (`Double`, default: `0.0`)
+
+ Radius [m] at which `~sound_localization` is placed along the estimated sound direction.
+
+ - `~doa_yaw_offset` (`Double`, default: `90.0`)
+
+ Yaw offset [deg] applied to the estimated DoA.
+
+ - `~speech_prefetch` (`Double`, default: `0.5`)
+
+ Duration [s] of audio kept in the prefetch buffer and prepended to `~speech_audio` when speech starts.
+
+ - `~speech_continuation` (`Double`, default: `0.5`)
+
+ If speech stopped less than this duration [s] ago, the person is still considered to be speaking.
+
+ - `~speech_max_duration` (`Double`, default: `7.0`)
+
+ - `~speech_min_duration` (`Double`, default: `0.1`)
+
+ `~speech_audio` and `~speech_audio_raw` are published only when the detected speech duration is at least `~speech_min_duration` and shorter than `~speech_max_duration`.
+
+ - `~suppress_pyaudio_error` (`Bool`, default: `True`)
+
+ If `True`, suppress error messages from pyaudio.
+
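+ The following minimal listener is a sketch of how these topics can be consumed (it assumes the default topic names in the current namespace):
+
+ ```python
+ import rospy
+ from audio_common_msgs.msg import AudioData
+ from std_msgs.msg import Bool
+
+ def on_audio(msg):
+     rospy.loginfo("got %d bytes of audio" % len(msg.data))
+
+ def on_vad(msg):
+     rospy.loginfo("speaking: %s" % msg.data)
+
+ rospy.init_node("respeaker_listener")
+ rospy.Subscriber("audio", AudioData, on_audio)
+ rospy.Subscriber("is_speeching", Bool, on_vad)
+ rospy.spin()
+ ```
+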
+## Topics and parameters for speech_to_text.py
+
+ - ### Publishing topics
+
+ - `~speech_to_text` (`speech_recognition_msgs/SpeechRecognitionCandidates`)
+
+ Recognized text.
+
+ - ### Subscribing topics
+
+ - `audio` (`audio_common_msgs/AudioData`)
+
+ Input audio.
+
+ - ### Parameters
+
+ - `~audio_info` (`String`, default: ``)
+
+ Name of an `audio_common_msgs/AudioInfo` topic. If set, the `~sample_rate`, `~sample_width` and `~channels` parameters are obtained from that topic instead.
+
+ - `~sample_rate` (`Int`, default: `16000`)
+
+ Sampling rate [Hz] of the input audio.
+
+ - `~sample_width` (`Int`, default: `2`)
+
+ Sample width in bytes (e.g. `2` for 16-bit audio).
+
+ - `~channels` (`Int`, default: `1`)
+
+ Number of channels.
+
+ - `~target_channel` (`Int`, default: `0`)
+
+ Index of the channel extracted for recognition.
+
+ - `~language` (`String`, default: `ja-JP`)
+
+ Language for the speech-to-text service. English users can specify `en-US`.
+
+ - `~self_cancellation` (`Bool`, default: `True`)
+
+ If `True`, ignore voice input while the robot is speaking.
+
+ - `~tts_tolerance` (`Double`, default: `1.0`)
+
+ Duration [s] for which the robot is still treated as speaking after the TTS action finishes.
+
+ - `~tts_action_names` (`List[String]`, default: `['sound_play']`)
+
+ Used when `~self_cancellation` is `True`.
+
+ While any of these actions is active, incoming `audio` messages are ignored.
+
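+ A minimal sketch of consuming the recognition result (assuming the resolved topic name is `speech_to_text`):
+
+ ```python
+ import rospy
+ from speech_recognition_msgs.msg import SpeechRecognitionCandidates
+
+ def cb(msg):
+     # transcript[0] holds the best candidate
+     rospy.loginfo("heard: %s" % msg.transcript[0])
+
+ rospy.init_node("stt_listener")
+ rospy.Subscriber("speech_to_text", SpeechRecognitionCandidates, cb)
+ rospy.spin()
+ ```
+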
## Use cases
### Voice Recognition
diff --git a/respeaker_ros/launch/sample_respeaker.launch b/respeaker_ros/launch/sample_respeaker.launch
index 31d083608..e2c43c557 100644
--- a/respeaker_ros/launch/sample_respeaker.launch
+++ b/respeaker_ros/launch/sample_respeaker.launch
@@ -13,6 +13,8 @@
+
+
+ respawn="true" respawn_delay="10" >
+
@@ -30,6 +33,7 @@
+ audio_info: $(arg audio_info)
language: $(arg language)
self_cancellation: $(arg self_cancellation)
tts_tolerance: 0.5
diff --git a/respeaker_ros/package.xml b/respeaker_ros/package.xml
index ac83b898a..be16789a2 100644
--- a/respeaker_ros/package.xml
+++ b/respeaker_ros/package.xml
@@ -17,6 +17,7 @@
flac
geometry_msgs
std_msgs
+ sound_play
speech_recognition_msgs
tf
python-numpy
diff --git a/respeaker_ros/scripts/respeaker_node.py b/respeaker_ros/scripts/respeaker_node.py
index d9ec870aa..261598444 100644
--- a/respeaker_ros/scripts/respeaker_node.py
+++ b/respeaker_ros/scripts/respeaker_node.py
@@ -16,6 +16,13 @@
import sys
import time
from audio_common_msgs.msg import AudioData
+enable_audio_info = True
+try:
+ from audio_common_msgs.msg import AudioInfo
+except ImportError:
+ rospy.logwarn('audio_common_msgs/AudioInfo message does not exist.'
+ ' AudioInfo messages will not be published.')
+ enable_audio_info = False
from geometry_msgs.msg import PoseStamped
from std_msgs.msg import Bool, Int32, ColorRGBA
from dynamic_reconfigure.server import Server
@@ -265,7 +272,6 @@ def __init__(self, on_audio, channel=0, suppress_error=True):
if self.channels != 6:
rospy.logwarn("%d channel is found for respeaker" % self.channels)
rospy.logwarn("You may have to update firmware.")
- self.channel = min(self.channels - 1, max(0, self.channel))
self.stream = self.pyaudio.open(
input=True, start=False,
@@ -295,9 +301,8 @@ def stream_callback(self, in_data, frame_count, time_info, status):
data = np.frombuffer(in_data, dtype=np.int16)
chunk_per_channel = int(len(data) / self.channels)
data = np.reshape(data, (chunk_per_channel, self.channels))
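+ # after reshape, rows are audio frames and columns are device channels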
- chan_data = data[:, self.channel]
# invoke callback
- self.on_audio(chan_data.tobytes())
+ self.on_audio(data)
return None, pyaudio.paContinue
def start(self):
@@ -333,14 +338,24 @@ def __init__(self):
self.pub_doa_raw = rospy.Publisher("sound_direction", Int32, queue_size=1, latch=True)
self.pub_doa = rospy.Publisher("sound_localization", PoseStamped, queue_size=1, latch=True)
self.pub_audio = rospy.Publisher("audio", AudioData, queue_size=10)
+ if enable_audio_info:
+ self.pub_audio_info = rospy.Publisher("audio_info", AudioInfo,
+ queue_size=1, latch=True)
+ self.pub_audio_raw_info = rospy.Publisher("audio_info_raw", AudioInfo,
+ queue_size=1, latch=True)
self.pub_speech_audio = rospy.Publisher("speech_audio", AudioData, queue_size=10)
# init config
self.config = None
self.dyn_srv = Server(RespeakerConfig, self.on_config)
# start
self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error)
+ self.n_channel = self.respeaker_audio.channels
+
self.speech_prefetch_bytes = int(
- self.speech_prefetch * self.respeaker_audio.rate * self.respeaker_audio.bitdepth / 8.0)
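+ # 1 channel: only the processed (ASR) audio is stored in this buffer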
+ 1
+ * self.speech_prefetch
+ * self.respeaker_audio.rate
+ * self.respeaker_audio.bitdepth / 8.0)
self.speech_prefetch_buffer = b""
self.respeaker_audio.start()
self.info_timer = rospy.Timer(rospy.Duration(1.0 / self.update_rate),
@@ -348,6 +363,58 @@ def __init__(self):
self.timer_led = None
self.sub_led = rospy.Subscriber("status_led", ColorRGBA, self.on_status_led)
+ # processed audio for ASR
+ if enable_audio_info:
+ info_msg = AudioInfo(
+ channels=1,
+ sample_rate=self.respeaker_audio.rate,
+ sample_format='S16LE',
+ bitrate=self.respeaker_audio.rate * self.respeaker_audio.bitdepth,
+ coding_format='WAVE')
+ self.pub_audio_info.publish(info_msg)
+
+ if self.n_channel > 1:
+ # The respeaker has 4 microphones.
+ # Multiple microphones can be used for beam forming
+ # (strengthening the sound from a specific direction) and for
+ # sound localization (the respeaker itself only outputs the
+ # azimuth, but the elevation can also be estimated from the
+ # multichannel data).
+
+ # Channel 0: processed audio for ASR
+ # Channel 1: mic1 raw data
+ # Channel 2: mic2 raw data
+ # Channel 3: mic3 raw data
+ # Channel 4: mic4 raw data
+ # Channel 5: merged playback
+ # For more detail, please see
+ # https://wiki.seeedstudio.com/ReSpeaker_Mic_Array_v2.0/
+ # channels 1-4 (self.n_channel - 2 = 4) carry the raw microphones.
+ self.pub_audio_raw = rospy.Publisher("audio_raw", AudioData,
+ queue_size=10)
+ self.pub_audio_merged_playback = rospy.Publisher(
+ "audio_merged_playback", AudioData,
+ queue_size=10)
+ if enable_audio_info:
+ info_raw_msg = AudioInfo(
+ channels=self.n_channel - 2,
+ sample_rate=self.respeaker_audio.rate,
+ sample_format='S16LE',
+ bitrate=(self.respeaker_audio.rate *
+ self.respeaker_audio.bitdepth),
+ coding_format='WAVE')
+ self.pub_audio_raw_info.publish(info_raw_msg)
+
+ self.speech_audio_raw_buffer = b""
+ self.speech_raw_prefetch_buffer = b""
+ self.pub_speech_audio_raw = rospy.Publisher(
+ "speech_audio_raw", AudioData, queue_size=10)
+ self.speech_raw_prefetch_bytes = int(
+ (self.n_channel - 2)
+ * self.speech_prefetch
+ * self.respeaker_audio.rate
+ * self.respeaker_audio.bitdepth / 8.0)
+
def on_shutdown(self):
self.info_timer.shutdown()
try:
@@ -386,14 +453,30 @@ def on_status_led(self, msg):
oneshot=True)
def on_audio(self, data):
- self.pub_audio.publish(AudioData(data=data))
+ # take processed audio for ASR.
+ processed_data = data[:, 0].tobytes()
+ self.pub_audio.publish(AudioData(data=processed_data))
+ if self.n_channel > 1:
+ raw_audio_data = data[:, 1:5].reshape(-1).tobytes()
+ self.pub_audio_raw.publish(
+ AudioData(data=raw_audio_data))
+ self.pub_audio_merged_playback.publish(
+ AudioData(data=data[:, 5].tobytes()))
if self.is_speeching:
if len(self.speech_audio_buffer) == 0:
self.speech_audio_buffer = self.speech_prefetch_buffer
- self.speech_audio_buffer += data
+ if self.n_channel > 1:
+ self.speech_audio_raw_buffer = self.speech_raw_prefetch_buffer
+ self.speech_audio_buffer += processed_data
+ if self.n_channel > 1:
+ self.speech_audio_raw_buffer += raw_audio_data
else:
- self.speech_prefetch_buffer += data
+ self.speech_prefetch_buffer += processed_data
self.speech_prefetch_buffer = self.speech_prefetch_buffer[-self.speech_prefetch_bytes:]
+ if self.n_channel > 1:
+ self.speech_raw_prefetch_buffer += raw_audio_data
+ self.speech_raw_prefetch_buffer = self.speech_raw_prefetch_buffer[
+ -self.speech_raw_prefetch_bytes:]
def on_timer(self, event):
stamp = event.current_real or rospy.Time.now()
@@ -433,13 +516,15 @@ def on_timer(self, event):
elif self.is_speeching:
buf = self.speech_audio_buffer
self.speech_audio_buffer = b""
+ buf_raw = self.speech_audio_raw_buffer
+ self.speech_audio_raw_buffer = b""
self.is_speeching = False
duration = 8.0 * len(buf) * self.respeaker_audio.bitwidth
+ # buf holds single-channel (processed) audio, so no division by the channel count
duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth
rospy.loginfo("Speech detected for %.3f seconds" % duration)
if self.speech_min_duration <= duration < self.speech_max_duration:
-
self.pub_speech_audio.publish(AudioData(data=buf))
+ self.pub_speech_audio_raw.publish(AudioData(data=buf_raw))
if __name__ == '__main__':
diff --git a/respeaker_ros/scripts/speech_to_text.py b/respeaker_ros/scripts/speech_to_text.py
index 0974b2f65..6765e2e04 100644
--- a/respeaker_ros/scripts/speech_to_text.py
+++ b/respeaker_ros/scripts/speech_to_text.py
@@ -2,6 +2,10 @@
# -*- coding: utf-8 -*-
# Author: Yuki Furuta
+from __future__ import division
+
+import sys
+
import actionlib
import rospy
try:
@@ -9,8 +13,16 @@
except ImportError as e:
raise ImportError(str(e) + '\nplease try "pip install speechrecognition"')
+import numpy as np
from actionlib_msgs.msg import GoalStatus, GoalStatusArray
from audio_common_msgs.msg import AudioData
+enable_audio_info = True
+try:
+ from audio_common_msgs.msg import AudioInfo
+except ImportError:
+ rospy.logwarn('audio_common_msgs/AudioInfo message does not exist.'
+ ' AudioInfo messages will not be published.')
+ enable_audio_info = False
from sound_play.msg import SoundRequest, SoundRequestAction, SoundRequestGoal
from speech_recognition_msgs.msg import SpeechRecognitionCandidates
@@ -18,8 +30,32 @@
class SpeechToText(object):
def __init__(self):
# format of input audio data
- self.sample_rate = rospy.get_param("~sample_rate", 16000)
- self.sample_width = rospy.get_param("~sample_width", 2)
+ audio_info_topic_name = rospy.get_param('~audio_info', '')
+ if len(audio_info_topic_name) > 0:
+ if not enable_audio_info:
+ rospy.logerr(
+ 'audio_common_msgs/AudioInfo message does not exist.'
+ ' Setting ~audio_info is not valid in this environment.')
+ sys.exit(1)
+ rospy.loginfo('Extracting audio info parameters from {}'.format(
+ audio_info_topic_name))
+ audio_info_msg = rospy.wait_for_message(
+ audio_info_topic_name, AudioInfo)
+ self.sample_rate = audio_info_msg.sample_rate
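+ # respeaker_node.py publishes bitrate = sample_rate * bitdepth,
+ # so bytes per sample = bitrate / sample_rate / 8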
+ self.sample_width = audio_info_msg.bitrate // self.sample_rate // 8
+ self.channels = audio_info_msg.channels
+ else:
+ self.sample_rate = rospy.get_param("~sample_rate", 16000)
+ self.sample_width = rospy.get_param("~sample_width", 2)
+ self.channels = rospy.get_param("~channels", 1)
+ if self.sample_width == 2:
+ self.dtype = 'int16'
+ elif self.sample_width == 4:
+ self.dtype = 'int32'
+ else:
+ raise NotImplementedError('sample_width {} is not supported'
+ .format(self.sample_width))
+ self.target_channel = rospy.get_param("~target_channel", 0)
# language of STT service
self.language = rospy.get_param("~language", "ja-JP")
# ignore voice input while the robot is speaking
@@ -78,7 +114,11 @@ def audio_cb(self, msg):
if self.is_canceling:
rospy.loginfo("Speech is cancelled")
return
- data = SR.AudioData(msg.data, self.sample_rate, self.sample_width)
+
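+ # deinterleave: keep only the target channel from the interleaved frames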
+ data = SR.AudioData(
+ np.frombuffer(msg.data, dtype=self.dtype)[
+ self.target_channel::self.channels].tobytes(),
+ self.sample_rate, self.sample_width)
try:
rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
result = self.recognizer.recognize_google(
diff --git a/respeaker_ros/test/sample_respeaker.test b/respeaker_ros/test/sample_respeaker.test
index 5d51c220c..61f10fb7b 100644
--- a/respeaker_ros/test/sample_respeaker.test
+++ b/respeaker_ros/test/sample_respeaker.test
@@ -3,6 +3,7 @@
+
diff --git a/rostwitter/scripts/tweet_image_server.py b/rostwitter/scripts/tweet_image_server.py
index ea5e18c63..88e9a93d4 100755
--- a/rostwitter/scripts/tweet_image_server.py
+++ b/rostwitter/scripts/tweet_image_server.py
@@ -70,7 +70,7 @@ def _execute_cb(self, goal):
self.image_topic_name = goal.image_topic_name
with self.lock:
self.img[self.image_topic_name] = None
- self.sub = rospy.Subscriber(
+ sub = rospy.Subscriber(
self.image_topic_name, Image,
self._image_cb)
@@ -130,8 +130,7 @@ def _execute_cb(self, goal):
else:
rospy.logerr('cannot subscribe image: {}'.format(self.image_topic_name))
ret = self.api.post_update(goal.text)
- self.sub.unregister()
- del self.sub
+ sub.unregister()
else:
ret = self.api.post_update(goal.text)