Implementing Reactions
Basically, JARVIS can respond to a command in two ways: On the one hand, JARVIS can generate a voice output to respond directly to the user. On the other hand, you can define actions that will be executed when a command is recognized. Of course, both are possible at the same time.
The voice output to be generated and the action to be performed are simply stored in a JSON file. In the following example, the command "who are you" generates the voice response "I am Jarvis..." but no further action, while the command "time" has no voice response, but action 1 is executed.
[ { "command": "who are you", "response": "I am Jarvis, an artificial intelligence", "action": 0 }, { "command": "time", "response": "", "action": 1 } ]
Voice Responses
The class "TextSynthesizer" is responsible for the voice responses and uses the library "pyttsx3" to generate the voice output. Pyttsx3 is a Python text-to-speech conversion library; unlike alternative libraries, it works offline and is compatible with both Python 2 and 3.
#Installation pip install pyttsx3
In the __init__ method, the voice parameter specifies which voice (from the operating system) should be used. The call "self.synthesizer.setProperty('rate', 150)" sets the speaking speed.
class TextSynthesizer():
    """Generates spoken output from text using the offline pyttsx3 engine."""

    def __init__(self, voice):
        # initialise the text-to-speech engine
        self.synthesizer = pyttsx3.init()
        # set output rate = speed, higher means faster
        self.synthesizer.setProperty('rate', 150)
        # select one of the voices installed on the operating system by index
        installed_voices = self.synthesizer.getProperty('voices')
        self.synthesizer.setProperty('voice', installed_voices[voice].id)

    def speak(self, text):
        """Speak *text* synchronously, then stop the engine."""
        self.synthesizer.say(text)
        self.synthesizer.runAndWait()
        self.synthesizer.stop()
Actions
In the ActionHandler class I have implemented a few examples of actions. Of course, the possible actions depend strongly on the environment on which JARVIS runs. Among other things I have examples for reading out the current time or opening the Google web page.
class Actionhandler:
    """Executes the action referenced by a command's integer "action" code."""

    def __init__(self):
        # voice index 1 of the OS voices is used for all spoken feedback
        self.synthesizer = TextSynthesizer(1)

    def handleAction(self, action):
        """Perform the action identified by *action*.

        0 -> no action, 1 -> speak the current time, 2 -> speak the date,
        3 -> open the Google web page, 4 -> speak system information.
        """
        if action == 0:
            return
        if action == 1:
            # read the current time
            now = datetime.datetime.now()
            self.synthesizer.speak(now.strftime('The time right now is %H hours and %M minutes'))
        elif action == 2:
            now = datetime.datetime.now()
            self.synthesizer.speak(now.strftime('The date is %b %d, %Y'))
        elif action == 3:
            webbrowser.open("https://www.google.com")
        elif action == 4:
            # get system information
            # BUG FIX: the original built a tuple ("...", value) instead of a
            # string; pyttsx3's say() expects a plain string, so concatenate.
            self.synthesizer.speak("I'm running on " + platform.system())
            self.synthesizer.speak(platform.release())
            self.synthesizer.speak("my IP Address is " + socket.gethostbyname(socket.gethostname()))
Full Code
# JARVIS
# Simple Voice Recognition
#
# JARVIS is the main class
# -> CommandMatcher tries to find a matching command from the command list file (commands.json)
# -> ActionHandler is used to perform actions like opening a browser or telling the current time ...
# -> TextSynthesizer is required to generate the voice output
#
# Stefan Hager, 2023

import json
import sys

import pyaudio
import deepspeech
import numpy as np
from queue import SimpleQueue

from CommandMatcher import CommandMatcher
from TextSynthesizer import TextSynthesizer
from ActionHandler import Actionhandler


class JARVIS:
    """Couples DeepSpeech speech-to-text with command matching and actions."""

    def __init__(self):
        # TextSynthesizer to generate synthetic speech output
        self.synthesizer = TextSynthesizer(1)
        # Action handler to get some actions done
        self.actionhandler = Actionhandler()

        self.model = deepspeech.Model("deepspeech-0.9.3-models.pbmm")
        self.model.setBeamWidth(512)

        # direct audio processing
        self.audio = pyaudio.PyAudio()
        self.index, name = self.findAudioDevice(self.audio, 'pulse')
        print("selected audio device : ", name)

        # read command file
        try:
            with open('commands.json') as json_file:
                self.commands = json.load(json_file)
        # BUG FIX: the original used a bare "except:", which also swallows
        # KeyboardInterrupt/SystemExit; catch only the file/parse errors.
        except (OSError, json.JSONDecodeError):
            print("No command file found ... terminating.")
            sys.exit()

    def waitForCommand(self):
        """Listen on the microphone and return the recognized text.

        Stops as soon as a command matches or after ~10 decode iterations
        without a match (so stale audio does not accumulate).
        """
        # important: create new queue to make sure it is empty
        self.buffer_queue = SimpleQueue()
        self.stream = self.model.createStream()
        buffer_size = self.model.sampleRate()
        self.audio_stream = self.audio.open(rate=self.model.sampleRate(),
                                            channels=1,
                                            format=self.audio.get_format_from_width(2, unsigned=False),
                                            input_device_index=self.index,
                                            input=True,
                                            frames_per_buffer=buffer_size,
                                            stream_callback=self.audio_callback)

        # BUG FIX: initialise text so it is defined even if the stream
        # is already inactive and the loop body never runs
        text = ""
        iteration_without_match = 0
        while self.audio_stream.is_active():
            self.stream.feedAudioContent(self.buffer_queue.get())
            text = self.stream.intermediateDecode()
            print(">>", text)
            iteration_without_match += 1
            if self.processCommand(text):
                break
            if iteration_without_match > 10:
                # if there is no match for a longer period -> clear stream and buffer
                break

        self.stream.finishStream()
        self.audio_stream.close()
        return text

    # callback for new data in the audio stream
    def audio_callback(self, in_data, frame_count, time_info, status_flags):
        self.buffer_queue.put(np.frombuffer(in_data, dtype='int16'))
        return (None, pyaudio.paContinue)

    def findAudioDevice(self, pyaudio, device_name):
        '''Find a specific input device by name or return the default input device.'''
        default = pyaudio.get_default_input_device_info()
        for i in range(pyaudio.get_device_count()):
            name = pyaudio.get_device_info_by_index(i)['name']
            if name == device_name:
                return (i, name)
        return (default['index'], default['name'])

    # does the recognized text match any command?
    def processCommand(self, text):
        for com in self.commands:
            match = CommandMatcher.matchCommand(com["command"], text)
            if match:
                self.synthesizer.speak(com["response"])
                self.actionhandler.handleAction(int(com["action"]))
                return True
        return False


x = JARVIS()
stop = False
while not stop:
    text = x.waitForCommand()
    if text == "stop":
        stop = True
print("finished...")
# JARVIS
# Simple Voice Recognition
# -> CommandMatcher tries to find a matching command from the command list file (commands.json)
# Stefan Hager, 2023

from difflib import SequenceMatcher


class CommandMatcher:
    """Decides whether recognized speech text triggers a command phrase."""

    @staticmethod
    def matchCommand(command, text):
        """Return True when *command* occurs verbatim in *text*, or when a
        one-word command is at least 75% similar to some word of *text*."""
        # verbatim substring -> direct match
        if command in text:
            print(">>> full match : ", command)
            return True

        # fuzzy comparison is only attempted for single-word commands
        command_words = command.split()
        if len(command_words) == 1:
            target = command_words[0]
            for spoken_word in text.split():
                similarity = SequenceMatcher(None, spoken_word, target).ratio()
                if similarity >= 0.75:
                    print(">>> partial match : ", spoken_word, " r : ", similarity)
                    return True
        return False
# JARVIS
# Simple Voice Recognition
# -> ActionHandler is used to perform actions like opening a browser or telling the current time ...
# Stefan Hager, 2023

# FIX: "import datetime" was duplicated in the original; imported once here.
import datetime
import platform
import socket
import webbrowser

from TextSynthesizer import TextSynthesizer


class Actionhandler:
    """Maps the integer "action" codes from commands.json to real effects."""

    def __init__(self):
        # voice index 1 of the OS voices is used for all spoken feedback
        self.synthesizer = TextSynthesizer(1)

    def handleAction(self, action):
        """Execute the action with the given integer code.

        Codes 0-6 and 9 are placeholders (no effect); 7 speaks the time,
        8 speaks the date, 10 opens Google, 11 opens an empty URL,
        12 speaks system information.
        """
        if action == 7:
            # time
            now = datetime.datetime.now()
            self.synthesizer.speak(now.strftime('The time right now is %H hours and %M minutes'))
        elif action == 8:
            now = datetime.datetime.now()
            self.synthesizer.speak(now.strftime('The date is %b %d, %Y'))
        elif action == 10:
            webbrowser.open("https://www.google.com")
        elif action == 11:
            # NOTE(review): opening an empty URL relies on browser/OS behavior;
            # presumably meant to open the browser's start page - verify.
            webbrowser.open("")
        elif action == 12:
            # get system information
            # BUG FIX: the original passed a tuple ("...", value) to speak();
            # pyttsx3's say() expects a plain string, so build real strings.
            self.synthesizer.speak("I'm running on " + platform.system())
            self.synthesizer.speak(platform.release())
            self.synthesizer.speak("my IP Address is " + socket.gethostbyname(socket.gethostname()))
        # all other codes (0-6, 9, ...) are intentionally no-ops
# JARVIS
# Simple Voice Recognition
# -> TextSynthesizer is required to generate the voice output
# Stefan Hager, 2023

import pyttsx3


class TextSynthesizer():
    """Thin wrapper around pyttsx3 for offline text-to-speech output."""

    def __init__(self, voice):
        # engine for offline speech synthesis
        self.synthesizer = pyttsx3.init()
        # set output rate = speed, higher means faster
        self.synthesizer.setProperty('rate', 150)
        # choose the operating-system voice selected by index
        available = self.synthesizer.getProperty('voices')
        self.synthesizer.setProperty('voice', available[voice].id)

    def speak(self, text):
        """Queue *text*, speak it synchronously, then stop the engine."""
        self.synthesizer.say(text)
        self.synthesizer.runAndWait()
        self.synthesizer.stop()
[ { "command": "who are you", "response": "I am Jarvis, an artificial intelligence", "action": 0 }, { "command": "how are you", "response": "I am fine, up and running", "action": 0 }, { "command": "hello", "response": "Hi, nice to meet you", "action": 0 }, { "command": "master", "response": "my master is Stefan Hager, the greatest of all times", "action": 0 }, { "command": "stop", "response": "Goodbye", "action": 1 }, { "command": "lights on", "response": "Copy, switching lights on", "action": 2 }, { "command": "lights of", "response": "Copy, switching lights off", "action": 3 }, { "command": "power on", "response": "Copy, power on", "action": 4 }, { "command": "power of", "response": "Copy, power off", "action": 5 }, { "command": "message", "response": "Engines armed, power on, ready for takeoff", "action": 0 }, { "command": "status", "response": "Up and running", "action": 0 }, { "command": "time", "response": "", "action": 7 }, { "command": "date", "response": "", "action": 8 }, { "command": "weather", "response": "Weather forecast is not yet implemented", "action": 9 }, { "command": "google", "response": "Copy, start google", "action": 10 }, { "command": "browser", "response": "Copy, opening browser", "action": 11 }, { "command": "system", "response": "", "action": 12 } ]