Implementing Reactions
Basically, JARVIS can respond to a command in two ways: On the one hand, JARVIS can generate a voice output to respond directly to the user. On the other hand, you can define actions that will be executed when a command is recognized. Of course, both are possible at the same time.
The voice output to be generated and the action to be performed are simply stored in a JSON file. In the following example, the command "who are you" generates the voice response "I am Jarvis..." but no further action, while the command "time" has no voice response, but action 1 is executed.
[ { "command": "who are you", "response": "I am Jarvis, an artificial intelligence", "action": 0 }, { "command": "time", "response": "", "action": 1 } ]
Voice Responses
The class "TextSynthesizer" is responsible for the voice responses and uses the library "pyttsx3" to generate the voice output. Pyttsx3 is a Python text-to-speech conversion library; unlike alternative libraries, it works offline and is compatible with both Python 2 and 3.
#Installation pip install pyttsx3
In the __init__ method, the voice parameter specifies which voice (from the operating system) should be used. The call "self.synthesizer.setProperty('rate', 150)" sets the speaking speed.
class TextSynthesizer():
    """Generates spoken output from text using the offline pyttsx3 engine."""

    def __init__(self, voice):
        # initialise the text-to-speech engine
        self.synthesizer = pyttsx3.init()
        # set output rate = speed, higher means faster
        self.synthesizer.setProperty('rate', 150)
        # select one of the voices installed on the operating system by index
        installed_voices = self.synthesizer.getProperty('voices')
        self.synthesizer.setProperty('voice', installed_voices[voice].id)

    def speak(self, text):
        """Speak *text* synchronously, then stop the engine."""
        self.synthesizer.say(text)
        self.synthesizer.runAndWait()
        self.synthesizer.stop()
Actions
In the ActionHandler class I have implemented a few examples of actions. Of course, the possible actions depend strongly on the environment on which JARVIS runs. Among other things I have examples for reading out the current time or opening the Google web page.
class Actionhandler:
    """Executes the action referenced by a command's integer "action" code."""

    def __init__(self):
        # voice index 1 of the OS voices is used for all spoken feedback
        self.synthesizer = TextSynthesizer(1)

    def handleAction(self, action):
        """Perform the action identified by *action*.

        0 -> no action, 1 -> speak the current time, 2 -> speak the date,
        3 -> open the Google web page, 4 -> speak system information.
        """
        if action == 0:
            return
        if action == 1:
            # read the current time
            now = datetime.datetime.now()
            self.synthesizer.speak(now.strftime('The time right now is %H hours and %M minutes'))
        elif action == 2:
            now = datetime.datetime.now()
            self.synthesizer.speak(now.strftime('The date is %b %d, %Y'))
        elif action == 3:
            webbrowser.open("https://www.google.com")
        elif action == 4:
            # get system information
            # BUG FIX: the original built a tuple ("...", value) instead of a
            # string; pyttsx3's say() expects a plain string, so concatenate.
            self.synthesizer.speak("I'm running on " + platform.system())
            self.synthesizer.speak(platform.release())
            self.synthesizer.speak("my IP Address is " + socket.gethostbyname(socket.gethostname()))
Full Code
# JARVIS
# Simple Voice Recognition
#
# JARVIS is the main class
# -> CommandMatcher tries to find a matching command from the command list file (commands.json)
# -> ActionHandler is used to perform actions like opening a browser or telling the current time ...
# -> TextSynthesizer is required to generate the voice output
#
# Stefan Hager, 2023

import json
import sys

import pyaudio
import deepspeech
import numpy as np
from queue import SimpleQueue

from CommandMatcher import CommandMatcher
from TextSynthesizer import TextSynthesizer
from ActionHandler import Actionhandler


class JARVIS:
    """Couples DeepSpeech speech-to-text with command matching and actions."""

    def __init__(self):
        # TextSynthesizer to generate synthetic speech output
        self.synthesizer = TextSynthesizer(1)
        # Action handler to get some actions done
        self.actionhandler = Actionhandler()

        self.model = deepspeech.Model("deepspeech-0.9.3-models.pbmm")
        self.model.setBeamWidth(512)

        # direct audio processing
        self.audio = pyaudio.PyAudio()
        self.index, name = self.findAudioDevice(self.audio, 'pulse')
        print("selected audio device : ", name)

        # read command file
        try:
            with open('commands.json') as json_file:
                self.commands = json.load(json_file)
        # BUG FIX: the original used a bare "except:", which also swallows
        # KeyboardInterrupt/SystemExit; catch only the file/parse errors.
        except (OSError, json.JSONDecodeError):
            print("No command file found ... terminating.")
            sys.exit()

    def waitForCommand(self):
        """Listen on the microphone and return the recognized text.

        Stops as soon as a command matches or after ~10 decode iterations
        without a match (so stale audio does not accumulate).
        """
        # important: create new queue to make sure it is empty
        self.buffer_queue = SimpleQueue()
        self.stream = self.model.createStream()
        buffer_size = self.model.sampleRate()
        self.audio_stream = self.audio.open(rate=self.model.sampleRate(),
                                            channels=1,
                                            format=self.audio.get_format_from_width(2, unsigned=False),
                                            input_device_index=self.index,
                                            input=True,
                                            frames_per_buffer=buffer_size,
                                            stream_callback=self.audio_callback)

        # BUG FIX: initialise text so it is defined even if the stream
        # is already inactive and the loop body never runs
        text = ""
        iteration_without_match = 0
        while self.audio_stream.is_active():
            self.stream.feedAudioContent(self.buffer_queue.get())
            text = self.stream.intermediateDecode()
            print(">>", text)
            iteration_without_match += 1
            if self.processCommand(text):
                break
            if iteration_without_match > 10:
                # if there is no match for a longer period -> clear stream and buffer
                break

        self.stream.finishStream()
        self.audio_stream.close()
        return text

    # callback for new data in the audio stream
    def audio_callback(self, in_data, frame_count, time_info, status_flags):
        self.buffer_queue.put(np.frombuffer(in_data, dtype='int16'))
        return (None, pyaudio.paContinue)

    def findAudioDevice(self, pyaudio, device_name):
        '''Find a specific input device by name or return the default input device.'''
        default = pyaudio.get_default_input_device_info()
        for i in range(pyaudio.get_device_count()):
            name = pyaudio.get_device_info_by_index(i)['name']
            if name == device_name:
                return (i, name)
        return (default['index'], default['name'])

    # does the recognized text match any command?
    def processCommand(self, text):
        for com in self.commands:
            match = CommandMatcher.matchCommand(com["command"], text)
            if match:
                self.synthesizer.speak(com["response"])
                self.actionhandler.handleAction(int(com["action"]))
                return True
        return False


x = JARVIS()
stop = False
while not stop:
    text = x.waitForCommand()
    if text == "stop":
        stop = True
print("finished...")
# JARVIS
# Simple Voice Recognition
# -> CommandMatcher tries to find a matching command from the command list file (commands.json)
# Stefan Hager, 2023

from difflib import SequenceMatcher


class CommandMatcher:
    """Decides whether recognized speech text triggers a command phrase."""

    @staticmethod
    def matchCommand(command, text):
        """Return True when *command* occurs verbatim in *text*, or when a
        one-word command is at least 75% similar to some word of *text*."""
        # verbatim substring -> direct match
        if command in text:
            print(">>> full match : ", command)
            return True

        # fuzzy comparison is only attempted for single-word commands
        command_words = command.split()
        if len(command_words) == 1:
            target = command_words[0]
            for spoken_word in text.split():
                similarity = SequenceMatcher(None, spoken_word, target).ratio()
                if similarity >= 0.75:
                    print(">>> partial match : ", spoken_word, " r : ", similarity)
                    return True
        return False
# JARVIS
# Simple Voice Recognition
# -> ActionHandler is used to perform actions like opening a browser or telling the current time ...
# Stefan Hager, 2023

# FIX: "import datetime" was duplicated in the original; imported once here.
import datetime
import platform
import socket
import webbrowser

from TextSynthesizer import TextSynthesizer


class Actionhandler:
    """Maps the integer "action" codes from commands.json to real effects."""

    def __init__(self):
        # voice index 1 of the OS voices is used for all spoken feedback
        self.synthesizer = TextSynthesizer(1)

    def handleAction(self, action):
        """Execute the action with the given integer code.

        Codes 0-6 and 9 are placeholders (no effect); 7 speaks the time,
        8 speaks the date, 10 opens Google, 11 opens an empty URL,
        12 speaks system information.
        """
        if action == 7:
            # time
            now = datetime.datetime.now()
            self.synthesizer.speak(now.strftime('The time right now is %H hours and %M minutes'))
        elif action == 8:
            now = datetime.datetime.now()
            self.synthesizer.speak(now.strftime('The date is %b %d, %Y'))
        elif action == 10:
            webbrowser.open("https://www.google.com")
        elif action == 11:
            # NOTE(review): opening an empty URL relies on browser/OS behavior;
            # presumably meant to open the browser's start page - verify.
            webbrowser.open("")
        elif action == 12:
            # get system information
            # BUG FIX: the original passed a tuple ("...", value) to speak();
            # pyttsx3's say() expects a plain string, so build real strings.
            self.synthesizer.speak("I'm running on " + platform.system())
            self.synthesizer.speak(platform.release())
            self.synthesizer.speak("my IP Address is " + socket.gethostbyname(socket.gethostname()))
        # all other codes (0-6, 9, ...) are intentionally no-ops
# JARVIS
# Simple Voice Recognition
# -> TextSynthesizer is required to generate the voice output
# Stefan Hager, 2023

import pyttsx3


class TextSynthesizer():
    """Thin wrapper around pyttsx3 for offline text-to-speech output."""

    def __init__(self, voice):
        # engine for offline speech synthesis
        self.synthesizer = pyttsx3.init()
        # set output rate = speed, higher means faster
        self.synthesizer.setProperty('rate', 150)
        # choose the operating-system voice selected by index
        available = self.synthesizer.getProperty('voices')
        self.synthesizer.setProperty('voice', available[voice].id)

    def speak(self, text):
        """Queue *text*, speak it synchronously, then stop the engine."""
        self.synthesizer.say(text)
        self.synthesizer.runAndWait()
        self.synthesizer.stop()
[ { "command": "who are you", "response": "I am Jarvis, an artificial intelligence", "action": 0 }, { "command": "how are you", "response": "I am fine, up and running", "action": 0 }, { "command": "hello", "response": "Hi, nice to meet you", "action": 0 }, { "command": "master", "response": "my master is Stefan Hager, the greatest of all times", "action": 0 }, { "command": "stop", "response": "Goodbye", "action": 1 }, { "command": "lights on", "response": "Copy, switching lights on", "action": 2 }, { "command": "lights of", "response": "Copy, switching lights off", "action": 3 }, { "command": "power on", "response": "Copy, power on", "action": 4 }, { "command": "power of", "response": "Copy, power off", "action": 5 }, { "command": "message", "response": "Engines armed, power on, ready for takeoff", "action": 0 }, { "command": "status", "response": "Up and running", "action": 0 }, { "command": "time", "response": "", "action": 7 }, { "command": "date", "response": "", "action": 8 }, { "command": "weather", "response": "Weather forecast is not yet implemented", "action": 9 }, { "command": "google", "response": "Copy, start google", "action": 10 }, { "command": "browser", "response": "Copy, opening browser", "action": 11 }, { "command": "system", "response": "", "action": 12 } ]