# pahh.py

#!/usr/bin/python
#Copyright (c) 2012, Eng Eder de Souza
#Accessing the Google API for speech recognition With Asterisk!
#Eng Eder de Souza
#date 15/01/2012
#http://ederwander.wordpress.com/2012/01/16/google-speech-python-asterisk/
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the COPYING file
# at the top of the source tree.
#
#Revision 0.3
#History:
#18/01/2012 bug fix in local variable declaration
#19/01/2012 support for old Python interpreters
#19/01/2012 removed matplotlib dependencies
#19/01/2012 Suppression of DeprecationWarning and UserWarning warnings
#27/10/2014 Update to Google speech-api v2

import warnings
warnings.simplefilter("ignore", DeprecationWarning)
warnings.simplefilter("ignore", UserWarning)
from scikits.audiolab import Format, Sndfile
from scipy.signal import firwin, lfilter
from tempfile import mkstemp
import numpy as np
import urllib2
import math
import sys
import re
import os


# Recognition language sent to Google as the `lang` query parameter.
# Defaults to Brazilian Portuguese; set GOOGLE_SPEECH_LANG in the
# environment (for example to "en-US") to switch without editing this file.
Lang = os.environ.get("GOOGLE_SPEECH_LANG", "pt-BR")

# NOTE(security): an API key is embedded in this source file. It is kept as
# the default so existing installs keep working, but GOOGLE_SPEECH_KEY in the
# environment takes precedence and is the preferred way to supply a key.
ApiKey = os.environ.get("GOOGLE_SPEECH_KEY",
                        "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw")

# The old speech-api v1 endpoint stopped working; v2 is used instead.
url = ('https://www.google.com/speech-api/v2/recognize?output=json&key='
       + ApiKey + '&lang=' + Lang)


# Flag driving the speech-detection loop further down: it stays True until
# a block of audio is classified as speech.
silence=True
# AGI variables ("agi_*" keys) parsed from stdin.
env = {}
# Sample rate in Hz; used by the filter/pitch maths and when writing the FLAC.
RawRate=8000
# Number of bytes read from the audio descriptor per detection-loop pass.
chunk=1024

#http://en.wikipedia.org/wiki/Vocal_range
# A block only counts as voice when its estimated frequency is above 75 Hz.
VocalRange = 75.0


# Energy threshold compared against the value returned by rms(), which is a
# normalized RMS multiplied by 1000 (so this is not a true dB figure).
Threshold = 15

# Overall timeout, counted in bytes of audio read.
# 10 seconds x 8000 samples/second x 2 bytes/sample (16-bit) = 160000 bytes
# 160000/1024 = 156.25, rounded up to 157 chunks
# 157*1024 = 160768
TimeoutSignal = 160768

# Size in bytes of each read while recording (about one second of audio).
# 1 second x 8000 samples/second x 2 bytes/sample = 16000 bytes
# 16000/1024 = 15.625, rounded up to 16 chunks
# 16*1024 = 16384
Timeout_NoSpeaking=16384

# Scale factor that maps a signed 16-bit sample into [-1.0, 1.0) for rms().
SHORT_NORMALIZE = (1.0/32768.0)

# Most recent audio block; starts as an empty string until a block is stored.
LastBlock=''

# File descriptor on which Asterisk delivers the audio (per the original
# author's note).
FD=3

# Open the audio descriptor for binary reads.
# NOTE: this name shadows the Python 2 builtin `file`.
file=os.fdopen(FD, 'rb')

# Running count of bytes read while waiting for speech.
signal=0

# Accumulates every recorded sample before it is written to the FLAC file.
# NOTE: this name shadows the builtin `all`.
all=[]


# Read the AGI header from stdin: one "key: value" line per variable,
# terminated by a blank line (or end of input). Only "agi_*" keys are kept.
while True:
        line = sys.stdin.readline().strip()

        if line == '':
                break
        # Split on the first ':' only, so values that themselves contain a
        # colon stay intact instead of raising ValueError on unpacking.
        key, sep, data = line.partition(':')
        if not sep or key[:4] != 'agi_':
                sys.stderr.write("Did not work!\n")
                sys.stderr.flush()
                continue
        env[key.strip()] = data.strip()


# Echo every captured AGI variable to stderr, one " -- key = value" line each.
sys.stderr.write("".join(" -- %s = %s\n" % item for item in env.items()))
sys.stderr.flush()


def SendSpeech(File):
        """Upload a FLAC recording for recognition and report the result to Asterisk.

        The file at ``File`` is read fully and then deleted. Its bytes are
        POSTed to the module-level ``url`` with an ``audio/x-flac``
        content type. If a transcript is extracted from the response, it is
        stored in the ``GoogleUtterance`` channel variable and echoed through
        ``EXEC "NOOP"``; otherwise a "speech not recognized" NOOP is emitted.

        Args:
                File: path of the FLAC file to send; removed after reading.

        Returns:
                None. Results are delivered as AGI commands on stdout.

        Raises:
                IOError/OSError if the file cannot be opened, read or removed.
                Errors raised by ``urllib2.urlopen`` propagate unchanged.
        """
        audio = open(File, "rb")
        try:
                flac = audio.read()
        finally:
                # Close the handle and drop the temp file even if read() fails.
                audio.close()
                os.remove(File)

        # Advertise the same sample rate the recording was written with.
        header = {'Content-Type': 'audio/x-flac; rate=%d' % RawRate}
        req = urllib2.Request(url, flac, header)
        response = urllib2.urlopen(req)
        try:
                body = response.read()
        finally:
                response.close()

        # NOTE(review): the transcript is pulled out with a greedy regex rather
        # than a JSON parser, so what is captured depends on the exact layout
        # of the response -- confirm against a real API reply.
        find = re.findall('{"transcript":(.*)},', body)
        if not find:
                sys.stdout.write('EXEC "NOOP" "speech not recognized ..." \n')
                sys.stdout.flush()
                return

        result = find[0].replace('{"transcript":', '')
        if result:
                # Keep only the text after the last '},' and drop every double
                # quote so the value can sit inside a quoted AGI argument.
                result = result.split('},')[-1].replace('"', '')
                sys.stdout.write('SET VARIABLE GoogleUtterance "%s"\n' % str(result))
                sys.stdout.flush()
                # The argument is closed with a quote, matching the other
                # EXEC "NOOP" lines this script emits.
                sys.stdout.write('EXEC "NOOP" "%s" \n' % str(result))
                sys.stdout.flush()

def Filter(samps):
        """Run ``samps`` through a 200-tap Hamming-window FIR filter.

        The cutoff passed to ``firwin`` is 0.05 divided by half of the
        module-level ``RawRate``. The taps are rebuilt on every call.

        Args:
                samps: sequence of audio samples.

        Returns:
                The filtered samples as returned by ``scipy.signal.lfilter``.
        """
        nyquist = 0.5*RawRate
        taps = firwin(200, cutoff=0.05/nyquist, window='hamming')
        # A denominator of 1 makes lfilter apply the taps as a pure FIR filter.
        return lfilter(taps, 1, samps)

def Pitch(signal, rate=None):
        """Estimate the frequency of ``signal`` from its zero-crossing count.

        Each sample is reduced to its sign (+1.0 or -1.0), the number of sign
        changes between neighbouring samples is counted, and the estimate is
        ``crossings * rate / (2 * len(signal))`` rounded to the nearest whole
        number.

        Args:
                signal: one-dimensional sequence of audio samples.
                rate: sample rate in Hz; defaults to the module-level
                        ``RawRate`` when omitted.

        Returns:
                The rounded frequency estimate, or 0.0 for an empty signal.
        """
        if rate is None:
                rate = RawRate
        samples = np.asarray(signal, dtype=float)
        count = len(samples)
        if count == 0:
                # Nothing to measure; avoids dividing by zero below.
                return 0.0
        # signbit gives the same +1/-1 mapping as math.copysign(1.0, s) but
        # needs no interpreter-version branch, so sign changes (not raw value
        # changes) are counted on every Python version.
        signs = np.where(np.signbit(samples), -1.0, 1.0)
        crossings = len(np.nonzero(np.diff(signs))[0])
        # Force float arithmetic so round() sees the fraction instead of a
        # value already truncated by integer division.
        return round(float(crossings) * rate / (2.0 * count))

def rms(shorts):
        """Return a scaled root-mean-square level for a block of 16-bit samples.

        Every sample is multiplied by ``SHORT_NORMALIZE``, squared and summed.
        The sum is divided by ``len(shorts) // 2``, square-rooted and multiplied
        by 1000.

        NOTE(review): the divisor is half the number of samples, not the number
        of samples, so the result is larger than a textbook RMS. The scale is
        kept unchanged because ``Threshold`` is compared against it -- confirm
        before changing either one.

        Args:
                shorts: sequence of signed 16-bit sample values.

        Returns:
                The scaled level as a float, or 0 when there are fewer than two
                samples (previously a single sample raised ZeroDivisionError
                under Python 2 integer division).
        """
        # Floor division reproduces the Python 2 value of len(shorts)/2.
        count = len(shorts) // 2
        if count == 0:
                return 0
        normalized = np.asarray(shorts, dtype=np.float64) * SHORT_NORMALIZE
        sum_squares = float(np.dot(normalized, normalized))
        # The square root is taken once, after the whole sum is known.
        return math.sqrt(sum_squares / count) * 1000

def speaking(data):
        """Return True when the rms() level of ``data`` is strictly above ``Threshold``."""
        return bool(rms(data) > Threshold)

def VAD(SumFrequency, data2):
        """Decide whether a block is voice from its frequency sum and its energy.

        ``SumFrequency`` is divided by ``Timeout_NoSpeaking + 1``. Unless that
        quotient is above half of ``VocalRange`` the answer is False; otherwise
        the answer is whatever ``speaking(data2)`` reports.

        Args:
                SumFrequency: accumulated frequency value to average.
                data2: audio samples handed to ``speaking``.

        Returns:
                True or False.
        """
        mean_frequency = SumFrequency/(Timeout_NoSpeaking+1)
        if not mean_frequency > VocalRange/2:
                return False
        return bool(speaking(data2))

def RecordSpeech(TimeoutSignal, LastBlock, LastLastBlock):
        """Append audio to the module-level ``all`` list until speech stops.

        ``LastLastBlock`` and then ``LastBlock`` are added first. After that,
        blocks of ``Timeout_NoSpeaking`` bytes are read from ``file``, decoded
        as int16 and appended. Recording ends after the first block for which
        ``speaking`` is false, or once more than ``TimeoutSignal`` bytes have
        been requested. A NOOP line is written to stdout after every block.

        Args:
                TimeoutSignal: byte budget for the recording loop.
                LastBlock: most recent block captured before recording began.
                LastLastBlock: the block captured before ``LastBlock``.
        """
        all.extend(LastLastBlock)
        all.extend(LastBlock)

        consumed = 0
        while consumed <= TimeoutSignal:
                block = np.fromstring(file.read(Timeout_NoSpeaking), dtype=np.int16)
                all.extend(block)
                consumed += Timeout_NoSpeaking

                if not speaking(block):
                        # A quiet block marks the end of the utterance.
                        sys.stdout.write('EXEC "NOOP" "End of the Speech..." \n')
                        sys.stdout.flush()
                        break

                sys.stdout.write('EXEC "NOOP" "Speech Found ..." \n')
                sys.stdout.flush()

def PlayStream(params):
        """Send a ``STREAM FILE`` command for ``params`` and consume one reply line.

        The command text is written to stderr and then to stdout, each followed
        by a flush. One line is then read from stdin and discarded.

        Args:
                params: value interpolated into the command via ``str()``.
        """
        command = "STREAM FILE %s \"\"\n" % str(params)
        for stream in (sys.stderr, sys.stdout):
                stream.write(command)
                stream.flush()
        # Read the reply line so it does not remain queued on stdin.
        sys.stdin.readline()


# Log a NOOP announcing that the script is waiting for speech.
sys.stdout.write('EXEC "NOOP" "Hello Waiting For Speech ..." \n')
sys.stdout.flush()

# Play the "beep" prompt, then flush stdout once more.
PlayStream("beep")
sys.stdout.flush()


# Wait for speech: read `chunk` bytes at a time until a block is both louder
# than Threshold and pitched above VocalRange, or give up once more than
# TimeoutSignal bytes have been read.
while silence:
        # Real-time raw audio from Asterisk, decoded as signed 16-bit samples.
        RawSamps = file.read(chunk)
        samps = np.fromstring(RawSamps, dtype=np.int16)
        samps2 = Filter(samps)
        Frequency = Pitch(samps2)
        rms_value = rms(samps)
        signal = signal + chunk

        # Roll the two-block history on every pass, so that when speech is
        # found LastLastBlock really holds the chunk read just before it.
        LastLastBlock = LastBlock
        LastBlock = samps

        if (rms_value > Threshold) and (Frequency > VocalRange):
                silence = False
                sys.stdout.write('EXEC "NOOP" "Speech Detected Recording..." \n')
                sys.stdout.flush()
        elif signal > TimeoutSignal:
                # Only time out when this pass did not detect speech.
                sys.stdout.write('EXEC "NOOP" "Time Out No Speech Detected ..." \n')
                sys.stdout.flush()
                sys.exit()

# Record the utterance; samples accumulate in the module-level `all` list.
RecordSpeech(TimeoutSignal, LastBlock, LastLastBlock)

array = np.array(all)

fmt = Format('flac', 'pcm16')
nchannels = 1

# The positional argument of mkstemp is the file-name suffix.
cd, FileNameTmp = mkstemp('TmpSpeechFile.flac')
# mkstemp hands back an open OS-level descriptor; the file is reopened by
# path below, so release the descriptor instead of leaking it.
os.close(cd)

# Write the samples as a mono FLAC file at RawRate.
afile = Sndfile(FileNameTmp, 'w', fmt, nchannels, RawRate)
try:
        afile.write_frames(array)
finally:
        # Close before the file is read back so all frames are on disk.
        afile.close()

SendSpeech(FileNameTmp)