Source code for psychopy.sound.transcribe

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Classes and functions for transcribing speech in audio data to text.
"""

# Part of the PsychoPy library
# Copyright (C) 2002-2018 Jonathan Peirce (C) 2019-2022 Open Science Tools Ltd.
# Distributed under the terms of the GNU General Public License (GPL).

__all__ = [
    'TranscriptionResult',
    'transcribe',
    'TRANSCR_LANG_DEFAULT',
    'recognizerEngineValues',
    'recognizeSphinx',
    'recognizeGoogle'
]

import os
import psychopy.logging as logging
from psychopy.alerts import alert
from pathlib import Path
from psychopy.preferences import prefs
from .audioclip import *
from .exceptions import *

# ------------------------------------------------------------------------------
# Initialize the speech recognition system
#

_hasSpeechRecognition = True
try:
    import speech_recognition as sr
except (ImportError, ModuleNotFoundError):
    logging.warning(
        "Speech-to-text recognition module for PocketSphinx is not available "
        "(use command `pip install SpeechRecognition` to get it). "
        "Transcription will be unavailable using that service this session.")
    _hasSpeechRecognition = False

# Google Cloud API
_hasGoogleCloud = True
_googleCloudClient = None  # client for Google Cloud, instanced on first use
try:
    import google.cloud.speech
    import google.auth.exceptions
except (ImportError, ModuleNotFoundError):
    logging.warning(
        "Speech-to-text recognition using Google online services is not "
        "available (use command `pip install google-api-core google-auth "
        "google-cloud google-cloud-speech googleapis-common-protos` to get "
        "it). Transcription will be unavailable using that service this "
        "session.")
    _hasGoogleCloud = False

try:
    import pocketsphinx
    sphinxLangs = [folder.stem for folder
                   in Path(pocketsphinx.get_model_path()).glob('??-??')]
    haveSphinx = True
except (ImportError, ModuleNotFoundError):
    haveSphinx = False
    sphinxLangs = None

# Constants related to the transcription system.
TRANSCR_LANG_DEFAULT = 'en-US'

# Values for specifying recognizer engines. This dictionary is used by Builder
# to populate the component property dropdown.
recognizerEngineValues = {
    0: ('sphinx', "CMU Pocket Sphinx", "Offline, Built-in"),
    1: ('google', "Google Cloud Speech API", "Online, Key Required"),
}

# Get references to recognizers for various supported speech-to-text engines
# available through the `SpeechRecognition` package.
if _hasSpeechRecognition:
    _recogBase = sr.Recognizer()


# ------------------------------------------------------------------------------
# Classes and functions for speech-to-text transcription
#

[docs]class TranscriptionResult: """Descriptor for returned transcription data. Fields within this class can be used to access transcribed words and other information related to the transcription request. This is returned by functions and methods which perform speech-to-text transcription from audio data within PsychoPy. The user usually does not create instances of this class themselves. Parameters ---------- words : list of str Words extracted from the audio clip. unknownValue : bool `True` if the transcription API failed make sense of the audio and did not complete the transcription. requestFailed : bool `True` if there was an error with the transcriber itself. For instance, network error or improper formatting of the audio data. engine : str Name of engine used to perform this transcription. language : str Identifier for the language used to perform the transcription. """ __slots__ = [ '_words', '_confidence', # unused on Python for now '_engine', '_language', '_expectedWords', '_requestFailed', '_unknownValue'] def __init__(self, words, unknownValue, requestFailed, engine, language): self.words = words self.unknownValue = unknownValue self.requestFailed = requestFailed self.engine = engine self.language = language def __repr__(self): return (f"TranscriptionResult(words={self._words}, " f"unknownValue={self._unknownValue}, ", f"requestFailed={self._requestFailed}, ", f"engine={self._engine}, ", f"language={self._language})") def __str__(self): return " ".join(self._words) @property def wordCount(self): """Number of words found (`int`).""" return len(self._words) @property def words(self): """Words extracted from the audio clip (`list` of `str`).""" return self._words @words.setter def words(self, value): self._words = list(value) @property def success(self): """`True` if the transcriber returned a result successfully (`bool`).""" return not (self._unknownValue or self._requestFailed) @property def error(self): """`True` if there was an error during transcription (`bool`). Value is always the compliment of `.success`.""" return not self.success @property def unknownValue(self): """`True` if the transcription API failed make sense of the audio and did not complete the transcription (`bool`). """ return self._unknownValue @unknownValue.setter def unknownValue(self, value): self._unknownValue = bool(value) @property def requestFailed(self): """`True` if there was an error with the transcriber itself (`bool`). For instance, network error or improper formatting of the audio data, invalid key, or if there was network connection error. """ return self._requestFailed @requestFailed.setter def requestFailed(self, value): self._requestFailed = bool(value) @property def engine(self): """Name of engine used to perform this transcription (`str`). """ return self._engine @engine.setter def engine(self, value): if value == 'sphinx': if not haveSphinx: raise ModuleNotFoundError( "To perform built-in (local) transcription you need to " "have pocketsphinx installed (pip install pocketsphinx)") self._engine = str(value) @property def language(self): """Identifier for the language used to perform the transcription (`str`). """ return self._language @language.setter def language(self, value): self._language = str(value)
# empty result returned when a transcriber is given no data NULL_TRANSCRIPTION_RESULT = TranscriptionResult( words=[''], unknownValue=False, requestFailed=False, engine='null', language=TRANSCR_LANG_DEFAULT ) def transcribe(audioClip, engine='sphinx', language='en-US', expectedWords=None, config=None): """Convert speech in audio to text. This feature passes the audio clip samples to a specified text-to-speech engine which will attempt to transcribe any speech within. The efficacy of the transcription depends on the engine selected, audio quality, and language support. By default, Pocket Sphinx is used which provides decent transcription capabilities offline for English and a few other languages. For more robust transcription capabilities with a greater range of language support, online providers such as Google may be used. Speech-to-text conversion blocks the main application thread when used on Python. Don't transcribe audio during time-sensitive parts of your experiment! This issue is known to the developers and will be fixed in a later release. Parameters ---------- audioClip : :class:`~psychopy.sound.AudioClip` or tuple Audio clip containing speech to transcribe (e.g., recorded from a microphone). Can be either an :class:`~psychopy.sound.AudioClip` object or tuple where the first value is as a Nx1 or Nx2 array of audio samples (`ndarray`) and the second the sample rate (`int`) in Hertz (e.g., ``(samples, 480000)``). engine : str Speech-to-text engine to use. Can be one of 'sphinx' for CMU Pocket Sphinx or 'google' for Google Cloud. language : str BCP-47 language code (eg., 'en-US'). Note that supported languages vary between transcription engines. expectedWords : list or tuple List of strings representing expected words or phrases. This will constrain the possible output words to the ones specified. Note not all engines support this feature (only Sphinx and Google Cloud do at this time). A warning will be logged if the engine selected does not support this feature. CMU PocketSphinx has an additional feature where the sensitivity can be specified for each expected word. You can indicate the sensitivity level to use by putting a ``:`` after each word in the list (see the Example below). Sensitivity levels range between 0 and 100. A higher number results in the engine being more conservative, resulting in a higher likelihood of false rejections. The default sensitivity is 80% for words/phrases without one specified. config : dict or None Additional configuration options for the specified engine. These are specified using a dictionary (ex. `config={'pfilter': 1}` will enable the profanity filter when using the `'google'` engine). Returns ------- :class:`~psychopy.sound.transcribe.TranscriptionResult` Transcription result. Notes ----- * Online transcription services (eg., Google) provide robust and accurate speech recognition capabilities with broader language support than offline solutions. However, these services may require a paid subscription to use, reliable broadband internet connections, and may not respect the privacy of your participants as their responses are being sent to a third-party. Also consider that a track of audio data being sent over the network can be large, users on metered connections may incur additional costs to run your experiment. * If the audio clip has multiple channels, they will be combined prior to being passed to the transcription service if needed. Examples -------- Use a voice command as a response to a task:: # after doing microphone recording resp = mic.getRecording() transcribeResults = transcribe(resp) if transcribeResults.success: # successful transcription words = transcribeResults.words if 'hello' in words: print('You said hello.') Specifying expected words with sensitivity levels when using CMU Pocket Sphinx: # expected words 90% sensitivity on the first two, default for the rest expectedWords = ['right:90', 'left:90', 'up', 'down'] transcribeResults = transcribe( resp.samples, resp.sampleRateHz, expectedWords=expectedWords) if transcribeResults.success: # successful transcription # process results ... Specifying the API key to use Google's Cloud service for speech-to-text:: # set the environment variable import os os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \ "C:\\path\\to\\my\\key.json" # you can now call the transcriber ... results = transcribe( myRecording, engine='google', expectedWords=['left', 'right']) if results.success: print("You said: {}".format(results.words[0])) """ # check if the engine parameter is valid engine = engine.lower() # make lower case # check if we have necessary keys if engine in ('google',): alert(4615, strFields={'engine': engine}) # if we got a tuple, convert to audio clip object if isinstance(audioClip, (tuple, list,)): samples, sampleRateHz = audioClip audioClip = AudioClip(samples, sampleRateHz) # pass data over to the appropriate engine for transcription if engine in ('sphinx', 'built-in'): return recognizeSphinx( audioClip, language=language, expectedWords=expectedWords, config=config) elif engine == 'google': return recognizeGoogle( audioClip, language=language, expectedWords=expectedWords, config=config) else: raise ValueError( f'Parameter `engine` for `transcribe()` should be one of ' f'"sphinx", "built-in" or "google" not "{engine}"') def _parseExpectedWords(wordList, defaultSensitivity=80): """Parse expected words list. This function is used internally by other functions and classes within the `transcribe` module. Expected words or phrases are usually specified as a list of strings. CMU Pocket Sphinx allows for additional 'sensitivity' values for each phrase ranging from *0* to *100*. This function will generate to lists, first with just words and another with specified sensitivity values. This allows the user to specify sensitivity levels which can be ignored if the recognizer engine does not support it. Parameters ---------- wordList : list of str List of words of phrases. Sensitivity levels for each can be specified by putting a value at the end of each string separated with a colon `:`. For example, ``'hello:80'`` for 80% sensitivity on 'hello'. Values are normalized between *0.0* and *1.0* when returned. defaultSensitivity : int or float Default sensitivity to use if a word does not have one specified between 0 and 100%. Returns ------- tuple Returns list of expected words and list of normalized sensitivities for each. Examples -------- Specifying expected words to CMU Pocket Sphinx:: words = [('hello:95', 'bye:50')] expectedWords = zip(_parseExpectedWords(words)) """ defaultSensitivity = defaultSensitivity / 100. # normalized sensitivities = [] if wordList is not None: # sensitivity specified as `word:80` wordListTemp = [] for word in wordList: wordAndSense = word.split(':') if len(wordAndSense) == 2: # specified as `word:80` word, sensitivity = wordAndSense sensitivity = int(sensitivity) / 100. else: word = wordAndSense[0] sensitivity = defaultSensitivity # default is 80% confidence wordListTemp.append(word) sensitivities.append(sensitivity) wordList = wordListTemp return wordList, sensitivities # ------------------------------------------------------------------------------ # Recognizers # # These functions are used to send off audio and configuration data to the # indicated speech-to-text engine. Most of these functions are synchronous, # meaning they block the application until they return. Don't run these in any # time critical parts of your program. # def recognizeSphinx(audioClip=None, language='en-US', expectedWords=None, config=None): """Perform speech-to-text conversion on the provided audio samples using CMU Pocket Sphinx. Parameters ---------- audioClip : :class:`~psychopy.sound.AudioClip` or None Audio clip containing speech to transcribe (e.g., recorded from a microphone). Specify `None` to open a client without performing a transcription, this will reduce latency when the transcriber is invoked in successive calls. language : str BCP-47 language code (eg., 'en-US'). Should match the language which the speaker is using. Pocket Sphinx requires language packs to be installed locally. expectedWords : list or None List of strings representing expected words or phrases. This will attempt bias the possible output words to the ones specified if the engine is uncertain. Sensitivity can be specified for each expected word. You can indicate the sensitivity level to use by putting a ``:`` after each word in the list (see the Example below). Sensitivity levels range between 0 and 100. A higher number results in the engine being more conservative, resulting in a higher likelihood of false rejections. The default sensitivity is 80% for words/phrases without one specified. config : dict or None Additional configuration options for the specified engine. Returns ------- TranscriptionResult Transcription result object. """ if not haveSphinx: # does not have Sphinx raise RecognizerEngineNotFoundError() # warmup the engine, not used here but needed for compatibility if audioClip is None: return NULL_TRANSCRIPTION_RESULT # check if we have a valid audio clip if not isinstance(audioClip, AudioClip): raise TypeError( "Expected parameter `audioClip` to have type " "`psychopy.sound.AudioClip`.") # engine configuration config = {} if config is None else config if not isinstance(config, dict): raise TypeError( "Invalid type for parameter `config` specified, must be `dict` " "or `None`.") if not isinstance(language, str): raise TypeError( "Invalid type for parameter `language`, must be type `str`.") language = language.lower() if language not in sphinxLangs: # missing a language pack error url = "https://sourceforge.net/projects/cmusphinx/files/" \ "Acoustic%20and%20Language%20Models/" msg = (f"Language `{language}` is not installed for " f"`pocketsphinx`. You can download languages here: {url}. " f"Install them here: {pocketsphinx.get_model_path()}") raise RecognizerLanguageNotSupportedError(msg) # configure the recognizer config['language'] = language # sphinx users en-us not en-US config['show_all'] = False if expectedWords is not None: words, sens = _parseExpectedWords(expectedWords) config['keyword_entries'] = tuple(zip(words, sens)) # convert audio to format for transcription sampleWidth = 2 # two bytes per sample audioData = sr.AudioData( audioClip.asMono().convertToWAV(), sample_rate=audioClip.sampleRateHz, sample_width=sampleWidth) # submit audio samples to the API respAPI = '' unknownValueError = requestError = False try: respAPI = _recogBase.recognize_sphinx(audioData, **config) except sr.UnknownValueError: unknownValueError = True except sr.RequestError: requestError = True # remove empty words result = [word for word in respAPI.split(' ') if word != ''] # object to return containing transcription data toReturn = TranscriptionResult( words=result, unknownValue=unknownValueError, requestFailed=requestError, engine='sphinx', language=language) # split only if the user does not want the raw API data return toReturn def recognizeGoogle(audioClip=None, language='en-US', expectedWords=None, config=None): """Perform speech-to-text conversion on the provided audio clip using the Google Cloud API. This is an online based speech-to-text engine provided by Google as a subscription service, providing exceptional accuracy compared to `built-in`. Requires an API key to use which you must generate and specify prior to calling this function. Parameters ---------- audioClip : :class:`~psychopy.sound.AudioClip` or None Audio clip containing speech to transcribe (e.g., recorded from a microphone). Specify `None` to open a client without performing a transcription, this will reduce latency when the transcriber is invoked in successive calls. language : str BCP-47 language code (eg., 'en-US'). Should match the language which the speaker is using. expectedWords : list or None List of strings representing expected words or phrases. These are passed as speech context metadata which will make the recognizer prefer a particular word in cases where there is ambiguity or uncertainty. config : dict or None Additional configuration options for the recognizer as a dictionary. Notes ----- * The first invocation of this function will take considerably longer to run that successive calls as the client has not been started yet. Only one instance of a recognizer client can be created per-session. Examples -------- Specifying the API key to use Google's Cloud service for speech-to-text:: import os os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \ "C:\\path\\to\\my\\key.json" # you can now call the transcriber results = recognizeGoogle(myRecording, expectedWords=['left', 'right']) if results.success: print("You said: {}".format(results.words[0])) """ global _googleCloudClient if _googleCloudClient is None: if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ: os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \ prefs.general['appKeyGoogleCloud'] # empty string indicates no key has been specified, raise error if not os.environ["GOOGLE_APPLICATION_CREDENTIALS"]: raise RecognizerAPICredentialsError( 'No application key specified for Google Cloud Services, ' 'specify the path to the key file with either the system ' 'environment variable `GOOGLE_APPLICATION_CREDENTIALS` or in ' 'preferences (General -> appKeyGoogleCloud).') # open new client, takes a while the first go try: _googleCloudClient = google.cloud.speech.SpeechClient() except google.auth.exceptions.DefaultCredentialsError: raise RecognizerAPICredentialsError( 'Invalid key specified for Google Cloud Services, check if the ' 'key file is valid and readable.') # if None, return a null transcription result and just open a client if audioClip is None: return NULL_TRANSCRIPTION_RESULT # check if we have a valid audio clip if not isinstance(audioClip, AudioClip): raise TypeError( "Expected parameter `audioClip` to have type " "`psychopy.sound.AudioClip`.") # configure the recognizer params = { 'encoding': google.cloud.speech.RecognitionConfig.AudioEncoding.LINEAR16, 'sample_rate_hertz': audioClip.sampleRateHz, 'language_code': language, 'model': 'command_and_search', 'audio_channel_count': audioClip.channels, 'max_alternatives': 1} if isinstance(config, dict): params.update(config) # speech context (i.e. expected phrases) if expectedWords is not None: expectedWords, _ = _parseExpectedWords(expectedWords) params['speech_contexts'] = \ [google.cloud.speech.SpeechContext(phrases=expectedWords)] # Detects speech in the audio file response = _googleCloudClient.recognize( config=google.cloud.speech.RecognitionConfig(**params), audio=google.cloud.speech.RecognitionAudio( content=audioClip.convertToWAV())) # package up response result = [result.alternatives[0].transcript for result in response.results] toReturn = TranscriptionResult( words=result, unknownValue=False, # not handled yet requestFailed=False, # not handled yet engine='google', language=language) return toReturn if __name__ == "__main__": pass

Back to top