How to run Ins8.ai's Speech Recognition Model

Introduction

Developed by NCS Singapore, Ins8.ai’s speech-to-text and natural language processing algorithms are tuned to understand Asian accents and vernacular. Current speech-to-text products available in the market often suffer from poor accuracy because they are trained on non-local datasets. Ins8.ai seeks to address this gap and help clients extract valuable insights from video and audio datasets in a scalable manner.

Getting Started

There are 3 main ways to test out our Speech To Text product:

REST API — For audio file uploads
WebSocket — For audio streaming
WebApp — For a quick trial of our product for both file upload and streaming options

Sign up for a developer account at the Ins8.ai WebApp to get trial access.
Once email verification is complete, generate an API token by clicking on “+ Generate a New Token”.

Ins8.ai WebPortal’s Dashboard Tab where you can generate a new token

REST API

Next, create a Python file with your favorite IDE and save the following code as main.py:

import requests
# Location of the audio file to upload; replace it with the path to your own clip.
path = "Path_to_audio_file.wav"

def send_rest_api_request(audio_file, api_token):
    """Upload an audio file to the Ins8.ai REST endpoint for transcription.

    Args:
        audio_file: Path of the audio file to upload.
        api_token: API token, sent as the ``api_token`` query parameter.

    Returns:
        The ``requests.Response`` object produced by the POST request.
    """
    # The context manager closes the file handle as soon as the upload has
    # finished, even if the request raises.
    with open(audio_file, 'rb') as audio:
        response = requests.post("https://stt.ins8.io/api/v1/stt/recognize",
                                 params={'api_token': api_token, 'language': 'en-sg',
                                         'punctuation': True, 'timestamp': False},
                                 files={'audio': audio})
    return response

# Replace the placeholder with the token generated in the web portal.
api_key = "<API_TOKEN>"
response = send_rest_api_request(audio_file=path, api_token=api_key)

# Decode the JSON body of the reply and show it in the terminal.
decoded_body = response.json()
print(decoded_body)

Download the sample audio clip.

Run main.py using your newly generated API key and the file path of your audio clip.

Once processing is complete, the result will be printed in the console.

JSON response given for the sample audio file

WebSocket

Create a main.py Python file and paste the following code:

# Install required packages using
# pip install pywav
# pip install websockets
# pip install requests

import json
from pywav import WavRead
import asyncio
import ssl
import websockets

# Location of the WAV file that wss_test() streams; replace it with the path to your own clip.
path = "Path_to_example_audio.wav"


async def receiver(websocket):
    """Print each message received from the server until the end marker arrives.

    Every message, including the end marker itself, is printed.

    Args:
        websocket: Connection object whose ``recv()`` coroutine returns the
            next message from the server.
    """
    end_marker = "!!END_OF_TRANSCRIPTION!!"
    message = None
    # Keep reading until the message just printed is the end marker.
    while message != end_marker:
        message = await websocket.recv()
        print(f"Hypothesis of recognised speech: {message}")

async def sender(websocket, data: list, stop_str: str):
    """Send the audio chunks one by one, then send the stop string.

    Args:
        websocket: Connection object whose ``send()`` coroutine transmits a
            single message.
        data (list): Audio chunks, sent in list order.
        stop_str (str): Marker sent (encoded to bytes) after the last chunk
            to signal the end of the audio stream.
    """
    pause_seconds = 0.05
    for audio_chunk in data:
        # Pause briefly before each chunk instead of sending everything at once.
        await asyncio.sleep(pause_seconds)
        await websocket.send(audio_chunk)
    # Required: the stream has to be terminated with the stop string.
    end_marker = stop_str.encode()
    await websocket.send(end_marker)

def get_chunks(filepath, filetype=None):
    """Read a mono PCM WAV file and split its audio data into 100 ms chunks.

    Args:
        filepath: Path of the WAV file to stream.
        filetype: File type to validate. When omitted (or empty), the text
            after the last ``.`` in ``filepath`` is used.

    Returns:
        A ``(config, stream)`` tuple. ``config`` is the dict holding the
        encoding, the sample rate and the stop string; ``stream`` is the
        list of consecutive slices of the audio data.

    Raises:
        ValueError: If the file type is not ``wav``, the audio does not have
            exactly one channel, or the audio format code is not 1 (PCM).
    """
    if not filetype:
        filetype = filepath.split('.')[-1]
    # Explicit raises instead of ``assert`` so the checks still run when
    # Python is started with -O.
    if filetype.lower() != 'wav':
        raise ValueError("WSS endpoint only support wave file.")
    # Loads the audio into memory
    audio = WavRead(filepath)
    if audio.getnumofchannels() != 1:
        raise ValueError("WSS endpoint only support single-channel (mono) wave files")
    if audio.getaudioformat() != 1:
        raise ValueError("WSS endpoint only support PCM encoded wave files")
    sample_rate = audio.getsamplerate()
    content = audio.getdata()
    chunk_size = int(sample_rate * audio.getbytespersample() / 10)  # 100ms

    config = {'encoding': 'raw',
              'sample_rate_hertz': sample_rate,
              # the stop string can be any string of your choice
              'stop_string': 'EOS'}
    stream = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    return config, stream


async def wss_test(hostname="wss://stt.ins8.io/api/v1/stt/websocket/recognize", token=''):
    """Stream the audio file at the module-level ``path`` and print the replies.

    The audio configuration is sent first as a JSON message; ``sender`` and
    ``receiver`` then run concurrently on the same connection until both
    have finished.

    Args:
        hostname: URL of the WebSocket endpoint.
        token: API token, appended to the URL as the ``api_token`` query
            parameter.
    """
    from urllib.parse import urlencode

    audio_config, data_stream_list = get_chunks(filepath=path)
    # URL-encode the token so characters such as spaces, '+' or '&' cannot
    # corrupt the query string.
    url = hostname + "?" + urlencode({"api_token": token})

    connect_kwargs = {}
    if hostname.lower().startswith("wss://"):
        # Default context: certificate and hostname verification stay
        # enabled, so the token in the URL is only sent to a verified server.
        connect_kwargs["ssl"] = ssl.create_default_context()

    async with websockets.connect(url, **connect_kwargs) as websocket:
        stop_string = audio_config['stop_string']
        await websocket.send(json.dumps(audio_config))
        print(audio_config)
        # Send the audio and read the transcripts at the same time.
        await asyncio.gather(sender(websocket, data_stream_list, stop_string),
                             receiver(websocket))
        print("Client closing connection")


# Replace the placeholder with the API token generated in the web portal.
api_key = "<YOUR SERVICE TOKEN>"
# asyncio.run() creates a fresh event loop, runs the coroutine to completion
# and closes the loop afterwards; asyncio.get_event_loop() is deprecated when
# no event loop is running.
asyncio.run(wss_test(token=api_key))

Download the sample audio clip.

Run main.py using your newly generated API key and the file path of your audio clip.

Once processing is complete, the result will be printed in the console.

Message given for the transcribed sample audio file

WebApp

Alternatively, you can also test out the Ins8.ai Speech to Text capabilities through our portal. You can do so by navigating to our demo page and trying either the ‘File Upload’ or ‘Speak Now’ option.

Authored by Wong Wei Yuan

05 Apr 2023