I have a simple WebSocket project created by FastAPI like the following code: After running the project, I want to convert data to a numpy array. What I have tried: 1) raises error: This sample code does not raise any errors, but the output audio file does not contain any perceivable audio. Just noise is save…

How to convert audio bytes generated by mediaRecor…

I have a simple WebSocket project created by FastAPI like the following code:

import uvicorn
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
import numpy as np
import soundfile as sf


# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()

# Single-page client served at "/". Its inline script requests microphone
# access, records the stream with MediaRecorder using the 'audio/webm' MIME
# type, and sends every non-empty recorded chunk as a binary WebSocket message
# to ws://localhost:8000/listen. mediaRecorder.start(250) passes a 250 ms
# timeslice, so chunks are delivered several times per second.
html = """
<!DOCTYPE html>
<html>
    <body>
        <h1>Transcribe Audio With FastAPI</h1>
        <p id="status">Connection status will go here</p>
        <p id="transcript"></p>
        <script>
               navigator.mediaDevices.getUserMedia({ audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } }).then((stream) => {
            if (!MediaRecorder.isTypeSupported('audio/webm'))
                return alert('Browser not supported')

            const mediaRecorder = new MediaRecorder(stream, {
                mimeType: 'audio/webm',
            })

            const socket = new WebSocket('ws://localhost:8000/listen')

            socket.onopen = () => {
                document.querySelector('#status').textContent = 'Connected'
                console.log({ event: 'onopen' })
                mediaRecorder.addEventListener('dataavailable', async (event) => {
                    if (event.data.size > 0 && socket.readyState == 1) {
                        socket.send(event.data)
                    }
            })
            mediaRecorder.start(250)
            }

            socket.onclose = () => {
            console.log({ event: 'onclose' })
        }

            socket.onerror = (error) => {
                console.log({ event: 'onerror', error })
            }

           })
        </script>
    </body>
</html>
"""


@app.get("/")
async def get():
    """Serve the recorder page stored in the module-level ``html`` string."""
    page = HTMLResponse(html)
    return page


@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Receive binary audio chunks from the browser on the /listen WebSocket.

    Accepts the connection, then prints every binary message until the
    connection ends. A client disconnect finishes the handler quietly.

    Raises:
        Exception: wraps (and chains) any other error raised while receiving
            or processing a chunk.
    """
    await websocket.accept()
    client_gone = False
    try:
        while True:
            data = await websocket.receive_bytes()
            print(data)
            # Convert data to numpy array
            # rest of the process!
    except WebSocketDisconnect:
        # The peer has already closed the connection, so no close frame must
        # be sent from our side any more.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        if not client_gone:
            await websocket.close()


# Start the uvicorn server only when this file is executed as a script.
# No host/port is passed, so uvicorn's defaults apply; the page's script
# connects to ws://localhost:8000/listen, so those defaults are presumably
# expected to match that address — TODO confirm.
if __name__ == '__main__':
    uvicorn.run(app)

After running the project, I want to convert data to a numpy array.

What I have tried: 1)

def tensorize(x):
    """Reinterpret the raw buffer *x* as float32 values and return a copy.

    The copy detaches the result from the input buffer (the original code
    notes that this avoids a warning). ``np.frombuffer`` raises ``ValueError``
    when the buffer length is not a multiple of the 4-byte element size.
    """
    return np.frombuffer(x, dtype=np.float32).copy()

@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Feed every received binary chunk through ``tensorize``.

    A client disconnect finishes the handler quietly.

    Raises:
        Exception: wraps (and chains) any other error, including the
            ``ValueError`` that ``tensorize`` raises for a chunk whose length
            is not a multiple of the float32 element size.
    """
    print("I'm here websocket_endpoint")
    await websocket.accept()

    client_gone = False
    try:
        while True:
            data = await websocket.receive_bytes()
            array = tensorize(data)
    except WebSocketDisconnect:
        # The peer has already closed the connection, so no close frame must
        # be sent from our side any more.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        if not client_gone:
            await websocket.close()

raises error:

arr = np.frombuffer(x, dtype=np.float32)
ValueError: buffer size must be a multiple of element size

@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Buffer incoming chunks as int16 samples and dump them to a WAV file.

    Every binary message is reinterpreted as signed 16-bit samples, scaled by
    2**-15 and buffered. Once more than 20 messages have arrived, the buffered
    samples are concatenated, written to ``stereo_file1.wav`` at 16 kHz as
    24-bit PCM, and the loop ends. A client disconnect finishes the handler
    quietly.

    NOTE(review): the page records with mimeType 'audio/webm', so the payload
    is presumably an encoded container stream rather than raw PCM, which would
    explain a file that contains only noise — confirm.

    Raises:
        Exception: wraps (and chains) any other error raised while receiving,
            converting or writing the audio.
    """
    print("I'm here websocket_endpoint")
    await websocket.accept()

    client_gone = False
    try:
        whole = []
        counter = 0
        while True:
            data = await websocket.receive_bytes()
            # count=len(data) // 2 drops a trailing odd byte so that
            # frombuffer never sees a partial 16-bit sample.
            data_s16 = np.frombuffer(data, dtype=np.int16, count=len(data) // 2, offset=0)
            float_data = data_s16 * 0.5 ** 15
            whole.append(float_data)
            print(data)
            counter += 1
            if counter > 20:
                samples = np.concatenate(whole)
                sf.write('stereo_file1.wav', samples, 16000, 'PCM_24')
                break
            print(counter)
    except WebSocketDisconnect:
        # The peer has already closed the connection, so no close frame must
        # be sent from our side any more.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        if not client_gone:
            await websocket.close()

This sample code does not raise any errors, but the output audio file does not contain any perceivable audio. Just noise is saved.

I also tried to read the bytes through a BytesIO object with librosa and with soundfile, but neither of them recognizes the format:

@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Try to decode each received chunk with ``librosa.load`` from memory.

    Each binary message is wrapped in a ``BytesIO`` object and handed to
    librosa. A client disconnect finishes the handler quietly.

    Raises:
        Exception: wraps (and chains) any other error raised while receiving
            or decoding a chunk.
    """
    await websocket.accept()
    client_gone = False
    try:
        while True:
            data = await websocket.receive_bytes()
            byte_io = BytesIO(data)
            array, sr = librosa.load(byte_io)
    except WebSocketDisconnect:
        # The peer has already closed the connection, so no close frame must
        # be sent from our side any more.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        if not client_gone:
            await websocket.close()

@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Try to decode each received chunk with ``soundfile.read`` from memory.

    Each binary message is wrapped in a ``BytesIO`` object and handed to
    soundfile. A client disconnect finishes the handler quietly.

    Raises:
        Exception: wraps (and chains) any other error raised while receiving
            or decoding a chunk.
    """
    await websocket.accept()
    client_gone = False
    try:
        while True:
            data = await websocket.receive_bytes()
            byte_io = BytesIO(data)
            array, sr = sf.read(byte_io)
    except WebSocketDisconnect:
        # The peer has already closed the connection, so no close frame must
        # be sent from our side any more.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        if not client_gone:
            await websocket.close()

Raised error:

Exception: Could not process audio: Error opening <_io.BytesIO object at 0x7f12a32cd0d0>: Format not recognised.
'''

**Update 1**
I was able to decode a received chunk using the following code, but the audio has to be written to a file on the hard drive first and then loaded back with librosa, which is very slow!

import librosa


@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Decode each received chunk by round-tripping it through a file on disk.

    Every binary message overwrites ``audio.wav``, which is then loaded back
    with ``librosa.load``. A client disconnect finishes the handler quietly.

    Raises:
        Exception: wraps (and chains) any other error raised while receiving,
            writing or decoding a chunk.
    """
    print("I'm here websocket_endpoint")
    await websocket.accept()

    client_gone = False
    try:
        while True:
            data = await websocket.receive_bytes()
            with open('audio.wav', 'wb') as f:
                f.write(data)
            array, sr = librosa.load("audio.wav")
    except WebSocketDisconnect:
        # The peer has already closed the connection, so no close frame must
        # be sent from our side any more.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        if not client_gone:
            await websocket.close()

Answer

Finally, after opening a similar issue on the pytorch/audio GitHub repository, I got the answer from moto in this comment. The final solution is as follows:

@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Decode each received audio chunk into a torch tensor with torchaudio.

    Each binary message is wrapped in a ``BytesIO`` object and read through
    ``torchaudio.io.StreamReader``; the decoded pieces are concatenated into
    one tensor per message. A client disconnect finishes the handler quietly.

    Raises:
        Exception: wraps (and chains) any other error raised while receiving
            or decoding a chunk.
    """
    await websocket.accept()
    client_gone = False
    try:
        # Passed to add_basic_audio_stream; presumably the number of frames
        # per decoded piece — confirm against the torchaudio documentation.
        chunk_size = 1000
        while True:
            data = await websocket.receive_bytes()
            f = BytesIO(data)
            s = torchaudio.io.StreamReader(f)
            s.add_basic_audio_stream(chunk_size)
            # Only one output stream is configured, so chunk[0] is its tensor.
            array = torch.concat([chunk[0] for chunk in s.stream()])
    except WebSocketDisconnect:
        # The peer has already closed the connection, so no close frame must
        # be sent from our side any more.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        if not client_gone:
            await websocket.close()

I could not convert the data directly to a NumPy array, but `array.numpy()` returns the data in NumPy format if someone needs it.

PS: Versions of relevant libraries:

[pip3] numpy==1.23.4
[pip3] torch==1.12.1
[pip3] torchaudio==0.12.1
[conda] numpy 1.23.4 pypi_0 pypi

OS:

Ubuntu: 22.04
torchaudio.backend: sox_io

How to convert audio bytes generated by MediaRecorder and transferred over a WebSocket to Python into a NumPy array

Advertisement

Answer