19

I'm using the Google Cloud API for Speech-to-text, with a NodeJS back-end. The app needs to be able to listen for voice commands, and transmit them to the back-end as a buffer. For this, I need to send the buffer of the preceding audio when silence is detected.

Any help would be appreciated. I'm including the JS code below.

 if (!navigator.getUserMedia)
    navigator.getUserMedia = navigator.getUserMedia || navigator.webkitGetUserMedia ||
        navigator.mozGetUserMedia || navigator.msGetUserMedia;

if (navigator.getUserMedia) {
    navigator.getUserMedia({audio: true}, success, function (e) {
        alert('Error capturing audio.');
    });
} else alert('getUserMedia not supported in this browser.');

// Shared flag read by the ScriptProcessor's onaudioprocess callback:
// audio frames are only processed while this is true.
var recording = false;

// Exposed globally so UI code (or the console) can toggle capture.
window.startRecording = () => {
    recording = true;
};

window.stopRecording = () => {
    recording = false;
    // window.Stream.end();
};

/**
 * getUserMedia success callback: wires the microphone stream through a
 * ScriptProcessorNode so raw Float32 PCM frames can be inspected while
 * `recording` is true.
 *
 * Fix over the original: audioContext/context/audioInput/recorder were
 * assigned without declaration (implicit globals — a ReferenceError in
 * strict mode). Nothing else in this file reads them, so they are now
 * proper locals.
 *
 * @param {MediaStream} stream - the captured microphone stream.
 */
function success(stream) {
    // Resolve the (possibly vendor-prefixed) AudioContext constructor.
    const AudioCtx = window.AudioContext || window.webkitAudioContext;
    const context = new AudioCtx();

    // the sample rate is in context.sampleRate
    const audioInput = context.createMediaStreamSource(stream);

    const bufferSize = 4096;
    const recorder = context.createScriptProcessor(bufferSize, 1, 1);

    recorder.onaudioprocess = function (e) {
        if (!recording) return;
        console.log('recording');
        const left = e.inputBuffer.getChannelData(0);
        // NOTE(review): convertoFloat32ToInt16 is not defined anywhere in
        // this file — presumably a helper elsewhere; confirm the name
        // (it looks like a typo for "convertFloat32ToInt16").
        console.log(convertoFloat32ToInt16(left));
    };

    audioInput.connect(recorder);
    // ScriptProcessorNode must be connected to a destination for
    // onaudioprocess to fire in some browsers.
    recorder.connect(context.destination);
}
azhar
  • 1,591
  • 1
  • 16
  • 36
  • I'm not clear as to what you need help on: Sending the data to this API? Detect silence? Split your recorded data? – Kaiido Oct 17 '17 at 02:39
  • Sending the data to this API and get output in real time. – azhar Oct 17 '17 at 05:21
  • @azhar Does the API support real-time communication? – guest271314 Oct 17 '17 at 05:26
  • Ah so the *silence* part is actually not part of this question... The docs doesn't seem to talk about direct API calls from js, and this may not be a good idea anyway since you'd need your tokens to be visible. **Oh but you mean from node.js???** Then just follow [the docs](https://cloud.google.com/speech/docs/reference/libraries#client-libraries-install-nodejs), why are you using this front-end code? – Kaiido Oct 17 '17 at 05:32
  • @Kaiido Actually i want to detect speech from browser mic so i have to send buffer from front end. – azhar Oct 19 '17 at 05:18

3 Answers

23

I'm not too sure as to what exactly is being asked in the question, so this answer is only intended to give a way to detect silences in an AudioStream.


To detect silence in an AudioStream, you can use an AnalyserNode, on which you will call the getByteFrequencyData method at regular intervals, and check whether there were sounds higher than your expected level for a given time.

You can set the threshold level directly with the minDecibels property of the AnalyserNode.

/**
 * Watches an audio MediaStream and fires callbacks when sound stops/starts.
 *
 * @param {MediaStream} stream - the stream to monitor (e.g. from getUserMedia).
 * @param {Function} onSoundEnd - called once when silence has lasted `silence_delay` ms.
 * @param {Function} onSoundStart - called once when sound resumes after silence.
 * @param {number} silence_delay - ms of continuous silence before onSoundEnd fires.
 * @param {number} min_decibels - dB threshold; quieter bins read as 0 in
 *   getByteFrequencyData, so this is effectively the silence level.
 */
function detectSilence(
  stream,
  onSoundEnd = _=>{},
  onSoundStart = _=>{},
  silence_delay = 500,
  min_decibels = -80
  ) {
  const ctx = new AudioContext();
  const analyser = ctx.createAnalyser();
  const streamNode = ctx.createMediaStreamSource(stream);
  streamNode.connect(analyser);
  analyser.minDecibels = min_decibels;

  const data = new Uint8Array(analyser.frequencyBinCount); // will hold our data
  let silence_start = performance.now();
  let triggered = false; // trigger only once per silence event

  function loop(time) {
    requestAnimationFrame(loop); // we'll loop every 60th of a second to check
    analyser.getByteFrequencyData(data); // get current data
    if (data.some(v => v)) { // if there is data above the given db limit
      if(triggered){
        triggered = false;
        onSoundStart();
        }
      silence_start = time; // set it to now
    }
    if (!triggered && time - silence_start > silence_delay) {
      onSoundEnd();
      triggered = true;
    }
  }
  // Bug fix: the original called loop() with no argument, so `time` was
  // undefined on the first frame; if sound was present, silence_start was
  // clobbered to undefined and the elapsed-time check became NaN until the
  // next sound event. Seed the first iteration with a real timestamp.
  loop(performance.now());
}

// Demo callback: invoked by detectSilence when silence is detected.
const onSilence = () => console.log('silence');
// Demo callback: invoked by detectSilence when sound resumes.
const onSpeak = () => console.log('speaking');

// Grab the microphone and hand the stream to the silence detector.
const handleStream = (stream) => {
  detectSilence(stream, onSilence, onSpeak);
  // do something else with the stream
};

navigator.mediaDevices
  .getUserMedia({ audio: true })
  .then(handleStream)
  .catch(console.error);

And as a fiddle since stackSnippets may block gUM.

Kaiido
  • 87,051
  • 7
  • 143
  • 194
2

You can use the SpeechRecognition result event to determine when a word or phrase has been recognized — for example, ls, cd, pwd, or other commands. Pass the .transcript of the SpeechRecognitionAlternative to speechSynthesis.speak(); in the attached start and end events of the SpeechSynthesisUtterance, call .start() or .resume() on the MediaRecorder object to which the MediaStream is passed. Then convert the Blob at the dataavailable event to an ArrayBuffer using FileReader or Response.arrayBuffer().

We could alternatively use audiostart or soundstart with audioend or soundend events of SpeechRecognition to record the users' actual voice, though the ends may not be fired consistently in relation to the actual start and end of audio captured by only a standard system microphone.

<!DOCTYPE html>
<html>

<head>
  <title>Speech Recognition Recording</title>
</head>

<body>
  <input type="button" value="Stop speech command recognition" id="stop">
  <script>
    // Captures mic audio; records it only while speechSynthesis replays a
    // recognized phrase, then dumps each recorded chunk as an ArrayBuffer.
    // Event flow: recognition result -> speak transcript -> utterance start
    // resumes recorder -> utterance end pauses recorder and flushes data.
    navigator.mediaDevices.getUserMedia({
        audio: true
      })
      .then(stream => {
        const recorder = new MediaRecorder(stream);
        const recognition = new webkitSpeechRecognition();
        const synthesis = new SpeechSynthesisUtterance();
        const handleResult = e => {
          // Detach until ondataavailable re-attaches below, so results
          // arriving while this one is being replayed/recorded are ignored.
          recognition.onresult = null;
          console.log(e.results);
          // Only the most recent result matters in continuous mode.
          const result = e.results[e.results.length - 1];

          if (result.isFinal) {
            // maxAlternatives is 1, so take the single alternative.
            const [{transcript}] = result;
            console.log(transcript);
            // Speaking the transcript drives the recorder via the
            // utterance's start/end handlers below.
            synthesis.text = transcript;
            window.speechSynthesis.speak(synthesis);
          }
        }
        // Utterance started playing: begin or resume recording.
        synthesis.onstart = () => {
          if (recorder.state === "inactive") {
            recorder.start()
          } else {
            if (recorder.state === "paused") {
              recorder.resume();
            }
          }
        }
        // Utterance finished: pause and flush the captured chunk.
        synthesis.onend = () => {
          recorder.pause();
          recorder.requestData();
        }
        // Convert each flushed Blob chunk to an ArrayBuffer via a
        // Blob URL + fetch round-trip.
        recorder.ondataavailable = async(e) => {
          if (stream.active) {
            try {
              const blobURL = URL.createObjectURL(e.data);
              const request = await fetch(blobURL);
              const ab = await request.arrayBuffer();
              console.log(blobURL, ab);
              // Re-arm recognition for the next phrase.
              recognition.onresult = handleResult;
              // URL.revokeObjectURL(blobURL);
            } catch (err) {
              throw err
            }
          }
        }
        recorder.onpause = e => {
          console.log("recorder " + recorder.state);
        }
        recognition.continuous = true;
        recognition.interimResults = false;
        recognition.maxAlternatives = 1;
        recognition.start();
        recognition.onend = e => {
          console.log("recognition ended, stream.active", stream.active);

          if (stream.active) {
            console.log(e);
            // the service disconnects after a period of time
            recognition.start();
          }
        }
        recognition.onresult = handleResult;

        stream.oninactive = () => {
          console.log("stream ended");
        }

        // Stop button tears everything down: recognition, recorder, tracks.
        document.getElementById("stop")
          .onclick = () => {
            console.log("stream.active:", stream.active);
            if (stream && stream.active && recognition) {
              recognition.abort();
              recorder.stop();
              for (let track of stream.getTracks()) {
                track.stop();
              }
              console.log("stream.active:", stream.active);
            }
          }

      })
      .catch(err => {
        console.error(err)
      });
  </script>
</body>

</html>

plnkr https://plnkr.co/edit/4DVEg6mhFRR94M5gdaIp?p=preview

guest271314
  • 1
  • 10
  • 82
  • 156
  • `webkitSpeechRecognition` this should be only work in chrome. I want to this in non chrome browsers also. – azhar Oct 15 '17 at 08:59
  • So i am trying to using google cloud api in back end and parse speech to text. But how to stream the voice from front end to back end live is my prob. – azhar Oct 15 '17 at 10:49
  • @azhar _"I want to this in non chrome browsers also"_ One approach would be to use two ` – guest271314 Oct 15 '17 at 13:13
  • What data type is the service expecting to be `POST`ed? – guest271314 Oct 15 '17 at 13:19
  • Is any possibility with `socket.io`? – azhar Oct 15 '17 at 16:24
  • How is `socket.io` related to requirement? The simplest solution would be to use buttons for user to click when audio should be recorded and paused, then `POST` the recorded audio to server. Does *oogle cloud api speech to text expect an `ArrayBuffer` as input data? – guest271314 Oct 16 '17 at 00:32
2

The simplest approach would be to use the .pause(), .resume(), and .stop() methods of MediaRecorder() to allow the user to start, pause, and stop recording the audio captured via navigator.mediaDevices.getUserMedia(), and to convert the resulting Blob to an ArrayBuffer, if that is what the API expects to be POSTed to the server.

<!DOCTYPE html>
<html>

<head>
  <title>User Media Recording</title>
</head>

<body>
  <input type="button" value="Start/resume recording audio" id="start">
  <input type="button" value="Pause recording audio" id="pause">
  <input type="button" value="Stop recording audio" id="stop">
  <script>
    // Records microphone audio with MediaRecorder, driven by the three
    // buttons above; each pause flushes the captured chunk and converts
    // the Blob to an ArrayBuffer.
    navigator.mediaDevices.getUserMedia({
        audio: true
      })
      .then(stream => {
        const recorder = new MediaRecorder(stream);

        // Fires after requestData()/stop(); converts the recorded Blob
        // to an ArrayBuffer via a Blob URL + fetch round-trip.
        recorder.ondataavailable = async(e) => {
          if (stream.active) {
            try {
              const blobURL = URL.createObjectURL(e.data);
              const request = await fetch(blobURL);
              const ab = await request.arrayBuffer();
              // do stuff with `ArrayBuffer` of recorded audio
              console.log(blobURL, ab);
              // we do not need the `Blob URL`, we can revoke the object
              // URL.revokeObjectURL(blobURL);
            } catch (err) {
              throw err
            }
          }
        }
        // On pause, ask the recorder to flush what it captured so far,
        // which triggers ondataavailable above.
        recorder.onpause = e => {
          console.log("recorder " + recorder.state);
          recorder.requestData();
        }

        stream.oninactive = () => {
          console.log("stream ended");
        }

        // Start a fresh recording, or resume a paused one.
        document.getElementById("start")
          .onclick = () => {

            if (recorder.state === "inactive") {
              recorder.start();
            } else {
              recorder.resume();
            }
            console.log("recorder.state:", recorder.state);
          }

        // Pause only makes sense while actively recording.
        document.getElementById("pause")
          .onclick = () => {

            if (recorder.state === "recording") {
              recorder.pause();
            }
            console.log("recorder.state:", recorder.state);
          }

        // Full teardown: stop recorder and tracks, detach button handlers.
        document.getElementById("stop")
          .onclick = () => {

            if (recorder.state === "recording" || recorder.state === "paused") {
              recorder.stop();
            }

            for (let track of stream.getTracks()) {
              track.stop();
            }

            document.getElementById("start").onclick = null;
            document.getElementById("pause").onclick = null;
            console.log("recorder.state:", recorder.state
            , "stream.active", stream.active);
          }

      })
      .catch(err => {
        console.error(err)
      });
  </script>
</body>

</html>

plnkr https://plnkr.co/edit/7caWYMsvub90G6pwDdQp?p=preview

guest271314
  • 1
  • 10
  • 82
  • 156