I'm trying to use Google cloud speech API, by recording audio in my Angular FrontEnd, converting it to base64, sending it to my Node backend, which does the query to google speech API.
So far I have had no success with it; Google only sends me back empty results.
You'll see that, in order to identify the problem, I'm looping through all possible sample rates and audio formats.
Another thing to note is that testing with Google example audio.raw (found here https://github.com/googleapis/nodejs-speech/tree/master/samples), it works, and I get a transcription.
Here is my front end:
const onSuccess = stream => {
  // BUG FIX: the original built `options` but never passed it to the
  // MediaRecorder constructor, so the browser silently used its default
  // container (typically audio/webm with the Opus codec) — NOT WAV.
  const options = {
    audioBitsPerSecond: 16000,
    // Check support first; Chrome records webm/opus, not wav.
    mimeType: 'audio/webm;codecs=opus', // verify with MediaRecorder.isTypeSupported()
  };
  this.mediaRecorder = new MediaRecorder(stream, options);

  this.mediaRecorder.onstop = e => {
    // BUG FIX: label the blob with the MIME type MediaRecorder actually
    // produced. Tagging opus/webm bytes as 'audio/wav' misleads the backend:
    // Google Speech decodes the wrong format and returns empty results.
    const blob = new Blob(this.chunks, { type: this.mediaRecorder.mimeType });
    this.chunks.length = 0;

    const audio = new Audio();
    audio.src = window.URL.createObjectURL(blob);

    const reader = new FileReader();
    // Attach the handler BEFORE starting the read, so we never race it.
    reader.onloadend = () => {
      // Strip the "data:<mime>;base64," prefix — the API wants bare base64.
      const base64data = reader.result;
      const splited = base64data.substr(base64data.indexOf(',') + 1);
      this.appservice.postAudio(splited).subscribe(res => {
        console.log("MyBuffer: ", res);
      });
    };
    reader.readAsDataURL(blob);

    audio.load();
    audio.play();
  };

  this.mediaRecorder.ondataavailable = e => this.chunks.push(e.data);
};
And my Node backend
post(req, res) {
var encoding = ["FLAC", "LINEAR16", "MULAW", "AMR", "AMR_WB", "OGG_OPUS", "SPEEX_WITH_HEADER_BYTE"];
var sampleRate = ["8000", "12000", "16000", "24000", "44100", "48000"];
encoding.forEach(elementencoding => {
sampleRate.forEach(elementrate => {
const projectId = 'myId';
const request = {
"config": {
"encoding": elementencoding,
"sampleRateHertz": elementrate,
"languageCode": "fr-FR"
},
"audio": {
"content": req.body.base64audio
}
};
const client = new speech.SpeechClient({
projectId: projectId,
});
// Detects speech in the audio file
client
.recognize(request)
.then(data => {
console.log("raw data:" + elementencoding + " - " + elementrate + " => ", data[0].results);
const response = data[0];
const transcription = response.results
.map(result => result.alternatives[0].transcript)
.join('\n');
})
.catch(err => {
console.error('ERROR:' + elementencoding + ' - ' + elementrate);
});
});
});
}
And here is the output I get
ERROR:OGG_OPUS - 44100
ERROR:OGG_OPUS - 24000
ERROR:AMR_WB - 44100
ERROR:SPEEX_WITH_HEADER_BYTE - 24000
ERROR:OGG_OPUS - 8000
raw data:LINEAR16 - 48000 => []
raw data:LINEAR16 - 44100 => []
raw data:LINEAR16 - 12000 => []
raw data:LINEAR16 - 16000 => []
ERROR:SPEEX_WITH_HEADER_BYTE - 48000
ERROR:AMR_WB - 48000
ERROR:AMR - 24000
ERROR:OGG_OPUS - 12000
ERROR:AMR - 44100
ERROR:SPEEX_WITH_HEADER_BYTE - 8000
ERROR:SPEEX_WITH_HEADER_BYTE - 12000
ERROR:AMR_WB - 8000
ERROR:AMR_WB - 24000
ERROR:OGG_OPUS - 48000
raw data:LINEAR16 - 8000 => []
raw data:LINEAR16 - 24000 => []
raw data:MULAW - 48000 => []
ERROR:AMR - 48000
ERROR:AMR - 12000
ERROR:AMR - 16000
raw data:FLAC - 24000 => []
Thank you to anybody who might have an idea about this.