I have made an application, with Watson Assistant, speech-to-text and text to speech in Unity, where the user can say different cities to find available air plane tickets between said cities. The conversation and interactions are working great, but sometimes I have the problem that some cities aren't recognised when the user says them. For example Berlin, sometimes it understands Berlin and another time burning. The same goes for other cities like Paris, London and Jakarta.
So the detection of city names isn't always as accurate as I had hoped. But I saw in some posts that you can make your own custom model to improve the detection of those words. But I have no idea how to set that up, make an own custom model and how to add those cities to the model and train it. Is it possible to do that in Unity C# scripting and how would i start with it? Are there some C# examples that I can look at? Any help would be appreciated.
These are some links and information that I found, but have no idea of how to implement it in C# and for my own purpose in relation of improving the accuracy of city detection.
DwAnswers1 DwAnswers2 StackOverflow IBM clouds docs Medium cURL tutorial
This is the C# script I have for my interaction between the Watson API and Unity. I think i have to add the custom model in here too, but I don't know if i should create the custom model in it too, or if it needs to be in a seperate script.
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using IBM.Watson.DeveloperCloud.Services.TextToSpeech.v1;
using IBM.Watson.DeveloperCloud.Services.Conversation.v1;
using IBM.Watson.DeveloperCloud.Services.ToneAnalyzer.v3;
using IBM.Watson.DeveloperCloud.Services.SpeechToText.v1;
using IBM.Watson.DeveloperCloud.Logging;
using IBM.Watson.DeveloperCloud.Utilities;
using IBM.Watson.DeveloperCloud.Connection;
using IBM.Watson.DeveloperCloud.DataTypes;
using MiniJSON;
using UnityEngine.UI;
using FullSerializer;
public class WatsonAgent : MonoBehaviour
{
public string literalEntityCity;
public string destinationCity;
public string departureCity;
public string dateBegin;
public string dateEnd;
public WeatherJSON weather;
public GameObject FlightInfo;
[SerializeField]
private fsSerializer _serializer = new fsSerializer();
[System.Serializable]
public class CredentialInformation
{
public string username, password, url;
}
[System.Serializable]
public class Services
{
public CredentialInformation
textToSpeech,
conversation,
speechToText;
}
[Header("Credentials")]
[Space]
public Services
serviceCredentials;
[Space]
[Header("Agent voice settings")]
[Space]
public AudioSource
voiceSource;
public VoiceType
voiceType;
[Space]
[Header("Conversation settings")]
[Space]
public string
workspaceId;
[Space]
[Header("Feedback fields")]
[Space]
public Text
speechToTextField;
public Text
conversationInputField;
public Text
conversationOutputField;
public string
saying;
// services
SpeechToText
speechToText;
private int
recordingRoutine = 0,
recordingBufferSize = 1,
recordingHZ = 22050;
private string
microphoneID = null;
private AudioClip
recording = null;
TextToSpeech
textToSpeech;
Conversation
conversation;
private Dictionary<string, object>
conversationContext = null;
private void Start()
{
PrepareCredentials();
Initialize();
}
void PrepareCredentials()
{
speechToText = new SpeechToText(GetCredentials(serviceCredentials.speechToText));
textToSpeech = new TextToSpeech(GetCredentials(serviceCredentials.textToSpeech));
conversation = new Conversation(GetCredentials(serviceCredentials.conversation));
}
Credentials GetCredentials(CredentialInformation credentialInformation)
{
return new Credentials(credentialInformation.username, credentialInformation.password, credentialInformation.url);
}
void Initialize()
{
conversation.VersionDate = "2017-05-26";
Active = true;
StartRecording();
}
// speech to text
public bool Active
{
get { return speechToText.IsListening; }
set
{
if (value && !speechToText.IsListening)
{
speechToText.DetectSilence = true;
speechToText.EnableWordConfidence = true;
speechToText.EnableTimestamps = true;
speechToText.SilenceThreshold = 0.01f;
speechToText.MaxAlternatives = 0;
speechToText.EnableInterimResults = true;
speechToText.OnError = OnSpeechError;
speechToText.InactivityTimeout = -1;
speechToText.ProfanityFilter = false;
speechToText.SmartFormatting = true;
speechToText.SpeakerLabels = false;
speechToText.WordAlternativesThreshold = null;
speechToText.StartListening(OnSpeechRecognize);
//speechToText.CustomizationId = "customID"; // I guess i have to add the custom training model here with the customID
//speechToText.CustomizationWeight(0.2); //
}
else if (!value && speechToText.IsListening)
{
speechToText.StopListening();
}
}
}
private void StartRecording()
{
if (recordingRoutine == 0)
{
UnityObjectUtil.StartDestroyQueue();
recordingRoutine = Runnable.Run(RecordingHandler());
}
}
private void StopRecording()
{
if (recordingRoutine != 0)
{
Microphone.End(microphoneID);
Runnable.Stop(recordingRoutine);
recordingRoutine = 0;
}
}
private void OnSpeechError(string error)
{
Active = false;
Log.Debug("ExampleStreaming.OnError()", "Error! {0}", error);
}
private IEnumerator RecordingHandler()
{
recording = Microphone.Start(microphoneID, true, recordingBufferSize, recordingHZ);
yield return null; // let _recordingRoutine get set..
if (recording == null)
{
StopRecording();
yield break;
}
bool bFirstBlock = true;
int midPoint = recording.samples / 2;
float[] samples = null;
while (recordingRoutine != 0 && recording != null)
{
int writePos = Microphone.GetPosition(microphoneID);
if (writePos > recording.samples || !Microphone.IsRecording(microphoneID))
{
Debug.Log("Microphone disconnected.");
StopRecording();
yield break;
}
if ((bFirstBlock && writePos >= midPoint) || (!bFirstBlock && writePos < midPoint))
{
// front block is recorded, make a RecordClip and pass it onto our callback.
samples = new float[midPoint];
recording.GetData(samples, bFirstBlock ? 0 : midPoint);
AudioData record = new AudioData();
record.MaxLevel = Mathf.Max(Mathf.Abs(Mathf.Min(samples)), Mathf.Max(samples));
record.Clip = AudioClip.Create("Recording", midPoint, recording.channels, recordingHZ, false);
record.Clip.SetData(samples, 0);
speechToText.OnListen(record);
bFirstBlock = !bFirstBlock;
}
else
{
// calculate the number of samples remaining until we ready for a block of audio,
// and wait that amount of time it will take to record.
int remaining = bFirstBlock ? (midPoint - writePos) : (recording.samples - writePos);
float timeRemaining = (float)remaining / (float)recordingHZ;
yield return new WaitForSeconds(timeRemaining);
}
}
yield break;
}
private void OnSpeechRecognize(SpeechRecognitionEvent result, Dictionary<string, object> customData)
{
if (result != null && result.results.Length > 0)
{
foreach (var res in result.results)
{
foreach (var alt in res.alternatives)
{
string text = string.Format("{0} ({1}, {2:0.00})\n", alt.transcript, res.final ? "Final" : "Interim", alt.confidence);
if (speechToTextField != null)
{
speechToTextField.text = text;
}
if (res.final)
{
if (characterState == SocialState.listening)
{
Debug.Log("WATSON | Speech to text recorded: \n" + alt.transcript);
StartCoroutine(Message(alt.transcript));
}
}
else
{
if (characterState == SocialState.idle)
{
characterState = SocialState.listening;
}
}
}
}
}
}
// text to speech
private IEnumerator Synthesize(string text)
{
Debug.Log("WATSON CALL | Synthesize input: \n" + text);
textToSpeech.Voice = voiceType;
bool doSynthesize = textToSpeech.ToSpeech(HandleSynthesizeCallback, OnFail, text, true);
if (doSynthesize)
{
StartCoroutine(Analyze(text));
saying = text;
characterState = SocialState.talking;
}
yield return null;
}
void HandleSynthesizeCallback(AudioClip clip, Dictionary<string, object> customData = null)
{
if (Application.isPlaying && clip != null)
{
voiceSource.clip = clip;
voiceSource.Play();
}
}
// conversation
private IEnumerator Message(string text)
{
Debug.Log("WATSON | Conversation input: \n" + text);
MessageRequest messageRequest = new MessageRequest()
{
input = new Dictionary<string, object>()
{
{ "text", text }
},
context = conversationContext
};
bool doMessage = conversation.Message(HandleMessageCallback, OnFail, workspaceId, messageRequest);
if (doMessage)
{
characterState = SocialState.thinking;
if (conversationInputField != null)
{
conversationInputField.text = text;
}
}
yield return null;
}
void HandleMessageCallback(object resp, Dictionary<string, object> customData)
{
object _tempContext = null;
(resp as Dictionary<string, object>).TryGetValue("context", out _tempContext);
if (_tempContext != null)
conversationContext = _tempContext as Dictionary<string, object>;
string contextList = conversationContext.ToString();
Dictionary<string, object> dict = Json.Deserialize(customData["json"].ToString()) as Dictionary<string, object>;
Dictionary<string, object> output = dict["output"] as Dictionary<string, object>;
Debug.Log("JSON INFO: " + customData["json"].ToString());
// Send new/update context variables to the Watson Conversation Service
if (weather.temperatureCity != null && !conversationContext.ContainsKey("temperature"))
{
string currentTemperature = weather.temperatureNumber.ToString();
conversationContext.Add("temperature", currentTemperature);
}
else if (conversationContext.ContainsKey("temperature"))
{
string currentTemperature = weather.temperatureNumber.ToString();
conversationContext.Remove("temperature");
conversationContext.Add("temperature", currentTemperature);
//Debug.Log("Current Temperature: " + currentTemperature);
}
// $ call context variables
var context = dict["context"] as Dictionary<string, object>;
if (context["destination_city"] != null)
{
destinationCity = context["destination_city"].ToString();
Debug.Log("Destination city: " + destinationCity);
}
if (context["departure_city"] != null)
{
departureCity = context["departure_city"].ToString();
}
List<object> text = output["text"] as List<object>;
string answer = text[0].ToString(); //Geeft alleen de eerste response terug
Debug.Log("WATSON | Conversation output: \n" + answer);
if (conversationOutputField != null)
{
conversationOutputField.text = answer;
}
fsData fsdata = null;
fsResult r = _serializer.TrySerialize(resp.GetType(), resp, out fsdata);
if (!r.Succeeded)
{
throw new WatsonException(r.FormattedMessages);
}
//convert fsdata to MessageResponse
MessageResponse messageResponse = new MessageResponse();
object obj = messageResponse;
r = _serializer.TryDeserialize(fsdata, obj.GetType(), ref obj);
if (!r.Succeeded)
{
throw new WatsonException(r.FormattedMessages);
}
if (resp != null)
{
//Recognize intents & entities
if (messageResponse.intents.Length > 0 && messageResponse.entities.Length > 0)
{
string intent = messageResponse.intents[0].intent;
string entity = messageResponse.entities[0].entity;
string literalEntity = messageResponse.entities[0].value;
if (entity == "city")
{
literalEntityCity = literalEntity;
}
if (intent == "weather" && entity == "city")
{
literalEntityCity = literalEntity;
}
}
if (messageResponse.intents.Length > 0)
{
string intent = messageResponse.intents[0].intent;
//Debug.Log("Intent: " + intent); //intent name
}
if (messageResponse.entities.Length > 0)
{
string entity = messageResponse.entities[0].entity;
//Debug.Log("Entity: " + entity); //entity name
string literalEntity = messageResponse.entities[0].value;
//Debug.Log("Entity Literal: " + literalEntity); //literal spoken entity
if (entity == "city")
{
literalEntityCity = literalEntity;
}
}
}
StartCoroutine(Synthesize(answer));
}
}