The Centre for Speech Technology Research, The University of Edinburgh

Publications by Benigno Uria

s1054775.bib

% uria2011deep: NIPS 2011 workshop paper on a deep belief network for
% acoustic-articulatory inversion. `month` uses the standard BibTeX macro so
% that the bibliography style controls how the month is rendered.
@inproceedings{uria2011deep,
  author = {Uria, Benigno and Renals, Steve and Richmond, Korin},
  title = {A Deep Neural Network for Acoustic-Articulatory Speech Inversion},
  booktitle = {Proc. NIPS 2011 Workshop on Deep Learning and Unsupervised Feature Learning},
  address = {Sierra Nevada, Spain},
  month = dec,
  year = {2011},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/articulatory_inversion.pdf},
  abstract = {In this work, we implement a deep belief network for the acoustic-articulatory inversion mapping problem. We find that adding up to 3 hidden-layers improves inversion accuracy. We also show that this improvement is due to the higher expressive capability of a deep model and not a consequence of adding more adjustable parameters. Additionally, we show unsupervised pretraining of the system improves its performance in all cases, even for a 1 hidden-layer model. Our implementation obtained an average root mean square error of 0.95 mm on the MNGU0 test dataset, beating all previously published results.}
}
% uriaIS2012: Interspeech 2012 paper comparing deep architectures (deep neural
% network and deep trajectory mixture density network) for articulatory
% inversion. `month` uses the standard BibTeX macro so that the bibliography
% style controls how the month is rendered.
@inproceedings{uriaIS2012,
  author = {Uria, Benigno and Murray, Iain and Renals, Steve and Richmond, Korin},
  title = {Deep Architectures for Articulatory Inversion},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  month = sep,
  year = {2012},
  keywords = {Articulatory inversion, deep neural network, deep belief network, deep regression network, pretraining},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Uria_et_al_IS2012.pdf},
  abstract = {We implement two deep architectures for the acoustic-articulatory inversion mapping problem: a deep neural network and a deep trajectory mixture density network. We find that in both cases, deep architectures produce more accurate predictions than shallow architectures and that this is due to the higher expressive capability of a deep model and not a consequence of adding more adjustable parameters. We also find that a deep trajectory mixture density network is able to obtain better inversion accuracies than smoothing the results of a deep neural network. Our best model obtained an average root mean square error of 0.885 mm on the MNGU0 test dataset.},
  categories = {Articulatory inversion, deep neural network, deep belief network, deep regression network, pretraining}
}
% rnade_ICASSP15: ICASSP 2015 paper introducing trajectory RNADE, a
% probabilistic neural network model of acoustic trajectories.
% Only the model name in the title is brace-protected so that styles may
% recase the rest; the title carries no trailing period because styles add
% their own punctuation. Page ranges use an en dash (`--`).
% NOTE(review): given names were expanded from initials; confirm "Cassia" and
% "John" against the publisher record.
@inproceedings{rnade_ICASSP15,
  author = {Uria, Benigno and Murray, Iain and Renals, Steve and Valentini-Botinhao, Cassia and Bridle, John},
  title = {Modelling Acoustic Feature Dependencies with Artificial Neural Networks: {Trajectory-RNADE}},
  booktitle = {Proc. ICASSP},
  address = {Brisbane, Australia},
  month = apr,
  pages = {4465--4469},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/Uria2015.pdf},
  abstract = {Given a transcription, sampling from a good model of acoustic feature trajectories should result in plausible realizations of an utterance. However, samples from current probabilistic speech synthesis systems result in low quality synthetic speech. Henter et al. have demonstrated the need to capture the dependencies between acoustic features conditioned on the phonetic labels in order to obtain high quality synthetic speech. These dependencies are often ignored in neural network based acoustic models. We tackle this deficiency by introducing a probabilistic neural network model of acoustic trajectories, trajectory RNADE, able to capture these dependencies.}
}