The Centre for Speech Technology Research, The University of Edinburgh

Publications by Joao Cabral


  % NOTE(review): the entry header and closing brace were missing from this record; the entry type (inferred from the booktitle field) and the citation key below are reconstructed -- confirm the key against the documents that cite it.
  @inproceedings{andersson:cabral:badino:yamagishi:clark:2009,
    author     = {Andersson, J. Sebastian and Cabral, Joao P. and Badino, Leonardo and Yamagishi, Junichi and Clark, Robert A. J.},
    title      = {Glottal Source and Prosodic Prominence Modelling in {HMM}-based Speech Synthesis for the {Blizzard} {Challenge} 2009},
    booktitle  = {The Blizzard Challenge 2009},
    address    = {Edinburgh, U.K.},
    month      = {September},
    year       = {2009},
    pdf        = {},
    abstract   = {This paper describes the CSTR entry for the Blizzard Challenge 2009. The work focused on modifying two parts of the Nitech 2005 HTS speech synthesis system to improve naturalness and contextual appropriateness. The first part incorporated an implementation of the Liljencrants-Fant (LF) glottal source model. The second part focused on improving synthesis of prosodic prominence including emphasis through context dependent phonemes. Emphasis was assigned to the synthesised test sentences based on a handful of theory based rules. The two parts (LF-model and prosodic prominence) were not combined and hence evaluated separately. The results on naturalness for the LF-model showed that it is not yet perceived as natural as the Benchmark HTS system for neutral speech. The results for the prosodic prominence modelling showed that it was perceived as contextually appropriate as the Benchmark HTS system, despite a low naturalness score. The Blizzard challenge evaluation has provided valuable information on the status of our work and continued work will begin with analysing why our modifications resulted in reduced naturalness compared to the Benchmark HTS system.},
    categories = {HMM, HTS, speech synthesis, LF-model, glottal source, prosodic prominence, emphasis},
  }
  % NOTE(review): the entry header and closing brace were missing from this record; the citation key is taken from the record's own key field and the entry type is inferred from the booktitle field.
  % NOTE(review): the abstract below describes articulatory-feature control and never mentions glottal spectral separation, so it was probably copied from a different paper -- verify against the published paper before relying on it.
  @inproceedings{cabral:renals:richmond:yamagishi:2008a,
    author     = {Cabral, J. and Renals, S. and Richmond, K. and Yamagishi, J.},
    title      = {Glottal Spectral Separation for Parametric Speech Synthesis},
    booktitle  = {Proc. Interspeech},
    address    = {Brisbane, Australia},
    month      = {September},
    pages      = {1829--1832},
    key        = {cabral:renals:richmond:yamagishi:2008a},
    year       = {2008},
    pdf        = {},
    abstract   = {This paper presents a method to control the characteristics of synthetic speech flexibly by integrating articulatory features into a Hidden Markov Model (HMM)-based parametric speech synthesis system. In contrast to model adaptation and interpolation approaches for speaking style control, this method is driven by phonetic knowledge, and target speech samples are not required. The joint distribution of parallel acoustic and articulatory features considering cross-stream feature dependency is estimated. At synthesis time, acoustic and articulatory features are generated simultaneously based on the maximum-likelihood criterion. The synthetic speech can be controlled flexibly by modifying the generated articulatory features according to arbitrary phonetic rules in the parameter generation process. Our experiments show that the proposed method is effective in both changing the overall character of synthesized speech and in controlling the quality of a specific vowel.},
    categories = {HMM speech synthesis, Glottal Spectral Separation, LF-model},
  }
  % NOTE(review): the entry header and closing brace were missing from this record; the entry type (inferred from the booktitle field) and the citation key below are reconstructed -- confirm the key against the documents that cite it.
  @inproceedings{cabral:renals:richmond:yamagishi:2009,
    author     = {Cabral, J. and Renals, S. and Richmond, K. and Yamagishi, J.},
    title      = {{HMM}-based Speech Synthesis with an Acoustic Glottal Source Model},
    booktitle  = {Proc. The First Young Researchers Workshop in Speech Technology},
    month      = {April},
    year       = {2009},
    pdf        = {},
    abstract   = {A major cause of degradation of speech quality in HMM-based speech synthesis is the use of a simple delta pulse signal to generate the excitation of voiced speech. This paper describes a new approach to using an acoustic glottal source model in HMM-based synthesisers. The goal is to improve speech quality and parametric flexibility to better model and transform voice characteristics.},
    categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral Separation},
  }
  % NOTE(review): the entry header and closing brace were missing from this record; the entry type (inferred from the booktitle field) and the citation key below are reconstructed -- confirm the key against the documents that cite it.
  @inproceedings{cabral:renals:yamagishi:richmond:2011,
    author     = {Cabral, J. P. and Renals, S. and Yamagishi, J. and Richmond, K.},
    doi        = {10.1109/ICASSP.2011.5947405},
    title      = {{HMM}-based speech synthesiser using the {LF}-model of the glottal source},
    booktitle  = {2011 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
    issn       = {1520-6149},
    month      = {May},
    pages      = {4704--4707},
    year       = {2011},
    pdf        = {},
    abstract   = {A major factor which causes a deterioration in speech quality in {HMM}-based speech synthesis is the use of a simple delta pulse signal to generate the excitation of voiced speech. This paper sets out a new approach to using an acoustic glottal source model in HMM-based synthesisers instead of the traditional pulse signal. The goal is to improve speech quality and to better model and transform voice characteristics. We have found the new method decreases buzziness and also improves prosodic modelling. A perceptual evaluation has supported this finding by showing a 55.6\% preference for the new system, as against the baseline. This improvement, while not being as significant as we had initially expected, does encourage us to work on developing the proposed speech synthesiser further.},
    categories = {HMM-based speech synthesiser, acoustic glottal source model, LF-model, delta pulse signal, perceptual evaluation, prosodic modelling, speech quality, voiced speech generation, hidden Markov models, speech synthesis},
  }
  % NOTE(review): the entry header and closing brace were missing from this record; the entry type (inferred from the booktitle field) and the citation key below are reconstructed -- confirm the key against the documents that cite it.
  @inproceedings{cabral:renals:richmond:yamagishi:2007,
    author     = {Cabral, J. and Renals, S. and Richmond, K. and Yamagishi, J.},
    title      = {Towards an Improved Modeling of the Glottal Source in Statistical Parametric Speech Synthesis},
    booktitle  = {Proc. of the 6th ISCA Workshop on Speech Synthesis},
    address    = {Bonn, Germany},
    year       = {2007},
    pdf        = {},
    abstract   = {This paper proposes the use of the Liljencrants-Fant model (LF-model) to represent the glottal source signal in HMM-based speech synthesis systems. These systems generally use a pulse train to model the periodicity of the excitation signal of voiced speech. However, this model produces a strong and uniform harmonic structure throughout the spectrum of the excitation which makes the synthetic speech sound buzzy. The use of a mixed band excitation and phase manipulation reduces this effect but it can result in degradation of the speech quality if the noise component is not weighted carefully. In turn, the LF-waveform has a decaying spectrum at higher frequencies, which is more similar to the real glottal source excitation signal. We conducted a perceptual experiment to test the hypothesis that the LF-model can perform as well as or better than the pulse train in a HMM-based speech synthesizer. In the synthesis, we used the mean values of the LF-parameters, calculated by measurements of the recorded speech. The result of this study is important not only regarding the improvement in speech quality of these type of systems, but also because the LF-model can be used to model many characteristics of the glottal source, such as voice quality, which are important for voice transformation and generation of expressive speech.},
    categories = {LF-model, Statistical parametric speech synthesis, HMM-based speech synthesis},
  }
  % NOTE(review): the entry header and closing brace were missing from this record; the entry type (inferred from the journal, volume and number fields) and the citation key below are reconstructed -- confirm the key against the documents that cite it.
  @article{cabral:richmond:yamagishi:renals:2014,
    author   = {Cabral, J. P. and Richmond, K. and Yamagishi, J. and Renals, S.},
    doi      = {10.1109/JSTSP.2014.2307274},
    title    = {Glottal Spectral Separation for Speech Synthesis},
    journal  = {{IEEE} Journal of Selected Topics in Signal Processing},
    issn     = {1932-4553},
    number   = {2},
    month    = {April},
    volume   = {8},
    pages    = {195--208},
    year     = {2014},
    keywords = {Analytical models;Computational modeling;Estimation;Hidden Markov models;Mathematical model;Speech;Speech synthesis;Glottal spectral separation;LF-model;parametric speech synthesis;voice quality transformation},
    pdf      = {},
    abstract = {This paper proposes an analysis method to separate the glottal source and vocal tract components of speech that is called Glottal Spectral Separation (GSS). This method can produce high-quality synthetic speech using an acoustic glottal source model. In the source-filter models commonly used in speech technology applications it is assumed the source is a spectrally flat excitation signal and the vocal tract filter can be represented by the spectral envelope of speech. Although this model can produce high-quality speech, it has limitations for voice transformation because it does not allow control over glottal parameters which are correlated with voice quality. The main problem with using a speech model that better represents the glottal source and the vocal tract filter is that current analysis methods for separating these components are not robust enough to produce the same speech quality as using a model based on the spectral envelope of speech. The proposed GSS method is an attempt to overcome this problem, and consists of the following three steps. Initially, the glottal source signal is estimated from the speech signal. Then, the speech spectrum is divided by the spectral envelope of the glottal source signal in order to remove the glottal source effects from the speech signal. Finally, the vocal tract transfer function is obtained by computing the spectral envelope of the resulting signal. In this work, the glottal source signal is represented using the Liljencrants-Fant model (LF-model). The experiments we present here show that the analysis-synthesis technique based on GSS can produce speech comparable to that of a high-quality vocoder that is based on the spectral envelope representation. However, it also permits control over voice qualities, namely to transform a modal voice into breathy and tense, by modifying the glottal parameters.},
  }