The Centre for Speech Technology Research, The University of Edinburgh

Publications by Joao Cabral

jscabral.bib

% CSTR entry paper for the Blizzard Challenge 2009: LF glottal source model
% and prosodic prominence modelling in an HMM-based synthesiser.
@inproceedings{anderssoncabral09,
  author     = {Andersson, J. Sebastian and Cabral, Joao P. and
                Badino, Leonardo and Yamagishi, Junichi and
                Clark, Robert A. J.},
  title      = {Glottal Source and Prosodic Prominence Modelling in
                {HMM}-based Speech Synthesis for the {Blizzard}
                {Challenge} 2009},
  booktitle  = {The Blizzard Challenge 2009},
  address    = {Edinburgh, U.K.},
  abstract   = {This paper describes the CSTR entry for the Blizzard
                Challenge 2009. The work focused on modifying two parts
                of the Nitech 2005 HTS speech synthesis system to
                improve naturalness and contextual appropriateness. The
                first part incorporated an implementation of the
                Liljencrants-Fant (LF) glottal source model. The second
                part focused on improving synthesis of prosodic
                prominence including emphasis through context dependent
                phonemes. Emphasis was assigned to the synthesised test
                sentences based on a handful of theory based rules. The
                two parts (LF-model and prosodic prominence) were not
                combined and hence evaluated separately. The results on
                naturalness for the LF-model showed that it is not yet
                perceived as natural as the Benchmark HTS system for
                neutral speech. The results for the prosodic prominence
                modelling showed that it was perceived as contextually
                appropriate as the Benchmark HTS system, despite a low
                naturalness score. The Blizzard challenge evaluation
                has provided valuable information on the status of our
                work and continued work will begin with analysing why
                our modifications resulted in reduced naturalness
                compared to the Benchmark HTS system.},
  categories = {HMM, HTS, speech synthesis, LF-model, glottal source,
                prosodic prominence, emphasis},
  month      = sep,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
  year       = {2009}
}
% Interspeech 2008 paper on Glottal Spectral Separation.
% The stored abstract does not match the title; see the internal-note field.
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
  author        = {Cabral, J. and Renals, S. and Richmond, K. and
                   Yamagishi, J.},
  title         = {Glottal Spectral Separation for Parametric Speech
                   Synthesis},
  booktitle     = {Proc. Interspeech},
  pages         = {1829--1832},
  address       = {Brisbane, Australia},
  abstract      = {This paper presents a method to control the
                   characteristics of synthetic speech flexibly by
                   integrating articulatory features into a Hidden Markov
                   Model (HMM)-based parametric speech synthesis system.
                   In contrast to model adaptation and interpolation
                   approaches for speaking style control, this method is
                   driven by phonetic knowledge, and target speech samples
                   are not required. The joint distribution of parallel
                   acoustic and articulatory features considering
                   cross-stream feature dependency is estimated. At
                   synthesis time, acoustic and articulatory features are
                   generated simultaneously based on the
                   maximum-likelihood criterion. The synthetic speech can
                   be controlled flexibly by modifying the generated
                   articulatory features according to arbitrary phonetic
                   rules in the parameter generation process. Our
                   experiments show that the proposed method is effective
                   in both changing the overall character of synthesized
                   speech and in controlling the quality of a specific
                   vowel.},
  internal-note = {NOTE(review): the abstract describes articulatory-feature
                   control of HMM-based synthesis and never mentions glottal
                   spectral separation or the LF-model named in the title
                   and categories. It appears to belong to a different
                   paper. Verify against the PDF and replace.},
  categories    = {HMM speech synthesis, Glottal Spectral Separation,
                   LF-model},
  key           = {cabral:renals:richmond:yamagishi:2008a},
  month         = sep,
  pdf           = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
  year          = {2008}
}
% Workshop paper: acoustic glottal source model as the excitation in
% HMM-based speech synthesis (replacing the delta pulse signal).
@inproceedings{cabral_yrwst,
  author     = {Cabral, J. and Renals, S. and Richmond, K. and Yamagishi, J.},
  title      = {{HMM}-based Speech Synthesis with an Acoustic Glottal Source Model},
  booktitle  = {Proc. The First Young Researchers Workshop in Speech Technology},
  year       = {2009},
  month      = apr,
  abstract   = {A major cause of degradation of speech quality in HMM-based
                speech synthesis is the use of a simple delta pulse signal to
                generate the excitation of voiced speech. This paper describes
                a new approach to using an acoustic glottal source model in
                HMM-based synthesisers. The goal is to improve speech quality
                and parametric flexibility to better model and transform voice
                characteristics.},
  categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral Separation},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf}
}
% ICASSP 2011 paper: HMM-based synthesiser driven by the LF-model of the
% glottal source instead of a delta pulse excitation.
@inproceedings{cabral2011a,
  author     = {Cabral, J. P. and Renals, S. and Yamagishi, J. and
                Richmond, K.},
  title      = {{HMM}-based speech synthesiser using the {LF}-model of
                the glottal source},
  booktitle  = {Proc. {IEEE} International Conference on Acoustics,
                Speech and Signal Processing ({ICASSP})},
  pages      = {4704--4707},
  abstract   = {A major factor which causes a deterioration in speech
                quality in {HMM}-based speech synthesis is the use of a
                simple delta pulse signal to generate the excitation of
                voiced speech. This paper sets out a new approach to
                using an acoustic glottal source model in HMM-based
                synthesisers instead of the traditional pulse signal.
                The goal is to improve speech quality and to better
                model and transform voice characteristics. We have
                found the new method decreases buzziness and also
                improves prosodic modelling. A perceptual evaluation
                has supported this finding by showing a 55.6\%
                preference for the new system, as against the baseline.
                This improvement, while not being as significant as we
                had initially expected, does encourage us to work on
                developing the proposed speech synthesiser further.},
  categories = {HMM-based speech synthesiser;acoustic glottal source
                model LF-model;delta pulse signal;perceptual
                evaluation;prosodic modelling;speech quality;voiced
                speech generation;hidden Markov models;speech
                synthesis;},
  doi        = {10.1109/ICASSP.2011.5947405},
  issn       = {1520-6149},
  month      = may,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
  year       = {2011}
}
% 6th ISCA Speech Synthesis Workshop paper: proposes the Liljencrants-Fant
% model for the glottal source in HMM-based speech synthesis.
@inproceedings{cabral07,
  author     = {Cabral, J. and Renals, S. and Richmond, K. and
                Yamagishi, J.},
  title      = {Towards an Improved Modeling of the Glottal Source in
                Statistical Parametric Speech Synthesis},
  booktitle  = {Proc. of the 6th {ISCA} Workshop on Speech Synthesis},
  address    = {Bonn, Germany},
  abstract   = {This paper proposes the use of the Liljencrants-Fant
                model (LF-model) to represent the glottal source signal
                in HMM-based speech synthesis systems. These systems
                generally use a pulse train to model the periodicity of
                the excitation signal of voiced speech. However, this
                model produces a strong and uniform harmonic structure
                throughout the spectrum of the excitation which makes
                the synthetic speech sound buzzy. The use of a mixed
                band excitation and phase manipulation reduces this
                effect but it can result in degradation of the speech
                quality if the noise component is not weighted
                carefully. In turn, the LF-waveform has a decaying
                spectrum at higher frequencies, which is more similar
                to the real glottal source excitation signal. We
                conducted a perceptual experiment to test the
                hypothesis that the LF-model can perform as well as or
                better than the pulse train in a HMM-based speech
                synthesizer. In the synthesis, we used the mean values
                of the LF-parameters, calculated by measurements of the
                recorded speech. The result of this study is important
                not only regarding the improvement in speech quality of
                these type of systems, but also because the LF-model
                can be used to model many characteristics of the
                glottal source, such as voice quality, which are
                important for voice transformation and generation of
                expressive speech.},
  categories = {LF-model, Statistical parametric speech synthesis,
                HMM-based speech synthesis},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
  year       = {2007}
}