Publications by Joao Cabral
jscabral.bib
@comment{CSTR system description for the Blizzard Challenge 2009: LF glottal
source model and prosodic prominence modelling on top of an HTS system.}
@inproceedings{anderssoncabral09,
  author     = {Andersson, J. Sebastian and Cabral, Joao P. and
                Badino, Leonardo and Yamagishi, Junichi and
                Clark, Robert A. J.},
  title      = {Glottal Source and Prosodic Prominence Modelling in
                {HMM}-based Speech Synthesis for the {Blizzard}
                {Challenge} 2009},
  booktitle  = {The Blizzard Challenge 2009},
  address    = {Edinburgh, U.K.},
  abstract   = {This paper describes the CSTR entry for the Blizzard
                Challenge 2009. The work focused on modifying two parts
                of the Nitech 2005 HTS speech synthesis system to
                improve naturalness and contextual appropriateness. The
                first part incorporated an implementation of the
                Liljencrants-Fant (LF) glottal source model. The second
                part focused on improving synthesis of prosodic
                prominence including emphasis through context dependent
                phonemes. Emphasis was assigned to the synthesised test
                sentences based on a handful of theory based rules. The
                two parts (LF-model and prosodic prominence) were not
                combined and hence evaluated separately. The results on
                naturalness for the LF-model showed that it is not yet
                perceived as natural as the Benchmark HTS system for
                neutral speech. The results for the prosodic prominence
                modelling showed that it was perceived as contextually
                appropriate as the Benchmark HTS system, despite a low
                naturalness score. The Blizzard challenge evaluation
                has provided valuable information on the status of our
                work and continued work will begin with analysing why
                our modifications resulted in reduced naturalness
                compared to the Benchmark HTS system.},
  categories = {HMM, HTS, speech synthesis, LF-model, glottal source,
                prosodic prominence, emphasis},
  month      = sep,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
  year       = 2009
}
@comment{NOTE(review): the abstract of the following entry describes
articulatory-feature control of HMM-based synthesis and never mentions
glottal spectral separation or the LF-model named in the title and
categories. It looks like it was copied from a different paper. Confirm
against the linked PDF and replace the abstract text.}
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
  author     = {Cabral, J. and Renals, S. and Richmond, K. and
                Yamagishi, J.},
  title      = {Glottal Spectral Separation for Parametric Speech
                Synthesis},
  booktitle  = {Proc. Interspeech},
  pages      = {1829--1832},
  address    = {Brisbane, Australia},
  abstract   = {This paper presents a method to control the
                characteristics of synthetic speech flexibly by
                integrating articulatory features into a Hidden Markov
                Model (HMM)-based parametric speech synthesis system.
                In contrast to model adaptation and interpolation
                approaches for speaking style control, this method is
                driven by phonetic knowledge, and target speech samples
                are not required. The joint distribution of parallel
                acoustic and articulatory features considering
                cross-stream feature dependency is estimated. At
                synthesis time, acoustic and articulatory features are
                generated simultaneously based on the
                maximum-likelihood criterion. The synthetic speech can
                be controlled flexibly by modifying the generated
                articulatory features according to arbitrary phonetic
                rules in the parameter generation process. Our
                experiments show that the proposed method is effective
                in both changing the overall character of synthesized
                speech and in controlling the quality of a specific
                vowel.},
  categories = {HMM speech synthesis, Glottal Spectral Separation,
                LF-model},
  key        = {cabral:renals:richmond:yamagishi:2008a},
  month      = sep,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
  year       = 2008
}
@comment{Workshop paper on using an acoustic glottal source model for the
voiced excitation in HMM-based synthesisers.}
@inproceedings{cabral_yrwst,
  author     = {Cabral, J. and Renals, S. and Richmond, K. and
                Yamagishi, J.},
  title      = {{HMM}-based Speech Synthesis with an Acoustic Glottal
                Source Model},
  booktitle  = {Proc. The First Young Researchers Workshop in Speech
                Technology},
  year       = {2009},
  month      = apr,
  abstract   = {A major cause of degradation of speech quality in
                HMM-based speech synthesis is the use of a simple delta
                pulse signal to generate the excitation of voiced
                speech. This paper describes a new approach to using an
                acoustic glottal source model in HMM-based
                synthesisers. The goal is to improve speech quality and
                parametric flexibility to better model and transform
                voice characteristics.},
  categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral
                Separation},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf}
}
@comment{ICASSP 2011 paper on an HMM-based synthesiser driven by the LF-model.
The percent sign in the abstract is escaped so the field stays valid LaTeX
if a bibliography style typesets it.}
@inproceedings{cabral2011a,
  author     = {Cabral, J. P. and Renals, S. and Yamagishi, J. and
                Richmond, K.},
  title      = {{HMM}-based speech synthesiser using the {LF}-model of
                the glottal source},
  booktitle  = {Proc. {IEEE} International Conference on Acoustics,
                Speech and Signal Processing ({ICASSP})},
  pages      = {4704--4707},
  abstract   = {A major factor which causes a deterioration in speech
                quality in {HMM}-based speech synthesis is the use of a
                simple delta pulse signal to generate the excitation of
                voiced speech. This paper sets out a new approach to
                using an acoustic glottal source model in HMM-based
                synthesisers instead of the traditional pulse signal.
                The goal is to improve speech quality and to better
                model and transform voice characteristics. We have
                found the new method decreases buzziness and also
                improves prosodic modelling. A perceptual evaluation
                has supported this finding by showing a 55.6\%
                preference for the new system, as against the baseline.
                This improvement, while not being as significant as we
                had initially expected, does encourage us to work on
                developing the proposed speech synthesiser further.},
  categories = {HMM-based speech synthesiser;acoustic glottal source
                model LF-model;delta pulse signal;perceptual
                evaluation;prosodic modelling;speech quality;voiced
                speech generation;hidden Markov models;speech
                synthesis;},
  doi        = {10.1109/ICASSP.2011.5947405},
  issn       = {1520-6149},
  month      = may,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
  year       = 2011
}
@comment{Paper proposing the LF-model as the glottal source signal in
HMM-based synthesis. NOTE(review): no pages or month are recorded for this
entry; add them if known.}
@inproceedings{cabral07,
  author     = {Cabral, J. and Renals, S. and Richmond, K. and
                Yamagishi, J.},
  title      = {Towards an Improved Modeling of the Glottal Source in
                Statistical Parametric Speech Synthesis},
  booktitle  = {Proc. of the 6th {ISCA} Workshop on Speech Synthesis},
  address    = {Bonn, Germany},
  abstract   = {This paper proposes the use of the Liljencrants-Fant
                model (LF-model) to represent the glottal source signal
                in HMM-based speech synthesis systems. These systems
                generally use a pulse train to model the periodicity of
                the excitation signal of voiced speech. However, this
                model produces a strong and uniform harmonic structure
                throughout the spectrum of the excitation which makes
                the synthetic speech sound buzzy. The use of a mixed
                band excitation and phase manipulation reduces this
                effect but it can result in degradation of the speech
                quality if the noise component is not weighted
                carefully. In turn, the LF-waveform has a decaying
                spectrum at higher frequencies, which is more similar
                to the real glottal source excitation signal. We
                conducted a perceptual experiment to test the
                hypothesis that the LF-model can perform as well as or
                better than the pulse train in a HMM-based speech
                synthesizer. In the synthesis, we used the mean values
                of the LF-parameters, calculated by measurements of the
                recorded speech. The result of this study is important
                not only regarding the improvement in speech quality of
                these type of systems, but also because the LF-model
                can be used to model many characteristics of the
                glottal source, such as voice quality, which are
                important for voice transformation and generation of
                expressive speech.},
  categories = {LF-model, Statistical parametric speech synthesis,
                HMM-based speech synthesis},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
  year       = 2007
}