Publications by Cassia Valentini-Botinhao
s0968719.bib
% ICASSP 2012 paper: cepstral coefficient extraction driven by the
% Glimpse Proportion intelligibility measure.
@inproceedings{CassiaICASSP12,
  author     = {Valentini-Botinhao, Cassia and Maia, R. and
                Yamagishi, Junichi and King, Simon and Zen, H.},
  title      = {Cepstral analysis based on the {Glimpse} proportion
                measure for improving the intelligibility of
                {HMM}-based synthetic speech in noise},
  booktitle  = {Proc. ICASSP},
  pages      = {3997--4000},
  address    = {Kyoto, Japan},
  abstract   = {In this paper we introduce a new cepstral coefficient
                extraction method based on an intelligibility measure
                for speech in noise, the Glimpse Proportion measure.
                This new method aims to increase the intelligibility of
                speech in noise by modifying the clean speech, and has
                applications in scenarios such as public announcement
                and car navigation systems. We first explain how the
                Glimpse Proportion measure operates and further show
                how we approximated it to integrate it into an existing
                spectral envelope parameter extraction method commonly
                used in the HMM-based speech synthesis framework. We
                then demonstrate how this new method changes the
                modelled spectrum according to the characteristics of
                the noise and show results for a listening test with
                vocoded and HMM-based synthetic speech. The test
                indicates that the proposed method can significantly
                improve intelligibility of synthetic speech in speech
                shaped noise.},
  categories = {HMM-based speech synthesis, intelligibility
                enhancement, speech analysis},
  doi        = {10.1109/ICASSP.2012.6288794},
  month      = mar,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_ICASSP12.pdf},
  year       = 2012
}
% Interspeech 2011 paper: how well objective measures predict the
% intelligibility of modified HMM-based synthetic speech in noise.
@inproceedings{Cassia_IS11,
  author     = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                King, Simon},
  title      = {Can Objective Measures Predict the Intelligibility of
                Modified {HMM}-based Synthetic Speech in Noise?},
  booktitle  = {Proc. Interspeech},
  abstract   = {Synthetic speech can be modified to improve
                intelligibility in noise. In order to perform
                modifications automatically, it would be useful to have
                an objective measure that could predict the
                intelligibility of modified synthetic speech for human
                listeners. We analysed the impact on intelligibility
                -- and on how well objective measures predict it --
                when we separately modify speaking rate, fundamental
                frequency, line spectral pairs and spectral peaks.
                Shifting LSPs can increase intelligibility for human
                listeners; other modifications had weaker effects.
                Among the objective measures we evaluated, the Dau
                model and the Glimpse proportion were the best
                predictors of human performance.},
  categories = {HMM-based speech synthesis, objective measures of
                intelligibility},
  month      = aug,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_IS11.pdf},
  year       = 2011
}
% ICASSP 2011 paper: evaluation of objective measures (Dau, glimpse
% proportion, SII, PESQ) as intelligibility predictors for synthetic speech.
@inproceedings{Cassia_ICASSP11,
  author     = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                King, Simon},
  title      = {Evaluation of objective measures for intelligibility
                prediction of {HMM}-based synthetic speech in noise},
  booktitle  = {Proc. ICASSP},
  pages      = {5112--5115},
  abstract   = {In this paper we evaluate four objective measures of
                speech with regards to intelligibility prediction of
                synthesized speech in diverse noisy situations. We
                evaluated three intelligibility measures, the Dau
                measure, the glimpse proportion and the Speech
                Intelligibility Index (SII) and a quality measure, the
                Perceptual Evaluation of Speech Quality (PESQ). For the
                generation of synthesized speech we used a state of the
                art HMM-based speech synthesis system. The noisy
                conditions comprised four additive noises. The measures
                were compared with subjective intelligibility scores
                obtained in listening tests. The results show the Dau
                and the glimpse measures to be the best predictors of
                intelligibility, with correlations of around 0.83 to
                subjective scores. All measures gave less accurate
                predictions of intelligibility for synthetic speech
                than have previously been found for natural speech; in
                particular the SII measure. In additional experiments,
                we processed the synthesized speech by an ideal binary
                mask before adding noise. The Glimpse measure gave the
                most accurate intelligibility predictions in this
                situation.},
  categories = {HMM-based speech synthesis, objective measures of
                intelligibility},
  doi        = {10.1109/ICASSP.2011.5947507},
  issn       = {1520-6149},
  month      = may,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_ICASSP11.pdf},
  year       = 2011
}
% Sapa Workshop 2012 paper: noise-dependent spectral envelope modification
% of synthetic speech, compared against Lombard and natural voices.
@inproceedings{CassiaSAPA12,
  author     = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                King, Simon},
  title      = {Speech intelligibility enhancement for {HMM}-based
                synthetic speech in noise},
  booktitle  = {Proc. Sapa Workshop},
  address    = {Portland, USA},
  abstract   = {It is possible to increase the intelligibility of
                speech in noise by enhancing the clean speech signal.
                In this paper we demonstrate the effects of modifying
                the spectral envelope of synthetic speech according to
                the environmental noise. To achieve this, we modify Mel
                cepstral coefficients according to an intelligibility
                measure that accounts for glimpses of speech in noise:
                the Glimpse Proportion measure. We evaluate this method
                against a baseline synthetic voice trained only with
                normal speech and a topline voice trained with Lombard
                speech, as well as natural speech. The intelligibility
                of these voices was measured when mixed with
                speech-shaped noise and with a competing speaker at
                three different levels. The Lombard voices, both
                natural and synthetic, were more intelligible than the
                normal voices in all conditions. For speech-shaped
                noise, the proposed modified voice was as intelligible
                as the Lombard synthetic voice without requiring any
                recordings of Lombard speech, which are hard to obtain.
                However, in the case of competing talker noise, the
                Lombard synthetic voice was more intelligible than the
                proposed modified voice.},
  categories = {HMM-based speech synthesis, intelligibility
                enhancement},
  month      = sep,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Sapa12.pdf},
  year       = 2012
}
% LISTA Workshop 2012 paper: intelligibility-measure-based noise-robust
% cepstral coefficients for HMM-based speech synthesis.
@inproceedings{CassiaLista12,
  author     = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                King, Simon},
  title      = {Using an intelligibility measure to create noise
                robust cepstral coefficients for {HMM}-based speech
                synthesis},
  booktitle  = {Proc. LISTA Workshop},
  address    = {Edinburgh, UK},
  categories = {HMM-based speech synthesis, intelligibility
                enhancement},
  month      = may,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Lista12.pdf},
  year       = 2012
}
% WOCCI 2012 paper: automatic detection of sigmatism from the speech signal.
% Percent signs in the abstract are escaped so the text is safe to typeset.
@inproceedings{CassiaWocci12,
  author    = {Valentini-Botinhao, Cassia and Degenkolb-Weyers, S. and
               Maier, A. and Noeth, E. and Eysholdt, U. and Bocklet,
               T.},
  title     = {Automatic detection of sigmatism in children},
  booktitle = {Proc. WOCCI},
  address   = {Portland, USA},
  abstract  = {We propose in this paper an automatic system to detect
               sigmatism from the speech signal. Sigmatism occurs when
               the tongue is positioned incorrectly during
               articulation of sibilant phones like /s/ and /z/. For
               our task we extracted various sets of features from
               speech: Mel frequency cepstral coefficients, energies
               in specific bandwidths of the spectral envelope, and
               the so-called supervectors, which are the parameters of
               an adapted speaker model. We then trained several
               classifiers on a speech database of German adults
               simulating three different types of sigmatism.
               Recognition results were calculated at a phone, word
               and speaker level for both the simulated database and
               for a database of pathological speakers. For the
               simulated database, we achieved recognition rates of up
               to 86\%, 87\% and 94\% at a phone, word and speaker level.
               The best classifier was then integrated as part of a
               Java applet that allows patients to record their own
               speech, either by pronouncing isolated phones, a
               specific word or a list of words, and provides them
               with a feedback whether the sibilant phones are being
               correctly pronounced.},
  month     = sep,
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_WOCCI12.pdf},
  year      = 2012
}
% Interspeech 2012 paper: Mel cepstral coefficient modification using the
% Glimpse Proportion measure as the optimisation criterion.
@inproceedings{CassiaIS12,
  author     = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                King, Simon},
  title      = {Mel cepstral coefficient modification based on the
                {Glimpse Proportion} measure for improving the
                intelligibility of {HMM}-generated synthetic speech in
                noise},
  booktitle  = {Proc. Interspeech},
  address    = {Portland, USA},
  abstract   = {We propose a method that modifies the Mel cepstral
                coefficients of HMM-generated synthetic speech in order
                to increase the intelligibility of the generated speech
                when heard by a listener in the presence of a known
                noise. This method is based on an approximation we
                previously proposed for the Glimpse Proportion measure.
                Here we show how to update the Mel cepstral
                coefficients using this measure as an optimization
                criterion and how to control the amount of distortion
                by limiting the frequency resolution of the
                modifications. To evaluate the method we built eight
                different voices from normal read-text speech data from
                a male speaker. Some voices were also built from
                Lombard speech data produced by the same speaker.
                Listening experiments with speech-shaped noise and with
                a single competing talker indicate that our method
                significantly improves intelligibility when compared to
                unmodified synthetic speech. The voices built from
                Lombard speech outperformed the proposed method
                particularly for the competing talker case. However,
                compared to a voice using only the spectral parameters
                from Lombard speech, the proposed method obtains
                similar or higher performance.},
  categories = {HMM-based speech synthesis, intelligibility
                enhancement, Mel cepstral coefficients},
  month      = sep,
  year       = 2012
}