The Centre for Speech Technology Research, The University of Edinburgh

Publications by Cassia Valentini-Botinhao

s0968719.bib

% ICASSP 2012 paper. Only the proper noun and the acronym in the title are
% brace-protected, so the bibliography style stays in control of casing.
@inproceedings{CassiaICASSP12,
  author = {Valentini-Botinhao, C. and Maia, R. and Yamagishi, J.
                   and King, S. and Zen, H.},
  title = {Cepstral analysis based on the {Glimpse} proportion
                   measure for improving the intelligibility of
                   {HMM}-based synthetic speech in noise},
  booktitle = {Proc. ICASSP},
  pages = {3997--4000},
  address = {Kyoto, Japan},
  abstract = {In this paper we introduce a new cepstral coefficient
                   extraction method based on an intelligibility measure
                   for speech in noise, the Glimpse Proportion measure.
                   This new method aims to increase the intelligibility of
                   speech in noise by modifying the clean speech, and has
                   applications in scenarios such as public announcement
                   and car navigation systems. We first explain how the
                   Glimpse Proportion measure operates and further show
                   how we approximated it to integrate it into an existing
                   spectral envelope parameter extraction method commonly
                   used in the HMM-based speech synthesis framework. We
                   then demonstrate how this new method changes the
                   modelled spectrum according to the characteristics of
                   the noise and show results for a listening test with
                   vocoded and HMM-based synthetic speech. The test
                   indicates that the proposed method can significantly
                   improve intelligibility of synthetic speech in speech
                   shaped noise.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement, speech analysis},
  doi = {10.1109/ICASSP.2012.6288794},
  month = mar,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_ICASSP12.pdf},
  year = 2012
}
% Interspeech 2011 paper. Dashes in the abstract are written as LaTeX "--"
% so the entry stays ASCII-only and is safe for classic 8-bit BibTeX.
@inproceedings{Cassia_IS11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                   King, Simon},
  title = {Can Objective Measures Predict the Intelligibility of
                   Modified {HMM}-based Synthetic Speech in Noise?},
  booktitle = {Proc. Interspeech},
  abstract = {Synthetic speech can be modified to improve
                   intelligibility in noise. In order to perform
                   modifications automatically, it would be useful to have
                   an objective measure that could predict the
                   intelligibility of modified synthetic speech for human
                   listeners. We analysed the impact on intelligibility
                   -- and on how well objective measures predict it --
                   when we separately modify speaking rate, fundamental
                   frequency, line spectral pairs and spectral peaks.
                   Shifting LSPs can increase intelligibility for human
                   listeners; other modifications had weaker effects.
                   Among the objective measures we evaluated, the Dau
                   model and the Glimpse proportion were the best
                   predictors of human performance.},
  categories = {HMM-based speech synthesis, objective measures of
                   intelligibility},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_IS11.pdf},
  year = 2011
}
% ICASSP 2011 paper. The booktitle uses the same short "Proc. <venue>" form
% as the other conference entries in this file.
@inproceedings{Cassia_ICASSP11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                   King, Simon},
  title = {Evaluation of objective measures for intelligibility
                   prediction of {HMM}-based synthetic speech in noise},
  booktitle = {Proc. ICASSP},
  pages = {5112--5115},
  abstract = {In this paper we evaluate four objective measures of
                   speech with regards to intelligibility prediction of
                   synthesized speech in diverse noisy situations. We
                   evaluated three intelligibility measures, the Dau
                   measure, the glimpse proportion and the Speech
                   Intelligibility Index (SII) and a quality measure, the
                   Perceptual Evaluation of Speech Quality (PESQ). For the
                   generation of synthesized speech we used a state of the
                   art HMM-based speech synthesis system. The noisy
                   conditions comprised four additive noises. The measures
                   were compared with subjective intelligibility scores
                   obtained in listening tests. The results show the Dau
                   and the glimpse measures to be the best predictors of
                   intelligibility, with correlations of around 0.83 to
                   subjective scores. All measures gave less accurate
                   predictions of intelligibility for synthetic speech
                   than have previously been found for natural speech; in
                   particular the SII measure. In additional experiments,
                   we processed the synthesized speech by an ideal binary
                   mask before adding noise. The Glimpse measure gave the
                   most accurate intelligibility predictions in this
                   situation.},
  categories = {HMM-based speech synthesis, objective measures of
                   intelligibility},
  doi = {10.1109/ICASSP.2011.5947507},
  issn = {1520-6149},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_ICASSP11.pdf},
  year = 2011
}
% SAPA workshop 2012 paper. Only the acronym in the title is brace-protected,
% so the bibliography style stays in control of casing.
@inproceedings{CassiaSAPA12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {Speech intelligibility enhancement for {HMM}-based
                   synthetic speech in noise},
  booktitle = {Proc. Sapa Workshop},
  address = {Portland, USA},
  abstract = {It is possible to increase the intelligibility of
                   speech in noise by enhancing the clean speech signal.
                   In this paper we demonstrate the effects of modifying
                   the spectral envelope of synthetic speech according to
                   the environmental noise. To achieve this, we modify Mel
                   cepstral coefficients according to an intelligibility
                   measure that accounts for glimpses of speech in noise:
                   the Glimpse Proportion measure. We evaluate this method
                   against a baseline synthetic voice trained only with
                   normal speech and a topline voice trained with Lombard
                   speech, as well as natural speech. The intelligibility
                   of these voices was measured when mixed with
                   speech-shaped noise and with a competing speaker at
                   three different levels. The Lombard voices, both
                   natural and synthetic, were more intelligible than the
                   normal voices in all conditions. For speech-shaped
                   noise, the proposed modified voice was as intelligible
                   as the Lombard synthetic voice without requiring any
                   recordings of Lombard speech, which are hard to obtain.
                   However, in the case of competing talker noise, the
                   Lombard synthetic voice was more intelligible than the
                   proposed modified voice.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Sapa12.pdf},
  year = 2012
}
% LISTA workshop 2012 paper. Only the acronym in the title is brace-protected,
% so the bibliography style stays in control of casing.
@inproceedings{CassiaLista12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {Using an intelligibility measure to create noise
                   robust cepstral coefficients for {HMM}-based speech
                   synthesis},
  booktitle = {Proc. LISTA Workshop},
  address = {Edinburgh, UK},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Lista12.pdf},
  year = 2012
}
% WOCCI 2012 paper. Percent signs in the abstract are escaped so the text
% can be typeset by LaTeX without the rest of the line being commented out.
@inproceedings{CassiaWocci12,
  author = {Valentini-Botinhao, C. and Degenkolb-Weyers, S. and
                   Maier, A. and Noeth, E. and Eysholdt, U. and Bocklet,
                   T.},
  title = {Automatic detection of sigmatism in children},
  booktitle = {Proc. WOCCI},
  address = {Portland, USA},
  abstract = {We propose in this paper an automatic system to detect
                   sigmatism from the speech signal. Sigmatism occurs when
                   the tongue is positioned incorrectly during
                   articulation of sibilant phones like /s/ and /z/. For
                   our task we extracted various sets of features from
                   speech: Mel frequency cepstral coefficients, energies
                   in specific bandwidths of the spectral envelope, and
                   the so-called supervectors, which are the parameters of
                   an adapted speaker model. We then trained several
                   classifiers on a speech database of German adults
                   simulating three different types of sigmatism.
                   Recognition results were calculated at a phone, word
                   and speaker level for both the simulated database and
                   for a database of pathological speakers. For the
                   simulated database, we achieved recognition rates of up
                   to 86\%, 87\% and 94\% at a phone, word and speaker level.
                   The best classifier was then integrated as part of a
                   Java applet that allows patients to record their own
                   speech, either by pronouncing isolated phones, a
                   specific word or a list of words, and provides them
                   with a feedback whether the sibilant phones are being
                   correctly pronounced.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_WOCCI12.pdf},
  year = 2012
}
% Interspeech 2012 paper. Only the capitalised measure name and the acronym
% in the title are brace-protected, so the style stays in control of casing.
@inproceedings{CassiaIS12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {Mel cepstral coefficient modification based on the
                   {Glimpse} {Proportion} measure for improving the
                   intelligibility of {HMM}-generated synthetic speech in
                   noise},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  abstract = {We propose a method that modifies the Mel cepstral
                   coefficients of HMM-generated synthetic speech in order
                   to increase the intelligibility of the generated speech
                   when heard by a listener in the presence of a known
                   noise. This method is based on an approximation we
                   previously proposed for the Glimpse Proportion measure.
                   Here we show how to update the Mel cepstral
                   coefficients using this measure as an optimization
                   criterion and how to control the amount of distortion
                   by limiting the frequency resolution of the
                   modifications. To evaluate the method we built eight
                   different voices from normal read-text speech data from
                   a male speaker. Some voices were also built from
                   Lombard speech data produced by the same speaker.
                   Listening experiments with speech-shaped noise and with
                   a single competing talker indicate that our method
                   significantly improves intelligibility when compared to
                   unmodified synthetic speech. The voices built from
                   Lombard speech outperformed the proposed method
                   particularly for the competing talker case. However,
                   compared to a voice using only the spectral parameters
                   from Lombard speech, the proposed method obtains
                   similar or higher performance.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement, Mel cepstral coefficients},
  month = sep,
  year = 2012
}