The Centre for Speech Technology Research, The University of Edinburgh

Publications by Rasmus Dall

s0836504.bib

% Interspeech 2012 paper: speaker clustering strategies for building average voice models in HMM-based speech synthesis.
@inproceedings{dallIS2012,
  author    = {Dall, Rasmus and Veaux, Christophe and Yamagishi, Junichi
               and King, Simon},
  title     = {Analysis of Speaker Clustering Strategies for
               {HMM}-Based Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address   = {Portland, Oregon, USA},
  abstract  = {This paper describes a method for speaker clustering,
                   with the application of building average voice models
                   for speaker-adaptive HMM-based speech synthesis that
                   are a good basis for adapting to specific target
                   speakers. Our main hypothesis is that using
                   perceptually similar speakers to build the average
                   voice model will be better than use unselected
                   speakers, even if the amount of data available from
                   perceptually similar speakers is smaller. We measure
                   the perceived similarities among a group of 30 female
                   speakers in a listening test and then apply multiple
                   linear regression to automatically predict these
                   listener judgements of speaker similarity and thus to
                   identify similar speakers automatically. We then
                   compare a variety of average voice models trained on
                   either speakers who were perceptually judged to be
                   similar to the target speaker, or speakers selected by
                   the multiple linear regression, or a large global set
                   of unselected speakers. We find that the average voice
                   model trained on perceptually similar speakers provides
                   better performance than the global model, even though
                   the latter is trained on more data, confirming our main
                   hypothesis. However, the average voice model using
                   speakers selected automatically by the multiple linear
                   regression does not reach the same level of
                   performance.},
  month     = sep,
  year      = {2012},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/DallIS2012.pdf},
}