The Centre for Speech Technology Research, The University of Edinburgh

Publications by Sebastian Andersson

janderss.bib

@inproceedings{anderssonetal2010,
  author     = {Andersson, Sebastian and Georgila, Kallirroi and
                Traum, David and Aylett, Matthew and Clark, Robert},
  title      = {Prediction and Realisation of Conversational Characteristics
                by Utilising Spontaneous Speech for Unit Selection},
  booktitle  = {Speech Prosody 2010},
  year       = {2010},
  month      = may,
  categories = {speech synthesis, unit selection, conversation,
                spontaneous speech, lexical fillers, filled pauses},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/100116.pdf},
  abstract   = {Unit selection speech synthesis has reached high levels of
                naturalness and intelligibility for neutral read aloud
                speech. However, synthetic speech generated using neutral
                read aloud data lacks all the attitude, intention and
                spontaneity associated with everyday conversations. Unit
                selection is heavily data dependent and thus in order to
                simulate human conversational speech, or create synthetic
                voices for believable virtual characters, we need to utilise
                speech data with examples of how people talk rather than how
                people read. In this paper we included carefully selected
                utterances from spontaneous conversational speech in a unit
                selection voice. Using this voice and by automatically
                predicting type and placement of lexical fillers and filled
                pauses we can synthesise utterances with conversational
                characteristics. A perceptual listening test showed that it
                is possible to make synthetic speech sound more
                conversational without degrading naturalness.},
}
@inproceedings{anderssoncabral09,
  author = {Andersson, J. Sebastian and Cabral, Joao P. and
                   Badino, Leonardo and Yamagishi, Junichi and
                   Clark, Robert A. J.},
  title = {Glottal Source and Prosodic Prominence Modelling in
                   {HMM}-based Speech Synthesis for the {Blizzard}
                   {Challenge} 2009},
  booktitle = {The Blizzard Challenge 2009},
  address = {Edinburgh, U.K.},
  abstract = {This paper describes the CSTR entry for the Blizzard
                   Challenge 2009. The work focused on modifying two parts
                   of the Nitech 2005 HTS speech synthesis system to
                   improve naturalness and contextual appropriateness. The
                   first part incorporated an implementation of the
                   Liljencrants-Fant (LF) glottal source model. The second
                   part focused on improving synthesis of prosodic
                   prominence including emphasis through context dependent
                   phonemes. Emphasis was assigned to the synthesised test
                   sentences based on a handful of theory based rules. The
                   two parts (LF-model and prosodic prominence) were not
                   combined and hence evaluated separately. The results on
                   naturalness for the LF-model showed that it is not yet
                   perceived as natural as the Benchmark HTS system for
                   neutral speech. The results for the prosodic prominence
                   modelling showed that it was perceived as contextually
                   appropriate as the Benchmark HTS system, despite a low
                   naturalness score. The Blizzard challenge evaluation
                   has provided valuable information on the status of our
                   work and continued work will begin with analysing why
                   our modifications resulted in reduced naturalness
                   compared to the Benchmark HTS system.},
  categories = {HMM, HTS, speech synthesis, LF-model, glottal source,
                   prosodic prominence, emphasis},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
  year = 2009
}
% NOTE(review): same work as entry anderssonyamagishi12 (identical DOI).
% Both keys are kept so existing citations keep resolving; cite only one
% of them per document to avoid a doubled bibliography item.
@article{Andersson2012175,
  author = {Andersson, Sebastian and Yamagishi, Junichi and
                   Clark, Robert A. J.},
  title = {Synthesis and evaluation of conversational
                   characteristics in {HMM}-based speech synthesis},
  journal = {Speech Communication},
  volume = {54},
  number = {2},
  pages = {175--188},
  abstract = {Spontaneous conversational speech has many
                   characteristics that are currently not modelled well by
                   HMM-based speech synthesis and in order to build
                   synthetic voices that can give an impression of someone
                   partaking in a conversation, we need to utilise data
                   that exhibits more of the speech phenomena associated
                   with conversations than the more generally used
                   carefully read aloud sentences. In this paper we show
                   that synthetic voices built with HMM-based speech
                   synthesis techniques from conversational speech data,
                   preserved segmental and prosodic characteristics of
                   frequent conversational speech phenomena. An analysis
                   of an evaluation investigating the perception of
                   quality and speaking style of HMM-based voices confirms
                   that speech with conversational characteristics are
                   instrumental for listeners to perceive successful
                   integration of conversational speech phenomena in
                   synthetic speech. The achieved synthetic speech quality
                   provides an encouraging start for the continued use of
                   conversational speech in HMM-based speech synthesis.},
  doi = {10.1016/j.specom.2011.08.001},
  issn = {0167-6393},
  keywords = {Speech synthesis, HMM, Conversation, Spontaneous
                   speech, Filled pauses, Discourse marker},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639311001178},
  year = 2012
}
% NOTE(review): same work as entry Andersson2012175 (identical DOI).
% Both keys are kept so existing citations keep resolving; cite only one
% of them per document to avoid a doubled bibliography item.
@article{anderssonyamagishi12,
  author = {Andersson, Sebastian and Yamagishi, Junichi and
                   Clark, Robert A. J.},
  title = {Synthesis and Evaluation of Conversational
                   Characteristics in {HMM}-Based Speech Synthesis},
  journal = {Speech Communication},
  volume = 54,
  number = 2,
  pages = {175--188},
  abstract = {Spontaneous conversational speech has many
                   characteristics that are currently not modelled well by
                   HMM-based speech synthesis and in order to build
                   synthetic voices that can give an impression of someone
                   partaking in a conversation, we need to utilise data
                   that exhibits more of the speech phenomena associated
                   with conversations than the more generally used
                   carefully read aloud sentences. In this paper we show
                   that synthetic voices built with HMM-based speech
                   synthesis techniques from conversational speech data,
                   preserved segmental and prosodic characteristics of
                   frequent conversational speech phenomena. An analysis
                   of an evaluation investigating the perception of
                   quality and speaking style of HMM-based voices confirms
                   that speech with conversational characteristics are
                   instrumental for listeners to perceive successful
                   integration of conversational speech phenomena in
                   synthetic speech. The achieved synthetic speech quality
                   provides an encouraging start for the continued use of
                   conversational speech in HMM-based speech synthesis.},
  doi = {10.1016/j.specom.2011.08.001},
  year = 2012
}
@inproceedings{leo_08-3,
  author = {Andersson, J. Sebastian and Badino, Leonardo and
                   Watts, Oliver S. and Aylett, Matthew P.},
  title = {The {CSTR/Cereproc} {Blizzard} Entry 2008: The
                   Inconvenient Data},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc.
                   Interspeech 2008)},
  address = {Brisbane, Australia},
  abstract = {In a commercial system data used for unit selection
                   systems is collected with a heavy emphasis on
                   homogeneous neutral data that has sufficient coverage
                   for the units that will be used in the system. In this
                   years Blizzard entry CSTR and CereProc present a joint
                   entry where the emphasis has been to explore techniques
                   to deal with data which is not homogeneous (the English
                   entry) and did not have appropriate coverage for a
                   diphone based system (the Mandarin entry where
                   tone/phone combinations were treated as distinct phone
                   categories). In addition, two further problems were
                   addressed, 1) Making use of non-homogeneous data for
                   creating a voice that can realise both expressive and
                   neutral speaking styles (the English entry) 2) Building
                   a unit selection system with no native understanding of
                   the language but depending instead on external native
                   evaluation (the Mandarin Entry).},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/cstr-cereproc_Blizzard2008.pdf},
  year = 2008
}
@inproceedings{leo_09-1,
  author = {Badino, Leonardo and Andersson, J. Sebastian and
                   Yamagishi, Junichi and Clark, Robert A. J.},
  title = {Identification of Contrast and Its Emphatic
                   Realization in {HMM}-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2009},
  address = {Brighton, U.K.},
  abstract = {The work presented in this paper proposes to identify
                   contrast in the form of contrastive word pairs and
                   prosodically signal it with emphatic accents in a
                   Text-to-Speech (TTS) application using a
                   Hidden-Markov-Model (HMM) based speech synthesis
                   system. We first describe a novel method to
                   automatically detect contrastive word pairs using
                   textual features only and report its performance on a
                   corpus of spontaneous conversations in English.
                   Subsequently we describe the set of features selected
                   to train a HMM-based speech synthesis system and
                   attempting to properly control prosodic prominence
                   (including emphasis). Results from a large scale
                   perceptual test show that in the majority of cases
                   listeners judge emphatic contrastive word pairs as
                   acceptable as their non-emphatic counterpart, while
                   emphasis on non-contrastive pairs is almost never
                   acceptable.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090749.PDF},
  year = 2009
}
@inproceedings{leo_07-2,
  author = {Aylett, Matthew P. and Andersson, J. Sebastian and
                   Badino, Leonardo and Pidcock, Christopher J.},
  title = {The {Cerevoice} {Blizzard} Entry 2007: Are Small
                   Database Errors Worse than Compression Artifacts?},
  booktitle = {Proc. Blizzard Challenge Workshop 2007},
  address = {Bonn, Germany},
  abstract = {In commercial systems the memory footprint of unit
                   selection systems is often a key issue. This is
                   especially true for PDAs and other embedded devices. In
                   this year's Blizzard entry CereProc\textregistered{}
                   gave itself the
                   criteria that the full database system entered would
                   have a smaller memory footprint than either of the two
                   smaller database entries. This was accomplished by
                   applying Speex speech compression to the full database
                   entry. In turn a set of small database techniques used
                   to improve the quality of small database systems in
                   last years entry were extended. Finally, for all
                   systems, two quality control methods were applied to
                   the underlying database to improve the lexicon and
                   transcription match to the underlying data. Results
                   suggest that mild audio quality artifacts introduced by
                   lossy compression have almost as much impact on MOS
                   perceived quality as concatenation errors introduced by
                   sparse data in the smaller systems with bulked
                   diphones.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_004.pdf},
  year = 2007
}
@inproceedings{anderssonetal2010_ssw7,
  author     = {Andersson, Sebastian and Yamagishi, Junichi and
                Clark, Robert},
  title      = {Utilising Spontaneous Conversational Speech in {HMM}-Based
                Speech Synthesis},
  booktitle  = {The 7th ISCA Tutorial and Research Workshop on Speech
                Synthesis},
  year       = {2010},
  month      = sep,
  categories = {HMM, speech synthesis, spontaneous speech, conversation,
                lexical fillers, filled pauses},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7_paper.pdf},
  abstract   = {Spontaneous conversational speech has many characteristics
                that are currently not well modelled in unit selection and
                HMM-based speech synthesis. But in order to build synthetic
                voices more suitable for interaction we need data that
                exhibits more conversational characteristics than the
                generally used read aloud sentences. In this paper we will
                show how carefully selected utterances from a spontaneous
                conversation was instrumental for building an HMM-based
                synthetic voices with more natural sounding conversational
                characteristics than a voice based on carefully read aloud
                sentences. We also investigated a style blending technique
                as a solution to the inherent problem of phonetic coverage
                in spontaneous speech data. But the lack of an appropriate
                representation of spontaneous speech phenomena probably
                contributed to results showing that we could not yet compete
                with the speech quality achieved for grammatical sentences.},
}