Publications by Sebastian Andersson
janderss.bib
@comment{Speech Prosody 2010 paper on conversational characteristics in unit selection synthesis.}
@inproceedings{anderssonetal2010,
  author     = {Sebastian Andersson and Kallirroi Georgila and David Traum and
                Matthew Aylett and Robert Clark},
  title      = {Prediction and Realisation of Conversational Characteristics
                by Utilising Spontaneous Speech for Unit Selection},
  booktitle  = {Speech Prosody 2010},
  year       = {2010},
  month      = may,
  categories = {speech synthesis, unit selection, conversation, spontaneous
                speech, lexical fillers, filled pauses},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/100116.pdf},
  abstract   = {Unit selection speech synthesis has reached high levels of
                naturalness and intelligibility for neutral read aloud speech.
                However, synthetic speech generated using neutral read aloud
                data lacks all the attitude, intention and spontaneity
                associated with everyday conversations. Unit selection is
                heavily data dependent and thus in order to simulate human
                conversational speech, or create synthetic voices for
                believable virtual characters, we need to utilise speech data
                with examples of how people talk rather than how people read.
                In this paper we included carefully selected utterances from
                spontaneous conversational speech in a unit selection voice.
                Using this voice and by automatically predicting type and
                placement of lexical fillers and filled pauses we can
                synthesise utterances with conversational characteristics. A
                perceptual listening test showed that it is possible to make
                synthetic speech sound more conversational without degrading
                naturalness.},
}
@comment{CSTR entry for the Blizzard Challenge 2009: LF glottal source model and prosodic prominence modelling in HTS.}
@inproceedings{anderssoncabral09,
  author     = {Andersson, J. Sebastian and Cabral, Joao P. and
                Badino, Leonardo and Yamagishi, Junichi and
                Clark, Robert A. J.},
  title      = {Glottal Source and Prosodic Prominence Modelling in
                {HMM}-based Speech Synthesis for the {Blizzard} {Challenge}
                2009},
  booktitle  = {The Blizzard Challenge 2009},
  address    = {Edinburgh, U.K.},
  month      = sep,
  year       = {2009},
  categories = {HMM, HTS, speech synthesis, LF-model, glottal source,
                prosodic prominence, emphasis},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
  abstract   = {This paper describes the CSTR entry for the Blizzard
                Challenge 2009. The work focused on modifying two parts of
                the Nitech 2005 HTS speech synthesis system to improve
                naturalness and contextual appropriateness. The first part
                incorporated an implementation of the Liljencrants-Fant (LF)
                glottal source model. The second part focused on improving
                synthesis of prosodic prominence including emphasis through
                context dependent phonemes. Emphasis was assigned to the
                synthesised test sentences based on a handful of theory based
                rules. The two parts (LF-model and prosodic prominence) were
                not combined and hence evaluated separately. The results on
                naturalness for the LF-model showed that it is not yet
                perceived as natural as the Benchmark HTS system for neutral
                speech. The results for the prosodic prominence modelling
                showed that it was perceived as contextually appropriate as
                the Benchmark HTS system, despite a low naturalness score.
                The Blizzard challenge evaluation has provided valuable
                information on the status of our work and continued work will
                begin with analysing why our modifications resulted in
                reduced naturalness compared to the Benchmark HTS system.},
}
@comment{Speech Communication 54(2) article. NOTE(review): the same article (identical DOI) is also recorded under the key anderssonyamagishi12; both keys are kept so existing citations keep resolving.}
@article{Andersson2012175,
  author   = {Andersson, Sebastian and Yamagishi, Junichi and
              Clark, Robert A. J.},
  title    = {Synthesis and evaluation of conversational characteristics in
              {HMM}-based speech synthesis},
  journal  = {Speech Communication},
  volume   = {54},
  number   = {2},
  pages    = {175--188},
  year     = {2012},
  doi      = {10.1016/j.specom.2011.08.001},
  issn     = {0167-6393},
  url      = {http://www.sciencedirect.com/science/article/pii/S0167639311001178},
  keywords = {Speech synthesis, HMM, Conversation, Spontaneous speech,
              Filled pauses, Discourse marker},
  abstract = {Spontaneous conversational speech has many characteristics
              that are currently not modelled well by HMM-based speech
              synthesis and in order to build synthetic voices that can give
              an impression of someone partaking in a conversation, we need
              to utilise data that exhibits more of the speech phenomena
              associated with conversations than the more generally used
              carefully read aloud sentences. In this paper we show that
              synthetic voices built with HMM-based speech synthesis
              techniques from conversational speech data, preserved
              segmental and prosodic characteristics of frequent
              conversational speech phenomena. An analysis of an evaluation
              investigating the perception of quality and speaking style of
              HMM-based voices confirms that speech with conversational
              characteristics are instrumental for listeners to perceive
              successful integration of conversational speech phenomena in
              synthetic speech. The achieved synthetic speech quality
              provides an encouraging start for the continued use of
              conversational speech in HMM-based speech synthesis.},
}
@comment{Speech Communication 54(2) article. NOTE(review): the same article (identical DOI) is also recorded under the key Andersson2012175; both keys are kept so existing citations keep resolving.}
@article{anderssonyamagishi12,
  author   = {Andersson, Sebastian and Yamagishi, Junichi and
              Clark, Robert A. J.},
  title    = {Synthesis and Evaluation of Conversational Characteristics in
              {HMM}-Based Speech Synthesis},
  journal  = {Speech Communication},
  volume   = {54},
  number   = {2},
  pages    = {175--188},
  year     = {2012},
  doi      = {10.1016/j.specom.2011.08.001},
  abstract = {Spontaneous conversational speech has many characteristics
              that are currently not modelled well by HMM-based speech
              synthesis and in order to build synthetic voices that can give
              an impression of someone partaking in a conversation, we need
              to utilise data that exhibits more of the speech phenomena
              associated with conversations than the more generally used
              carefully read aloud sentences. In this paper we show that
              synthetic voices built with HMM-based speech synthesis
              techniques from conversational speech data, preserved
              segmental and prosodic characteristics of frequent
              conversational speech phenomena. An analysis of an evaluation
              investigating the perception of quality and speaking style of
              HMM-based voices confirms that speech with conversational
              characteristics are instrumental for listeners to perceive
              successful integration of conversational speech phenomena in
              synthetic speech. The achieved synthetic speech quality
              provides an encouraging start for the continued use of
              conversational speech in HMM-based speech synthesis.},
}
@comment{Joint CSTR/CereProc entry for the Blizzard Challenge 2008 (English and Mandarin unit selection voices).}
@inproceedings{leo_08-3,
  author    = {Andersson, J. Sebastian and Badino, Leonardo and
               Watts, Oliver S. and Aylett, Matthew P.},
  title     = {The {CSTR/Cereproc} {Blizzard} Entry 2008: The Inconvenient
               Data},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc. Interspeech 2008)},
  address   = {Brisbane, Australia},
  year      = {2008},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/cstr-cereproc_Blizzard2008.pdf},
  abstract  = {In a commercial system data used for unit selection systems is
               collected with a heavy emphasis on homogeneous neutral data
               that has sufficient coverage for the units that will be used
               in the system. In this year's Blizzard entry CSTR and CereProc
               present a joint entry where the emphasis has been to explore
               techniques to deal with data which is not homogeneous (the
               English entry) and did not have appropriate coverage for a
               diphone based system (the Mandarin entry where tone/phone
               combinations were treated as distinct phone categories). In
               addition, two further problems were addressed, 1) Making use
               of non-homogeneous data for creating a voice that can realise
               both expressive and neutral speaking styles (the English
               entry) 2) Building a unit selection system with no native
               understanding of the language but depending instead on
               external native evaluation (the Mandarin Entry).},
}
@comment{Interspeech 2009 paper on detecting contrastive word pairs and realising them with emphasis in HMM-based synthesis.}
@inproceedings{leo_09-1,
  author    = {Badino, Leonardo and Andersson, J. Sebastian and
               Yamagishi, Junichi and Clark, Robert A. J.},
  title     = {Identification of Contrast and Its Emphatic Realization in
               {HMM}-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2009},
  address   = {Brighton, U.K.},
  month     = sep,
  year      = {2009},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090749.PDF},
  abstract  = {The work presented in this paper proposes to identify contrast
               in the form of contrastive word pairs and prosodically signal
               it with emphatic accents in a Text-to-Speech (TTS) application
               using a Hidden-Markov-Model (HMM) based speech synthesis
               system. We first describe a novel method to automatically
               detect contrastive word pairs using textual features only and
               report its performance on a corpus of spontaneous
               conversations in English. Subsequently we describe the set of
               features selected to train a HMM-based speech synthesis system
               and attempting to properly control prosodic prominence
               (including emphasis). Results from a large scale perceptual
               test show that in the majority of cases listeners judge
               emphatic contrastive word pairs as acceptable as their
               non-emphatic counterpart, while emphasis on non-contrastive
               pairs is almost never acceptable.},
}
@comment{CereProc entry for the Blizzard Challenge 2007 (Speex-compressed full database versus small database systems). NOTE(review): the source abstract read CereProc R, presumably a garbled registered-trademark sign; confirm against the paper.}
@inproceedings{leo_07-2,
  author    = {Aylett, Matthew P. and Andersson, J. Sebastian and
               Badino, Leonardo and Pidcock, Christopher J.},
  title     = {The {Cerevoice} {Blizzard} Entry 2007: Are Small Database
               Errors Worse than Compression Artifacts?},
  booktitle = {Proc. Blizzard Challenge Workshop 2007},
  address   = {Bonn, Germany},
  year      = {2007},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_004.pdf},
  abstract  = {In commercial systems the memory footprint of unit selection
               systems is often a key issue. This is especially true for PDAs
               and other embedded devices. In this year's Blizzard entry
               CereProc{\textregistered} gave itself the criteria that the
               full database system entered would have a smaller memory
               footprint than either of the two smaller database entries.
               This was accomplished by applying Speex speech compression to
               the full database entry. In turn a set of small database
               techniques used to improve the quality of small database
               systems in last year's entry were extended. Finally, for all
               systems, two quality control methods were applied to the
               underlying database to improve the lexicon and transcription
               match to the underlying data. Results suggest that mild audio
               quality artifacts introduced by lossy compression have almost
               as much impact on MOS perceived quality as concatenation
               errors introduced by sparse data in the smaller systems with
               bulked diphones.},
}
@comment{SSW7 (2010) paper on using spontaneous conversational speech data for HMM-based synthesis.}
@inproceedings{anderssonetal2010_ssw7,
  author     = {Sebastian Andersson and Junichi Yamagishi and Robert Clark},
  title      = {Utilising Spontaneous Conversational Speech in {HMM}-Based
                Speech Synthesis},
  booktitle  = {The 7th ISCA Tutorial and Research Workshop on Speech
                Synthesis},
  year       = {2010},
  month      = sep,
  categories = {HMM, speech synthesis, spontaneous speech, conversation,
                lexical fillers, filled pauses},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7_paper.pdf},
  abstract   = {Spontaneous conversational speech has many characteristics
                that are currently not well modelled in unit selection and
                HMM-based speech synthesis. But in order to build synthetic
                voices more suitable for interaction we need data that
                exhibits more conversational characteristics than the
                generally used read aloud sentences. In this paper we will
                show how carefully selected utterances from a spontaneous
                conversation was instrumental for building an HMM-based
                synthetic voices with more natural sounding conversational
                characteristics than a voice based on carefully read aloud
                sentences. We also investigated a style blending technique as
                a solution to the inherent problem of phonetic coverage in
                spontaneous speech data. But the lack of an appropriate
                representation of spontaneous speech phenomena probably
                contributed to results showing that we could not yet compete
                with the speech quality achieved for grammatical sentences.},
}