Publications by Rob Clark
robert.bib
@inproceedings{anderssonetal2010,
author = {Sebastian Andersson and Kallirroi Georgila and David
Traum and Matthew Aylett and Robert Clark},
title = {Prediction and Realisation of Conversational
Characteristics by Utilising Spontaneous Speech for
Unit Selection},
booktitle = {Speech Prosody 2010},
abstract = {Unit selection speech synthesis has reached high
levels of naturalness and intelligibility for neutral
read aloud speech. However, synthetic speech generated
using neutral read aloud data lacks all the attitude,
intention and spontaneity associated with everyday
conversations. Unit selection is heavily data dependent
and thus in order to simulate human conversational
speech, or create synthetic voices for believable
virtual characters, we need to utilise speech data with
examples of how people talk rather than how people
read. In this paper we included carefully selected
utterances from spontaneous conversational speech in a
unit selection voice. Using this voice and by
automatically predicting type and placement of lexical
fillers and filled pauses we can synthesise utterances
with conversational characteristics. A perceptual
listening test showed that it is possible to make
synthetic speech sound more conversational without
degrading naturalness.},
categories = {speech synthesis, unit selection, conversation,
spontaneous speech, lexical fillers, filled pauses},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/100116.pdf},
year = 2010
}
@inproceedings{oliverclark_interspeech05,
author = {Dominika Oliver and Robert A. J. Clark},
title = {Modelling pitch accent types for {P}olish speech
synthesis},
booktitle = {Proc. Interspeech 2005},
categories = {speech synthesis, prosody, intonation, festival,
Polish},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/oliverclark_interspeech05.pdf},
year = 2005
}
@inproceedings{anderssoncabral09,
author = {J. Sebastian Andersson and João P. Cabral and Leonardo
Badino and Junichi Yamagishi and Robert A.J. Clark},
title = {Glottal Source and Prosodic Prominence Modelling in
{HMM}-based Speech Synthesis for the {B}lizzard
{C}hallenge 2009},
booktitle = {The Blizzard Challenge 2009},
address = {Edinburgh, U.K.},
abstract = {This paper describes the CSTR entry for the Blizzard
Challenge 2009. The work focused on modifying two parts
of the Nitech 2005 HTS speech synthesis system to
improve naturalness and contextual appropriateness. The
first part incorporated an implementation of the
Liljencrants-Fant (LF) glottal source model. The second
part focused on improving synthesis of prosodic
prominence including emphasis through context dependent
phonemes. Emphasis was assigned to the synthesised test
sentences based on a handful of theory-based rules. The
two parts (LF-model and prosodic prominence) were not
combined and hence evaluated separately. The results on
naturalness for the LF-model showed that it is not yet
perceived to be as natural as the Benchmark HTS system for
neutral speech. The results for the prosodic prominence
modelling showed that it was perceived to be as contextually
appropriate as the Benchmark HTS system, despite a low
naturalness score. The Blizzard challenge evaluation
has provided valuable information on the status of our
work and continued work will begin with analysing why
our modifications resulted in reduced naturalness
compared to the Benchmark HTS system.},
categories = {HMM, HTS, speech synthesis, LF-model, glottal source,
prosodic prominence, emphasis},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
year = 2009
}
@inproceedings{clark_gala97,
author = {Robert A. J. Clark},
title = {Language Acquisition and Implication for Language
Change: A Computational Model},
booktitle = {Proceedings of the {GALA} 97 Conference on Language
Acquisition},
pages = {322--326},
categories = {lm},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/clark_gala97.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/clark_gala97.ps},
year = 1997
}
@inproceedings{leo_07-1,
author = {Leonardo Badino and Robert A.J. Clark},
title = {Issues of Optionality in Pitch Accent Placement},
booktitle = {Proc. 6th ISCA Speech Synthesis Workshop},
address = {Bonn, Germany},
abstract = {When comparing the prosodic realization of different
English speakers reading the same text, a significant
disagreement is usually found amongst the pitch accent
patterns of the speakers. Assuming that such
disagreement is due to a partial optionality of pitch
accent placement, it has been recently proposed to
evaluate pitch accent predictors by comparing them with
multi-speaker reference data. In this paper we address the
issue of pitch accent optionality at different levels.
First we propose a simple mathematical definition of
intra-speaker optionality which allows us to introduce
a function for evaluating pitch accent predictors which
we show to be more accurate and robust than those used
in previous work. Subsequently we compare a pitch
accent predictor trained on single-speaker data with a
predictor trained on multi-speaker data in order to
point out the large overlap between intra-speaker
and inter-speaker optionality. Finally, we show our
successful results in predicting intra-speaker
optionality and we suggest how this achievement could
be exploited to improve the performance of a unit
selection text-to-speech (TTS) synthesis system.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6_252.pdf},
year = 2007
}
@article{beaver:07,
author = {David Beaver and Brady Zack Clark and Edward Flemming
and T. Florian Jaeger and Maria Wolters},
title = {When Semantics meets Phonetics: {A}coustical studies
of second occurrence focus},
journal = {Language},
volume = 83,
number = 2,
pages = {245--276},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/BeaverLanguage2007.pdf},
year = 2007
}
@inproceedings{janska_clark:2010a,
author = {Anna C. Janska and Robert A. J. Clark},
title = {Native and Non-Native Speaker Judgements on the
Quality of Synthesized Speech},
booktitle = {Proc. Interspeech},
pages = {1121--1124},
abstract = {The difference between native speakers' and non-native
speakers' naturalness judgements of synthetic speech
is investigated. Similarity/difference judgements are
analysed via a multidimensional scaling analysis and
compared to mean opinion scores. It is shown that
although the two groups generally behave in a similar
manner, the variance of non-native speaker judgements is
generally higher. While both groups of subjects can
clearly distinguish natural speech from the best
synthetic examples, the groups' responses to different
artefacts present in the synthetic speech can vary.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/janskaclark_interspeech2010.pdf},
year = 2010
}
@inproceedings{clark:podsiadlo:mayo:king:blizzard2007,
author = {Robert A. J. Clark and Monika Podsiadlo and Mark
Fraser and Catherine Mayo and Simon King },
title = {Statistical Analysis of the {B}lizzard {C}hallenge
2007 Listening Test Results },
booktitle = {Proc. Blizzard 2007 (in Proc. Sixth {ISCA} Workshop on
Speech Synthesis)},
address = {Bonn, Germany},
abstract = {Blizzard 2007 is the third Blizzard Challenge, in
which participants build voices from a common dataset.
A large listening test is conducted which allows
comparison of systems in terms of naturalness and
intelligibility. New sections were added to the
listening test for 2007 to test the perceived
similarity of the speaker's identity between natural
and synthetic speech. In this paper, we present the
results of the listening test and the subsequent
statistical analysis. },
categories = {blizzard,listening test},
keywords = {Blizzard},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_003.pdf},
year = 2007
}
@article{white_clark_moore:2010,
author = {Michael White and Robert A. J. Clark and Johanna D.
Moore},
title = {Generating Tailored, Comparative Descriptions with
Contextually Appropriate Intonation},
journal = {Computational Linguistics},
volume = {36},
number = {2},
pages = {159--201},
abstract = {Generating responses that take user preferences into
account requires adaptation at all levels of the
generation process. This article describes a
multi-level approach to presenting user-tailored
information in spoken dialogues which brings together
for the first time multi-attribute decision models,
strategic content planning, surface realization that
incorporates prosody prediction, and unit selection
synthesis that takes the resulting prosodic structure
into account. The system selects the most important
options to mention and the attributes that are most
relevant to choosing between them, based on the user
model. Multiple options are selected when each offers a
compelling trade-off. To convey these trade-offs, the
system employs a novel presentation strategy which
straightforwardly lends itself to the determination of
information structure, as well as the contents of
referring expressions. During surface realization, the
prosodic structure is derived from the information
structure using Combinatory Categorial Grammar in a way
that allows phrase boundaries to be determined in a
flexible, data-driven fashion. This approach to
choosing pitch accents and edge tones is shown to yield
prosodic structures with significantly higher
acceptability than baseline prosody prediction models
in an expert evaluation. These prosodic structures are
then shown to enable perceptibly more natural synthesis
using a unit selection voice that aims to produce the
target tunes, in comparison to two baseline synthetic
voices. An expert evaluation and f0 analysis confirm
the superiority of the generator-driven intonation and
its contribution to listeners' ratings.},
doi = {10.1162/coli.09-023-R1-08-002},
year = 2010
}
@mastersthesis{clark_msc96,
author = {Robert A.J. Clark},
title = {Internal and External Factors Affecting Language
Change: A Computational Model},
school = {University of Edinburgh},
categories = {lm},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/clark_msc96.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/clark_msc96.ps},
year = 1996
}
@inproceedings{richmond_interspeech2010,
author = {Richmond, Korin and Clark, Robert and Fitt, Sue},
title = {On Generating {C}ombilex Pronunciations via
Morphological Analysis},
booktitle = {Proc. Interspeech},
pages = {1974--1977},
address = {Makuhari, Japan},
abstract = {Combilex is a high-quality lexicon that has been
developed specifically for speech technology purposes
and recently released by CSTR. Combilex benefits from
many advanced features. This paper explores one of
these: the ability to generate fully-specified
transcriptions for morphologically derived words
automatically. This functionality was originally
implemented to encode the pronunciations of derived
words in terms of their constituent morphemes, thus
accelerating lexicon development and ensuring a high
level of consistency. In this paper, we propose that this
method of modelling pronunciations can be exploited
further by combining it with a morphological parser,
thus yielding a method to generate full transcriptions
for unknown derived words. Not only could this
accelerate adding new derived words to Combilex, but it
could also serve as an alternative to conventional
letter-to-sound rules. This paper presents preliminary
work indicating this is a promising direction.},
keywords = {combilex lexicon, letter-to-sound rules,
grapheme-to-phoneme conversion, morphological
decomposition},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100683.pdf},
year = 2010
}
@inproceedings{clarkrichmondking_interspeech05,
author = {Robert A.J. Clark and Korin Richmond and Simon King},
title = {Multisyn voices from {ARCTIC} data for the {B}lizzard
challenge},
booktitle = {Proc. Interspeech 2005},
abstract = {This paper describes the process of building unit
selection voices for the Festival Multisyn engine using
four ARCTIC datasets, as part of the Blizzard
evaluation challenge. The build process is almost
entirely automatic, with very little need for human
intervention. We discuss the difference in the
evaluation results for each voice and evaluate the
suitability of the ARCTIC datasets for building this
type of voice.},
categories = {speech synthesis, festival, evaluation},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/clarkrichmondking_interspeech05.pdf},
year = 2005
}
@article{anderssonyamagishi12,
author = {Andersson, S. and Yamagishi, J. and Clark, R.A.J.},
title = {Synthesis and Evaluation of Conversational
Characteristics in {HMM}-Based Speech Synthesis},
journal = {Speech Communication},
volume = 54,
number = 2,
pages = {175--188},
abstract = {Spontaneous conversational speech has many
characteristics that are currently not modelled well by
HMM-based speech synthesis and in order to build
synthetic voices that can give an impression of someone
partaking in a conversation, we need to utilise data
that exhibits more of the speech phenomena associated
with conversations than the more generally used
carefully read aloud sentences. In this paper we show
that synthetic voices built with HMM-based speech
synthesis techniques from conversational speech data
preserved segmental and prosodic characteristics of
frequent conversational speech phenomena. An analysis
of an evaluation investigating the perception of
quality and speaking style of HMM-based voices confirms
that speech with conversational characteristics is
instrumental for listeners to perceive successful
integration of conversational speech phenomena in
synthetic speech. The achieved synthetic speech quality
provides an encouraging start for the continued use of
conversational speech in HMM-based speech synthesis.},
doi = {10.1016/j.specom.2011.08.001},
year = 2012
}
@inproceedings{janska_clark:2010b,
author = {Anna C. Janska and Robert A. J. Clark},
title = {Further exploration of the possibilities and pitfalls
of multidimensional scaling as a tool for the
evaluation of the quality of synthesized speech},
booktitle = {The 7th ISCA Tutorial and Research Workshop on Speech
Synthesis},
pages = {142--147},
abstract = {Multidimensional scaling (MDS) has been suggested as a
useful tool for the evaluation of the quality of
synthesized speech. However, it has not yet been
extensively tested for its application in this
specific area of evaluation. In a series of experiments
based on data from the Blizzard Challenge 2008,
the relations between Weighted Euclidean Distance
Scaling and Simple Euclidean Distance Scaling are
investigated to understand how aggregating data affects
the MDS configuration. These results are compared to
those collected as mean opinion scores (MOS). The ranks
correspond, and MOS can be predicted from an object's
position in the MDS-generated stimulus space. The big
advantage of MDS over MOS is its diagnostic value;
dimensions along which stimuli vary are not correlated,
as is the case in modular evaluation using MOS.
Finally, we attempt to generalize from the
MDS representations of the thoroughly tested subset to
the aggregated data of the larger-scale Blizzard
Challenge.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/janskaclark_ssw7.pdf},
year = 2010
}
@inproceedings{clarkrichmondking_ssw504,
author = {Robert A.J. Clark and Korin Richmond and Simon King},
title = {Festival 2 -- build your own general purpose unit
selection speech synthesiser},
booktitle = {Proc. 5th {ISCA} workshop on speech synthesis},
abstract = {This paper describes version 2 of the Festival speech
synthesis system. Festival 2 provides a development
environment for concatenative speech synthesis, and now
includes a general purpose unit selection speech
synthesis engine. We discuss various aspects of unit
selection speech synthesis, focusing on the research
issues that relate to voice design and the automation
of the voice development process.},
categories = {synthesis, festival, unitselection},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.ps},
year = 2004
}
@inproceedings{badinoclark_interspeech12,
author = {Leonardo Badino and Robert A.J. Clark and Mirjam
Wester},
title = {Towards Hierarchical Prosodic Prominence Generation in
{TTS} Synthesis},
booktitle = {Proc. Interspeech},
address = {Portland, USA},
categories = {speech synthesis, prosody},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/badinoclark_IS_2012.pdf},
year = 2012
}
@inproceedings{bakerclarkwhite_ssw504,
author = {Rachel Baker and Robert A.J. Clark and Michael White},
title = {Synthesising Contextually Appropriate Intonation in
Limited Domains},
booktitle = {Proc. 5th {ISCA} workshop on speech synthesis},
address = {Pittsburgh, USA},
categories = {synthesis, prosody, intonation, festival},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/bakerclarkwhite_ssw504.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/bakerclarkwhite_ssw504.ps},
year = 2004
}
@inproceedings{clark_icphs99,
author = {Robert A. J. Clark},
title = {Using Prosodic Structure to Improve Pitch Range
Variation in Text to Speech Synthesis},
booktitle = {Proc. {XIV}th international congress of phonetic
sciences},
volume = 1,
pages = {69--72},
categories = {synthesis, prosody, intonation, festival},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/clark_icphs99.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/clark_icphs99.ps},
year = 1999
}
@inproceedings{leo_09-1,
author = {Leonardo Badino and J. Sebastian Andersson and Junichi
Yamagishi and Robert A.J. Clark},
title = {Identification of Contrast and Its Emphatic
Realization in {HMM}-based Speech Synthesis},
booktitle = {Proc. Interspeech 2009},
address = {Brighton, U.K.},
abstract = {The work presented in this paper proposes to identify
contrast in the form of contrastive word pairs and
prosodically signal it with emphatic accents in a
Text-to-Speech (TTS) application using a
Hidden-Markov-Model (HMM) based speech synthesis
system. We first describe a novel method to
automatically detect contrastive word pairs using
textual features only and report its performance on a
corpus of spontaneous conversations in English.
Subsequently we describe the set of features selected
to train an HMM-based speech synthesis system that
attempts to properly control prosodic prominence
(including emphasis). Results from a large scale
perceptual test show that in the majority of cases
listeners judge emphatic contrastive word pairs as
acceptable as their non-emphatic counterpart, while
emphasis on non-contrastive pairs is almost never
acceptable.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090749.PDF},
year = 2009
}
@inproceedings{strom:etal:interspeech2007,
author = {Volker Strom and Ani Nenkova and Robert Clark and
Yolanda Vazquez-Alvarez and Jason Brenier and Simon
King and Dan Jurafsky},
title = {Modelling Prominence and Emphasis Improves
Unit-Selection Synthesis},
booktitle = {Proc. Interspeech 2007},
address = {Antwerp, Belgium},
abstract = {We describe the results of large scale perception
experiments showing improvements in synthesising two
distinct kinds of prominence: standard pitch-accent and
strong emphatic accents. Previously prominence
assignment has been mainly evaluated by computing
accuracy on a prominence-labelled test set. By contrast
we integrated an automatic pitch-accent classifier into
the unit selection target cost and showed that
listeners preferred these synthesised sentences. We
also describe an improved recording script for
collecting emphatic accents, and show that generating
emphatic accents leads to further improvements in the
fiction genre over incorporating pitch accent only.
Finally, we show differences in the effects of
prominence between child-directed speech and news and
fiction genres.},
categories = {speech synthesis, prosody, prominence, pitch accent,
unit selection},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/p540.pdf},
year = 2007
}
@inproceedings{clark_blizzard2006,
author = {Clark, R. and Richmond, K. and Strom, V. and King, S.},
title = {Multisyn Voices for the {B}lizzard {C}hallenge 2006},
booktitle = {Proc. Blizzard Challenge Workshop (Interspeech
Satellite)},
address = {Pittsburgh, USA},
note = {(http://festvox.org/blizzard/blizzard2006.html)},
abstract = {This paper describes the process of building unit
selection voices for the Festival Multisyn engine using
the ATR dataset provided for the Blizzard Challenge
2006. We begin by discussing recent improvements that
we have made to the Multisyn voice building process,
prompted by our participation in the Blizzard Challenge
2006. We then go on to discuss our interpretation of
the results observed. Finally, we conclude with some
comments and suggestions for the formulation of future
Blizzard Challenges.},
categories = {tts, blizzard, multisyn, unit selection},
key = {clark_blizzard2006},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/cstr_blizzard2006.pdf},
year = 2006
}
@inproceedings{clarkdusterhoff_eurospeech99,
author = {Robert A. J. Clark and Kurt E. Dusterhoff},
title = {Objective Methods for Evaluating Synthetic Intonation},
booktitle = {Proc. {E}urospeech 1999},
volume = 4,
pages = {1623--1626},
categories = {synthesis, prosody, intonation},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/clarkdusterhoff_eurospeech99.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/clarkdusterhoff_eurospeech99.ps},
year = 1999
}
@inproceedings{clark_king:proc:2006,
author = {Robert A. J. Clark and Simon King},
title = {Joint Prosodic and Segmental Unit Selection Speech
Synthesis},
booktitle = {Proc. Interspeech 2006},
address = {Pittsburgh, USA},
abstract = {We describe a unit selection technique for
text-to-speech synthesis which jointly searches the
space of possible diphone sequences and the space of
possible prosodic unit sequences in order to produce
synthetic speech with more natural prosody. We
demonstrate that this search, although currently
computationally expensive, can achieve improved
intonation compared to a baseline in which only the
space of possible diphone sequences is searched. We
discuss ways in which the search could be made
sufficiently efficient for use in a real-time system.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/clarkking_interspeech_2006.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/clarkking_interspeech_2006.ps},
year = 2006
}
@incollection{Pipe_etal:2011,
author = {A. G. Pipe and R. Vaidyanathan and C. Melhuish and P.
Bremner and P. Robinson and R. A. J. Clark and A. Lenz
and K. Eder and N. Hawes and Z. Ghahramani and M.
Fraser and M. Mermehdi and P. Healey and S. Skachek},
title = {Affective Robotics: Human Motion and Behavioural
Inspiration for Cooperation between Humans and
Assistive Robots},
booktitle = {Biomimetics: Nature-Based Innovation},
publisher = {Taylor and Francis},
editor = {Yoseph Bar-Cohen},
chapter = {15},
year = 2011
}
@inproceedings{janskaetal_interspeech12,
author = {Anna C. Janska and Erich Schröger and Thomas Jacobsen
and Robert A. J. Clark},
title = {Asymmetries in the perception of synthesized speech},
booktitle = {Proc. Interspeech},
address = {Portland, USA},
categories = {speech synthesis, evaluation},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/janskaeral_IS_2012.pdf},
year = 2012
}
@inproceedings{richmond2009a,
author = {Richmond, K. and Clark, R. and Fitt, S.},
title = {Robust {LTS} rules with the {Combilex} speech
technology lexicon},
booktitle = {Proc. Interspeech},
pages = {1295--1298},
address = {Brighton, UK},
abstract = {Combilex is a high quality pronunciation lexicon aimed
at speech technology applications that has recently
been released by CSTR. Combilex benefits from several
advanced features. This paper evaluates one of these:
the explicit alignment of phones to graphemes in a
word. This alignment can help to rapidly develop robust
and accurate letter-to-sound (LTS) rules, without
needing to rely on automatic alignment methods. To
evaluate this, we used Festival's LTS module, comparing
its standard automatic alignment with Combilex's
explicit alignment. Our results show using Combilex's
alignment improves LTS accuracy: 86.50\% words correct
as opposed to 84.49\%, with our most general form of
lexicon. In addition, building LTS models is greatly
accelerated, as the need to list allowed alignments is
removed. Finally, loose comparison with other studies
indicates Combilex is a superior quality lexicon in
terms of consistency and size.},
keywords = {combilex, letter-to-sound rules, grapheme-to-phoneme
conversion},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090308.pdf},
year = 2009
}
@phdthesis{clark_phd03,
author = {Robert A. J. Clark},
title = {Generating Synthetic Pitch Contours Using Prosodic
Structure},
school = {The University of Edinburgh},
categories = {speech synthesis, prosody, intonation, festival},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_phd03.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_phd03.ps.gz},
year = 2003
}
@inproceedings{leo_08-2,
author = {Leonardo Badino and Robert A.J. Clark and Volker Strom},
title = {Including Pitch Accent Optionality in Unit Selection
Text-to-Speech Synthesis},
booktitle = {Proc.~Interspeech},
address = {Brisbane},
abstract = {A significant variability in pitch accent placement is
found when comparing the patterns of prosodic
prominence realized by different English speakers
reading the same sentences. In this paper we describe a
simple approach to incorporate this variability to
synthesize prosodic prominence in unit selection
text-to-speech synthesis. The main motivation of our
approach is that by taking into account the variability
of accent placements we enlarge the set of prosodically
acceptable speech units, thus increasing the chances of
selecting a good quality sequence of units, both in
prosodic and segmental terms. Results on a large scale
perceptual test show the benefits of our approach and
indicate directions for further improvements.},
categories = {speech synthesis, unit selection, prosodic prominence,
pitch accents},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.ps},
year = 2008
}
@inproceedings{strom06,
author = {Volker Strom and Robert Clark and Simon King},
title = {Expressive Prosody for Unit-selection Speech Synthesis},
booktitle = {Proc.~Interspeech},
address = {Pittsburgh},
abstract = {Current unit selection speech synthesis voices cannot
produce emphasis or interrogative contours because of a
lack of the necessary prosodic variation in the
recorded speech database. A method of recording script
design is proposed which addresses this shortcoming.
Appropriate components were added to the target cost
function of the Festival Multisyn engine, and a
perceptual evaluation showed a clear preference over
the baseline system.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/strom06.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/strom06.ps},
year = 2006
}
@inproceedings{mayoclarkking-isp05,
author = {Mayo, C. and Clark, R. A. J. and King, S.},
title = {Multidimensional Scaling of Listener Responses to
Synthetic Speech},
booktitle = {Proc. Interspeech 2005},
address = {Lisbon, Portugal},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/ie-speech-2005.pdf},
year = 2005
}
@inproceedings{clark_icphs03,
author = {Robert A. J. Clark},
title = {Modelling Pitch Accents for Concept-to-Speech
Synthesis},
booktitle = {Proc. XVth International Congress of Phonetic Sciences},
volume = 2,
pages = {1141--1144},
categories = {speech synthesis, prosody, intonation, festival},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_icphs03.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_icphs03.ps},
year = 2003
}
@inproceedings{karaiskos:king:clark:mayo:blizzard2008,
author = {Vasilis Karaiskos and Simon King and Robert A. J.
Clark and Catherine Mayo},
title = {The Blizzard Challenge 2008},
booktitle = {Proc. Blizzard Challenge Workshop},
address = {Brisbane, Australia},
abstract = {The Blizzard Challenge 2008 was the fourth annual
Blizzard Challenge. This year, participants were asked
to build two voices from a UK English corpus and one
voice from a Mandarin Chinese corpus. This is the
first time that a language other than English has been
included and also the first time that a large UK
English corpus has been available. In addition, the
English corpus contained somewhat more expressive
speech than that found in corpora used in previous
Blizzard Challenges. To assist participants with
limited resources or limited experience in
UK-accented English or Mandarin, unaligned labels
were provided for both corpora and for the test
sentences. Participants could use the provided labels
or create their own. An accent-specific pronunciation
dictionary was also available for the English speaker.
A set of test sentences was released to participants,
who were given a limited time in which to synthesise
them and submit the synthetic speech. An online
listening test was conducted to evaluate
naturalness, intelligibility and degree of similarity
to the original speaker.},
keywords = {Blizzard},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/summary_Blizzard2008.pdf},
year = 2008
}
@article{clarkrichmondking_specom2007,
author = {Robert A. J. Clark and Korin Richmond and Simon King},
title = {Multisyn: Open-domain unit selection for the
{F}estival speech synthesis system},
journal = {Speech Communication},
volume = 49,
number = 4,
pages = {317--330},
abstract = {We present the implementation and evaluation of an
open-domain unit selection speech synthesis engine
designed to be flexible enough to encourage further
unit selection research and allow rapid voice
development by users with minimal speech synthesis
knowledge and experience. We address the issues of
automatically processing speech data into a usable
voice using automatic segmentation techniques and how
the knowledge obtained at labelling time can be
exploited at synthesis time. We describe target cost
and join cost implementation for such a system and
describe the outcome of building voices with a number
of different sized datasets. We show that, in a
competitive evaluation, voices built using this
technology compare favourably to other systems.},
categories = {speech synthesis, festival, multisyn, unitselection},
doi = {10.1016/j.specom.2007.01.014},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/clarkrichmondking_specom2007.pdf},
year = 2007
}
@article{mayo:clark:king:10,
author = {Mayo, C. and Clark, R. A. J. and King, S.},
title = {Listeners' Weighting of Acoustic Cues to Synthetic
Speech Naturalness: A Multidimensional Scaling Analysis},
journal = {Speech Communication},
volume = {53},
number = {3},
pages = {311--326},
abstract = {The quality of current commercial speech synthesis
systems is now so high that system improvements are
being made at subtle sub- and supra-segmental levels.
Human perceptual evaluation of such subtle improvements
requires a highly sophisticated level of perceptual
attention to specific acoustic characteristics or cues.
However, it is not well understood what acoustic cues
listeners attend to by default when asked to evaluate
synthetic speech. It may, therefore, be potentially
quite difficult to design an evaluation method that
allows listeners to concentrate on only one dimension
of the signal, while ignoring others that are
perceptually more important to them. The aim of the
current study was to determine which acoustic
characteristics of unit-selection synthetic speech are
most salient to listeners when evaluating the
naturalness of such speech. This study made use of
multidimensional scaling techniques to analyse
listeners' pairwise comparisons of synthetic speech
sentences. Results indicate that listeners place a
great deal of perceptual importance on the presence of
artifacts and discontinuities in the speech, somewhat
less importance on aspects of segmental quality, and
very little importance on stress/intonation
appropriateness. These relative differences in
importance will impact on listeners' ability to attend
to these different acoustic characteristics of
synthetic speech, and should therefore be taken into
account when designing appropriate methods of synthetic
speech evaluation.},
doi = {10.1016/j.specom.2010.10.003},
keywords = {Speech synthesis; Evaluation; Speech perception;
Acoustic cue weighting; Multidimensional scaling},
year = 2011
}
@inproceedings{richmond2007b,
author = {Richmond, K. and Strom, V. and Clark, R. and
Yamagishi, J. and Fitt, S.},
title = {Festival Multisyn Voices for the 2007 Blizzard
Challenge},
booktitle = {Proc. Blizzard Challenge Workshop (in Proc. SSW6)},
address = {Bonn, Germany},
abstract = {This paper describes selected aspects of the Festival
Multisyn entry to the Blizzard Challenge 2007. We
provide an overview of the process of building the
three required voices from the speech data provided.
This paper focuses on new features of Multisyn which
are currently under development and which have been
employed in the system used for this Blizzard
Challenge. These differences are the application of a
more flexible phonetic lattice representation during
forced alignment labelling and the use of a pitch
accent target cost component. Finally, we also examine
aspects of the speech data provided for this year's
Blizzard Challenge and raise certain issues for
discussion concerning the aim of comparing voices made
with differing subsets of the data provided.},
categories = {tts, blizzard, multisyn, unit selection},
key = {richmond2007b},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007paper.pdf},
year = 2007
}
@inproceedings{leo_08-1,
author = {Leonardo Badino and Robert A.J. Clark},
title = {Automatic labeling of contrastive word pairs from
spontaneous spoken English},
booktitle = {2008 IEEE/ACL Workshop on Spoken Language
Technology},
address = {Goa, India},
abstract = {This paper addresses the problem of automatically
labeling contrast in spontaneous spoken speech, where
contrast here is meant as a relation that ties two
words that explicitly contrast with each other.
Detection of contrast is certainly relevant in the
analysis of discourse and information structure and
also, because of the prosodic correlates of contrast,
could play an important role in speech applications,
such as text-to-speech synthesis, that need an accurate
and discourse context related modeling of prosody. With
this prospect we investigate the feasibility of
automatic contrast labeling by training and evaluating
on the Switchboard corpus a novel contrast tagger,
based on Support Vector Machines (SVM), that combines
lexical features, syntactic dependencies and WordNet
semantic relations.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/0000101.pdf},
year = 2008
}
@inproceedings{hofer-eurosp05,
author = {G. Hofer and K. Richmond and R. Clark},
title = {Informed Blending of Databases for Emotional Speech
Synthesis},
booktitle = {Proc. Interspeech},
abstract = {The goal of this project was to build a unit selection
voice that could portray emotions with varying
intensities. A suitable definition of an emotion was
developed along with a descriptive framework that
supported the work carried out. A single speaker was
recorded portraying happy and angry speaking styles.
Additionally a neutral database was also recorded. A
target cost function was implemented that chose units
according to emotion mark-up in the database. The
Dictionary of Affect supported the emotional target
cost function by providing an emotion rating for words
in the target utterance. If a word was particularly
'emotional', units from that emotion were favoured. In
addition, intensity could be varied, which resulted in a
bias to select a greater number of emotional units. A
perceptual evaluation was carried out and subjects were
able to reliably recognise emotions with varying
amounts of emotional units present in the target
utterance.},
categories = {speech synthesis,emotion,edinburgh},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.ps},
year = 2005
}
@inproceedings{anderssonetal2010_ssw7,
author = {Sebastian Andersson and Junichi Yamagishi and Robert
Clark},
title = {Utilising Spontaneous Conversational Speech in
{HMM}-Based Speech Synthesis},
booktitle = {The 7th ISCA Tutorial and Research Workshop on Speech
Synthesis},
abstract = {Spontaneous conversational speech has many
characteristics that are currently not well modelled in
unit selection and HMM-based speech synthesis. But in
order to build synthetic voices more suitable for
interaction we need data that exhibits more
conversational characteristics than the generally used
read aloud sentences. In this paper we will show how
carefully selected utterances from a spontaneous
conversation were instrumental for building an HMM-based
synthetic voice with more natural sounding
conversational characteristics than a voice based on
carefully read aloud sentences. We also investigated a
style blending technique as a solution to the inherent
problem of phonetic coverage in spontaneous speech
data. But the lack of an appropriate representation of
spontaneous speech phenomena probably contributed to
results showing that we could not yet compete with the
speech quality achieved for grammatical sentences.},
categories = {HMM, speech synthesis, spontaneous speech,
conversation, lexical fillers, filled pauses},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7_paper.pdf},
year = 2010
}