The Centre for Speech Technology Research, The University of Edinburgh

Publications by Rob Clark

robert.bib

@inproceedings{anderssonetal2010,
  author = {Sebastian Andersson and Kallirroi Georgila and David
                   Traum and Matthew Aylett and Robert Clark},
  title = {Prediction and Realisation of Conversational
                   Characteristics by Utilising Spontaneous Speech for
                   Unit Selection},
  booktitle = {Speech Prosody 2010},
  abstract = {Unit selection speech synthesis has reached high
                   levels of naturalness and intelligibility for neutral
                   read aloud speech. However, synthetic speech generated
                   using neutral read aloud data lacks all the attitude,
                   intention and spontaneity associated with everyday
                   conversations. Unit selection is heavily data dependent
                   and thus in order to simulate human conversational
                   speech, or create synthetic voices for believable
                   virtual characters, we need to utilise speech data with
                   examples of how people talk rather than how people
                   read. In this paper we included carefully selected
                   utterances from spontaneous conversational speech in a
                   unit selection voice. Using this voice and by
                   automatically predicting type and placement of lexical
                   fillers and filled pauses we can synthesise utterances
                   with conversational characteristics. A perceptual
                   listening test showed that it is possible to make
                   synthetic speech sound more conversational without
                   degrading naturalness.},
  categories = {speech synthesis, unit selection, conversation,
                   spontaneous speech, lexical fillers, filled pauses},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/100116.pdf},
  year = 2010
}
@inproceedings{oliverclark_interspeech05,
  author = {Dominika Oliver and Robert A. J. Clark},
  title = {Modelling pitch accent types for {P}olish speech
                   synthesis},
  booktitle = {Proc. Interspeech 2005},
  categories = {speech synthesis, prosody, intonation, festival,
                   Polish},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/oliverclark_interspeech05.pdf},
  year = 2005
}
@inproceedings{anderssoncabral09,
  author = {J. Sebastian Andersson and Jo{\~a}o P. Cabral and Leonardo
                   Badino and Junichi Yamagishi and Robert A.J. Clark},
  title = {Glottal Source and Prosodic Prominence Modelling in
                   {HMM}-based Speech Synthesis for the {B}lizzard
                   {C}hallenge 2009},
  booktitle = {The Blizzard Challenge 2009},
  address = {Edinburgh, U.K.},
  abstract = {This paper describes the CSTR entry for the Blizzard
                   Challenge 2009. The work focused on modifying two parts
                   of the Nitech 2005 HTS speech synthesis system to
                   improve naturalness and contextual appropriateness. The
                   first part incorporated an implementation of the
                    Liljencrants-Fant (LF) glottal source model. The second
                   part focused on improving synthesis of prosodic
                   prominence including emphasis through context dependent
                   phonemes. Emphasis was assigned to the synthesised test
                   sentences based on a handful of theory based rules. The
                   two parts (LF-model and prosodic prominence) were not
                   combined and hence evaluated separately. The results on
                   naturalness for the LF-model showed that it is not yet
                   perceived as natural as the Benchmark HTS system for
                   neutral speech. The results for the prosodic prominence
                   modelling showed that it was perceived as contextually
                   appropriate as the Benchmark HTS system, despite a low
                   naturalness score. The Blizzard challenge evaluation
                   has provided valuable information on the status of our
                   work and continued work will begin with analysing why
                   our modifications resulted in reduced naturalness
                   compared to the Benchmark HTS system.},
  categories = {HMM, HTS, speech synthesis, LF-model, glottal source,
                   prosodic prominence, emphasis},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
  year = 2009
}
@inproceedings{clark_gala97,
  author = {Robert A. J. Clark},
  title = {Language Acquisition and Implication for Language
                   Change: A Computational Model},
  booktitle = {Proceedings of the {GALA} 97 Conference on Language
                   Acquisition},
  pages = {322--326},
  categories = {lm},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/clark_gala97.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/clark_gala97.ps},
  year = 1997
}
@inproceedings{leo_07-1,
  author = {Leonardo Badino and Robert A.J. Clark},
  title = {Issues of Optionality in Pitch Accent Placement},
  booktitle = {Proc. 6th ISCA Speech Synthesis Workshop},
  address = {Bonn, Germany},
  abstract = {When comparing the prosodic realization of different
                   English speakers reading the same text, a significant
                   disagreement is usually found amongst the pitch accent
                   patterns of the speakers. Assuming that such
                   disagreement is due to a partial optionality of pitch
                   accent placement, it has been recently proposed to
                   evaluate pitch accent predictors by comparing them with
                   multi-speaker reference data. In this paper we face the
                   issue of pitch accent optionality at different levels.
                   At first we propose a simple mathematical definition of
                   intra-speaker optionality which allows us to introduce
                   a function for evaluating pitch accent predictors which
                   we show being more accurate and robust than those used
                   in previous works. Subsequently we compare a pitch
                   accent predictor trained on single speaker data with a
                   predictor trained on multi-speaker data in order to
                   point out the large overlapping between intra-speaker
                   and inter-speaker optionality. Finally, we show our
                   successful results in predicting intra-speaker
                   optionality and we suggest how this achievement could
                   be exploited to improve the performances of a unit
                    selection text-to-speech synthesis (TTS) system.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6_252.pdf},
  year = 2007
}
@article{beaver:07,
  author = {David Beaver and Brady Zack Clark and Edward Flemming
                   and T. Florian Jaeger and Maria Wolters},
  title = {When Semantics meets Phonetics: {A}coustical studies
                   of second occurrence focus},
  journal = {Language},
  volume = 83,
  number = 2,
  pages = {245--276},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/BeaverLanguage2007.pdf},
  year = 2007
}
@inproceedings{janska_clark:2010a,
  author = {Anna C. Janska and Robert A. J. Clark},
  title = {Native and Non-Native Speaker Judgements on the
                   Quality of Synthesized Speech},
  booktitle = {Proc. Interspeech},
  pages = {1121--1124},
  abstract = {The difference between native speakers' and non-native
                    speakers' naturalness judgements of synthetic speech
                   is investigated. Similar/difference judgements are
                   analysed via a multidimensional scaling analysis and
                   compared to Mean opinion scores. It is shown that
                   although the two groups generally behave in a similar
                   manner the variance of non-native speaker judgements is
                   generally higher. While both groups of subject can
                   clearly distinguish natural speech from the best
                   synthetic examples, the groups' responses to different
                   artefacts present in the synthetic speech can vary. },
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/janskaclark_interspeech2010.pdf},
  year = 2010
}
@inproceedings{clark:podsiadlo:mayo:king:blizzard2007,
  author = {Robert A. J. Clark and Monika Podsiadlo and Mark
                   Fraser and Catherine Mayo and Simon King },
  title = {Statistical Analysis of the {B}lizzard {C}hallenge
                   2007 Listening Test Results },
  booktitle = {Proc. Blizzard 2007 (in Proc. Sixth {ISCA} Workshop on
                   Speech Synthesis)},
  address = {Bonn, Germany},
  abstract = {Blizzard 2007 is the third Blizzard Challenge, in
                   which participants build voices from a common dataset.
                   A large listening test is conducted which allows
                   comparison of systems in terms of naturalness and
                   intelligibility. New sections were added to the
                   listening test for 2007 to test the perceived
                   similarity of the speaker's identity between natural
                   and synthetic speech. In this paper, we present the
                   results of the listening test and the subsequent
                   statistical analysis. },
  categories = {blizzard,listening test},
  keywords = {Blizzard},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_003.pdf},
  year = 2007
}
@article{white_clark_moore:2010,
  author = {Michael White and Robert A. J. Clark and Johanna D.
                   Moore},
  title = {Generating Tailored, Comparative Descriptions with
                   Contextually Appropriate Intonation},
  journal = {Computational Linguistics},
  volume = {36},
  number = {2},
  pages = {159--201},
  abstract = {Generating responses that take user preferences into
                   account requires adaptation at all levels of the
                   generation process. This article describes a
                   multi-level approach to presenting user-tailored
                   information in spoken dialogues which brings together
                   for the first time multi-attribute decision models,
                   strategic content planning, surface realization that
                   incorporates prosody prediction, and unit selection
                   synthesis that takes the resulting prosodic structure
                   into account. The system selects the most important
                   options to mention and the attributes that are most
                   relevant to choosing between them, based on the user
                   model. Multiple options are selected when each offers a
                   compelling trade-off. To convey these trade-offs, the
                   system employs a novel presentation strategy which
                   straightforwardly lends itself to the determination of
                   information structure, as well as the contents of
                   referring expressions. During surface realization, the
                   prosodic structure is derived from the information
                   structure using Combinatory Categorial Grammar in a way
                   that allows phrase boundaries to be determined in a
                   flexible, data-driven fashion. This approach to
                   choosing pitch accents and edge tones is shown to yield
                   prosodic structures with significantly higher
                   acceptability than baseline prosody prediction models
                   in an expert evaluation. These prosodic structures are
                   then shown to enable perceptibly more natural synthesis
                   using a unit selection voice that aims to produce the
                   target tunes, in comparison to two baseline synthetic
                   voices. An expert evaluation and f0 analysis confirm
                   the superiority of the generator-driven intonation and
                   its contribution to listeners' ratings.},
  doi = {10.1162/coli.09-023-R1-08-002},
  year = 2010
}
@mastersthesis{clark_msc96,
  author = {Robert A.J. Clark},
  title = {Internal and External Factors Affecting Language
                   Change: A Computational Model},
  school = {University of Edinburgh},
  categories = {lm},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/clark_msc96.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/clark_msc96.ps},
  year = 1996
}
@inproceedings{richmond_interspeech2010,
  author = {Richmond, Korin and Clark, Robert and Fitt, Sue},
  title = {On Generating {C}ombilex Pronunciations via
                   Morphological Analysis},
  booktitle = {Proc. Interspeech},
  pages = {1974--1977},
  address = {Makuhari, Japan},
  abstract = {Combilex is a high-quality lexicon that has been
                   developed specifically for speech technology purposes
                   and recently released by CSTR. Combilex benefits from
                   many advanced features. This paper explores one of
                   these: the ability to generate fully-specified
                   transcriptions for morphologically derived words
                   automatically. This functionality was originally
                   implemented to encode the pronunciations of derived
                   words in terms of their constituent morphemes, thus
                   accelerating lexicon development and ensuring a high
                   level of consistency. In this paper, we propose this
                   method of modelling pronunciations can be exploited
                   further by combining it with a morphological parser,
                   thus yielding a method to generate full transcriptions
                   for unknown derived words. Not only could this
                   accelerate adding new derived words to Combilex, but it
                   could also serve as an alternative to conventional
                   letter-to-sound rules. This paper presents preliminary
                   work indicating this is a promising direction.},
  keywords = {combilex lexicon, letter-to-sound rules,
                   grapheme-to-phoneme conversion, morphological
                   decomposition},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100683.pdf},
  year = 2010
}
@inproceedings{clarkrichmondking_interspeech05,
  author = {Robert A.J. Clark and Korin Richmond and Simon King},
  title = {Multisyn voices from {ARCTIC} data for the {B}lizzard
                   challenge},
  booktitle = {Proc. Interspeech 2005},
  abstract = {This paper describes the process of building unit
                   selection voices for the Festival Multisyn engine using
                   four ARCTIC datasets, as part of the Blizzard
                   evaluation challenge. The build process is almost
                   entirely automatic, with very little need for human
                   intervention. We discuss the difference in the
                   evaluation results for each voice and evaluate the
                   suitability of the ARCTIC datasets for building this
                   type of voice.},
  categories = {speech synthesis, festival, evaluation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/clarkrichmondking_interspeech05.pdf},
  year = 2005
}
@article{anderssonyamagishi12,
  author = {Andersson, S. and Yamagishi, J. and Clark, R.A.J.},
  title = {Synthesis and Evaluation of Conversational
                   Characteristics in {HMM}-Based Speech Synthesis},
  journal = {Speech Communication},
  volume = 54,
  number = 2,
  pages = {175--188},
  abstract = {Spontaneous conversational speech has many
                   characteristics that are currently not modelled well by
                   HMM-based speech synthesis and in order to build
                   synthetic voices that can give an impression of someone
                   partaking in a conversation, we need to utilise data
                   that exhibits more of the speech phenomena associated
                   with conversations than the more generally used
                   carefully read aloud sentences. In this paper we show
                   that synthetic voices built with HMM-based speech
                   synthesis techniques from conversational speech data,
                   preserved segmental and prosodic characteristics of
                   frequent conversational speech phenomena. An analysis
                   of an evaluation investigating the perception of
                   quality and speaking style of HMM-based voices confirms
                   that speech with conversational characteristics are
                   instrumental for listeners to perceive successful
                   integration of conversational speech phenomena in
                   synthetic speech. The achieved synthetic speech quality
                   provides an encouraging start for the continued use of
                   conversational speech in HMM-based speech synthesis.},
  doi = {10.1016/j.specom.2011.08.001},
  year = 2012
}
@inproceedings{janska_clark:2010b,
  author = {Anna C. Janska and Robert A. J. Clark},
  title = {Further exploration of the possibilities and pitfalls
                   of multidimensional scaling as a tool for the
                   evaluation of the quality of synthesized speech},
  booktitle = {The 7th ISCA Tutorial and Research Workshop on Speech
                   Synthesis},
  pages = {142--147},
  abstract = {Multidimensional scaling (MDS) has been suggested as a
                    useful tool for the evaluation of the quality of
                    synthesized speech. However, it has not yet been
                    extensively tested for its application in this
                    specific area of evaluation. In a series of
                    experiments based on data from the Blizzard Challenge 2008
                   the relations between Weighted Euclidean Distance
                   Scaling and Simple Euclidean Distance Scaling is
                   investigated to understand how aggregating data affects
                   the MDS configuration. These results are compared to
                   those collected as mean opinion scores (MOS). The ranks
                   correspond, and MOS can be predicted from an object's
                   space in the MDS generated stimulus space. The big
                   advantage of MDS over MOS is its diagnostic value;
                   dimensions along which stimuli vary are not correlated,
                   as is the case in modular evaluation using MOS.
                   Finally, it will be attempted to generalize from the
                   MDS representations of the thoroughly tested subset to
                   the aggregated data of the larger-scale Blizzard
                   Challenge.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/janskaclark_ssw7.pdf},
  year = 2010
}
@inproceedings{clarkrichmondking_ssw504,
  author = {Robert A.J. Clark and Korin Richmond and Simon King},
  title = {Festival 2 -- build your own general purpose unit
                   selection speech synthesiser},
  booktitle = {Proc. 5th {ISCA} workshop on speech synthesis},
  abstract = {This paper describes version 2 of the Festival speech
                   synthesis system. Festival 2 provides a development
                   environment for concatenative speech synthesis, and now
                   includes a general purpose unit selection speech
                   synthesis engine. We discuss various aspects of unit
                   selection speech synthesis, focusing on the research
                   issues that relate to voice design and the automation
                   of the voice development process.},
  categories = {synthesis, festival, unitselection},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.ps},
  year = 2004
}
@inproceedings{badinoclark_interspeech12,
  author = {Leonardo Badino and Robert A.J. Clark and Mirjam
                   Wester},
  title = {Towards Hierarchical Prosodic Prominence Generation in
                   {TTS} Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  categories = {speech synthesis, prosody},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/badinoclark_IS_2012.pdf},
  year = 2012
}
@inproceedings{bakerclarkwhite_ssw504,
  author = {Rachel Baker and Robert A.J. Clark and Michael White},
  title = {Synthesising Contextually Appropriate Intonation in
                   Limited Domains},
  booktitle = {Proc. 5th {ISCA} workshop on speech synthesis},
  address = {Pittsburgh, USA},
  categories = {synthesis, prosody, intonation, festival},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/bakerclarkwhite_ssw504.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/bakerclarkwhite_ssw504.ps},
  year = 2004
}
@inproceedings{clark_icphs99,
  author = {Robert A. J. Clark},
  title = {Using Prosodic Structure to Improve Pitch Range
                   Variation in Text to Speech Synthesis},
  booktitle = {Proc. {XIV}th international congress of phonetic
                   sciences},
  volume = 1,
  pages = {69--72},
  categories = {synthesis, prosody, intonation, festival},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/clark_icphs99.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/clark_icphs99.ps},
  year = 1999
}
@inproceedings{leo_09-1,
  author = {Leonardo Badino and J. Sebastian Andersson and Junichi
                   Yamagishi and Robert A.J. Clark},
  title = {Identification of Contrast and Its Emphatic
                   Realization in {HMM}-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2009},
  address = {Brighton, U.K.},
  abstract = {The work presented in this paper proposes to identify
                   contrast in the form of contrastive word pairs and
                   prosodically signal it with emphatic accents in a
                   Text-to-Speech (TTS) application using a
                   Hidden-Markov-Model (HMM) based speech synthesis
                   system. We first describe a novel method to
                   automatically detect contrastive word pairs using
                   textual features only and report its performance on a
                   corpus of spontaneous conversations in English.
                   Subsequently we describe the set of features selected
                   to train a HMM-based speech synthesis system and
                   attempting to properly control prosodic prominence
                   (including emphasis). Results from a large scale
                   perceptual test show that in the majority of cases
                   listeners judge emphatic contrastive word pairs as
                   acceptable as their non-emphatic counterpart, while
                   emphasis on non-contrastive pairs is almost never
                   acceptable.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090749.PDF},
  year = 2009
}
@inproceedings{strom:etal:interspeech2007,
  author = {Volker Strom and Ani Nenkova and Robert Clark and
                   Yolanda Vazquez-Alvarez and Jason Brenier and Simon
                   King and Dan Jurafsky},
  title = {Modelling Prominence and Emphasis Improves
                   Unit-Selection Synthesis},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  abstract = {We describe the results of large scale perception
                   experiments showing improvements in synthesising two
                   distinct kinds of prominence: standard pitch-accent and
                   strong emphatic accents. Previously prominence
                   assignment has been mainly evaluated by computing
                   accuracy on a prominence-labelled test set. By contrast
                   we integrated an automatic pitch-accent classifier into
                   the unit selection target cost and showed that
                   listeners preferred these synthesised sentences. We
                   also describe an improved recording script for
                   collecting emphatic accents, and show that generating
                   emphatic accents leads to further improvements in the
                   fiction genre over incorporating pitch accent only.
                   Finally, we show differences in the effects of
                   prominence between child-directed speech and news and
                   fiction genres. Index Terms: speech synthesis, prosody,
                   prominence, pitch accent, unit selection},
  categories = {speech synthesis},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/p540.pdf},
  year = 2007
}
@inproceedings{clark_blizzard2006,
  author = {Clark, R. and Richmond, K. and Strom, V. and King, S.},
  title = {Multisyn Voices for the {B}lizzard {C}hallenge 2006},
  booktitle = {Proc. Blizzard Challenge Workshop (Interspeech
                   Satellite)},
  address = {Pittsburgh, USA},
  note = {(http://festvox.org/blizzard/blizzard2006.html)},
  abstract = {This paper describes the process of building unit
                   selection voices for the Festival Multisyn engine using
                   the ATR dataset provided for the Blizzard Challenge
                   2006. We begin by discussing recent improvements that
                   we have made to the Multisyn voice building process,
                   prompted by our participation in the Blizzard Challenge
                   2006. We then go on to discuss our interpretation of
                   the results observed. Finally, we conclude with some
                   comments and suggestions for the formulation of future
                   Blizzard Challenges.},
  categories = {tts, blizzard, multisyn, unit selection},
  key = {clark_blizzard2006},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/cstr_blizzard2006.pdf},
  year = 2006
}
@inproceedings{clarkdusterhoff_eurospeech99,
  author = {Robert A. J. Clark and Kurt E. Dusterhoff},
  title = {Objective Methods for Evaluating Synthetic Intonation},
  booktitle = {Proc. {E}urospeech 1999},
  volume = 4,
  pages = {1623--1626},
  categories = {synthesis, prosody, intonation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/clarkdusterhoff_eurospeech99.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/clarkdusterhoff_eurospeech99.ps},
  year = 1999
}
@inproceedings{clark_king:proc:2006,
  author = {Robert A. J. Clark and Simon King},
  title = {Joint Prosodic and Segmental Unit Selection Speech
                   Synthesis},
  booktitle = {Proc. Interspeech 2006},
  address = {Pittsburgh, USA},
  abstract = {We describe a unit selection technique for
                   text-to-speech synthesis which jointly searches the
                   space of possible diphone sequences and the space of
                   possible prosodic unit sequences in order to produce
                   synthetic speech with more natural prosody. We
                    demonstrate that this search, although currently
                   computationally expensive, can achieve improved
                   intonation compared to a baseline in which only the
                   space of possible diphone sequences is searched. We
                   discuss ways in which the search could be made
                   sufficiently efficient for use in a real-time system.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/clarkking_interspeech_2006.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/clarkking_interspeech_2006.ps},
  year = 2006
}
@incollection{Pipe_etal:2011,
  author = {A. G. Pipe and R. Vaidyanathan and C. Melhuish and P.
                   Bremner and P. Robinson and R. A. J. Clark and A. Lenz
                   and K. Eder and N. Hawes and Z. Ghahramani and M.
                   Fraser and M. Mermehdi and P. Healey and S. Skachek},
  title = {Affective Robotics: Human Motion and Behavioural
                   Inspiration for Cooperation between Humans and
                   Assistive Robots},
  booktitle = {Biomimetics: Nature-Based Innovation},
  publisher = {Taylor and Francis},
  editor = {Yoseph Bar-Cohen},
  chapter = {15},
  year = 2011
}
@inproceedings{janskaetal_interspeech12,
  author = {Anna C. Janska and Erich Schr{\"o}ger and Thomas Jacobsen
                   and Robert A. J. Clark},
  title = {Asymmetries in the perception of synthesized speech},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  categories = {speech synthesis, evaluation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/janskaeral_IS_2012.pdf},
  year = 2012
}
@inproceedings{richmond2009a,
  author = {Richmond, K. and Clark, R. and Fitt, S.},
  title = {Robust {LTS} rules with the {Combilex} speech
                   technology lexicon},
  booktitle = {Proc. Interspeech},
  pages = {1295--1298},
  address = {Brighton, UK},
  abstract = {Combilex is a high quality pronunciation lexicon aimed
                   at speech technology applications that has recently
                   been released by CSTR. Combilex benefits from several
                   advanced features. This paper evaluates one of these:
                   the explicit alignment of phones to graphemes in a
                   word. This alignment can help to rapidly develop robust
                   and accurate letter-to-sound (LTS) rules, without
                   needing to rely on automatic alignment methods. To
                   evaluate this, we used Festival's LTS module, comparing
                   its standard automatic alignment with Combilex's
                   explicit alignment. Our results show using Combilex's
                   alignment improves LTS accuracy: 86.50\% words correct
                   as opposed to 84.49\%, with our most general form of
                   lexicon. In addition, building LTS models is greatly
                   accelerated, as the need to list allowed alignments is
                   removed. Finally, loose comparison with other studies
                   indicates Combilex is a superior quality lexicon in
                   terms of consistency and size.},
  keywords = {combilex, letter-to-sound rules, grapheme-to-phoneme
                   conversion},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090308.pdf},
  year = 2009
}
@phdthesis{clark_phd03,
  author = {Robert A. J. Clark},
  title = {Generating Synthetic Pitch Contours Using Prosodic
                   Structure},
  school = {The University of Edinburgh},
  categories = {speech synthesis, prosody, intonation, festival},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_phd03.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_phd03.ps.gz},
  year = 2003
}
@inproceedings{leo_08-2,
  author = {Leonardo Badino and Robert A. J. Clark and Volker Strom},
  title = {Including Pitch Accent Optionality in Unit Selection
                   Text-to-Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Brisbane},
  abstract = {A significant variability in pitch accent placement is
                   found when comparing the patterns of prosodic
                   prominence realized by different English speakers
                   reading the same sentences. In this paper we describe a
                   simple approach to incorporate this variability to
                   synthesize prosodic prominence in unit selection
                   text-to-speech synthesis. The main motivation of our
                   approach is that by taking into account the variability
                   of accent placements we enlarge the set of prosodically
                   acceptable speech units, thus increasing the chances of
                   selecting a good quality sequence of units, both in
                   prosodic and segmental terms. Results on a large scale
                   perceptual test show the benefits of our approach and
                   indicate directions for further improvements.},
  categories = {speech synthesis, unit selection, prosodic prominence,
                   pitch accents},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.ps},
  year = 2008
}
@inproceedings{strom06,
  author = {Volker Strom and Robert Clark and Simon King},
  title = {Expressive Prosody for Unit-selection Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Pittsburgh},
  abstract = {Current unit selection speech synthesis voices cannot
                   produce emphasis or interrogative contours because of a
                   lack of the necessary prosodic variation in the
                   recorded speech database. A method of recording script
                   design is proposed which addresses this shortcoming.
                   Appropriate components were added to the target cost
                   function of the Festival Multisyn engine, and a
                   perceptual evaluation showed a clear preference over
                   the baseline system.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/strom06.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/strom06.ps},
  year = 2006
}
@inproceedings{mayoclarkking-isp05,
  author = {Mayo, C. and Clark, R. A. J. and King, S.},
  title = {Multidimensional Scaling of Listener Responses to
                   Synthetic Speech},
  booktitle = {Proc. Interspeech},
  address = {Lisbon, Portugal},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/ie-speech-2005.pdf},
  year = 2005
}
@inproceedings{clark_icphs03,
  author = {Robert A. J. Clark},
  title = {Modelling Pitch Accents for Concept-to-Speech
                   Synthesis},
  booktitle = {Proc. XVth International Congress of Phonetic Sciences},
  volume = 2,
  pages = {1141--1144},
  categories = {speech synthesis, prosody, intonation, festival},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_icphs03.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_icphs03.ps},
  year = 2003
}
@inproceedings{karaiskos:king:clark:mayo:blizzard2008,
  author = {Vasilis Karaiskos and Simon King and Robert A. J.
                   Clark and Catherine Mayo},
  title = {The Blizzard Challenge 2008},
  booktitle = {Proc. Blizzard Challenge Workshop},
  address = {Brisbane, Australia},
  abstract = {The Blizzard Challenge 2008 was the fourth annual
                   Blizzard Challenge. This year, participants were asked
                   to build two voices from a UK English corpus and one
                   voice from a Mandarin Chinese corpus. This is the
                   first time that a language other than English has been
                   included and also the first time that a large UK
                   English corpus has been available. In addition, the
                   English corpus contained somewhat more expressive
                   speech than that found in corpora used in previous
                   Blizzard Challenges. To assist participants with
                   limited resources or limited experience in
                   UK-accented English or Mandarin, unaligned labels
                   were provided for both corpora and for the test
                   sentences. Participants could use the provided labels
                   or create their own. An accent-specific pronunciation
                   dictionary was also available for the English speaker.
                   A set of test sentences was released to participants,
                   who were given a limited time in which to synthesise
                   them and submit the synthetic speech. An online
                   listening test was conducted, to evaluate
                   naturalness, intelligibility and degree of similarity
                   to the original speaker.},
  keywords = {Blizzard},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/summary_Blizzard2008.pdf},
  year = 2008
}
@article{clarkrichmondking_specom2007,
  author     = {Robert A. J. Clark and Korin Richmond and Simon King},
  title      = {Multisyn: Open-domain unit selection for the
                {F}estival speech synthesis system},
  journal    = {Speech Communication},
  year       = {2007},
  volume     = {49},
  number     = {4},
  pages      = {317--330},
  doi        = {10.1016/j.specom.2007.01.014},
  abstract   = {We present the implementation and evaluation of an
                open-domain unit selection speech synthesis engine
                designed to be flexible enough to encourage further
                unit selection research and allow rapid voice
                development by users with minimal speech synthesis
                knowledge and experience. We address the issues of
                automatically processing speech data into a usable
                voice using automatic segmentation techniques and how
                the knowledge obtained at labelling time can be
                exploited at synthesis time. We describe target cost
                and join cost implementation for such a system and
                describe the outcome of building voices with a number
                of different sized datasets. We show that, in a
                competitive evaluation, voices built using this
                technology compare favourably to other systems.},
  categories = {speech synthesis, festival, multisyn, unitselection},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/clarkrichmondking_specom2007.pdf},
}
@article{mayo:clark:king:10,
  author = {Mayo, C. and Clark, R. A. J. and King, S.},
  title = {Listeners' Weighting of Acoustic Cues to Synthetic
                   Speech Naturalness: A Multidimensional Scaling Analysis},
  journal = {Speech Communication},
  volume = 53,
  number = 3,
  pages = {311--326},
  abstract = {The quality of current commercial speech synthesis
                   systems is now so high that system improvements are
                   being made at subtle sub- and supra-segmental levels.
                   Human perceptual evaluation of such subtle improvements
                   requires a highly sophisticated level of perceptual
                   attention to specific acoustic characteristics or cues.
                   However, it is not well understood what acoustic cues
                   listeners attend to by default when asked to evaluate
                   synthetic speech. It may, therefore, be potentially
                   quite difficult to design an evaluation method that
                   allows listeners to concentrate on only one dimension
                   of the signal, while ignoring others that are
                   perceptually more important to them. The aim of the
                   current study was to determine which acoustic
                   characteristics of unit-selection synthetic speech are
                   most salient to listeners when evaluating the
                   naturalness of such speech. This study made use of
                   multidimensional scaling techniques to analyse
                   listeners' pairwise comparisons of synthetic speech
                   sentences. Results indicate that listeners place a
                   great deal of perceptual importance on the presence of
                   artifacts and discontinuities in the speech, somewhat
                   less importance on aspects of segmental quality, and
                   very little importance on stress/intonation
                   appropriateness. These relative differences in
                   importance will impact on listeners' ability to attend
                   to these different acoustic characteristics of
                   synthetic speech, and should therefore be taken into
                   account when designing appropriate methods of synthetic
                   speech evaluation.},
  doi = {10.1016/j.specom.2010.10.003},
  keywords = {speech synthesis, evaluation, speech perception,
                   acoustic cue weighting, multidimensional scaling},
  year = 2011
}
@inproceedings{richmond2007b,
  author = {Richmond, K. and Strom, V. and Clark, R. and
                   Yamagishi, J. and Fitt, S.},
  title = {Festival Multisyn Voices for the 2007 Blizzard
                   Challenge},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc. SSW6)},
  address = {Bonn, Germany},
  abstract = {This paper describes selected aspects of the Festival
                   Multisyn entry to the Blizzard Challenge 2007. We
                   provide an overview of the process of building the
                   three required voices from the speech data provided.
                   This paper focuses on new features of Multisyn which
                   are currently under development and which have been
                   employed in the system used for this Blizzard
                   Challenge. These differences are the application of a
                   more flexible phonetic lattice representation during
                   forced alignment labelling and the use of a pitch
                   accent target cost component. Finally, we also examine
                   aspects of the speech data provided for this year's
                   Blizzard Challenge and raise certain issues for
                   discussion concerning the aim of comparing voices made
                   with differing subsets of the data provided.},
  categories = {tts, blizzard, multisyn, unit selection},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007paper.pdf},
  year = 2007
}
@inproceedings{leo_08-1,
  author = {Leonardo Badino and Robert A. J. Clark},
  title = {Automatic labeling of contrastive word pairs from
                   spontaneous spoken English},
  booktitle = {2008 IEEE/ACL Workshop on Spoken Language
                   Technology},
  address = {Goa, India},
  abstract = {This paper addresses the problem of automatically
                   labeling contrast in spontaneous spoken speech, where
                   contrast here is meant as a relation that ties two
                   words that explicitly contrast with each other.
                   Detection of contrast is certainly relevant in the
                   analysis of discourse and information structure and
                   also, because of the prosodic correlates of contrast,
                   could play an important role in speech applications,
                   such as text-to-speech synthesis, that need an accurate
                   and discourse context related modeling of prosody. With
                   this prospect we investigate the feasibility of
                   automatic contrast labeling by training and evaluating
                   on the Switchboard corpus a novel contrast tagger,
                   based on Support Vector Machines (SVM), that combines
                   lexical features, syntactic dependencies and WordNet
                   semantic relations.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/0000101.pdf},
  year = 2008
}
@inproceedings{hofer-eurosp05,
  author = {G. Hofer and K. Richmond and R. Clark},
  title = {Informed Blending of Databases for Emotional Speech
                   Synthesis},
  booktitle = {Proc. Interspeech},
  abstract = {The goal of this project was to build a unit selection
                   voice that could portray emotions with varying
                   intensities. A suitable definition of an emotion was
                   developed along with a descriptive framework that
                   supported the work carried out. A single speaker was
                   recorded portraying happy and angry speaking styles.
                   Additionally a neutral database was also recorded. A
                   target cost function was implemented that chose units
                   according to emotion mark-up in the database. The
                   Dictionary of Affect supported the emotional target
                   cost function by providing an emotion rating for words
                   in the target utterance. If a word was particularly
                   'emotional', units from that emotion were favoured. In
                   addition intensity could be varied which resulted in a
                   bias to select a greater number emotional units. A
                   perceptual evaluation was carried out and subjects were
                   able to recognise reliably emotions with varying
                   amounts of emotional units present in the target
                   utterance.},
  categories = {speech synthesis, emotion, edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.ps},
  year = 2005
}
@inproceedings{anderssonetal2010_ssw7,
  author     = {Sebastian Andersson and Junichi Yamagishi and Robert
                Clark},
  title      = {Utilising Spontaneous Conversational Speech in
                {HMM}-Based Speech Synthesis},
  booktitle  = {The 7th ISCA Tutorial and Research Workshop on Speech
                Synthesis},
  year       = {2010},
  month      = sep,
  abstract   = {Spontaneous conversational speech has many
                characteristics that are currently not well modelled in
                unit selection and HMM-based speech synthesis. But in
                order to build synthetic voices more suitable for
                interaction we need data that exhibits more
                conversational characteristics than the generally used
                read aloud sentences. In this paper we will show how
                carefully selected utterances from a spontaneous
                conversation was instrumental for building an HMM-based
                synthetic voices with more natural sounding
                conversational characteristics than a voice based on
                carefully read aloud sentences. We also investigated a
                style blending technique as a solution to the inherent
                problem of phonetic coverage in spontaneous speech
                data. But the lack of an appropriate representation of
                spontaneous speech phenomena probably contributed to
                results showing that we could not yet compete with the
                speech quality achieved for grammatical sentences.},
  categories = {HMM, speech synthesis, spontaneous speech,
                conversation, lexical fillers, filled pauses},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7_paper.pdf},
}