The Centre for Speech Technology Research, The University of Edinburgh

Publications by Matthew Aylett


  author = {Andersson, Sebastian and Georgila, Kallirroi and Traum, David and Aylett, Matthew and Clark, Robert},
  title = {Prediction and Realisation of Conversational Characteristics by Utilising Spontaneous Speech for Unit Selection},
  booktitle = {Speech Prosody 2010},
  month = {May},
  year = {2010},
  pdf = {},
  abstract = {Unit selection speech synthesis has reached high levels of naturalness and intelligibility for neutral read aloud speech. However, synthetic speech generated using neutral read aloud data lacks all the attitude, intention and spontaneity associated with everyday conversations. Unit selection is heavily data dependent and thus in order to simulate human conversational speech, or create synthetic voices for believable virtual characters, we need to utilise speech data with examples of how people talk rather than how people read. In this paper we included carefully selected utterances from spontaneous conversational speech in a unit selection voice. Using this voice and by automatically predicting type and placement of lexical fillers and filled pauses we can synthesise utterances with conversational characteristics. A perceptual listening test showed that it is possible to make synthetic speech sound more conversational without degrading naturalness.},
  categories = {speech synthesis, unit selection, conversation, spontaneous speech, lexical fillers, filled pauses}
  author = {Aylett, Matthew P. and Yamagishi, Junichi},
  title = {Combining Statistical Parameteric Speech Synthesis and Unit-Selection for Automatic Voice Cloning},
  booktitle = {Proc. LangTech 2008},
  year = {2008},
  month = {September},
  key = {cereproc-hts},
  address = {Brisbane, Australia},
  pdf = {},
  abstract = {The ability to use the recorded audio of a subject's voice to produce an open-domain synthesis system has generated much interest both in academic research and in commercial speech technology. The ability to produce synthetic versions of a subject's voice has potential commercial applications, such as virtual celebrity actors, or potential clinical applications, such as offering a synthetic replacement voice in the case of a laryngectomy. Recent developments in HMM-based speech synthesis have shown it is possible to produce synthetic voices from quite small amounts of speech data. However, mimicking the depth and variation of a speaker's prosody as well as synthesising natural voice quality is still a challenging research problem. In contrast, unit-selection systems have shown it is possible to strongly retain the character of the voice but only with sufficient original source material. Often this runs into hours and may require significant manual checking and labelling. In this paper we will present two state of the art systems, an HMM based system HTS-2007, developed by CSTR and Nagoya Institute of Technology, and a commercial unit-selection system CereVoice, developed by Cereproc. Both systems have been used to mimic the voice of George W. Bush (43rd president of the United States) using freely available audio from the web. In addition we will present a hybrid system which combines both technologies. We demonstrate examples of synthetic voices created from 10, 40 and 210 minutes of randomly selected speech. We will then discuss the underlying problems associated with voice cloning using found audio, and the scalability of our solution.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice}
  author = {Aylett, Matthew P. and King, Simon and Yamagishi, Junichi},
  title = {Speech Synthesis Without a Phone Inventory},
  booktitle = {Interspeech},
  pages = {2087--2090},
  place = {Brighton},
  year = {2009},
  pdf = {},
  abstract = {In speech synthesis the unit inventory is decided using phonological and phonetic expertise. This process is resource intensive and potentially sub-optimal. In this paper we investigate how acoustic clustering, together with lexicon constraints, can be used to build a self-organised inventory. Six English speech synthesis systems were built using two frameworks, unit selection and parametric HTS for three inventory conditions: 1) a traditional phone set, 2) a system using orthographic units, and 3) a self-organised inventory. A listening test showed a strong preference for the classic system, and for the orthographic system over the self-organised system. Results also varied by letter to sound complexity and database coverage. This suggests the self-organised approach failed to generalise pronunciation as well as introducing noise above and beyond that caused by orthographic sound mismatch.},
  categories = {speech synthesis, unit selection, parametric synthesis, phone inventory, orthographic synthesis}
  author = {Aylett, Matthew P. and King, Simon},
  title = {Single Speaker Segmentation and Inventory Selection Using Dynamic Time Warping Self Organization and Joint Multigram Mapping},
  booktitle = {SSW06},
  pages = {258--263},
  place = {Bonn},
  year = {2008},
  pdf = {},
  abstract = {In speech synthesis the inventory of units is decided by inspection and on the basis of phonological and phonetic expertise. The ephone (or emergent phone) project at CSTR is investigating how self organisation techniques can be applied to build an inventory based on collected acoustic data together with the constraints of a synthesis lexicon. In this paper we will describe a prototype inventory creation method using dynamic time warping (DTW) for acoustic clustering and a joint multigram approach for relating a series of symbols that represent the speech to these emerged units. We initially examined two symbol sets: 1) A baseline of standard phones 2) Orthographic symbols. The success of the approach is evaluated by comparing word boundaries generated by the emergent phones against those created using state-of-the-art HMM segmentation. Initial results suggest the DTW segmentation can match word boundaries with a root mean square error (RMSE) of 35ms. Results from mapping units onto phones resulted in a higher RMSE of 103ms. This error was increased when multiple multigram types were added and when the default unit clustering was altered from 40 (our baseline) to 10. Results for orthographic matching had a higher RMSE of 125ms. To conclude we discuss future work that we believe can reduce this error rate to a level sufficient for the techniques to be applied to a unit selection synthesis system.},
  categories = {speech synthesis, unit selection, parametric synthesis, phone inventory, orthographic synthesis}
  author = {Andersson, J. Sebastian and Badino, Leonardo and Watts, Oliver S. and Aylett, Matthew P.},
  title = {The {CSTR/Cereproc B}lizzard Entry 2008: The Inconvenient Data},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc. Interspeech 2008)},
  address = {Brisbane, Australia},
  year = {2008},
  pdf = {},
  abstract = {In a commercial system data used for unit selection systems is collected with a heavy emphasis on homogeneous neutral data that has sufficient coverage for the units that will be used in the system. In this year's Blizzard entry CSTR and CereProc present a joint entry where the emphasis has been to explore techniques to deal with data which is not homogeneous (the English entry) and did not have appropriate coverage for a diphone based system (the Mandarin entry where tone/phone combinations were treated as distinct phone categories). In addition, two further problems were addressed, 1) Making use of non-homogeneous data for creating a voice that can realise both expressive and neutral speaking styles (the English entry) 2) Building a unit selection system with no native understanding of the language but depending instead on external native evaluation (the Mandarin Entry).}
  author = {Stan, Adriana and Yamagishi, Junichi and King, Simon and Aylett, Matthew},
  volume = {53},
  doi = {10.1016/j.specom.2010.12.002},
  title = {The {R}omanian speech synthesis ({RSS}) corpus: Building a high quality {HMM}-based speech synthesis system using a high sampling rate},
  url = {},
  journal = {Speech Communication},
  issn = {0167-6393},
  number = {3},
  pages = {442--450},
  note = {},
  year = {2011},
  keywords = {Speech synthesis, HTS, Romanian, HMMs, Sampling frequency, Auditory scale},
  abstract = {This paper first introduces a newly-recorded high quality Romanian speech corpus designed for speech synthesis, called ``RSS'', along with Romanian front-end text processing modules and HMM-based synthetic voices built from the corpus. All of these are now freely available for academic use in order to promote Romanian speech technology research. The RSS corpus comprises 3500 training sentences and 500 test sentences uttered by a female speaker and was recorded using multiple microphones at 96 kHz sampling frequency in a hemianechoic chamber. The details of the new Romanian text processor we have developed are also given. Using the database, we then revisit some basic configuration choices of speech synthesis, such as waveform sampling frequency and auditory frequency warping scale, with the aim of improving speaker similarity, which is an acknowledged weakness of current HMM-based speech synthesisers. As we demonstrate using perceptual tests, these configuration choices can make substantial differences to the quality of the synthetic speech. Contrary to common practice in automatic speech recognition, higher waveform sampling frequencies can offer enhanced feature extraction and improved speaker similarity for HMM-based speech synthesis.}
  author = {Mayo, C. and Aylett, M. and Ladd, D. R.},
  pdf = {},
  booktitle = {Intonation: Theory, Models and Applications},
  title = {Prosodic transcription of Glasgow English: an evaluation study of {GlaToBI}},
  categories = {intonation, perceptual evaluation, Glasgow English, transcription, ToBI},
  year = {1997}
  author = {Aylett, Matthew P. and Andersson, J. Sebastian and Badino, Leonardo and Pidcock, Christopher J.},
  title = {The {C}erevoice {B}lizzard Entry 2007: Are Small Database Errors Worse than Compression Artifacts?},
  booktitle = {Proc. Blizzard Challenge Workshop 2007},
  address = {Bonn, Germany},
  year = {2007},
  pdf = {},
  abstract = {In commercial systems the memory footprint of unit selection systems is often a key issue. This is especially true for PDAs and other embedded devices. In this year's Blizzard entry CereProc\textregistered{} gave itself the criteria that the full database system entered would have a smaller memory footprint than either of the two smaller database entries. This was accomplished by applying Speex speech compression to the full database entry. In turn a set of small database techniques used to improve the quality of small database systems in last year's entry were extended. Finally, for all systems, two quality control methods were applied to the underlying database to improve the lexicon and transcription match to the underlying data. Results suggest that mild audio quality artifacts introduced by lossy compression have almost as much impact on MOS perceived quality as concatenation errors introduced by sparse data in the smaller systems with bulked diphones.}
  author = {Aylett, Matthew and Dall, Rasmus and Ghoshal, Arnab and Henter, Gustav Eje and Merritt, Thomas},
  title = {A Flexible Front-End for {HTS}},
  booktitle = {Proc. Interspeech},
  abstract = {Parametric speech synthesis techniques depend on full context acoustic models generated by language front-ends, which analyse linguistic and phonetic structure. HTS, the leading parametric synthesis system, can use a number of different front-ends to generate full context models for synthesis and training. In this paper we explore the use of a new text processing front-end that has been added to the speech recognition toolkit Kaldi as part of an ongoing project to produce a new parametric speech synthesis system, Idlak. The use of XML specification files, a modular design, and modern coding and testing approaches, make the Idlak front-end ideal for adding, altering and experimenting with the contexts used in full context acoustic models. The Idlak front-end was evaluated against the standard Festival front-end in the HTS system. Results from the Idlak front-end compare well with the more mature Festival front-end (Idlak - 2.83 MOS vs Festival - 2.85 MOS), although a slight reduction in naturalness perceived by non-native English speakers can be attributed to Festival's insertion of non-punctuated pauses.},
  month = {September},
  year = {2014},
  pdf = {},
  pages = {1283--1287},
  categories = {speech synthesis, text processing, parametric synthesis, Kaldi, Idlak}
  author = {Wester, Mirjam and Aylett, Matthew and Tomalin, Marcus and Dall, Rasmus},
  title = {Artificial Personality and Disfluency},
  booktitle = {Proc. Interspeech},
  address = {Dresden},
  month = {September},
  year = {2015},
  pdf = {},
  abstract = {The focus of this paper is artificial voices with different personalities. Previous studies have shown links between an individual's use of disfluencies in their speech and their perceived personality. Here, filled pauses (uh and um) and discourse markers (like, you know, I mean) have been included in synthetic speech as a way of creating an artificial voice with different personalities. We discuss the automatic insertion of filled pauses and discourse markers (i.e., fillers) into otherwise fluent texts. The automatic system is compared to a ground truth of human ``acted'' filler insertion. Perceived personality (as defined by the big five personality dimensions) of the synthetic speech is assessed by means of a standardised questionnaire. Synthesis without fillers is compared to synthesis with either spontaneous or synthetic fillers. Our findings explore how the inclusion of disfluencies influences the way in which subjects rate the perceived personality of an artificial voice.},
  categories = {artificial personality, TTS, disfluency}