The Centre for Speech Technology Research, The University of Edinburgh

Publications by Matthew Aylett

matthewa.bib

@inproceedings{anderssonetal2010,
  author = {Sebastian Andersson and Kallirroi Georgila and David
                   Traum and Matthew Aylett and Robert Clark},
  title = {Prediction and Realisation of Conversational
                   Characteristics by Utilising Spontaneous Speech for
                   Unit Selection},
  booktitle = {Speech Prosody 2010},
  abstract = {Unit selection speech synthesis has reached high
                   levels of naturalness and intelligibility for neutral
                   read aloud speech. However, synthetic speech generated
                   using neutral read aloud data lacks all the attitude,
                   intention and spontaneity associated with everyday
                   conversations. Unit selection is heavily data dependent
                   and thus in order to simulate human conversational
                   speech, or create synthetic voices for believable
                   virtual characters, we need to utilise speech data with
                   examples of how people talk rather than how people
                   read. In this paper we included carefully selected
                   utterances from spontaneous conversational speech in a
                   unit selection voice. Using this voice and by
                   automatically predicting type and placement of lexical
                   fillers and filled pauses we can synthesise utterances
                   with conversational characteristics. A perceptual
                   listening test showed that it is possible to make
                   synthetic speech sound more conversational without
                   degrading naturalness.},
  categories = {speech synthesis, unit selection, conversation,
                   spontaneous speech, lexical fillers, filled pauses},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/100116.pdf},
  year = 2010
}
@inproceedings{cereproc-hts,
  author = {Matthew P. Aylett and Junichi Yamagishi},
  title = {Combining Statistical Parametric Speech Synthesis and
                   Unit-Selection for Automatic Voice Cloning},
  booktitle = {Proc. LangTech 2008},
  address = {Brisbane, Australia},
  abstract = {The ability to use the recorded audio of a subject's
                   voice to produce an open-domain synthesis system has
                   generated much interest both in academic research and
                   in commercial speech technology. The ability to produce
                   synthetic versions of a subject's voice has potential
                   commercial applications, such as virtual celebrity
                   actors, or potential clinical applications, such as
                   offering a synthetic replacement voice in the case of a
                   laryngectomy. Recent developments in HMM-based speech
                   synthesis have shown it is possible to produce
                   synthetic voices from quite small amounts of speech
                   data. However, mimicking the depth and variation of a
                   speaker's prosody as well as synthesising natural
                   voice quality is still a challenging research problem.
                   In contrast, unit-selection systems have shown it is
                   possible to strongly retain the character of the voice
                   but only with sufficient original source material.
                   Often this runs into hours and may require significant
                   manual checking and labelling. In this paper we will
                   present two state-of-the-art systems: an HMM-based
                   system, HTS-2007, developed by CSTR and Nagoya Institute of
                   Technology, and a commercial unit-selection system,
                   CereVoice, developed by CereProc. Both systems have
                   been used to mimic the voice of George W. Bush (43rd
                   president of the United States) using freely available
                   audio from the web. In addition we will present a
                   hybrid system which combines both technologies. We
                   demonstrate examples of synthetic voices created from
                   10, 40 and 210 minutes of randomly selected speech. We
                   will then discuss the underlying problems associated
                   with voice cloning using found audio, and the
                   scalability of our solution.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice},
  key = {cereproc-hts},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/03_AYLETT.pdf},
  year = 2008
}
@inproceedings{Ayletetal09,
  author = {Matthew P. Aylett and Simon King and Junichi Yamagishi},
  title = {Speech Synthesis Without a Phone Inventory},
  booktitle = {Interspeech},
  pages = {2087--2090},
  abstract = { In speech synthesis the unit inventory is decided
                   using phonological and phonetic expertise. This process
                   is resource intensive and potentially sub-optimal. In
                   this paper we investigate how acoustic clustering,
                   together with lexicon constraints, can be used to build
                   a self-organised inventory. Six English speech
                   synthesis systems were built using two frameworks, unit
                   selection and parametric HTS, for three inventory
                   conditions: 1) a traditional phone set, 2) a system
                   using orthographic units, and 3) a self-organised
                   inventory. A listening test showed a strong preference
                   for the classic system, and for the orthographic system
                   over the self-organised system. Results also varied by
                   letter-to-sound complexity and database coverage. This
                   suggests the self-organised approach failed to
                   generalise pronunciation, as well as introducing noise
                   above and beyond that caused by orthography-to-sound
                   mismatch.},
  categories = {speech synthesis, unit selection, parametric
                   synthesis, phone inventory, orthographic synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/ma_interspeech09.pdf},
  place = {Brighton},
  year = 2009
}
@inproceedings{Aylett+King08,
  author = {Matthew P. Aylett and Simon King},
  title = {Single Speaker Segmentation and Inventory Selection
                   Using Dynamic Time Warping Self Organization and Joint
                   Multigram Mapping},
  booktitle = {SSW06},
  pages = {258--263},
  abstract = {In speech synthesis the inventory of units is decided
                   by inspection and on the basis of phonological and
                   phonetic expertise. The ephone (or emergent phone)
                   project at CSTR is investigating how self organisation
                   techniques can be applied to build an inventory based
                   on collected acoustic data together with the
                   constraints of a synthesis lexicon. In this paper we
                   will describe a prototype inventory creation method
                   using dynamic time warping (DTW) for acoustic
                   clustering and a joint multigram approach for relating
                   a series of symbols that represent the speech to these
                   emerged units. We initially examined two symbol sets:
                   1) a baseline of standard phones, and 2) orthographic
                   symbols. The success of the approach is evaluated by
                   comparing word boundaries generated by the emergent
                   phones against those created using state-of-the-art HMM
                   segmentation. Initial results suggest the DTW
                   segmentation can match word boundaries with a root mean
                   square error (RMSE) of 35ms. Mapping units onto phones
                   resulted in a higher RMSE of 103ms. This
                   error was increased when multiple multigram types were
                   added and when the default unit clustering was altered
                   from 40 (our baseline) to 10. Results for orthographic
                   matching had a higher RMSE of 125ms. To conclude we
                   discuss future work that we believe can reduce this
                   error rate to a level sufficient for the techniques to
                   be applied to a unit selection synthesis system. },
  categories = {speech synthesis, unit selection, parametric
                   synthesis, phone inventory, orthographic synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ssw06.pdf},
  place = {Bonn},
  year = 2008
}
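
Note: the Aylett and King (2008) entry above evaluates the emergent-phone
(DTW) segmentation by the root mean square error between its word boundaries
and reference boundaries from HMM forced alignment. The short Python sketch
below only illustrates that RMSE computation; the boundary times, variable
names and function name are hypothetical and are not taken from the paper.

import math

def boundary_rmse_ms(predicted, reference):
    """RMSE (in ms) between two equal-length lists of boundary times in seconds."""
    assert len(predicted) == len(reference) and predicted
    sq_err = [(p - r) ** 2 for p, r in zip(predicted, reference)]
    return 1000.0 * math.sqrt(sum(sq_err) / len(sq_err))

# Hypothetical word-boundary times (seconds) for a single utterance.
dtw_boundaries = [0.210, 0.545, 0.930, 1.320]   # emergent-phone / DTW segmentation
hmm_boundaries = [0.195, 0.520, 0.960, 1.300]   # reference HMM forced alignment
print(f"word-boundary RMSE: {boundary_rmse_ms(dtw_boundaries, hmm_boundaries):.1f} ms")
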
@article{Stan2011442,
  author = {Adriana Stan and Junichi Yamagishi and Simon King and
                   Matthew Aylett},
  title = {The {R}omanian speech synthesis ({RSS}) corpus:
                   Building a high quality {HMM}-based speech synthesis
                   system using a high sampling rate},
  journal = {Speech Communication},
  volume = {53},
  number = {3},
  pages = {442--450},
  abstract = {This paper first introduces a newly-recorded high
                   quality Romanian speech corpus designed for speech
                   synthesis, called ``RSS'', along with Romanian
                   front-end text processing modules and HMM-based
                   synthetic voices built from the corpus. All of these
                   are now freely available for academic use in order to
                   promote Romanian speech technology research. The RSS
                   corpus comprises 3500 training sentences and 500 test
                   sentences uttered by a female speaker and was recorded
                   using multiple microphones at 96 kHz sampling
                   frequency in a hemianechoic chamber. The details of the
                   new Romanian text processor we have developed are also
                   given. Using the database, we then revisit some basic
                   configuration choices of speech synthesis, such as
                   waveform sampling frequency and auditory frequency
                   warping scale, with the aim of improving speaker
                   similarity, which is an acknowledged weakness of
                   current HMM-based speech synthesisers. As we
                   demonstrate using perceptual tests, these configuration
                   choices can make substantial differences to the quality
                   of the synthetic speech. Contrary to common practice in
                   automatic speech recognition, higher waveform sampling
                   frequencies can offer enhanced feature extraction and
                   improved speaker similarity for HMM-based speech
                   synthesis.},
  doi = {10.1016/j.specom.2010.12.002},
  issn = {0167-6393},
  keywords = {Speech synthesis, HTS, Romanian, HMMs, Sampling
                   frequency, Auditory scale},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639310002074},
  year = 2011
}
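
Note: the Stan et al. (2011) entry above revisits waveform sampling frequency
and the auditory frequency warping scale for HMM-based synthesis. As a minimal
illustration only, the Python sketch below applies the standard mel-scale
warping up to the Nyquist frequency at two sampling rates; the chosen rates
are assumptions for illustration and the paper's own feature extraction is
not reproduced here.

import math

def hz_to_mel(f_hz):
    """O'Shaughnessy mel-scale warping: mel = 2595 * log10(1 + f/700)."""
    return 2595.0 * math.log10(1.0 + f_hz / 700.0)

for fs in (16000, 48000):            # hypothetical sampling frequencies to contrast
    nyquist = fs / 2.0               # highest representable frequency at rate fs
    print(f"fs = {fs:5d} Hz -> Nyquist {nyquist:6.0f} Hz -> {hz_to_mel(nyquist):7.1f} mel")
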
@inproceedings{mayoaylettladd:97,
  author = {Mayo, C. and Aylett, M. and Ladd, D. R.},
  title = {Prosodic transcription of Glasgow English: an
                   evaluation study of {GlaToBI}},
  booktitle = {Intonation: Theory, Models and Applications},
  categories = {intonation, perceptual evaluation, Glasgow English,
                   transcription, ToBI},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/esca2.pdf},
  year = 1997
}
@inproceedings{leo_07-2,
  author = {Matthew P. Aylett and J. Sebastian Andersson and
                   Leonardo Badino and Christopher J. Pidcock},
  title = {The {C}ere{V}oice {B}lizzard Entry 2007: Are Small
                   Database Errors Worse than Compression Artifacts?},
  booktitle = {Proc. Blizzard Challenge Workshop 2007},
  address = {Bonn, Germany},
  abstract = {In commercial systems the memory footprint of unit
                   selection systems is often a key issue. This is
                   especially true for PDAs and other embedded devices. In
                   this year's Blizzard entry CereProc set itself the
                   criterion that the full database system entered would
                   have a smaller memory footprint than either of the two
                   smaller database entries. This was accomplished by
                   applying Speex speech compression to the full database
                   entry. In turn, the set of small-database techniques used
                   to improve the quality of small database systems in
                   last year's entry was extended. Finally, for all
                   systems, two quality control methods were applied to
                   the underlying database to improve the lexicon and
                   transcription match to the underlying data. Results
                   suggest that mild audio quality artifacts introduced by
                   lossy compression have almost as much impact on MOS
                   perceived quality as concatenation errors introduced by
                   sparse data in the smaller systems with bulked
                   diphones.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_004.pdf},
  year = 2007
}