Publications by Matthew Aylett
matthewa.bib
@inproceedings{anderssonetal2010,
author = {Sebastian Andersson and Kallirroi Georgila and David
Traum and Matthew Aylett and Robert Clark},
title = {Prediction and Realisation of Conversational
Characteristics by Utilising Spontaneous Speech for
Unit Selection},
booktitle = {Speech Prosody 2010},
abstract = {Unit selection speech synthesis has reached high
levels of naturalness and intelligibility for neutral
read aloud speech. However, synthetic speech generated
using neutral read aloud data lacks all the attitude,
intention and spontaneity associated with everyday
conversations. Unit selection is heavily data-dependent
and thus in order to simulate human conversational
speech, or create synthetic voices for believable
virtual characters, we need to utilise speech data with
examples of how people talk rather than how people
read. In this paper we included carefully selected
utterances from spontaneous conversational speech in a
unit selection voice. Using this voice and by
automatically predicting type and placement of lexical
fillers and filled pauses we can synthesise utterances
with conversational characteristics. A perceptual
listening test showed that it is possible to make
synthetic speech sound more conversational without
degrading naturalness.},
categories = {speech synthesis, unit selection, conversation,
spontaneous speech, lexical fillers, filled pauses},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/100116.pdf},
year = 2010
}
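A toy Python sketch of the filler-insertion idea from the abstract above; the filler table, probabilities and placement rule here are hypothetical stand-ins, not the paper's learned model.

import random

# Hypothetical filler inventory with relative weights; the paper predicts
# type and placement from spontaneous conversational data.
FILLERS = {"um": 0.4, "uh": 0.3, "you know": 0.3}

def insert_fillers(words, p_insert=0.15, seed=None):
    # Walk the token list and occasionally prepend a filler at
    # utterance-initial or post-comma slots (a simplified position rule).
    rng = random.Random(seed)
    out = []
    for i, word in enumerate(words):
        if (i == 0 or words[i - 1] == ",") and rng.random() < p_insert:
            choices, weights = zip(*FILLERS.items())
            out.append(rng.choices(choices, weights=weights)[0])
        out.append(word)
    return out

print(" ".join(insert_fillers("well , I think that is fine".split(), seed=3)))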
@inproceedings{cereproc-hts,
author = {Matthew P. Aylett and Junichi Yamagishi},
title = {Combining Statistical Parametric Speech Synthesis and
Unit-Selection for Automatic Voice Cloning},
booktitle = {Proc. LangTech 2008},
address = {Brisbane, Australia},
abstract = {The ability to use the recorded audio of a subject's
voice to produce an open-domain synthesis system has
generated much interest both in academic research and
in commercial speech technology. The ability to produce
synthetic versions of a subject's voice has potential
commercial applications, such as virtual celebrity
actors, or potential clinical applications, such as
offering a synthetic replacement voice in the case of a
laryngectomy. Recent developments in HMM-based speech
synthesis have shown it is possible to produce
synthetic voices from quite small amounts of speech
data. However, mimicking the depth and variation of a
speaker's prosody as well as synthesising natural
voice quality is still a challenging research problem.
In contrast, unit-selection systems have shown it is
possible to strongly retain the character of the voice
but only with sufficient original source material.
Often this runs into hours and may require significant
manual checking and labelling. In this paper we will
present two state-of-the-art systems: an HMM-based
system, HTS-2007, developed by CSTR and Nagoya Institute
of Technology, and a commercial unit-selection system,
CereVoice, developed by CereProc. Both systems have
been used to mimic the voice of George W. Bush (43rd
president of the United States) using freely available
audio from the web. In addition we will present a
hybrid system which combines both technologies. We
demonstrate examples of synthetic voices created from
10, 40 and 210 minutes of randomly selected speech. We
will then discuss the underlying problems associated
with voice cloning using found audio, and the
scalability of our solution.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
speaker adaptation, voice conversion, average voice},
key = {cereproc-hts},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/03_AYLETT.pdf},
year = 2008
}
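The 10-, 40- and 210-minute voices come from randomly selected found audio; a standard-library Python sketch of assembling such subsets (the WAV directory layout is an assumption, not the authors' pipeline).

import random
import wave
from pathlib import Path

def utterance_minutes(path):
    # Duration of one PCM WAV file, in minutes.
    with wave.open(str(path), "rb") as w:
        return w.getnframes() / w.getframerate() / 60.0

def random_subset(wav_dir, target_minutes, seed=0):
    # Shuffle the utterances, then accumulate until roughly the requested
    # amount of speech (e.g. 10, 40 or 210 minutes) is reached.
    files = sorted(Path(wav_dir).glob("*.wav"))
    random.Random(seed).shuffle(files)
    subset, total = [], 0.0
    for f in files:
        if total >= target_minutes:
            break
        subset.append(f)
        total += utterance_minutes(f)
    return subset, total

# e.g. subset, minutes = random_subset("found_audio/", 40)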
@inproceedings{Ayletetal09,
author = {Matthew P. Aylett and Simon King and Junichi Yamagishi},
title = {Speech Synthesis Without a Phone Inventory},
booktitle = {Interspeech},
pages = {2087--2090},
abstract = {In speech synthesis the unit inventory is decided
using phonological and phonetic expertise. This process
is resource intensive and potentially sub-optimal. In
this paper we investigate how acoustic clustering,
together with lexicon constraints, can be used to build
a self-organised inventory. Six English speech
synthesis systems were built using two frameworks, unit
selection and parametric HTS, for three inventory
conditions: 1) a traditional phone set, 2) a system
using orthographic units, and 3) a self-organised
inventory. A listening test showed a strong preference
for the classic system, and for the orthographic system
over the self-organised system. Results also varied by
letter-to-sound complexity and database coverage. This
suggests the self-organised approach failed to
generalise pronunciation, as well as introducing noise
above and beyond that caused by orthography-to-sound
mismatch.},
categories = {speech synthesis, unit selection, parametric
synthesis, phone inventory, orthographic synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/ma_interspeech09.pdf},
place = {Brighton},
year = 2009
}
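A rough sketch of the acoustic-clustering half of the self-organised inventory: plain k-means over feature frames with toy data. The paper's method additionally couples lexicon constraints, omitted here.

import numpy as np

def kmeans(frames, k=40, iters=20, seed=0):
    # Cluster acoustic feature frames (e.g. MFCC vectors) into k units.
    rng = np.random.default_rng(seed)
    centres = frames[rng.choice(len(frames), size=k, replace=False)]
    for _ in range(iters):
        # Assign each frame to its nearest centre, then recompute centres.
        dist = ((frames[:, None, :] - centres[None, :, :]) ** 2).sum(axis=2)
        labels = dist.argmin(axis=1)
        for j in range(k):
            if (labels == j).any():
                centres[j] = frames[labels == j].mean(axis=0)
    return labels, centres

frames = np.random.default_rng(1).normal(size=(500, 13))  # toy "MFCCs"
labels, centres = kmeans(frames, k=10)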
@inproceedings{Aylett+King08,
author = {Matthew P. Aylett and Simon King},
title = {Single Speaker Segmentation and Inventory Selection
Using Dynamic Time Warping Self Organization and Joint
Multigram Mapping},
booktitle = {SSW06},
pages = {258--263},
abstract = {In speech synthesis the inventory of units is decided
by inspection and on the basis of phonological and
phonetic expertise. The ephone (or emergent phone)
project at CSTR is investigating how self organisation
techniques can be applied to build an inventory based
on collected acoustic data together with the
constraints of a synthesis lexicon. In this paper we
will describe a prototype inventory creation method
using dynamic time warping (DTW) for acoustic
clustering and a joint multigram approach for relating
a series of symbols that represent the speech to these
emerged units. We initially examined two symbol sets:
1) a baseline of standard phones; 2) orthographic
symbols. The success of the approach is evaluated by
comparing word boundaries generated by the emergent
phones against those created using state-of-the-art HMM
segmentation. Initial results suggest the DTW
segmentation can match word boundaries with a root mean
square error (RMSE) of 35ms. Mapping units onto phones
resulted in a higher RMSE of 103ms. This
error was increased when multiple multigram types were
added and when the default unit clustering was altered
from 40 (our baseline) to 10. Results for orthographic
matching had a higher RMSE of 125ms. To conclude we
discuss future work that we believe can reduce this
error rate to a level sufficient for the techniques to
be applied to a unit selection synthesis system.},
categories = {speech synthesis, unit selection, parametric
synthesis, phone inventory, orthographic synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ssw06.pdf},
place = {Bonn},
year = 2008
}
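The two quantitative pieces above are a DTW distance for acoustic clustering and an RMSE over matched word boundaries for evaluation; a minimal Python sketch of both (boundary pairing is assumed already done, a simplification).

import numpy as np

def dtw(a, b):
    # Textbook dynamic time warping cost between two feature sequences
    # (rows are frames), the distance used for the acoustic clustering.
    D = np.full((len(a) + 1, len(b) + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = np.linalg.norm(a[i - 1] - b[j - 1])
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[len(a), len(b)]

def boundary_rmse(pred_s, ref_s):
    # RMSE in seconds between paired boundary times, the kind of score
    # behind the 35ms / 103ms / 125ms figures in the abstract.
    pred, ref = np.asarray(pred_s), np.asarray(ref_s)
    return float(np.sqrt(np.mean((pred - ref) ** 2)))

print(boundary_rmse([0.10, 0.52, 0.98], [0.12, 0.50, 1.01]))  # ~0.024 s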
@article{Stan2011442,
author = {Adriana Stan and Junichi Yamagishi and Simon King and
Matthew Aylett},
title = {The {R}omanian speech synthesis ({RSS}) corpus:
Building a high quality {HMM}-based speech synthesis
system using a high sampling rate},
journal = {Speech Communication},
volume = {53},
number = {3},
pages = {442--450},
abstract = {This paper first introduces a newly recorded
high-quality Romanian speech corpus designed for speech
synthesis, called ``RSS'', along with Romanian
front-end text processing modules and HMM-based
synthetic voices built from the corpus. All of these
are now freely available for academic use in order to
promote Romanian speech technology research. The RSS
corpus comprises 3500 training sentences and 500 test
sentences uttered by a female speaker and was recorded
using multiple microphones at 96 kHz sampling
frequency in a hemianechoic chamber. The details of the
new Romanian text processor we have developed are also
given. Using the database, we then revisit some basic
configuration choices of speech synthesis, such as
waveform sampling frequency and auditory frequency
warping scale, with the aim of improving speaker
similarity, which is an acknowledged weakness of
current HMM-based speech synthesisers. As we
demonstrate using perceptual tests, these configuration
choices can make substantial differences to the quality
of the synthetic speech. Contrary to common practice in
automatic speech recognition, higher waveform sampling
frequencies can offer enhanced feature extraction and
improved speaker similarity for HMM-based speech
synthesis.},
doi = {10.1016/j.specom.2010.12.002},
issn = {0167-6393},
keywords = {Speech synthesis, HTS, Romanian, HMMs, Sampling
frequency, Auditory scale},
url = {http://www.sciencedirect.com/science/article/pii/S0167639310002074},
year = 2011
}
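One intuition behind the sampling-rate result: a higher rate extends the analysable band, and an auditory warping scale then allocates that band non-uniformly. A small Python sketch using the common O'Shaughnessy mel formula (the paper compares several warping scales; this picks one for illustration).

import numpy as np

def hz_to_mel(f_hz):
    # O'Shaughnessy mel scale, one common auditory frequency warping.
    return 2595.0 * np.log10(1.0 + f_hz / 700.0)

for fs in (16_000, 48_000, 96_000):
    nyquist = fs / 2
    print(f"fs = {fs // 1000} kHz: band to {nyquist / 1000:.0f} kHz "
          f"= {hz_to_mel(nyquist):.0f} mel")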
@inproceedings{mayoaylettladd:97,
author = {Mayo, C. and Aylett, M. and Ladd, D. R.},
title = {Prosodic transcription of Glasgow English: an
evaluation study of {GlaToBI}},
booktitle = {Intonation: Theory, Models and Applications},
categories = {intonation, perceptual evaluation, Glasgow English,
transcription, ToBI},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/esca2.pdf},
year = 1997
}
@inproceedings{leo_07-2,
author = {Matthew P. Aylett and J. Sebastian Andersson and
Leonardo Badino and Christopher J. Pidcock},
title = {The {C}ere{V}oice {B}lizzard Entry 2007: Are Small
Database Errors Worse than Compression Artifacts?},
booktitle = {Proc. Blizzard Challenge Workshop 2007},
address = {Bonn, Germany},
abstract = {In commercial systems the memory footprint of unit
selection systems is often a key issue. This is
especially true for PDAs and other embedded devices. In
this year's Blizzard entry CereProc set itself the
criterion that the full database system entered would
have a smaller memory footprint than either of the two
smaller database entries. This was accomplished by
applying Speex speech compression to the full database
entry. In turn, the small database techniques used
to improve the quality of small database systems in
last year's entry were extended. Finally, for all
systems, two quality control methods were applied to
the underlying database to improve the lexicon and
transcription match to the underlying data. Results
suggest that mild audio quality artifacts introduced by
lossy compression have almost as much impact on MOS
perceived quality as concatenation errors introduced by
sparse data in the smaller systems with bulked
diphones.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_004.pdf},
year = 2007
}
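The footprint trade-off in the abstract is easy to put rough numbers on; a back-of-the-envelope Python sketch (the sampling rate and Speex bitrate here are illustrative guesses, not the entry's actual configuration).

def pcm_mb(hours, rate_hz=16_000, bits=16):
    # Uncompressed mono PCM footprint of a unit database, in MB.
    return hours * 3600 * rate_hz * bits / 8 / 1e6

def speex_mb(hours, bitrate_bps=11_000):
    # The same audio stored at a typical narrowband Speex bitrate.
    return hours * 3600 * bitrate_bps / 8 / 1e6

for h in (1, 5):
    print(f"{h} h: {pcm_mb(h):.0f} MB PCM vs {speex_mb(h):.1f} MB Speex")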