Publications by Oliver Watts
s0676515.bib
@phdthesis{watts-2012,
author = {Oliver Watts},
title = {Unsupervised Learning for Text-to-Speech Synthesis},
school = {University of Edinburgh},
abstract = {This thesis introduces a general method for incorporating
the distributional analysis of textual and linguistic
objects into text-to-speech (TTS) conversion systems.
Conventional TTS conversion uses intermediate layers of
representation to bridge the gap between text and speech.
Collecting the annotated data needed to produce these
intermediate layers is a far from trivial task, possibly
prohibitively so for languages in which no such resources
are in existence. Distributional analysis, in contrast,
proceeds in an unsupervised manner, and so enables the
creation of systems using textual data that are not
annotated. The method therefore aids the building of
systems for languages in which conventional linguistic
resources are scarce, but is not restricted to these
languages. The distributional analysis proposed here
places the textual objects analysed in a continuous-valued
space, rather than specifying a hard categorisation of
those objects. This space is then partitioned during the
training of acoustic models for synthesis, so that the
models generalise over objects' surface forms in a way
that is acoustically relevant. The method is applied to
three levels of textual analysis: to the characterisation
of sub-syllabic units, word units and utterances. Entire
systems for three languages (English, Finnish and
Romanian) are built with no reliance on manually labelled
data or language-specific expertise. Results of a
subjective evaluation are presented.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/oliver_watts_thesis.pdf},
year = 2012
}
@inproceedings{watts_yamagishi_king_2011,
author = {Oliver Watts and Junichi Yamagishi and Simon King},
title = {Unsupervised continuous-valued word features for
phrase-break prediction without a part-of-speech tagger},
booktitle = {Proc. Interspeech},
pages = {2157--2160},
address = {Florence, Italy},
abstract = {Part of speech (POS) tags are foremost among the
features conventionally used to predict intonational
phrase-breaks for text-to-speech (TTS) conversion. The
construction of such systems therefore presupposes the
availability of a POS tagger for the relevant language,
or of a corpus manually tagged with POS. However, such
tools and resources are not available in the majority
of the world’s languages, and manually labelling text
with POS tags is an expensive and time-consuming
process. We therefore propose the use of
continuous-valued features that summarise the
distributional characteristics of word types as
surrogates for POS features. Importantly, such features
are obtained in an unsupervised manner from an untagged
text corpus. We present results on the phrase-break
prediction task, where use of the features closes the
gap in performance between a baseline system (using
only basic punctuation-related features) and a topline
system (incorporating a state-of-the-art POS tagger).},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
year = 2011
}
@inproceedings{hts-child-oliver,
author = {Oliver Watts and Junichi Yamagishi and Kay Berkling
and Simon King},
title = {{HMM}-based synthesis of child speech},
booktitle = {Proc. 1st Workshop on Child, Computer and
Interaction (ICMI'08 post-conference workshop)},
address = {Crete, Greece},
abstract = {The synthesis of child speech presents challenges both
in the collection of data and in the building of a
synthesiser from that data. Because only limited data
can be collected, and the domain of that data is
constrained, it is difficult to obtain the type of
phonetically-balanced corpus usually used in speech
synthesis. As a consequence, building a synthesiser
from this data is difficult. Concatenative synthesisers
are not robust to corpora with many missing units (as
is likely when the corpus content is not carefully
designed), so we chose to build a statistical
parametric synthesiser using the HMM-based system HTS.
This technique has previously been shown to perform
well for limited amounts of data, and for data
collected under imperfect conditions. We compared six
different configurations of the synthesiser, using both
speaker-dependent and speaker-adaptive modelling
techniques, and using varying amounts of data. The
output from these systems was evaluated alongside
natural and vocoded speech, in a Blizzard-style
listening test.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
speaker adaptation, voice conversion, average voice,
child speech},
key = {hts-child-oliver},
month = oct,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/child-hts-oliver.pdf},
year = 2008
}
@inproceedings{higher_level,
author = {Oliver Watts and Junichi Yamagishi and Simon King},
title = {The role of higher-level linguistic features in
{HMM}-based speech synthesis},
booktitle = {Proc. Interspeech},
pages = {841--844},
address = {Makuhari, Japan},
abstract = {We analyse the contribution of higher-level elements
of the linguistic specification of a data-driven speech
synthesiser to the naturalness of the synthetic speech
which it generates. The system is trained using various
subsets of the full feature-set, in which features
relating to syntactic category, intonational phrase
boundary, pitch accent and boundary tones are
selectively removed. Utterances synthesised by the
different configurations of the system are then
compared in a subjective evaluation of their
naturalness. The work presented forms background
analysis for an ongoing set of experiments in
performing text-to-speech (TTS) conversion based on
shallow features: features that can be trivially
extracted from text. By building a range of systems,
each assuming the availability of a different level of
linguistic annotation, we obtain benchmarks for our
ongoing work.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100565.pdf},
year = 2010
}
@inproceedings{child_synthesis_2009,
author = {Oliver Watts and Junichi Yamagishi and Simon King and
Kay Berkling},
title = {{HMM} Adaptation and Voice Conversion for the
Synthesis of Child Speech: A Comparison},
booktitle = {Proc. Interspeech},
pages = {2627--2630},
address = {Brighton, U.K.},
abstract = {This study compares two different methodologies for
producing data-driven synthesis of child speech from
existing systems that have been trained on the speech
of adults. On one hand, an existing statistical
parametric synthesiser is transformed using model
adaptation techniques, informed by linguistic and
prosodic knowledge, to the speaker characteristics of a
child speaker. This is compared with the application of
voice conversion techniques to convert the output of an
existing waveform concatenation synthesiser with no
explicit linguistic or prosodic knowledge. In a
subjective evaluation of the similarity of synthetic
speech to natural speech from the target speaker, the
HMM-based systems evaluated are generally preferred,
although this is at least in part due to the higher
dimensional acoustic features supported by these
techniques.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/child_synthesis_2009.pdf},
year = 2009
}
@inproceedings{watts_zhou_2011,
author = {Oliver Watts and Bowen Zhou},
title = {Unsupervised features from text for speech synthesis
in a speech-to-speech translation system},
booktitle = {Proc. Interspeech},
pages = {2153--2156},
address = {Florence, Italy},
abstract = {We explore the use, for text-to-speech (TTS)
conversion in the context of a speech-to-speech
translation system, of linguistic features that can be
extracted from unannotated text in an unsupervised,
language-independent fashion. The features are intended
to act as surrogates for conventional part of speech
(POS) features. Unlike POS features, the experimental
features assume only the availability of tools and data
that must already be in place for the construction of
other components of the translation system, and can
therefore be used for the TTS module without incurring
additional TTS-specific costs. We here describe the use
of the experimental features in a speech synthesiser,
using six different configurations of the system to
allow the comparison of the proposed features with
conventional, knowledge-based POS features. We present
results of objective and subjective evaluations of the
usefulness of the new features.},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_zhou_2011.pdf},
year = 2011
}
@article{child_speech_journal_2010,
author = {Oliver Watts and Junichi Yamagishi and Simon King and
Kay Berkling},
title = {Synthesis of Child Speech with {HMM} Adaptation and
Voice Conversion},
journal = {IEEE Transactions on Audio, Speech, and Language
Processing},
volume = {18},
number = {5},
pages = {1005--1016},
abstract = {The synthesis of child speech presents challenges both
in the collection of data and in the building of a
synthesizer from that data. We chose to build a
statistical parametric synthesizer using the hidden
Markov model (HMM)-based system HTS, as this technique
has previously been shown to perform well for limited
amounts of data, and for data collected under imperfect
conditions. Six different configurations of the
synthesizer were compared, using both speaker-dependent
and speaker-adaptive modeling techniques, and using
varying amounts of data. For comparison with HMM
adaptation, techniques from voice conversion were used
to transform existing synthesizers to the
characteristics of the target speaker. Speaker-adaptive
voices generally outperformed child speaker-dependent
voices in the evaluation. HMM adaptation outperformed
voice conversion style techniques when using the full
target speaker corpus; with fewer adaptation data,
however, no significant listener preference for either
HMM adaptation or voice conversion methods was found.},
doi = {10.1109/TASL.2009.2035029},
issn = {1558-7916},
keywords = {HMM adaptation techniques; child speech synthesis;
hidden Markov model; speaker adaptive modeling
technique; speaker dependent technique; speaker-adaptive
voice; statistical parametric synthesizer; target
speaker corpus; voice conversion; hidden Markov models;
speech synthesis},
month = jul,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/YamagishiJ_Synthesis\%20of\%20Child\%20Speech.pdf},
year = 2010
}
@inproceedings{junichi:interspeech2010,
author = {Junichi Yamagishi and Oliver Watts and Simon King and
Bela Usabaev},
title = {Roles of the Average Voice in Speaker-adaptive
{HMM}-based Speech Synthesis},
booktitle = {Proc. Interspeech},
pages = {418--421},
address = {Makuhari, Japan},
abstract = {In speaker-adaptive HMM-based speech synthesis, there
are typically a few speakers for which the output
synthetic speech sounds worse than that of other
speakers, despite having the same amount of adaptation
data from within the same corpus. This paper
investigates these fluctuations in quality and
concludes that as mel-cepstral distance from the
average voice becomes larger, the MOS naturalness
scores generally become worse. Although this negative
correlation is not that strong, it suggests a way to
improve the training and adaptation strategies. We also
draw comparisons between our findings and the work of
other researchers regarding ``vocal attractiveness.''},
keywords = {speech synthesis, HMM, average voice, speaker
adaptation},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100361.pdf},
year = 2010
}
@article{junichi:ieee2010,
author = {Junichi Yamagishi and Bela Usabaev and Simon King and
Oliver Watts and John Dines and Jilei Tian and Rile Hu
and Yong Guan and Keiichiro Oura and Keiichi Tokuda and
Reima Karhila and Mikko Kurimo},
title = {Thousands of Voices for {HMM}-based Speech Synthesis
-- Analysis and Application of {TTS} Systems Built on
Various {ASR} Corpora},
journal = {IEEE Transactions on Audio, Speech, and Language
Processing},
volume = {18},
number = {5},
pages = {984--1004},
abstract = {In conventional speech synthesis, large amounts of
phonetically balanced speech data recorded in highly
controlled recording studio environments are typically
required to build a voice. Although using such data is
a straightforward solution for high quality synthesis,
the number of voices available will always be limited,
because recording costs are high. On the other hand,
our recent experiments with HMM-based speech synthesis
systems have demonstrated that speaker-adaptive
HMM-based speech synthesis (which uses an ``average
voice model'' plus model adaptation) is robust to
non-ideal speech data that are recorded under various
conditions and with varying microphones, that are not
perfectly clean, and/or that lack phonetic balance.
This enables us to consider building high-quality
voices on ``non-TTS'' corpora such as ASR corpora.
Since ASR corpora generally include a large number of
speakers, this leads to the possibility of producing an
enormous number of voices automatically. In this paper,
we demonstrate the thousands of voices for HMM-based
speech synthesis that we have made from several popular
ASR corpora such as the Wall Street Journal (WSJ0,
WSJ1, and WSJCAM0), Resource Management, Globalphone,
and SPEECON databases. We also present the results of
associated analysis based on perceptual evaluation, and
discuss remaining issues.},
doi = {10.1109/TASL.2010.2045237},
keywords = {Automatic speech recognition (ASR), H Triple S (HTS),
SPEECON database, WSJ database, average voice, hidden
Markov model (HMM)-based speech synthesis, speaker
adaptation, speech synthesis, voice conversion},
month = jul,
year = 2010
}
@inproceedings{jyamagis:1000sHTS,
author = {Junichi Yamagishi and Bela Usabaev and Simon King and
Oliver Watts and John Dines and Jilei Tian and Rile Hu
and Yong Guan and Keiichiro Oura and Keiichi Tokuda and
Reima Karhila and Mikko Kurimo},
title = {Thousands of voices for {HMM}-based speech synthesis},
booktitle = {Proc. Interspeech},
pages = {420--423},
address = {Brighton, U.K.},
abstract = {Our recent experiments with HMM-based speech synthesis
systems have demonstrated that speaker-adaptive
HMM-based speech synthesis (which uses an ``average
voice model'' plus model adaptation) is robust to
non-ideal speech data that are recorded under various
conditions and with varying microphones, that are not
perfectly clean, and/or that lack phonetic balance.
This enables us to consider building high-quality voices
on ``non-TTS'' corpora such as ASR corpora. Since ASR
corpora generally include a large number of speakers,
this leads to the possibility of producing an enormous
number of voices automatically. In this paper we show
thousands of voices for HMM-based speech synthesis that
we have made from several popular ASR corpora such as
the Wall Street Journal databases (WSJ0/WSJ1/WSJCAM0),
Resource Management, Globalphone and SPEECON. We report
some perceptual evaluation results and outline the
outstanding issues.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/index.php},
year = 2009
}
@inproceedings{letter_based_TTS,
author = {Oliver Watts and Junichi Yamagishi and Simon King},
title = {Letter-based speech synthesis},
booktitle = {Proc. Speech Synthesis Workshop 2010},
pages = {317--322},
address = {Nara, Japan},
abstract = {Initial attempts at performing text-to-speech
conversion based on standard orthographic units are
presented, forming part of a larger scheme of training
TTS systems on features that can be trivially extracted
from text. We evaluate the possibility of using the
technique of decision-tree-based context clustering
conventionally used in HMM-based systems for
parameter tying to handle letter-to-sound conversion. We
present the application of a method of compound-feature
discovery to corpus-based speech synthesis. Finally, an
evaluation of intelligibility of letter-based systems
and more conventional phoneme-based systems is
presented.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7.pdf},
year = 2010
}