The Centre for Speech Technology Research, The University of Edinburgh

Publications by Oliver Watts

s0676515.bib

@phdthesis{watts-2012,
  author = {Oliver Watts},
  title = {Unsupervised Learning for Text-to-Speech Synthesis},
  school = {University of Edinburgh},
  abstract = {This thesis introduces a general method for incorporating 
                    the distributional analysis of textual and linguistic 
                    objects into text-to-speech (TTS) conversion systems. 
                    Conventional TTS conversion uses intermediate layers of 
                    representation to bridge the gap between text and speech. 
                    Collecting the annotated data needed to produce these 
                    intermediate layers is a far from trivial task, possibly 
                    prohibitively so for languages in which no such resources 
                    exist. Distributional analysis, in contrast, 
                    proceeds in an unsupervised manner, and so enables the 
                    creation of systems using textual data that are not 
                    annotated. The method therefore aids the building of 
                    systems for languages in which conventional linguistic 
                    resources are scarce, but is not restricted to these 
                    languages. The distributional analysis proposed here 
                    places the textual objects analysed in a continuous-valued
                    space, rather than specifying a hard categorisation of 
                    those objects. This space is then partitioned during the 
                    training of acoustic models for synthesis, so that the 
                    models generalise over objects' surface forms in a way 
                    that is acoustically relevant. The method is applied to 
                    three levels of textual analysis: to the characterisation
                    of sub-syllabic units, word units and utterances. Entire 
                    systems for three languages (English, Finnish and 
                    Romanian) are built with no reliance on manually labelled 
                    data or language-specific expertise. Results of a 
                    subjective evaluation are presented.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/oliver_watts_thesis.pdf},
  year = 2012
}
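
The distributional analysis the thesis describes places textual objects in a
continuous-valued space rather than assigning them hard categories. A minimal
sketch of that idea in Python follows, assuming a toy corpus, a one-word
co-occurrence window and an illustrative dimensionality, none of which reflect
the thesis's actual configuration:

import numpy as np

# Toy corpus; the thesis works from large unannotated text collections.
corpus = "the cat sat on the mat the dog sat on the rug".split()
vocab = sorted(set(corpus))
index = {w: i for i, w in enumerate(vocab)}

# Count immediate left/right neighbours for each word type.
counts = np.zeros((len(vocab), len(vocab)))
for i, w in enumerate(corpus):
    for j in (i - 1, i + 1):
        if 0 <= j < len(corpus):
            counts[index[w], index[corpus[j]]] += 1

# A truncated SVD of the co-occurrence matrix gives each word type a
# low-dimensional, continuous-valued feature vector; such a space can
# then be partitioned during acoustic model training.
U, s, Vt = np.linalg.svd(counts, full_matrices=False)
dim = 2  # illustrative dimensionality
features = U[:, :dim] * s[:dim]
for w in vocab:
    print(w, features[index[w]])

In this toy corpus "cat" and "dog" have identical neighbour counts, so they
receive identical feature vectors: distributional similarity becomes
geometric proximity.
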
@inproceedings{watts_yamagishi_king_2011,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {Unsupervised continuous-valued word features for
                   phrase-break prediction without a part-of-speech tagger},
  booktitle = {Proc. Interspeech},
  pages = {2157--2160},
  address = {Florence, Italy},
  abstract = {Part of speech (POS) tags are foremost among the
                   features conventionally used to predict intonational
                   phrase-breaks for text to speech (TTS) conversion. The
                   construction of such systems therefore presupposes the
                   availability of a POS tagger for the relevant language,
                   or of a corpus manually tagged with POS. However, such
                   tools and resources are not available in the majority
                   of the world’s languages, and manually labelling text
                   with POS tags is an expensive and time-consuming
                   process. We therefore propose the use of
                   continuous-valued features that summarise the
                   distributional characteristics of word types as
                   surrogates for POS features. Importantly, such features
                   are obtained in an unsupervised manner from an untagged
                   text corpus. We present results on the phrase-break
                   prediction task, where use of the features closes the
                   gap in performance between a baseline system (using
                   only basic punctuation-related features) and a topline
                   system (incorporating a state-of-the-art POS tagger).},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
  year = 2011
}
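
To make the task set-up concrete, here is a toy sketch of phrase-break
prediction with such features, using random stand-ins for the
distributionally derived word vectors and synthetic labels; it illustrates
the plumbing only, not the paper's actual systems or data:

import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(0)
n_junctures, dim = 200, 4
word_vecs = rng.normal(size=(n_junctures, dim))    # stand-ins for unsupervised word features
punct = rng.integers(0, 2, size=(n_junctures, 1))  # 1 = punctuation at this juncture
X = np.hstack([punct, word_vecs])

# Synthetic labels: breaks mostly co-occur with punctuation, plus noise.
y = (punct[:, 0] | (rng.random(n_junctures) < 0.1)).astype(int)

# A punctuation-only baseline would drop word_vecs; the continuous word
# features supply POS-like information without requiring a tagger.
clf = DecisionTreeClassifier(max_depth=4).fit(X[:150], y[:150])
print("held-out accuracy:", clf.score(X[150:], y[150:]))
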
@inproceedings{hts-child-oliver,
  author = {Oliver Watts and Junichi Yamagishi and Kay Berkling
                   and Simon King},
  title = {{HMM}-based synthesis of child speech},
  booktitle = {Proc. 1st Workshop on Child, Computer and
                   Interaction (ICMI'08 post-conference workshop)},
  address = {Crete, Greece},
  abstract = {The synthesis of child speech presents challenges both
                   in the collection of data and in the building of a
                   synthesiser from that data. Because only limited data
                   can be collected, and the domain of that data is
                   constrained, it is difficult to obtain the type of
                   phonetically-balanced corpus usually used in speech
                   synthesis. As a consequence, building a synthesiser
                   from this data is difficult. Concatenative synthesisers
                   are not robust to corpora with many missing units (as
                   is likely when the corpus content is not carefully
                   designed), so we chose to build a statistical
                   parametric synthesiser using the HMM-based system HTS.
                   This technique has previously been shown to perform
                   well for limited amounts of data, and for data
                   collected under imperfect conditions. We compared 6
                   different configurations of the synthesiser, using both
                   speaker-dependent and speaker-adaptive modelling
                   techniques, and using varying amounts of data. The
                   output from these systems was evaluated alongside
                   natural and vocoded speech, in a Blizzard-style
                   listening test.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice,
                   child speech},
  key = {hts-child-oliver},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/child-hts-oliver.pdf},
  year = 2008
}
@inproceedings{higher_level,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {The role of higher-level linguistic features in
                   {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {841--844},
  address = {Makuhari, Japan},
  abstract = {We analyse the contribution of higher-level elements
                   of the linguistic specification of a data-driven speech
                   synthesiser to the naturalness of the synthetic speech
                   which it generates. The system is trained using various
                   subsets of the full feature-set, in which features
                   relating to syntactic category, intonational phrase
                   boundary, pitch accent and boundary tones are
                   selectively removed. Utterances synthesised by the
                   different configurations of the system are then
                   compared in a subjective evaluation of their
                   naturalness. The work presented forms background
                   analysis for an ongoing set of experiments in
                   performing text-to-speech (TTS) conversion based on
                   shallow features: features that can be trivially
                   extracted from text. By building a range of systems,
                   each assuming the availability of a different level of
                   linguistic annotation, we obtain benchmarks for our
                   on-going work.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100565.pdf},
  year = 2010
}
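
The ablation design can be sketched schematically: train systems on nested
subsets of the feature set and compare held-out performance. The toy
regression task, feature names and linear model below are placeholders for
illustration, not the paper's HMM systems or subjective evaluation:

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
n = 300
features = {  # placeholder stand-ins for levels of linguistic annotation
    "punctuation": rng.normal(size=n),
    "syntactic_category": rng.normal(size=n),
    "phrase_boundary": rng.normal(size=n),
    "accent_and_boundary_tone": rng.normal(size=n),
}
# Toy target that genuinely depends on every feature, plus noise.
y = sum(features.values()) + 0.5 * rng.normal(size=n)

names = list(features)
for k in range(1, len(names) + 1):
    subset = names[:k]  # cumulatively enrich the feature set
    X = np.column_stack([features[f] for f in subset])
    r2 = LinearRegression().fit(X[:200], y[:200]).score(X[200:], y[200:])
    print(f"{subset}: held-out R^2 = {r2:.2f}")
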
@inproceedings{child_synthesis_2009,
  author = {Oliver Watts and Junichi Yamagishi and Simon King and
                   Kay Berkling},
  title = {{HMM} Adaptation and Voice Conversion for the
                   Synthesis of Child Speech: A Comparison},
  booktitle = {Proc. Interspeech},
  pages = {2627--2630},
  address = {Brighton, U.K.},
  abstract = {This study compares two different methodologies for
                   producing data-driven synthesis of child speech from
                   existing systems that have been trained on the speech
                   of adults. On one hand, an existing statistical
                   parametric synthesiser is transformed using model
                   adaptation techniques, informed by linguistic and
                   prosodic knowledge, to the speaker characteristics of a
                   child speaker. This is compared with the application of
                   voice conversion techniques to convert the output of an
                   existing waveform concatenation synthesiser with no
                   explicit linguistic or prosodic knowledge. In a
                   subjective evaluation of the similarity of synthetic
                   speech to natural speech from the target speaker, the
                   HMM-based systems evaluated are generally preferred,
                   although this is at least in part due to the higher
                   dimensional acoustic features supported by these
                   techniques.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/child_synthesis_2009.pdf},
  year = 2009
}
@inproceedings{watts_zhou_2011,
  author = {Oliver Watts and Bowen Zhou},
  title = {Unsupervised features from text for speech synthesis
                   in a speech-to-speech translation system},
  booktitle = {Proc. Interspeech},
  pages = {2153--2156},
  address = {Florence, Italy},
  abstract = {In the context of a speech-to-speech translation
                    system, we explore the use of linguistic features for
                    text to speech (TTS) conversion that can be extracted
                    from unannotated text in an unsupervised,
                    language-independent fashion. The features are intended
                   to act as surrogates for conventional part of speech
                   (POS) features. Unlike POS features, the experimental
                   features assume only the availability of tools and data
                   that must already be in place for the construction of
                   other components of the translation system, and can
                   therefore be used for the TTS module without incurring
                   additional TTS-specific costs. We here describe the use
                   of the experimental features in a speech synthesiser,
                   using six different configurations of the system to
                   allow the comparison of the proposed features with
                   conventional, knowledge-based POS features. We present
                   results of objective and subjective evaluations of the
                   usefulness of the new features.},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_zhou_2011.pdf},
  year = 2011
}
@article{child_speech_journal_2010,
  author = {Oliver Watts and Junichi Yamagishi and Simon King
                   and Kay Berkling},
  title = {Synthesis of Child Speech with {HMM} Adaptation and
                   Voice Conversion},
  journal = {IEEE Transactions on Audio, Speech, and Language
                    Processing},
  volume = {18},
  number = {5},
  pages = {1005--1016},
  abstract = {The synthesis of child speech presents challenges both
                   in the collection of data and in the building of a
                   synthesizer from that data. We chose to build a
                   statistical parametric synthesizer using the hidden
                   Markov model (HMM)-based system HTS, as this technique
                   has previously been shown to perform well for limited
                   amounts of data, and for data collected under imperfect
                   conditions. Six different configurations of the
                   synthesizer were compared, using both speaker-dependent
                   and speaker-adaptive modeling techniques, and using
                   varying amounts of data. For comparison with HMM
                   adaptation, techniques from voice conversion were used
                   to transform existing synthesizers to the
                   characteristics of the target speaker. Speaker-adaptive
                   voices generally outperformed child speaker-dependent
                   voices in the evaluation. HMM adaptation outperformed
                   voice conversion style techniques when using the full
                   target speaker corpus; with fewer adaptation data,
                   however, no significant listener preference for either
                   HMM adaptation or voice conversion methods was found.},
  doi = {10.1109/TASL.2009.2035029},
  issn = {1558-7916},
  keywords = {HMM adaptation techniques;child speech
                   synthesis;hidden Markov model;speaker adaptive modeling
                   technique;speaker dependent technique;speaker-adaptive
                   voice;statistical parametric synthesizer;target speaker
                   corpus;voice conversion;hidden Markov models;speech
                   synthesis;},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/YamagishiJ_Synthesis\%20of\%20Child\%20Speech.pdf},
  year = 2010
}
@inproceedings{junichi:interspeech2010,
  author = {Junichi Yamagishi and Oliver Watts and Simon King and
                   Bela Usabaev},
  title = {Roles of the Average Voice in Speaker-adaptive
                   {HMM}-based Speech Synthesis},
  booktitle = {Proc. Interspeech},
  pages = {418--421},
  address = {Makuhari, Japan},
  abstract = {In speaker-adaptive HMM-based speech synthesis, there
                   are typically a few speakers for which the output
                   synthetic speech sounds worse than that of other
                   speakers, despite having the same amount of adaptation
                   data from within the same corpus. This paper
                   investigates these fluctuations in quality and
                   concludes that as mel-cepstral distance from the
                   average voice becomes larger, the MOS naturalness
                   scores generally become worse. Although this negative
                   correlation is not that strong, it suggests a way to
                   improve the training and adaptation strategies. We also
                   draw comparisons between our findings and the work of
                   other researchers regarding ``vocal attractiveness.''},
  keywords = {speech synthesis, HMM, average voice, speaker
                   adaptation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100361.pdf},
  year = 2010
}
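
A common formulation of the mel-cepstral distance used in analyses like this
is MCD = (10 / ln 10) * sqrt(2 * sum_d (c_d - c'_d)^2), averaged over aligned
frames and excluding the 0th (energy) coefficient. The sketch below uses
random placeholder frames; that the paper's distance computation matches this
exact formulation is an assumption:

import numpy as np

def mel_cepstral_distortion(c1, c2):
    # MCD in dB between time-aligned mel-cepstral sequences,
    # skipping the 0th (energy) coefficient.
    diff = c1[:, 1:] - c2[:, 1:]
    return np.mean((10.0 / np.log(10.0)) * np.sqrt(2.0 * np.sum(diff ** 2, axis=1)))

rng = np.random.default_rng(0)
frames_avg = rng.normal(size=(100, 25))  # placeholder "average voice" mel-cepstra
frames_spk = frames_avg + 0.05 * rng.normal(size=(100, 25))  # a nearby "speaker"
print(f"MCD: {mel_cepstral_distortion(frames_avg, frames_spk):.3f} dB")
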
@article{junichi:ieee2010,
  author = {Junichi Yamagishi and Bela Usabaev and Simon King
                   and Oliver Watts and John Dines and Jilei Tian and
                   Rile Hu and Yong Guan and Keiichiro Oura and Keiichi
                   Tokuda and Reima Karhila and Mikko Kurimo},
  title = {Thousands of Voices for {HMM}-based Speech Synthesis
                   -- Analysis and Application of {TTS} Systems Built on
                   Various {ASR} Corpora},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = 18,
  number = 5,
  pages = {984--1004},
  abstract = {In conventional speech synthesis, large amounts of
                   phonetically balanced speech data recorded in highly
                   controlled recording studio environments are typically
                   required to build a voice. Although using such data is
                   a straightforward solution for high quality synthesis,
                   the number of voices available will always be limited,
                   because recording costs are high. On the other hand,
                   our recent experiments with HMM-based speech synthesis
                   systems have demonstrated that speaker-adaptive
                   HMM-based speech synthesis (which uses an ``average
                   voice model'' plus model adaptation) is robust to
                   non-ideal speech data that are recorded under various
                   conditions and with varying microphones, that are not
                   perfectly clean, and/or that lack phonetic balance.
                   This enables us to consider building high-quality
                   voices on ``non-TTS'' corpora such as ASR corpora.
                   Since ASR corpora generally include a large number of
                   speakers, this leads to the possibility of producing an
                   enormous number of voices automatically. In this paper,
                   we demonstrate the thousands of voices for HMM-based
                   speech synthesis that we have made from several popular
                   ASR corpora such as the Wall Street Journal (WSJ0,
                   WSJ1, and WSJCAM0), Resource Management, Globalphone,
                   and SPEECON databases. We also present the results of
                   associated analysis based on perceptual evaluation, and
                   discuss remaining issues.},
  doi = {10.1109/TASL.2010.2045237},
  keywords = {Automatic speech recognition (ASR), H Triple S (HTS),
                   SPEECON database, WSJ database, average voice, hidden
                   Markov model (HMM)-based speech synthesis, speaker
                   adaptation, speech synthesis, voice conversion},
  month = jul,
  year = 2010
}
@inproceedings{jyamagis:1000sHTS,
  author = {Junichi Yamagishi and Bela Usabaev and Simon King and
                   Oliver Watts and John Dines and Jilei Tian and Rile Hu
                   and Yong Guan and Keiichiro Oura and Keiichi Tokuda and
                   Reima Karhila and Mikko Kurimo},
  title = {Thousands of voices for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {420--423},
  address = {Brighton, U.K.},
  abstract = {Our recent experiments with HMM-based speech synthesis
                   systems have demonstrated that speaker-adaptive
                    HMM-based speech synthesis (which uses an ``average
                    voice model'' plus model adaptation) is robust to
                    non-ideal speech data that are recorded under various
                    conditions and with varying microphones, that are not
                    perfectly clean, and/or that lack phonetic balance.
                    This enables us to consider building high-quality
                    voices on ``non-TTS'' corpora such as ASR corpora.
                    Since ASR
                   corpora generally include a large number of speakers,
                   this leads to the possibility of producing an enormous
                   number of voices automatically. In this paper we show
                   thousands of voices for HMM-based speech synthesis that
                   we have made from several popular ASR corpora such as
                   the Wall Street Journal databases (WSJ0/WSJ1/WSJCAM0),
                   Resource Management, Globalphone and Speecon. We report
                   some perceptual evaluation results and outline the
                   outstanding issues.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/index.php},
  year = 2009
}
@inproceedings{letter_based_TTS,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {Letter-based speech synthesis},
  booktitle = {Proc. Speech Synthesis Workshop 2010},
  pages = {317--322},
  address = {Nara, Japan},
  abstract = {Initial attempts at performing text-to-speech
                   conversion based on standard orthographic units are
                   presented, forming part of a larger scheme of training
                   TTS systems on features that can be trivially extracted
                   from text. We evaluate the possibility of using the
                   technique of decision-tree-based context clustering
                   conventionally used in HMM-based systems for
                    parameter tying to handle letter-to-sound conversion. We
                   present the application of a method of compound-feature
                    discovery to corpus-based speech synthesis. Finally, an
                   evaluation of intelligibility of letter-based systems
                   and more conventional phoneme-based systems is
                   presented.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7.pdf},
  year = 2010
}
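
As a loose analogy to the decision-tree treatment of letter context for
letter-to-sound conversion (a toy sketch, not the paper's HTS-based context
clustering), a classifier can choose a phone by asking questions about a
letter's neighbours; the six-entry lexicon and one-hot encoding are
illustrative assumptions:

from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# (previous letter, letter, next letter) -> phone; '#' marks a word boundary.
data = [("#", "c", "a", "k"), ("c", "a", "t", "a"), ("a", "t", "#", "t"),
        ("#", "c", "i", "s"), ("c", "i", "t", "i"), ("i", "t", "y", "t")]
X_raw = [row[:3] for row in data]
y = [row[3] for row in data]

enc = OneHotEncoder(handle_unknown="ignore")
X = enc.fit_transform(X_raw)
clf = DecisionTreeClassifier().fit(X, y)

# Questions about the following letter separate soft and hard 'c', much
# as clustering questions partition contexts for parameter tying.
print(clf.predict(enc.transform([("#", "c", "i")])))  # -> ['s']
print(clf.predict(enc.transform([("#", "c", "a")])))  # -> ['k']
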