The Centre for Speech Technology Research, The University of Edinburgh

Publications by Oliver Watts

owatts.bib

@inproceedings{LorenzoAlbayzinProposal2012,
  author = {Lorenzo-Trueba, Jaime and Watts, Oliver and Barra-Chicote, Roberto and Yamagishi, Junichi and King, Simon and Montero, Juan M},
  title = {Simple4All proposals for the Albayzin Evaluations in Speech Synthesis},
  abstract = {Simple4All is a European-funded project that aims to streamline the production of multilanguage expressive synthetic voices by means of unsupervised data extraction techniques, allowing the automatic processing of freely available data into flexible task-specific voices. In this paper we describe three different approaches for this task, the first two covering enhancements in expressivity and flexibility, with the final one focusing on the development of unsupervised voices. The first technique introduces the principle of speaker adaptation from average models consisting of multiple voices, with the second being an extension of this adaptation concept into allowing the control of the expressive strength of the synthetic voice. Finally, an unsupervised approach to synthesis capable of learning from unlabelled text data is introduced in detail.},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/simple4all-proposal.pdf},
  booktitle = {Proc. Iberspeech 2012},
  categories = {Albayzin challenge, expressive speech synthesis}
}
@article{child_speech_journal_2010,
  author = {Watts, O. and Yamagishi, J. and King, S. and Berkling, K.},
  doi = {10.1109/TASL.2009.2035029},
  title = {Synthesis of Child Speech with {HMM} Adaptation and Voice Conversion},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  issn = {1558-7916},
  number = {5},
  abstract = {The synthesis of child speech presents challenges both in the collection of data and in the building of a synthesizer from that data. We chose to build a statistical parametric synthesizer using the hidden Markov model (HMM)-based system HTS, as this technique has previously been shown to perform well for limited amounts of data, and for data collected under imperfect conditions. Six different configurations of the synthesizer were compared, using both speaker-dependent and speaker-adaptive modeling techniques, and using varying amounts of data. For comparison with HMM adaptation, techniques from voice conversion were used to transform existing synthesizers to the characteristics of the target speaker. Speaker-adaptive voices generally outperformed child speaker-dependent voices in the evaluation. HMM adaptation outperformed voice conversion style techniques when using the full target speaker corpus; with fewer adaptation data, however, no significant listener preference for either HMM adaptation or voice conversion methods was found.},
  month = {July},
  volume = {18},
  year = {2010},
  keywords = {HMM adaptation techniques;child speech synthesis;hidden Markov model;speaker adaptive modeling technique;speaker dependent technique;speaker-adaptive voice;statistical parametric synthesizer;target speaker corpus;voice conversion;hidden Markov models;speech synthesis;},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/YamagishiJ_SynthesisofChildSpeech.pdf},
  pages = {1005--1016}
}
@article{Ekpenyong2013,
  author = {Ekpenyong, Moses and Urua, Eno-Abasi and Watts, Oliver and King, Simon and Yamagishi, Junichi},
  numpages = {9},
  issue_date = {January, 2014},
  doi = {10.1016/j.specom.2013.02.003},
  title = {Statistical Parametric Speech Synthesis for {I}bibio},
  url = {http://dx.doi.org/10.1016/j.specom.2013.02.003},
  journal = {Speech Communication},
  issn = {0167-6393},
  month = {January},
  volume = {56},
  pages = {243--251},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Moses_Ibibio.pdf},
  abstract = {Ibibio is a Nigerian tone language, spoken in the south-east coastal region of Nigeria. Like most African languages, it is resource-limited. This presents a major challenge to conventional approaches to speech synthesis, which typically require the training of numerous predictive models of linguistic features such as the phoneme sequence (i.e., a pronunciation dictionary plus a letter-to-sound model) and prosodic structure (e.g., a phrase break predictor). This training is invariably supervised, requiring a corpus of training data labelled with the linguistic feature to be predicted. In this paper, we investigate what can be achieved in the absence of many of these expensive resources, and also with a limited amount of speech recordings. We employ a statistical parametric method, because this has been found to offer good performance even on small corpora, and because it is able to directly learn the relationship between acoustics and whatever linguistic features are available, potentially mitigating the absence of explicit representations of intermediate linguistic layers such as prosody. We present an evaluation that compares systems that have access to varying degrees of linguistic structure. The simplest system only uses phonetic context (quinphones), and this is compared to systems with access to a richer set of context features, with or without tone marking. It is found that the use of tone marking contributes significantly to the quality of synthetic speech. Future work should therefore address the problem of tone assignment using a dictionary and the building of a prediction module for out-of-vocabulary words.},
  categories = {HTS, Ibibio, Low-resource languages, Speech synthesis}
}
@phdthesis{watts-2012,
  author = {Watts, Oliver},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/oliver_watts_thesis.pdf},
  school = {University of Edinburgh},
  title = {Unsupervised Learning for Text-to-Speech Synthesis},
  abstract = {This thesis introduces a general method for incorporating the distributional analysis of textual and linguistic objects into text-to-speech (TTS) conversion systems. Conventional TTS conversion uses intermediate layers of representation to bridge the gap between text and speech. Collecting the annotated data needed to produce these intermediate layers is a far from trivial task, possibly prohibitively so for languages in which no such resources are in existence. Distributional analysis, in contrast, proceeds in an unsupervised manner, and so enables the creation of systems using textual data that are not annotated. The method therefore aids the building of systems for languages in which conventional linguistic resources are scarce, but is not restricted to these languages. The distributional analysis proposed here places the textual objects analysed in a continuous-valued space, rather than specifying a hard categorisation of those objects. This space is then partitioned during the training of acoustic models for synthesis, so that the models generalise over objects' surface forms in a way that is acoustically relevant. The method is applied to three levels of textual analysis: to the characterisation of sub-syllabic units, word units and utterances. Entire systems for three languages (English, Finnish and Romanian) are built with no reliance on manually labelled data or language-specific expertise. Results of a subjective evaluation are presented.},
  year = {2012}
}
@inproceedings{watts_yamagishi_king_2011,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {Unsupervised continuous-valued word features for phrase-break prediction without a part-of-speech tagger},
  booktitle = {Proc. Interspeech},
  address = {Florence, Italy},
  month = {August},
  pages = {2157--2160},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
  abstract = {Part of speech (POS) tags are foremost among the features conventionally used to predict intonational phrase-breaks for text to speech (TTS) conversion. The construction of such systems therefore presupposes the availability of a POS tagger for the relevant language, or of a corpus manually tagged with POS. However, such tools and resources are not available in the majority of the world’s languages, and manually labelling text with POS tags is an expensive and time-consuming process. We therefore propose the use of continuous-valued features that summarise the distributional characteristics of word types as surrogates for POS features. Importantly, such features are obtained in an unsupervised manner from an untagged text corpus. We present results on the phrase-break prediction task, where use of the features closes the gap in performance between a baseline system (using only basic punctuation-related features) and a topline system (incorporating a state-of-the-art POS tagger).}
}
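
A rough, hypothetical sketch of the idea described in the abstract above (not the paper's actual pipeline): continuous-valued word features are obtained in an unsupervised way by factorising a word/context co-occurrence matrix built from untagged text, and are then fed to a simple phrase-break classifier in place of POS tags. The toy corpus, window size, dimensionality and break labels below are all invented for illustration.

import numpy as np
from scipy.sparse import lil_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression

corpus = ["the cat sat on the mat".split(),
          "a dog sat near the door".split()]

# Word/context co-occurrence counts within a +/-1 word window.
vocab = sorted({w for sent in corpus for w in sent})
idx = {w: i for i, w in enumerate(vocab)}
cooc = lil_matrix((len(vocab), len(vocab)))
for sent in corpus:
    for i, w in enumerate(sent):
        for j in (i - 1, i + 1):
            if 0 <= j < len(sent):
                cooc[idx[w], idx[sent[j]]] += 1

# Continuous word features: low-rank factorisation of the co-occurrence matrix.
word_vecs = TruncatedSVD(n_components=2, random_state=0).fit_transform(cooc.tocsr())

# Phrase-break prediction: features for a word juncture are the vectors of the
# words on either side; labels (1 = break) are invented here for illustration.
X = np.vstack([np.concatenate([word_vecs[idx["mat"]], word_vecs[idx["a"]]]),
               np.concatenate([word_vecs[idx["cat"]], word_vecs[idx["sat"]]])])
y = np.array([1, 0])
print(LogisticRegression().fit(X, y).predict(X))
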
@inproceedings{child_synthesis_2009,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon and Berkling, Kay},
  title = {{HMM} Adaptation and Voice Conversion for the Synthesis of Child Speech: A Comparison},
  booktitle = {Proc. Interspeech 2009},
  address = {Brighton, U.K.},
  month = {September},
  pages = {2627--2630},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/child_synthesis_2009.pdf},
  abstract = {This study compares two different methodologies for producing data-driven synthesis of child speech from existing systems that have been trained on the speech of adults. On one hand, an existing statistical parametric synthesiser is transformed using model adaptation techniques, informed by linguistic and prosodic knowledge, to the speaker characteristics of a child speaker. This is compared with the application of voice conversion techniques to convert the output of an existing waveform concatenation synthesiser with no explicit linguistic or prosodic knowledge. In a subjective evaluation of the similarity of synthetic speech to natural speech from the target speaker, the HMM-based systems evaluated are generally preferred, although this is at least in part due to the higher dimensional acoustic features supported by these techniques.}
}
@inproceedings{leo_08-3,
  author = {Andersson, J. Sebastian and Badino, Leonardo and Watts, Oliver S. and Aylett, Matthew P.},
  title = {The {CSTR/Cereproc B}lizzard Entry 2008: The Inconvenient Data},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc. Interspeech 2008)},
  address = {Brisbane, Australia},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/cstr-cereproc_Blizzard2008.pdf},
  abstract = {In a commercial system, data used for unit selection systems is collected with a heavy emphasis on homogeneous neutral data that has sufficient coverage for the units that will be used in the system. In this year's Blizzard entry CSTR and CereProc present a joint entry where the emphasis has been to explore techniques to deal with data which is not homogeneous (the English entry) and did not have appropriate coverage for a diphone based system (the Mandarin entry, where tone/phone combinations were treated as distinct phone categories). In addition, two further problems were addressed: 1) making use of non-homogeneous data for creating a voice that can realise both expressive and neutral speaking styles (the English entry); 2) building a unit selection system with no native understanding of the language but depending instead on external native evaluation (the Mandarin entry).}
}
@inproceedings{hts-child-oliver,
  author = {Watts, Oliver and Yamagishi, Junichi and Berkling, Kay and King, Simon},
  title = {{HMM}-based synthesis of child speech},
  booktitle = {Proc. 1st Workshop on Child, Computer and Interaction (ICMI'08 post-conference workshop)},
  year = {2008},
  month = {October},
  key = {hts-child-oliver},
  address = {Crete, Greece},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/child-hts-oliver.pdf},
  abstract = {The synthesis of child speech presents challenges both in the collection of data and in the building of a synthesiser from that data. Because only limited data can be collected, and the domain of that data is constrained, it is difficult to obtain the type of phonetically-balanced corpus usually used in speech synthesis. As a consequence, building a synthesiser from this data is difficult. Concatenative synthesisers are not robust to corpora with many missing units (as is likely when the corpus content is not carefully designed), so we chose to build a statistical parametric synthesiser using the HMM-based system HTS. This technique has previously been shown to perform well for limited amounts of data, and for data collected under imperfect conditions. We compared 6 different configurations of the synthesiser, using both speaker-dependent and speaker-adaptive modelling techniques, and using varying amounts of data. The output from these systems was evaluated alongside natural and vocoded speech, in a Blizzard-style listening test.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice, child speech}
}
@inproceedings{higher_level,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {The role of higher-level linguistic features in {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  address = {Makuhari, Japan},
  month = {September},
  pages = {841--844},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100565.pdf},
  abstract = {We analyse the contribution of higher-level elements of the linguistic specification of a data-driven speech synthesiser to the naturalness of the synthetic speech which it generates. The system is trained using various subsets of the full feature-set, in which features relating to syntactic category, intonational phrase boundary, pitch accent and boundary tones are selectively removed. Utterances synthesised by the different configurations of the system are then compared in a subjective evaluation of their naturalness. The work presented forms background analysis for an ongoing set of experiments in performing text-to-speech (TTS) conversion based on shallow features: features that can be trivially extracted from text. By building a range of systems, each assuming the availability of a different level of linguistic annotation, we obtain benchmarks for our on-going work.}
}
@inproceedings{watts_zhou_2011,
  author = {Watts, Oliver and Zhou, Bowen},
  title = {Unsupervised features from text for speech synthesis in a speech-to-speech translation system},
  booktitle = {Proc. Interspeech},
  address = {Florence, Italy},
  month = {August},
  pages = {2153--2156},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_zhou_2011.pdf},
  abstract = {We explore the use of linguistic features that can be extracted from unannotated text in an unsupervised, language-independent fashion for text-to-speech (TTS) conversion in the context of a speech-to-speech translation system. The features are intended to act as surrogates for conventional part of speech (POS) features. Unlike POS features, the experimental features assume only the availability of tools and data that must already be in place for the construction of other components of the translation system, and can therefore be used for the TTS module without incurring additional TTS-specific costs. We here describe the use of the experimental features in a speech synthesiser, using six different configurations of the system to allow the comparison of the proposed features with conventional, knowledge-based POS features. We present results of objective and subjective evaluations of the usefulness of the new features.}
}
@inproceedings{junichi:interspeech2010,
  author = {Yamagishi, Junichi and Watts, Oliver and King, Simon and Usabaev, Bela},
  title = {Roles of the Average Voice in Speaker-adaptive {HMM}-based Speech Synthesis},
  booktitle = {{Proc. Interspeech}},
  address = {Makuhari, Japan},
  month = {September},
  pages = {418--421},
  year = {2010},
  keywords = {speech synthesis, HMM, average voice, speaker adaptation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100361.pdf},
  abstract = {In speaker-adaptive HMM-based speech synthesis, there are typically a few speakers for which the output synthetic speech sounds worse than that of other speakers, despite having the same amount of adaptation data from within the same corpus. This paper investigates these fluctuations in quality and concludes that as mel-cepstral distance from the average voice becomes larger, the MOS naturalness scores generally become worse. Although this negative correlation is not that strong, it suggests a way to improve the training and adaptation strategies. We also draw comparisons between our findings and the work of other researchers regarding ``vocal attractiveness.''}
}
@article{junichi:ieee2010,
  author = {Yamagishi, J. and Usabaev, B. and King, S. and Watts, O. and Dines, J. and Tian, J. and Hu, R. and Guan, Y. and Oura, K. and Tokuda, K. and Karhila, R. and Kurimo, M.},
  doi = {10.1109/TASL.2010.2045237},
  title = {Thousands of Voices for {HMM}-based Speech Synthesis -- Analysis and Application of {TTS} Systems Built on Various {ASR} Corpora},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {5},
  month = {July},
  volume = {18},
  pages = {984--1004},
  year = {2010},
  keywords = {Automatic speech recognition (ASR), H Triple S (HTS), SPEECON database, WSJ database, average voice, hidden Markov model (HMM)-based speech synthesis, speaker adaptation, speech synthesis, voice conversion},
  abstract = {In conventional speech synthesis, large amounts of phonetically balanced speech data recorded in highly controlled recording studio environments are typically required to build a voice. Although using such data is a straightforward solution for high quality synthesis, the number of voices available will always be limited, because recording costs are high. On the other hand, our recent experiments with HMM-based speech synthesis systems have demonstrated that speaker-adaptive HMM-based speech synthesis (which uses an ``average voice model'' plus model adaptation) is robust to non-ideal speech data that are recorded under various conditions and with varying microphones, that are not perfectly clean, and/or that lack phonetic balance. This enables us to consider building high-quality voices on ``non-TTS'' corpora such as ASR corpora. Since ASR corpora generally include a large number of speakers, this leads to the possibility of producing an enormous number of voices automatically. In this paper, we demonstrate the thousands of voices for HMM-based speech synthesis that we have made from several popular ASR corpora such as the Wall Street Journal (WSJ0, WSJ1, and WSJCAM0), Resource Management, Globalphone, and SPEECON databases. We also present the results of associated analysis based on perceptual evaluation, and discuss remaining issues.}
}
@inproceedings{jyamagis:1000sHTS,
  author = {Yamagishi, J. and Usabaev, Bela and King, Simon and Watts, Oliver and Dines, John and Tian, Jilei and Hu, Rile and Guan, Yong and Oura, Keiichiro and Tokuda, Keiichi and Karhila, Reima and Kurimo, Mikko},
  title = {Thousands of voices for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  address = {Brighton, U.K.},
  month = {September},
  pages = {420--423},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/index.php},
  abstract = {Our recent experiments with HMM-based speech synthesis systems have demonstrated that speaker-adaptive HMM-based speech synthesis (which uses an ‘average voice model’ plus model adaptation) is robust to non-ideal speech data that are recorded under various conditions and with varying microphones, that are not perfectly clean, and/or that lack phonetic balance. This enables us to consider building high-quality voices on ‘non-TTS’ corpora such as ASR corpora. Since ASR corpora generally include a large number of speakers, this leads to the possibility of producing an enormous number of voices automatically. In this paper we show thousands of voices for HMM-based speech synthesis that we have made from several popular ASR corpora such as the Wall Street Journal databases (WSJ0/WSJ1/WSJCAM0), Resource Management, Globalphone and Speecon. We report some perceptual evaluation results and outline the outstanding issues.}
}
@inproceedings{letter_based_TTS,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {Letter-based speech synthesis},
  booktitle = {Proc. Speech Synthesis Workshop 2010},
  address = {Nara, Japan},
  month = {September},
  pages = {317--322},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7.pdf},
  abstract = {Initial attempts at performing text-to-speech conversion based on standard orthographic units are presented, forming part of a larger scheme of training TTS systems on features that can be trivially extracted from text. We evaluate the possibility of using the technique of decision-tree-based context clustering conventionally used in HMM-based systems for parameter tying to handle letter-to-sound conversion. We present the application of a method of compound-feature discovery to corpus-based speech synthesis. Finally, an evaluation of intelligibility of letter-based systems and more conventional phoneme-based systems is presented.}
}
@inproceedings{Lu_SSW8,
  author = {Lu, Heng and King, Simon and Watts, Oliver},
  title = {Combining a Vector Space Representation of Linguistic Context with a Deep Neural Network for Text-To-Speech Synthesis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  month = {August},
  pages = {281--285},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS3-3_Lu.pdf},
  abstract = {Conventional statistical parametric speech synthesis relies on decision trees to cluster together similar contexts, resulting in tied-parameter context-dependent hidden Markov models (HMMs). However, decision tree clustering has a major weakness: it uses hard divisions and subdivides the model space based on one feature at a time, fragmenting the data and failing to exploit interactions between linguistic context features. These linguistic features themselves are also problematic, being noisy and of varied relevance to the acoustics. We propose to combine our previous work on vector-space representations of linguistic context, which have the added advantage of working directly from textual input, with Deep Neural Networks (DNNs), which can directly accept such continuous representations as input. The outputs of the network are probability distributions over speech features. Maximum Likelihood Parameter Generation is then used to create parameter trajectories, which in turn drive a vocoder to generate the waveform. Various configurations of the system are compared, using both conventional and vector space context representations and with the DNN making speech parameter predictions at two different temporal resolutions: frames, or states. Both objective and subjective results are presented.}
}
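
As a purely illustrative sketch of the kind of model the abstract describes (all layer sizes and names and the toy data are invented; this is not the authors' implementation): a feed-forward network maps a continuous linguistic-context vector for each frame to the mean and log-variance of a Gaussian over acoustic features, i.e. "probability distributions over speech features"; the predicted distributions would then go to parameter generation and a vocoder.

import torch
import torch.nn as nn

CONTEXT_DIM, ACOUSTIC_DIM = 50, 40            # assumed sizes

class ContextToAcoustic(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Sequential(nn.Linear(CONTEXT_DIM, 256), nn.Tanh(),
                                    nn.Linear(256, 256), nn.Tanh())
        self.mean = nn.Linear(256, ACOUSTIC_DIM)
        self.log_var = nn.Linear(256, ACOUSTIC_DIM)

    def forward(self, x):
        h = self.hidden(x)
        return self.mean(h), self.log_var(h)

net = ContextToAcoustic()
contexts = torch.randn(8, CONTEXT_DIM)        # 8 frames of context vectors
targets = torch.randn(8, ACOUSTIC_DIM)        # matching acoustic frames
mu, log_var = net(contexts)
# Gaussian negative log-likelihood training criterion (constant terms omitted).
nll = 0.5 * (log_var + (targets - mu) ** 2 / log_var.exp()).mean()
nll.backward()
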
@inproceedings{Mamiya_SSW8,
  author = {Mamiya, Yoshitaka and Stan, Adriana and Yamagishi, Junichi and Bell, Peter and Watts, Oliver and Clark, Robert and King, Simon},
  title = {Using Adaptation to Improve Speech Transcription Alignment in Noisy and Reverberant Environments},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  month = {August},
  pages = {61--66},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS1-4_Mamiya.pdf},
  abstract = {When using data retrieved from the internet to create new speech databases, the recording conditions can often be highly variable within and between sessions. This variance influences the overall performance of any automatic speech and text alignment techniques used to process this data. In this paper we discuss the use of speaker adaptation methods to address this issue. Starting from a baseline system for automatic sentence-level segmentation and speech and text alignment based on GMMs and grapheme HMMs, respectively, we employ Maximum A Posteriori (MAP) and Constrained Maximum Likelihood Linear Regression (CMLLR) techniques to model the variation in the data in order to increase the amount of confidently aligned speech. We tested 29 different scenarios, which include reverberation, 8 talker babble noise and white noise, each in various combinations and SNRs. Results show that the MAP-based segmentation's performance is very much influenced by the noise type, as well as the presence or absence of reverberation. On the other hand, the CMLLR adaptation of the acoustic models gives an average 20\% increase in the aligned data percentage for the majority of the studied scenarios.}
}
@inproceedings{Watts_SSW8,
  author = {Watts, Oliver and Stan, Adriana and Clark, Rob and Mamiya, Yoshitaka and Giurgiu, Mircea and Yamagishi, Junichi and King, Simon},
  title = {Unsupervised and lightly-supervised learning for rapid construction of {TTS} systems in multiple languages from 'found' data: evaluation and analysis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  month = {August},
  pages = {121--126},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_OS2-3_Watts.pdf},
  abstract = {This paper presents techniques for building text-to-speech front-ends in a way that avoids the need for language-specific expert knowledge, but instead relies on universal resources (such as the Unicode character database) and unsupervised learning from unannotated data to ease system development. The acquisition of expert language-specific knowledge and expert annotated data is a major bottleneck in the development of corpus-based TTS systems in new languages. The methods presented here side-step the need for such resources as pronunciation lexicons, phonetic feature sets, part of speech tagged data, etc. The paper explains how the techniques introduced are applied to the 14 languages of a corpus of `found' audiobook data. Results of an evaluation of the intelligibility of the systems resulting from applying these novel techniques to this data are presented.}
}
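
A minimal sketch of the "universal resources" idea mentioned in the abstract, using only the Unicode character database (Python's standard unicodedata module) to derive language-independent character features with no hand-written rules. The feature names are invented for the example.

import unicodedata

def char_features(ch):
    cat = unicodedata.category(ch)            # e.g. 'Lu', 'Ll', 'Po', 'Nd'
    return {"char": ch,
            "is_letter": cat.startswith("L"),
            "is_digit": cat.startswith("N"),
            "is_punctuation": cat.startswith("P"),
            "name_prefix": unicodedata.name(ch, "UNKNOWN").split()[0]}

for ch in "Bună!":
    print(char_features(ch))
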
@inproceedings{Lorenzo-Trueba_SSW8,
  author = {Lorenzo-Trueba, Jaime and Barra-Chicote, Roberto and Yamagishi, Junichi and Watts, Oliver and Montero, Juan M.},
  title = {Towards Speaking Style Transplantation in Speech Synthesis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  month = {August},
  pages = {179--183},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS2-3_Lorenzo-Trueba.pdf},
  abstract = {One of the biggest challenges in speech synthesis is the production of naturally sounding synthetic voices. This means that the resulting voice must not only be of high enough quality but must also be able to capture the natural expressiveness imbued in human speech. This paper focuses on solving the expressiveness problem by proposing a set of different techniques that could be used for extrapolating the expressiveness of proven high quality speaking style models into neutral speakers in HMM-based synthesis. As an additional advantage, the proposed techniques are based on adaptation approaches, which means that they can be used with little training data (around 15 minutes of training data are used in each style for this paper). For the final implementation, a set of 4 speaking styles was considered: news broadcasts, live sports commentary, interviews and parliamentary speech. Finally, the implementations of the 5 techniques were tested through a perceptual evaluation that proves that the deviations between neutral and speaking style average models can be learned and used to imbue expressiveness into target neutral speakers as intended.}
}
@inproceedings{Stan_IS13,
  author = {Stan, Adriana and Watts, Oliver and Mamiya, Yoshitaka and Giurgiu, Mircea and Clark, Rob and Yamagishi, Junichi and King, Simon},
  title = {{TUNDRA: A Multilingual Corpus of Found Data for TTS Research Created with Light Supervision}},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/IS131055.pdf},
  abstract = {Simple4All Tundra (version 1.0) is the first release of a standardised multilingual corpus designed for text-to-speech research with imperfect or found data. The corpus consists of approximately 60 hours of speech data from audiobooks in 14 languages, as well as utterance-level alignments obtained with a lightly-supervised process. Future versions of the corpus will include finer-grained alignment and prosodic annotation, all of which will be made freely available. This paper gives a general outline of the data collected so far, as well as a detailed description of how this has been done, emphasizing the minimal language-specific knowledge and manual intervention used to compile the corpus. To demonstrate its potential use, text-to-speech systems have been built for all languages using unsupervised or lightly supervised methods, also briefly presented in the paper.}
}
@inproceedings{blizzard_13,
  author = {Watts, Oliver and Stan, Adriana and Mamiya, Yoshitaka and Suni, Antti and Burgos, José Martín and Montero, Juan Manuel},
  title = {{The {Simple4All} entry to the Blizzard Challenge 2013}},
  booktitle = {Proc. Blizzard Challenge 2013},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/S4A_blizzard_2013.pdf},
  abstract = {We describe the synthetic voices entered into the 2013 Blizzard Challenge by the SIMPLE4ALL consortium. The 2013 Blizzard Challenge presents an opportunity to test and benchmark some of the tools we have been developing to address two problems of interest: 1) how best to learn from plentiful 'found' data, and 2) how to produce systems in arbitrary new languages with minimal annotated data and language-specific expertise on the part of the system builders. We here explain how our tools were used to address these problems on the different tasks of the challenge, and provide some discussion of the evaluation results.}
}
@inproceedings{Mamiya_13a,
  author = {Mamiya, Yoshitaka and Yamagishi, Junichi and Watts, Oliver and Clark, Robert A.J. and King, Simon and Stan, Adriana},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/0007987.pdf},
  booktitle = {Proc. ICASSP},
  title = {Lightly Supervised {GMM} {VAD} to Use Audiobook for Speech Synthesiser},
  abstract = {Audiobooks have been focused on as promising data for training Text-to-Speech (TTS) systems. However, they usually do not have a correspondence between audio and text data. Moreover, they are usually divided only into chapter units. In practice, we have to make a correspondence of audio and text data before we use them for building TTS synthesisers. However aligning audio and text data is time-consuming and involves manual labor. It also requires persons skilled in speech processing. Previously, we have proposed to use graphemes for automatically aligning speech and text data. This paper further integrates a lightly supervised voice activity detection (VAD) technique to detect sentence boundaries as a pre-processing step before the grapheme approach. This lightly supervised technique requires time stamps of speech and silence only for the first fifty sentences. Combining those, we can semi-automatically build TTS systems from audiobooks with minimum manual intervention. From subjective evaluations we analyse how the grapheme-based aligner and/or the proposed VAD technique impact the quality of HMM-based speech synthesisers trained on audiobooks.},
  year = {2013}
}
@inproceedings{boros-2014,
  author = {Boroș, Tiberiu and Stan, Adriana and Watts, Oliver and Dumitrescu, Stefan Daniel},
  title = {{RSS-TOBI} - a Prosodically Enhanced {R}omanian Speech Corpus},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)},
  address = {Reykjavik, Iceland},
  month = {May},
  year = {2014},
  date = {26-31},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/boros-2014.pdf},
  abstract = {This paper introduces a recent development of a Romanian Speech corpus to include prosodic annotations of the speech data in the form of ToBI labels. We describe the methodology of determining the required pitch patterns that are common for the Romanian language, annotate the speech resource, and then provide a comparison of two text-to-speech synthesis systems to establish the benefits of using this type of information to our speech resource. The result is a publicly available speech dataset which can be used to further develop speech synthesis systems or to automatically learn the prediction of ToBI labels from text in Romanian language.},
  categories = {text-to-speech synthesis, Romanian language, ToBI}
}
@inproceedings{watts-2014,
  author = {Watts, Oliver and Gangireddy, Siva and Yamagishi, Junichi and King, Simon and Renals, Steve and Stan, Adriana and Giurgiu, Mircea},
  title = {Neural net word representations for phrase-break prediction without a part of speech tagger},
  booktitle = {Proc. ICASSP},
  address = {Florence, Italy},
  abstract = {The use of shared projection neural nets of the sort used in language modelling is proposed as a way of sharing parameters between multiple text-to-speech system components. We experiment with pretraining the weights of such a shared projection on an auxiliary language modelling task and then apply the resulting word representations to the task of phrase-break prediction. Doing so allows us to build phrase-break predictors that rival conventional systems without any reliance on conventional knowledge-based resources such as part of speech taggers.},
  month = {May},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/watts-2014.pdf},
  pages = {2618--2622},
  categories = {Speech synthesis, TTS, unsupervised learning, neural net language modelling, multitask learning}
}
@inproceedings{blizzard_14,
  author = {Suni, Antti and Raitio, Tuomo and Gowda, Dhananjaya and Karhila, Reima and Gibson, Matt and Watts, Oliver},
  title = {{The {Simple4All} entry to the Blizzard Challenge 2014}},
  booktitle = {Proc. Blizzard Challenge 2014},
  month = {September},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/blizzard_14.pdf},
  abstract = {We describe the synthetic voices entered into the 2014 Blizzard Challenge by the SIMPLE4ALL consortium. The 2014 Blizzard Challenge presents an opportunity to test and benchmark some of the tools we have been developing to address the problem of how to produce systems in arbitrary new languages with minimal annotated data and language-specific expertise on the part of the system builders. We here explain how our tools were used to address these problems on the different tasks of the challenge, and provide some discussion of the evaluation results. Several additions to the system used to build voices for the previous Challenge are described: naive alphabetisation, unsupervised syllabification, and glottal flow pulse prediction using deep neural networks.},
  categories = {statistical parametric speech synthesis, unsupervised learning, vector space model, glottal inverse filtering, deep neural network, glottal flow pulse library}
}
@inproceedings{dnnbmtl_ICASSP15,
  author = {Wu, Z. and Valentini-Botinhao, C. and Watts, O. and King, S.},
  title = {{Deep neural networks employing multi-task learning and stacked bottleneck features for speech synthesis.}},
  booktitle = {Proc. ICASSP},
  address = {Brisbane, Australia},
  month = {April},
  pages = {4460--4464},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/dnnbmtl_ICASSP15.pdf},
  abstract = {Deep neural networks (DNNs) use a cascade of hidden representations to enable the learning of complex mappings from input to output features. They are able to learn the complex mapping from text-based linguistic features to speech acoustic features, and so perform text-to-speech synthesis. Recent results suggest that DNNs can produce more natural synthetic speech than conventional HMM-based statistical parametric systems. In this paper, we show that the hidden representation used within a DNN can be improved through the use of Multi-Task Learning, and that stacking multiple frames of hidden layer activations (stacked bottleneck features) also leads to improvements. Experimental results confirmed the effectiveness of the proposed methods, and in listening tests we find that stacked bottleneck features in particular offer a significant improvement over both a baseline DNN and a benchmark HMM system.}
}
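
A hypothetical sketch of the two ideas in the abstract (layer sizes, the secondary task and variable names are all invented; this is not the authors' configuration): a DNN with a shared trunk ending in a narrow bottleneck layer feeds both the acoustic output and a secondary multi-task output, and the bottleneck activations of neighbouring frames are then stacked to form the input of a second network.

import torch
import torch.nn as nn

LING_DIM, ACOUSTIC_DIM, SECONDARY_DIM, BN_DIM = 300, 60, 10, 32

class MultiTaskDNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(LING_DIM, 512), nn.ReLU(),
                                   nn.Linear(512, BN_DIM), nn.ReLU())  # bottleneck
        self.acoustic_head = nn.Linear(BN_DIM, ACOUSTIC_DIM)
        self.secondary_head = nn.Linear(BN_DIM, SECONDARY_DIM)

    def forward(self, x):
        bn = self.trunk(x)
        return self.acoustic_head(bn), self.secondary_head(bn), bn

net = MultiTaskDNN()
frames = torch.randn(100, LING_DIM)                 # 100 frames of linguistic input
acoustic, secondary, bn = net(frames)

# Stacked bottleneck features: concatenate bottleneck activations from a
# +/-2 frame window (with edge padding) and feed them to a second network.
padded = torch.cat([bn[:1].repeat(2, 1), bn, bn[-1:].repeat(2, 1)], dim=0)
stacked = torch.cat([padded[i:i + 100] for i in range(5)], dim=1)
second_net = nn.Sequential(nn.Linear(5 * BN_DIM, 512), nn.ReLU(),
                           nn.Linear(512, ACOUSTIC_DIM))
refined = second_net(stacked)
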
@inproceedings{Merritt2015RichContext,
  author = {Merritt, Thomas and Yamagishi, Junichi and Wu, Zhizheng and Watts, Oliver and King, Simon},
  title = {{Deep neural network context embeddings for model selection in rich-context HMM synthesis}},
  booktitle = {{Proc. Interspeech}},
  year = {2015},
  month = {September},
  address = {Dresden},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/merritt2015RichContext.pdf},
  abstract = {{This paper introduces a novel form of parametric synthesis that uses context embeddings produced by the bottleneck layer of a deep neural network to guide the selection of models in a rich-context HMM-based synthesiser. Rich-context synthesis – in which Gaussian distributions estimated from single linguistic contexts seen in the training data are used for synthesis, rather than more conventional decision tree-tied models – was originally proposed to address over-smoothing due to averaging across contexts. Our previous investigations have confirmed experimentally that averaging across different contexts is indeed one of the largest factors contributing to the limited quality of statistical parametric speech synthesis. However, a possible weakness of the rich context approach as previously formulated is that a conventional tied model is still used to guide selection of Gaussians at synthesis time. Our proposed approach replaces this with context embeddings derived from a neural network.}},
  categories = {{speech synthesis, hidden Markov model, deep neural networks, rich context, embedding}}
}
@article{stan-2016,
  author = {Stan, Adriana and Mamiya, Yoshitaka and Yamagishi, Junichi and Bell, Peter and Watts, Oliver and Clark, Rob and King, Simon},
  doi = {http://dx.doi.org/10.1016/j.csl.2015.06.006},
  title = {{ALISA}: An automatic lightly supervised speech segmentation and alignment tool},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230815000650},
  journal = {Computer Speech and Language},
  issn = {0885-2308},
  pages = {116--133},
  volume = {35},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/stan-2016.pdf},
  abstract = {This paper describes the ALISA tool, which implements a lightly supervised method for sentence-level alignment of speech with imperfect transcripts. Its intended use is to enable the creation of new speech corpora from a multitude of resources in a language-independent fashion, thus avoiding the need to record or transcribe speech data. The method is designed so that it requires minimum user intervention and expert knowledge, and it is able to align data in languages which employ alphabetic scripts. It comprises a GMM-based voice activity detector and a highly constrained grapheme-based speech aligner. The method is evaluated objectively against a gold standard segmentation and transcription, as well as subjectively through building and testing speech synthesis systems from the retrieved data. Results show that on average, 70\% of the original data is correctly aligned, with a word error rate of less than 0.5\%. In one case, subjective listening tests show a statistically significant preference for voices built on the gold transcript, but this is small and in other tests, no statistically significant differences between the systems built from the fully supervised training data and the one which uses the proposed method are found.},
  categories = {Speech segmentation, speech and text alignment, grapheme acoustic models, lightly supervised system, imperfect transcripts}
}
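
A rough sketch of the GMM-based voice activity detection component mentioned in the abstract (not the released ALISA code): fit a two-component Gaussian mixture to per-frame log-energies and label frames from the higher-energy component as speech. The frame length and synthetic signal are invented for the example.

import numpy as np
from sklearn.mixture import GaussianMixture

def gmm_vad(samples, frame_len=400):
    n_frames = len(samples) // frame_len
    frames = samples[:n_frames * frame_len].reshape(n_frames, frame_len)
    log_energy = np.log(np.sum(frames ** 2, axis=1) + 1e-10).reshape(-1, 1)
    gmm = GaussianMixture(n_components=2, random_state=0).fit(log_energy)
    speech = int(np.argmax(gmm.means_.ravel()))       # higher-energy component
    return gmm.predict(log_energy) == speech          # True = speech frame

# Synthetic test: quiet noise, then louder 'speech', then quiet noise again.
rng = np.random.default_rng(0)
signal = np.concatenate([0.01 * rng.standard_normal(8000),
                         0.5 * rng.standard_normal(16000),
                         0.01 * rng.standard_normal(8000)])
print(gmm_vad(signal).astype(int))
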
@inproceedings{wester2016evaluating,
  author = {Wester, Mirjam and Watts, Oliver and Henter, Gustav Eje},
  title = {Evaluating comprehension of natural and synthetic conversational speech},
  url = {http://www.isca-speech.org/archive/sp2016/pdfs_stamped/41.pdf},
  abstract = {Current speech synthesis methods typically operate on isolated sentences and lack convincing prosody when generating longer segments of speech. Similarly, prevailing TTS evaluation paradigms, such as intelligibility (transcription word error rate) or MOS, only score sentences in isolation, even though overall comprehension is arguably more important for speech-based communication. In an effort to develop more ecologically-relevant evaluation techniques that go beyond isolated sentences, we investigated comprehension of natural and synthetic speech dialogues. Specifically, we tested listener comprehension on long segments of spontaneous and engaging conversational speech (three 10-minute radio interviews of comedians). Interviews were reproduced either as natural speech, synthesised from carefully prepared transcripts, or synthesised using durations from forced-alignment against the natural speech, all in a balanced design. Comprehension was measured using multiple choice questions. A significant difference was measured between the comprehension/retention of natural speech (74\% correct responses) and synthetic speech with forced-aligned durations (61\% correct responses). However, no significant difference was observed between natural and regular synthetic speech (70\% correct responses). Effective evaluation of comprehension remains elusive.},
  year = {2016},
  month = {June},
  volume = {8},
  pages = {736--740},
  address = {Boston, MA},
  keywords = {evaluation, comprehension, conversational speech, statistical parametric speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/41.pdf},
  booktitle = {Speech Prosody},
  categories = {evaluation, comprehension, conversational speech, statistical parametric speech synthesis}
}
@inproceedings{henter2016robust,
  author = {Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon},
  title = {Robust {TTS} duration modelling using {DNN}s},
  url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7472655},
  abstract = {Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and expressive speech are easy to acquire for training them. Unfortunately, found datasets are seldom subject to the quality-control that traditional synthesis methods expect. Common issues likely to affect duration modelling include transcription errors, reductions, filled pauses, and forced-alignment inaccuracies. To combat this, we propose to improve modelling and prediction of speech durations using methods from robust statistics, which are able to disregard ill-fitting points in the training material. We describe a robust fitting criterion based on the density power divergence (the beta-divergence) and a robust generation heuristic using mixture density networks (MDNs). Perceptual tests indicate that subjects prefer synthetic speech generated using robust models of duration over the baselines.},
  year = {2016},
  month = {March},
  volume = {41},
  pages = {5130--5134},
  address = {Shanghai, China},
  keywords = {Speech synthesis, duration modelling, robust statistics},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/henter2016robust.pdf},
  booktitle = {Proc. ICASSP},
  categories = {Speech synthesis, duration modelling, robust statistics}
}
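
For readers unfamiliar with the density power divergence (beta-divergence) mentioned in this abstract, one common way of writing such a robust fitting criterion, taken from the general robust-statistics literature with assumed notation, is sketched below; it is not necessarily the exact objective used in the paper.

% A sketch of a density-power-divergence fitting criterion (assumed notation).
% For a model density p_theta and observed durations x_1, ..., x_N, minimise
\[
  L_\beta(\theta) \;=\; \frac{1}{N}\sum_{n=1}^{N}
    \left[ \int p_\theta(x)^{1+\beta}\,dx
           \;-\; \Bigl(1 + \tfrac{1}{\beta}\Bigr)\, p_\theta(x_n)^{\beta} \right],
  \qquad \beta > 0.
\]
% As beta tends to 0 this recovers maximum likelihood up to additive constants,
% while for beta > 0 points assigned low model density contribute little to the
% gradient, down-weighting outliers such as misaligned or mistranscribed durations.
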
@inproceedings{watts2016hmms,
  author = {Watts, Oliver and Henter, Gustav Eje and Merritt, Thomas and Wu, Zhizheng and King, Simon},
  title = {From {HMM}s to {DNN}s: where do the improvements come from?},
  url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7472730},
  abstract = {Deep neural networks (DNNs) have recently been the focus of much text-to-speech research as a replacement for decision trees and hidden Markov models (HMMs) in statistical parametric synthesis systems. Performance improvements have been reported; however, the configuration of systems evaluated makes it impossible to judge how much of the improvement is due to the new machine learning methods, and how much is due to other novel aspects of the systems. Specifically, whereas the decision trees in HMM-based systems typically operate at the state-level, and separate trees are used to handle separate acoustic streams, most DNN-based systems are trained to make predictions simultaneously for all streams at the level of the acoustic frame. This paper isolates the influence of three factors (machine learning method; state vs. frame predictions; separate vs. combined stream predictions) by building a continuum of systems along which only a single factor is varied at a time. We find that replacing decision trees with DNNs and moving from state-level to frame-level predictions both significantly improve listeners' naturalness ratings of synthetic speech produced by the systems. No improvement is found to result from switching from separate-stream to combined-stream predictions.},
  year = {2016},
  month = {March},
  volume = {41},
  pages = {5505--5509},
  address = {Shanghai, China},
  keywords = {speech synthesis, hidden Markov model, decision tree, deep neural network},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/watts2016hmms.pdf},
  booktitle = {Proc. ICASSP},
  categories = {speech synthesis, hidden Markov model, decision tree, deep neural network}
}
@inproceedings{cstr2016blizzard,
  author = {Merritt, Thomas and Ronanki, Srikanth and Wu, Zhizheng and Watts, Oliver},
  title = {The {CSTR} entry to the {Blizzard Challenge} 2016},
  booktitle = {Proc. Blizzard Challenge},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Cstr2016BlizzardEntry.pdf},
  abstract = {This paper describes the text-to-speech system entered by The Centre for Speech Technology Research into the 2016 Blizzard Challenge. This system is a hybrid synthesis system which uses output from a recurrent neural network to drive a unit selection synthesiser. The annual Blizzard Challenge conducts side-by-side testing of a number of speech synthesis systems trained on a common set of speech data. The task of the 2016 Blizzard Challenge is to train on expressively-read children’s storybooks, and to synthesise speech in the same domain. The Challenge therefore presents an opportunity to test the effectiveness of several techniques we have developed when applied to expressive speech data.},
  categories = {hybrid synthesis, statistical parametric speech synthesis, deep neural network, recurrent neural network, unit selection}
}