The Centre for Speech Technology Research, The University of Edinburgh

Publications by Simon King

simonk.bib

@article{Cassia_CSL13,
  author   = {Valentini-Botinhao, C. and Yamagishi, J. and King, S. and Maia, R.},
  title    = {Intelligibility enhancement of {HMM}-generated speech in additive noise by modifying Mel cepstral coefficients to increase the Glimpse Proportion},
  journal  = {Computer Speech and Language},
  volume   = {28},
  number   = {2},
  pages    = {665--686},
  year     = {2014},
  doi      = {10.1016/j.csl.2013.06.001},
  pdf      = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Cassia_CSL14.pdf},
  abstract = {This paper describes speech intelligibility enhancement for hidden Markov model (HMM) generated synthetic speech in noise. We present a method for modifying the Mel cepstral coefficients generated by statistical parametric models that have been trained on plain speech. We update these coefficients such that the Glimpse Proportion – an objective measure of the intelligibility of speech in noise – increases, while keeping the speech energy fixed. An acoustic analysis reveals that the modified speech is boosted in the region 1-4kHz, particularly for vowels, nasals and approximants. Results from listening tests employing speech-shaped noise show that the modified speech is as intelligible as a synthetic voice trained on plain speech whose duration, Mel cepstral coefficients and excitation signal parameters have been adapted to Lombard speech from the same speaker. Our proposed method does not require these additional recordings of Lombard speech. In the presence of a competing talker, both modification and adaptation of spectral coefficients give more modest gains.}
}
@inproceedings{Cassia_IS13,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S. and Stylianou, Y.},
  title = {Combining perceptually-motivated spectral shaping with loudness and duration modification for intelligibility enhancement of {HMM}-based synthetic speech in noise},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2013/Cassia_IS13.pdf},
  abstract = {This paper presents our entry to a speech-in-noise intelligibility enhancement evaluation: the Hurricane Challenge. The system consists of a Text-To-Speech voice manipulated through a combination of enhancement strategies, each of which is known to be individually successful: a perceptually-motivated spectral shaper based on the Glimpse Proportion measure, dynamic range compression, and adaptation to Lombard excitation and duration patterns. We achieved substantial intelligibility improvements relative to unmodified synthetic speech: 4.9 dB in competing speaker and 4.1 dB in speech-shaped noise. An analysis conducted across this and other two similar evaluations shows that the spectral shaper and the compressor (both of which are loudness boosters) contribute most under higher SNR conditions, particularly for speech-shaped noise. Duration and excitation Lombard-adapted changes are more beneficial in lower SNR conditions, and for competing speaker noise.}
}
@inproceedings{Cassia_ICASSP13,
  author = {Valentini-Botinhao, C. and Godoy, E. and Stylianou, Y. and Sauert, B. and King, S. and Yamagishi, J.},
  title = {Improving intelligibility in noise of {HMM}-generated speech via noise-dependent and -independent methods},
  booktitle = {Proc. ICASSP},
  address = {Vancouver, Canada},
  month = {May},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Cassia_ICASSP13.pdf},
  abstract = {In order to improve the intelligibility of HMM-generated Text-to-Speech (TTS) in noise, this work evaluates several speech enhancement methods, exploring combinations of noise-independent and -dependent approaches as well as algorithms previously developed for natural speech. We evaluate one noise-dependent method proposed for TTS, based on the glimpse proportion measure, and three approaches originally proposed for natural speech - one that estimates the noise and is based on the speech intelligibility index, and two noise-independent methods based on different spectral shaping techniques followed by dynamic range compression. We demonstrate how these methods influence the average spectra for different phone classes. We then present results of a listening experiment with speech-shaped noise and a competing speaker. A few methods made the TTS voice even more intelligible than the natural one. Although noise-dependent methods did not improve gains, the intelligibility differences found in distinct noises motivates such dependency.}
}
@inproceedings{Heng13,
  author = {Lu, H. and King, S.},
  title = {Factorized context modelling for text-to-speech synthesis},
  abstract = {Because speech units are so context-dependent, a large number of linguistic context features are generally used by HMM-based Text-to-Speech (TTS) speech synthesis systems, via context-dependent models. Since it is impossible to train separate models for every context, decision trees are used to discover the most important combinations of features that should be modelled. The task of the decision tree is very hard to generalize from a very small observed part of the context feature space to the rest and they have a major weakness: they cannot directly take advantage of factorial properties: they subdivide the model space based on one feature at a time. We propose a Dynamic Bayesian Network (DBN) based Mixed Memory Markov Model (MMMM) to provide factorization of the context space. The results of a listening test are provided as evidence that the model successfully learns the factorial nature of this space.},
  address = {Vancouver, Canada},
  month = {May},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/IC13HengSimon.pdf},
  booktitle = {Proc. ICASSP}
}
@inproceedings{6423522,
  author = {Yang, Chen-Yu and Brown, G. and Lu, Liang and Yamagishi, J. and King, S.},
  doi = {10.1109/ISCSLP.2012.6423522},
  title = {Noise-robust whispered speech recognition using a non-audible-murmur microphone with {VTS} compensation},
  abstract = {In this paper, we introduce a newly-created corpus of whispered speech simultaneously recorded via a close-talking microphone and a non-audible murmur (NAM) microphone in both clean and noisy conditions. To benchmark the corpus, which has been freely released recently, experiments on automatic recognition of continuous whispered speech were conducted. When training and test conditions are matched, the NAM microphone is found to be more robust against background noise than the close-talking microphone. In mismatched conditions (noisy data, models trained on clean speech), we found that Vector Taylor Series (VTS) compensation is particularly effective for the NAM signal.},
  year = {2012},
  booktitle = {Proc. 8th International Symposium on Chinese Spoken Language Processing (ISCSLP)},
  pages = {220--223}
}
@inproceedings{LorenzoAlbayzinProposal2012,
  author = {Lorenzo-Trueba, Jaime and Watts, Oliver and Barra-Chicote, Roberto and Yamagishi, Junichi and King, Simon and Montero, Juan M.},
  title = {{Simple4All} proposals for the {Albayzin} Evaluations in Speech Synthesis},
  abstract = {Simple4All is a European funded project that aims to streamline the production of multilanguage expressive synthetic voices by means of unsupervised data extraction techniques, allowing the automatic processing of freely available data into flexible task-specific voices. In this paper we describe three different approaches for this task, the first two covering enhancements in expressivity and flexibility with the final one focusing on the development of unsupervised voices. The first technique introduces the principle of speaker adaptation from average models consisting of multiple voices, with the second being an extension of this adaptation concept into allowing the control of the expressive strength of the synthetic voice. Finally, an unsupervised approach to synthesis capable of learning from unlabelled text data is introduced in detail.},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/simple4all-proposal.pdf},
  booktitle = {Proc. Iberspeech 2012},
  categories = {Albayzin challenge, expressive speech synthesis}
}
@article{child_speech_journal_2010,
  author = {Watts, O. and Yamagishi, J. and King, S. and Berkling, K.},
  doi = {10.1109/TASL.2009.2035029},
  title = {Synthesis of Child Speech with {HMM} Adaptation and Voice Conversion},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  issn = {1558-7916},
  number = {5},
  abstract = {The synthesis of child speech presents challenges both in the collection of data and in the building of a synthesizer from that data. We chose to build a statistical parametric synthesizer using the hidden Markov model (HMM)-based system HTS, as this technique has previously been shown to perform well for limited amounts of data, and for data collected under imperfect conditions. Six different configurations of the synthesizer were compared, using both speaker-dependent and speaker-adaptive modeling techniques, and using varying amounts of data. For comparison with HMM adaptation, techniques from voice conversion were used to transform existing synthesizers to the characteristics of the target speaker. Speaker-adaptive voices generally outperformed child speaker-dependent voices in the evaluation. HMM adaptation outperformed voice conversion style techniques when using the full target speaker corpus; with fewer adaptation data, however, no significant listener preference for either HMM adaptation or voice conversion methods was found.},
  month = {July},
  volume = {18},
  year = {2010},
  keywords = {HMM adaptation techniques;child speech synthesis;hidden Markov model;speaker adaptive modeling technique;speaker dependent technique;speaker-adaptive voice;statistical parametric synthesizer;target speaker corpus;voice conversion;hidden Markov models;speech synthesis;},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/YamagishiJ_SynthesisofChildSpeech.pdf},
  pages = {1005--1016}
}
@inproceedings{ting_embc13,
  author = {Ting, Chee-Ming and King, Simon and Salleh, Sh-Hussain and Ariff, A. K.},
  title = {Discriminative Tandem Features for {HMM}-based {EEG} Classification},
  abstract = {We investigate the use of discriminative feature extractors in tandem configuration with generative EEG classification system. Existing studies on dynamic EEG classification typically use hidden Markov models (HMMs) which lack discriminative capability. In this paper, a linear and a non-linear classifier are discriminatively trained to produce complementary input features to the conventional HMM system. Two sets of tandem features are derived from linear discriminant analysis (LDA) projection output and multilayer perceptron (MLP) class-posterior probability, before appended to the standard autoregressive (AR) features. Evaluation on a two-class motor-imagery classification task shows that both the proposed tandem features yield consistent gains over the AR baseline, resulting in significant relative improvement of 6.2\% and 11.2\% for the LDA and MLP features respectively. We also explore portability of these features across different subjects.},
  address = {Osaka, Japan},
  month = {July},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/EBMC-2013-Tandem-Features.pdf},
  booktitle = {Proc. 35th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC 13)}
}
@article{Ekpenyong2013,
  author     = {Ekpenyong, Moses and Urua, Eno-Abasi and Watts, Oliver and King, Simon and Yamagishi, Junichi},
  title      = {Statistical Parametric Speech Synthesis for {I}bibio},
  journal    = {Speech Communication},
  volume     = {56},
  pages      = {243--251},
  numpages   = {9},
  month      = {January},
  year       = {2014},
  issue_date = {January, 2014},
  issn       = {0167-6393},
  doi        = {10.1016/j.specom.2013.02.003},
  url        = {http://dx.doi.org/10.1016/j.specom.2013.02.003},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Moses_Ibibio.pdf},
  categories = {HTS, Ibibio, Low-resource languages, Speech synthesis},
  abstract   = {Ibibio is a Nigerian tone language, spoken in the south-east coastal region of Nigeria. Like most African languages, it is resource-limited. This presents a major challenge to conventional approaches to speech synthesis, which typically require the training of numerous predictive models of linguistic features such as the phoneme sequence (i.e., a pronunciation dictionary plus a letter-to-sound model) and prosodic structure (e.g., a phrase break predictor). This training is invariably supervised, requiring a corpus of training data labelled with the linguistic feature to be predicted. In this paper, we investigate what can be achieved in the absence of many of these expensive resources, and also with a limited amount of speech recordings. We employ a statistical parametric method, because this has been found to offer good performance even on small corpora, and because it is able to directly learn the relationship between acoustics and whatever linguistic features are available, potentially mitigating the absence of explicit representations of intermediate linguistic layers such as prosody. We present an evaluation that compares systems that have access to varying degrees of linguistic structure. The simplest system only uses phonetic context (quinphones), and this is compared to systems with access to a richer set of context features, with or without tone marking. It is found that the use of tone marking contributes significantly to the quality of synthetic speech. Future work should therefore address the problem of tone assignment using a dictionary and the building of a prediction module for out-of-vocabulary words.}
}
@inproceedings{sinclair_ICASSP13,
  author = {Sinclair, Mark and King, Simon},
  title = {Where are the challenges in speaker diarization?},
  abstract = {We present a study on the contributions to Diarization Error Rate by the various components of speaker diarization system. Following on from an earlier study by Huijbregts and Wooters, we extend into more areas and draw somewhat different conclusions. From a series of experiments combining real, oracle and ideal system components, we are able to conclude that the primary cause of error in diarization is the training of speaker models on impure data, something that is in fact done in every current system. We conclude by suggesting ways to improve future systems, including a focus on training the speaker models from smaller quantities of pure data instead of all the data, as is currently done.},
  address = {Vancouver, Canada},
  month = {May},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/3512.pdf},
  booktitle = {Proc. ICASSP},
  categories = {speaker diarization, diarization error rate}
}
@inproceedings{frankel07:AF_MLP,
  author = {Frankel, J. and Magimai-Doss, M. and King, S. and Livescu, K. and {\c{C}}etin, {\"O}.},
  title = {Articulatory Feature Classifiers Trained on 2000 hours of Telephone Speech},
  booktitle = {Proc. Interspeech},
  address = {Antwerp, Belgium},
  month = {August},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/frankel_AF-MLP.pdf},
  abstract = {This paper is intended to advertise the public availability of the articulatory feature (AF) classification multi-layer perceptrons (MLPs) which were used in the Johns Hopkins 2006 summer workshop. We describe the design choices, data preparation, AF label generation, and the training of MLPs for feature classification on close to 2000 hours of telephone speech. In addition, we present some analysis of the MLPs in terms of classification accuracy and confusions along with a brief summary of the results obtained during the workshop using the MLPs. We invite interested parties to make use of these MLPs.}
}
@misc{turk2010,
  author = {Turk, Alice and Scobbie, James and Geng, Christian and Campbell, Barry and Dickie, Catherine and Dubourg, Eddie and Bard, Ellen Gurman and Hardcastle, William and Hartinger, Mariam and King, Simon and Lickley, Robin and Macmartin, Cedric and Nakai, Satsuki and Renals, Steve and Richmond, Korin and Schaeffler, Sonja and White, Kevin and Wiegand, Ronny and Wrench, Alan},
  howpublished = {Poster presented at the 12th Conference on Laboratory Phonology, Albuquerque, New Mexico},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ESPF.pdf},
  month = {July},
  year = {2010},
  title = {An {Edinburgh} speech production facility}
}
@article{Wang_JCST2012,
  author = {Wang, Dong and Tejedor, Javier and King, Simon and Frankel, Joe},
  doi = {10.1007/s11390-012-1228-x},
  title = {Term-dependent Confidence Normalization for Out-of-Vocabulary Spoken Term Detection},
  journal = {Journal of Computer Science and Technology},
  number = {2},
  volume = {27},
  year = {2012},
  abstract = {Spoken Term Detection (STD) is a fundamental component of spoken information retrieval systems. A key task of an STD system is to determine reliable detections and reject false alarms based on certain confidence measures. The detection posterior probability, which is often computed from lattices, is a widely used confidence measure. However, a potential problem of this confidence measure is that the confidence scores of detections of all search terms are treated uniformly, regardless of how much they may differ in terms of phonetic or linguistic properties. This problem is particularly evident for out-of-vocabulary (OOV) terms which tend to exhibit high intra-term diversity. To address the discrepancy on confidence levels that the same confidence score may convey for different terms, a term-dependent decision strategy is desirable -- for example, the term-specific threshold (TST) approach. In this work, we propose a term-dependent normalisation technique which compensates for term diversity on confidence estimation. Particularly, we propose a linear bias compensation and a discriminative compensation to deal with the bias problem that is inherent in lattice-based confidence measuring from which the TST approach suffers. We tested the proposed technique on speech data from the multi-party meeting domain with two state-of-the-art STD systems based on phonemes and words respectively. The experimental results demonstrate that the confidence normalisation approach leads to a significant performance improvement in STD, particularly for OOV terms with phoneme-based systems.}
}
@inproceedings{bell_king_shrinkage_is2008,
  author = {Bell, Peter and King, Simon},
  title = {A Shrinkage Estimator for Speech Recognition with Full Covariance {HMM}s},
  booktitle = {Proc. Interspeech},
  address = {Brisbane, Australia},
  month = {September},
  note = {Shortlisted for best student paper award.},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/shrinkage_is2008.pdf},
  abstract = {We consider the problem of parameter estimation in full-covariance Gaussian mixture systems for automatic speech recognition. Due to the high dimensionality of the acoustic feature vector, the standard sample covariance matrix has a high variance and is often poorly-conditioned when the amount of training data is limited. We explain how the use of a shrinkage estimator can solve these problems, and derive a formula for the optimal shrinkage intensity. We present results of experiments on a phone recognition task, showing that the estimator gives a performance improvement over a standard full-covariance system.}
}
@inproceedings{shig042,
  author     = {Shiga, Yoshinori and King, Simon},
  title      = {Source-Filter Separation for Articulation-to-Speech Synthesis},
  booktitle  = {Proc. ICSLP},
  address    = {Jeju, Korea},
  month      = {October},
  year       = {2004},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.ps},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh},
  abstract   = {In this paper we examine a method for separating out the vocal-tract filter response from the voice source characteristic using a large articulatory database. The method realises such separation for voiced speech using an iterative approximation procedure under the assumption that the speech production process is a linear system composed of a voice source and a vocal-tract filter, and that each of the components is controlled independently by different sets of factors. Experimental results show that the spectral variation is evidently influenced by the fundamental frequency or the power of speech, and that the tendency of the variation may be related closely to speaker identity. The method enables independent control over the voice source characteristic in our articulation-to-speech synthesis.}
}
@inproceedings{jyamagis07:avss2006,
  author     = {Yamagishi, Junichi and Kobayashi, Takao and Renals, Steve and King, Simon and Zen, Heiga and Toda, Tomoki and Tokuda, Keiichi},
  title      = {Improved Average-Voice-based Speech Synthesis Using Gender-Mixed Modeling and a Parameter Generation Algorithm Considering {GV}},
  booktitle  = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  month      = {August},
  year       = {2007},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6-yamagishi.pdf},
  categories = {HMM, speech synthesis, speaker adaptation, HTS},
  abstract   = {For constructing a speech synthesis system which can achieve diverse voices, we have been developing a speaker independent approach of HMM-based speech synthesis in which statistical average voice models are adapted to a target speaker using a small amount of speech data. In this paper, we incorporate a high-quality speech vocoding method STRAIGHT and a parameter generation algorithm with global variance into the system for improving quality of synthetic speech. Furthermore, we introduce a feature-space speaker adaptive training algorithm and a gender mixed modeling technique for conducting further normalization of the average voice model. We build an English text-to-speech system using these techniques and show the performance of the system.}
}
@inproceedings{king:portele:hoefer:eurospeech1997,
  author = {King, Simon and Portele, Thomas and H{\"o}fer, Florian},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/King_Portele_Hoefer_eurospeech1997.ps},
  title = {Speech synthesis using non-uniform units in the {Verbmobil} project},
  booktitle = {Proc. {Eurospeech} 97},
  address = {Rhodes, Greece},
  month = {September},
  volume = {2},
  pages = {569--572},
  year = {1997},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/King_Portele_Hoefer_eurospeech1997.pdf},
  abstract = {We describe a concatenative speech synthesiser for British English which uses the HADIFIX inventory structure originally developed for German by Portele. An inventory of non-uniform units was investigated with the aim of improving segmental quality compared to diphones. A combination of soft (diphone) and hard concatenation was used, which allowed a dramatic reduction in inventory size. We also present a unit selection algorithm which selects an optimum sequence of units from this inventory for a given phoneme sequence. The work described is part of the concept-to-speech synthesiser for the language and speech project Verbmobil which is funded by the German Ministry of Science (BMBF).},
  categories = {}
}
@inproceedings{isard:king:taylor:kowtko:snowbird95,
  author     = {Isard, Stephen and King, Simon and Taylor, Paul A. and Kowtko, Jacqueline},
  title      = {Prosodic Information in a Speech Recognition System intended for Dialogue},
  booktitle  = {IEEE Workshop in speech recognition},
  address    = {Snowbird, Utah},
  year       = {1995},
  categories = {},
  abstract   = {We report on an automatic speech recognition system intended for use in dialogue, whose original aspect is its use of prosodic information for two different purposes. The first is to improve the word level accuracy of the system. The second is to constrain the language model applied to a given utterance by taking into account the way that dialogue context and intonational tune interact to limit the possibilities for what an utterance might be.}
}
@inproceedings{hengluIS2012,
  author = {Lu, Heng and King, Simon},
  title = {Using {Bayesian} Networks to find relevant context features for {HMM}-based speech synthesis},
  abstract = {Speech units are highly context-dependent, so taking contextual features into account is essential for speech modelling. Context is employed in HMM-based Text-to-Speech speech synthesis systems via context-dependent phone models. A very wide context is taken into account, represented by a large set of contextual factors. However, most of these factors probably have no significant influence on the speech, most of the time. To discover which combinations of features should be taken into account, decision tree-based context clustering is used. But the space of context-dependent models is vast, and the number of contexts seen in the training data is only a tiny fraction of this space, so the task of the decision tree is very hard: to generalise from observations of a tiny fraction of the space to the rest of the space, whilst ignoring uninformative or redundant context features. The structure of the context feature space has not been systematically studied for speech synthesis. In this paper we discover a dependency structure by learning a Bayesian Network over the joint distribution of the features and the speech. We demonstrate that it is possible to discard the majority of context features with minimal impact on quality, measured by a perceptual test.},
  address = {Portland, Oregon, USA},
  month = {September},
  year = {2012},
  keywords = {HMM-based speech synthesis, Bayesian Networks, context information},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/HengLuSimonKing.pdf},
  booktitle = {Proc. Interspeech},
  categories = {HMM-based speech synthesis, Bayesian Networks, context information}
}
@inproceedings{wang_acmsccs2010,
  author     = {Wang, Dong and King, Simon and Evans, Nick and Troncy, Raphael},
  title      = {Direct Posterior Confidence For Out-of-Vocabulary Spoken Term Detection},
  booktitle  = {Proc. ACM Multimedia 2010 Searching Spontaneous Conversational Speech Workshop},
  month      = {October},
  year       = {2010},
  doi        = {10.1145/1878101.1878107},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang_acmsccs2010.pdf},
  categories = {confidence estimation, spoken term detection, speech recognition},
  abstract   = {Spoken term detection (STD) is a fundamental task in spoken information retrieval. Compared to conventional speech transcription and keyword spotting, STD is an open-vocabulary task and is necessarily required to address out-of-vocabulary (OOV) terms. Approaches based on subword units, e.g. phonemes, are widely used to solve the OOV issue; however, performance on OOV terms is still significantly inferior to that for in-vocabulary (INV) terms. The performance degradation on OOV terms can be attributed to a multitude of factors. A particular factor we address in this paper is that the acoustic and language models used for speech transcribing are highly vulnerable to OOV terms, which leads to unreliable confidence measures and error-prone detections. A direct posterior confidence measure that is derived from discriminative models has been proposed for STD. In this paper, we utilize this technique to tackle the weakness of OOV terms in confidence estimation. Neither acoustic models nor language models being included in the computation, the new confidence avoids the weak modeling problem with OOV terms. Our experiments, set up on multi-party meeting speech which is highly spontaneous and conversational, demonstrate that the proposed technique improves STD performance on OOV terms significantly; when combined with conventional lattice-based confidence, a significant improvement in performance is obtained on both INVs and OOVs. Furthermore, the new confidence measure technique can be combined together with other advanced techniques for OOV treatment, such as stochastic pronunciation modeling and term-dependent confidence discrimination, which leads to an integrated solution for OOV STD with greatly improved performance.}
}
@inproceedings{vepa-king-isca04,
  author     = {Vepa, J. and King, S.},
  title      = {Subjective evaluation of join cost and smoothing methods},
  booktitle  = {Proc. 5th {ISCA} speech synthesis workshop},
  address    = {Pittsburgh, USA},
  month      = {June},
  year       = {2004},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/vepa_tts04.pdf},
  categories = {join cost, Kalman filter, smoothing, evaluation, rVoice, edinburgh},
  abstract   = {In our previous papers, we have proposed join cost functions derived from spectral distances, which have good correlations with perceptual scores obtained for a range of concatenation discontinuities. To further validate their ability to predict concatenation discontinuities, we have chosen the best three spectral distances and evaluated them subjectively in a listening test. The units for synthesis stimuli are obtained from a state-of-the-art unit selection text-to-speech system: `rVoice' from Rhetorical Systems Ltd. We also compared three different smoothing methods in this listening test. In this paper, we report listeners' preferences for each join costs in combination with each smoothing method.}
}
@inproceedings{salomon:king:osborne:icslp2002,
  author = {Salomon, Jesper and King, Simon and Osborne, Miles},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Salomon_King_Osborne_icslp2002.ps},
  title = {Framewise phone classification using support vector machines},
  booktitle = {Proc. ICSLP},
  address = {Denver, USA},
  year = {2002},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Salomon_King_Osborne_icslp2002.pdf},
  abstract = {We describe the use of Support Vector Machines for phonetic classification on the TIMIT corpus. Unlike previous work, in which entire phonemes are classified, our system operates in a \textit{framewise} manner and is intended for use as the front-end of a hybrid system similar to ABBOT. We therefore avoid the problems of classifying variable-length vectors. Our frame-level phone classification accuracy on the complete TIMIT test set is competitive with other results from the literature. In addition, we address the serious problem of \textit{scaling} Support Vector Machines by using the Kernel Fisher Discriminant.},
  categories = {}
}
@inproceedings{Wang_TOIS2012,
  internal-note = {NOTE(review): appears to describe the same work as entry wang_acmsccs2010 (identical DOI, title and abstract; month differs) -- confirm and consolidate.},
  author = {Wang, Dong and King, Simon and Evans, Nicholas W. D. and Troncy, Rapha{\"e}l},
  doi = {10.1145/1878101.1878107},
  title = {Direct posterior confidence for out-of-vocabulary spoken term detection},
  booktitle = {{SSCS} 2010, {ACM} Workshop on Searching Spontaneous Conversational Speech, September 20--24, 2010, Firenze, Italy},
  address = {Firenze, Italy},
  month = {September},
  year = {2010},
  abstract = {Spoken term detection (STD) is a fundamental task in spoken information retrieval. Compared to conventional speech transcription and keyword spotting, STD is an open-vocabulary task and is necessarily required to address out-of-vocabulary (OOV) terms. Approaches based on subword units, e.g. phonemes, are widely used to solve the OOV issue; however, performance on OOV terms is still significantly inferior to that for in-vocabulary (INV) terms. The performance degradation on OOV terms can be attributed to a multitude of factors. A particular factor we address in this paper is that the acoustic and language models used for speech transcribing are highly vulnerable to OOV terms, which leads to unreliable confidence measures and error-prone detections. A direct posterior confidence measure that is derived from discriminative models has been proposed for STD. In this paper, we utilize this technique to tackle the weakness of OOV terms in confidence estimation. Neither acoustic models nor language models being included in the computation, the new confidence avoids the weak modeling problem with OOV terms. Our experiments, set up on multi-party meeting speech which is highly spontaneous and conversational, demonstrate that the proposed technique improves STD performance on OOV terms significantly; when combined with conventional lattice-based confidence, a significant improvement in performance is obtained on both INVs and OOVs. Furthermore, the new confidence measure technique can be combined together with other advanced techniques for OOV treatment, such as stochastic pronunciation modeling and term-dependent confidence discrimination, which leads to an integrated solution for OOV STD with greatly improved performance.}
}
@inproceedings{wang:frankel:tejedor:king:icassp2008,
  author = {Wang, Dong and Frankel, Joe and Tejedor, Javier and King, Simon},
  doi = {10.1109/ICASSP.2008.4518773},
  title = {A comparison of phone and grapheme-based spoken term detection},
  booktitle = {Proc. ICASSP},
  month = {March},
  pages = {4969--4972},
  year = {2008},
  abstract = {We propose grapheme-based sub-word units for spoken term detection (STD). Compared to phones, graphemes have a number of potential advantages. For out-of-vocabulary search terms, phone-based approaches must generate a pronunciation using letter-to-sound rules. Using graphemes obviates this potentially error-prone hard decision, shifting pronunciation modelling into the statistical models describing the observation space. In addition, long-span grapheme language models can be trained directly from large text corpora. We present experiments on Spanish and English data, comparing phone and grapheme-based STD. For Spanish, where phone and grapheme-based systems give similar transcription word error rates (WERs), grapheme-based STD significantly outperforms a phone-based approach. The converse is found for English, where the phone-based system outperforms a grapheme approach. However, we present additional analysis which suggests that phone-based STD performance levels may be achieved by a grapheme-based approach despite lower transcription accuracy, and that the two approaches may usefully be combined. We propose a number of directions for future development of these ideas, and suggest that if grapheme-based STD can match phone-based performance, the inherent flexibility in dealing with out-of-vocabulary terms makes this a desirable approach.}
}
@inproceedings{vepa-king-taylor_icslp02,
  title      = {Objective Distance Measures for Spectral Discontinuities in Concatenative Speech Synthesis},
  author     = {Vepa, J. and King, S. and Taylor, P.},
  booktitle  = {Proc. {ICSLP}},
  year       = {2002},
  month      = {September},
  address    = {Denver, USA},
  categories = {join cost, distance measures, MCA, rVoice, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/vepa_icslp02.pdf},
  abstract   = {In unit selection based concatenative speech systems, `join cost', which measures how well two units can be joined together, is one of the main criteria for selecting appropriate units from the inventory. The ideal join cost will measure `perceived' discontinuity, based on easily measurable spectral properties of the units being joined, in order to ensure smooth and natural-sounding synthetic speech. In this paper we report a perceptual experiment conducted to measure the correlation between `subjective' human perception and various `objective' spectrally-based measures proposed in the literature. Our experiments used a state-of-the-art unit-selection text-to-speech system: `rVoice' from Rhetorical Systems Ltd.}
}
@incollection{king:gold_and_morgan_chapter2009,
  editor = {Morgan, Nelson and Ellis, Daniel P. W.},
  author = {King, Simon},
  publisher = {Wiley},
  title = {Speech Synthesis},
  booktitle = {Speech and Audio Signal Processing},
  year = {2010},
  abstract = {No abstract (this is a book chapter)},
  categories = {speech synthesis}
}
@inproceedings{CassiaICASSP12,
  author = {Valentini-Botinhao, C. and Maia, R. and Yamagishi, J. and King, S. and Zen, H.},
  doi = {10.1109/ICASSP.2012.6288794},
  title = {Cepstral analysis based on the {Glimpse} proportion measure for improving the intelligibility of {HMM}-based synthetic speech in noise},
  booktitle = {Proc. ICASSP},
  address = {Kyoto, Japan},
  month = {March},
  pages = {3997--4000},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_ICASSP12.pdf},
  abstract = {In this paper we introduce a new cepstral coefficient extraction method based on an intelligibility measure for speech in noise, the Glimpse Proportion measure. This new method aims to increase the intelligibility of speech in noise by modifying the clean speech, and has applications in scenarios such as public announcement and car navigation systems. We first explain how the Glimpse Proportion measure operates and further show how we approximated it to integrate it into an existing spectral envelope parameter extraction method commonly used in the HMM-based speech synthesis framework. We then demonstrate how this new method changes the modelled spectrum according to the characteristics of the noise and show results for a listening test with vocoded and HMM-based synthetic speech. The test indicates that the proposed method can significantly improve intelligibility of synthetic speech in speech shaped noise.},
  categories = {HMM-based speech synthesis, intelligibility enhancement, speech analysis}
}
@article{roberto:specom2010,
  author = {Barra-Chicote, R. and Yamagishi, J. and King, S. and Montero, J. Manuel and Macias-Guarasa, J.},
  doi = {10.1016/j.specom.2009.12.007},
  title = {Analysis of Statistical Parametric and Unit-Selection Speech Synthesis Systems Applied to Emotional Speech},
  journal = {Speech Communication},
  number = {5},
  month = {May},
  volume = {52},
  pages = {394--404},
  year = {2010},
  keywords = {Emotional speech synthesis; HMM-based synthesis; Unit selection},
  abstract = {We have applied two state-of-the-art speech synthesis techniques (unit selection and HMM-based synthesis) to the synthesis of emotional speech. A series of carefully designed perceptual tests to evaluate speech quality, emotion identification rates and emotional strength were used for the six emotions which we recorded -- happiness, sadness, anger, surprise, fear, disgust. For the HMM-based method, we evaluated spectral and source components separately and identified which components contribute to which emotion. Our analysis shows that, although the HMM method produces significantly better neutral speech, the two methods produce emotional speech of similar quality, except for emotions having context-dependent prosodic patterns. Whilst synthetic speech produced using the unit selection method has better emotional strength scores than the HMM-based method, the HMM-based method has the ability to manipulate the emotional strength. For emotions that are characterized by both spectral and prosodic components, synthetic speech using unit selection methods was more accurately identified by listeners. For emotions mainly characterized by prosodic components, HMM-based synthetic speech was more accurately identified. This finding differs from previous results regarding listener judgements of speaker similarity for neutral speech. We conclude that unit selection methods require improvements to prosodic modeling and that HMM-based methods require improvements to spectral modeling for emotional speech. Certain emotions cannot be reproduced well by either method.}
}
@article{Taylor_1998_b,
  author = {Taylor, Paul A. and King, S. and Isard, S. D. and Wright, H.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/Taylor_1998_b.ps},
  title = {Intonation and Dialogue Context as Constraints for Speech Recognition},
  journal = {Language and Speech},
  number = {3},
  volume = {41},
  year = {1998},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/Taylor_1998_b.pdf},
  pages = {493--512},
  categories = {asr, intonation, dialogue, lm, id4s}
}
@inproceedings{5947571,
  author = {Andraszewicz, S. and Yamagishi, J. and King, S.},
  doi = {10.1109/ICASSP.2011.5947571},
  title = {Vocal attractiveness of statistical speech synthesisers},
  booktitle = {Proc. ICASSP},
  issn = {1520-6149},
  month = {May},
  pages = {5368--5371},
  year = {2011},
  keywords = {speaker-adaptive HMM-based speech synthesis methods;speaker-dependent voices;statistical speech synthesisers;vocal attractiveness;hidden Markov models;speaker recognition;speech synthesis;},
  abstract = {Our previous analysis of speaker-adaptive HMM-based speech synthesis methods suggested that there are two possible reasons why average voices can obtain higher subjective scores than any individual adapted voice: 1) model adaptation degrades speech quality proportionally to the distance 'moved' by the transforms, and 2) psychoacoustic effects relating to the attractiveness of the voice. This paper is a follow-on from that analysis and aims to separate these effects out. Our latest perceptual experiments focus on attractiveness, using average voices and speaker-dependent voices without model transformation, and show that using several speakers to create a voice improves smoothness (measured by Harmonics-to-Noise Ratio), reduces distance from the average voice in the log F0-F1 space of the final voice and hence makes it more attractive at the segmental level. However, this is weakened or overridden at supra-segmental or sentence levels.}
}
@article{frankel07:factoring,
  author = {Frankel, J. and King, S.},
  doi = {10.1016/j.patrec.2007.07.008},
  title = {Factoring {Gaussian} Precision Matrices for Linear Dynamic Models},
  journal = {Pattern Recognition Letters},
  number = {16},
  month = {December},
  volume = {28},
  pages = {2264--2272},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_LDM_covar.pdf},
  abstract = {The linear dynamic model (LDM), also known as the Kalman filter model, has been the subject of research in the engineering, control, and more recently, machine learning and speech technology communities. The Gaussian noise processes are usually assumed to have diagonal, or occasionally full, covariance matrices. A number of recent papers have considered modelling the precision rather than covariance matrix of a Gaussian distribution, and this work applies such ideas to the LDM. A Gaussian precision matrix P can be factored into the form P = UTSU where U is a transform and S a diagonal matrix. By varying the form of U, the covariance can be specified as being diagonal or full, or used to model a given set of spatial dependencies. Furthermore, the transform and scaling components can be shared between models, allowing richer distributions with only marginally more parameters than required to specify diagonal covariances. The method described in this paper allows the construction of models with an appropriate number of parameters for the amount of available training data. We provide illustrative experimental results on synthetic and real speech data in which models with factored precision matrices and automatically-selected numbers of parameters are as good as or better than models with diagonal covariances on small data sets and as good as models with full covariance matrices on larger data sets.},
  categories = {LDM}
}
@inproceedings{king:stephenson:isard:taylor:strachan:icslp1998,
  author = {King, Simon and Stephenson, Todd and Isard, Stephen and Taylor, Paul and Strachan, Alex},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/King_Stephenson_Isard_Taylor_Strachan_icslp1998.ps},
  title = {Speech Recognition via Phonetically Featured Syllables},
  booktitle = {Proc. {ICSLP} '98},
  address = {Sydney, Australia},
  month = {December},
  pages = {1031--1034},
  year = {1998},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/King_Stephenson_Isard_Taylor_Strachan_icslp1998.pdf},
  abstract = {We describe a speech recogniser which uses a speech production-motivated phonetic-feature description of speech. We argue that this is a natural way to describe the speech signal and offers an efficient intermediate parameterisation for use in speech recognition. We also propose to model this description at the syllable rather than phone level. The ultimate goal of this work is to generate syllable models whose parameters explicitly describe the trajectories of the phonetic features of the syllable. We hope to move away from Hidden Markov Models (HMMs) of context-dependent phone units. As a step towards this, we present a preliminary system which consists of two parts: recognition of the phonetic features from the speech signal using a neural network; and decoding of the feature-based description into phonemes using HMMs.},
  categories = {asr}
}
@inproceedings{cetin07:crosslingual,
  author = {{\c{C}}etin, {\"O}. and Magimai-Doss, M. and Kantor, A. and King, S. and Bartels, C. and Frankel, J. and Livescu, K.},
  title = {Monolingual and crosslingual comparison of tandem features derived from articulatory and phone {MLP}s},
  booktitle = {Proc. ASRU},
  address = {Kyoto},
  month = {December},
  year = {2007},
  organization = {IEEE},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Cetin_etal_ASRU2007.pdf},
  abstract = {In recent years, the features derived from posteriors of a multilayer perceptron (MLP), known as tandem features, have proven to be very effective for automatic speech recognition. Most tandem features to date have relied on MLPs trained for phone classification. We recently showed on a relatively small data set that MLPs trained for articulatory feature classification can be equally effective. In this paper, we provide a similar comparison using MLPs trained on a much larger data set - 2000 hours of English conversational telephone speech. We also explore how portable phone- and articulatory feature-based tandem features are in an entirely different language - Mandarin - without any retraining. We find that while phone-based features perform slightly better in the matched-language condition, they perform significantly better in the cross-language condition. Yet, in the cross-language condition, neither approach is as effective as the tandem features extracted from an MLP trained on a relatively small amount of in-domain data. Beyond feature concatenation, we also explore novel observation modelling schemes that allow for greater flexibility in combining the tandem and standard features at hidden Markov model (HMM) outputs.}
}
@inproceedings{frankel01:alternative,
  title      = {Speech recognition in the articulatory domain: investigating an alternative to acoustic {HMM}s},
  author     = {Frankel, J. and King, S.},
  booktitle  = {Proc. Workshop on Innovations in Speech Processing},
  year       = {2001},
  month      = {April},
  categories = {am,artic,asr,ldm,mocha,edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_WISP2001.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_WISP2001.ps},
  abstract   = {We describe a speech recognition system which uses a combination of acoustic and articulatory features as input. Linear dynamic models capture the trajectories which characterize each segment type. We describe classification and recognition tasks for systems based on acoustic data in conjunction with both real and automatically recovered articulatory parameters.}
}
@article{5510125,
  author = {Wang, D. and King, S. and Frankel, J.},
  doi = {10.1109/TASL.2010.2058800},
  title = {Stochastic Pronunciation Modelling for Out-of-Vocabulary Spoken Term Detection},
  journal = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  issn = {1558-7916},
  number = {99},
  month = {July},
  volume = {PP},
  year = {2010},
  abstract = {Spoken term detection (STD) is the name given to the task of searching large amounts of audio for occurrences of spoken terms, which are typically single words or short phrases. One reason that STD is a hard task is that search terms tend to contain a disproportionate number of out-of-vocabulary (OOV) words. The most common approach to STD uses subword units. This, in conjunction with some method for predicting pronunciations of OOVs from their written form, enables the detection of OOV terms but performance is considerably worse than for in-vocabulary terms. This performance differential can be largely attributed to the special properties of OOVs. One such property is the high degree of uncertainty in the pronunciation of OOVs. We present a stochastic pronunciation model (SPM) which explicitly deals with this uncertainty. The key insight is to search for all possible pronunciations when detecting an OOV term, explicitly capturing the uncertainty in pronunciation. This requires a probabilistic model of pronunciation, able to estimate a distribution over all possible pronunciations. We use a joint-multigram model (JMM) for this and compare the JMM-based SPM with the conventional soft match approach. Experiments using speech from the meetings domain demonstrate that the SPM performs better than soft match in most operating regions, especially at low false alarm probabilities. Furthermore, SPM and soft match are found to be complementary: their combination provides further performance gains.},
  categories = {confidence estimation, spoken term detection, speech recognition, OOVs}
}
@inproceedings{vepa_king_icslp2004,
  author = {Vepa, Jithendra and King, Simon},
  title = {Subjective Evaluation Of Join Cost Functions Used In Unit Selection Speech Synthesis},
  booktitle = {Proc. 8th International Conference on Spoken Language Processing (ICSLP)},
  address = {Jeju, Korea},
  month = {October},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/vepa_king_icslp2004.pdf},
  abstract = {In our previous papers, we have proposed join cost functions derived from spectral distances, which have good correlations with perceptual scores obtained for a range of concatenation discontinuities. To further validate their ability to predict concatenation discontinuities, we have chosen the best three spectral distances and evaluated them subjectively in a listening test. The unit sequences for synthesis stimuli are obtained from a state-of-the-art unit selection text-to-speech system: rVoice from Rhetorical Systems Ltd. In this paper, we report listeners preferences for each of the three join cost functions.}
}
@inproceedings{Ayletetal09,
  author = {Aylett, Matthew P. and King, Simon and Yamagishi, Junichi},
  title = {Speech Synthesis Without a Phone Inventory},
  booktitle = {Proc. Interspeech},
  pages = {2087--2090},
  address = {Brighton, UK},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/ma_interspeech09.pdf},
  abstract = {In speech synthesis the unit inventory is decided using phonological and phonetic expertise. This process is resource intensive and potentially sub-optimal. In this paper we investigate how acoustic clustering, together with lexicon constraints, can be used to build a self-organised inventory. Six English speech synthesis systems were built using two frameworks, unit selection and parametric HTS for three inventory conditions: 1) a traditional phone set, 2) a system using orthographic units, and 3) a self-organised inventory. A listening test showed a strong preference for the classic system, and for the orthographic system over the self-organised system. Results also varied by letter to sound complexity and database coverage. This suggests the self-organised approach failed to generalise pronunciation as well as introducing noise above and beyond that caused by orthographic sound mismatch.},
  categories = {speech synthesis, unit selection, parametric synthesis, phone inventory, orthographic synthesis}
}
@inproceedings{clark:podsiadlo:mayo:king:blizzard2007,
  title      = {Statistical Analysis of the {B}lizzard {C}hallenge 2007 Listening Test Results},
  author     = {Clark, Robert A. J. and Podsiadlo, Monika and Fraser, Mark and Mayo, Catherine and King, Simon},
  booktitle  = {Proc. Blizzard 2007 (in Proc. Sixth {ISCA} Workshop on Speech Synthesis)},
  year       = {2007},
  month      = {August},
  address    = {Bonn, Germany},
  keywords   = {Blizzard},
  categories = {blizzard,listening test},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_003.pdf},
  abstract   = {Blizzard 2007 is the third Blizzard Challenge, in which participants build voices from a common dataset. A large listening test is conducted which allows comparison of systems in terms of naturalness and intelligibility. New sections were added to the listening test for 2007 to test the perceived similarity of the speaker's identity between natural and synthetic speech. In this paper, we present the results of the listening test and the subsequent statistical analysis.}
}
@article{frankel07:AF_DBN,
  author = {Frankel, J. and Wester, M. and King, S.},
  title = {Articulatory feature recognition using dynamic {Bayesian} networks},
  journal = {Computer Speech \& Language},
  number = {4},
  month = {October},
  volume = {21},
  pages = {620--640},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_etal_CSL2007.pdf},
  abstract = {We describe a dynamic Bayesian network for articulatory feature recognition. The model is intended to be a component of a speech recognizer that avoids the problems of conventional ``beads-on-a-string'' phoneme-based models. We demonstrate that the model gives superior recognition of articulatory features from the speech signal compared with a state-of-the-art neural network system. We also introduce a training algorithm that offers two major advances: it does not require time-aligned feature labels and it allows the model to learn a set of asynchronous feature changes in a data-driven manner.}
}
@inproceedings{dongwang_interspeech09_spm,
  title      = {Stochastic Pronunciation Modelling for Spoken Term Detection},
  author     = {Wang, Dong and King, Simon and Frankel, Joe},
  booktitle  = {Proc. Interspeech},
  pages      = {2135--2138},
  year       = {2009},
  month      = {September},
  address    = {Brighton, UK},
  categories = {joint-multigram, pronunciation model, spoken term detection, speech recognition},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/spm.pdf},
  abstract   = {A major challenge faced by a spoken term detection (STD) system is the detection of out-of-vocabulary (OOV) terms. Although a subword-based STD system is able to detect OOV terms, performance reduction is always observed compared to in-vocabulary terms. Current approaches to STD do not acknowledge the particular properties of OOV terms, such as pronunciation uncertainty. In this paper, we use a stochastic pronunciation model to deal with the uncertain pronunciations of OOV terms. By considering all possible term pronunciations, predicted by a joint-multigram model, we observe a significant performance improvement.}
}
@inproceedings{watts_yamagishi_king_2011,
  title     = {Unsupervised continuous-valued word features for phrase-break prediction without a part-of-speech tagger},
  author    = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  booktitle = {Proc. Interspeech},
  pages     = {2157--2160},
  year      = {2011},
  month     = {August},
  address   = {Florence, Italy},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
  abstract  = {Part of speech (POS) tags are foremost among the features conventionally used to predict intonational phrase-breaks for text to speech (TTS) conversion. The construction of such systems therefore presupposes the availability of a POS tagger for the relevant language, or of a corpus manually tagged with POS. However, such tools and resources are not available in the majority of the world’s languages, and manually labelling text with POS tags is an expensive and time-consuming process. We therefore propose the use of continuous-valued features that summarise the distributional characteristics of word types as surrogates for POS features. Importantly, such features are obtained in an unsupervised manner from an untagged text corpus. We present results on the phrase-break prediction task, where use of the features closes the gap in performance between a baseline system (using only basic punctuation-related features) and a topline system (incorporating a state-of-the-art POS tagger).}
}
@inproceedings{Cassia_IS11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and King, Simon},
  title = {Can Objective Measures Predict the Intelligibility of Modified {HMM}-based Synthetic Speech in Noise?},
  booktitle = {Proc. Interspeech},
  address = {Florence, Italy},
  month = {August},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_IS11.pdf},
  abstract = {Synthetic speech can be modified to improve intelligibility in noise. In order to perform modifications automatically, it would be useful to have an objective measure that could predict the intelligibility of modified synthetic speech for human listeners. We analysed the impact on intelligibility – and on how well objective measures predict it – when we separately modify speaking rate, fundamental frequency, line spectral pairs and spectral peaks. Shifting LSPs can increase intelligibility for human listeners; other modifications had weaker effects. Among the objective measures we evaluated, the Dau model and the Glimpse proportion were the best predictors of human performance.},
  categories = {HMM-based speech synthesis, objective measures of intelligibility}
}
@inproceedings{richmond2011a,
  title      = {Announcing the Electromagnetic Articulography (Day 1) Subset of the mngu0 Articulatory Corpus},
  author     = {Richmond, Korin and Hoole, Phil and King, Simon},
  booktitle  = {Proc. Interspeech},
  pages      = {1505--1508},
  year       = {2011},
  month      = {August},
  address    = {Florence, Italy},
  categories = {articulography, corpus, EMA},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110767.pdf},
  abstract   = {This paper serves as an initial announcement of the availability of a corpus of articulatory data called mngu0. This corpus will ultimately consist of a collection of multiple sources of articulatory data acquired from a single speaker: electromagnetic articulography (EMA), audio, video, volumetric MRI scans, and 3D scans of dental impressions. This data will be provided free for research use. In this first stage of the release, we are making available one subset of EMA data, consisting of more than 1,300 phonetically diverse utterances recorded with a Carstens AG500 electromagnetic articulograph. Distribution of mngu0 will be managed by a dedicated ``forum-style'' web site. This paper both outlines the general goals motivating the distribution of the data and the creation of the mngu0 web forum, and also provides a description of the EMA data contained in this initial release.}
}
@article{goubanova:king:specom2008,
  author = {Goubanova, Olga and King, Simon},
  doi = {10.1016/j.specom.2007.10.002},
  title = {{Bayesian} networks for phone duration prediction},
  journal = {Speech Communication},
  number = {4},
  month = {April},
  volume = {50},
  pages = {301--311},
  year = {2008},
  abstract = {In a text-to-speech system, the duration of each phone may be predicted by a duration model. This model is usually trained using a database of phones with known durations; each phone (and the context it appears in) is characterised by a feature vector that is composed of a set of linguistic factor values. We describe the use of a graphical model -- a Bayesian network -- for predicting the duration of a phone, given the values for these factors. The network has one discrete variable for each of the linguistic factors and a single continuous variable for the phone's duration. Dependencies between variables (or the lack of them) are represented in the BN structure by arcs (or missing arcs) between pairs of nodes. During training, both the topology of the network and its parameters are learned from labelled data. We compare the results of the BN model with results for sums of products and CART models on the same data. In terms of the root mean square error, the BN model performs much better than both CART and SoP models. In terms of correlation coefficient, the BN model performs better than the SoP model, and as well as the CART model. A BN model has certain advantages over CART and SoP models. Training SoP models requires a high degree of expertise. CART models do not deal with interactions between factors in any explicit way. As we demonstrate, a BN model can also make accurate predictions of a phone's duration, even when the values for some of the linguistic factors are unknown.},
  categories = {Text-to-speech; Bayesian networks; Duration modelling; Sums of products; Classification and regression trees}
}
@inproceedings{Aylett+King08,
  author = {Aylett, Matthew P. and King, Simon},
  title = {Single Speaker Segmentation and Inventory Selection Using Dynamic Time Warping Self Organization and Joint Multigram Mapping},
  booktitle = {SSW06},
  pages = {258--263},
  address = {Bonn, Germany},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ssw06.pdf},
  abstract = {In speech synthesis the inventory of units is decided by inspection and on the basis of phonological and phonetic expertise. The ephone (or emergent phone) project at CSTR is investigating how self organisation techniques can be applied to build an inventory based on collected acoustic data together with the constraints of a synthesis lexicon. In this paper we will describe a prototype inventory creation method using dynamic time warping (DTW) for acoustic clustering and a joint multigram approach for relating a series of symbols that represent the speech to these emerged units. We initially examined two symbol sets: 1) A baseline of standard phones 2) Orthographic symbols. The success of the approach is evaluated by comparing word boundaries generated by the emergent phones against those created using state-of-the-art HMM segmentation. Initial results suggest the DTW segmentation can match word boundaries with a root mean square error (RMSE) of 35ms. Results from mapping units onto phones resulted in a higher RMSE of 103ms. This error was increased when multiple multigram types were added and when the default unit clustering was altered from 40 (our baseline) to 10. Results for orthographic matching had a higher RMSE of 125ms. To conclude we discuss future work that we believe can reduce this error rate to a level sufficient for the techniques to be applied to a unit selection synthesis system.},
  categories = {speech synthesis, unit selection, parametric synthesis, phone inventory, orthographic synthesis}
}
@inproceedings{Cetin07:tandem,
  author = {{\c{C}}etin, {\"O}. and Kantor, A. and King, S. and Bartels, C. and Magimai-Doss, M. and Frankel, J. and Livescu, K.},
  title = {An articulatory feature-based tandem approach and factored observation modeling},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  month = {April},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Cetin_icassp07_tandem.pdf},
  abstract = {The so-called tandem approach, where the posteriors of a multilayer perceptron (MLP) classifier are used as features in an automatic speech recognition (ASR) system has proven to be a very effective method. Most tandem approaches up to date have relied on MLPs trained for phone classification, and appended the posterior features to some standard feature hidden Markov model (HMM). In this paper, we develop an alternative tandem approach based on MLPs trained for articulatory feature (AF) classification. We also develop a factored observation model for characterizing the posterior and standard features at the HMM outputs, allowing for separate hidden mixture and state-tying structures for each factor. In experiments on a subset of Switchboard, we show that the AF-based tandem approach is as effective as the phone-based approach, and that the factored observation model significantly outperforms the simple feature concatenation approach while using fewer parameters.}
}
@inproceedings{gillett:king:eurospeech2003b,
  author     = {Gillett, Ben and King, Simon},
  title      = {Transforming {F0} Contours},
  booktitle  = {Proc. {E}urospeech},
  year       = {2003},
  month      = {September},
  address    = {Geneva},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Gillett_King_eurospeech2003a.pdf},
  categories = {},
  abstract   = {Voice transformation is the process of transforming the characteristics of speech uttered by a source speaker, such that a listener would believe the speech was uttered by a target speaker. Training F0 contour generation models for speech synthesis requires a large corpus of speech. If it were possible to adapt the F0 contour of one speaker to sound like that of another speaker, using a small, easily obtainable parameter set, this would be extremely valuable. We present a new method for the transformation of F0 contours from one speaker to another based on a small linguistically motivated parameter set. The system performs a piecewise linear mapping using these parameters. A perceptual experiment clearly demonstrates that the presented system is at least as good as an existing technique for all speaker pairs, and that in many cases it is much better and almost as good as using the target F0 contour}
}
@incollection{vepa:king:joincostchapter2004,
  author     = {Vepa, Jithendra and King, Simon},
  title      = {Join Cost for Unit Selection Speech Synthesis},
  booktitle  = {Speech Synthesis},
  editor     = {Alwan, Abeer and Narayanan, Shri},
  publisher  = {Prentice Hall},
  year       = {2004},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Vepa_King_joincostchapter2004.ps},
  categories = {}
}
@inproceedings{robust-hts,
  author = {Yamagishi, Junichi and Ling, Zhenhua and King, Simon},
  title = {Robustness of {HMM}-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2008},
  address = {Brisbane, Australia},
  month = {September},
  pages = {581--584},
  key = {robust-hts},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/RobustnessHTS.pdf},
  abstract = {As speech synthesis techniques become more advanced, we are able to consider building high-quality voices from data collected outside the usual highly-controlled recording studio environment. This presents new challenges that are not present in conventional text-to-speech synthesis: the available speech data are not perfectly clean, the recording conditions are not consistent, and/or the phonetic balance of the material is not ideal. Although a clear picture of the performance of various speech synthesis techniques (e.g., concatenative, HMM-based or hybrid) under good conditions is provided by the Blizzard Challenge, it is not well understood how robust these algorithms are to less favourable conditions. In this paper, we analyse the performance of several speech synthesis methods under such conditions. This is, as far as we know, a new research topic: ``Robust speech synthesis.'' As a consequence of our investigations, we propose a new robust training method for the HMM-based speech synthesis in for use with speech data collected in unfavourable conditions.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice, unit selection}
}
@inproceedings{taylor:shimodaira:isard:king:kowtko:icslp1996,
  author = {Taylor, Paul A. and Shimodaira, Hiroshi and Isard, Stephen and King, Simon and Kowtko, Jacqueline},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/Taylor_1996_a.ps},
  title = {Using Prosodic Information to Constrain Language Models for Spoken Dialogue},
  booktitle = {Proc. {ICSLP} '96},
  address = {Philadelphia},
  year = {1996},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/Taylor_1996_a.pdf},
  abstract = {We present work intended to improve speech recognition performance for computer dialogue by taking into account the way that dialogue context and intonational tune interact to limit the possibilities for what an utterance might be. We report here on the extra constraint achieved in a bigram language model expressed in terms of entropy by using separate submodels for different sorts of dialogue acts and trying to predict which submodel to apply by analysis of the intonation of the sentence being recognised.},
  categories = {asr, intonation, dialogue, lm,id4s}
}
@inproceedings{fraser:king:blizzard2007,
  author = {Fraser, Mark and King, Simon},
  title = {The {B}lizzard {C}hallenge 2007},
  booktitle = {Proc. Blizzard 2007 (in Proc. Sixth ISCA Workshop on Speech Synthesis)},
  address = {Bonn, Germany},
  month = {August},
  year = {2007},
  keywords = {Blizzard},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_001.pdf},
  abstract = {In Blizzard 2007, the third Blizzard Challenge, participants were asked to build voices from a dataset, a defined subset and, following certain constraints, a subset of their choice. A set of test sentences was then released to be synthesised. An online evaluation of the submitted synthesised sentences focused on naturalness and intelligibility, and added new sections for degree of similarity to the original speaker, and similarity in terms of naturalness of pairs of sentences from different systems. We summarise this year's Blizzard Challenge and look ahead to possible designs for Blizzard 2008 in the light of participant and listener feedback.},
  categories = {blizzard, listening test}
}
@techreport{king:verbmobil1996a,
  author = {King, Simon},
  title = {Final report for {V}erbmobil {T}eilprojekt 4.4},
  abstract = {Final report for Verbmobil English speech synthesis},
  type = {Verbmobil-Report},
  number = {195},
  issn = {1434-8845},
  month = {January},
  note = {Available at {\tt http://verbmobil.dfki.de}},
  year = {1997},
  institution = {IKP, Universitaet Bonn},
  categories = {}
}
@inproceedings{livescu07:JHU_summary,
  author = {Livescu, K. and Çetin, Ö. and Hasegawa-Johnson, M. and King, S. and Bartels, C. and Borges, N. and Kantor, A. and Lal, P. and Yung, L. and Bezman, A. and Dawson-Haggerty, S. and Woods, B. and Frankel, J. and Magimai-Doss, M. and Saenko, K.},
  title = {Articulatory feature-based methods for acoustic and audio-visual speech recognition: {S}ummary from the 2006 {JHU} {S}ummer {W}orkshop},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  month = {April},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/livescu_icassp07_sum.pdf},
  abstract = {We report on investigations, conducted at the 2006 Johns Hopkins Workshop, into the use of articulatory features (AFs) for observation and pronunciation models in speech recognition. In the area of observation modeling, we use the outputs of AF classifiers both directly, in an extension of hybrid HMM/neural network models, and as part of the observation vector, an extension of the tandem approach. In the area of pronunciation modeling, we investigate a model having multiple streams of AF states with soft synchrony constraints, for both audio-only and audio-visual recognition. The models are implemented as dynamic Bayesian networks, and tested on tasks from the Small-Vocabulary Switchboard (SVitchboard) corpus and the CUAVE audio-visual digits corpus. Finally, we analyze AF classification and forced alignment using a newly collected set of feature-level manual transcriptions.}
}
@inproceedings{wester04:asynch,
  author = {Wester, M. and Frankel, J. and King, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.ps},
  title = {Asynchronous Articulatory Feature Recognition Using Dynamic {B}ayesian Networks},
  booktitle = {Proc. IEICE Beyond HMM Workshop},
  address = {Kyoto},
  month = {December},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.pdf},
  abstract = {This paper builds on previous work where dynamic Bayesian networks (DBN) were proposed as a model for articulatory feature recognition. Using DBNs makes it possible to model the dependencies between features, an addition to previous approaches which was found to improve feature recognition performance. The DBN results were promising, giving close to the accuracy of artificial neural nets (ANNs). However, the system was trained on canonical labels, leading to an overly strong set of constraints on feature co-occurrence. In this study, we describe an embedded training scheme which learns a set of data-driven asynchronous feature changes where supported in the data. Using a subset of the OGI Numbers corpus, we describe articulatory feature recognition experiments using both canonically-trained and asynchronous DBNs. Performance using DBNs is found to exceed that of ANNs trained on an identical task, giving a higher recognition accuracy. Furthermore, inter-feature dependencies result in a more structured model, giving rise to fewer feature combinations in the recognition output. In addition to an empirical evaluation of this modelling approach, we give a qualitative analysis, comparing asynchrony found through our data-driven methods to the asynchrony which may be expected on the basis of linguistic knowledge.},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh}
}
@inproceedings{child_synthesis_2009,
  author    = {Watts, Oliver and Yamagishi, Junichi and King, Simon and Berkling, Kay},
  title     = {{HMM} Adaptation and Voice Conversion for the Synthesis of Child Speech: A Comparison},
  booktitle = {Proc. Interspeech 2009},
  pages     = {2627--2630},
  year      = {2009},
  month     = {September},
  address   = {Brighton, U.K.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/child_synthesis_2009.pdf},
  abstract  = {This study compares two different methodologies for producing data-driven synthesis of child speech from existing systems that have been trained on the speech of adults. On one hand, an existing statistical parametric synthesiser is transformed using model adaptation techniques, informed by linguistic and prosodic knowledge, to the speaker characteristics of a child speaker. This is compared with the application of voice conversion techniques to convert the output of an existing waveform concatenation synthesiser with no explicit linguistic or prosodic knowledge. In a subjective evaluation of the similarity of synthetic speech to natural speech from the target speaker, the HMM-based systems evaluated are generally preferred, although this is at least in part due to the higher dimensional acoustic features supported by these techniques.}
}
@incollection{king:ELL2_2006b,
  author    = {King, Simon},
  title     = {Handling variation in speech and language processing},
  booktitle = {Encyclopedia of Language and Linguistics},
  editor    = {Brown, Keith},
  edition   = {2nd},
  publisher = {Elsevier},
  year      = {2006}
}
@inproceedings{Blizzard_summary_09,
  author = {King, Simon and Karaiskos, Vasilis},
  title = {The {B}lizzard {C}hallenge 2009},
  booktitle = {Proc. Blizzard Challenge Workshop},
  address = {Edinburgh, UK},
  month = {September},
  year = {2009},
  keywords = {Blizzard},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/summary_Blizzard2009.pdf},
  abstract = {The Blizzard Challenge 2009 was the fifth annual Blizzard Challenge. As in 2008, UK English and Mandarin Chinese were the chosen languages for the 2009 Challenge. The English corpus was the same one used in 2008. The Mandarin corpus was provided by iFLYTEK. As usual, participants with limited resources or limited experience in these languages had the option of using unaligned labels that were provided for both corpora and for the test sentences. An accent-specific pronunciation dictionary was also available for the English speaker. This year, the tasks were organised in the form of `hubs' and `spokes' where each hub task involved building a general-purpose voice and each spoke task involved building a voice for a specific application. A set of test sentences was released to participants, who were given a limited time in which to synthesise them and submit the synthetic speech. An online listening test was conducted to evaluate naturalness, intelligibility, degree of similarity to the original speaker and, for one of the spoke tasks, ``appropriateness.''},
  categories = {Blizzard Challenge, speech synthesis, evaluation, listening test}
}
@inproceedings{clarkrichmondking_interspeech05,
  author = {Clark, Robert A. J. and Richmond, Korin and King, Simon},
  title = {Multisyn voices from {ARCTIC} data for the {B}lizzard challenge},
  booktitle = {Proc. Interspeech 2005},
  month = {September},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/clarkrichmondking_interspeech05.pdf},
  abstract = {This paper describes the process of building unit selection voices for the Festival Multisyn engine using four ARCTIC datasets, as part of the Blizzard evaluation challenge. The build process is almost entirely automatic, with very little need for human intervention. We discuss the difference in the evaluation results for each voice and evaluate the suitability of the ARCTIC datasets for building this type of voice.},
  categories = {speech synthesis, festival, evaluation}
}
@inproceedings{shig043,
  author     = {Shiga, Yoshinori and King, Simon},
  title      = {Estimating detailed spectral envelopes using articulatory clustering},
  booktitle  = {Proc. ICSLP},
  year       = {2004},
  month      = {October},
  address    = {Jeju, Korea},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.ps},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope, edinburgh},
  abstract   = {This paper presents an articulatory-acoustic mapping where detailed spectral envelopes are estimated. During the estimation, the harmonics of a range of F0 values are derived from the spectra of multiple voiced speech signals vocalized with similar articulator settings. The envelope formed by these harmonics is represented by a cepstrum, which is computed by fitting the peaks of all the harmonics based on the weighted least square method in the frequency domain. The experimental result shows that the spectral envelopes are estimated with the highest accuracy when the cepstral order is 48--64 for a female speaker, which suggests that representing the real response of the vocal tract requires high-quefrency elements that conventional speech synthesis methods are forced to discard in order to eliminate the pitch component of speech.}
}
@article{Dines2011,
  author   = {Dines, John and Liang, Hui and Saheer, Lakshmi and Gibson, Matthew and Byrne, William and Oura, Keiichiro and Tokuda, Keiichi and Yamagishi, Junichi and King, Simon and Wester, Mirjam and Hirsimäki, Teemu and Karhila, Reima and Kurimo, Mikko},
  title    = {Personalising speech-to-speech translation: Unsupervised cross-lingual speaker adaptation for {HMM}-based speech synthesis},
  journal  = {Computer Speech and Language},
  volume   = {27},
  number   = {2},
  pages    = {420--437},
  year     = {2013},
  month    = {February},
  issn     = {0885-2308},
  doi      = {10.1016/j.csl.2011.08.003},
  url      = {http://www.sciencedirect.com/science/article/pii/S0885230811000441},
  keywords = {Speech-to-speech translation, Cross-lingual speaker adaptation, HMM-based speech synthesis, Speaker adaptation, Voice conversion},
  abstract = {In this paper we present results of unsupervised cross-lingual speaker adaptation applied to text-to-speech synthesis. The application of our research is the personalisation of speech-to-speech translation in which we employ a HMM statistical framework for both speech recognition and synthesis. This framework provides a logical mechanism to adapt synthesised speech output to the voice of the user by way of speech recognition. In this work we present results of several different unsupervised and cross-lingual adaptation approaches as well as an end-to-end speaker adaptive speech-to-speech translation system. Our experiments show that we can successfully apply speaker adaptation in both unsupervised and cross-lingual scenarios and our proposed algorithms seem to generalise well for several language pairs. We also discuss important future directions including the need for better evaluation metrics.}
}
@inproceedings{shig031,
  author     = {Shiga, Yoshinori and King, Simon},
  title      = {Estimating the Spectral Envelope of Voiced Speech Using Multi-frame Analysis},
  booktitle  = {Proc. {E}urospeech-2003},
  volume     = {3},
  pages      = {1737--1740},
  year       = {2003},
  month      = {September},
  address    = {Geneva, Switzerland},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.ps},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope, edinburgh},
  abstract   = {This paper proposes a novel approach for estimating the spectral envelope of voiced speech independently of its harmonic structure. Because of the quasi-periodicity of voiced speech, its spectrum indicates harmonic structure and only has energy at frequencies corresponding to integral multiples of F0. It is hence impossible to identify transfer characteristics between the adjacent harmonics. In order to resolve this problem, Multi-frame Analysis (MFA) is introduced. The MFA estimates a spectral envelope using many portions of speech which are vocalised using the same vocal-tract shape. Since each of the portions usually has a different F0 and ensuing different harmonic structure, a number of harmonics can be obtained at various frequencies to form a spectral envelope. The method thereby gives a closer approximation to the vocal-tract transfer function.}
}
@inproceedings{dongwang_icassp09,
  author = {Wang, Dong and Tejedor, Javier and Frankel, Joe and King, Simon},
  title = {Posterior-based confidence measures for spoken term detection},
  booktitle = {Proc. ICASSP09},
  address = {Taiwan},
  month = {April},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/posterior.pdf},
  abstract = {Confidence measures play a key role in spoken term detection (STD) tasks. The confidence measure expresses the posterior probability of the search term appearing in the detection period, given the speech. Traditional approaches are based on the acoustic and language model scores for candidate detections found using automatic speech recognition, with Bayes' rule being used to compute the desired posterior probability. In this paper, we present a novel direct posterior-based confidence measure which, instead of resorting to the Bayesian formula, calculates posterior probabilities from a multi-layer perceptron (MLP) directly. Compared with traditional Bayesian-based methods, the direct-posterior approach is conceptually and mathematically simpler. Moreover, the MLP-based model does not require assumptions to be made about the acoustic features such as their statistical distribution and the independence of static and dynamic co-efficients. Our experimental results in both English and Spanish demonstrate that the proposed direct posterior-based confidence improves STD performance.},
  categories = {Spoken term detection, confidence measure, posterior probabilities, MLP},
  pages = {4889--4892}
}
@inproceedings{frankel01:ASR,
  author = {Frankel, J. and King, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_Eurospeech2001.ps},
  title = {{ASR} -- Articulatory Speech Recognition},
  booktitle = {Proc. {E}urospeech},
  address = {Aalborg, Denmark},
  month = {September},
  pages = {599--602},
  year = {2001},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_Eurospeech2001.pdf},
  abstract = {In this paper we report recent work on a speech recognition system using a combination of acoustic and articulatory features as input. Linear dynamic models are used to capture the trajectories which characterize each segment type. We describe classification and recognition tasks for systems based on acoustic data in conjunction with both real and automatically recovered articulatory parameters.},
  categories = {am,artic,asr,ldm,mocha,edinburgh}
}
@inproceedings{lei2011a,
  author = {Lei, Ming and Yamagishi, Junichi and Richmond, Korin and Ling, Zhen-Hua and King, Simon and Dai, Li-Rong},
  title = {Formant-controlled {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  address = {Florence, Italy},
  month = {August},
  pages = {2777--2780},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110769.pdf},
  abstract = {This paper proposes a novel framework that enables us to manipulate and control formants in HMM-based speech synthesis. In this framework, the dependency between formants and spectral features is modelled by piecewise linear transforms; formant parameters are effectively mapped by these to the means of Gaussian distributions over the spectral synthesis parameters. The spectral envelope features generated under the influence of formants in this way may then be passed to high-quality vocoders to generate the speech waveform. This provides two major advantages over conventional frameworks. First, we can achieve spectral modification by changing formants only in those parts where we want control, whereas the user must specify all formants manually in conventional formant synthesisers (e.g. Klatt). Second, this can produce high-quality speech. Our results show the proposed method can control vowels in the synthesized speech by manipulating F1 and F2 without any degradation in synthesis quality.},
  categories = {speech synthesis, hidden Markov model, formants, controllability}
}
@inproceedings{frankel04:artic_dbn,
  author = {Frankel, J. and Wester, M. and King, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.ps},
  title = {Articulatory feature recognition using dynamic {B}ayesian networks},
  booktitle = {Proc. {ICSLP}},
  address = {Jeju, Korea},
  month = {October},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.pdf},
  abstract = {This paper describes the use of dynamic Bayesian networks for the task of articulatory feature recognition. We show that by modeling the dependencies between a set of 6 multi-leveled articulatory features, recognition accuracy is increased over an equivalent system in which features are considered independent. Results are compared to those found using artificial neural networks on an identical task.},
  categories = {am,artic,asr,dbn,timit,edinburgh}
}
@article{john:ieee2011,
  author = {Dines, J. and Yamagishi, J. and King, S.},
  doi = {10.1109/JSTSP.2010.2079315},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  journal = {IEEE Journal of Selected Topics in Signal Processing},
  note = {(in press)},
  year = {2011},
  keywords = {Acoustics, Adaptation model, Context modeling, Hidden Markov models, Speech, Speech recognition, Training, speech recognition, speech synthesis, unified models},
  abstract = {The EMIME European project is conducting research in the development of technologies for mobile, personalised speech-to-speech translation systems. The hidden Markov model (HMM) is being used as the underlying technology in both automatic speech recognition (ASR) and text-to-speech synthesis (TTS) components, thus, the investigation of unified statistical modelling approaches has become an implicit goal of our research. As one of the first steps towards this goal, we have been investigating commonalities and differences between HMM-based ASR and TTS. In this paper we present results and analysis of a series of experiments that have been conducted on English ASR and TTS systems measuring their performance with respect to phone set and lexicon; acoustic feature type and dimensionality; HMM topology; and speaker adaptation. Our results show that, although the fundamental statistical model may be essentially the same, optimal ASR and TTS performance often demands diametrically opposed system designs. This represents a major challenge to be addressed in the investigation of such unified modelling approaches.}
}
@inproceedings{Dall_Veaux_Yamagishi_King_Interspeech2012,
  author     = {Dall, Rasmus and Veaux, Christophe and Yamagishi, Junichi and King, Simon},
  title      = {Analysis of speaker clustering techniques for {HMM}-based speech synthesis},
  booktitle  = {Proc. Interspeech},
  year       = {2012},
  month      = {September},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Dall_Veaux_Yamagishi_King_Interspeech2012.pdf},
  categories = {Statistical parametric speech synthesis, hidden Markov models, speaker adaptation},
  abstract   = {This paper describes a method for speaker clustering, with the application of building average voice models for speaker-adaptive HMM-based speech synthesis that are a good basis for adapting to specific target speakers. Our main hypothesis is that using perceptually similar speakers to build the average voice model will be better than use unselected speakers, even if the amount of data available from perceptually similar speakers is smaller. We measure the perceived similarities among a group of 30 female speakers in a listening test and then apply multiple linear regression to automatically predict these listener judgements of speaker similarity and thus to identify similar speakers automatically. We then compare a variety of average voice models trained on either speakers who were perceptually judged to be similar to the target speaker, or speakers selected by the multiple linear regression, or a large global set of unselected speakers. We find that the average voice model trained on perceptually similar speakers provides better performance than the global model, even though the latter is trained on more data, confirming our main hypothesis. However, the average voice model using speakers selected automatically by the multiple linear regression does not reach the same level of performance.}
}
@inproceedings{Gutkin:King:pris05,
  editor = {Gamboa, Hugo and Fred, Ana},
  author = {Gutkin, Alexander and King, Simon},
  publisher = {INSTICC Press},
  isbn = {972-8865-28-7},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_pris05.ps.gz},
  booktitle = {Proc. 5th International Workshop on Pattern Recognition in Information Systems (PRIS-2005), In conjunction with the 7th International Conference on Enterprise Information Systems (ICEIS-2005)},
  title = {Inductive String Template-Based Learning of Spoken Language},
  year = {2005},
  month = {May},
  pages = {43--51},
  address = {Miami, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_pris05.pdf},
  abstract = {This paper deals with formulation of alternative structural approach to the speech recognition problem. In this approach, we require both the representation and the learning algorithms defined on it to be linguistically meaningful, which allows the speech recognition system to discover the nature of the linguistic classes of speech patterns corresponding to the speech waveforms. We briefly discuss the current formalisms and propose an alternative --- a phonologically inspired string-based inductive speech representation, defined within an analytical framework specifically designed to address the issues of class and object representation. We also present the results of the phoneme classification experiments conducted on the TIMIT corpus of continuous speech.},
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh}
}
@article{vepa_king_tsap05,
  author = {Vepa, Jithendra and King, Simon},
  title = {Subjective Evaluation of Join Cost and Smoothing Methods for Unit Selection Speech Synthesis},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  number = {5},
  month = {September},
  volume = {14},
  pages = {1763--1771},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/vepa_king_ieee2005.pdf},
  abstract = {In unit selection-based concatenative speech synthesis, join cost (also known as concatenation cost), which measures how well two units can be joined together, is one of the main criteria for selecting appropriate units from the inventory. Usually, some form of local parameter smoothing is also needed to disguise the remaining discontinuities. This paper presents a subjective evaluation of three join cost functions and three smoothing methods. We describe the design and performance of a listening test. The three join cost functions were taken from our previous study, where we proposed join cost functions derived from spectral distances, which have good correlations with perceptual scores obtained for a range of concatenation discontinuities. This evaluation allows us to further validate their ability to predict concatenation discontinuities. The units for synthesis stimuli are obtained from a state-of-the-art unit selection text-to-speech system: rVoice from Rhetorical Systems Ltd. In this paper, we report listeners' preferences for each join cost in combination with each smoothing method.},
  categories = {TTS, join cost, listening test}
}
@inproceedings{hts-child-oliver,
  author     = {Watts, Oliver and Yamagishi, Junichi and Berkling, Kay and King, Simon},
  title      = {{HMM}-based synthesis of child speech},
  booktitle  = {Proc. 1st Workshop on Child, Computer and Interaction (ICMI'08 post-conference workshop)},
  address    = {Crete, Greece},
  month      = {October},
  year       = {2008},
  key        = {hts-child-oliver},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/child-hts-oliver.pdf},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice, child speech},
  abstract   = {The synthesis of child speech presents challenges both in the collection of data and in the building of a synthesiser from that data. Because only limited data can be collected, and the domain of that data is constrained, it is difficult to obtain the type of phonetically-balanced corpus usually used in speech synthesis. As a consequence, building a synthesiser from this data is difficult. Concatenative synthesisers are not robust to corpora with many missing units (as is likely when the corpus content is not carefully designed), so we chose to build a statistical parametric synthesiser using the HMM-based system HTS. This technique has previously been shown to perform well for limited amounts of data, and for data collected under imperfect conditions. We compared 6 different configurations of the synthesiser, using both speaker-dependent and speaker-adaptive modelling techniques, and using varying amounts of data. The output from these systems was evaluated alongside natural and vocoded speech, in a Blizzard-style listening test.}
}
@inproceedings{dongwang_interspeech09_conf,
  author     = {Wang, Dong and King, Simon and Frankel, Joe and Bell, Peter},
  title      = {Term-Dependent Confidence for Out-of-Vocabulary Term Detection},
  booktitle  = {Proc. Interspeech},
  pages      = {2139--2142},
  year       = {2009},
  month      = {September},
  address    = {Brighton, UK},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/conf.pdf},
  categories = {joint-multigram, pronunciation model, spoken term detection, speech recognition},
  abstract   = {Within a spoken term detection (STD) system, the decision maker plays an important role in retrieving reliable detections. Most of the state-of-the-art STD systems make decisions based on a confidence measure that is term-independent, which poses a serious problem for out-of-vocabulary (OOV) term detection. In this paper, we study a term-dependent confidence measure based on confidence normalisation and discriminative modelling, particularly focusing on its remarkable effectiveness for detecting OOV terms. Experimental results indicate that the term-dependent confidence provides much more significant improvement for OOV terms than terms in-vocabulary.}
}
@article{Oura2012703,
  author   = {Oura, Keiichiro and Yamagishi, Junichi and Wester, Mirjam and King, Simon and Tokuda, Keiichi},
  title    = {Analysis of unsupervised cross-lingual speaker adaptation for {HMM}-based speech synthesis using {KLD}-based transform mapping},
  journal  = {Speech Communication},
  volume   = {54},
  number   = {6},
  pages    = {703--714},
  year     = {2012},
  issn     = {0167-6393},
  doi      = {10.1016/j.specom.2011.12.004},
  url      = {http://www.sciencedirect.com/science/article/pii/S0167639311001774},
  keywords = {HMM-based speech synthesis, Unsupervised speaker adaptation, Cross-lingual speaker adaptation, Speech-to-speech translation},
  abstract = {In the EMIME project, we developed a mobile device that performs personalized speech-to-speech translation such that a user's spoken input in one language is used to produce spoken output in another language, while continuing to sound like the user's voice. We integrated two techniques into a single architecture: unsupervised adaptation for HMM-based TTS using word-based large-vocabulary continuous speech recognition, and cross-lingual speaker adaptation (CLSA) for HMM-based TTS. The CLSA is based on a state-level transform mapping learned using minimum Kullback-Leibler divergence between pairs of HMM states in the input and output languages. Thus, an unsupervised cross-lingual speaker adaptation system was developed. End-to-end speech-to-speech translation systems for four languages (English, Finnish, Mandarin, and Japanese) were constructed within this framework. In this paper, the English-to-Japanese adaptation is evaluated. Listening tests demonstrate that adapted voices sound more similar to a target speaker than average voices and that differences between supervised and unsupervised cross-lingual speaker adaptation are small. Calculating the KLD state-mapping on only the first 10 mel-cepstral coefficients leads to huge savings in computational costs, without any detrimental effect on the quality of the synthetic speech.}
}
@inproceedings{strom08,
  author = {Strom, Volker and King, Simon},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080514.ps},
  title = {Investigating {F}estival's target cost function using perceptual experiments},
  booktitle = {Proc.~Interspeech},
  address = {Brisbane},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080514.pdf},
  abstract = {We describe an investigation of the target cost used in the Festival unit selection speech synthesis system. Our ultimate goal is to automatically learn a perceptually optimal target cost function. In this study, we investigated the behaviour of the target cost for one segment type. The target cost is based on counting the mismatches in several context features. A carrier sentence (``My name is Roger'') was synthesised using all 147,820 possible combinations of the diphones /n\_ei/ and /ei\_m/. 92 representative versions were selected and presented to listeners as 460 pairwise comparisons. The listeners' preference votes were used to analyse the behaviour of the target cost, with respect to the values of its component linguistic context features.},
  categories = {speech synthesis, unit selection, target costs}
}
@inproceedings{clarkrichmondking_ssw504,
  author = {Clark, Robert A. J. and Richmond, Korin and King, Simon},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.ps},
  title = {Festival 2 -- build your own general purpose unit selection speech synthesiser},
  booktitle = {Proc. 5th {ISCA} workshop on speech synthesis},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.pdf},
  abstract = {This paper describes version 2 of the Festival speech synthesis system. Festival 2 provides a development environment for concatenative speech synthesis, and now includes a general purpose unit selection speech synthesis engine. We discuss various aspects of unit selection speech synthesis, focusing on the research issues that relate to voice design and the automation of the voice development process.},
  categories = {synthesis, festival, unitselection}
}
@phdthesis{king:thesis1998,
  author     = {King, Simon},
  title      = {Using Information Above the Word Level for Automatic Speech Recognition},
  school     = {University of Edinburgh},
  year       = {1998},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/King_thesis1998.ps},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/King_thesis1998.pdf},
  categories = {asr, lm, intonation, dialogue, systems},
  abstract   = {This thesis introduces a general method for using information at the utterance level and across utterances for automatic speech recognition. The method involves classification of utterances into types. Using constraints at the utterance level via this classification method allows information sources to be exploited which cannot necessarily be used directly for word recognition. The classification power of three sources of information is investigated: the language model in the speech recogniser, dialogue context and intonation. The method is applied to a challenging task: the recognition of spontaneous dialogue speech. The results show success in automatic utterance type classification, and subsequent word error rate reduction over a baseline system, when all three information sources are probabilistically combined.}
}
@inproceedings{dong_ivan_joe_simon_interspeech08_marray,
  author = {Wang, Dong and Himawan, Ivan and Frankel, Joe and King, Simon},
  title = {A Posterior Approach for Microphone Array Based Speech Recognition},
  booktitle = {Proc. Interspeech},
  month = {September},
  pages = {996--999},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/marray.a.pdf},
  abstract = {Automatic speech recognition (ASR) becomes rather difficult in meetings domains because of the adverse acoustic conditions, including more background noise, more echo and reverberation and frequent cross-talking. Microphone arrays have been demonstrated able to boost ASR performance dramatically in such noisy and reverberant environments, with various beamforming algorithms. However, almost all existing beamforming measures work in the acoustic domain, resorting to signal processing theories and geometric explanation. This limits their application, and induces significant performance degradation when the geometric property is unavailable or hard to estimate, or if heterogenous channels exist in the audio system. In this paper, we present a new posterior-based approach for array-based speech recognition. The main idea is, instead of enhancing speech signals, we try to enhance the posterior probabilities that frames belonging to recognition units, e.g., phones. These enhanced posteriors are then transferred to posterior probability based features and are modeled by HMMs, leading to a tandem ANN-HMM hybrid system presented by Hermansky et al. Experimental results demonstrated the validity of this posterior approach. With the posterior accumulation or enhancement, significant improvement was achieved over the single channel baseline. Moreover, we can combine the acoustic enhancement and posterior enhancement together, leading to a hybrid acoustic-posterior beamforming approach, which works significantly better than just the acoustic beamforming, especially in the scenario with moving-speakers.},
  categories = {speech recognition, microphone array, beamforming, tandem approach}
}
@inproceedings{Cassia_ICASSP11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and King, Simon},
  doi = {10.1109/ICASSP.2011.5947507},
  title = {Evaluation of objective measures for intelligibility prediction of {HMM}-based synthetic speech in noise},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  issn = {1520-6149},
  month = {May},
  pages = {5112--5115},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_ICASSP11.pdf},
  abstract = {In this paper we evaluate four objective measures of speech with regards to intelligibility prediction of synthesized speech in diverse noisy situations. We evaluated three intelligibility measures, the Dau measure, the glimpse proportion and the Speech Intelligibility Index (SII) and a quality measure, the Perceptual Evaluation of Speech Quality (PESQ). For the generation of synthesized speech we used a state of the art HMM-based speech synthesis system. The noisy conditions comprised four additive noises. The measures were compared with subjective intelligibility scores obtained in listening tests. The results show the Dau and the glimpse measures to be the best predictors of intelligibility, with correlations of around 0.83 to subjective scores. All measures gave less accurate predictions of intelligibility for synthetic speech than have previously been found for natural speech; in particular the SII measure. In additional experiments, we processed the synthesized speech by an ideal binary mask before adding noise. The Glimpse measure gave the most accurate intelligibility predictions in this situation.},
  categories = {HMM-based speech synthesis, objective measures of intelligibility}
}
@inproceedings{king_bartels_bilmes_isp05,
  author = {King, Simon and Bartels, Chris and Bilmes, Jeff},
  title = {{SVitchboard} 1: Small Vocabulary Tasks from {Switchboard} 1},
  booktitle = {Proc. Interspeech 2005},
  address = {Lisbon, Portugal},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/king_bartels_bilmes_svitchboard.pdf},
  abstract = {We present a conversational telephone speech data set designed to support research on novel acoustic models. Small vocabulary tasks from 10 words up to 500 words are defined using subsets of the Switchboard-1 corpus; each task has a completely closed vocabulary (an OOV rate of 0\%). We justify the need for these tasks, describe the algorithm for selecting them from a large corpus, give a statistical analysis of the data and present baseline whole-word hidden Markov model recognition results. The goal of the paper is to define a common data set and to encourage other researchers to use it.}
}
@inproceedings{Gutkin:King:icslp04,
  author     = {Gutkin, Alexander and King, Simon},
  title      = {Phone classification in pseudo-{E}uclidean Vector Spaces},
  booktitle  = {Proc. 8th International Conference on Spoken Language Processing (ICSLP)},
  volume     = {II},
  pages      = {1453--1457},
  address    = {Jeju Island, Korea},
  month      = {October},
  year       = {2004},
  issn       = {1225-441x},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icslp04.ps.gz},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icslp04.pdf},
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh},
  abstract   = {Recently we have proposed a structural framework for modelling speech, which is based on patterns of phonological distinctive features, a linguistically well-motivated alternative to standard vector-space acoustic models like HMMs. This framework gives considerable representational freedom by working with features that have explicit linguistic interpretation, but at the expense of the ability to apply the wide range of analytical decision algorithms available in vector spaces, restricting oneself to more computationally expensive and less-developed symbolic metric tools. In this paper we show that a dissimilarity-based distance-preserving transition from the original structural representation to a corresponding pseudo-Euclidean vector space is possible. Promising results of phone classification experiments conducted on the TIMIT database are reported.}
}
@article{yamagishi2009,
  author = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga and Ling, Zhenhua and Toda, Tomoki and Tokuda, Keiichi and King, Simon and Renals, Steve},
  title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech Synthesis},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5109758&arnumber=5153555&count=14&index=12},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {6},
  pages = {1208--1230},
  volume = {17},
  year = {2009},
  abstract = {This paper describes a speaker-adaptive HMM-based speech synthesis system. The new system, called ``HTS-2007,'' employs speaker adaptation (CSMAPLR+MAP), feature-space adaptive training, mixed-gender modeling, and full-covariance modeling using CSMAPLR transforms, in addition to several other techniques that have proved effective in our previous systems. Subjective evaluation results show that the new system generates significantly better quality synthetic speech than speaker-dependent approaches with realistic amounts of speech data, and that it bears comparison with speaker-dependent approaches even when large amounts of speech data are available. In addition, a comparison study with several speech synthesis techniques shows the new system is very robust: It is able to build voices from less-than-ideal speech data and synthesize good-quality speech even for out-of-domain sentences.}
}
@article{Stan2011442,
  author = {Stan, Adriana and Yamagishi, Junichi and King, Simon and Aylett, Matthew},
  volume = {53},
  doi = {10.1016/j.specom.2010.12.002},
  title = {The {R}omanian speech synthesis ({RSS}) corpus: Building a high quality {HMM}-based speech synthesis system using a high sampling rate},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639310002074},
  journal = {Speech Communication},
  issn = {0167-6393},
  number = {3},
  pages = {442--450},
  year = {2011},
  keywords = {Speech synthesis, HTS, Romanian, HMMs, Sampling frequency, Auditory scale},
  abstract = {This paper first introduces a newly-recorded high quality Romanian speech corpus designed for speech synthesis, called ``RSS'', along with Romanian front-end text processing modules and HMM-based synthetic voices built from the corpus. All of these are now freely available for academic use in order to promote Romanian speech technology research. The RSS corpus comprises 3500 training sentences and 500 test sentences uttered by a female speaker and was recorded using multiple microphones at 96 kHz sampling frequency in a hemianechoic chamber. The details of the new Romanian text processor we have developed are also given. Using the database, we then revisit some basic configuration choices of speech synthesis, such as waveform sampling frequency and auditory frequency warping scale, with the aim of improving speaker similarity, which is an acknowledged weakness of current HMM-based speech synthesisers. As we demonstrate using perceptual tests, these configuration choices can make substantial differences to the quality of the synthetic speech. Contrary to common practice in automatic speech recognition, higher waveform sampling frequencies can offer enhanced feature extraction and improved speaker similarity for HMM-based speech synthesis.}
}
@incollection{renals2010,
  author    = {Renals, Steve and King, Simon},
  title     = {Automatic Speech Recognition},
  booktitle = {Handbook of Phonetic Sciences},
  editor    = {Hardcastle, William J. and Laver, John and Gibbon, Fiona E.},
  chapter   = {22},
  publisher = {Wiley Blackwell},
  year      = {2010}
}
@article{frankel06:adapt,
  author = {Frankel, Joe and King, Simon},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/Frankel_King_SPECOM2006.ps},
  title = {Observation Process Adaptation for Linear Dynamic Models},
  journal = {Speech Communication},
  number = {9},
  month = {September},
  volume = {48},
  pages = {1192--1199},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/Frankel_King_SPECOM2006.pdf},
  abstract = {This work introduces two methods for adapting the observation process parameters of linear dynamic models (LDM) or other linear-Gaussian models. The first method uses the expectation-maximization (EM) algorithm to estimate transforms for location and covariance parameters, and the second uses a generalized EM (GEM) approach which reduces computation in making updates from $O(p^6)$ to $O(p^3)$, where $p$ is the feature dimension. We present the results of speaker adaptation on TIMIT phone classification and recognition experiments with relative error reductions of up to $6\%$. Importantly, we find minimal differences in the results from EM and GEM. We therefore propose that the GEM approach be applied to adaptation of hidden Markov models which use non-diagonal covariances. We provide the necessary update equations.},
  categories = {am,asr,ldm,timit,edinburgh}
}
@article{richmond2003,
  author     = {Richmond, K. and King, S. and Taylor, P.},
  title      = {Modelling the Uncertainty in Recovering Articulation from Acoustics},
  journal    = {Computer Speech and Language},
  volume     = {17},
  pages      = {153--172},
  year       = {2003},
  key        = {richmond2003},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/richmond2003.pdf},
  categories = {artic, ann, mlp, mdn, inversion, mocha, edinburgh},
  abstract   = {This paper presents an experimental comparison of the performance of the multilayer perceptron (MLP) with that of the mixture density network (MDN) for an acoustic-to-articulatory mapping task. A corpus of acoustic-articulatory data recorded by electromagnetic articulography (EMA) for a single speaker was used as training and test data for this purpose. In theory, the MDN is able to provide a richer, more flexible description of the target variables in response to a given input vector than the least-squares trained MLP. Our results show that the mean likelihoods of the target articulatory parameters for an unseen test set were indeed consistently higher with the MDN than with the MLP. The increase ranged from approximately 3\% to 22\%, depending on the articulatory channel in question. On the basis of these results, we argue that using a more flexible description of the target domain, such as that offered by the MDN, can prove beneficial when modelling the acoustic-to-articulatory mapping.}
}
@inproceedings{oura:icassp:10,
  author = {Oura, Keiichiro and Tokuda, Keiichi and Yamagishi, Junichi and Wester, Mirjam and King, Simon},
  title = {Unsupervised Cross-lingual Speaker Adaptation for {HMM}-based Speech Synthesis},
  booktitle = {Proc. ICASSP},
  pages = {4954--4957},
  volume = {I},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/oura_icassp2010.pdf},
  abstract = {In the EMIME project, we are developing a mobile device that performs personalized speech-to-speech translation such that a user's spoken input in one language is used to produce spoken output in another language, while continuing to sound like the user's voice. We integrate two techniques, unsupervised adaptation for HMM-based TTS using a word-based large-vocabulary continuous speech recognizer and cross-lingual speaker adaptation for HMM-based TTS, into a single architecture. Thus, an unsupervised cross-lingual speaker adaptation system can be developed. Listening tests show very promising results, demonstrating that adapted voices sound similar to the target speaker and that differences between supervised and unsupervised cross-lingual speaker adaptation are small.},
  categories = {speaker adaptation, TTS}
}
@inproceedings{strom:etal:interspeech2007,
  author = {Strom, Volker and Nenkova, Ani and Clark, Robert and Vazquez-Alvarez, Yolanda and Brenier, Jason and King, Simon and Jurafsky, Dan},
  title = {Modelling Prominence and Emphasis Improves Unit-Selection Synthesis},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  month = {August},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/p540.pdf},
  abstract = {We describe the results of large scale perception experiments showing improvements in synthesising two distinct kinds of prominence: standard pitch-accent and strong emphatic accents. Previously prominence assignment has been mainly evaluated by computing accuracy on a prominence-labelled test set. By contrast we integrated an automatic pitch-accent classifier into the unit selection target cost and showed that listeners preferred these synthesised sentences. We also describe an improved recording script for collecting emphatic accents, and show that generating emphatic accents leads to further improvements in the fiction genre over incorporating pitch accent only. Finally, we show differences in the effects of prominence between child-directed speech and news and fiction genres.},
  categories = {speech synthesis, prosody, prominence, pitch accent, unit selection}
}
@inproceedings{clark_blizzard2006,
  author     = {Clark, R. and Richmond, K. and Strom, V. and King, S.},
  title      = {Multisyn Voices for the {B}lizzard {C}hallenge 2006},
  booktitle  = {Proc. Blizzard Challenge Workshop (Interspeech Satellite)},
  address    = {Pittsburgh, USA},
  month      = {September},
  year       = {2006},
  note       = {(http://festvox.org/blizzard/blizzard2006.html)},
  key        = {clark_blizzard2006},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/cstr_blizzard2006.pdf},
  categories = {tts, blizzard, multisyn, unit selection},
  abstract   = {This paper describes the process of building unit selection voices for the Festival Multisyn engine using the ATR dataset provided for the Blizzard Challenge 2006. We begin by discussing recent improvements that we have made to the Multisyn voice building process, prompted by our participation in the Blizzard Challenge 2006. We then go on to discuss our interpretation of the results observed. Finally, we conclude with some comments and suggestions for the formulation of future Blizzard Challenges.}
}
@article{Hashimoto2012857,
  author = {Hashimoto, Kei and Yamagishi, Junichi and Byrne, William and King, Simon and Tokuda, Keiichi},
  volume = {54},
  doi = {10.1016/j.specom.2012.02.004},
  title = {Impacts of machine translation and speech synthesis on speech-to-speech translation},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639312000283},
  journal = {Speech Communication},
  issn = {0167-6393},
  number = {7},
  pages = {857--866},
  year = {2012},
  keywords = {Speech-to-speech translation, Machine translation, Speech synthesis, Subjective evaluation},
  abstract = {This paper analyzes the impacts of machine translation and speech synthesis on speech-to-speech translation systems. A typical speech-to-speech translation system consists of three components: speech recognition, machine translation and speech synthesis. Many techniques have been proposed for integration of speech recognition and machine translation. However, corresponding techniques have not yet been considered for speech synthesis. The focus of the current work is machine translation and speech synthesis, and we present a subjective evaluation designed to analyze their impact on speech-to-speech translation. The results of these analyses show that the naturalness and intelligibility of the synthesized speech are strongly affected by the fluency of the translated sentences. In addition, several features were found to correlate well with the average fluency of the translated sentences and the average naturalness of the synthesized speech.}
}
@inproceedings{vanbael:king:icphs2003,
  author = {Van Bael, Christophe and King, Simon},
  title = {An Accent-Independent Lexicon for Automatic Speech Recognition},
  booktitle = {Proc. ICPhS},
  abstract = {Recent work at the Centre for Speech Technology Research (CSTR) at the University of Edinburgh has developed an accent-independent lexicon for speech synthesis (the Unisyn project). The main purpose of this lexicon is to avoid the problems and cost of writing a new lexicon for every new accent needed for synthesis. Only recently, a first attempt has been made to use the Keyword Lexicon for automatic speech recognition.},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/VanBael_King_icphs2003.pdf},
  pages = {1165--1168}
}
@inproceedings{horlock:king:eurospeech2003a,
  author = {Horlock, James and King, Simon},
  title = {Named Entity Extraction from Word Lattices},
  booktitle = {Proc. Eurospeech},
  address = {Geneva},
  month = {September},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Horlock_King_eurospeech2003a.pdf},
  abstract = {We present a method for named entity extraction from word lattices produced by a speech recogniser. Previous work by others on named entity extraction from speech has used either a manual transcript or 1-best recogniser output. We describe how a single Viterbi search can recover both the named entity sequence and the corresponding word sequence from a word lattice, and further that it is possible to trade off an increase in word error rate for improved named entity extraction.}
}
@inproceedings{stan12_grapheme_alignment,
  author    = {Stan, Adriana and Bell, Peter and King, Simon},
  title     = {A Grapheme-based Method for Automatic Alignment of Speech and Text Data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  year      = {2012},
  month     = {December},
  address   = {Miami, Florida, USA},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/grapheme_alignment_slt2012.pdf},
  abstract  = {This paper introduces a method for automatic alignment of speech data with unsynchronised, imperfect transcripts, for a domain where no initial acoustic models are available. Using grapheme-based acoustic models, word skip networks and orthographic speech transcripts, we are able to harvest 55\% of the speech with a 93\% utterance-level accuracy and 99\% word accuracy for the produced transcriptions. The work is based on the assumption that there is a high degree of correspondence between the speech and text, and that a full transcription of all of the speech is not required. The method is language independent and the only prior knowledge and resources required are the speech and text transcripts, and a few minor user interventions.}
}
@inproceedings{CassiaSAPA12,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and King, Simon},
  title = {Evaluating speech intelligibility enhancement for {HMM}-based synthetic speech in noise},
  booktitle = {Proc. Sapa Workshop},
  year = {2012},
  month = {September},
  address = {Portland, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Sapa12.pdf},
  abstract = {It is possible to increase the intelligibility of speech in noise by enhancing the clean speech signal. In this paper we demonstrate the effects of modifying the spectral envelope of synthetic speech according to the environmental noise. To achieve this, we modify Mel cepstral coefficients according to an intelligibility measure that accounts for glimpses of speech in noise: the Glimpse Proportion measure. We evaluate this method against a baseline synthetic voice trained only with normal speech and a topline voice trained with Lombard speech, as well as natural speech. The intelligibility of these voices was measured when mixed with speech-shaped noise and with a competing speaker at three different levels. The Lombard voices, both natural and synthetic, were more intelligible than the normal voices in all conditions. For speech-shaped noise, the proposed modified voice was as intelligible as the Lombard synthetic voice without requiring any recordings of Lombard speech, which are hard to obtain. However, in the case of competing talker noise, the Lombard synthetic voice was more intelligible than the proposed modified voice.},
  categories = {HMM-based speech synthesis, intelligibility enhancement}
}
@inproceedings{clark_king:proc:2006,
  author = {Clark, Robert A. J. and King, Simon},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/clarkking_interspeech_2006.ps},
  title = {Joint Prosodic and Segmental Unit Selection Speech Synthesis},
  booktitle = {Proc. Interspeech 2006},
  address = {Pittsburgh, USA},
  month = {September},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/clarkking_interspeech_2006.pdf},
  abstract = {We describe a unit selection technique for text-to-speech synthesis which jointly searches the space of possible diphone sequences and the space of possible prosodic unit sequences in order to produce synthetic speech with more natural prosody. We demonstrate that this search, although currently computationally expensive, can achieve improved intonation compared to a baseline in which only the space of possible diphone sequences is searched. We discuss ways in which the search could be made sufficiently efficient for use in a real-time system.}
}
@inproceedings{wang_interspeech10,
  author = {Wang, Dong and King, Simon and Evans, Nick and Troncy, Raphael},
  title = {{CRF}-based Stochastic Pronunciation Modelling for Out-of-Vocabulary Spoken Term Detection},
  booktitle = {Proc. Interspeech},
  address = {Makuhari, Chiba, Japan},
  month = {September},
  year = {2010},
  abstract = {Out-of-vocabulary (OOV) terms present a significant challenge to spoken term detection (STD). This challenge, to a large extent, lies in the high degree of uncertainty in pronunciations of OOV terms. In previous work, we presented a stochastic pronunciation modeling (SPM) approach to compensate for this uncertainty. A shortcoming of our original work, however, is that the SPM was based on a joint-multigram model (JMM), which is suboptimal. In this paper, we propose to use conditional random fields (CRFs) for letter-to-sound conversion, which significantly improves quality of the predicted pronunciations. When applied to OOV STD, we achieve considerable performance improvement with both a 1-best system and an SPM-based system.},
  categories = {speech recognition, spoken term detection, conditional random field, joint multigram model}
}
@inproceedings{strom10d,
  author     = {Strom, Volker and King, Simon},
  title      = {A classifier-based target cost for unit selection speech synthesis trained on perceptual data},
  booktitle  = {Proc.~Interspeech},
  address    = {Makuhari, Japan},
  year       = {2010},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/strom10d.ps},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/strom10d.pdf},
  categories = {speech synthesis, unit selection, target cost},
  abstract   = {Our goal is to automatically learn a PERCEPTUALLY-optimal target cost function for a unit selection speech synthesiser. The approach we take here is to train a classifier on human perceptual judgements of synthetic speech. The output of the classifier is used to make a simple three-way distinction rather than to estimate a continuously-valued cost. In order to collect the necessary perceptual data, we synthesised 145,137 short sentences with the usual target cost switched off, so that the search was driven by the join cost only. We then selected the 7200 sentences with the best joins and asked 60 listeners to judge them, providing their ratings for each syllable. From this, we derived a rating for each demiphone. Using as input the same context features employed in our conventional target cost function, we trained a classifier on these human perceptual ratings. We synthesised two sets of test sentences with both our standard target cost and the new target cost based on the classifier. A/B preference tests showed that the classifier-based target cost, which was learned completely automatically from modest amounts of perceptual data, is almost as good as our carefully- and expertly-tuned standard target cost.}
}
@article{frankel07:ldm,
  author = {Frankel, J. and King, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_King_IEEE2007.ps},
  title = {Speech Recognition using Linear Dynamic Models},
  journal = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  number = {1},
  month = {January},
  volume = {15},
  pages = {246--256},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_King_IEEE2007.pdf},
  abstract = {The majority of automatic speech recognition (ASR) systems rely on hidden Markov models, in which Gaussian mixtures model the output distributions associated with sub-phone states. This approach, whilst successful, models consecutive feature vectors (augmented to include derivative information) as statistically independent. Furthermore, spatial correlations present in speech parameters are frequently ignored through the use of diagonal covariance matrices. This paper continues the work of Digalakis and others who proposed instead a first-order linear state-space model which has the capacity to model underlying dynamics, and furthermore give a model of spatial correlations. This paper examines the assumptions made in applying such a model and shows that the addition of a hidden dynamic state leads to increases in accuracy over otherwise equivalent static models. We also propose a time-asynchronous decoding strategy suited to recognition with segment models. We describe implementation of decoding for linear dynamic models and present TIMIT phone recognition results.},
  categories = {am,asr,ldm,timit,search,edinburgh}
}
@article{wang_ieeesigprocletters2011,
  author = {Wang, Dong and King, Simon},
  doi = {10.1109/LSP.2010.2098440},
  title = {Letter-to-Sound Pronunciation Prediction Using Conditional Random Fields},
  journal = {{IEEE} Signal Processing Letters},
  number = {2},
  month = {February},
  volume = {18},
  pages = {122--125},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_ieeesigprocletters2011.pdf},
  abstract = {Pronunciation prediction, or letter-to-sound (LTS) conversion, is an essential task for speech synthesis, open vocabulary spoken term detection and other applications dealing with novel words. Most current approaches (at least for English) employ data-driven methods to learn and represent pronunciation ``rules'' using statistical models such as decision trees, hidden Markov models (HMMs) or joint-multigram models (JMMs). The LTS task remains challenging, particularly for languages with a complex relationship between spelling and pronunciation such as English. In this paper, we propose to use a conditional random field (CRF) to perform LTS because it avoids having to model a distribution over observations and can perform global inference, suggesting that it may be more suitable for LTS than decision trees, HMMs or JMMs. One challenge in applying CRFs to LTS is that the phoneme and grapheme sequences of a word are generally of different lengths, which makes CRF training difficult. To solve this problem, we employed a joint-multigram model to generate aligned training exemplars. Experiments conducted with the AMI05 dictionary demonstrate that a CRF significantly outperforms other models, especially if n-best lists of predictions are generated.},
  categories = {letter-to-sound, conditional random field, joint multigram model, speech synthesis, spoken term detection}
}
@manual{king:verbmobil1996c,
  author = {King, Simon},
  title = {Users Manual for {Verbmobil} {Teilprojekt} 4.4},
  abstract = {Verbmobil English synthesiser users manual},
  month = {October},
  year = {1996},
  organization = {IKP, Universitaet Bonn},
  categories = {}
}
@inproceedings{Gutkin:King:icpr04,
  author = {Gutkin, Alexander and King, Simon},
  publisher = {IEEE Computer Society Press},
  isbn = {0-7695-2128-2},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icpr04.ps.gz},
  booktitle = {Proc. 17th International Conference on Pattern Recognition (ICPR)},
  title = {Structural Representation of Speech for Phonetic Classification},
  year = {2004},
  month = {August},
  volume = {3},
  pages = {438--441},
  address = {Cambridge, UK},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icpr04.pdf},
  abstract = {This paper explores the issues involved in using symbolic metric algorithms for automatic speech recognition (ASR), via a structural representation of speech. This representation is based on a set of phonological distinctive features which is a linguistically well-motivated alternative to the ``beads-on-a-string'' view of speech that is standard in current ASR systems. We report the promising results of phoneme classification experiments conducted on a standard continuous speech task.},
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh}
}
@inproceedings{Gutkin:King:icassp05,
  author = {Gutkin, Alexander and King, Simon},
  publisher = {IEEE Signal Processing Society Press},
  isbn = {0-7803-8875-5},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_icassp2005.ps.gz},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP-05)},
  title = {Detection of Symbolic Gestural Events in Articulatory Data for Use in Structural Representations of Continuous Speech},
  year = {2005},
  month = {March},
  volume = {I},
  pages = {885--888},
  address = {Philadelphia, PA, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_icassp2005.pdf},
  abstract = {One of the crucial issues which often needs to be addressed in structural approaches to speech representation is the choice of fundamental symbolic units of representation. In this paper, a physiologically inspired methodology for defining these symbolic atomic units in terms of primitive articulatory events is proposed. It is shown how the atomic articulatory events (gestures) can be detected directly in the articulatory data. An algorithm for evaluating the reliability of the articulatory events is described and promising results of the experiments conducted on MOCHA articulatory database are presented.},
  categories = {structural,recognition,artic,mocha,edinburgh}
}
@inproceedings{wang_std_covariance_icassp2010,
  author = {Wang, Dong and King, Simon and Frankel, Joe and Bell, Peter},
  title = {Stochastic Pronunciation Modelling and Soft Match for Out-of-vocabulary Spoken Term Detection},
  booktitle = {Proc. ICASSP},
  address = {Dallas, Texas, USA},
  month = {March},
  year = {2010},
  keywords = {confidence estimation, spoken term detection, speech recognition},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang10_icassp.pdf},
  abstract = {A major challenge faced by a spoken term detection (STD) system is the detection of out-of-vocabulary (OOV) terms. Although a subword-based STD system is able to detect OOV terms, performance reduction is always observed compared to in-vocabulary terms. One challenge that OOV terms bring to STD is the pronunciation uncertainty. A commonly used approach to address this problem is a soft matching procedure, and the other is the stochastic pronunciation modelling (SPM) proposed by the authors. In this paper we compare these two approaches, and combine them using a discriminative decision strategy. Experimental results demonstrated that SPM and soft match are highly complementary, and their combination gives significant performance improvement to OOV term detection.}
}
% NOTE(review): booktitle says Interspeech 2009 but address is Edinburgh; the other
% Interspeech 2009 entries in this file give Brighton, and the abstract describes a
% Blizzard Challenge 2009 system. Venue left unchanged -- confirm against the paper.
@inproceedings{jyamagis:emime,
  author = {Yamagishi, Junichi and Lincoln, Mike and King, Simon and Dines, John and Gibson, Matthew and Tian, Jilei and Guan, Yong},
  title = {Analysis of Unsupervised and Noise-Robust Speaker-Adaptive {HMM}-Based Speech Synthesis Systems toward a Unified {ASR} and {TTS} Framework},
  booktitle = {Proc. Interspeech 2009},
  year = {2009},
  month = {September},
  address = {Edinburgh, U.K.},
  abstract = {For the 2009 Blizzard Challenge we have built an unsupervised version of the HTS-2008 speaker-adaptive HMM-based speech synthesis system for English, and a noise robust version of the systems for Mandarin. They are designed from a multidisciplinary application point of view in that we attempt to integrate the components of the TTS system with other technologies such as ASR. All the average voice models are trained exclusively from recognized, publicly available, ASR databases. Multi-pass LVCSR and confidence scores calculated from confusion network are used for the unsupervised systems, and noisy data recorded in cars or public spaces is used for the noise robust system. We believe the developed systems form solid benchmarks and provide good connections to ASR fields. This paper describes the development of the systems and reports the results and analysis of their evaluation.}
}
@inproceedings{CassiaLista12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {Using an intelligibility measure to create noise robust cepstral coefficients for {HMM}-based speech synthesis},
  booktitle = {Proc. LISTA Workshop},
  address = {Edinburgh, UK},
  month = {May},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Lista12.pdf},
  categories = {HMM-based speech synthesis, intelligibility enhancement}
}
@inproceedings{vepa-king_euro03,
  author = {Vepa, J. and King, S.},
  title = {{Kalman}-filter based Join Cost for Unit-selection Speech Synthesis},
  booktitle = {Proc. Eurospeech},
  address = {Geneva, Switzerland},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/vepa_eurospeech03.pdf},
  abstract = {We introduce a new method for computing join cost in unit-selection speech synthesis which uses a linear dynamical model (also known as a Kalman filter) to model line spectral frequency trajectories. The model uses an underlying subspace in which it makes smooth, continuous trajectories. This subspace can be seen as an analogy for underlying articulator movement. Once trained, the model can be used to measure how well concatenated speech segments join together. The objective join cost is based on the error between model predictions and actual observations. We report correlations between this measure and mean listener scores obtained from a perceptual listening experiment. Our experiments use a state-of-the art unit-selection text-to-speech system: `rVoice' from Rhetorical Systems Ltd.},
  categories = {join cost, Kalman filter, LDM, rVoice, edinburgh}
}
@incollection{king:ELL2_2006a,
  editor = {Brown, Keith},
  author = {King, Simon},
  edition = {Second},
  booktitle = {Encyclopedia of Language and Linguistics},
  publisher = {Elsevier},
  year = {2006},
  title = {Language variation in speech technologies}
}
@inproceedings{livescu07:manual,
  author = {Livescu, K. and Bezman, A. and Borges, N. and Yung, L. and {\c{C}}etin, {\"O}. and Frankel, J. and King, S. and Magimai-Doss, M. and Chi, X. and Lavoie, L.},
  title = {Manual transcription of conversational speech at the articulatory feature level},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  month = {April},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/livescu_icassp07_trans.pdf},
  abstract = {We present an approach for the manual labeling of speech at the articulatory feature level, and a new set of labeled conversational speech collected using this approach. A detailed transcription, including overlapping or reduced gestures, is useful for studying the great pronunciation variability in conversational speech. It also facilitates the testing of feature classifiers, such as those used in articulatory approaches to automatic speech recognition. We describe an effort to transcribe a small set of utterances drawn from the Switchboard database using eight articulatory tiers. Two transcribers have labeled these utterances in a multi-pass strategy, allowing for correction of errors. We describe the data collection methods and analyze the data to determine how quickly and reliably this type of transcription can be done. Finally, we demonstrate one use of the new data set by testing a set of multilayer perceptron feature classifiers against both the manual labels and forced alignments.}
}
@inproceedings{frankel00:NN_LDM,
  author = {Frankel, J. and Richmond, K. and King, S. and Taylor, P.},
  title = {An automatic speech recognition system using neural networks and linear dynamic models to recover and model articulatory traces},
  booktitle = {Proc. {ICSLP}},
  year = {2000},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Frankel_et_al_ICSLP2000.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Frankel_et_al_ICSLP2000.ps},
  categories = {am,artic,asr,ldm,mocha,edinburgh,inversion,ann},
  abstract = {In this paper we describe a speech recognition system using linear dynamic models and articulatory features. Experiments are reported in which measured articulation from the MOCHA corpus has been used, along with those where the articulatory parameters are estimated from the speech signal using a recurrent neural network.}
}
@inproceedings{vepa-king-taylor_ieee02,
  author = {Vepa, J. and King, S. and Taylor, P.},
  title = {New Objective Distance Measures for Spectral Discontinuities in Concatenative Speech Synthesis},
  booktitle = {Proc. {IEEE} 2002 workshop on speech synthesis},
  year = {2002},
  month = {September},
  address = {Santa Monica, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/vepa_tts02.pdf},
  categories = {join cost, weighted distances, MCA, rVoice, edinburgh},
  abstract = {The quality of unit selection based concatenative speech synthesis mainly depends on how well two successive units can be joined together to minimise the audible discontinuities. The objective measure of discontinuity used when selecting units is known as the `join cost'. The ideal join cost will measure `perceived' discontinuity, based on easily measurable spectral properties of the units being joined, in order to ensure smooth and natural-sounding synthetic speech. In this paper we describe a perceptual experiment conducted to measure the correlation between `subjective' human perception and various `objective' spectrally-based measures proposed in the literature. Also we report new objective distance measures derived from various distance metrics based on these spectral features, which have good correlation with human perception to concatenation discontinuities. Our experiments used a state-of-the art unit-selection text-to-speech system: `rVoice' from Rhetorical Systems Ltd.}
}
@inproceedings{horlock:king:eurospeech2003b,
  author = {Horlock, James and King, Simon},
  title = {Discriminative Methods for Improving Named Entity Extraction on Speech Data},
  booktitle = {Proc. Eurospeech},
  year = {2003},
  month = {September},
  address = {Geneva},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Horlock_King_eurospeech2003b.pdf},
  categories = {},
  abstract = {In this paper we present a method of discriminatively training language models for spoken language understanding; we show improvements in named entity F-scores on speech data using these improved language models. A comparison between theoretical probabilities associated with manual markup and the actual probabilities of output markup is used to identify probabilities requiring adjustment. We present results which support our hypothesis that improvements in F-scores are possible by using either previously used training data or held out development data to improve discrimination amongst a set of N-gram language models.}
}
@inproceedings{higher_level,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {The role of higher-level linguistic features in {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  address = {Makuhari, Japan},
  month = {September},
  pages = {841--844},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100565.pdf},
  abstract = {We analyse the contribution of higher-level elements of the linguistic specification of a data-driven speech synthesiser to the naturalness of the synthetic speech which it generates. The system is trained using various subsets of the full feature-set, in which features relating to syntactic category, intonational phrase boundary, pitch accent and boundary tones are selectively removed. Utterances synthesised by the different configurations of the system are then compared in a subjective evaluation of their naturalness. The work presented forms background analysis for an ongoing set of experiments in performing text-to-speech (TTS) conversion based on shallow features: features that can be trivially extracted from text. By building a range of systems, each assuming the availability of a different level of linguistic annotation, we obtain benchmarks for our on-going work.}
}
@inproceedings{joe_dong_simon_interspeech08_bottle,
  author = {Frankel, Joe and Wang, Dong and King, Simon},
  title = {Growing bottleneck features for tandem {ASR}},
  booktitle = {Proc. Interspeech},
  pages = {1549},
  year = {2008},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bottlenet.a.pdf},
  categories = {tandem ASR, bottleneck MLP},
  abstract = {We present a method for training bottleneck MLPs for use in tandem ASR. Experiments on meetings data show that this approach leads to improved performance compared with training MLPs from a random initialization.}
}
@inproceedings{tts_barra08,
  author = {Barra-Chicote, R. and Yamagishi, J. and Montero, J. M. and King, S. and Lutfi, S. and Macias-Guarasa, J.},
  title = {Generacion de una voz sintetica en {Castellano} basada en {HSMM} para la {Evaluacion} {Albayzin} 2008: conversion texto a voz},
  booktitle = {V Jornadas en Tecnologia del Habla},
  month = {November},
  note = {(in Spanish)},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/tts-jth08.pdf},
  pages = {115--118}
}
@article{king:jphon2003,
  author = {King, Simon},
  title = {Dependence and independence in automatic speech recognition and synthesis},
  journal = {Journal of Phonetics},
  number = {3--4},
  pages = {407--411},
  volume = {31},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/King_jphon2003.pdf},
  abstract = {A short review paper},
  categories = {}
}
@inproceedings{john:HTSGAP,
  author = {Dines, J. and Yamagishi, J. and King, S.},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  booktitle = {Proc. Interspeech},
  pages = {1391--1394},
  year = {2009},
  month = {September},
  address = {Brighton, U.K.},
  abstract = {The EMIME European project is conducting research in the development of technologies for mobile, personalised speech-to-speech translation systems. The hidden Markov model is being used as the underlying technology in both automatic speech recognition (ASR) and text-to-speech synthesis (TTS) components, thus, the investigation of unified statistical modelling approaches has become an implicit goal of our research. As one of the first steps towards this goal, we have been investigating commonalities and differences between HMM-based ASR and TTS. In this paper we present results and analysis of a series of experiments that have been conducted on English ASR and TTS systems, measuring their performance with respect to phone set and lexicon, acoustic feature type and dimensionality and HMM topology. Our results show that, although the fundamental statistical model may be essentially the same, optimal ASR and TTS performance often demands diametrically opposed system designs. This represents a major challenge to be addressed in the investigation of such unified modelling approaches.}
}
@inproceedings{strom06,
  author = {Strom, Volker and Clark, Robert and King, Simon},
  title = {Expressive Prosody for Unit-selection Speech Synthesis},
  booktitle = {Proc.~Interspeech},
  address = {Pittsburgh},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/strom06.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/strom06.ps},
  abstract = {Current unit selection speech synthesis voices cannot produce emphasis or interrogative contours because of a lack of the necessary prosodic variation in the recorded speech database. A method of recording script design is proposed which addresses this shortcoming. Appropriate components were added to the target cost function of the Festival Multisyn engine, and a perceptual evaluation showed a clear preference over the baseline system.}
}
@article{turk:2429,
  author = {Turk, Alice and Scobbie, James and Geng, Christian and Macmartin, Cedric and Bard, Ellen and Campbell, Barry and Dickie, Catherine and Dubourg, Eddie and Hardcastle, Bill and Hoole, Phil and Kanaida, Evia and Lickley, Robin and Nakai, Satsuki and Pouplier, Marianne and King, Simon and Renals, Steve and Richmond, Korin and Schaeffler, Sonja and Wiegand, Ronnie and White, Kevin and Wrench, Alan},
  publisher = {ASA},
  doi = {10.1121/1.3508679},
  title = {The {Edinburgh Speech Production Facility's} articulatory corpus of spontaneous dialogue},
  journal = {The Journal of the Acoustical Society of America},
  number = {4},
  pages = {2429},
  volume = {128},
  year = {2010},
  abstract = {The EPSRC-funded Edinburgh Speech Production is built around two synchronized Carstens AG500 electromagnetic articulographs (EMAs) in order to capture articulatory/acoustic data from spontaneous dialogue. An initial articulatory corpus was designed with two aims. The first was to elicit a range of speech styles/registers from speakers, and therefore provide an alternative to fully scripted corpora. The second was to extend the corpus beyond monologue, by using tasks that promote natural discourse and interaction. A subsidiary driver was to use dialects from outwith North America: dialogues paired up a Scottish English and a Southern British English speaker. Tasks. Monologue: Story reading of ``Comma Gets a Cure'' [Honorof et al. (2000)], lexical sets [Wells (1982)], spontaneous story telling, diadochokinetic tasks. Dialogue: Map tasks [Anderson et al. (1991)], ``Spot the Difference'' picture tasks [Bradlow et al. (2007)], story-recall. Shadowing of the spontaneous story telling by the second participant. Each dialogue session includes approximately 30 min of speech, and there are acoustics-only baseline materials. We will introduce the corpus and highlight the role of articulatory production data in helping provide a fuller understanding of various spontaneous speech phenomena by presenting examples of naturally occurring covert speech errors, accent accommodation, turn taking negotiation, and shadowing.}
}
@inproceedings{kurimo:acl:10,
  author = {Kurimo, Mikko and Byrne, William and Dines, John and Garner, Philip N. and Gibson, Matthew and Guan, Yong and Hirsim{\"a}ki, Teemu and Karhila, Reima and King, Simon and Liang, Hui and Oura, Keiichiro and Saheer, Lakshmi and Shannon, Matt and Shiota, Sayaka and Tian, Jilei and Tokuda, Keiichi and Wester, Mirjam and Wu, Yi-Jian and Yamagishi, Junichi},
  title = {Personalising speech-to-speech translation in the {EMIME} project},
  booktitle = {Proc. ACL 2010 System Demonstrations},
  address = {Uppsala, Sweden},
  month = {July},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/kurimo_acl_2010.pdf},
  abstract = {In the EMIME project we have studied unsupervised cross-lingual speaker adaptation. We have employed an HMM statistical framework for both speech recognition and synthesis which provides transformation mechanisms to adapt the synthesized voice in TTS (text-to-speech) using the recognized voice in ASR (automatic speech recognition). An important application for this research is personalised speech-to-speech translation that will use the voice of the speaker in the input language to utter the translated sentences in the output language. In mobile environments this enhances the users' interaction across language barriers by making the output speech sound more like the original speaker's way of speaking, even if she or he could not speak the output language.},
  categories = {speaker adaptation}
}
@inproceedings{dongwang_interspeech09_cmb,
  author = {Tejedor, Javier and Wang, Dong and King, Simon and Frankel, Joe and Colas, Jose},
  title = {A Posterior Probability-based System Hybridisation and Combination for Spoken Term Detection},
  booktitle = {Proc. Interspeech},
  pages = {2131--2134},
  year = {2009},
  month = {September},
  address = {Brighton, UK},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cmb.pdf},
  categories = {joint-multigram, pronunciation model, spoken term detection, speech recognition},
  abstract = {Spoken term detection (STD) is a fundamental task for multimedia information retrieval. To improve the detection performance, we have presented a direct posterior-based confidence measure generated from a neural network. In this paper, we propose a detection-independent confidence estimation based on the direct posterior confidence measure, in which the decision making is totally separated from the term detection. Based on this idea, we first present a hybrid system which conducts the term detection and confidence estimation based on different sub-word units, and then propose a combination method which merges detections from heterogeneous term detectors based on the direct posterior-based confidence. Experimental results demonstrated that the proposed methods improved system performance considerably for both English and Spanish.}
}
@inproceedings{bell_king_full_covariance_asru2009,
  author = {Bell, Peter and King, Simon},
  title = {Diagonal Priors for Full Covariance Speech Recognition},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding},
  year = {2009},
  month = {December},
  address = {Merano, Italy},
  doi = {10.1109/ASRU.2009.5373344},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/shrinkage_asru2009.pdf},
  abstract = {We investigate the use of full covariance Gaussians for large-vocabulary speech recognition. The large number of parameters gives high modelling power, but when training data is limited, the standard sample covariance matrix is often poorly conditioned, and has high variance. We explain how these problems may be solved by the use of a diagonal covariance smoothing prior, and relate this to the shrinkage estimator, for which the optimal shrinkage parameter may itself be estimated from the training data. We also compare the use of generatively and discriminatively trained priors. Results are presented on a large vocabulary conversational telephone speech recognition task.}
}
@inproceedings{shig041,
  author = {Shiga, Yoshinori and King, Simon},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.ps},
  title = {Accurate spectral envelope estimation for articulation-to-speech synthesis},
  booktitle = {Proc. 5th ISCA Speech Synthesis Workshop},
  address = {CMU, Pittsburgh, USA},
  month = {June},
  pages = {19--24},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.pdf},
  abstract = {This paper introduces a novel articulatory-acoustic mapping in which detailed spectral envelopes are estimated based on the cepstrum, inclusive of the high-quefrency elements which are discarded in conventional speech synthesis to eliminate the pitch component of speech. For this estimation, the method deals with the harmonics of multiple voiced-speech spectra so that several sets of harmonics can be obtained at various pitch frequencies to form a spectral envelope. The experimental result shows that the method estimates spectral envelopes with the highest accuracy when the cepstral order is 48--64, which suggests that the higher order coefficients are required to represent detailed envelopes reflecting the real vocal-tract responses.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope, edinburgh}
}
@inproceedings{king:tokuda:zen:yamagishi:interspeech2008,
  author = {King, Simon and Tokuda, Keiichi and Zen, Heiga and Yamagishi, Junichi},
  title = {Unsupervised adaptation for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  address = {Brisbane, Australia},
  month = {September},
  pages = {1869--1872},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080299.PDF},
  abstract = {It is now possible to synthesise speech using HMMs with a comparable quality to unit-selection techniques. Generating speech from a model has many potential advantages over concatenating waveforms. The most exciting is model adaptation. It has been shown that supervised speaker adaptation can yield high-quality synthetic voices with an order of magnitude less data than required to train a speaker-dependent model or to build a basic unit-selection system. Such supervised methods require labelled adaptation data for the target speaker. In this paper, we introduce a method capable of unsupervised adaptation, using only speech from the target speaker without any labelling.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, trajectory HMMs, speaker adaptation, MLLR}
}
@inproceedings{sansegundo_et_al_IS2012,
  author = {San-Segundo, Ruben and Montero, Juan M. and Lopez-Lude{\~n}a, Veronica and King, Simon},
  title = {Detecting Acronyms from Capital Letter Sequences in {Spanish}},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  month = {September},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Thu-P10a-07.pdf},
  abstract = {This paper presents an automatic strategy to decide how to pronounce a Capital Letter Sequence (CLS) in a Text to Speech system (TTS). If CLS is well known by the TTS, it can be expanded in several words. But when the CLS is unknown, the system has two alternatives: spelling it (abbreviation) or pronouncing it as a new word (acronym). In Spanish, there is a high relationship between letters and phonemes. Because of this, when a CLS is similar to other words in Spanish, there is a high tendency to pronounce it as a standard word. This paper proposes an automatic method for detecting acronyms. Additionally, this paper analyses the discrimination capability of some features, and several strategies for combining them in order to obtain the best classifier. For the best classifier, the classification error is 8.45\%. About the feature analysis, the best features have been the Letter Sequence Perplexity and the Average N-gram order.}
}
@inproceedings{toth:frankel:goztolya:king:interspeech2008,
  author = {Toth, Laszlo and Frankel, Joe and Gosztolya, Gabor and King, Simon},
  title = {Cross-lingual Portability of {MLP}-Based Tandem Features -- A Case Study for {English} and {Hungarian}},
  booktitle = {Proc. Interspeech},
  address = {Brisbane, Australia},
  month = {September},
  pages = {2695--2698},
  year = {2008},
  keywords = {tandem, ASR},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080729.PDF},
  abstract = {One promising approach for building ASR systems for less-resourced languages is cross-lingual adaptation. Tandem ASR is particularly well suited to such adaptation, as it includes two cascaded modelling steps: feature extraction using multi-layer perceptrons (MLPs), followed by modelling using a standard HMM. The language-specific tuning can be performed by adjusting the HMM only, leaving the MLP untouched. Here we examine the portability of feature extractor MLPs between an Indo-European (English) and a Finno-Ugric (Hungarian) language. We present experiments which use both conventional phone-posterior and articulatory feature (AF) detector MLPs, both trained on a much larger quantity of (English) data than the monolingual (Hungarian) system. We find that the cross-lingual configurations achieve similar performance to the monolingual system, and that, interestingly, the AF detectors lead to slightly worse performance, despite the expectation that they should be more language-independent than phone-based MLPs. However, the cross-lingual system outperforms all other configurations when the English phone MLP is adapted on the Hungarian data.}
}
@article{tejedor:wang:frankel:king:colas:specom2008,
  author = {Tejedor, Javier and Wang, Dong and Frankel, Joe and King, Simon and Col{\'a}s, Jos{\'e}},
  doi = {10.1016/j.specom.2008.03.005},
  title = {A comparison of grapheme and phoneme-based units for {S}panish spoken term detection},
  journal = {Speech Communication},
  number = {11--12},
  month = {November},
  volume = {50},
  pages = {980--991},
  year = {2008},
  abstract = {The ever-increasing volume of audio data available online through the world wide web means that automatic methods for indexing and search are becoming essential. Hidden Markov model (HMM) keyword spotting and lattice search techniques are the two most common approaches used by such systems. In keyword spotting, models or templates are defined for each search term prior to accessing the speech and used to find matches. Lattice search (referred to as spoken term detection), uses a pre-indexing of speech data in terms of word or sub-word units, which can then quickly be searched for arbitrary terms without referring to the original audio. In both cases, the search term can be modelled in terms of sub-word units, typically phonemes. For in-vocabulary words (i.e. words that appear in the pronunciation dictionary), the letter-to-sound conversion systems are accepted to work well. However, for out-of-vocabulary (OOV) search terms, letter-to-sound conversion must be used to generate a pronunciation for the search term. This is usually a hard decision (i.e. not probabilistic and with no possibility of backtracking), and errors introduced at this step are difficult to recover from. We therefore propose the direct use of graphemes (i.e., letter-based sub-word units) for acoustic modelling. This is expected to work particularly well in languages such as Spanish, where despite the letter-to-sound mapping being very regular, the correspondence is not one-to-one, and there will be benefits from avoiding hard decisions at early stages of processing. In this article, we compare three approaches for Spanish keyword spotting or spoken term detection, and within each of these we compare acoustic modelling based on phone and grapheme units. Experiments were performed using the Spanish geographical-domain Albayzin corpus.
Results achieved in the two approaches proposed for spoken term detection show us that trigrapheme units for acoustic modelling match or exceed the performance of phone-based acoustic models. In the method proposed for keyword spotting, the results achieved with each acoustic model are very similar.},
  categories = {Spoken term detection; Keyword spotting; Graphemes; Spanish}
}
@inproceedings{mayoclarkking-isp05,
  author = {Mayo, C. and Clark, R. A. J. and King, S.},
  title = {Multidimensional Scaling of Listener Responses to Synthetic Speech},
  booktitle = {Proc. Interspeech 2005},
  year = {2005},
  month = {September},
  address = {Lisbon, Portugal},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/ie-speech-2005.pdf}
}
@inproceedings{king00:recognition_syll,
  author = {King, S. and Taylor, P. and Frankel, J. and Richmond, K.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_et_al_Phonus2000.ps},
  title = {Speech recognition via phonetically-featured syllables},
  booktitle = {PHONUS},
  address = {Institute of Phonetics, University of the Saarland},
  pages = {15--34},
  volume = {5},
  year = {2000},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_et_al_Phonus2000.pdf},
  abstract = {We describe recent work on two new automatic speech recognition systems. The first part of this paper describes the components of a system based on phonological features (which we call EspressoA) in which the values of these features are estimated from the speech signal before being used as the basis for recognition. In the second part of the paper, another system (which we call EspressoB) is described in which articulatory parameters are used instead of phonological features and a linear dynamical system model is used to perform recognition from automatically estimated values of these articulatory parameters.},
  categories = {am,artic,asr,ldm,phonetic_feature,mocha,timit,edinburgh}
}
@techreport{king:verbmobil1996b,
  author = {King, Simon},
  title = {Inventory design for {V}erbmobil {T}eilprojekt 4.4},
  abstract = {Inventory design for Verbmobil English speech synthesis},
  month = {October},
  year = {1996},
  institution = {IKP, Universit{\"a}t Bonn},
  categories = {}
}
@inproceedings{5947506,
  author = {Hashimoto, K. and Yamagishi, J. and Byrne, W. and King, S. and Tokuda, K.},
  doi = {10.1109/ICASSP.2011.5947506},
  title = {An analysis of machine translation and speech synthesis in speech-to-speech translation system},
  booktitle = {Proc. {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  issn = {1520-6149},
  month = {May},
  pages = {5108--5111},
  year = {2011},
  keywords = {machine translation;speech recognition;speech synthesis;speech-to-speech translation system},
  abstract = {This paper provides an analysis of the impacts of machine translation and speech synthesis on speech-to-speech translation systems. The speech-to-speech translation system consists of three components: speech recognition, machine translation and speech synthesis. Many techniques for integration of speech recognition and machine translation have been proposed. However, speech synthesis has not yet been considered. Therefore, in this paper, we focus on machine translation and speech synthesis, and report a subjective evaluation to analyze the impact of each component. The results of these analyses show that the naturalness and intelligibility of synthesized speech are strongly affected by the fluency of the translated sentences.}
}
@inproceedings{junichi:interspeech2010,
  author = {Yamagishi, Junichi and Watts, Oliver and King, Simon and Usabaev, Bela},
  title = {Roles of the Average Voice in Speaker-adaptive {HMM}-based Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Makuhari, Japan},
  month = {September},
  pages = {418--421},
  year = {2010},
  keywords = {speech synthesis, HMM, average voice, speaker adaptation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100361.pdf},
  abstract = {In speaker-adaptive HMM-based speech synthesis, there are typically a few speakers for which the output synthetic speech sounds worse than that of other speakers, despite having the same amount of adaptation data from within the same corpus. This paper investigates these fluctuations in quality and concludes that as mel-cepstral distance from the average voice becomes larger, the MOS naturalness scores generally become worse. Although this negative correlation is not that strong, it suggests a way to improve the training and adaptation strategies. We also draw comparisons between our findings and the work of other researchers regarding ``vocal attractiveness.''}
}
@inproceedings{junichi:icassp2010,
  author = {Yamagishi, J. and King, S.},
  title = {Simple methods for improving speaker-similarity of {HMM}-based speech synthesis},
  booktitle = {Proc. {ICASSP} 2010},
  address = {Dallas, Texas, USA},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/JunichiICASSP10.pdf}
}
@inproceedings{frankel05:hybrid,
  author = {Frankel, J. and King, S.},
  title = {A Hybrid {ANN/DBN} Approach to Articulatory Feature Recognition},
  booktitle = {Proc. Eurospeech},
  year = {2005},
  month = {September},
  address = {Lisbon},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/Frankel_King_INTER2005.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/Frankel_King_INTER2005.ps},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh},
  abstract = {Artificial neural networks (ANN) have proven to be well suited to the task of articulatory feature (AF) recognition. Previous studies have taken a cascaded approach where separate ANNs are trained for each feature group, making the assumption that features are statistically independent. We address this by using ANNs to provide virtual evidence to a dynamic Bayesian network (DBN). This gives a hybrid ANN/DBN model and allows modelling of inter-feature dependencies. We demonstrate significant increases in AF recognition accuracy from modelling dependencies between features, and present the results of embedded training experiments in which a set of asynchronous feature changes are learned. Furthermore, we report on the application of a Viterbi training scheme in which we alternate between realigning the AF training labels and retraining the ANNs.}
}
@inproceedings{gillett:king:eurospeech2003a,
  author = {Gillett, Ben and King, Simon},
  title = {Transforming Voice Quality},
  booktitle = {Proc. {E}urospeech},
  year = {2003},
  month = {September},
  address = {Geneva},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Gillett_King_eurospeech2003b.pdf},
  categories = {},
  abstract = {Voice transformation is the process of transforming the characteristics of speech uttered by a source speaker, such that a listener would believe the speech was uttered by a target speaker. In this paper we address the problem of transforming voice quality. We do not attempt to transform prosody. Our system has two main parts corresponding to the two components of the source-filter model of speech production. The first component transforms the spectral envelope as represented by a linear prediction model. The transformation is achieved using a Gaussian mixture model, which is trained on aligned speech from source and target speakers. The second part of the system predicts the spectral detail from the transformed linear prediction coefficients. A novel approach is proposed, which is based on a classifier and residual codebooks. On the basis of a number of performance metrics it outperforms existing systems.}
}
@article{junichi:ieee2010,
  author = {Yamagishi, J. and Usabaev, B. and King, S. and Watts, O. and Dines, J. and Tian, J. and Hu, R. and Guan, Y. and Oura, K. and Tokuda, K. and Karhila, R. and Kurimo, M.},
  title = {Thousands of Voices for {HMM}-based Speech Synthesis -- Analysis and Application of {TTS} Systems Built on Various {ASR} Corpora},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  year = {2010},
  month = {July},
  volume = {18},
  number = {5},
  pages = {984--1004},
  doi = {10.1109/TASL.2010.2045237},
  keywords = {Automatic speech recognition (ASR), H Triple S (HTS), SPEECON database, WSJ database, average voice, hidden Markov model (HMM)-based speech synthesis, speaker adaptation, speech synthesis, voice conversion},
  abstract = {In conventional speech synthesis, large amounts of phonetically balanced speech data recorded in highly controlled recording studio environments are typically required to build a voice. Although using such data is a straightforward solution for high quality synthesis, the number of voices available will always be limited, because recording costs are high. On the other hand, our recent experiments with HMM-based speech synthesis systems have demonstrated that speaker-adaptive HMM-based speech synthesis (which uses an ``average voice model'' plus model adaptation) is robust to non-ideal speech data that are recorded under various conditions and with varying microphones, that are not perfectly clean, and/or that lack phonetic balance. This enables us to consider building high-quality voices on ``non-TTS'' corpora such as ASR corpora. Since ASR corpora generally include a large number of speakers, this leads to the possibility of producing an enormous number of voices automatically. In this paper, we demonstrate the thousands of voices for HMM-based speech synthesis that we have made from several popular ASR corpora such as the Wall Street Journal (WSJ0, WSJ1, and WSJCAM0), Resource Management, Globalphone, and SPEECON databases. We also present the results of associated analysis based on perceptual evaluation, and discuss remaining issues.}
}
@inproceedings{karaiskos:king:clark:mayo:blizzard2008,
  author = {Karaiskos, Vasilis and King, Simon and Clark, Robert A. J. and Mayo, Catherine},
  title = {The Blizzard Challenge 2008},
  booktitle = {Proc. Blizzard Challenge Workshop},
  address = {Brisbane, Australia},
  month = {September},
  year = {2008},
  keywords = {Blizzard},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/summary_Blizzard2008.pdf},
  abstract = {The Blizzard Challenge 2008 was the fourth annual Blizzard Challenge. This year, participants were asked to build two voices from a UK English corpus and one voice from a Mandarin Chinese corpus. This is the first time that a language other than English has been included and also the first time that a large UK English corpus has been available. In addition, the English corpus contained somewhat more expressive speech than that found in corpora used in previous Blizzard Challenges. To assist participants with limited resources or limited experience in UK-accented English or Mandarin, unaligned labels were provided for both corpora and for the test sentences. Participants could use the provided labels or create their own. An accent-specific pronunciation dictionary was also available for the English speaker. A set of test sentences was released to participants, who were given a limited time in which to synthesise them and submit the synthetic speech. An online listening test was conducted, to evaluate naturalness, intelligibility and degree of similarity to the original speaker.}
}
@inproceedings{goubanova_king_isp05,
  author = {Goubanova, Olga and King, Simon},
  title = {Predicting Consonant Duration with {B}ayesian Belief Networks},
  booktitle = {Proc. Interspeech 2005},
  address = {Lisbon, Portugal},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/goubanova_king_isp2005.pdf},
  abstract = {Consonant duration is influenced by a number of linguistic factors such as the consonant's identity, within-word position, stress level of the previous and following vowels, phrasal position of the word containing the target consonant, its syllabic position, identity of the previous and following segments. In our work, consonant duration is predicted from a Bayesian belief network (BN) consisting of discrete nodes for the linguistic factors and a single continuous node for the consonant's duration. Interactions between factors are represented as conditional dependency arcs in this graphical model. Given the parameters of the belief network, the duration of each consonant in the test set is then predicted as the value with the maximum probability. We compare the results of the belief network model with those of sums-of-products (SoP) and classification and regression tree (CART) models using the same data. In terms of RMS error, our BN model performs better than both CART and SoP models. In terms of the correlation coefficient, our BN model performs better than SoP model, and no worse than CART model. In addition, the Bayesian model reliably predicts consonant duration in cases of missing or hidden linguistic factors.}
}
@inproceedings{CassiaIS12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {Mel cepstral coefficient modification based on the {Glimpse Proportion} measure for improving the intelligibility of {HMM}-generated synthetic speech in noise},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  month = {September},
  year = {2012},
  abstract = {We propose a method that modifies the Mel cepstral coefficients of HMM-generated synthetic speech in order to increase the intelligibility of the generated speech when heard by a listener in the presence of a known noise. This method is based on an approximation we previously proposed for the Glimpse Proportion measure. Here we show how to update the Mel cepstral coefficients using this measure as an optimization criterion and how to control the amount of distortion by limiting the frequency resolution of the modifications. To evaluate the method we built eight different voices from normal read-text speech data from a male speaker. Some voices were also built from Lombard speech data produced by the same speaker. Listening experiments with speech-shaped noise and with a single competing talker indicate that our method significantly improves intelligibility when compared to unmodified synthetic speech. The voices built from Lombard speech outperformed the proposed method particularly for the competing talker case. However, compared to a voice using only the spectral parameters from Lombard speech, the proposed method obtains similar or higher performance.},
  categories = {HMM-based speech synthesis, intelligibility enhancement, Mel cepstral coefficients}
}
@article{clarkrichmondking_specom2007,
  author = {Clark, Robert A. J. and Richmond, Korin and King, Simon},
  title = {Multisyn: Open-domain unit selection for the {F}estival speech synthesis system},
  journal = {Speech Communication},
  year = {2007},
  volume = {49},
  number = {4},
  pages = {317--330},
  doi = {10.1016/j.specom.2007.01.014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/clarkrichmondking_specom2007.pdf},
  categories = {speech synthesis, festival, multisyn, unitselection},
  abstract = {We present the implementation and evaluation of an open-domain unit selection speech synthesis engine designed to be flexible enough to encourage further unit selection research and allow rapid voice development by users with minimal speech synthesis knowledge and experience. We address the issues of automatically processing speech data into a usable voice using automatic segmentation techniques and how the knowledge obtained at labelling time can be exploited at synthesis time. We describe target cost and join cost implementation for such a system and describe the outcome of building voices with a number of different sized datasets. We show that, in a competitive evaluation, voices built using this technology compare favourably to other systems.}
}
@inproceedings{bell_king_is2007,
  author = {Bell, Peter and King, Simon},
  title = {Sparse {Gaussian} Graphical Models for Speech Recognition},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  month = {August},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sparseGM_is2007.pdf},
  abstract = {We address the problem of learning the structure of Gaussian graphical models for use in automatic speech recognition, a means of controlling the form of the inverse covariance matrices of such systems. With particular focus on data sparsity issues, we implement a method for imposing graphical model structure on a Gaussian mixture system, using a convex optimisation technique to maximise a penalised likelihood expression. The results of initial experiments on a phone recognition task show a performance improvement over an equivalent full-covariance system.},
  categories = {speech recognition, acoustic models, graphical models, precision matrix models}
}
@inproceedings{wester:ssw7:10,
  author = {Wester, Mirjam and Dines, John and Gibson, Matthew and Liang, Hui and Wu, Yi-Jian and Saheer, Lakshmi and King, Simon and Oura, Keiichiro and Garner, Philip N. and Byrne, William and Guan, Yong and Hirsim{\"a}ki, Teemu and Karhila, Reima and Kurimo, Mikko and Shannon, Matt and Shiota, Sayaka and Tian, Jilei and Tokuda, Keiichi and Yamagishi, Junichi},
  title = {Speaker adaptation and the evaluation of speaker similarity in the {EMIME} speech-to-speech translation project},
  booktitle = {Proc. 7th ISCA Speech Synthesis Workshop},
  address = {Kyoto, Japan},
  month = {September},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_ssw7_2010.pdf},
  abstract = {This paper provides an overview of speaker adaptation research carried out in the EMIME speech-to-speech translation (S2ST) project. We focus on how speaker adaptation transforms can be learned from speech in one language and applied to the acoustic models of another language. The adaptation is transferred across languages and/or from recognition models to synthesis models. The various approaches investigated can all be viewed as a process in which a mapping is defined in terms of either acoustic model states or linguistic units. The mapping is used to transfer either speech data or adaptation transforms between the two models. Because the success of speaker adaptation in text-to-speech synthesis is measured by judging speaker similarity, we also discuss issues concerning evaluation of speaker similarity in an S2ST scenario.},
  categories = {speaker adaptation, evaluation}
}
@inproceedings{taylor:king:isard:wright:kowtko:eurospeech1997,
  author = {Taylor, Paul A. and King, Simon and Isard, Stephen and Wright, Helen and Kowtko, Jacqueline},
  title = {Using Intonation to Constrain Language Models in Speech Recognition},
  booktitle = {Proc. {E}urospeech'97},
  address = {Rhodes},
  year = {1997},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/Taylor_King_Isard_Wright_Kowtko_eurospeech1997.pdf},
  abstract = {This paper describes a method for using intonation to reduce word error rate in a speech recognition system designed to recognise spontaneous dialogue speech. We use a form of dialogue analysis based on the theory of conversational games. Different move types under this analysis conform to different language models. Different move types are also characterised by different intonational tunes. Our overall recognition strategy is first to predict from intonation the type of game move that a test utterance represents, and then to use a bigram language model for that type of move during recognition.},
  categories = {asr, intonation, dialogue, lm,id4s}
}
@inproceedings{shig032,
  author = {Shiga, Yoshinori and King, Simon},
  title = {Estimation of voice source and vocal tract characteristics based on multi-frame analysis},
  booktitle = {Proc. Eurospeech},
  year = {2003},
  month = {September},
  address = {Geneva, Switzerland},
  volume = {3},
  pages = {1749--1752},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.ps},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh},
  abstract = {This paper presents a new approach for estimating voice source and vocal tract filter characteristics of voiced speech. When it is required to know the transfer function of a system in signal processing, the input and output of the system are experimentally observed and used to calculate the function. However, in the case of source-filter separation we deal with in this paper, only the output (speech) is observed and the characteristics of the system (vocal tract) and the input (voice source) must simultaneously be estimated. Hence the estimate becomes extremely difficult, and it is usually solved approximately using oversimplified models. We demonstrate that these characteristics are separable under the assumption that they are independently controlled by different factors. The separation is realised using an iterative approximation along with the Multi-frame Analysis method, which we have proposed to find spectral envelopes of voiced speech with minimum interference of the harmonic structure.}
}
@article{king:taylor:csl2000,
  author = {King, Simon and Taylor, Paul},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_Taylor_csl2000.ps},
  title = {Detection of Phonological Features in Continuous Speech using Neural Networks},
  journal = {Computer Speech and Language},
  number = {4},
  pages = {333--353},
  volume = {14},
  year = {2000},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_Taylor_csl2000.pdf},
  abstract = {We report work on the first component of a two stage speech recognition architecture based on phonological features rather than phones. The paper reports experiments on three phonological feature systems: 1) the Sound Pattern of English (SPE) system which uses binary features, 2) a multi valued (MV) feature system which uses traditional phonetic categories such as manner, place etc, and 3) Government Phonology (GP) which uses a set of structured primes. All experiments used recurrent neural networks to perform feature detection. In these networks the input layer is a standard framewise cepstral representation, and the output layer represents the values of the features. The system effectively produces a representation of the most likely phonological features for each input frame. All experiments were carried out on the TIMIT speaker independent database. The networks performed well in all cases, with the average accuracy for a single feature ranging from 86 to 93 percent. We describe these experiments in detail, and discuss the justification and potential advantages of using phonological features rather than phones for the basis of speech recognition.},
  categories = {}
}
@article{mayo:clark:king:10,
  author = {Mayo, C. and Clark, R. A. J. and King, S.},
  title = {Listeners' Weighting of Acoustic Cues to Synthetic Speech Naturalness: A Multidimensional Scaling Analysis},
  journal = {Speech Communication},
  year = {2011},
  volume = {53},
  number = {3},
  pages = {311--326},
  doi = {10.1016/j.specom.2010.10.003},
  keywords = {Speech synthesis; Evaluation; Speech perception; Acoustic cue weighting; Multidimensional scaling},
  abstract = {The quality of current commercial speech synthesis systems is now so high that system improvements are being made at subtle sub- and supra-segmental levels. Human perceptual evaluation of such subtle improvements requires a highly sophisticated level of perceptual attention to specific acoustic characteristics or cues. However, it is not well understood what acoustic cues listeners attend to by default when asked to evaluate synthetic speech. It may, therefore, be potentially quite difficult to design an evaluation method that allows listeners to concentrate on only one dimension of the signal, while ignoring others that are perceptually more important to them. The aim of the current study was to determine which acoustic characteristics of unit-selection synthetic speech are most salient to listeners when evaluating the naturalness of such speech. This study made use of multidimensional scaling techniques to analyse listeners' pairwise comparisons of synthetic speech sentences. Results indicate that listeners place a great deal of perceptual importance on the presence of artifacts and discontinuities in the speech, somewhat less importance on aspects of segmental quality, and very little importance on stress/intonation appropriateness. These relative differences in importance will impact on listeners' ability to attend to these different acoustic characteristics of synthetic speech, and should therefore be taken into account when designing appropriate methods of synthetic speech evaluation.}
}
@inproceedings{bell_king_lineSearch_is2008,
  author = {Bell, Peter and King, Simon},
  title = {Covariance Updates for Discriminative Training by Constrained Line Search},
  booktitle = {Proc. Interspeech},
  year = {2008},
  month = {September},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/lineSearch_is2008.pdf},
  abstract = {We investigate the recent Constrained Line Search algorithm for discriminative training of HMMs and propose an alternative formula for variance update. We compare the method to standard techniques on a phone recognition task.}
}
@article{2012E121001,
  author = {Yamagishi, Junichi and Veaux, Christophe and King, Simon and Renals, Steve},
  doi = {10.1250/ast.33.1},
  title = {Speech synthesis technologies for individuals with vocal disabilities: Voice banking and reconstruction},
  url = {http://www.jstage.jst.go.jp/browse/ast/33/1/_contents},
  journal = {Acoustical Science and Technology},
  number = {1},
  abstract = {In this invited paper, we overview the clinical applications of speech synthesis technologies and explain a few selected researches. We also introduce the University of Edinburgh's new project ``Voice Banking and reconstruction'' for patients with degenerative diseases, such as motor neurone disease and Parkinson's disease and show how speech synthesis technologies can improve the quality of life for the patients.},
  volume = {33},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/AST-33_1.pdf},
  pages = {1--5}
}
@inproceedings{king:wrench:icphs1999,
  author = {King, Simon and Wrench, Alan},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/King_Wrench_icphs1999.ps},
  title = {Dynamical System Modelling of Articulator Movement},
  booktitle = {Proc. {ICPhS} 99},
  address = {San Francisco},
  month = {August},
  pages = {2259--2262},
  year = {1999},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/King_Wrench_icphs1999.pdf},
  abstract = {We describe the modelling of articulatory movements using (hidden) dynamical system models trained on Electro-Magnetic Articulograph (EMA) data. These models can be used for automatic speech recognition and to give insights into articulatory behaviour. They belong to a class of continuous-state Markov models, which we believe can offer improved performance over conventional Hidden Markov Models (HMMs) by better accounting for the continuous nature of the underlying speech production process -- that is, the movements of the articulators. To assess the performance of our models, a simple speech recognition task was used, on which the models show promising results.},
  categories = {asr, artic, ema}
}
@inproceedings{jyamagis:1000sHTS,
  author = {Yamagishi, J. and Usabaev, Bela and King, Simon and Watts, Oliver and Dines, John and Tian, Jilei and Hu, Rile and Guan, Yong and Oura, Keiichiro and Tokuda, Keiichi and Karhila, Reima and Kurimo, Mikko},
  title = {Thousands of voices for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  address = {Brighton, U.K.},
  month = {September},
  pages = {420--423},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/index.php},
  abstract = {Our recent experiments with HMM-based speech synthesis systems have demonstrated that speaker-adaptive HMM-based speech synthesis (which uses an `average voice model' plus model adaptation) is robust to non-ideal speech data that are recorded under various conditions and with varying microphones, that are not perfectly clean, and/or that lack phonetic balance. This enables us to consider building high-quality voices on `non-TTS' corpora such as ASR corpora. Since ASR corpora generally include a large number of speakers, this leads to the possibility of producing an enormous number of voices automatically. In this paper we show thousands of voices for HMM-based speech synthesis that we have made from several popular ASR corpora such as the Wall Street Journal databases (WSJ0/WSJ1/WSJCAM0), Resource Management, Globalphone and Speecon. We report some perceptual evaluation results and outline the outstanding issues.}
}
@inproceedings{king_hmm_tutorial:india2010,
  author = {King, Simon},
  title = {A tutorial on {HMM} speech synthesis (Invited paper)},
  booktitle = {Sadhana -- Academy Proceedings in Engineering Sciences, Indian Institute of Sciences},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/king_hmm_tutorial.pdf},
  categories = {speech synthesis, HMM synthesis},
  abstract = {Statistical parametric speech synthesis, based on HMM-like models, has become competitive with established concatenative techniques over the last few years. This paper offers a non-mathematical introduction to this method of speech synthesis. It is intended to be complementary to the wide range of excellent technical publications already available. Rather than offer a comprehensive literature review, this paper instead gives a small number of carefully chosen references which are good starting points for further reading.}
}
@article{king07:JASA2007,
  author = {King, S. and Frankel, J. and Livescu, K. and McDermott, E. and Richmond, K. and Wester, M.},
  title = {Speech production knowledge in automatic speech recognition},
  journal = {Journal of the Acoustical Society of America},
  volume = {121},
  number = {2},
  pages = {723--742},
  month = {February},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/King_et_al_review.pdf},
  abstract = {Although much is known about how speech is produced, and research into speech production has resulted in measured articulatory data, feature systems of different kinds and numerous models, speech production knowledge is almost totally ignored in current mainstream approaches to automatic speech recognition. Representations of speech production allow simple explanations for many phenomena observed in speech which cannot be easily analyzed from either acoustic signal or phonetic transcription alone. In this article, we provide a survey of a growing body of work in which such representations are used to improve automatic speech recognition.}
}
@inproceedings{tejedor_interspeech10,
  author = {Tejedor, Javier and Toledano, Doroteo T. and Bautista, Miguel and King, Simon and Wang, Dong and Colas, Jose},
  title = {Augmented set of features for confidence estimation in spoken term detection},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/features.pdf},
  abstract = {Discriminative confidence estimation along with confidence normalisation have been shown to construct robust decision maker modules in spoken term detection (STD) systems. Discriminative confidence estimation, making use of term-dependent features, has been shown to improve the widely used lattice-based confidence estimation in STD. In this work, we augment the set of these term-dependent features and show a significant improvement in the STD performance both in terms of ATWV and DET curves in experiments conducted on a Spanish geographical corpus. This work also proposes a multiple linear regression analysis to carry out the feature selection. Next, the most informative features derived from it are used within the discriminative confidence on the STD system.},
  categories = {confidence estimation, feature selection, spoken term detection, speech recognition}
}
@inproceedings{letter_based_TTS,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {Letter-based speech synthesis},
  booktitle = {Proc. Speech Synthesis Workshop 2010},
  address = {Nara, Japan},
  month = {September},
  pages = {317--322},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7.pdf},
  abstract = {Initial attempts at performing text-to-speech conversion based on standard orthographic units are presented, forming part of a larger scheme of training TTS systems on features that can be trivially extracted from text. We evaluate the possibility of using the technique of decision-tree-based context clustering conventionally used in HMM-based systems for parameter-tying to handle letter-to-sound conversion. We present the application of a method of compound-feature discovery to corpus-based speech synthesis. Finally, an evaluation of intelligibility of letter-based systems and more conventional phoneme-based systems is presented.}
}
@inproceedings{wang_icassp2011a,
  author = {Wang, Dong and Evans, Nicholas and Troncy, Raphael and King, Simon},
  doi = {10.1109/ICASSP.2011.5947643},
  title = {Handling overlaps in spoken term detection},
  booktitle = {Proc. International Conference on Acoustics, Speech and Signal Processing},
  month = {May},
  pages = {5656--5659},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_icassp2011a.pdf},
  abstract = {Spoken term detection (STD) systems usually arrive at many overlapping detections which are often addressed with some pragmatic approaches, e.g. choosing the best detection to represent all the overlaps. In this paper we present a theoretical study based on a concept of acceptance space. In particular, we present two confidence estimation approaches based on Bayesian and evidence perspectives respectively. Analysis shows that both approaches possess respective advantages and shortcomings, and that their combination has the potential to provide an improved confidence estimation. Experiments conducted on meeting data confirm our analysis and show considerable performance improvement with the combined approach, in particular for out-of-vocabulary spoken term detection with stochastic pronunciation modeling.},
  categories = {spoken term detection, speech recognition}
}
@inproceedings{Valentini-Botinhao_SSW8,
  author = {Valentini-Botinhao, Cassia and Wester, Mirjam and Yamagishi, Junichi and King, Simon},
  title = {Using neighbourhood density and selective {SNR} boosting to increase the intelligibility of synthetic speech in noise},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  month = {August},
  pages = {133--138},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Cassia_SSW13.pdf},
  abstract = {Motivated by the fact that words are not equally confusable, we explore the idea of using word-level intelligibility predictions to selectively boost the harder-to-understand words in a sentence, aiming to improve overall intelligibility in the presence of noise. First, the intelligibility of a set of words from dense and sparse phonetic neighbourhoods was evaluated in isolation. The resulting intelligibility scores were used to inform two sentence-level experiments. In the first experiment the signal-to-noise ratio of one word was boosted to the detriment of another word. Sentence intelligibility did not generally improve. The intelligibility of words in isolation and in a sentence were found to be significantly different, both in clean and in noisy conditions. For the second experiment, one word was selectively boosted while slightly attenuating all other words in the sentence. This strategy was successful for words that were poorly recognised in that particular context. However, a reliable predictor of word-in-context intelligibility remains elusive, since this involves – as our results indicate – semantic, syntactic and acoustic information about the word and the sentence.}
}
@inproceedings{Merritt_SSW8,
  author = {Merritt, Thomas and King, Simon},
  title = {Investigating the shortcomings of {HMM} synthesis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  pages = {185--190},
  address = {Barcelona, Spain},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS2-4_Merritt.pdf},
  categories = {speech synthesis, hidden markov models, vocoding},
  abstract = {This paper presents the beginnings of a framework for formal testing of the causes of the current limited quality of HMM (Hidden Markov Model) speech synthesis. This framework separates each of the effects of modelling to observe their independent effects on vocoded speech parameters in order to address the issues that are restricting the progression to highly intelligible and natural-sounding speech synthesis. The simulated HMM synthesis conditions are performed on spectral speech parameters and tested via a pairwise listening test, asking listeners to perform a “same or different” judgement on the quality of the synthesised speech produced between these conditions. These responses are then processed using multidimensional scaling to identify the qualities in modelled speech that listeners are attending to and thus forms the basis of why they are distinguishable from natural speech. The future improvements to be made to the framework will finally be discussed which include the extension to more of the parameters modelled during speech synthesis.}
}
@inproceedings{Astrinaki_SSW8,
  author = {Astrinaki, Maria and Moinet, Alexis and Yamagishi, Junichi and Richmond, Korin and Ling, Zhen-Hua and King, Simon and Dutoit, Thierry},
  title = {Mage - Reactive articulatory feature control of {HMM}-based parametric speech synthesis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  pages = {227--231},
  address = {Barcelona, Spain},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_OS5-1_Astrinaki.pdf}
}
@inproceedings{stan13_lightly_supervised_discriminative,
  author = {Stan, Adriana and Bell, Peter and Yamagishi, Junichi and King, Simon},
  title = {Lightly Supervised Discriminative Training of Grapheme Models for Improved Sentence-level Alignment of Speech and Text Data},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lightly_supervised_discriminative_is2013.pdf},
  abstract = {This paper introduces a method for lightly supervised discriminative training using MMI to improve the alignment of speech and text data for use in training HMM-based TTS systems for low-resource languages. In TTS applications, due to the use of long-span contexts, it is important to select training utterances which have wholly correct transcriptions. In a low-resource setting, when using poorly trained grapheme models, we show that the use of MMI discriminative training at the grapheme-level enables us to increase the amount of correctly aligned data by 40\%, while maintaining a 7\% sentence error rate and 0.8\% word error rate. We present the procedure for lightly supervised discriminative training with regard to the objective of minimising sentence error rate.}
}
@inproceedings{christensen13_disordered,
  author = {Christensen, H. and Aniol, M. and Bell, P. and Green, P. and Hain, T. and King, S. and Swietojanski, P.},
  title = {Combining in-domain and out-of-domain speech data for automatic recognition of disordered speech},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/christensen_is13_2_final.pdf},
  abstract = {Recently there has been increasing interest in ways of using out-of-domain (OOD) data to improve automatic speech recognition performance in domains where only limited data is available. This paper focuses on one such domain, namely that of disordered speech for which only very small databases exist, but where normal speech can be considered OOD. Standard approaches for handling small data domains use adaptation from OOD models into the target domain, but here we investigate an alternative approach with its focus on the feature extraction stage: OOD data is used to train feature-generating deep belief neural networks. Using AMI meeting and TED talk datasets, we investigate various tandem-based speaker independent systems as well as maximum a posteriori adapted speaker dependent systems. Results on the UAspeech isolated word task of disordered speech are very promising with our overall best system (using a combination of AMI and TED data) giving a correctness of 62.5\%; an increase of 15\% on previously best published results based on conventional model adaptation. We show that the relative benefit of using OOD data varies considerably from speaker to speaker and is only loosely correlated with the severity of a speaker's impairments.}
}
@inproceedings{Yanagisawa_SSW8,
  author = {Yanagisawa, Kayoko and Latorre, Javier and Wan, Vincent and Gales, Mark J. F. and King, Simon},
  title = {Noise Robustness in {HMM-TTS} Speaker Adaptation},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  pages = {139--144},
  address = {Barcelona, Spain},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_OS3-3_Yanagisawa.pdf},
  abstract = {Speaker adaptation for TTS applications has been receiving more attention in recent years for applications such as voice customisation or voice banking. If these applications are offered as an Internet service, there is no control on the quality of the data that can be collected. It can be noisy with people talking in the background or recorded in a reverberant environment. This makes the adaptation more difficult. This paper explores the effect of different levels of additive and convolutional noise on speaker adaptation techniques based on cluster adaptive training (CAT) and average voice model (AVM). The results indicate that although both techniques suffer degradation to some extent, CAT is in general more robust than AVM.}
}
@article{Tejedor2013,
  author = {Tejedor, Javier and Toledano, Doroteo T. and Wang, Dong and King, Simon and Colas, Jose},
  title = {Feature analysis for discriminative confidence estimation in Spoken Term Detection},
  journal = {Computer Speech and Language},
  year = {2013},
  note = {To appear},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Tejedor_CSL2013.pdf},
  abstract = {Discriminative confidence based on multi-layer perceptrons (MLPs) and multiple features has shown significant advantage compared to the widely used lattice-based confidence in spoken term detection (STD). Although the MLP-based framework can handle any features derived from a multitude of sources, choosing all possible features may lead to over complex models and hence less generality. In this paper, we design an extensive set of features and analyze their contribution to STD individually and as a group. The main goal is to choose a small set of features that are sufficiently informative while keeping the model simple and generalizable. We employ two established models to conduct the analysis: one is linear regression which targets for the most relevant features and the other is logistic linear regression which targets for the most discriminative features. We find the most informative features are comprised of those derived from diverse sources (ASR decoding, duration and lexical properties) and the two models deliver highly consistent feature ranks. STD experiments on both English and Spanish data demonstrate significant performance gains with the proposed feature sets.}
}
@inproceedings{San-Segundo_SSW8,
  author = {San-Segundo, Rub{\'e}n and Montero, Juan Manuel and Giurgiu, Mircea and Muresan, Ioana and King, Simon},
  title = {Multilingual Number Transcription for Text-to-Speech Conversion},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  month = {August},
  pages = {85--89},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS1-8_San-Segundo.pdf},
  abstract = {This paper describes the text normalization module of a text to speech fully-trainable conversion system and its application to number transcription. The main target is to generate a language independent text normalization module, based on data instead of on expert rules. This paper proposes a general architecture based on statistical machine translation techniques. This proposal is composed of three main modules: a tokenizer for splitting the text input into a token graph, a phrase-based translation module for token translation, and a post-processing module for removing some tokens. This architecture has been evaluated for number transcription in several languages: English, Spanish and Romanian. Number transcription is an important aspect in the text normalization problem.}
}
@inproceedings{Lu_SSW8,
  author = {Lu, Heng and King, Simon and Watts, Oliver},
  title = {Combining a Vector Space Representation of Linguistic Context with a Deep Neural Network for Text-To-Speech Synthesis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  month = {August},
  pages = {281--285},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS3-3_Lu.pdf},
  abstract = {Conventional statistical parametric speech synthesis relies on decision trees to cluster together similar contexts, resulting in tied-parameter context-dependent hidden Markov models (HMMs). However, decision tree clustering has a major weakness: it use hard division and subdivides the model space based on one feature at a time, fragmenting the data and failing to exploit interactions between linguistic context features. These linguistic features themselves are also problematic, being noisy and of varied relevance to the acoustics. We propose to combine our previous work on vector-space representations of linguistic context, which have the added advantage of working directly from textual input, and Deep Neural Networks (DNNs), which can directly accept such continuous representations as input. The outputs of the network are probability distributions over speech features. Maximum Likelihood Parameter Generation is then used to create parameter trajectories, which in turn drive a vocoder to generate the waveform. Various configurations of the system are compared, using both conventional and vector space context representations and with the DNN making speech parameter predictions at two different temporal resolutions: frames, or states. Both objective and subjective results are presented.}
}
@article{6578128,
  author = {Lal, P. and King, S.},
  doi = {10.1109/TASL.2013.2277932},
  title = {Cross-lingual Automatic Speech Recognition using Tandem Features},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  issn = {1558-7916},
  year = {2013},
  note = {To appear},
  keywords = {Acoustics;Data models;Hidden Markov models;Speech;Speech recognition;Training;Transforms;Automatic speech recognition;Multilayer perceptrons},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Lal_TASLP2013.pdf},
  abstract = {Automatic speech recognition depends on large amounts of transcribed speech recordings in order to estimate the parameters of the acoustic model. Recording such large speech corpora is time-consuming and expensive; as a result, sufficient quantities of data exist only for a handful of languages — there are many more languages for which little or no data exist. Given that there are acoustic similarities between speech in different languages, it may be fruitful to use data from a well-resourced source language to estimate the acoustic models for a recogniser in a poorly-resourced target language. Previous approaches to this task have often involved making assumptions about shared phonetic inventories between the languages. Unfortunately pairs of languages do not generally share a common phonetic inventory. We propose an indirect way of transferring information from a source language acoustic model to a target language acoustic model without having to make any assumptions about the phonetic inventory overlap. To do this, we employ tandem features, in which class-posteriors from a separate classifier are decorrelated and appended to conventional acoustic features. Tandem features have the advantage that the language of the speech data used to train the classifier need not be the same as the target language to be recognised. This is because the class-posteriors are not used directly, so do not have to be over any particular set of classes. We demonstrate the use of tandem features in cross-lingual settings, including training on one or several source languages. We also examine factors which may predict a priori how much relative improvement will be brought about by using such tandem features, for a given source and target pair. In addition to conventional phoneme class-posteriors, we also investigate whether articulatory features (AFs) - a multistream, discrete, multi-valued labelling of speech — can be used instead. This is motivated by an assumption that AFs are less language-specific than a phoneme set.}
}
@inproceedings{Mamiya_SSW8,
  author = {Mamiya, Yoshitaka and Stan, Adriana and Yamagishi, Junichi and Bell, Peter and Watts, Oliver and Clark, Robert and King, Simon},
  title = {Using Adaptation to Improve Speech Transcription Alignment in Noisy and Reverberant Environments},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  pages = {61--66},
  address = {Barcelona, Spain},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS1-4_Mamiya.pdf},
  abstract = {When using data retrieved from the internet to create new speech databases, the recording conditions can often be highly variable within and between sessions. This variance influences the overall performance of any automatic speech and text alignment techniques used to process this data. In this paper we discuss the use of speaker adaptation methods to address this issue. Starting from a baseline system for automatic sentence-level segmentation and speech and text alignment based on GMMs and grapheme HMMs, respectively, we employ Maximum A Posteriori (MAP) and Constrained Maximum Likelihood Linear Regression (CMLLR) techniques to model the variation in the data in order to increase the amount of confidently aligned speech. We tested 29 different scenarios, which include reverberation, 8 talker babble noise and white noise, each in various combinations and SNRs. Results show that the MAP-based segmentation's performance is very much influenced by the noise type, as well as the presence or absence of reverberation. On the other hand, the CMLLR adaptation of the acoustic models gives an average 20\% increase in the aligned data percentage for the majority of the studied scenarios.}
}
@inproceedings{Watts_SSW8,
  author = {Watts, Oliver and Stan, Adriana and Clark, Rob and Mamiya, Yoshitaka and Giurgiu, Mircea and Yamagishi, Junichi and King, Simon},
  title = {Unsupervised and lightly-supervised learning for rapid construction of {TTS} systems in multiple languages from `found' data: evaluation and analysis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  month = {August},
  pages = {121--126},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_OS2-3_Watts.pdf},
  abstract = {This paper presents techniques for building text-to-speech front-ends in a way that avoids the need for language-specific expert knowledge, but instead relies on universal resources (such as the Unicode character database) and unsupervised learning from unannotated data to ease system development. The acquisition of expert language-specific knowledge and expert annotated data is a major bottleneck in the development of corpus-based TTS systems in new languages. The methods presented here side-step the need for such resources as pronunciation lexicons, phonetic feature sets, part of speech tagged data, etc. The paper explains how the techniques introduced are applied to the 14 languages of a corpus of `found' audiobook data. Results of an evaluation of the intelligibility of the systems resulting from applying these novel techniques to this data are presented.}
}
@inproceedings{Stan_IS13,
  author = {Stan, Adriana and Watts, Oliver and Mamiya, Yoshitaka and Giurgiu, Mircea and Clark, Rob and Yamagishi, Junichi and King, Simon},
  title = {{TUNDRA}: A Multilingual Corpus of Found Data for {TTS} Research Created with Light Supervision},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/IS131055.pdf},
  abstract = {Simple4All Tundra (version 1.0) is the first release of a standardised multilingual corpus designed for text-to-speech research with imperfect or found data. The corpus consists of approximately 60 hours of speech data from audiobooks in 14 languages, as well as utterance-level alignments obtained with a lightly-supervised process. Future versions of the corpus will include finer-grained alignment and prosodic annotation, all of which will be made freely available. This paper gives a general outline of the data collected so far, as well as a detailed description of how this has been done, emphasizing the minimal language-specific knowledge and manual intervention used to compile the corpus. To demonstrate its potential use, text-to-speech systems have been built for all languages using unsupervised or lightly supervised methods, also briefly presented in the paper.}
}
@inproceedings{Mamiya_13a,
  author = {Mamiya, Yoshitaka and Yamagishi, Junichi and Watts, Oliver and Clark, Robert A.J. and King, Simon and Stan, Adriana},
  title = {Lightly Supervised {GMM} {VAD} to Use Audiobook for Speech Synthesiser},
  booktitle = {Proc. ICASSP},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/0007987.pdf},
  abstract = {Audiobooks have been focused on as promising data for training Text-to-Speech (TTS) systems. However, they usually do not have a correspondence between audio and text data. Moreover, they are usually divided only into chapter units. In practice, we have to make a correspondence of audio and text data before we use them for building TTS synthesisers. However aligning audio and text data is time-consuming and involves manual labor. It also requires persons skilled in speech processing. Previously, we have proposed to use graphemes for automatically aligning speech and text data. This paper further integrates a lightly supervised voice activity detection (VAD) technique to detect sentence boundaries as a pre-processing step before the grapheme approach. This lightly supervised technique requires time stamps of speech and silence only for the first fifty sentences. Combining those, we can semi-automatically build TTS systems from audiobooks with minimum manual intervention. From subjective evaluations we analyse how the grapheme-based aligner and/or the proposed VAD technique impact the quality of HMM-based speech synthesisers trained on audiobooks.}
}
@inproceedings{doubletalk_IS2013,
  author = {Scobbie, James and Turk, Alice and Geng, Christian and King, Simon and Lickley, Robin and Richmond, Korin},
  title = {The {Edinburgh} Speech Production Facility {DoubleTalk} Corpus},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  keywords = {discourse, EMA, spontaneous speech},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/doubletalk_IS2013.pdf},
  abstract = {The DoubleTalk articulatory corpus was collected at the Edinburgh Speech Production Facility (ESPF) using two synchronized Carstens AG500 electromagnetic articulometers. The first release of the corpus comprises orthographic transcriptions aligned at phrasal level to EMA and audio data for each of 6 mixed-dialect speaker pairs. It is available from the ESPF online archive. A variety of tasks were used to elicit a wide range of speech styles, including monologue (a modified Comma Gets a Cure and spontaneous story-telling), structured spontaneous dialogue (Map Task and Diapix), a wordlist task, a memory-recall task, and a shadowing task. In this session we will demo the corpus with various examples.}
}
@article{Geng2013421,
  author = {Geng, Christian and Turk, Alice and Scobbie, James M. and Macmartin, Cedric and Hoole, Philip and Richmond, Korin and Wrench, Alan and Pouplier, Marianne and Bard, Ellen Gurman and Campbell, Ziggy and Dickie, Catherine and Dubourg, Eddie and Hardcastle, William and Kainada, Evia and King, Simon and Lickley, Robin and Nakai, Satsuki and Renals, Steve and White, Kevin and Wiegand, Ronny},
  doi = {10.1016/j.wocn.2013.07.002},
  title = {Recording speech articulation in dialogue: Evaluating a synchronized double electromagnetic articulography setup},
  url = {http://www.sciencedirect.com/science/article/pii/S0095447013000375},
  journal = {Journal of Phonetics},
  issn = {0095-4470},
  number = {6},
  pages = {421--431},
  volume = {41},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Geng2013421.pdf},
  abstract = {We demonstrate the workability of an experimental facility that is geared towards the acquisition of articulatory data from a variety of speech styles common in language use, by means of two synchronized electromagnetic articulography (EMA) devices. This approach synthesizes the advantages of real dialogue settings for speech research with a detailed description of the physiological reality of speech production. We describe the facility's method for acquiring synchronized audio streams of two speakers and the system that enables communication among control room technicians, experimenters and participants. Further, we demonstrate the feasibility of the approach by evaluating problems inherent to this specific setup: The first problem is the accuracy of temporal synchronization of the two EMA machines, the second is the severity of electromagnetic interference between the two machines. Our results suggest that the synchronization method used yields an accuracy of approximately 1 ms. Electromagnetic interference was derived from the complex-valued signal amplitudes. This dependent variable was analyzed as a function of the recording status -- i.e. on/off -- of the interfering machine's transmitters. The intermachine distance was varied between 1 m and 8.5 m. Results suggest that a distance of approximately 6.5 m is appropriate to achieve data quality comparable to that of single speaker recordings.}
}
@inproceedings{Lan14,
  author = {Lanchantin, P. and Gales, M. J. F. and King, S. and Yamagishi, J.},
  title = {Multiple-Average-Voice-based Speech Synthesis},
  booktitle = {Proc. ICASSP},
  year = {2014},
  abstract = {This paper describes a novel approach for the speaker adaptation of statistical parametric speech synthesis systems based on the interpolation of a set of average voice models (AVM). Recent results have shown that the quality/naturalness of adapted voices directly depends on the distance from the average voice model that the speaker adaptation starts from. This suggests the use of several AVMs trained on carefully chosen speaker clusters from which a more suitable AVM can be selected/interpolated during the adaptation. In the proposed approach, a Multiple-AVM is trained on clusters of speakers, iteratively re-assigned during the estimation process initialised according to metadata. In contrast with the cluster adaptive training (CAT) framework, the training stage is computationally less expensive as the amount of training data and clusters gets larger. Additionally, during adaptation, each AVM constituting the multiple-AVM are first adapted towards the speaker which suggests a better tuning to the individual speaker of the space in which the interpolation takes place. It is shown via experiments, ran on a corpus of British speakers with various regional accents, that the quality/naturalness of synthetic speech of adapted voices is significantly higher than when considering a single factor-independent AVM selected according to the target speaker characteristics.}
}
@inproceedings{watts-2014,
  author = {Watts, Oliver and Gangireddy, Siva and Yamagishi, Junichi and King, Simon and Renals, Steve and Stan, Adriana and Giurgiu, Mircea},
  title = {Neural Net Word Representations for Phrase-Break Prediction Without a Part of Speech Tagger},
  booktitle = {Proc. ICASSP},
  address = {Florence, Italy},
  abstract = {The use of shared projection neural nets of the sort used in language modelling is proposed as a way of sharing parameters between multiple text-to-speech system components. We experiment with pretraining the weights of such a shared projection on an auxiliary language modelling task and then apply the resulting word representations to the task of phrase-break prediction. Doing so allows us to build phrase-break predictors that rival conventional systems without any reliance on conventional knowledge-based resources such as part of speech taggers.},
  month = {May},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/watts-2014.pdf},
  pages = {2618--2622},
  categories = {Speech synthesis, TTS, unsupervised learning, neural net language modelling, multitask learning}
}
@inproceedings{Dall_Yamagishi_King_SpeechProsody2014,
  author = {Dall, Rasmus and Yamagishi, Junichi and King, Simon},
  title = {Rating Naturalness in Speech Synthesis: The Effect of Style and Expectation},
  booktitle = {Proc. Speech Prosody},
  year = {2014},
  month = {May},
  categories = {speech synthesis, evaluation, naturalness, MOS, spontaneous speech, read speech, TTS},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Dall_Yamagishi_King_SpeechProsody2014.pdf},
  abstract = {In this paper we present evidence that speech produced spontaneously in a conversation is considered more natural than read prompts. We also explore the relationship between participants' expectations of the speech style under evaluation and their actual ratings. In successive listening tests subjects rated the naturalness of either spontaneously produced, read aloud or written sentences, with instructions toward either conversational, reading or general naturalness. It was found that, when presented with spontaneous or read aloud speech, participants consistently rated spontaneous speech more natural - even when asked to rate naturalness in the reading case. Presented with only text, participants generally preferred transcriptions of spontaneous utterances, except when asked to evaluate naturalness in terms of reading aloud. This has implications for the application of MOS-scale naturalness ratings in Speech Synthesis, and potentially on the type of data suitable for use both in general TTS, dialogue systems and specifically in Conversational TTS, in which the goal is to reproduce speech as it is produced in a spontaneous conversational setting.}
}
@inproceedings{Dall_Tomalin_IS14,
  author = {Dall, Rasmus and Tomalin, Marcus and Wester, Mirjam and Byrne, William and King, Simon},
  title = {Investigating Automatic \& Human Filled Pause Insertion for Speech Synthesis},
  booktitle = {Proc. Interspeech},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Dall_Tomalin_Wester.pdf},
  abstract = {Filled pauses are pervasive in conversational speech and have been shown to serve several psychological and structural purposes. Despite this, they are seldom modelled overtly by state-of-the-art speech synthesis systems. This paper seeks to motivate the incorporation of filled pauses into speech synthesis systems by exploring their use in conversational speech, and by comparing the performance of several automatic systems inserting filled pauses into fluent text. Two initial experiments are described which seek to determine whether people's predicted insertion points are consistent with actual practice and/or with each other. The experiments also investigate whether there are `right' and `wrong' places to insert filled pauses. The results show good consistency between people's predictions of usage and their actual practice, as well as a perceptual preference for the `right' placement. The third experiment contrasts the performance of several automatic systems that insert filled pauses into fluent sentences. The best performance (determined by F-score) was achieved through the by-word interpolation of probabilities predicted by Recurrent Neural Network and 4gram Language Models. The results offer insights into the use and perception of filled pauses by humans, and how automatic systems can be used to predict their locations.},
  categories = {filled pause, HMM TTS, SVM, RNN}
}
@inproceedings{merritt2014investigating,
  author = {Merritt, Thomas and Raitio, Tuomo and King, Simon},
  title = {Investigating source and filter contributions, and their interaction, to statistical parametric speech synthesis},
  booktitle = {Proc. Interspeech},
  address = {Singapore},
  month = {September},
  year = {2014},
  pages = {1509--1513},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/merritt2014investigating.pdf},
  categories = {speech synthesis, hidden markov modelling, GlottHMM, source filter model, source filter interaction},
  abstract = {This paper presents an investigation of the separate perceptual degradations introduced by the modelling of source and filter features in statistical parametric speech synthesis. This is achieved using stimuli in which various permutations of natural, vocoded and modelled source and filter are combined, optionally with the addition of filter modifications (e.g. global variance or modulation spectrum scaling). We also examine the assumption of independence between source and filter parameters. Two complementary perceptual testing paradigms are adopted. In the first, we ask listeners to perform “same or different quality” judgements between pairs of stimuli from different configurations. In the second, we ask listeners to give an opinion score for individual stimuli. Combining the findings from these tests, we draw some conclusions regarding the relative contributions of source and filter to the currently rather limited naturalness of statistical parametric synthetic speech, and test whether current independence assumptions are justified.}
}
@inproceedings{henter2014measuring,
  author = {Henter, Gustav Eje and Merritt, Thomas and Shannon, Matt and Mayo, Catherine and King, Simon},
  title = {Measuring the perceptual effects of modelling assumptions in speech synthesis using stimuli constructed from repeated natural speech},
  booktitle = {Proc. Interspeech},
  volume = {15},
  pages = {1504--1508},
  month = {September},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/henter2014measuring.pdf},
  keywords = {speech synthesis, acoustic modelling, stream independence, diagonal covariance matrices, repeated speech},
  abstract = {Acoustic models used for statistical parametric speech synthesis typically incorporate many modelling assumptions. It is an open question to what extent these assumptions limit the naturalness of synthesised speech. To investigate this question, we recorded a speech corpus where each prompt was read aloud multiple times. By combining speech parameter trajectories extracted from different repetitions, we were able to quantify the perceptual effects of certain commonly used modelling assumptions. Subjective listening tests show that taking the source and filter parameters to be conditionally independent, or using diagonal covariance matrices, significantly limits the naturalness that can be achieved. Our experimental results also demonstrate the shortcomings of mean-based parameter generation.}
}
@inproceedings{dnnbmtl_ICASSP15,
  author = {Wu, Z. and Valentini-Botinhao, C. and Watts, O. and King, S.},
  title = {Deep neural networks employing multi-task learning and stacked bottleneck features for speech synthesis},
  booktitle = {Proc. ICASSP},
  address = {Brisbane, Australia},
  month = {April},
  pages = {4460--4464},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/dnnbmtl_ICASSP15.pdf},
  abstract = {Deep neural networks (DNNs) use a cascade of hidden representations to enable the learning of complex mappings from input to output features. They are able to learn the complex mapping from textbased linguistic features to speech acoustic features, and so perform text-to-speech synthesis. Recent results suggest that DNNs can produce more natural synthetic speech than conventional HMM-based statistical parametric systems. In this paper, we show that the hidden representation used within a DNN can be improved through the use of Multi-Task Learning, and that stacking multiple frames of hidden layer activations (stacked bottleneck features) also leads to improvements. Experimental results confirmed the effectiveness of the proposed methods, and in listening tests we find that stacked bottleneck features in particular offer a significant improvement over both a baseline DNN and a benchmark HMM system.}
}
@inproceedings{dnncost_IS15,
  author = {Valentini-Botinhao, C. and Wu, Z. and King, S.},
  title = {Towards minimum perceptual error training for {DNN}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  address = {Dresden, Germany},
  month = {September},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/dnncost_IS15.pdf},
  abstract = {We propose to use a perceptually-oriented domain to improve the quality of text-to-speech generated by deep neural networks (DNNs). We train a DNN that predicts the parameters required for speech reconstruction but whose cost function is calculated in another domain. In this paper, to represent this perceptual domain we extract an approximated version of the Spectro-Temporal Excitation Pattern that was originally proposed as part of a model of hearing speech in noise. We train DNNs that predict band aperiodicity, fundamental frequency and Mel cepstral coefficients and compare generated speech when the spectral cost function is defined in the Mel cepstral, warped log spectrum or perceptual domains. Objective results indicate that the perceptual domain system achieves the highest quality.}
}
@inproceedings{Merritt2015Attributing,
  author = {Merritt, Thomas and Latorre, Javier and King, Simon},
  title = {Attributing modelling errors in {HMM} synthesis by stepping gradually from natural to modelled speech},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2015},
  abstract = {Even the best statistical parametric speech synthesis systems do not achieve the naturalness of good unit selection. We investigated possible causes of this. By constructing speech signals that lie inbetween natural speech and the output from a complete HMM synthesis system, we investigated various effects of modelling. We manipulated the temporal smoothness and the variance of the spectral parameters to create stimuli, then presented these to listeners alongside natural and vocoded speech, as well as output from a full HMM-based text-to-speech system and from an idealised `pseudo-HMM'. All speech signals, except the natural waveform, were created using vocoders employing one of two popular spectral parameterisations: Mel-Cepstra or Mel-Line Spectral Pairs. Listeners made `same or different' pairwise judgements, from which we generated a perceptual map using Multidimensional Scaling. We draw conclusions about which aspects of HMM synthesis are limiting the naturalness of the synthetic speech.},
  month = {April},
  address = {Brisbane},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/merritt2015AttributingErrors.pdf},
  pages = {4220--4224},
  categories = {speech synthesis, hidden Markov modelling, vocoding}
}
@inproceedings{Merritt2015RichContext,
  author = {Merritt, Thomas and Yamagishi, Junichi and Wu, Zhizheng and Watts, Oliver and King, Simon},
  title = {Deep neural network context embeddings for model selection in rich-context {HMM} synthesis},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/merritt2015RichContext.pdf},
  abstract = {This paper introduces a novel form of parametric synthesis that uses context embeddings produced by the bottleneck layer of a deep neural network to guide the selection of models in a rich-context HMM-based synthesiser. Rich-context synthesis – in which Gaussian distributions estimated from single linguistic contexts seen in the training data are used for synthesis, rather than more conventional decision tree-tied models – was originally proposed to address over-smoothing due to averaging across contexts. Our previous investigations have confirmed experimentally that averaging across different contexts is indeed one of the largest factors contributing to the limited quality of statistical parametric speech synthesis. However, a possible weakness of the rich context approach as previously formulated is that a conventional tied model is still used to guide selection of Gaussians at synthesis time. Our proposed approach replaces this with context embeddings derived from a neural network.},
  categories = {speech synthesis, hidden Markov model, deep neural networks, rich context, embedding}
}
@inproceedings{kamper+etal_slt14,
  author = {Kamper, Herman and Jansen, Aren and King, Simon and Goldwater, S. J.},
  title = {Unsupervised lexical clustering of speech segments using fixed-dimensional acoustic embeddings},
  booktitle = {Proc. SLT},
  year = {2014},
  categories = {lexical clustering, unsupervised learning, fixed-dimensional embeddings, lexical discovery},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/kamper+jansen+king+goldwater_slt2014.pdf},
  abstract = {Unsupervised speech processing methods are essential for applications ranging from zero-resource speech technology to modelling child language acquisition. One challenging problem is discovering the word inventory of the language: the lexicon. Lexical clustering is the task of grouping unlabelled acoustic word tokens according to type. We propose a novel lexical clustering model: variable-length word segments are embedded in a fixed-dimensional acoustic space in which clustering is then performed. We evaluate several clustering algorithms and find that the best methods produce clusters with wide variation in sizes, as observed in natural language. The best probabilistic approach is an infinite Gaussian mixture model (IGMM), which automatically chooses the number of clusters. Performance is comparable to that of non-probabilistic Chinese Whispers and average-linkage hierarchical clustering. We conclude that IGMM clustering of fixed-dimensional embeddings holds promise as the lexical clustering component in unsupervised speech processing systems.}
}
@inproceedings{tomalin:diss:2015,
  author = {Tomalin, Marcus and Wester, Mirjam and Dall, Rasmus and Byrne, Bill and King, Simon},
  title = {A Lattice-based Approach to Automatic Filled Pause Insertion},
  booktitle = {Proc. DiSS 2015},
  year = {2015},
  month = {August},
  address = {Edinburgh},
  categories = {Disfluency, Filled Pauses, f-RNNLMs, Ngrams, Lattices},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/tomalin:diss:2015.pdf},
  abstract = {This paper describes a novel method for automatically inserting filled pauses (e.g., UM) into fluent texts. Although filled pauses are known to serve a wide range of psychological and structural functions in conversational speech, they have not traditionally been modelled overtly by state-of-the-art speech synthesis systems. However, several recent systems have started to model disfluencies specifically, and so there is an increasing need to create disfluent speech synthesis input by automatically inserting filled pauses into otherwise fluent text. The approach presented here interpolates Ngrams and Full-Output Recurrent Neural Network Language Models (f-RNNLMs) in a lattice-rescoring framework. It is shown that the interpolated system outperforms separate Ngram and f-RNNLM systems, where performance is analysed using the Precision, Recall, and F-score metrics.}
}
@inproceedings{wu2015minimum,
  author = {Wu, Zhizheng and King, Simon},
  title = {Minimum trajectory error training for deep neural networks, combined with stacked bottleneck features},
  booktitle = {Proc. Interspeech},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/is2015_trajectory_dnn.pdf}
}
@inproceedings{wu2015adaptation,
  author = {Wu, Zhizheng and Swietojanski, Pawel and Veaux, Christophe and Renals, Steve and King, Simon},
  title = {A study of speaker adaptation for {DNN}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/is2015_dnn_adaptation.pdf}
}
@inproceedings{wu2015mtl,
  author = {Wu, Zhizheng and Valentini-Botinhao, Cassia and Watts, Oliver and King, Simon},
  title = {Deep neural network employing multi-task learning and stacked bottleneck features for speech synthesis},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/icassp2015_dnn_tts.pdf}
}
@inproceedings{wu2015sas,
  author = {Wu, Zhizheng and Khodabakhsh, Ali and Demiroglu, Cenk and Yamagishi, Junichi and Saito, Daisuke and Toda, Tomoki and King, Simon},
  title = {{SAS}: A speaker verification spoofing database containing diverse attacks},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/icassp2015_sas.pdf}
}
@article{stan-2016,
  author = {Stan, Adriana and Mamiya, Yoshitaka and Yamagishi, Junichi and Bell, Peter and Watts, Oliver and Clark, Rob and King, Simon},
  doi = {10.1016/j.csl.2015.06.006},
  title = {{ALISA}: An automatic lightly supervised speech segmentation and alignment tool},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230815000650},
  journal = {Computer Speech and Language},
  issn = {0885-2308},
  pages = {116--133},
  volume = {35},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/stan-2016.pdf},
  abstract = {This paper describes the ALISA tool, which implements a lightly supervised method for sentence-level alignment of speech with imperfect transcripts. Its intended use is to enable the creation of new speech corpora from a multitude of resources in a language-independent fashion, thus avoiding the need to record or transcribe speech data. The method is designed so that it requires minimum user intervention and expert knowledge, and it is able to align data in languages which employ alphabetic scripts. It comprises a GMM-based voice activity detector and a highly constrained grapheme-based speech aligner. The method is evaluated objectively against a gold standard segmentation and transcription, as well as subjectively through building and testing speech synthesis systems from the retrieved data. Results show that on average, 70\% of the original data is correctly aligned, with a word error rate of less than 0.5\%. In one case, subjective listening tests show a statistically significant preference for voices built on the gold transcript, but this is small and in other tests, no statistically significant differences between the systems built from the fully supervised training data and the one which uses the proposed method are found.},
  categories = {Speech segmentation, speech and text alignment, grapheme acoustic models, lightly supervised system, imperfect transcripts}
}
@inproceedings{CassiaIOS14,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and King, Simon},
  title = {Intelligibility Enhancement of Speech in Noise},
  booktitle = {Proceedings of the Institute of Acoustics},
  address = {Birmingham, UK},
  number = {2},
  month = {October},
  volume = {36},
  pages = {96--103},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/CassiaIOS14.pdf},
  abstract = {To maintain communication success, humans change the way they speak and hear according to many factors, like the age, gender, native language and social relationship between talker and listener. Other factors are dictated by how communication takes place, such as environmental factors like an active competing speaker or limitations on the communication channel. As in natural interaction, we expect to communicate with and use synthetic voices that can also adapt to different listening scenarios and keep the level of intelligibility high. Research in speech technology needs to account for this to change the way we transmit, store and artificially generate speech accordingly.}
}
@inproceedings{astrinaki2013b,
  author = {Astrinaki, Maria and Moinet, Alexis and Yamagishi, Junichi and Richmond, Korin and Ling, Zhen-Hua and King, Simon and Dutoit, Thierry},
  title = {Mage - {HMM}-based speech synthesis reactively controlled by the articulators},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  pages = {243},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ICPHS0724.pdf},
  keywords = {speech synthesis, reactive, articulators},
  abstract = {In this paper, we present the recent progress in the MAGE project. MAGE is a library for realtime and interactive (reactive) parametric speech synthesis using hidden Markov models (HMMs). Here, it is broadened in order to support not only the standard acoustic features (spectrum and f0) to model and synthesize speech but also to combine acoustic and articulatory features, such as tongue, lips and jaw positions. Such an integration enables the user to have a straight forward and meaningful control space to intuitively modify the synthesized phones in real time only by configuring the position of the articulators.}
}
@inproceedings{richmond2016smooth,
  author = {Richmond, Korin and King, Simon},
  title = {Smooth Talking: Articulatory Join Costs for Unit Selection},
  abstract = {Join cost calculation has so far dealt exclusively with acoustic speech parameters, and a large number of distance metrics have previously been tested in conjunction with a wide variety of acoustic parameterisations. In contrast, we propose here to calculate distance in articulatory space. The motivation for this is simple: physical constraints mean a human talker's mouth cannot ``jump'' from one configuration to a different one, so smooth evolution of articulator positions would also seem desirable for a good candidate unit sequence. To test this, we built Festival Multisyn voices using a large articulatory-acoustic dataset. We first synthesised 460 TIMIT sentences and confirmed our articulatory join cost gives appreciably different unit sequences compared to the standard Multisyn acoustic join cost. A listening test (3 sets of 25 sentence pairs, 30 listeners) then showed our articulatory cost is preferred at a rate of 58\% compared to the standard Multisyn acoustic join cost.},
  month = {March},
  year = {2016},
  keywords = {speech synthesis, unit selection, electromagnetic articulography, join cost},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2016/richmond2016smooth.pdf},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  pages = {5150--5154}
}
@inproceedings{henter2016robust,
  author = {Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon},
  title = {Robust {TTS} duration modelling using {DNN}s},
  booktitle = {Proc. ICASSP},
  volume = {41},
  pages = {5130--5134},
  address = {Shanghai, China},
  month = {March},
  year = {2016},
  url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7472655},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/henter2016robust.pdf},
  keywords = {Speech synthesis, duration modelling, robust statistics},
  categories = {Speech synthesis, duration modelling, robust statistics},
  abstract = {Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and expressive speech are easy to acquire for training them. Unfortunately, found datasets are seldom subject to the quality-control that traditional synthesis methods expect. Common issues likely to affect duration modelling include transcription errors, reductions, filled pauses, and forced-alignment inaccuracies. To combat this, we propose to improve modelling and prediction of speech durations using methods from robust statistics, which are able to disregard ill-fitting points in the training material. We describe a robust fitting criterion based on the density power divergence (the beta-divergence) and a robust generation heuristic using mixture density networks (MDNs). Perceptual tests indicate that subjects prefer synthetic speech generated using robust models of duration over the baselines.}
}
@inproceedings{watts2016hmms,
  author = {Watts, Oliver and Henter, Gustav Eje and Merritt, Thomas and Wu, Zhizheng and King, Simon},
  title = {From {HMM}s to {DNN}s: where do the improvements come from?},
  booktitle = {Proc. ICASSP},
  volume = {41},
  pages = {5505--5509},
  address = {Shanghai, China},
  month = {March},
  year = {2016},
  url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7472730},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/watts2016hmms.pdf},
  keywords = {speech synthesis, hidden Markov model, decision tree, deep neural network},
  categories = {speech synthesis, hidden Markov model, decision tree, deep neural network},
  abstract = {Deep neural networks (DNNs) have recently been the focus of much text-to-speech research as a replacement for decision trees and hidden Markov models (HMMs) in statistical parametric synthesis systems. Performance improvements have been reported; however, the configuration of systems evaluated makes it impossible to judge how much of the improvement is due to the new machine learning methods, and how much is due to other novel aspects of the systems. Specifically, whereas the decision trees in HMM-based systems typically operate at the state-level, and separate trees are used to handle separate acoustic streams, most DNN-based systems are trained to make predictions simultaneously for all streams at the level of the acoustic frame. This paper isolates the influence of three factors (machine learning method; state vs. frame predictions; separate vs. combined stream predictions) by building a continuum of systems along which only a single factor is varied at a time. We find that replacing decision trees with DNNs and moving from state-level to frame-level predictions both significantly improve listeners' naturalness ratings of synthetic speech produced by the systems. No improvement is found to result from switching from separate-stream to combined-stream predictions.}
}
@inproceedings{ronanki2016dnn,
  author = {Ronanki, Srikanth and Reddy, Siva and Bollepalli, Bajibabu and King, Simon},
  title = {{DNN}-based Speech Synthesis for {Indian} Languages from {ASCII} text},
  booktitle = {Proc. 9th ISCA Speech Synthesis Workshop (SSW9)},
  address = {Sunnyvale, CA, USA},
  month = sep,
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ronanki2016ilsynth.pdf},
  abstract = {Text-to-Speech synthesis in Indian languages has a seen lot of progress over the decade partly due to the annual Blizzard challenges. These systems assume the text to be written in Devanagari or Dravidian scripts which are nearly phonemic orthography scripts. However, the most common form of computer interaction among Indians is ASCII written transliterated text. Such text is generally noisy with many variations in spelling for the same word. In this paper we evaluate three approaches to synthesize speech from such noisy ASCII text: a naive Uni-Grapheme approach, a Multi-Grapheme approach, and a supervised Grapheme-to-Phoneme (G2P) approach. These methods first convert the ASCII text to a phonetic script, and then learn a Deep Neural Network to synthesize speech from that. We train and test our models on Blizzard Challenge datasets that were transliterated to ASCII using crowdsourcing. Our experiments on Hindi, Tamil and Telugu demonstrate that our models generate speech of competetive quality from ASCII text compared to the speech synthesized from the native scripts. All the accompanying transliterated datasets are released for public access.},
  categories = {Indian Languages, Speech Synthesis, Deep Neural Networks, ASCII transliteration}
}
@inproceedings{ronanki2016template,
  author = {Ronanki, Srikanth and Henter, Gustav Eje and Wu, Zhizheng and King, Simon},
  title = {A template-based approach for speech synthesis intonation generation using {LSTM}s},
  booktitle = {Proc. Interspeech},
  year = {2016},
  month = {September},
  address = {San Francisco, USA},
  categories = {speech synthesis, intonation modelling, F0 templates, LSTM, CTC},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ronanki2016template.pdf},
  abstract = {The absence of convincing intonation makes current parametric speech synthesis systems sound dull and lifeless, even when trained on expressive speech data. Typically, these systems use regression techniques to predict the fundamental frequency (F0) frame-by-frame. This approach leads to overly-smooth pitch contours and fails to construct an appropriate prosodic structure across the full utterance. In order to capture and reproduce larger-scale pitch patterns, this paper proposes a template-based approach for automatic F0 generation, where per-syllable pitch-contour templates (from a small, automatically learned set) are predicted by a recurrent neural network (RNN). The use of syllable templates mitigates the over-smoothing problem and is able to reproduce pitch patterns observed in the data. The use of an RNN, paired with connectionist temporal classification (CTC), enables the prediction of structure in the pitch contour spanning the entire utterance. This novel F0 prediction system is used alongside separate LSTMs for predicting phone durations and the other acoustic features, to construct a complete text-to-speech system. We report the results of objective and subjective tests on an expressive speech corpus of children's audiobooks, and include comparisons to a conventional baseline that predicts F0 directly at the frame level.}
}
@inproceedings{merritt2016hybrid,
  author = {Merritt, Thomas and Clark, Robert A J and Wu, Zhizheng and Yamagishi, Junichi and King, Simon},
  title = {Deep neural network-guided unit selection synthesis},
  booktitle = {Proc. ICASSP},
  year = {2016},
  categories = {speech synthesis, hybrid synthesis, deep neural networks, embedding, unit selection},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Merritt_ICASSP2016.pdf},
  abstract = {Vocoding of speech is a standard part of statistical parametric speech synthesis systems. It imposes an upper bound of the naturalness that can possibly be achieved. Hybrid systems using parametric models to guide the selection of natural speech units can combine the benefits of robust statistical models with the high level of naturalness of waveform concatenation. Existing hybrid systems use Hidden Markov Models (HMMs) as the statistical model. This paper demonstrates that the superiority of Deep Neural Network (DNN) acoustic models over HMMs in conventional statistical parametric speech synthesis also carries over to hybrid synthesis. We compare various DNN and HMM hybrid configurations, guiding the selection of waveform units in either the vocoder parameter domain, or in the domain of embeddings (bottleneck features).}
}