The Centre for Speech Technology Research, The University of Edinburgh

Publications by Simon King

simonk.bib

@inproceedings{frankel07:AF_MLP,
  author = {Frankel, J. and Magimai-Doss, M. and King, S. and
                   Livescu, K. and Çetin, Ö.},
  title = {Articulatory Feature Classifiers Trained on 2000 hours
                   of Telephone Speech},
  booktitle = {Proc. Interspeech},
  address = {Antwerp, Belgium},
  abstract = {This paper is intended to advertise the public
                   availability of the articulatory feature (AF)
                   classification multi-layer perceptrons (MLPs) which
                   were used in the Johns Hopkins 2006 summer workshop. We
                   describe the design choices, data preparation, AF label
                   generation, and the training of MLPs for feature
                   classification on close to 2000 hours of telephone
                   speech. In addition, we present some analysis of the
                   MLPs in terms of classification accuracy and confusions
                   along with a brief summary of the results obtained
                   during the workshop using the MLPs. We invite
                   interested parties to make use of these MLPs.},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/frankel_AF-MLP.pdf},
  year = 2007
}
@misc{turk2010,
  author = {Turk, Alice and Scobbie, James and Geng, Christian and
                   Campbell, Barry and Dickie, Catherine and Dubourg,
                   Eddie and Bard, Ellen Gurman and Hardcastle, William
                   and Hartinger, Mariam and King, Simon and Lickley,
                   Robin and Macmartin, Cedric and Nakai, Satsuki and
                   Renals, Steve and Richmond, Korin and Schaeffler, Sonja
                   and White, Kevin and Wiegand, Ronny and Wrench, Alan},
  title = {An {E}dinburgh speech production facility},
  howpublished = {Poster presented at the 12th Conference on Laboratory
                   Phonology, Albuquerque, New Mexico.},
  month = {July},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ESPF.pdf},
  year = 2010
}
@article{Wang_JCST2012,
  author = {Dong Wang and Javier Tejedor and Simon King and Joe
                   Frankel},
  title = {Term-dependent Confidence Normalization for
                   Out-of-Vocabulary Spoken Term Detection},
  journal = {Journal of Computer Science and Technology},
  volume = {27},
  number = {2},
  abstract = {Spoken Term Detection (STD) is a fundamental component
                   of spoken information retrieval systems. A key task of
                   an STD system is to determine reliable detections and
                   reject false alarms based on certain confidence
                   measures. The detection posterior probability, which is
                   often computed from lattices, is a widely used
                   confidence measure. However, a potential problem of
                   this confidence measure is that the confidence scores
                   of detections of all search terms are treated
                   uniformly, regardless of how much they may differ in
                   terms of phonetic or linguistic properties. This
                   problem is particularly evident for out-of-vocabulary
                   (OOV) terms which tend to exhibit high intra-term
                   diversity. To address the discrepancy on confidence
                   levels that the same confidence score may convey for
                   different terms, a term-dependent decision strategy is
                   desirable – for example, the term-specific threshold
                   (TST) approach. In this work, we propose a
                   term-dependent normalisation technique which
                   compensates for term diversity on confidence
                   estimation. Particularly, we propose a linear bias
                   compensation and a discriminative compensation to deal
                   with the bias problem that is inherent in lattice-based
                   confidence measuring from which the TST approach
                   suffers. We tested the proposed technique on speech
                   data from the multi-party meeting domain with two
                   state-of-the-art STD systems based on phonemes and
                   words respectively. The experimental results
                   demonstrate that the confidence normalisation approach
                   leads to a significant performance improvement in STD,
                   particularly for OOV terms with phoneme-based systems.},
  doi = {10.1007/s11390-012-1228-x},
  year = 2012
}
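
The term-dependent linear bias compensation described in the abstract above can be sketched roughly as follows, assuming the per-term bias is predicted from a few placeholder term properties (e.g. phone count) fitted on development data; the paper's actual feature set and its discriminative variant are not reproduced here.

    import numpy as np

    def fit_bias_model(term_feats, conf_errors):
        # Least-squares fit of a linear model predicting per-term confidence
        # bias from term properties (placeholder features, one row per term).
        X = np.hstack([term_feats, np.ones((term_feats.shape[0], 1))])
        coeffs, *_ = np.linalg.lstsq(X, conf_errors, rcond=None)
        return coeffs[:-1], coeffs[-1]            # weights, intercept

    def normalise_confidence(raw_conf, feats, w, b):
        # Compensated confidence = raw lattice confidence minus predicted bias.
        return raw_conf - (np.dot(feats, w) + b)
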
@inproceedings{bell_king_shrinkage_is2008,
  author = {Bell, Peter and King, Simon},
  title = {A Shrinkage Estimator for Speech Recognition with Full
                   Covariance {HMM}s},
  booktitle = {Proc. Interspeech},
  address = {Brisbane, Australia},
  note = {Shortlisted for best student paper award.},
  abstract = {We consider the problem of parameter estimation in
                   full-covariance Gaussian mixture systems for automatic
                   speech recognition. Due to the high dimensionality of
                   the acoustic feature vector, the standard sample
                   covariance matrix has a high variance and is often
                   poorly-conditioned when the amount of training data is
                   limited. We explain how the use of a shrinkage
                   estimator can solve these problems, and derive a
                   formula for the optimal shrinkage intensity. We present
                   results of experiments on a phone recognition task,
                   showing that the estimator gives a performance
                    improvement over a standard full-covariance system.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/shrinkage_is2008.pdf},
  year = 2008
}
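
A minimal sketch of a shrinkage covariance estimator of the generic form (1 - lambda) * S + lambda * diag(S); the optimal shrinkage intensity derived in the paper is not reproduced, so lambda is left as a user-supplied value.

    import numpy as np

    def shrinkage_covariance(frames, lam):
        # frames: (n_frames, dim) acoustic feature vectors assigned to one Gaussian.
        # Shrink the sample covariance towards its own diagonal; lam in [0, 1].
        S = np.cov(frames, rowvar=False)
        return (1.0 - lam) * S + lam * np.diag(np.diag(S))
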
@inproceedings{shig042,
  author = {Yoshinori Shiga and Simon King},
  title = {Source-Filter Separation for Articulation-to-Speech
                   Synthesis},
  booktitle = {Proc. ICSLP},
  address = {Jeju, Korea},
  abstract = {In this paper we examine a method for separating out
                   the vocal-tract filter response from the voice source
                   characteristic using a large articulatory database. The
                   method realises such separation for voiced speech using
                   an iterative approximation procedure under the
                   assumption that the speech production process is a
                   linear system composed of a voice source and a
                   vocal-tract filter, and that each of the components is
                   controlled independently by different sets of factors.
                   Experimental results show that the spectral variation
                   is evidently influenced by the fundamental frequency or
                   the power of speech, and that the tendency of the
                   variation may be related closely to speaker identity.
                   The method enables independent control over the voice
                   source characteristic in our articulation-to-speech
                   synthesis.},
  categories = {artic, lbg, clustering, mocha, source-filter,
                   edinburgh},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.ps},
  year = 2004
}
@inproceedings{jyamagis07:avss2006,
  author = {Junichi Yamagishi and Takao Kobayashi and Steve Renals
                   and Simon King and Heiga Zen and Tomoki Toda and
                   Keiichi Tokuda },
  title = {Improved Average-Voice-based Speech Synthesis Using
                   Gender-Mixed Modeling and a Parameter Generation
                   Algorithm Considering {GV}},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  abstract = {For constructing a speech synthesis system which can
                   achieve diverse voices, we have been developing a
                   speaker independent approach of HMM-based speech
                   synthesis in which statistical average voice models are
                   adapted to a target speaker using a small amount of
                   speech data. In this paper, we incorporate a
                   high-quality speech vocoding method STRAIGHT and a
                   parameter generation algorithm with global variance
                   into the system for improving quality of synthetic
                   speech. Furthermore, we introduce a feature-space
                   speaker adaptive training algorithm and a gender mixed
                   modeling technique for conducting further normalization
                   of the average voice model. We build an English
                   text-to-speech system using these techniques and show
                   the performance of the system.},
  categories = {HMM, speech synthesis, speaker adaptation, HTS},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6-yamagishi.pdf},
  year = 2007
}
@inproceedings{king:portele:hoefer:eurospeech1997,
  author = {Simon King and Thomas Portele and Florian H\"ofer},
  title = {Speech synthesis using non-uniform units in the
                   {V}erbmobil project},
  booktitle = {Proc. {E}urospeech 97},
  volume = 2,
  pages = {569--572},
  address = {Rhodes, Greece},
  abstract = {We describe a concatenative speech synthesiser for
                   British English which uses the HADIFIX inventory
                   structure originally developed for German by Portele.
                   An inventory of non-uniform units was investigated with
                   the aim of improving segmental quality compared to
                   diphones. A combination of soft (diphone) and hard
                   concatenation was used, which allowed a dramatic
                   reduction in inventory size. We also present a unit
                   selection algorithm which selects an optimum sequence
                   of units from this inventory for a given phoneme
                   sequence. The work described is part of the
                   concept-to-speech synthesiser for the language and
                   speech project Verbmobil which is funded by the German
                   Ministry of Science (BMBF).},
  categories = {},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/King_Portele_Hoefer_eurospeech1997.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/King_Portele_Hoefer_eurospeech1997.ps},
  year = 1997
}
@inproceedings{isard:king:taylor:kowtko:snowbird95,
  author = {Stephen Isard and Simon King and Paul A. Taylor and
                   Jacqueline Kowtko},
  title = {Prosodic Information in a Speech Recognition System
                   intended for Dialogue},
  booktitle = {IEEE Workshop on Speech Recognition},
  address = {Snowbird, Utah},
  abstract = {We report on an automatic speech recognition system
                   intended for use in dialogue, whose original aspect is
                   its use of prosodic information for two different
                   purposes. The first is to improve the word level
                   accuracy of the system. The second is to constrain the
                   language model applied to a given utterance by taking
                   into account the way that dialogue context and
                   intonational tune interact to limit the possibilities
                   for what an utterance might be.},
  categories = {},
  year = 1995
}
@inproceedings{hengluIS2012,
  abstract = {Speech units are highly context-dependent, so taking
                   contextual features into account is essential for
                   speech modelling. Context is employed in HMM-based
                   Text-to-Speech speech synthesis systems via
                   context-dependent phone models. A very wide context is
                   taken into account, represented by a large set of
                   contextual factors. However, most of these factors
                   probably have no significant influence on the speech,
                   most of the time. To discover which combinations of
                   features should be taken into account, decision
                   tree-based context clustering is used. But the space of
                   context-dependent models is vast, and the number of
                   contexts seen in the training data is only a tiny
                   fraction of this space, so the task of the decision
                   tree is very hard: to generalise from observations of a
                   tiny fraction of the space to the rest of the space,
                   whilst ignoring uninformative or redundant context
                   features. The structure of the context feature space
                   has not been systematically studied for speech
                   synthesis. In this paper we discover a dependency
                   structure by learning a Bayesian Network over the joint
                   distribution of the features and the speech. We
                   demonstrate that it is possible to discard the majority
                   of context features with minimal impact on quality,
                   measured by a perceptual test.},
  address = {Portland, Oregon, USA},
  author = {Heng Lu and Simon King},
  booktitle = {Proc. Interspeech},
  categories = {HMM-based speech synthesis, Bayesian Networks, context
                   information},
  keywords = {HMM-based speech synthesis, Bayesian Networks, context
                   information},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/HengLuSimonKing.pdf},
  title = {Using {Bayesian} Networks to find relevant context
                   features for {HMM}-based speech synthesis},
  year = 2012
}
@inproceedings{wang_acmsccs2010,
  author = {Dong Wang and Simon King and Nick Evans and Raphael
                   Troncy},
  title = {Direct Posterior Confidence For Out-of-Vocabulary
                   Spoken Term Detection},
  booktitle = {Proc. ACM Multimedia 2010 Searching Spontaneous
                   Conversational Speech Workshop},
  abstract = {Spoken term detection (STD) is a fundamental task in
                   spoken information retrieval. Compared to conventional
                   speech transcription and keyword spotting, STD is an
                   open-vocabulary task and is necessarily required to
                   address out-of-vocabulary (OOV) terms. Approaches based
                   on subword units, e.g. phonemes, are widely used to
                   solve the OOV issue; however, performance on OOV terms
                   is still significantly inferior to that for
                   in-vocabulary (INV) terms. The performance degradation
                   on OOV terms can be attributed to a multitude of
                   factors. A particular factor we address in this paper
                   is that the acoustic and language models used for
                   speech transcribing are highly vulnerable to OOV terms,
                   which leads to unreliable confidence measures and
                   error-prone detections. A direct posterior confidence
                   measure that is derived from discriminative models has
                   been proposed for STD. In this paper, we utilize this
                   technique to tackle the weakness of OOV terms in
                   confidence estimation. Neither acoustic models nor
                   language models being included in the computation, the
                   new confidence avoids the weak modeling problem with
                   OOV terms. Our experiments, set up on multi-party
                   meeting speech which is highly spontaneous and
                   conversational, demonstrate that the proposed technique
                   improves STD performance on OOV terms significantly;
                   when combined with conventional lattice-based
                   confidence, a significant improvement in performance is
                   obtained on both INVs and OOVs. Furthermore, the new
                   confidence measure technique can be combined together
                   with other advanced techniques for OOV treatment, such
                   as stochastic pronunciation modeling and term-dependent
                   confidence discrimination, which leads to an integrated
                   solution for OOV STD with greatly improved performance.},
  categories = {confidence estimation, spoken term detection, speech
                   recognition},
  doi = {10.1145/1878101.1878107},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang_acmsccs2010.pdf},
  year = 2010
}
@inproceedings{vepa-king-isca04,
  author = {Vepa, J. and King, S.},
  title = {Subjective evaluation of join cost and smoothing
                   methods},
  booktitle = {Proc. 5th {ISCA} speech synthesis workshop},
  address = {Pittsburgh, USA},
  abstract = {In our previous papers, we have proposed join cost
                   functions derived from spectral distances, which have
                   good correlations with perceptual scores obtained for a
                   range of concatenation discontinuities. To further
                   validate their ability to predict concatenation
                   discontinuities, we have chosen the best three spectral
                   distances and evaluated them subjectively in a
                   listening test. The units for synthesis stimuli are
                   obtained from a state-of-the-art unit selection
                   text-to-speech system: `rVoice' from Rhetorical Systems
                   Ltd. We also compared three different smoothing methods
                   in this listening test. In this paper, we report
                    listeners' preferences for each join cost in
                   combination with each smoothing method.},
  categories = {join cost, Kalman filter, smoothing, evaluation,
                   rVoice, edinburgh},
  month = jun,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/vepa_tts04.pdf},
  year = 2004
}
@inproceedings{salomon:king:osborne:icslp2002,
  author = {Jesper Salomon and Simon King and Miles Osborne},
  title = {Framewise phone classification using support vector
                   machines},
  booktitle = {Proceedings of the International Conference on Spoken
                    Language Processing},
  address = {Denver},
  abstract = {We describe the use of Support Vector Machines for
                   phonetic classification on the TIMIT corpus. Unlike
                   previous work, in which entire phonemes are classified,
                   our system operates in a \textit{framewise} manner and
                   is intended for use as the front-end of a hybrid system
                   similar to ABBOT. We therefore avoid the problems of
                   classifying variable-length vectors. Our frame-level
                   phone classification accuracy on the complete TIMIT
                   test set is competitive with other results from the
                   literature. In addition, we address the serious problem
                   of \textit{scaling} Support Vector Machines by using
                   the Kernel Fisher Discriminant.},
  categories = {},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Salomon_King_Osborne_icslp2002.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Salomon_King_Osborne_icslp2002.ps},
  year = 2002
}
@inproceedings{wang:frankel:tejedor:king:icassp2008,
  author = {Dong Wang and Joe Frankel and Javier Tejedor and Simon
                   King},
  title = {A comparison of phone and grapheme-based spoken term
                   detection},
  booktitle = {Proc. ICASSP},
  pages = {4969--4972 },
  abstract = {We propose grapheme-based sub-word units for spoken
                   term detection (STD). Compared to phones, graphemes
                   have a number of potential advantages. For
                    out-of-vocabulary search terms, phone-based approaches
                   must generate a pronunciation using letter-to-sound
                   rules. Using graphemes obviates this potentially
                   error-prone hard decision, shifting pronunciation
                   modelling into the statistical models describing the
                   observation space. In addition, long-span grapheme
                   language models can be trained directly from large text
                   corpora. We present experiments on Spanish and English
                   data, comparing phone and grapheme-based STD. For
                   Spanish, where phone and grapheme-based systems give
                   similar transcription word error rates (WERs),
                    grapheme-based STD significantly outperforms a
                    phone-based approach. The converse is found for English,
                   where the phone-based system outperforms a grapheme
                   approach. However, we present additional analysis which
                   suggests that phone-based STD performance levels may be
                   achieved by a grapheme-based approach despite lower
                   transcription accuracy, and that the two approaches may
                   usefully be combined. We propose a number of directions
                   for future development of these ideas, and suggest that
                   if grapheme-based STD can match phone-based
                   performance, the inherent flexibility in dealing with
                   out-of-vocabulary terms makes this a desirable
                   approach.},
  doi = {10.1109/ICASSP.2008.4518773},
  month = {March-April},
  year = 2008
}
@inproceedings{vepa-king-taylor_icslp02,
  author = {Vepa, J. and King, S. and Taylor, P.},
  title = {Objective Distance Measures for Spectral
                   Discontinuities in Concatenative Speech Synthesis},
  booktitle = {Proc. {ICSLP}},
  address = {Denver, USA},
  abstract = {In unit selection based concatenative speech systems,
                   `join cost', which measures how well two units can be
                   joined together, is one of the main criteria for
                   selecting appropriate units from the inventory. The
                   ideal join cost will measure `perceived' discontinuity,
                   based on easily measurable spectral properties of the
                   units being joined, in order to ensure smooth and
                   natural-sounding synthetic speech. In this paper we
                   report a perceptual experiment conducted to measure the
                   correlation between `subjective' human perception and
                   various `objective' spectrally-based measures proposed
                   in the literature. Our experiments used a
                   state-of-the-art unit-selection text-to-speech system:
                   `rVoice' from Rhetorical Systems Ltd.},
  categories = {join cost, distance measures, MCA, rVoice, edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/vepa_icslp02.pdf},
  year = 2002
}
@incollection{king:gold_and_morgan_chapter2009,
  author = {Simon King},
  title = {Speech Synthesis},
  booktitle = {Speech and Audio Signal Processing},
  publisher = {Wiley},
  editor = {Nelson Morgan and Dan Ellis},
  abstract = {No abstract (this is a book chapter)},
  categories = {speech synthesis},
  year = 2010
}
@inproceedings{CassiaICASSP12,
  author = {Valentini-Botinhao, C. and Maia, R. and Yamagishi, J.
                   and King, S. and Zen, H.},
  title = {{Cepstral analysis based on the Glimpse proportion
                   measure for improving the intelligibility of
                   {HMM}-based synthetic speech in noise}},
  booktitle = {Proc. ICASSP},
  pages = {3997--4000},
  address = {Kyoto, Japan},
  abstract = {In this paper we introduce a new cepstral coefficient
                   extraction method based on an intelligibility measure
                   for speech in noise, the Glimpse Proportion measure.
                   This new method aims to increase the intelligibility of
                   speech in noise by modifying the clean speech, and has
                   applications in scenarios such as public announcement
                   and car navigation systems. We first explain how the
                   Glimpse Proportion measure operates and further show
                   how we approximated it to integrate it into an existing
                   spectral envelope parameter extraction method commonly
                   used in the HMM-based speech synthesis framework. We
                   then demonstrate how this new method changes the
                   modelled spectrum according to the characteristics of
                   the noise and show results for a listening test with
                   vocoded and HMM-based synthetic speech. The test
                   indicates that the proposed method can significantly
                   improve intelligibility of synthetic speech in speech
                   shaped noise.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement, speech analysis},
  doi = {10.1109/ICASSP.2012.6288794},
  month = {March},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_ICASSP12.pdf},
  year = 2012
}
@article{roberto:specom2010,
  author = {R. Barra-Chicote and J. Yamagishi and S. King and
                    J. M. Montero and J. Macias-Guarasa},
  title = {Analysis of Statistical Parametric and Unit-Selection
                   Speech Synthesis Systems Applied to Emotional Speech},
  journal = {Speech Communication},
  volume = {52},
  number = {5},
  pages = {394--404},
  abstract = {We have applied two state-of-the-art speech synthesis
                   techniques (unit selection and HMM-based synthesis) to
                   the synthesis of emotional speech. A series of
                   carefully designed perceptual tests to evaluate speech
                   quality, emotion identification rates and emotional
                   strength were used for the six emotions which we
                   recorded -- happiness, sadness, anger, surprise, fear,
                   disgust. For the HMM-based method, we evaluated
                   spectral and source components separately and
                   identified which components contribute to which
                   emotion. Our analysis shows that, although the HMM
                   method produces significantly better neutral speech,
                   the two methods produce emotional speech of similar
                   quality, except for emotions having context-dependent
                   prosodic patterns. Whilst synthetic speech produced
                   using the unit selection method has better emotional
                   strength scores than the HMM-based method, the
                   HMM-based method has the ability to manipulate the
                   emotional strength. For emotions that are characterized
                   by both spectral and prosodic components, synthetic
                   speech using unit selection methods was more accurately
                   identified by listeners. For emotions mainly
                   characterized by prosodic components, HMM-based
                   synthetic speech was more accurately identified. This
                   finding differs from previous results regarding
                   listener judgements of speaker similarity for neutral
                   speech. We conclude that unit selection methods require
                   improvements to prosodic modeling and that HMM-based
                   methods require improvements to spectral modeling for
                   emotional speech. Certain emotions cannot be reproduced
                   well by either method.},
  doi = {10.1016/j.specom.2009.12.007},
  keywords = {Emotional speech synthesis; HMM-based synthesis; Unit
                   selection},
  month = may,
  year = 2010
}
@article{Taylor_1998_b,
  author = {Paul A. Taylor and S. King and S. D. Isard and H.
                   Wright},
  title = {Intonation and Dialogue Context as Constraints for
                   Speech Recognition},
  journal = {Language and Speech},
  volume = 41,
  number = {3},
  pages = {493--512},
  categories = {asr, intonation, dialogue, lm, id4s},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/Taylor_1998_b.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/Taylor_1998_b.ps},
  year = 1998
}
@inproceedings{5947571,
  author = {Andraszewicz, S. and Yamagishi, J. and King, S.},
  title = {Vocal attractiveness of statistical speech
                   synthesisers},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
                   IEEE International Conference on},
  pages = {5368--5371},
  abstract = {Our previous analysis of speaker-adaptive HMM-based
                   speech synthesis methods suggested that there are two
                   possible reasons why average voices can obtain higher
                   subjective scores than any individual adapted voice: 1)
                   model adaptation degrades speech quality proportionally
                   to the distance 'moved' by the transforms, and 2)
                   psychoacoustic effects relating to the attractiveness
                   of the voice. This paper is a follow-on from that
                   analysis and aims to separate these effects out. Our
                   latest perceptual experiments focus on attractiveness,
                   using average voices and speaker-dependent voices
                    without model transformation, and show that using
                    several speakers to create a voice improves smoothness
                    (measured by Harmonics-to-Noise Ratio), reduces
                    distance from the average voice in the log F0-F1
                   space of the final voice and hence makes it more
                   attractive at the segmental level. However, this is
                   weakened or overridden at supra-segmental or sentence
                   levels.},
  doi = {10.1109/ICASSP.2011.5947571},
  issn = {1520-6149},
  keywords = {speaker-adaptive HMM-based speech synthesis
                   methods;speaker-dependent voices;statistical speech
                   synthesisers;vocal attractiveness;hidden Markov
                   models;speaker recognition;speech synthesis;},
  month = may,
  year = 2011
}
@article{frankel07:factoring,
  author = {Frankel, J. and King, S.},
  title = {Factoring {G}aussian Precision Matrices for Linear
                   Dynamic Models},
  journal = {Pattern Recognition Letters},
  volume = {28},
  number = {16},
  pages = {2264--2272},
  abstract = {The linear dynamic model (LDM), also known as the
                   Kalman filter model, has been the subject of research
                   in the engineering, control, and more recently, machine
                   learning and speech technology communities. The
                   Gaussian noise processes are usually assumed to have
                   diagonal, or occasionally full, covariance matrices. A
                   number of recent papers have considered modelling the
                   precision rather than covariance matrix of a Gaussian
                   distribution, and this work applies such ideas to the
                   LDM. A Gaussian precision matrix P can be factored into
                    the form $P = U^{T}SU$ where U is a transform and S a
                   diagonal matrix. By varying the form of U, the
                   covariance can be specified as being diagonal or full,
                   or used to model a given set of spatial dependencies.
                   Furthermore, the transform and scaling components can
                   be shared between models, allowing richer distributions
                   with only marginally more parameters than required to
                   specify diagonal covariances. The method described in
                   this paper allows the construction of models with an
                   appropriate number of parameters for the amount of
                   available training data. We provide illustrative
                   experimental results on synthetic and real speech data
                   in which models with factored precision matrices and
                   automatically-selected numbers of parameters are as
                   good as or better than models with diagonal covariances
                   on small data sets and as good as models with full
                   covariance matrices on larger data sets.},
  categories = {LDM},
  doi = {10.1016/j.patrec.2007.07.008},
  month = {December},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_LDM_covar.pdf},
  year = 2007
}
@inproceedings{king:stephenson:isard:taylor:strachan:icslp1998,
  author = {Simon King and Todd Stephenson and Stephen Isard and
                   Paul Taylor and Alex Strachan},
  title = {Speech Recognition via Phonetically Featured Syllables},
  booktitle = {Proc. {ICSLP} '98},
  pages = {1031--1034},
  address = {Sydney, Australia},
  abstract = {We describe a speech recogniser which uses a speech
                   production-motivated phonetic-feature description of
                   speech. We argue that this is a natural way to describe
                   the speech signal and offers an efficient intermediate
                   parameterisation for use in speech recognition. We also
                   propose to model this description at the syllable
                   rather than phone level. The ultimate goal of this work
                   is to generate syllable models whose parameters
                   explicitly describe the trajectories of the phonetic
                   features of the syllable. We hope to move away from
                   Hidden Markov Models (HMMs) of context-dependent phone
                   units. As a step towards this, we present a preliminary
                   system which consists of two parts: recognition of the
                   phonetic features from the speech signal using a neural
                   network; and decoding of the feature-based description
                   into phonemes using HMMs.},
  categories = {asr},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/King_Stephenson_Isard_Taylor_Strachan_icslp1998.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/King_Stephenson_Isard_Taylor_Strachan_icslp1998.ps},
  year = 1998
}
@inproceedings{cetin07:crosslingual,
  author = {Çetin, Ö. and Magimai-Doss, M. and Kantor, A. and
                   King, S. and Bartels, C. and Frankel, J. and Livescu,
                   K.},
  title = {Monolingual and crosslingual comparison of tandem
                   features derived from articulatory and phone {MLP}s},
  booktitle = {Proc. ASRU},
  address = {Kyoto},
  organization = {IEEE},
  abstract = {In recent years, the features derived from posteriors
                   of a multilayer perceptron (MLP), known as tandem
                   features, have proven to be very effective for
                   automatic speech recognition. Most tandem features to
                   date have relied on MLPs trained for phone
                   classification. We recently showed on a relatively
                   small data set that MLPs trained for articulatory
                   feature classification can be equally effective. In
                   this paper, we provide a similar comparison using MLPs
                   trained on a much larger data set - 2000 hours of
                   English conversational telephone speech. We also
                    explore how portable phone- and articulatory
                    feature-based tandem features are in an entirely different
                   language - Mandarin - without any retraining. We find
                   that while phone-based features perform slightly better
                   in the matched-language condition, they perform
                   significantly better in the cross-language condition.
                   Yet, in the cross-language condition, neither approach
                   is as effective as the tandem features extracted from
                   an MLP trained on a relatively small amount of
                   in-domain data. Beyond feature concatenation, we also
                   explore novel observation modelling schemes that allow
                   for greater flexibility in combining the tandem and
                   standard features at hidden Markov model (HMM) outputs.},
  month = {December},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Cetin_etal_ASRU2007.pdf},
  year = 2007
}
@inproceedings{frankel01:alternative,
  author = {Frankel, J. and King, S.},
  title = {Speech recognition in the articulatory domain:
                   investigating an alternative to acoustic {HMM}s},
  booktitle = {Proc. Workshop on Innovations in Speech Processing},
  abstract = {We describe a speech recognition system which uses a
                   combination of acoustic and articulatory features as
                   input. Linear dynamic models capture the trajectories
                   which characterize each segment type. We describe
                   classification and recognition tasks for systems based
                   on acoustic data in conjunction with both real and
                   automatically recovered articulatory parameters.},
  categories = {am,artic,asr,ldm,mocha,edinburgh},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_WISP2001.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_WISP2001.ps},
  year = 2001
}
@article{5510125,
  author = {Wang, D. and King, S. and Frankel, J.},
  title = {Stochastic Pronunciation Modelling for
                   Out-of-Vocabulary Spoken Term Detection},
  journal = {Audio, Speech, and Language Processing, IEEE
                   Transactions on},
  volume = {PP},
  number = {99},
  abstract = {Spoken term detection (STD) is the name given to the
                   task of searching large amounts of audio for
                   occurrences of spoken terms, which are typically single
                   words or short phrases. One reason that STD is a hard
                   task is that search terms tend to contain a
                   disproportionate number of out-of-vocabulary (OOV)
                   words. The most common approach to STD uses subword
                   units. This, in conjunction with some method for
                   predicting pronunciations of OOVs from their written
                   form, enables the detection of OOV terms but
                   performance is considerably worse than for
                   in-vocabulary terms. This performance differential can
                   be largely attributed to the special properties of
                   OOVs. One such property is the high degree of
                   uncertainty in the pronunciation of OOVs. We present a
                   stochastic pronunciation model (SPM) which explicitly
                   deals with this uncertainty. The key insight is to
                   search for all possible pronunciations when detecting
                   an OOV term, explicitly capturing the uncertainty in
                   pronunciation. This requires a probabilistic model of
                   pronunciation, able to estimate a distribution over all
                   possible pronunciations. We use a joint-multigram model
                   (JMM) for this and compare the JMM-based SPM with the
                   conventional soft match approach. Experiments using
                   speech from the meetings domain demonstrate that the
                   SPM performs better than soft match in most operating
                   regions, especially at low false alarm probabilities.
                   Furthermore, SPM and soft match are found to be
                   complementary: their combination provides further
                   performance gains.},
  categories = {confidence estimation, spoken term detection, speech
                   recognition, OOVs},
  doi = {10.1109/TASL.2010.2058800},
  issn = {1558-7916},
  month = jul,
  year = 2010
}
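
A toy sketch of the stochastic-pronunciation idea summarised above: the detection score for an OOV term is accumulated over all hypothesised pronunciations, weighted by their probabilities. The pronunciation distribution and per-pronunciation detection scores below are placeholders, not the joint-multigram model used in the paper.

    def spm_score(pron_dist, detection_scores):
        # pron_dist: list of (phone_sequence, probability) pairs for one term.
        # detection_scores: dict mapping a phone sequence (as a tuple) to the
        # confidence of its detection in the search index (absent = 0.0).
        return sum(prob * detection_scores.get(tuple(phones), 0.0)
                   for phones, prob in pron_dist)

    # Hypothetical OOV term with two candidate pronunciations.
    prons = [(["g", "ow", "g", "ax", "l"], 0.7),
             (["g", "uw", "g", "ax", "l"], 0.3)]
    hits = {("g", "ow", "g", "ax", "l"): 0.8}
    print(spm_score(prons, hits))               # 0.7 * 0.8
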
@inproceedings{vepa_king_icslp2004,
  author = {Jithendra Vepa and Simon King},
  title = {Subjective Evaluation Of Join Cost Functions Used In
                   Unit Selection Speech Synthesis},
  booktitle = {Proc. 8th International Conference on Spoken Language
                   Processing (ICSLP)},
  address = {Jeju, Korea},
  abstract = {In our previous papers, we have proposed join cost
                   functions derived from spectral distances, which have
                   good correlations with perceptual scores obtained for a
                   range of concatenation discontinuities. To further
                   validate their ability to predict concatenation
                   discontinuities, we have chosen the best three spectral
                   distances and evaluated them subjectively in a
                   listening test. The unit sequences for synthesis
                   stimuli are obtained from a state-of-the-art unit
                    selection text-to-speech system: rVoice from Rhetorical
                    Systems Ltd. In this paper, we report listeners'
                    preferences for each of the three join cost functions.},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/vepa_king_icslp2004.pdf},
  year = 2004
}
@inproceedings{Ayletetal09,
  author = {Matthew P. Aylett and Simon King and Junichi Yamagishi},
  title = {Speech Synthesis Without a Phone Inventory},
  booktitle = {Proc. Interspeech},
  pages = {2087--2090},
  abstract = { In speech synthesis the unit inventory is decided
                   using phonological and phonetic expertise. This process
                   is resource intensive and potentially sub-optimal. In
                   this paper we investigate how acoustic clustering,
                   together with lexicon constraints, can be used to build
                   a self-organised inventory. Six English speech
                   synthesis systems were built using two frameworks, unit
                   selection and parametric HTS for three inventory
                   conditions: 1) a traditional phone set, 2) a system
                   using orthographic units, and 3) a self-organised
                   inventory. A listening test showed a strong preference
                   for the classic system, and for the orthographic system
                   over the self-organised system. Results also varied by
                    letter-to-sound complexity and database coverage. This
                    suggests that the self-organised approach failed to
                    generalise pronunciation and also introduced noise
                   above and beyond that caused by orthographic sound
                   mismatch.},
  categories = {speech synthesis, unit selection, parametric
                   synthesis, phone inventory, orthographic synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/ma_interspeech09.pdf},
  address = {Brighton, UK},
  year = 2009
}
@inproceedings{clark:podsiadlo:mayo:king:blizzard2007,
  author = {Robert A. J. Clark and Monika Podsiadlo and Mark
                   Fraser and Catherine Mayo and Simon King },
  title = {Statistical Analysis of the {B}lizzard {C}hallenge
                   2007 Listening Test Results },
  booktitle = {Proc. Blizzard 2007 (in Proc. Sixth {ISCA} Workshop on
                   Speech Synthesis)},
  address = {Bonn, Germany},
  abstract = {Blizzard 2007 is the third Blizzard Challenge, in
                   which participants build voices from a common dataset.
                   A large listening test is conducted which allows
                   comparison of systems in terms of naturalness and
                   intelligibility. New sections were added to the
                   listening test for 2007 to test the perceived
                   similarity of the speaker's identity between natural
                   and synthetic speech. In this paper, we present the
                   results of the listening test and the subsequent
                   statistical analysis. },
  categories = {blizzard,listening test},
  keywords = {Blizzard},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_003.pdf},
  year = 2007
}
@article{frankel07:AF_DBN,
  author = {Frankel, J. and Wester, M. and King, S.},
  title = {Articulatory feature recognition using dynamic
                   {B}ayesian networks},
  journal = {Computer Speech \& Language},
  volume = {21},
  number = {4},
  pages = {620--640},
  abstract = {We describe a dynamic Bayesian network for
                   articulatory feature recognition. The model is intended
                   to be a component of a speech recognizer that avoids
                   the problems of conventional ``beads-on-a-string''
                   phoneme-based models. We demonstrate that the model
                   gives superior recognition of articulatory features
                    from the speech signal compared with a state-of-the-art
                   neural network system. We also introduce a training
                   algorithm that offers two major advances: it does not
                   require time-aligned feature labels and it allows the
                   model to learn a set of asynchronous feature changes in
                   a data-driven manner.},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_etal_CSL2007.pdf},
  year = 2007
}
@inproceedings{dongwang_interspeech09_spm,
  author = {Dong Wang and Simon King and Joe Frankel},
  title = {Stochastic Pronunciation Modelling for Spoken Term
                   Detection},
  booktitle = {Proc. Interspeech},
  pages = {2135--2138},
  address = {Brighton, UK},
  abstract = {A major challenge faced by a spoken term detection
                   (STD) system is the detection of out-of-vocabulary
                   (OOV) terms. Although a subword-based STD system is
                   able to detect OOV terms, performance reduction is
                   always observed compared to in-vocabulary terms.
                   Current approaches to STD do not acknowledge the
                   particular properties of OOV terms, such as
                   pronunciation uncertainty. In this paper, we use a
                   stochastic pronunciation model to deal with the
                   uncertain pronunciations of OOV terms. By considering
                   all possible term pronunciations, predicted by a
                   joint-multigram model, we observe a significant
                   performance improvement. },
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/spm.pdf},
  year = 2009
}
@inproceedings{watts_yamagishi_king_2011,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {Unsupervised continuous-valued word features for
                   phrase-break prediction without a part-of-speech tagger},
  booktitle = {Proc. Interspeech},
  pages = {2157--2160},
  address = {Florence, Italy},
  abstract = {Part of speech (POS) tags are foremost among the
                   features conventionally used to predict intonational
                   phrase-breaks for text to speech (TTS) conversion. The
                   construction of such systems therefore presupposes the
                   availability of a POS tagger for the relevant language,
                   or of a corpus manually tagged with POS. However, such
                   tools and resources are not available in the majority
                   of the world’s languages, and manually labelling text
                   with POS tags is an expensive and time-consuming
                   process. We therefore propose the use of
                   continuous-valued features that summarise the
                   distributional characteristics of word types as
                   surrogates for POS features. Importantly, such features
                   are obtained in an unsupervised manner from an untagged
                   text corpus. We present results on the phrase-break
                   prediction task, where use of the features closes the
                   gap in performance between a baseline system (using
                   only basic punctuation-related features) and a topline
                   system (incorporating a state-of-the-art POS tagger).},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
  year = 2011
}
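
One generic way to obtain unsupervised continuous-valued word features of the kind described above is to reduce a word co-occurrence matrix with an SVD; this is only a sketch under that assumption, and the distributional features actually used in the paper may be computed differently.

    import numpy as np

    def distributional_word_features(sentences, n_dims=10, window=1):
        # Word-by-word co-occurrence counts from untagged text, reduced to
        # n_dims continuous-valued features per word type via an SVD.
        vocab = sorted({w for s in sentences for w in s})
        index = {w: i for i, w in enumerate(vocab)}
        counts = np.zeros((len(vocab), len(vocab)))
        for s in sentences:
            for i, w in enumerate(s):
                lo, hi = max(0, i - window), min(len(s), i + window + 1)
                for j in range(lo, hi):
                    if j != i:
                        counts[index[w], index[s[j]]] += 1.0
        u, _, _ = np.linalg.svd(np.log1p(counts), full_matrices=False)
        return {w: u[index[w], :n_dims] for w in vocab}
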
@inproceedings{Cassia_IS11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                   King, Simon},
  title = {Can Objective Measures Predict the Intelligibility of
                   Modified {HMM}-based Synthetic Speech in Noise?},
  booktitle = {Proc. Interspeech},
  abstract = {{Synthetic speech can be modified to improve
                   intelligibility in noise. In order to perform
                   modifications automatically, it would be useful to have
                   an objective measure that could predict the
                   intelligibility of modified synthetic speech for human
                   listeners. We analysed the impact on intelligibility
                   – and on how well objective measures predict it –
                   when we separately modify speaking rate, fundamental
                   frequency, line spectral pairs and spectral peaks.
                   Shifting LSPs can increase intelligibility for human
                   listeners; other modifications had weaker effects.
                   Among the objective measures we evaluated, the Dau
                   model and the Glimpse proportion were the best
                   predictors of human performance.}},
  categories = {HMM-based speech synthesis, objective measures of
                   intelligibility},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_IS11.pdf},
  year = 2011
}
@inproceedings{richmond2011a,
  author = {Richmond, Korin and Hoole, Phil and King, Simon},
  title = {Announcing the Electromagnetic Articulography (Day 1)
                   Subset of the mngu0 Articulatory Corpus},
  booktitle = {Proc. Interspeech},
  pages = {1505--1508},
  address = {Florence, Italy},
  abstract = {This paper serves as an initial announcement of the
                   availability of a corpus of articulatory data called
                   mngu0. This corpus will ultimately consist of a
                   collection of multiple sources of articulatory data
                   acquired from a single speaker: electromagnetic
                   articulography (EMA), audio, video, volumetric MRI
                   scans, and 3D scans of dental impressions. This data
                   will be provided free for research use. In this first
                   stage of the release, we are making available one
                   subset of EMA data, consisting of more than 1,300
                   phonetically diverse utterances recorded with a
                   Carstens AG500 electromagnetic articulograph.
                   Distribution of mngu0 will be managed by a dedicated
                   ``forum-style'' web site. This paper both outlines the
                   general goals motivating the distribution of the data
                   and the creation of the mngu0 web forum, and also
                   provides a description of the EMA data contained in
                   this initial release.},
  categories = {articulography, corpus, EMA},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110767.pdf},
  year = 2011
}
@article{goubanova:king:specom2008,
  author = {Olga Goubanova and Simon King},
  title = {Bayesian networks for phone duration prediction},
  journal = {Speech Communication},
  volume = {50},
  number = {4},
  pages = {301--311},
  abstract = {In a text-to-speech system, the duration of each phone
                   may be predicted by a duration model. This model is
                   usually trained using a database of phones with known
                   durations; each phone (and the context it appears in)
                   is characterised by a feature vector that is composed
                   of a set of linguistic factor values. We describe the
                   use of a graphical model -- a Bayesian network -- for
                   predicting the duration of a phone, given the values
                   for these factors. The network has one discrete
                   variable for each of the linguistic factors and a
                   single continuous variable for the phone's duration.
                   Dependencies between variables (or the lack of them)
                   are represented in the BN structure by arcs (or missing
                   arcs) between pairs of nodes. During training, both the
                   topology of the network and its parameters are learned
                   from labelled data. We compare the results of the BN
                   model with results for sums of products and CART models
                   on the same data. In terms of the root mean square
                   error, the BN model performs much better than both CART
                   and SoP models. In terms of correlation coefficient,
                   the BN model performs better than the SoP model, and as
                   well as the CART model. A BN model has certain
                   advantages over CART and SoP models. Training SoP
                   models requires a high degree of expertise. CART models
                   do not deal with interactions between factors in any
                   explicit way. As we demonstrate, a BN model can also
                   make accurate predictions of a phone's duration, even
                   when the values for some of the linguistic factors are
                   unknown.},
  categories = {Text-to-speech; Bayesian networks; Duration modelling;
                   Sums of products; Classification and regression trees},
  doi = {10.1016/j.specom.2007.10.002},
  month = {April},
  year = 2008
}
@inproceedings{Aylett+King08,
  author = {Matthew P. Aylett and Simon King},
  title = {Single Speaker Segmentation and Inventory Selection
                   Using Dynamic Time Warping Self Organization and Joint
                   Multigram Mapping},
  booktitle = {SSW06},
  pages = {258--263},
  abstract = {In speech synthesis the inventory of units is decided
                   by inspection and on the basis of phonological and
                   phonetic expertise. The ephone (or emergent phone)
                   project at CSTR is investigating how self organisation
                   techniques can be applied to build an inventory based
                   on collected acoustic data together with the
                   constraints of a synthesis lexicon. In this paper we
                   will describe a prototype inventory creation method
                   using dynamic time warping (DTW) for acoustic
                   clustering and a joint multigram approach for relating
                   a series of symbols that represent the speech to these
                   emerged units. We initially examined two symbol sets:
                   1) a baseline of standard phones; 2) orthographic
                   symbols. The success of the approach is evaluated by
                   comparing word boundaries generated by the emergent
                   phones against those created using state-of-the-art HMM
                   segmentation. Initial results suggest the DTW
                   segmentation can match word boundaries with a root mean
                   square error (RMSE) of 35 ms. Mapping units onto phones
                   resulted in a higher RMSE of 103 ms. This
                   error was increased when multiple multigram types were
                   added and when the default unit clustering was altered
                   from 40 (our baseline) to 10. Results for orthographic
                   matching had a higher RMSE of 125 ms. To conclude, we
                   discuss future work that we believe can reduce this
                   error rate to a level sufficient for the techniques to
                   be applied to a unit selection synthesis system. },
  categories = {speech synthesis, unit selection, parametric
                   synthesis, phone inventory, orthographic synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ssw06.pdf},
  address = {Bonn, Germany},
  year = 2008
}
@inproceedings{Cetin07:tandem,
  author = {Çetin, Ö. and Kantor, A. and King, S. and Bartels,
                   C. and Magimai-Doss, M. and Frankel, J. and Livescu, K.},
  title = {An articulatory feature-based tandem approach and
                   factored observation modeling},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  abstract = {The so-called tandem approach, where the posteriors of
                   a multilayer perceptron (MLP) classifier are used as
                   features in an automatic speech recognition (ASR)
                   system has proven to be a very effective method. Most
                   tandem approaches to date have relied on MLPs
                   trained for phone classification, and appended the
                   posterior features to some standard feature hidden
                   Markov model (HMM). In this paper, we develop an
                   alternative tandem approach based on MLPs trained for
                   articulatory feature (AF) classification. We also
                   develop a factored observation model for characterizing
                   the posterior and standard features at the HMM outputs,
                   allowing for separate hidden mixture and state-tying
                   structures for each factor. In experiments on a subset
                   of Switchboard, we show that the AF-based tandem
                   approach is as effective as the phone-based approach,
                   and that the factored observation model significantly
                   outperforms the simple feature concatenation approach
                   while using fewer parameters.},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Cetin_icassp07_tandem.pdf},
  year = 2007
}
@inproceedings{gillett:king:eurospeech2003b,
  author = {Ben Gillett and Simon King},
  title = {Transforming {F0} Contours},
  booktitle = {Proc. {E}urospeech},
  address = {Geneva},
  abstract = {Voice transformation is the process of transforming
                   the characteristics of speech uttered by a source
                   speaker, such that a listener would believe the speech
                   was uttered by a target speaker. Training F0 contour
                   generation models for speech synthesis requires a large
                   corpus of speech. If it were possible to adapt the F0
                   contour of one speaker to sound like that of another
                   speaker, using a small, easily obtainable parameter
                   set, this would be extremely valuable. We present a new
                   method for the transformation of F0 contours from one
                   speaker to another based on a small linguistically
                   motivated parameter set. The system performs a
                   piecewise linear mapping using these parameters. A
                   perceptual experiment clearly demonstrates that the
                   presented system is at least as good as an existing
                   technique for all speaker pairs, and that in many cases
                   it is much better and almost as good as using the
                   target F0 contour.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Gillett_King_eurospeech2003a.pdf},
  year = 2003
}
@incollection{vepa:king:joincostchapter2004,
  author = {Jithendra Vepa and Simon King},
  title = {Join Cost for Unit Selection Speech Synthesis},
  booktitle = {Speech Synthesis},
  publisher = {Prentice Hall},
  editor = {Alwan, Abeer and Narayanan, Shri},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Vepa_King_joincostchapter2004.ps},
  year = 2004
}
@inproceedings{robust-hts,
  author = {Junichi Yamagishi and Zhenhua Ling and Simon King},
  title = {Robustness of HMM-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2008},
  pages = {581--584},
  address = {Brisbane, Australia},
  abstract = {As speech synthesis techniques become more advanced,
                   we are able to consider building high-quality voices
                   from data collected outside the usual highly-controlled
                   recording studio environment. This presents new
                   challenges that are not present in conventional
                   text-to-speech synthesis: the available speech data are
                   not perfectly clean, the recording conditions are not
                   consistent, and/or the phonetic balance of the material
                   is not ideal. Although a clear picture of the
                   performance of various speech synthesis techniques
                   (e.g., concatenative, HMM-based or hybrid) under good
                   conditions is provided by the Blizzard Challenge, it is
                   not well understood how robust these algorithms are to
                   less favourable conditions. In this paper, we analyse
                   the performance of several speech synthesis methods
                   under such conditions. This is, as far as we know, a
                   new research topic: ``Robust speech synthesis.'' As a
                   consequence of our investigations, we propose a new
                   robust training method for HMM-based speech synthesis,
                   for use with speech data collected in unfavourable
                   conditions.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice,
                   unit selection},
  key = {robust-hts},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/RobustnessHTS.pdf},
  year = 2008
}
@inproceedings{taylor:shimodaira:isard:king:kowtko:icslp1996,
  author = {Paul A. Taylor and Hiroshi Shimodaira and Stephen
                   Isard and Simon King and Jacqueline Kowtko},
  title = {Using Prosodic Information to Constrain Language
                   Models for Spoken Dialogue},
  booktitle = {Proc. {ICSLP} '96},
  address = {Philadelphia},
  abstract = {We present work intended to improve speech recognition
                   performance for computer dialogue by taking into
                   account the way that dialogue context and intonational
                   tune interact to limit the possibilities for what an
                   utterance might be. We report here on the extra
                   constraint achieved in a bigram language model
                   expressed in terms of entropy by using separate
                   submodels for different sorts of dialogue acts and
                   trying to predict which submodel to apply by analysis
                   of the intonation of the sentence being recognised.},
  categories = {asr, intonation, dialogue, lm,id4s},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/Taylor_1996_a.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/Taylor_1996_a.ps},
  year = 1996
}
@inproceedings{fraser:king:blizzard2007,
  author = {Mark Fraser and Simon King},
  title = {The {B}lizzard {C}hallenge 2007},
  booktitle = {Proc. Blizzard 2007 (in Proc. Sixth ISCA Workshop on
                   Speech Synthesis)},
  address = {Bonn, Germany},
  abstract = {In Blizzard 2007, the third Blizzard Challenge,
                   participants were asked to build voices from a dataset,
                   a defined subset and, following certain constraints, a
                   subset of their choice. A set of test sentences was
                   then released to be synthesised. An online evaluation
                   of the submitted synthesised sentences focused on
                   naturalness and intelligibility, and added new sections
                   for degree of similarity to the original speaker,
                   and similarity in terms of naturalness of pairs of
                   sentences from different systems. We summarise this
                   year's Blizzard Challenge and look ahead to possible
                   designs for Blizzard 2008 in the light of participant
                   and listener feedback. },
  categories = {blizzard, listening test},
  keywords = {Blizzard},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_001.pdf},
  year = 2007
}
@techreport{king:verbmobil1996a,
  author = {Simon King},
  title = {Final report for {V}erbmobil {T}eilprojekt 4.4},
  institution = {IKP, Universität Bonn},
  number = {ISSN 1434-8845},
  note = {Verbmobil-Report 195 available at {\tt
                   http://verbmobil.dfki.de}},
  abstract = {Final report for Verbmobil English speech synthesis},
  month = jan,
  year = 1997
}
@inproceedings{livescu07:JHU_summary,
  author = {Livescu, K. and Çetin, Ö. and Hasegawa-Johnson, M.
                   and King, S. and Bartels, C. and Borges, N. and Kantor,
                   A. and Lal, P. and Yung, L. and Bezman, A. and
                   Dawson-Haggerty, S. and Woods, B. and Frankel, J. and
                   Magimai-Doss, M. and Saenko, K.},
  title = {Articulatory feature-based methods for acoustic and
                   audio-visual speech recognition: {S}ummary from the
                   2006 {JHU} {S}ummer {W}orkshop},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  abstract = {We report on investigations, conducted at the 2006
                   Johns Hopkins Workshop, into the use of articulatory
                   features (AFs) for observation and pronunciation models
                   in speech recognition. In the area of observation
                   modeling, we use the outputs of AF classifiers both
                   directly, in an extension of hybrid HMM/neural network
                   models, and as part of the observation vector, an
                   extension of the tandem approach. In the area of
                   pronunciation modeling, we investigate a model having
                   multiple streams of AF states with soft synchrony
                   constraints, for both audio-only and audio-visual
                   recognition. The models are implemented as dynamic
                   Bayesian networks, and tested on tasks from the
                   Small-Vocabulary Switchboard (SVitchboard) corpus and
                   the CUAVE audio-visual digits corpus. Finally, we
                   analyze AF classification and forced alignment using a
                   newly collected set of feature-level manual
                   transcriptions.},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/livescu_icassp07_sum.pdf},
  year = 2007
}
@inproceedings{wester04:asynch,
  author = {Wester, M. and Frankel, J. and King, S.},
  title = {Asynchronous Articulatory Feature Recognition Using
                   Dynamic {B}ayesian Networks},
  booktitle = {Proc. IEICE Beyond HMM Workshop},
  address = {Kyoto},
  abstract = {This paper builds on previous work where dynamic
                   Bayesian networks (DBN) were proposed as a model for
                   articulatory feature recognition. Using DBNs makes it
                   possible to model the dependencies between features, an
                   addition to previous approaches which was found to
                   improve feature recognition performance. The DBN
                   results were promising, giving close to the accuracy of
                   artificial neural nets (ANNs). However, the system was
                   trained on canonical labels, leading to an overly
                   strong set of constraints on feature co-occurrence. In
                   this study, we describe an embedded training scheme
                   which learns a set of data-driven asynchronous feature
                   changes where supported in the data. Using a subset of
                   the OGI Numbers corpus, we describe articulatory
                   feature recognition experiments using both
                   canonically-trained and asynchronous DBNs. Performance
                   using DBNs is found to exceed that of ANNs trained on
                   an identical task, giving a higher recognition
                   accuracy. Furthermore, inter-feature dependencies
                   result in a more structured model, giving rise to fewer
                   feature combinations in the recognition output. In
                   addition to an empirical evaluation of this modelling
                   approach, we give a qualitative analysis, comparing
                   asynchrony found through our data-driven methods to the
                   asynchrony which may be expected on the basis of
                   linguistic knowledge.},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.ps},
  year = 2004
}
@inproceedings{child_synthesis_2009,
  author = {Oliver Watts and Junichi Yamagishi and Simon King and
                   Kay Berkling},
  title = {{HMM} Adaptation and Voice Conversion for the
                   Synthesis of Child Speech: A Comparison},
  booktitle = {Proc. Interspeech 2009},
  pages = {2627--2630},
  address = {Brighton, U.K.},
  abstract = {This study compares two different methodologies for
                   producing data-driven synthesis of child speech from
                   existing systems that have been trained on the speech
                   of adults. On one hand, an existing statistical
                   parametric synthesiser is transformed using model
                   adaptation techniques, informed by linguistic and
                   prosodic knowledge, to the speaker characteristics of a
                   child speaker. This is compared with the application of
                   voice conversion techniques to convert the output of an
                   existing waveform concatenation synthesiser with no
                   explicit linguistic or prosodic knowledge. In a
                   subjective evaluation of the similarity of synthetic
                   speech to natural speech from the target speaker, the
                   HMM-based systems evaluated are generally preferred,
                   although this is at least in part due to the higher
                   dimensional acoustic features supported by these
                   techniques.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/child_synthesis_2009.pdf},
  year = 2009
}
@incollection{king:ELL2_2006b,
  author = {Simon King},
  title = {Handling variation in speech and language processing},
  booktitle = {Encyclopedia of Language and Linguistics},
  publisher = {Elsevier},
  editor = {Keith Brown},
  edition = {2nd},
  year = 2006
}
@inproceedings{Blizzard_summary_09,
  author = {Simon King and Vasilis Karaiskos},
  title = {The {B}lizzard {C}hallenge 2009},
  booktitle = {Proc. Blizzard Challenge Workshop},
  address = {Edinburgh, UK},
  abstract = {The Blizzard Challenge 2009 was the fifth annual
                   Blizzard Challenge. As in 2008, UK English and Mandarin
                   Chinese were the chosen languages for the 2009
                   Challenge. The English corpus was the same one used in
                   2008. The Mandarin corpus was provided by iFLYTEK. As
                   usual, participants with limited resources or limited
                   experience in these languages had the option of using
                   unaligned labels that were provided for both corpora
                   and for the test sentences. An accent-specific
                   pronunciation dictionary was also available for the
                   English speaker. This year, the tasks were organised in
                   the form of `hubs' and `spokes' where each hub task
                   involved building a general-purpose voice and each
                   spoke task involved building a voice for a specific
                   application. A set of test sentences was released to
                   participants, who were given a limited time in which to
                   synthesise them and submit the synthetic speech. An
                   online listening test was conducted to evaluate
                   naturalness, intelligibility, degree of similarity to
                   the original speaker and, for one of the spoke tasks,
                   "appropriateness."},
  categories = {Blizzard Challenge, speech synthesis, evaluation,
                   listening test},
  keywords = {Blizzard},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/summary_Blizzard2009.pdf},
  year = 2009
}
@inproceedings{clarkrichmondking_interspeech05,
  author = {Robert A.J. Clark and Korin Richmond and Simon King},
  title = {Multisyn voices from {ARCTIC} data for the {B}lizzard
                   challenge},
  booktitle = {Proc. Interspeech 2005},
  abstract = {This paper describes the process of building unit
                   selection voices for the Festival Multisyn engine using
                   four ARCTIC datasets, as part of the Blizzard
                   evaluation challenge. The build process is almost
                   entirely automatic, with very little need for human
                   intervention. We discuss the difference in the
                   evaluation results for each voice and evaluate the
                   suitability of the ARCTIC datasets for building this
                   type of voice.},
  categories = {speech synthesis, festival, evaluation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/clarkrichmondking_interspeech05.pdf},
  year = 2005
}
@inproceedings{shig043,
  author = {Yoshinori Shiga and Simon King},
  title = {Estimating detailed spectral envelopes using
                   articulatory clustering},
  booktitle = {Proc. ICSLP},
  address = {Jeju, Korea},
  abstract = {This paper presents an articulatory-acoustic mapping
                   where detailed spectral envelopes are estimated. During
                   the estimation, the harmonics of a range of F0 values
                   are derived from the spectra of multiple voiced speech
                   signals vocalized with similar articulator settings.
                   The envelope formed by these harmonics is represented
                   by a cepstrum, which is computed by fitting the peaks
                   of all the harmonics based on the weighted least square
                   method in the frequency domain. The experimental result
                   shows that the spectral envelopes are estimated with
                   the highest accuracy when the cepstral order is 48--64
                   for a female speaker, which suggests that representing
                   the real response of the vocal tract requires
                   high-quefrency elements that conventional speech
                   synthesis methods are forced to discard in order to
                   eliminate the pitch component of speech.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope,
                   edinburgh},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.ps},
  year = 2004
}
@article{Dines2011,
  author = {John Dines and Hui Liang and Lakshmi Saheer and
                   Matthew Gibson and William Byrne and Keiichiro Oura and
                   Keiichi Tokuda and Junichi Yamagishi and Simon King and
                   Mirjam Wester and Teemu Hirsimäki and Reima
                   Karhila and Mikko Kurimo},
  title = {Personalising speech-to-speech translation:
                   Unsupervised cross-lingual speaker adaptation for
                   {HMM}-based speech synthesis},
  journal = {Computer Speech and Language},
  volume = {27},
  number = {2},
  pages = {420--437},
  abstract = {In this paper we present results of unsupervised
                   cross-lingual speaker adaptation applied to
                   text-to-speech synthesis. The application of our
                   research is the personalisation of speech-to-speech
                   translation in which we employ an HMM statistical
                   framework for both speech recognition and synthesis.
                   This framework provides a logical mechanism to adapt
                   synthesised speech output to the voice of the user by
                   way of speech recognition. In this work we present
                   results of several different unsupervised and
                   cross-lingual adaptation approaches as well as an
                   end-to-end speaker adaptive speech-to-speech
                   translation system. Our experiments show that we can
                   successfully apply speaker adaptation in both
                   unsupervised and cross-lingual scenarios and our
                   proposed algorithms seem to generalise well for several
                   language pairs. We also discuss important future
                   directions including the need for better evaluation
                   metrics.},
  doi = {10.1016/j.csl.2011.08.003},
  issn = {0885-2308},
  keywords = {Speech-to-speech translation, Cross-lingual speaker
                   adaptation, HMM-based speech synthesis, Speaker
                   adaptation, Voice conversion},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230811000441},
  month = feb,
  year = 2013
}
@inproceedings{shig031,
  author = {Yoshinori Shiga and Simon King},
  title = {Estimating the Spectral Envelope of Voiced Speech
                   Using Multi-frame Analysis},
  booktitle = {Proc. {E}urospeech-2003},
  volume = 3,
  pages = {1737--1740},
  address = {Geneva, Switzerland},
  abstract = {This paper proposes a novel approach for estimating
                   the spectral envelope of voiced speech independently of
                   its harmonic structure. Because of the
                   quasi-periodicity of voiced speech, its spectrum
                   indicates harmonic structure and only has energy at
                   frequencies corresponding to integral multiples of F0.
                   It is hence impossible to identify transfer
                   characteristics between the adjacent harmonics. In
                   order to resolve this problem, Multi-frame Analysis
                   (MFA) is introduced. The MFA estimates a spectral
                   envelope using many portions of speech which are
                   vocalised using the same vocal-tract shape. Since each
                   of the portions usually has a different F0 and ensuing
                   different harmonic structure, a number of harmonics can
                   be obtained at various frequencies to form a spectral
                   envelope. The method thereby gives a closer
                   approximation to the vocal-tract transfer function.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope,
                   edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.ps},
  year = 2003
}
@inproceedings{dongwang_icassp09,
  author = {Dong Wang and Javier Tejedor and Joe Frankel and
                   Simon King},
  title = {Posterior-based confidence measures for spoken term
                   detection},
  booktitle = {Proc. ICASSP},
  address = {Taipei, Taiwan},
  abstract = {Confidence measures play a key role in spoken term
                   detection (STD) tasks. The confidence measure expresses
                   the posterior probability of the search term appearing
                   in the detection period, given the speech. Traditional
                   approaches are based on the acoustic and language model
                   scores for candidate detections found using automatic
                   speech recognition, with Bayes' rule being used to
                   compute the desired posterior probability. In this
                   paper, we present a novel direct posterior-based
                   confidence measure which, instead of resorting to the
                   Bayesian formula, calculates posterior probabilities
                   from a multi-layer perceptron (MLP) directly. Compared
                   with traditional Bayesian-based methods, the
                   direct-posterior approach is conceptually and
                   mathematically simpler. Moreover, the MLP-based model
                   does not require assumptions to be made about the
                   acoustic features such as their statistical
                   distribution and the independence of static and dynamic
                   coefficients. Our experimental results in both English
                   and Spanish demonstrate that the proposed direct
                   posterior-based confidence improves STD performance. },
  categories = {Spoken term detection, confidence measure, posterior
                   probabilities, MLP},
  month = {April},
  pages = {4889--4892},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/posterior.pdf},
  year = 2009
}
@inproceedings{frankel01:ASR,
  author = {Frankel, J. and King, S.},
  title = {{ASR} - Articulatory Speech Recognition},
  booktitle = {Proc. {E}urospeech},
  pages = {599--602},
  address = {Aalborg, Denmark},
  abstract = {In this paper we report recent work on a speech
                   recognition system using a combination of acoustic and
                   articulatory features as input. Linear dynamic models
                   are used to capture the trajectories which characterize
                   each segment type. We describe classification and
                   recognition tasks for systems based on acoustic data in
                   conjunction with both real and automatically recovered
                   articulatory parameters.},
  categories = {am,artic,asr,ldm,mocha,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_Eurospeech2001.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_Eurospeech2001.ps},
  year = 2001
}
@inproceedings{lei2011a,
  author = {Lei, Ming and Yamagishi, Junichi and Richmond, Korin
                   and Ling, Zhen-Hua and King, Simon and Dai, Li-Rong},
  title = {Formant-controlled {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {2777--2780},
  address = {Florence, Italy},
  abstract = {This paper proposes a novel framework that enables us
                   to manipulate and control formants in HMM-based speech
                   synthesis. In this framework, the dependency between
                   formants and spectral features is modelled by piecewise
                   linear transforms; formant parameters are effectively
                   mapped by these to the means of Gaussian distributions
                   over the spectral synthesis parameters. The spectral
                   envelope features generated under the influence of
                   formants in this way may then be passed to high-quality
                   vocoders to generate the speech waveform. This provides
                   two major advantages over conventional frameworks.
                   First, we can achieve spectral modification by changing
                   formants only in those parts where we want control,
                   whereas the user must specify all formants manually in
                   conventional formant synthesisers (e.g. Klatt). Second,
                   this can produce high-quality speech. Our results show
                   the proposed method can control vowels in the
                   synthesized speech by manipulating F1 and F2 without
                   any degradation in synthesis quality.},
  categories = {speech synthesis, hidden Markov model, formants,
                   controllability},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110769.pdf},
  year = 2011
}
@inproceedings{frankel04:artic_dbn,
  author = {Frankel, J. and Wester, M. and King, S.},
  title = {Articulatory feature recognition using dynamic
                   {B}ayesian networks},
  booktitle = {Proc. {ICSLP}},
  abstract = {This paper describes the use of dynamic Bayesian
                   networks for the task of articulatory feature
                   recognition. We show that by modeling the dependencies
                   between a set of 6 multi-leveled articulatory features,
                   recognition accuracy is increased over an equivalent
                   system in which features are considered independent.
                   Results are compared to those found using artificial
                   neural networks on an identical task.},
  categories = {am,artic,asr,dbn,timit,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.ps},
  year = 2004
}
@article{john:ieee2011,
  author = {J. Dines and J. Yamagishi and S. King},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  journal = {IEEE Journal of Selected Topics in Signal Processing},
  note = {(in press)},
  abstract = {The EMIME European project is conducting research in
                   the development of technologies for mobile,
                   personalised speech-to-speech translation systems. The
                   hidden Markov model (HMM) is being used as the
                   underlying technology in both automatic speech
                   recognition (ASR) and text-to-speech synthesis (TTS)
                   components, thus, the investigation of unified
                   statistical modelling approaches has become an implicit
                   goal of our research. As one of the first steps towards
                   this goal, we have been investigating commonalities and
                   differences between HMM-based ASR and TTS. In this
                   paper we present results and analysis of a series of
                   experiments that have been conducted on English ASR and
                   TTS systems measuring their performance with respect to
                   phone set and lexicon; acoustic feature type and
                   dimensionality; HMM topology; and speaker adaptation.
                   Our results show that, although the fundamental
                   statistical model may be essentially the same, optimal
                   ASR and TTS performance often demands diametrically
                   opposed system designs. This represents a major
                   challenge to be addressed in the investigation of such
                   unified modelling approaches.},
  doi = {10.1109/JSTSP.2010.2079315},
  keywords = {Acoustics, Adaptation model, Context modeling, Hidden
                   Markov models, Speech, Speech recognition, Training,
                   speech recognition, speech synthesis, unified models},
  year = 2011
}
@inproceedings{dallIS2012,
  author = {Dall, Rasmus and Veaux, Christophe and Yamagishi,
                   Junichi and King, Simon},
  title = {Analysis of Speaker Clustering Strategies for
                   {HMM}-Based Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {This paper describes a method for speaker clustering,
                   with the application of building average voice models
                   for speaker-adaptive HMM-based speech synthesis that
                   are a good basis for adapting to specific target
                   speakers. Our main hypothesis is that using
                   perceptually similar speakers to build the average
                   voice model will be better than using unselected
                   speakers, even if the amount of data available from
                   perceptually similar speakers is smaller. We measure
                   the perceived similarities among a group of 30 female
                   speakers in a listening test and then apply multiple
                   linear regression to automatically predict these
                   listener judgements of speaker similarity and thus to
                   identify similar speakers automatically. We then
                   compare a variety of average voice models trained on
                   either speakers who were perceptually judged to be
                   similar to the target speaker, or speakers selected by
                   the multiple linear regression, or a large global set
                   of unselected speakers. We find that the average voice
                   model trained on perceptually similar speakers provides
                   better performance than the global model, even though
                   the latter is trained on more data, confirming our main
                   hypothesis. However, the average voice model using
                   speakers selected automatically by the multiple linear
                   regression does not reach the same level of
                   performance.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/DallIS2012.pdf},
  year = 2012
}
@inproceedings{Gutkin:King:pris05,
  author = {Alexander Gutkin and Simon King},
  title = {{I}nductive {S}tring {T}emplate-{B}ased {L}earning of
                   {S}poken {L}anguage},
  booktitle = {Proc. 5th International Workshop on Pattern
                   Recognition in Information Systems (PRIS-2005), In
                   conjunction with the 7th International Conference on
                   Enterprise Information Systems (ICEIS-2005)},
  editor = {Hugo Gamboa and Ana Fred},
  pages = {43--51},
  address = {Miami, USA},
  publisher = {INSTICC Press},
  abstract = {This paper deals with the formulation of an alternative
                   structural approach to the speech recognition problem.
                   In this approach, we require both the representation
                   and the learning algorithms defined on it to be
                   linguistically meaningful, which allows the speech
                   recognition system to discover the nature of the
                   linguistic classes of speech patterns corresponding to
                   the speech waveforms. We briefly discuss the current
                   formalisms and propose an alternative --- a
                   phonologically inspired string-based inductive speech
                   representation, defined within an analytical framework
                   specifically designed to address the issues of class
                   and object representation. We also present the results
                   of the phoneme classification experiments conducted on
                   the TIMIT corpus of continuous speech. },
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh},
  isbn = {972-8865-28-7},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_pris05.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_pris05.ps.gz},
  year = 2005
}
@article{vepa_king_tsap05,
  author = {Jithendra Vepa and Simon King},
  title = {Subjective Evaluation of Join Cost and Smoothing
                   Methods for Unit Selection Speech Synthesis},
  journal = {IEEE Transactions on Speech and Audio Processing},
  volume = {14},
  number = {5},
  pages = {1763--1771},
  abstract = {In unit selection-based concatenative speech
                   synthesis, join cost (also known as concatenation
                   cost), which measures how well two units can be joined
                   together, is one of the main criteria for selecting
                   appropriate units from the inventory. Usually, some
                   form of local parameter smoothing is also needed to
                   disguise the remaining discontinuities. This paper
                   presents a subjective evaluation of three join cost
                   functions and three smoothing methods. We describe the
                   design and performance of a listening test. The three
                   join cost functions were taken from our previous study,
                   where we proposed join cost functions derived from
                   spectral distances, which have good correlations with
                   perceptual scores obtained for a range of concatenation
                   discontinuities. This evaluation allows us to further
                   validate their ability to predict concatenation
                   discontinuities. The units for synthesis stimuli are
                   obtained from a state-of-the-art unit selection
                   text-to-speech system: rVoice from Rhetorical Systems
                   Ltd. In this paper, we report listeners' preferences
                   for each join cost in combination with each smoothing
                   method.},
  categories = {TTS, join cost, listening test},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/vepa_king_ieee2005.pdf},
  year = 2006
}
@inproceedings{hts-child-oliver,
  author = {Oliver Watts and Junichi Yamagishi and Kay Berkling
                   and Simon King},
  title = {{HMM}-based synthesis of child speech},
  booktitle = {Proc. of The 1st Workshop on Child, Computer and
                   Interaction (ICMI'08 post-conference workshop)},
  address = {Crete, Greece},
  abstract = {The synthesis of child speech presents challenges both
                   in the collection of data and in the building of a
                   synthesiser from that data. Because only limited data
                   can be collected, and the domain of that data is
                   constrained, it is difficult to obtain the type of
                   phonetically-balanced corpus usually used in speech
                   synthesis. As a consequence, building a synthesiser
                   from this data is difficult. Concatenative synthesisers
                   are not robust to corpora with many missing units (as
                   is likely when the corpus content is not carefully
                   designed), so we chose to build a statistical
                   parametric synthesiser using the HMM-based system HTS.
                   This technique has previously been shown to perform
                   well for limited amounts of data, and for data
                   collected under imperfect conditions. We compared 6
                   different configurations of the synthesiser, using both
                   speaker-dependent and speaker-adaptive modelling
                   techniques, and using varying amounts of data. The
                   output from these systems was evaluated alongside
                   natural and vocoded speech, in a Blizzard-style
                   listening test.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice,
                   child speech},
  key = {hts-child-oliver},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/child-hts-oliver.pdf},
  year = 2008
}
@inproceedings{dongwang_interspeech09_conf,
  author = {Dong Wang and Simon King and Joe Frankel and Peter
                   Bell},
  title = {Term-Dependent Confidence for Out-of-Vocabulary Term
                   Detection},
  booktitle = {Proc. Interspeech},
  pages = {2139--2142},
  address = {Brighton, UK},
  abstract = { Within a spoken term detection (STD) system, the
                   decision maker plays an important role in retrieving
                   reliable detections. Most of the state-of-the-art STD
                   systems make decisions based on a confidence measure
                   that is term-independent, which poses a serious problem
                   for out-of-vocabulary (OOV) term detection. In this
                   paper, we study a term-dependent confidence measure
                   based on confidence normalisation and discriminative
                   modelling, particularly focusing on its remarkable
                   effectiveness for detecting OOV terms. Experimental
                   results indicate that the term-dependent confidence
                   provides a much more significant improvement for OOV
                   terms than for in-vocabulary terms.},
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/conf.pdf},
  year = 2009
}
@article{Oura2012703,
  author = {Keiichiro Oura and Junichi Yamagishi and Mirjam Wester
                   and Simon King and Keiichi Tokuda},
  title = {Analysis of unsupervised cross-lingual speaker
                   adaptation for {HMM}-based speech synthesis using
                   {KLD}-based transform mapping},
  journal = {Speech Communication},
  volume = {54},
  number = {6},
  pages = {703--714},
  abstract = {In the EMIME project, we developed a mobile device
                   that performs personalized speech-to-speech translation
                   such that a user's spoken input in one language is used
                   to produce spoken output in another language, while
                   continuing to sound like the user's voice. We
                   integrated two techniques into a single architecture:
                   unsupervised adaptation for HMM-based TTS using
                   word-based large-vocabulary continuous speech
                   recognition, and cross-lingual speaker adaptation
                   (CLSA) for HMM-based TTS. The CLSA is based on a
                   state-level transform mapping learned using minimum
                   Kullback-Leibler divergence between pairs of HMM states
                   in the input and output languages. Thus, an
                   unsupervised cross-lingual speaker adaptation system
                   was developed. End-to-end speech-to-speech translation
                   systems for four languages (English, Finnish, Mandarin,
                   and Japanese) were constructed within this framework.
                   In this paper, the English-to-Japanese adaptation is
                   evaluated. Listening tests demonstrate that adapted
                   voices sound more similar to a target speaker than
                   average voices and that differences between supervised
                   and unsupervised cross-lingual speaker adaptation are
                   small. Calculating the KLD state-mapping on only the
                   first 10 mel-cepstral coefficients leads to huge
                   savings in computational costs, without any detrimental
                   effect on the quality of the synthetic speech.},
  doi = {10.1016/j.specom.2011.12.004},
  issn = {0167-6393},
  keywords = {HMM-based speech synthesis, Unsupervised speaker
                   adaptation, Cross-lingual speaker adaptation,
                   Speech-to-speech translation},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639311001774},
  year = 2012
}
@inproceedings{strom08,
  author = {Volker Strom and Simon King},
  title = {Investigating {F}estival's target cost function using
                   perceptual experiments},
  booktitle = {Proc.~Interspeech},
  address = {Brisbane},
  abstract = {We describe an investigation of the target cost used
                   in the Festival unit selection speech synthesis system.
                   Our ultimate goal is to automatically learn a
                   perceptually optimal target cost function. In this
                   study, we investigated the behaviour of the target cost
                   for one segment type. The target cost is based on
                   counting the mismatches in several context features. A
                   carrier sentence (``My name is Roger'') was synthesised
                   using all 147,820 possible combinations of the diphones
                   /n_ei/ and /ei_m/. 92 representative versions were
                   selected and presented to listeners as 460 pairwise
                   comparisons. The listeners' preference votes were used
                   to analyse the behaviour of the target cost, with
                   respect to the values of its component linguistic
                   context features.},
  categories = {speech synthesis, unit selection, target costs},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080514.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080514.ps},
  year = 2008
}
@inproceedings{clarkrichmondking_ssw504,
  author = {Robert A.J. Clark and Korin Richmond and Simon King},
  title = {Festival 2 -- build your own general purpose unit
                   selection speech synthesiser},
  booktitle = {Proc. 5th {ISCA} workshop on speech synthesis},
  abstract = {This paper describes version 2 of the Festival speech
                   synthesis system. Festival 2 provides a development
                   environment for concatenative speech synthesis, and now
                   includes a general purpose unit selection speech
                   synthesis engine. We discuss various aspects of unit
                   selection speech synthesis, focusing on the research
                   issues that relate to voice design and the automation
                   of the voice development process.},
  categories = {synthesis, festival, unitselection},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.ps},
  year = 2004
}
@phdthesis{king:thesis1998,
  author = {Simon King},
  title = {Using Information Above the Word Level for Automatic
                   Speech Recognition},
  school = {University of Edinburgh},
  abstract = {This thesis introduces a general method for using
                   information at the utterance level and across
                   utterances for automatic speech recognition. The method
                   involves classification of utterances into types. Using
                   constraints at the utterance level via this
                   classification method allows information sources to be
                   exploited which cannot necessarily be used directly for
                   word recognition. The classification power of three
                   sources of information is investigated: the language
                   model in the speech recogniser, dialogue context and
                   intonation. The method is applied to a challenging
                   task: the recognition of spontaneous dialogue speech.
                   The results show success in automatic utterance type
                   classification, and subsequent word error rate
                   reduction over a baseline system, when all three
                   information sources are probabilistically combined.},
  categories = {asr, lm, intonation, dialogue, systems},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/King_thesis1998.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/King_thesis1998.ps},
  year = 1998
}
@inproceedings{dong_ivan_joe_simon_interspeech08_marray,
  author = {Dong Wang and Ivan Himawan and Joe Frankel and Simon
                   King},
  title = {A Posterior Approach for Microphone Array Based Speech
                   Recognition},
  booktitle = {Proc. Interspeech},
  pages = {996--999},
  abstract = {Automatic speech recognition (ASR) becomes rather
                   difficult in the meetings domain because of the
                   adverse acoustic conditions, including more background
                   noise, more echo and reverberation, and frequent
                   cross-talking. Microphone arrays, with various
                   beamforming algorithms, have been demonstrated to
                   boost ASR performance dramatically in such noisy and
                   reverberant environments. However, almost all existing
                   beamforming methods work in the acoustic domain,
                   resorting to signal processing theory and geometric
                   explanation. This limits their application, and
                   induces significant performance degradation when the
                   geometric properties are unavailable or hard to
                   estimate, or if heterogeneous channels exist in the
                   audio system. In this paper, we present a new
                   posterior-based approach for array-based speech
                   recognition. The main idea is that, instead of
                   enhancing speech signals, we try to enhance the
                   posterior probabilities that frames belong to
                   recognition units, e.g., phones. These enhanced
                   posteriors are then transformed into posterior
                   probability based features and are modeled by HMMs,
                   leading to a tandem ANN-HMM hybrid system as presented
                   by Hermansky et al. Experimental results demonstrated
                   the validity of this posterior approach. With
                   posterior accumulation or enhancement, significant
                   improvement was achieved over the single-channel
                   baseline. Moreover, we can combine the acoustic
                   enhancement and posterior enhancement, leading to a
                   hybrid acoustic-posterior beamforming approach, which
                   works significantly better than acoustic beamforming
                   alone, especially in scenarios with moving speakers.},
  categories = {speech recognition, microphone array, beamforming,
                   tandem approach},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/marray.a.pdf},
  year = 2008
}
@inproceedings{Cassia_ICASSP11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                   King, Simon},
  title = {Evaluation of objective measures for intelligibility
                   prediction of {HMM}-based synthetic speech in noise},
  booktitle = {Proc. IEEE International Conference on Acoustics,
                   Speech and Signal Processing (ICASSP)},
  pages = {5112--5115},
  abstract = {{In this paper we evaluate four objective measures of
                   speech with regard to intelligibility prediction of
                   synthesized speech in diverse noisy situations. We
                   evaluated three intelligibility measures, the Dau
                   measure, the glimpse proportion and the Speech
                   Intelligibility Index (SII) and a quality measure, the
                   Perceptual Evaluation of Speech Quality (PESQ). For the
                   generation of synthesized speech we used a
                   state-of-the-art HMM-based speech synthesis system. The noisy
                   conditions comprised four additive noises. The measures
                   were compared with subjective intelligibility scores
                   obtained in listening tests. The results show the Dau
                   and the glimpse measures to be the best predictors of
                   intelligibility, with correlations of around 0.83 to
                   subjective scores. All measures gave less accurate
                   predictions of intelligibility for synthetic speech
                   than have previously been found for natural speech; this
                   was particularly true of the SII measure. In additional experiments,
                   we processed the synthesized speech by an ideal binary
                   mask before adding noise. The Glimpse measure gave the
                   most accurate intelligibility predictions in this
                   situation.}},
  categories = {HMM-based speech synthesis, objective measures of
                   intelligibility},
  doi = {10.1109/ICASSP.2011.5947507},
  issn = {1520-6149},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_ICASSP11.pdf},
  year = 2011
}
@inproceedings{king_bartels_bilmes_isp05,
  author = {Simon King and Chris Bartels and Jeff Bilmes},
  title = {SVitchboard 1: Small Vocabulary Tasks from Switchboard
                   1 },
  booktitle = {Proc. Interspeech 2005},
  address = {Lisbon, Portugal},
  abstract = {We present a conversational telephone speech data set
                   designed to support research on novel acoustic models.
                   Small vocabulary tasks from 10 words up to 500 words
                   are defined using subsets of the Switchboard-1 corpus;
                   each task has a completely closed vocabulary (an OOV
                   rate of 0\%). We justify the need for these tasks,
                   describe the algorithm for selecting them from a large
                   corpus, give a statistical analysis of the data and
                   present baseline whole-word hidden Markov model
                   recognition results. The goal of the paper is to define
                   a common data set and to encourage other researchers to
                   use it.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/king_bartels_bilmes_svitchboard.pdf},
  year = 2005
}
@inproceedings{Gutkin:King:icslp04,
  author = {Alexander Gutkin and Simon King},
  title = {Phone classification in pseudo-{E}uclidean Vector
                   Spaces},
  booktitle = {Proc. 8th International Conference on Spoken Language
                   Processing (ICSLP)},
  volume = {II},
  pages = {1453--1457},
  address = {Jeju Island, Korea},
  abstract = { Recently we have proposed a structural framework for
                   modelling speech, which is based on patterns of
                   phonological distinctive features, a linguistically
                   well-motivated alternative to standard vector-space
                   acoustic models like HMMs. This framework gives
                   considerable representational freedom by working with
                   features that have explicit linguistic interpretation,
                   but at the expense of the ability to apply the wide
                   range of analytical decision algorithms available in
                   vector spaces, restricting oneself to more
                   computationally expensive and less-developed symbolic
                   metric tools. In this paper we show that a
                   dissimilarity-based distance-preserving transition from
                   the original structural representation to a
                   corresponding pseudo-Euclidean vector space is
                   possible. Promising results of phone classification
                   experiments conducted on the TIMIT database are
                   reported. },
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh},
  issn = {1225-441x},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icslp04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icslp04.ps.gz},
  year = 2004
}
@article{yamagishi2009,
  author = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga
                   and Ling, Zhenhua and Toda, Tomoki and Tokuda, Keiichi
                   and King, Simon and Renals, Steve},
  title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech
                   Synthesis},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {17},
  number = {6},
  pages = {1208--1230},
  abstract = {This paper describes a speaker-adaptive HMM-based
                   speech synthesis system. The new system, called
                   ``HTS-2007,'' employs speaker adaptation (CSMAPLR+MAP),
                   feature-space adaptive training, mixed-gender modeling,
                   and full-covariance modeling using CSMAPLR transforms,
                   in addition to several other techniques that have
                   proved effective in our previous systems. Subjective
                   evaluation results show that the new system generates
                   significantly better quality synthetic speech than
                   speaker-dependent approaches with realistic amounts of
                   speech data, and that it bears comparison with
                   speaker-dependent approaches even when large amounts of
                   speech data are available. In addition, a comparison
                   study with several speech synthesis techniques shows
                   the new system is very robust: It is able to build
                   voices from less-than-ideal speech data and synthesize
                   good-quality speech even for out-of-domain sentences.},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5109758&arnumber=5153555&count=14&index=12},
  year = 2009
}
@article{Stan2011442,
  author = {Adriana Stan and Junichi Yamagishi and Simon King and
                   Matthew Aylett},
  title = {The {R}omanian speech synthesis ({RSS}) corpus:
                   Building a high quality {HMM}-based speech synthesis
                   system using a high sampling rate},
  journal = {Speech Communication},
  volume = {53},
  number = {3},
  pages = {442--450},
  abstract = {This paper first introduces a newly-recorded high
                   quality Romanian speech corpus designed for speech
                   synthesis, called ``RSS'', along with Romanian
                   front-end text processing modules and HMM-based
                   synthetic voices built from the corpus. All of these
                   are now freely available for academic use in order to
                   promote Romanian speech technology research. The RSS
                   corpus comprises 3500 training sentences and 500 test
                   sentences uttered by a female speaker and was recorded
                   using multiple microphones at 96 kHz sampling
                   frequency in a hemianechoic chamber. The details of the
                   new Romanian text processor we have developed are also
                   given. Using the database, we then revisit some basic
                   configuration choices of speech synthesis, such as
                   waveform sampling frequency and auditory frequency
                   warping scale, with the aim of improving speaker
                   similarity, which is an acknowledged weakness of
                   current HMM-based speech synthesisers. As we
                   demonstrate using perceptual tests, these configuration
                   choices can make substantial differences to the quality
                   of the synthetic speech. Contrary to common practice in
                   automatic speech recognition, higher waveform sampling
                   frequencies can offer enhanced feature extraction and
                   improved speaker similarity for HMM-based speech
                   synthesis.},
  doi = {10.1016/j.specom.2010.12.002},
  issn = {0167-6393},
  keywords = {Speech synthesis, HTS, Romanian, HMMs, Sampling
                   frequency, Auditory scale},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639310002074},
  year = 2011
}
@incollection{renals2010,
  author = {Renals, Steve and King, Simon},
  title = {Automatic Speech Recognition},
  booktitle = {Handbook of Phonetic Sciences},
  publisher = {Wiley Blackwell},
  editor = {Hardcastle, William J. and Laver, John and Gibbon,
                   Fiona E.},
  chapter = {22},
  year = 2010
}
@article{frankel06:adapt,
  author = {Frankel, J. and King, S.},
  title = {Observation Process Adaptation for Linear Dynamic
                   Models},
  journal = {Speech Communication},
  volume = 48,
  number = 9,
  pages = {1192-1199},
  abstract = {This work introduces two methods for adapting the
                   observation process parameters of linear dynamic models
                   (LDM) or other linear-Gaussian models. The first method
                   uses the expectation-maximization (EM) algorithm to
                   estimate transforms for location and covariance
                   parameters, and the second uses a generalized EM (GEM)
                   approach which reduces computation in making updates
                   from $O(p^6)$ to $O(p^3)$, where $p$ is the feature
                   dimension. We present the results of speaker adaptation
                   on TIMIT phone classification and recognition
                   experiments with relative error reductions of up to
                   $6\%$. Importantly, we find minimal differences in the
                   results from EM and GEM. We therefore propose that the
                   GEM approach be applied to adaptation of hidden Markov
                   models which use non-diagonal covariances. We provide
                   the necessary update equations.},
  categories = {am,asr,ldm,timit,edinburgh},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/Frankel_King_SPECOM2006.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/Frankel_King_SPECOM2006.ps},
  year = 2006
}
@article{richmond2003,
  author = {Richmond, K. and King, S. and Taylor, P.},
  title = {Modelling the Uncertainty in Recovering Articulation
                   from Acoustics},
  journal = {Computer Speech and Language},
  volume = 17,
  pages = {153--172},
  abstract = {This paper presents an experimental comparison of the
                   performance of the multilayer perceptron (MLP) with
                   that of the mixture density network (MDN) for an
                   acoustic-to-articulatory mapping task. A corpus of
                   acoustic-articulatory data recorded by electromagnetic
                   articulography (EMA) for a single speaker was used as
                   training and test data for this purpose. In theory, the
                   MDN is able to provide a richer, more flexible
                   description of the target variables in response to a
                   given input vector than the least-squares trained MLP.
                   Our results show that the mean likelihoods of the
                   target articulatory parameters for an unseen test set
                   were indeed consistently higher with the MDN than with
                   the MLP. The increase ranged from approximately 3\% to
                   22\%, depending on the articulatory channel in
                   question. On the basis of these results, we argue that
                   using a more flexible description of the target domain,
                   such as that offered by the MDN, can prove beneficial
                   when modelling the acoustic-to-articulatory mapping.},
  categories = {artic, ann, mlp, mdn, inversion, mocha, edinburgh},
  key = {richmond2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/richmond2003.pdf},
  year = 2003
}
@inproceedings{oura:icassp:10,
  author = {Keiichiro Oura and Keiichi Tokuda and Junichi
                   Yamagishi and Mirjam Wester and Simon King},
  title = {Unsupervised Cross-lingual Speaker Adaptation for
                   {HMM}-based Speech Synthesis},
  booktitle = {Proc. of ICASSP},
  volume = {I},
  pages = {4954-4957},
  abstract = {In the EMIME project, we are developing a mobile
                   device that performs personalized speech-to-speech
                   translation such that a user's spoken input in one
                   language is used to produce spoken output in another
                   language, while continuing to sound like the user's
                   voice. We integrate two techniques, unsupervised
                   adaptation for HMM-based TTS using a word-based
                   large-vocabulary continuous speech recognizer and
                   cross-lingual speaker adaptation for HMM-based TTS,
                   into a single architecture. Thus, an unsupervised
                   cross-lingual speaker adaptation system can be
                   developed. Listening tests show very promising results,
                   demonstrating that adapted voices sound similar to the
                   target speaker and that differences between supervised
                   and unsupervised cross-lingual speaker adaptation are
                   small.},
  categories = {speaker adaptation, TTS},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/oura_icassp2010.pdf},
  year = 2010
}
@inproceedings{strom:etal:interspeech2007,
  author = {Volker Strom and Ani Nenkova and Robert Clark and
                   Yolanda Vazquez-Alvarez and Jason Brenier and Simon
                   King and Dan Jurafsky},
  title = {Modelling Prominence and Emphasis Improves
                   Unit-Selection Synthesis},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  abstract = {We describe the results of large scale perception
                   experiments showing improvements in synthesising two
                   distinct kinds of prominence: standard pitch-accent and
                   strong emphatic accents. Previously, prominence
                   assignment has mainly been evaluated by computing
                   accuracy on a prominence-labelled test set. By contrast,
                   we integrated an automatic pitch-accent classifier into
                   the unit selection target cost and showed that
                   listeners preferred these synthesised sentences. We
                   also describe an improved recording script for
                   collecting emphatic accents, and show that generating
                   emphatic accents leads to further improvements in the
                   fiction genre over incorporating pitch accent only.
                   Finally, we show differences in the effects of
                   prominence between child-directed speech and news and
                   fiction genres.},
  categories = {speech synthesis, prosody, prominence, pitch accent,
                   unit selection},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/p540.pdf},
  year = 2007
}
@inproceedings{clark_blizzard2006,
  author = {Clark, R. and Richmond, K. and Strom, V. and King, S.},
  title = {Multisyn Voices for the {B}lizzard {C}hallenge 2006},
  booktitle = {Proc. Blizzard Challenge Workshop (Interspeech
                   Satellite)},
  address = {Pittsburgh, USA},
  note = {(http://festvox.org/blizzard/blizzard2006.html)},
  abstract = {This paper describes the process of building unit
                   selection voices for the Festival Multisyn engine using
                   the ATR dataset provided for the Blizzard Challenge
                   2006. We begin by discussing recent improvements that
                   we have made to the Multisyn voice building process,
                   prompted by our participation in the Blizzard Challenge
                   2006. We then go on to discuss our interpretation of
                   the results observed. Finally, we conclude with some
                   comments and suggestions for the formulation of future
                   Blizzard Challenges.},
  categories = {tts, blizzard, multisyn, unit selection},
  key = {clark_blizzard2006},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/cstr_blizzard2006.pdf},
  year = 2006
}
@article{Hashimoto2012857,
  author = {Kei Hashimoto and Junichi Yamagishi and William Byrne
                   and Simon King and Keiichi Tokuda},
  title = {Impacts of machine translation and speech synthesis on
                   speech-to-speech translation},
  journal = {Speech Communication},
  volume = {54},
  number = {7},
  pages = {857--866},
  abstract = {This paper analyzes the impacts of machine translation
                   and speech synthesis on speech-to-speech translation
                   systems. A typical speech-to-speech translation system
                   consists of three components: speech recognition,
                   machine translation and speech synthesis. Many
                   techniques have been proposed for integration of speech
                   recognition and machine translation. However,
                   corresponding techniques have not yet been considered
                   for speech synthesis. The focus of the current work is
                   machine translation and speech synthesis, and we
                   present a subjective evaluation designed to analyze
                   their impact on speech-to-speech translation. The
                   results of these analyses show that the naturalness and
                   intelligibility of the synthesized speech are strongly
                   affected by the fluency of the translated sentences. In
                   addition, several features were found to correlate well
                   with the average fluency of the translated sentences
                   and the average naturalness of the synthesized speech.},
  doi = {10.1016/j.specom.2012.02.004},
  issn = {0167-6393},
  keywords = {Speech-to-speech translation, Machine translation,
                   Speech synthesis, Subjective evaluation},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639312000283},
  year = 2012
}
@inproceedings{vanbael:king:icphs2003,
  author = {Christophe Van Bael and Simon King},
  title = {An Accent-Independent Lexicon for Automatic Speech
                   Recognition},
  booktitle = {Proc. ICPhS},
  pages = {1165-1168},
  abstract = {Recent work at the Centre for Speech Technology
                   Research (CSTR) at the University of Edinburgh has
                   developed an accent-independent lexicon for speech
                   synthesis (the Unisyn project). The main purpose of this
                   lexicon is to avoid the problems and cost of writing a
                   new lexicon for every new accent needed for synthesis.
                   Only recently, a first attempt has been made to use the
                   Keyword Lexicon for automatic speech recognition.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/VanBael_King_icphs2003.pdf},
  year = 2003
}
@inproceedings{horlock:king:eurospeech2003a,
  author = {James Horlock and Simon King},
  title = {Named Entity Extraction from Word Lattices},
  booktitle = {Proc. Eurospeech},
  address = {Geneva},
  abstract = {We present a method for named entity extraction from
                   word lattices produced by a speech recogniser. Previous
                   work by others on named entity extraction from speech
                   has used either a manual transcript or 1-best
                   recogniser output. We describe how a single Viterbi
                   search can recover both the named entity sequence and
                   the corresponding word sequence from a word lattice,
                   and further that it is possible to trade off an
                   increase in word error rate for improved named entity
                   extraction.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Horlock_King_eurospeech2003a.pdf},
  year = 2003
}
@inproceedings{stan12_grapheme_alignment,
  author = {Stan, Adriana and Bell, Peter and King, Simon},
  title = {A Grapheme-based Method for Automatic Alignment of
                   Speech and Text Data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address = {Miami, Florida, USA},
  abstract = {This paper introduces a method for automatic alignment
                   of speech data with unsynchronised, imperfect
                   transcripts, for a domain where no initial acoustic
                   models are available. Using grapheme-based acoustic
                   models, word skip networks and orthographic speech
                   transcripts, we are able to harvest 55\% of the speech
                   with a 93\% utterance-level accuracy and 99\% word
                   accuracy for the produced transcriptions. The work is
                   based on the assumption that there is a high degree of
                   correspondence between the speech and text, and that a
                   full transcription of all of the speech is not
                   required. The method is language independent and the
                   only prior knowledge and resources required are the
                   speech and text transcripts, and a few minor user
                   interventions.},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/grapheme_alignment_slt2012.pdf},
  year = 2012
}
@inproceedings{CassiaSAPA12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Speech intelligibility enhancement for {HMM}-based
                   synthetic speech in noise}},
  booktitle = {Proc. Sapa Workshop},
  address = {Portland, USA},
  abstract = {It is possible to increase the intelligibility of
                   speech in noise by enhancing the clean speech signal.
                   In this paper we demonstrate the effects of modifying
                   the spectral envelope of synthetic speech according to
                   the environmental noise. To achieve this, we modify Mel
                   cepstral coefficients according to an intelligibility
                   measure that accounts for glimpses of speech in noise:
                   the Glimpse Proportion measure. We evaluate this method
                   against a baseline synthetic voice trained only with
                   normal speech and a topline voice trained with Lombard
                   speech, as well as natural speech. The intelligibility
                   of these voices was measured when mixed with
                   speech-shaped noise and with a competing speaker at
                   three different levels. The Lombard voices, both
                   natural and synthetic, were more intelligible than the
                   normal voices in all conditions. For speech-shaped
                   noise, the proposed modified voice was as intelligible
                   as the Lombard synthetic voice without requiring any
                   recordings of Lombard speech, which are hard to obtain.
                   However, in the case of competing talker noise, the
                   Lombard synthetic voice was more intelligible than the
                   proposed modified voice.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Sapa12.pdf},
  year = 2012
}
@inproceedings{clark_king:proc:2006,
  author = {Robert A. J. Clark and Simon King},
  title = {Joint Prosodic and Segmental Unit Selection Speech
                   Synthesis},
  booktitle = {Proc. Interspeech 2006},
  address = {Pittsburgh, USA},
  abstract = {We describe a unit selection technique for
                   text-to-speech synthesis which jointly searches the
                   space of possible diphone sequences and the space of
                   possible prosodic unit sequences in order to produce
                   synthetic speech with more natural prosody. We
                   demonstrate that this search, although currently
                   computationally expensive, can achieve improved
                   intonation compared to a baseline in which only the
                   space of possible diphone sequences is searched. We
                   discuss ways in which the search could be made
                   sufficiently efficient for use in a real-time system.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/clarkking_interspeech_2006.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/clarkking_interspeech_2006.ps},
  year = 2006
}
@inproceedings{wang_interspeech10,
  author = {Dong Wang and Simon King and Nick Evans and Raphael
                   Troncy},
  title = {{CRF}-based Stochastic Pronunciation Modelling for
                   Out-of-Vocabulary Spoken Term Detection},
  booktitle = {Proc. Interspeech},
  address = {Makuhari, Chiba, Japan},
  abstract = {Out-of-vocabulary (OOV) terms present a significant
                   challenge to spoken term detection (STD). This
                   challenge, to a large extent, lies in the high degree
                   of uncertainty in pronunciations of OOV terms. In
                   previous work, we presented a stochastic pronunciation
                   modeling (SPM) approach to compensate for this
                   uncertainty. A shortcoming of our original work,
                   however, is that the SPM was based on a joint-multigram
                   model (JMM), which is suboptimal. In this paper, we
                   propose to use conditional random fields (CRFs) for
                   letter-to-sound conversion, which significantly
                   improves the quality of the predicted pronunciations.
                   When applied to OOV STD, we achieve considerable
                   performance improvement with both a 1-best system and
                   an SPM-based system.},
  categories = {speech recognition, spoken term detection, conditional
                   random field, joint multigram model},
  month = sep,
  year = 2010
}
@inproceedings{strom10d,
  author = {Volker Strom and Simon King},
  title = {A classifier-based target cost for unit selection
                   speech synthesis trained on perceptual data},
  booktitle = {Proc.~Interspeech},
  address = {Makuhari, Japan},
  abstract = {Our goal is to automatically learn a
                   PERCEPTUALLY-optimal target cost function for a unit
                   selection speech synthesiser. The approach we take here
                   is to train a classifier on human perceptual judgements
                   of synthetic speech. The output of the classifier is
                   used to make a simple three-way distinction rather than
                   to estimate a continuously-valued cost. In order to
                   collect the necessary perceptual data, we synthesised
                   145,137 short sentences with the usual target cost
                   switched off, so that the search was driven by the join
                   cost only. We then selected the 7200 sentences with the
                   best joins and asked 60 listeners to judge them,
                   providing their ratings for each syllable. From this,
                   we derived a rating for each demiphone. Using as input
                   the same context features employed in our conventional
                   target cost function, we trained a classifier on these
                   human perceptual ratings. We synthesised two sets of
                   test sentences with both our standard target cost and
                   the new target cost based on the classifier. A/B
                   preference tests showed that the classifier-based
                   target cost, which was learned completely automatically
                   from modest amounts of perceptual data, is almost as
                   good as our carefully- and expertly-tuned standard
                   target cost.},
  categories = {speech synthesis, unit selection, target cost},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/strom10d.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/strom10d.ps},
  year = 2010
}
@article{frankel07:ldm,
  author = {Frankel, J. and King, S.},
  title = {Speech Recognition using Linear Dynamic Models},
  journal = {IEEE {T}ransactions on {S}peech and {A}udio
                   {P}rocessing},
  volume = 15,
  number = 1,
  pages = {246--256},
  abstract = {The majority of automatic speech recognition (ASR)
                   systems rely on hidden Markov models, in which Gaussian
                   mixtures model the output distributions associated with
                   sub-phone states. This approach, whilst successful,
                   models consecutive feature vectors (augmented to
                   include derivative information) as statistically
                   independent. Furthermore, spatial correlations present
                   in speech parameters are frequently ignored through the
                   use of diagonal covariance matrices. This paper
                   continues the work of Digalakis and others who proposed
                   instead a first-order linear state-space model which
                   has the capacity to model underlying dynamics, and
                   furthermore give a model of spatial correlations. This
                   paper examines the assumptions made in applying such a
                   model and shows that the addition of a hidden dynamic
                   state leads to increases in accuracy over otherwise
                   equivalent static models. We also propose a
                   time-asynchronous decoding strategy suited to
                   recognition with segment models. We describe
                   implementation of decoding for linear dynamic models
                   and present TIMIT phone recognition results.},
  categories = {am,asr,ldm,timit,search,edinburgh},
  month = {January},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_King_IEEE2007.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_King_IEEE2007.ps},
  year = 2007
}
@article{wang_ieeesigprocletters2011,
  author = {Dong Wang and Simon King},
  title = {Letter-to-Sound Pronunciation Prediction Using
                   Conditional Random Fields},
  journal = {IEEE Signal Processing Letters},
  volume = {18},
  number = {2},
  pages = {122--125},
  abstract = {Pronunciation prediction, or letter-to-sound (LTS)
                   conversion, is an essential task for speech synthesis,
                   open vocabulary spoken term detection and other
                   applications dealing with novel words. Most current
                   approaches (at least for English) employ data-driven
                   methods to learn and represent pronunciation ``rules''
                   using statistical models such as decision trees, hidden
                   Markov models (HMMs) or joint-multigram models (JMMs).
                   The LTS task remains challenging, particularly for
                   languages with a complex relationship between spelling
                   and pronunciation such as English. In this paper, we
                   propose to use a conditional random field (CRF) to
                   perform LTS because it avoids having to model a
                   distribution over observations and can perform global
                   inference, suggesting that it may be more suitable for
                   LTS than decision trees, HMMs or JMMs. One challenge in
                   applying CRFs to LTS is that the phoneme and grapheme
                   sequences of a word are generally of different lengths,
                   which makes CRF training difficult. To solve this
                   problem, we employed a joint-multigram model to
                   generate aligned training exemplars. Experiments
                   conducted with the AMI05 dictionary demonstrate that a
                   CRF significantly outperforms other models, especially
                   if n-best lists of predictions are generated.},
  categories = {letter-to-sound, conditional random field,
                   joint multigram model, speech synthesis, spoken term
                   detection},
  doi = {10.1109/LSP.2010.2098440},
  month = feb,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_ieeesigprocletters2011.pdf},
  year = 2011
}
@manual{king:verbmobil1996c,
  title = {Users {M}anual for {V}erbmobil {T}eilprojekt 4.4},
  author = {Simon King},
  organization = {IKP, Universität Bonn},
  abstract = {Verbmobil English synthesiser users manual},
  month = oct,
  year = 1996
}
@inproceedings{Gutkin:King:icpr04,
  author = {Alexander Gutkin and Simon King},
  title = {{S}tructural {R}epresentation of {S}peech for
                   {P}honetic {C}lassification},
  booktitle = {Proc. 17th International Conference on Pattern
                   Recognition (ICPR)},
  volume = 3,
  pages = {438--441},
  address = {Cambridge, UK},
  publisher = {IEEE Computer Society Press},
  abstract = { This paper explores the issues involved in using
                   symbolic metric algorithms for automatic speech
                   recognition (ASR), via a structural representation of
                   speech. This representation is based on a set of
                   phonological distinctive features which is a
                   linguistically well-motivated alternative to the
                   ``beads-on-a-string'' view of speech that is standard
                   in current ASR systems. We report the promising results
                   of phoneme classification experiments conducted on a
                   standard continuous speech task. },
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh},
  isbn = {0-7695-2128-2},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icpr04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icpr04.ps.gz},
  year = 2004
}
@inproceedings{Gutkin:King:icassp05,
  author = {Alexander Gutkin and Simon King},
  title = {{D}etection of {S}ymbolic {G}estural {E}vents in
                   {A}rticulatory {D}ata for {U}se in {S}tructural
                   {R}epresentations of {C}ontinuous {S}peech},
  booktitle = {Proc. IEEE International Conference on Acoustics,
                   Speech, and Signal Processing (ICASSP-05)},
  volume = {I},
  pages = {885--888},
  address = {Philadelphia, PA, USA},
  publisher = {IEEE Signal Processing Society Press},
  abstract = { One of the crucial issues which often needs to be
                   addressed in structural approaches to speech
                   representation is the choice of fundamental symbolic
                   units of representation. In this paper, a
                   physiologically inspired methodology for defining these
                   symbolic atomic units in terms of primitive
                   articulatory events is proposed. It is shown how the
                   atomic articulatory events (gestures) can be detected
                   directly in the articulatory data. An algorithm for
                   evaluating the reliability of the articulatory events
                   is described and promising results of the experiments
                   conducted on the MOCHA articulatory database are presented.
                   },
  categories = {structural,recognition,artic,mocha,edinburgh},
  isbn = {0-7803-8875-5},
  month = mar,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_icassp2005.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_icassp2005.ps.gz},
  year = 2005
}
@inproceedings{wang_std_covariance_icassp2010,
  author = {Wang, Dong and King, Simon and Frankel, Joe and Bell,
                   Peter},
  title = {Stochastic Pronunciation Modelling and Soft Match for
                   Out-of-vocabulary Spoken Term Detection},
  booktitle = {Proc. ICASSP},
  address = {Dallas, Texas, USA},
  abstract = {A major challenge faced by a spoken term detection
                   (STD) system is the detection of out-of-vocabulary
                   (OOV) terms. Although a subword-based STD system is
                   able to detect OOV terms, performance reduction is
                   always observed compared to in-vocabulary terms. One
                   challenge that OOV terms bring to STD is the
                   pronunciation uncertainty. A commonly used approach to
                   address this problem is a soft matching procedure, and
                   the other is the stochastic pronunciation modelling
                   (SPM) proposed by the authors. In this paper we compare
                   these two approaches, and combine them using a
                   discriminative decision strategy. Experimental results
                   demonstrated that SPM and soft match are highly
                   complementary, and their combination gives significant
                   performance improvement to OOV term detection.},
  keywords = {confidence estimation, spoken term detection, speech
                   recognition},
  month = mar,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang10_icassp.pdf},
  year = 2010
}
@inproceedings{jyamagis:emime,
  author = {Junichi Yamagishi and Mike Lincoln and Simon King and
                   John Dines and Matthew Gibson and Jilei Tian and Yong
                   Guan},
  title = {Analysis of Unsupervised and Noise-Robust
                   Speaker-Adaptive {HMM}-Based Speech Synthesis Systems
                   toward a Unified {ASR} and {TTS} Framework},
  booktitle = {Proc. Interspeech 2009},
  address = {Edinburgh, U.K.},
  abstract = {For the 2009 Blizzard Challenge we have built an
                   unsupervised version of the HTS-2008 speaker-adaptive
                   HMM-based speech synthesis system for English, and a
                   noise robust version of the systems for Mandarin. They
                   are designed from a multidisciplinary application point
                   of view in that we attempt to integrate the components
                   of the TTS system with other technologies such as ASR.
                   All the average voice models are trained exclusively
                   from recognized, publicly available, ASR databases.
                   Multi-pass LVCSR and confidence scores calculated from
                   confusion networks are used for the unsupervised
                   systems, and noisy data recorded in cars or public
                   spaces is used for the noise robust system. We believe
                   the developed systems form solid benchmarks and provide
                   good connections to ASR fields. This paper describes
                   the development of the systems and reports the results
                   and analysis of their evaluation.},
  month = sep,
  year = 2009
}
@inproceedings{CassiaLista12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Using an intelligibility measure to create noise
                   robust cepstral coefficients for {HMM}-based speech
                   synthesis}},
  booktitle = {Proc. LISTA Workshop},
  address = {Edinburgh, UK},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement},
  month = {May},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Lista12.pdf},
  year = 2012
}
@inproceedings{vepa-king_euro03,
  author = {Vepa, J. and King, S.},
  title = {Kalman-filter based Join Cost for Unit-selection
                   Speech Synthesis},
  booktitle = {Proc. {E}urospeech},
  address = {Geneva, Switzerland},
  abstract = {We introduce a new method for computing join cost in
                   unit-selection speech synthesis which uses a linear
                   dynamical model (also known as a Kalman filter) to
                   model line spectral frequency trajectories. The model
                   uses an underlying subspace in which it makes smooth,
                   continuous trajectories. This subspace can be seen as
                   an analogy for underlying articulator movement. Once
                   trained, the model can be used to measure how well
                   concatenated speech segments join together. The
                   objective join cost is based on the error between model
                   predictions and actual observations. We report
                   correlations between this measure and mean listener
                   scores obtained from a perceptual listening experiment.
                   Our experiments use a state-of-the-art unit-selection
                   text-to-speech system: `rVoice' from Rhetorical Systems
                   Ltd.},
  categories = {join cost, Kalman filter, LDM, rVoice, edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/vepa_eurospeech03.pdf},
  year = 2003
}
@incollection{king:ELL2_2006a,
  author = {Simon King},
  title = {Language variation in speech technologies},
  booktitle = {Encyclopedia of Language and Linguistics},
  publisher = {Elsevier},
  editor = {Keith Brown},
  edition = {2nd},
  year = 2006
}
@inproceedings{livescu07:manual,
  author = {Livescu, K. and Bezman, A. and Borges, N. and Yung, L.
                   and Çetin, Ö. and Frankel, J. and King, S. and
                   Magimai-Doss, M. and Chi, X. and Lavoie, L.},
  title = {Manual transcription of conversational speech at the
                   articulatory feature level},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  abstract = {We present an approach for the manual labeling of
                   speech at the articulatory feature level, and a new set
                   of labeled conversational speech collected using this
                   approach. A detailed transcription, including
                   overlapping or reduced gestures, is useful for studying
                   the great pronunciation variability in conversational
                   speech. It also facilitates the testing of feature
                   classifiers, such as those used in articulatory
                   approaches to automatic speech recognition. We describe
                   an effort to transcribe a small set of utterances drawn
                   from the Switchboard database using eight articulatory
                   tiers. Two transcribers have labeled these utterances
                   in a multi-pass strategy, allowing for correction of
                   errors. We describe the data collection methods and
                   analyze the data to determine how quickly and reliably
                   this type of transcription can be done. Finally, we
                   demonstrate one use of the new data set by testing a
                   set of multilayer perceptron feature classifiers against
                   both the manual labels and forced alignments.},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/livescu_icassp07_trans.pdf},
  year = 2007
}
@inproceedings{frankel00:NN_LDM,
  author = {Frankel, J. and Richmond, K. and King, S. and Taylor,
                   P.},
  title = {An automatic speech recognition system using neural
                   networks and linear dynamic models to recover and model
                   articulatory traces},
  booktitle = {Proc. {ICSLP}},
  abstract = {In this paper we describe a speech recognition system
                   using linear dynamic models and articulatory features.
                   Experiments are reported in which measured articulation
                   from the MOCHA corpus has been used, along with those
                   where the articulatory parameters are estimated from
                   the speech signal using a recurrent neural network.},
  categories = {am,artic,asr,ldm,mocha,edinburgh,inversion,ann},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Frankel_et_al_ICSLP2000.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Frankel_et_al_ICSLP2000.ps},
  year = 2000
}
@inproceedings{vepa-king-taylor_ieee02,
  author = {Vepa, J. and King, S. and Taylor, P.},
  title = {New Objective Distance Measures for Spectral
                   Discontinuities in Concatenative Speech Synthesis},
  booktitle = {Proc. {IEEE} 2002 workshop on speech synthesis},
  address = {Santa Monica, USA},
  abstract = {The quality of unit selection based concatenative
                   speech synthesis mainly depends on how well two
                   successive units can be joined together to minimise the
                   audible discontinuities. The objective measure of
                   discontinuity used when selecting units is known as the
                   `join cost'. The ideal join cost will measure
                   `perceived' discontinuity, based on easily measurable
                   spectral properties of the units being joined, in order
                   to ensure smooth and natural-sounding synthetic speech.
                   In this paper we describe a perceptual experiment
                   conducted to measure the correlation between
                   `subjective' human perception and various `objective'
                   spectrally-based measures proposed in the literature.
                   We also report new objective distance measures, derived
                   from various distance metrics based on these spectral
                   features, which correlate well with human perception
                   of concatenation discontinuities. Our
                   experiments used a state-of-the-art unit-selection
                   text-to-speech system: `rVoice' from Rhetorical Systems
                   Ltd.},
  categories = {join cost, weighted distances, MCA, rVoice, edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/vepa_tts02.pdf},
  year = 2002
}
@inproceedings{horlock:king:eurospeech2003b,
  author = {James Horlock and Simon King},
  title = {Discriminative Methods for Improving Named Entity
                   Extraction on Speech Data},
  booktitle = {Proc. Eurospeech},
  address = {Geneva},
  abstract = {In this paper we present a method of discriminatively
                   training language models for spoken language
                   understanding; we show improvements in named entity
                   F-scores on speech data using these improved language
                   models. A comparison between theoretical probabilities
                   associated with manual markup and the actual
                   probabilities of output markup is used to identify
                   probabilities requiring adjustment. We present results
                   which support our hypothesis that improvements in
                   F-scores are possible by using either previously used
                   training data or held out development data to improve
                   discrimination amongst a set of N-gram language models.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Horlock_King_eurospeech2003b.pdf},
  year = 2003
}
@inproceedings{higher_level,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {The role of higher-level linguistic features in
                   {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {841-844},
  address = {Makuhari, Japan},
  abstract = {We analyse the contribution of higher-level elements
                   of the linguistic specification of a data-driven speech
                   synthesiser to the naturalness of the synthetic speech
                   which it generates. The system is trained using various
                   subsets of the full feature-set, in which features
                   relating to syntactic category, intonational phrase
                   boundary, pitch accent and boundary tones are
                   selectively removed. Utterances synthesised by the
                   different configurations of the system are then
                   compared in a subjective evaluation of their
                   naturalness. The work presented forms background
                   analysis for an ongoing set of experiments in
                   performing text-to-speech (TTS) conversion based on
                   shallow features: features that can be trivially
                   extracted from text. By building a range of systems,
                   each assuming the availability of a different level of
                   linguistic annotation, we obtain benchmarks for our
                   on-going work.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100565.pdf},
  year = 2010
}
@inproceedings{joe_dong_simon_interspeech08_bottle,
  author = {Joe Frankel and Dong Wang and Simon King},
  title = {Growing bottleneck features for tandem {ASR}},
  booktitle = {Proc. Interspeech},
  pages = {1549},
  abstract = { We present a method for training bottleneck MLPs for
                   use in tandem ASR. Experiments on meetings data show
                   that this approach leads to improved performance
                   compared with training MLPs from a random
                   initialization. },
  categories = {tandem ASR, bottleneck MLP},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bottlenet.a.pdf},
  year = 2008
}
@inproceedings{tts_barra08,
  author = {R. Barra-Chicote and J. Yamagishi and J.M. Montero and
                   S. King and S. Lutfi and J. Macias-Guarasa},
  title = {Generacion de una voz sintetica en {C}astellano basada
                   en {HSMM} para la {E}valuacion {A}lbayzin 2008:
                   conversion texto a voz},
  booktitle = {V Jornadas en Tecnologia del Habla},
  pages = {115-118},
  note = {(in Spanish)},
  month = nov,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/tts-jth08.pdf},
  year = 2008
}
@article{king:jphon2003,
  author = {Simon King},
  title = {Dependence and independence in automatic speech
                   recognition and synthesis},
  journal = {Journal of Phonetics},
  volume = 31,
  number = {3-4},
  pages = {407-411},
  abstract = {A short review paper},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/King_jphon2003.pdf},
  year = 2003
}
@inproceedings{john:HTSGAP,
  author = {J. Dines and J. Yamagishi and S. King},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  booktitle = {Proc. Interspeech},
  pages = {1391--1394},
  address = {Brighton, U.K.},
  abstract = {The EMIME European project is conducting research in
                   the development of technologies for mobile,
                   personalised speech-to-speech translation systems. The
                   hidden Markov model is being used as the underlying
                   technology in both automatic speech recognition (ASR)
                   and text-to-speech synthesis (TTS) components; thus,
                   the investigation of unified statistical modelling
                   approaches has become an implicit goal of our research.
                   As one of the first steps towards this goal, we have
                   been investigating commonalities and differences
                   between HMM-based ASR and TTS. In this paper we present
                   results and analysis of a series of experiments that
                   have been conducted on English ASR and TTS systems,
                   measuring their performance with respect to phone set
                   and lexicon, acoustic feature type and dimensionality
                   and HMM topology. Our results show that, although the
                   fundamental statistical model may be essentially the
                   same, optimal ASR and TTS performance often demands
                   diametrically opposed system designs. This represents a
                   major challenge to be addressed in the investigation of
                   such unified modelling approaches.},
  month = sep,
  year = 2009
}
@inproceedings{strom06,
  author = {Volker Strom and Robert Clark and Simon King},
  title = {Expressive Prosody for Unit-selection Speech Synthesis},
  booktitle = {Proc.~Interspeech},
  address = {Pittsburgh},
  abstract = {Current unit selection speech synthesis voices cannot
                   produce emphasis or interrogative contours because of a
                   lack of the necessary prosodic variation in the
                   recorded speech database. A method of recording script
                   design is proposed which addresses this shortcoming.
                   Appropriate components were added to the target cost
                   function of the Festival Multisyn engine, and a
                   perceptual evaluation showed a clear preference over
                   the baseline system.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/strom06.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/strom06.ps},
  year = 2006
}
@article{turk:2429,
  author = {Alice Turk and James Scobbie and Christian Geng and
                   Cedric Macmartin and Ellen Bard and Barry Campbell and
                   Catherine Dickie and Eddie Dubourg and Bill Hardcastle
                   and Phil Hoole and Evia Kanaida and Robin Lickley and
                   Satsuki Nakai and Marianne Pouplier and Simon King and
                   Steve Renals and Korin Richmond and Sonja Schaeffler
                   and Ronnie Wiegand and Kevin White and Alan Wrench},
  title = {The {Edinburgh Speech Production Facility's}
                   articulatory corpus of spontaneous dialogue.},
  journal = {The Journal of the Acoustical Society of America},
  volume = {128},
  number = {4},
  pages = {2429-2429},
  abstract = {The EPSRC-funded Edinburgh Speech Production Facility is
                   built around two synchronized Carstens AG500
                   electromagnetic articulographs (EMAs) in order to
                   capture articulatory/acoustic data from spontaneous
                   dialogue. An initial articulatory corpus was designed
                   with two aims. The first was to elicit a range of
                   speech styles/registers from speakers, and therefore
                   provide an alternative to fully scripted corpora. The
                   second was to extend the corpus beyond monologue, by
                   using tasks that promote natural discourse and
                   interaction. A subsidiary driver was to use dialects
                   from outwith North America: dialogues paired up a
                   Scottish English and a Southern British English
                   speaker. Tasks. Monologue: Story reading of ``Comma
                   Gets a Cure'' [Honorof et al. (2000)], lexical sets
                   [Wells (1982)], spontaneous story telling,
                   diadochokinetic tasks. Dialogue: Map tasks [Anderson et
                   al. (1991)], ``Spot the Difference'' picture tasks
                   [Bradlow et al. (2007)], story-recall, and shadowing of
                   the spontaneous story telling by the second
                   participant. Each dialogue session includes
                   approximately 30 min of speech, and there are
                   acoustics-only baseline materials. We will introduce
                   the corpus and highlight the role of articulatory
                   production data in helping provide a fuller
                   understanding of various spontaneous speech phenomena
                   by presenting examples of naturally occurring covert
                   speech errors, accent accommodation, turn taking
                   negotiation, and shadowing.},
  doi = {10.1121/1.3508679},
  publisher = {ASA},
  year = 2010
}
@inproceedings{kurimo:acl:10,
  author = {Mikko Kurimo and William Byrne and John Dines and
                   Philip N. Garner and Matthew Gibson and Yong Guan and
                   Teemu Hirsim\"{a}ki and Reima Karhila and Simon King
                   and Hui Liang and Keiichiro Oura and Lakshmi Saheer and
                   Matt Shannon and Sayaka Shiota and Jilei Tian and
                   Keiichi Tokuda and Mirjam Wester and Yi-Jian Wu and
                   Junichi Yamagishi},
  title = {Personalising speech-to-speech translation in the
                   {EMIME} project},
  booktitle = {Proc. of the ACL 2010 System Demonstrations},
  address = {Uppsala, Sweden},
  abstract = {In the EMIME project we have studied unsupervised
                   cross-lingual speaker adaptation. We have employed an
                   HMM statistical framework for both speech recognition
                   and synthesis which provides transformation mechanisms
                   to adapt the synthesized voice in TTS (text-to-speech)
                   using the recognized voice in ASR (automatic speech
                   recognition). An important application for this
                   research is personalised speech-to-speech translation
                   that will use the voice of the speaker in the input
                   language to utter the translated sentences in the
                   output language. In mobile environments this enhances
                   the users' interaction across language barriers by
                   making the output speech sound more like the original
                   speaker's way of speaking, even if she or he could not
                   speak the output language.},
  categories = {speaker adaptation},
  month = {July},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/kurimo_acl_2010.pdf},
  year = 2010
}
@inproceedings{dongwang_interspeech09_cmb,
  author = {Javier Tejedor and Dong Wang and Simon King and Joe
                   Frankel and Jose Colas},
  title = {A Posterior Probability-based System Hybridisation and
                   Combination for Spoken Term Detection},
  booktitle = {Proc. Interspeech},
  pages = {2131--2134},
  address = {Brighton, UK},
  abstract = {Spoken term detection (STD) is a fundamental task for
                   multimedia information retrieval. To improve the
                   detection performance, we have presented a direct
                   posterior-based confidence measure generated from a
                   neural network. In this paper, we propose a
                   detection-independent confidence estimation based on
                   the direct posterior confidence measure, in which the
                   decision making is totally separated from the term
                   detection. Based on this idea, we first present a
                   hybrid system which conducts the term detection and
                   confidence estimation based on different sub-word
                   units, and then propose a combination method which
                   merges detections from heterogeneous term detectors
                   based on the direct posterior-based confidence.
                   Experimental results demonstrated that the proposed
                   methods improved system performance considerably for
                   both English and Spanish. },
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cmb.pdf},
  year = 2009
}
@inproceedings{bell_king_full_covariance_asru2009,
  author = {Bell, Peter and King, Simon},
  title = {Diagonal Priors for Full Covariance Speech Recognition},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
                   and Understanding},
  address = {Merano, Italy},
  abstract = {We investigate the use of full covariance Gaussians
                   for large-vocabulary speech recognition. The large
                   number of parameters gives high modelling power, but
                   when training data is limited, the standard sample
                   covariance matrix is often poorly conditioned, and has
                   high variance. We explain how these problems may be
                   solved by the use of a diagonal covariance smoothing
                   prior, and relate this to the shrinkage estimator, for
                   which the optimal shrinkage parameter may itself be
                   estimated from the training data. We also compare the
                   use of generatively and discriminatively trained
                   priors. Results are presented on a large vocabulary
                   conversational telephone speech recognition task.},
  doi = {10.1109/ASRU.2009.5373344},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/shrinkage_asru2009.pdf},
  year = 2009
}
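
The diagonal-prior idea summarised in the abstract above amounts to interpolating a full sample covariance towards its diagonal. The short NumPy sketch below is only an illustration of that general shrinkage technique under an assumed, hand-chosen weight; the paper itself estimates the optimal shrinkage parameter from the training data, and that estimator is not reproduced here.

# Minimal sketch of a diagonal-target shrinkage estimate for one Gaussian
# (illustrative only; not the paper's implementation).
import numpy as np

def shrink_to_diagonal(data, lam):
    """Interpolate the sample covariance towards its own diagonal.

    data : (n_frames, dim) array of feature vectors assigned to one Gaussian
    lam  : assumed shrinkage weight in [0, 1]; lam = 1 gives a diagonal model
    """
    sample_cov = np.cov(data, rowvar=False)      # full sample covariance
    diag_target = np.diag(np.diag(sample_cov))   # diagonal "prior" target
    return lam * diag_target + (1.0 - lam) * sample_cov

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    frames = rng.standard_normal((50, 13))       # few frames, 13-dim features
    shrunk = shrink_to_diagonal(frames, lam=0.3)
    # The shrunk matrix is better conditioned than the raw sample covariance.
    print(np.linalg.cond(np.cov(frames, rowvar=False)), np.linalg.cond(shrunk))
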
@inproceedings{shig041,
  author = {Yoshinori Shiga and Simon King},
  title = {Accurate spectral envelope estimation for
                   articulation-to-speech synthesis},
  booktitle = {Proc. 5th ISCA Speech Synthesis Workshop},
  pages = {19--24},
  address = {CMU, Pittsburgh, USA},
  abstract = {This paper introduces a novel articulatory-acoustic
                   mapping in which detailed spectral envelopes are
                   estimated based on the cepstrum, inclusive of the
                   high-quefrency elements which are discarded in
                   conventional speech synthesis to eliminate the pitch
                   component of speech. For this estimation, the method
                   deals with the harmonics of multiple voiced-speech
                   spectra so that several sets of harmonics can be
                   obtained at various pitch frequencies to form a
                   spectral envelope. The experimental result shows that
                   the method estimates spectral envelopes with the
                   highest accuracy when the cepstral order is 48--64,
                   which suggests that the higher order coefficients are
                   required to represent detailed envelopes reflecting the
                   real vocal-tract responses.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope,
                   edinburgh},
  month = jun,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.ps},
  year = 2004
}
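
As a rough illustration of the cepstrum-based envelope estimation described in the abstract above, the sketch below fits cepstral coefficients to harmonic log-amplitudes pooled over voiced frames by plain least squares. The function names, the cosine-series model and the toy data are assumptions for illustration; the paper's actual estimation procedure is not reproduced here.

# Illustrative least-squares fit of a cepstral envelope to pooled harmonics.
import numpy as np

def fit_cepstral_envelope(harmonic_freqs, harmonic_log_amps, order):
    """harmonic_freqs: angular frequencies in [0, pi) of observed harmonics
       harmonic_log_amps: their log-amplitudes
       order: cepstral order (number of cosine terms)"""
    w = np.asarray(harmonic_freqs)
    # Model: log|H(w)| ~= c0 + 2 * sum_k c_k cos(k w)
    cols = [np.ones_like(w)] + [2.0 * np.cos(k * w) for k in range(1, order + 1)]
    A = np.stack(cols, axis=1)
    c, *_ = np.linalg.lstsq(A, np.asarray(harmonic_log_amps), rcond=None)
    return c                                     # cepstral coefficients c0..c_order

def envelope(c, n_points=512):
    """Evaluate the fitted log-magnitude envelope on a regular frequency grid."""
    w = np.linspace(0, np.pi, n_points)
    cols = [np.ones_like(w)] + [2.0 * np.cos(k * w) for k in range(1, len(c))]
    return np.stack(cols, axis=1) @ c

if __name__ == "__main__":
    # Toy example: harmonics of a 200 Hz voice sampled at 16 kHz.
    f0, fs = 200.0, 16000.0
    freqs = np.arange(1, 40) * 2 * np.pi * f0 / fs
    log_amps = -0.5 * freqs                      # an arbitrary decaying "envelope"
    c = fit_cepstral_envelope(freqs, log_amps, order=48)
    print(c[:4])
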
@inproceedings{king:tokuda:zen:yamagishi:interspeech2008,
  author = {Simon King and Keiichi Tokuda and Heiga Zen and
                   Junichi Yamagishi},
  title = {Unsupervised adaptation for HMM-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {1869-1872},
  address = {Brisbane, Australia},
  abstract = {It is now possible to synthesise speech using HMMs
                   with a comparable quality to unit-selection techniques.
                   Generating speech from a model has many potential
                   advantages over concatenating waveforms. The most
                   exciting is model adaptation. It has been shown that
                   supervised speaker adaptation can yield high-quality
                   synthetic voices with an order of magnitude less data
                   than required to train a speaker-dependent model or to
                   build a basic unit-selection system. Such supervised
                   methods require labelled adaptation data for the target
                   speaker. In this paper, we introduce a method capable
                   of unsupervised adaptation, using only speech from the
                   target speaker without any labelling.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   trajectory HMMs, speaker adaptation, MLLR},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080299.PDF},
  year = 2008
}
@inproceedings{sansegundo_et_al_IS2012,
  author = {Ruben San-Segundo and Juan M. Montero and Veronica
                   Lopez-Luden and Simon King},
  title = {Detecting Acronyms from Capital Letter Sequences in
                   Spanish},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {This paper presents an automatic strategy to decide
                   how to pronounce a Capital Letter Sequence (CLS) in a
                   Text to Speech system (TTS). If the CLS is well known to
                   the TTS, it can be expanded into several words. But when
                   the CLS is unknown, the system has two alternatives:
                   spelling it (abbreviation) or pronouncing it as a new
                   word (acronym). In Spanish, there is a close
                   relationship between letters and phonemes. Because of
                   this, when a CLS is similar to other words in Spanish,
                   there is a high tendency to pronounce it as a standard
                   word. This paper proposes an automatic method for
                   detecting acronyms. Additionally, this paper analyses
                   the discrimination capability of some features, and
                   several strategies for combining them in order to
                   obtain the best classifier. For the best classifier,
                   the classification error is 8.45\%. Regarding the feature
                   analysis, the best features were the Letter
                   Sequence Perplexity and the Average N-gram order.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Thu-P10a-07.pdf},
  year = 2012
}
@inproceedings{toth:frankel:goztolya:king:interspeech2008,
  author = {Laszlo Toth and Joe Frankel and Gabor Gosztolya and
                   Simon King},
  title = {Cross-lingual Portability of MLP-Based Tandem Features
                   -- A Case Study for English and Hungarian},
  booktitle = {Proc. Interspeech},
  pages = {2695-2698},
  address = {Brisbane, Australia},
  abstract = {One promising approach for building ASR systems for
                   less-resourced languages is cross-lingual adaptation.
                   Tandem ASR is particularly well suited to such
                   adaptation, as it includes two cascaded modelling
                   steps: feature extraction using multi-layer perceptrons
                   (MLPs), followed by modelling using a standard HMM. The
                   language-specific tuning can be performed by adjusting
                   the HMM only, leaving the MLP untouched. Here we
                   examine the portability of feature extractor MLPs
                   between an Indo-European (English) and a Finno-Ugric
                   (Hungarian) language. We present experiments which use
                   both conventional phone-posterior and articulatory
                   feature (AF) detector MLPs, both trained on a much
                   larger quantity of (English) data than the monolingual
                   (Hungarian) system. We find that the cross-lingual
                   configurations achieve similar performance to the
                   monolingual system, and that, interestingly, the AF
                   detectors lead to slightly worse performance, despite
                   the expectation that they should be more
                   language-independent than phone-based MLPs. However,
                   the cross-lingual system outperforms all other
                   configurations when the English phone MLP is adapted on
                   the Hungarian data. },
  keywords = {tandem, ASR},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080729.PDF},
  year = 2008
}
@article{child_speech_journal_2010,
  author = {Watts, O. and Yamagishi, J. and King, S. and Berkling,
                   K.},
  title = {Synthesis of Child Speech with {HMM} Adaptation and
                   Voice Conversion},
  journal = {Audio, Speech, and Language Processing, IEEE
                   Transactions on},
  volume = {18},
  number = {5},
  pages = {1005--1016},
  abstract = {The synthesis of child speech presents challenges both
                   in the collection of data and in the building of a
                   synthesizer from that data. We chose to build a
                   statistical parametric synthesizer using the hidden
                   Markov model (HMM)-based system HTS, as this technique
                   has previously been shown to perform well for limited
                   amounts of data, and for data collected under imperfect
                   conditions. Six different configurations of the
                   synthesizer were compared, using both speaker-dependent
                   and speaker-adaptive modeling techniques, and using
                   varying amounts of data. For comparison with HMM
                   adaptation, techniques from voice conversion were used
                   to transform existing synthesizers to the
                   characteristics of the target speaker. Speaker-adaptive
                   voices generally outperformed child speaker-dependent
                   voices in the evaluation. HMM adaptation outperformed
                   voice conversion style techniques when using the full
                   target speaker corpus; with fewer adaptation data,
                   however, no significant listener preference for either
                   HMM adaptation or voice conversion methods was found.},
  doi = {10.1109/TASL.2009.2035029},
  issn = {1558-7916},
  keywords = {HMM adaptation techniques;child speech
                   synthesis;hidden Markov model;speaker adaptive modeling
                   technique;speaker dependent technique;speaker-adaptive
                   voice;statistical parametric synthesizer;target speaker
                   corpus;voice conversion;hidden Markov models;speech
                   synthesis;},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/YamagishiJ_Synthesis\%20of\%20Child\%20Speech.pdf},
  year = 2010
}
@article{tejedor:wang:frankel:king:colas:specom2008,
  author = {Javier Tejedor and Dong Wang and Joe Frankel and Simon
                   King and José Colás},
  title = {A comparison of grapheme and phoneme-based units for
                   {S}panish spoken term detection},
  journal = {Speech Communication},
  volume = {50},
  number = {11-12},
  pages = {980-991},
  abstract = {The ever-increasing volume of audio data available
                   online through the world wide web means that automatic
                   methods for indexing and search are becoming essential.
                   Hidden Markov model (HMM) keyword spotting and lattice
                   search techniques are the two most common approaches
                   used by such systems. In keyword spotting, models or
                   templates are defined for each search term prior to
                   accessing the speech and used to find matches. Lattice
                   search (referred to as spoken term detection) uses a
                   pre-indexing of speech data in terms of word or
                   sub-word units, which can then quickly be searched for
                   arbitrary terms without referring to the original
                   audio. In both cases, the search term can be modelled
                   in terms of sub-word units, typically phonemes. For
                   in-vocabulary words (i.e. words that appear in the
                   pronunciation dictionary), the letter-to-sound
                   conversion systems are accepted to work well. However,
                   for out-of-vocabulary (OOV) search terms,
                   letter-to-sound conversion must be used to generate a
                   pronunciation for the search term. This is usually a
                   hard decision (i.e. not probabilistic and with no
                   possibility of backtracking), and errors introduced at
                   this step are difficult to recover from. We therefore
                   propose the direct use of graphemes (i.e., letter-based
                   sub-word units) for acoustic modelling. This is
                   expected to work particularly well in languages such as
                   Spanish, where despite the letter-to-sound mapping
                   being very regular, the correspondence is not
                   one-to-one, and there will be benefits from avoiding
                   hard decisions at early stages of processing. In this
                   article, we compare three approaches for Spanish
                   keyword spotting or spoken term detection, and within
                   each of these we compare acoustic modelling based on
                   phone and grapheme units. Experiments were performed
                   using the Spanish geographical-domain Albayzin corpus.
                   Results achieved in the two approaches proposed for
                   spoken term detection show us that trigrapheme units
                   for acoustic modelling match or exceed the performance
                   of phone-based acoustic models. In the method proposed
                   for keyword spotting, the results achieved with each
                   acoustic model are very similar.},
  categories = {Spoken term detection; Keyword spotting; Graphemes;
                   Spanish},
  doi = {10.1016/j.specom.2008.03.005},
  month = {November-December},
  year = 2008
}
@inproceedings{mayoclarkking-isp05,
  author = {Mayo, C. and Clark, R. A. J. and King, S.},
  title = {Multidimensional Scaling of Listener Responses to
                   Synthetic Speech},
  booktitle = {Proc. Interspeech 2005},
  address = {Lisbon, Portugal},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/ie-speech-2005.pdf},
  year = 2005
}
@inproceedings{king00:recognition_syll,
  author = {King, S. and Taylor, P. and Frankel, J. and Richmond,
                   K.},
  title = {Speech recognition via phonetically-featured syllables},
  booktitle = {PHONUS},
  volume = {5},
  pages = {15-34},
  address = {Institute of Phonetics, University of the Saarland},
  abstract = {We describe recent work on two new automatic speech
                   recognition systems. The first part of this paper
                   describes the components of a system based on
                   phonological features (which we call EspressoA) in
                   which the values of these features are estimated from
                   the speech signal before being used as the basis for
                   recognition. In the second part of the paper, another
                   system (which we call EspressoB) is described in which
                   articulatory parameters are used instead of
                   phonological features and a linear dynamical system
                   model is used to perform recognition from automatically
                   estimated values of these articulatory parameters.},
  categories = {am,artic,asr,ldm,phonetic_feature,mocha,timit,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_et_al_Phonus2000.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_et_al_Phonus2000.ps},
  year = 2000
}
@techreport{king:verbmobil1996b,
  author = {Simon King},
  title = {Inventory design for {V}erbmobil {T}eilprojekt 4.4},
  institution = {IKP, Universität Bonn},
  abstract = {Inventory design for Verbmobil English speech
                   synthesis},
  categories = {},
  month = oct,
  year = 1996
}
@inproceedings{5947506,
  author = {Hashimoto, K. and Yamagishi, J. and Byrne, W. and
                   King, S. and Tokuda, K.},
  title = {An analysis of machine translation and speech
                   synthesis in speech-to-speech translation system},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
                   IEEE International Conference on},
  pages = {5108--5111},
  abstract = {This paper provides an analysis of the impacts of
                   machine translation and speech synthesis on
                   speech-to-speech translation systems. The
                   speech-to-speech translation system consists of three
                   components: speech recognition, machine translation and
                   speech synthesis. Many techniques for integration of
                   speech recognition and machine translation have been
                   proposed. However, speech synthesis has not yet been
                   considered. Therefore, in this paper, we focus on
                   machine translation and speech synthesis, and report a
                   subjective evaluation to analyze the impact of each
                   component. The results of these analyses show that the
                   naturalness and intelligibility of synthesized speech
                   are strongly affected by the fluency of the translated
                   sentences.},
  doi = {10.1109/ICASSP.2011.5947506},
  issn = {1520-6149},
  keywords = {machine translation;speech recognition;speech
                   synthesis;speech-to-speech translation system;speech
                   recognition;speech synthesis;},
  month = may,
  year = 2011
}
@inproceedings{junichi:interspeech2010,
  author = {Junichi Yamagishi and Oliver Watts and Simon King and
                   Bela Usabaev},
  title = {Roles of the Average Voice in Speaker-adaptive
                   {HMM}-based Speech Synthesis},
  booktitle = {{Proc. Interspeech}},
  pages = {418--421},
  address = {Makuhari, Japan},
  abstract = {In speaker-adaptive HMM-based speech synthesis, there
                   are typically a few speakers for which the output
                   synthetic speech sounds worse than that of other
                   speakers, despite having the same amount of adaptation
                   data from within the same corpus. This paper
                   investigates these fluctuations in quality and
                   concludes that as mel-cepstral distance from the
                   average voice becomes larger, the MOS naturalness
                   scores generally become worse. Although this negative
                   correlation is not that strong, it suggests a way to
                   improve the training and adaptation strategies. We also
                   draw comparisons between our findings and the work of
                   other researchers regarding ``vocal attractiveness.''},
  keywords = {speech synthesis, HMM, average voice, speaker
                   adaptation},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100361.pdf},
  year = 2010
}
@inproceedings{junichi:icassp2010,
  author = {J. Yamagishi and S. King},
  title = {Simple methods for improving speaker-similarity of
                   {HMM}-based speech synthesis},
  booktitle = {{Proc. ICASSP 2010}},
  address = {Dallas, Texas, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/JunichiICASSP10.pdf},
  year = 2010
}
@inproceedings{frankel05:hybrid,
  author = {Frankel, J. and King, S.},
  title = {A Hybrid {ANN/DBN} Approach to Articulatory Feature
                   Recognition},
  booktitle = {Proc. Eurospeech},
  address = {Lisbon},
  abstract = {Artificial neural networks (ANN) have proven to be
                   well suited to the task of articulatory feature (AF)
                   recognition. Previous studies have taken a cascaded
                   approach where separate ANNs are trained for each
                   feature group, making the assumption that features are
                   statistically independent. We address this by using
                   ANNs to provide virtual evidence to a dynamic Bayesian
                   network (DBN). This gives a hybrid ANN/DBN model and
                   allows modelling of inter-feature dependencies. We
                   demonstrate significant increases in AF recognition
                   accuracy from modelling dependencies between features,
                   and present the results of embedded training
                   experiments in which a set of asynchronous feature
                   changes are learned. Furthermore, we report on the
                   application of a Viterbi training scheme in which we
                   alternate between realigning the AF training labels and
                   retraining the ANNs.},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/Frankel_King_INTER2005.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/Frankel_King_INTER2005.ps},
  year = 2005
}
@inproceedings{gillett:king:eurospeech2003a,
  author = {Ben Gillett and Simon King},
  title = {Transforming Voice Quality},
  booktitle = {Proc. {E}urospeech},
  address = {Geneva},
  abstract = {Voice transformation is the process of transforming
                   the characteristics of speech uttered by a source
                   speaker, such that a listener would believe the speech
                   was uttered by a target speaker. In this paper we
                   address the problem of transforming voice quality. We
                   do not attempt to transform prosody. Our system has two
                   main parts corresponding to the two components of the
                   source-filter model of speech production. The first
                   component transforms the spectral envelope as
                   represented by a linear prediction model. The
                   transformation is achieved using a Gaussian mixture
                   model, which is trained on aligned speech from source
                   and target speakers. The second part of the system
                   predicts the spectral detail from the transformed
                   linear prediction coefficients. A novel approach is
                   proposed, which is based on a classifier and residual
                   codebooks. On the basis of a number of performance
                   metrics it outperforms existing systems.},
  categories = {},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Gillett_King_eurospeech2003b.pdf},
  year = 2003
}
@article{junichi:ieee2010,
  author = {J. Yamagishi and B. Usabaev and S. King and O. Watts
                   and J. Dines and J. Tian and R. Hu and Y. Guan and K.
                   Oura and K. Tokuda and R. Karhila and M. Kurimo},
  title = {Thousands of Voices for {HMM}-based Speech Synthesis
                   -- Analysis and Application of {TTS} Systems Built on
                   Various {ASR} Corpora},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = 18,
  number = 5,
  pages = {984--1004},
  abstract = {In conventional speech synthesis, large amounts of
                   phonetically balanced speech data recorded in highly
                   controlled recording studio environments are typically
                   required to build a voice. Although using such data is
                   a straightforward solution for high quality synthesis,
                   the number of voices available will always be limited,
                   because recording costs are high. On the other hand,
                   our recent experiments with HMM-based speech synthesis
                   systems have demonstrated that speaker-adaptive
                   HMM-based speech synthesis (which uses an ``average
                   voice model'' plus model adaptation) is robust to
                   non-ideal speech data that are recorded under various
                   conditions and with varying microphones, that are not
                   perfectly clean, and/or that lack phonetic balance.
                   This enables us to consider building high-quality
                   voices on ``non-TTS'' corpora such as ASR corpora.
                   Since ASR corpora generally include a large number of
                   speakers, this leads to the possibility of producing an
                   enormous number of voices automatically. In this paper,
                   we demonstrate the thousands of voices for HMM-based
                   speech synthesis that we have made from several popular
                   ASR corpora such as the Wall Street Journal (WSJ0,
                   WSJ1, and WSJCAM0), Resource Management, Globalphone,
                   and SPEECON databases. We also present the results of
                   associated analysis based on perceptual evaluation, and
                   discuss remaining issues.},
  doi = {10.1109/TASL.2010.2045237},
  keywords = {Automatic speech recognition (ASR), H Triple S (HTS),
                   SPEECON database, WSJ database, average voice, hidden
                   Markov model (HMM)-based speech synthesis, speaker
                   adaptation, speech synthesis, voice conversion},
  month = jul,
  year = 2010
}
@inproceedings{karaiskos:king:clark:mayo:blizzard2008,
  author = {Vasilis Karaiskos and Simon King and Robert A. J.
                   Clark and Catherine Mayo},
  title = {The Blizzard Challenge 2008},
  booktitle = {Proc. Blizzard Challenge Workshop},
  address = {Brisbane, Australia},
  abstract = {The Blizzard Challenge 2008 was the fourth annual
                   Blizzard Challenge. This year, participants were asked
                   to build two voices from a UK English corpus and one
                   voice from a Man- darin Chinese corpus. This is the
                   first time that a language other than English has been
                   included and also the first time that a large UK
                   English corpus has been available. In addi- tion, the
                   English corpus contained somewhat more expressive
                   speech than that found in corpora used in previous
                   Blizzard Challenges. To assist participants with
                   limited resources or limited ex- perience in
                   UK-accented English or Mandarin, unaligned la- bels
                   were provided for both corpora and for the test
                   sentences. Participants could use the provided labels
                   or create their own. An accent-specific pronunciation
                   dictionary was also available for the English speaker.
                   A set of test sentences was released to participants,
                   who were given a limited time in which to synthesise
                   them and submit the synthetic speech. An online
                   listening test was conducted, to evaluate
                   naturalness, intelligibility and degree of similarity
                   to the original speaker.},
  keywords = {Blizzard},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/summary_Blizzard2008.pdf},
  year = 2008
}
@inproceedings{goubanova_king_isp05,
  author = {Olga Goubanova and Simon King},
  title = {Predicting Consonant Duration with {B}ayesian Belief
                   Networks},
  booktitle = {Proc. Interspeech 2005},
  address = {Lisbon, Portugal},
  abstract = {Consonant duration is influenced by a number of
                   linguistic factors such as the consonant's identity,
                   within-word position, stress level of the previous and
                   following vowels, phrasal position of the word
                   containing the target consonant, its syllabic position,
                   identity of the previous and following segments. In our
                   work, consonant duration is predicted from a Bayesian
                   belief network (BN) consisting of discrete nodes for
                   the linguistic factors and a single continuous node for
                   the consonant's duration. Interactions between factors
                   are represented as conditional dependency arcs in this
                   graphical model. Given the parameters of the belief
                   network, the duration of each consonant in the test set
                   is then predicted as the value with the maximum
                   probability. We compare the results of the belief
                   network model with those of sums-of-products (SoP) and
                   classification and regression tree (CART) models using
                   the same data. In terms of RMS error, our BN model
                   performs better than both CART and SoP models. In terms
                   of the correlation coefficient, our BN model performs
                   better than the SoP model, and no worse than the CART model. In
                   addition, the Bayesian model reliably predicts
                   consonant duration in cases of missing or hidden
                   linguistic factors.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/goubanova_king_isp2005.pdf},
  year = 2005
}
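
To make the prediction step concrete, here is a toy sketch (not the paper's Bayesian network) in which a consonant's duration is predicted from a tuple of discrete linguistic factors, each factor combination carrying a Gaussian whose most probable value, its mean, is returned as the prediction. The class name, factor values and back-off rule are illustrative assumptions only; a real belief network would factorise the dependencies and handle missing factors by marginalisation.

# Toy conditional-mean duration predictor keyed by discrete linguistic factors.
from collections import defaultdict
import statistics

class DurationTable:
    def __init__(self):
        self._samples = defaultdict(list)

    def add(self, factors, duration):
        """factors: tuple of discrete values, e.g. (identity, stress, position)."""
        self._samples[tuple(factors)].append(duration)

    def predict(self, factors):
        durations = self._samples.get(tuple(factors))
        if not durations:
            # Back off to the global mean when the combination was never seen.
            durations = [d for ds in self._samples.values() for d in ds]
        return statistics.fmean(durations)   # mode of a Gaussian = its mean

if __name__ == "__main__":
    table = DurationTable()
    table.add(("t", "stressed", "onset"), 0.085)
    table.add(("t", "stressed", "onset"), 0.092)
    table.add(("t", "unstressed", "coda"), 0.060)
    print(table.predict(("t", "stressed", "onset")))   # about 0.0885 s
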
@inproceedings{CassiaIS12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Mel cepstral coefficient modification based on the
                   Glimpse Proportion measure for improving the
                   intelligibility of {HMM}-generated synthetic speech in
                   noise}},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  abstract = {We propose a method that modifies the Mel cepstral
                   coefficients of HMM-generated synthetic speech in order
                   to increase the intelligibility of the generated speech
                   when heard by a listener in the presence of a known
                   noise. This method is based on an approximation we
                   previously proposed for the Glimpse Proportion measure.
                   Here we show how to update the Mel cepstral
                   coefficients using this measure as an optimization
                   criterion and how to control the amount of distortion
                   by limiting the frequency resolution of the
                   modifications. To evaluate the method we built eight
                   different voices from normal read-text speech data from
                   a male speaker. Some voices were also built from
                   Lombard speech data produced by the same speaker.
                   Listening experiments with speech-shaped noise and with
                   a single competing talker indicate that our method
                   significantly improves intelligibility when compared to
                   unmodified synthetic speech. The voices built from
                   Lombard speech outperformed the proposed method
                   particularly for the competing talker case. However,
                   compared to a voice using only the spectral parameters
                   from Lombard speech, the proposed method obtains
                   similar or higher performance.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement, Mel cepstral coefficients},
  month = {September},
  year = 2012
}
@article{clarkrichmondking_specom2007,
  author = {Robert A. J. Clark and Korin Richmond and Simon King},
  title = {Multisyn: Open-domain unit selection for the
                   {F}estival speech synthesis system},
  journal = {Speech Communication},
  volume = 49,
  number = 4,
  pages = {317--330},
  abstract = {We present the implementation and evaluation of an
                   open-domain unit selection speech synthesis engine
                   designed to be flexible enough to encourage further
                   unit selection research and allow rapid voice
                   development by users with minimal speech synthesis
                   knowledge and experience. We address the issues of
                   automatically processing speech data into a usable
                   voice using automatic segmentation techniques and how
                   the knowledge obtained at labelling time can be
                   exploited at synthesis time. We describe target cost
                   and join cost implementation for such a system and
                   describe the outcome of building voices with a number
                   of different sized datasets. We show that, in a
                   competitive evaluation, voices built using this
                   technology compare favourably to other systems.},
  categories = {speech synthesis, festival, multisyn, unitselection},
  doi = {10.1016/j.specom.2007.01.014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/clarkrichmondking_specom2007.pdf},
  year = 2007
}
@inproceedings{bell_king_is2007,
  author = {Bell, Peter and King, Simon},
  title = {Sparse Gaussian Graphical Models for Speech
                   Recognition},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  abstract = {We address the problem of learning the structure of
                   Gaussian graphical models for use in automatic speech
                   recognition, a means of controlling the form of the
                   inverse covariance matrices of such systems. With
                   particular focus on data sparsity issues, we implement
                   a method for imposing graphical model structure on a
                   Gaussian mixture system, using a convex optimisation
                   technique to maximise a penalised likelihood
                   expression. The results of initial experiments on a
                   phone recognition task show a performance improvement
                   over an equivalent full-covariance system.},
  categories = {speech recognition, acoustic models, graphical models,
                   precision matrix models},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sparseGM_is2007.pdf},
  year = 2007
}
@inproceedings{wester:ssw7:10,
  author = {Mirjam Wester and John Dines and Matthew Gibson and
                   Hui Liang and Yi-Jian Wu and Lakshmi Saheer and Simon
                   King and Keiichiro Oura and Philip N. Garner and
                   William Byrne and Yong Guan and Teemu Hirsim\"{a}ki and
                   Reima Karhila and Mikko Kurimo and Matt Shannon and
                   Sayaka Shiota and Jilei Tian and Keiichi Tokuda and
                   Junichi Yamagishi},
  title = {Speaker adaptation and the evaluation of speaker
                   similarity in the {EMIME} speech-to-speech translation
                   project},
  booktitle = {Proc. of 7th ISCA Speech Synthesis Workshop},
  address = {Kyoto, Japan},
  abstract = {This paper provides an overview of speaker adaptation
                   research carried out in the EMIME speech-to-speech
                   translation (S2ST) project. We focus on how speaker
                   adaptation transforms can be learned from speech in one
                   language and applied to the acoustic models of another
                   language. The adaptation is transferred across
                   languages and/or from recognition models to synthesis
                   models. The various approaches investigated can all be
                   viewed as a process in which a mapping is defined in
                   terms of either acoustic model states or linguistic
                   units. The mapping is used to transfer either speech
                   data or adaptation transforms between the two models.
                   Because the success of speaker adaptation in
                   text-to-speech synthesis is measured by judging speaker
                   similarity, we also discuss issues concerning
                   evaluation of speaker similarity in an S2ST scenario.},
  categories = {speaker adaptation, evaluation},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_ssw7_2010.pdf},
  year = 2010
}
@inproceedings{taylor:king:isard:wright:kowtko:eurospeech1997,
  author = {Paul A. Taylor and Simon King and Stephen Isard and
                   Helen Wright and Jacqueline Kowtko},
  title = {Using Intonation to Constrain Language Models in
                   Speech Recognition},
  booktitle = {Proc. {E}urospeech'97},
  address = {Rhodes},
  abstract = {This paper describes a method for using intonation to
                   reduce word error rate in a speech recognition system
                   designed to recognise spontaneous dialogue speech. We
                   use a form of dialogue analysis based on the theory of
                   conversational games. Different move types under this
                   analysis conform to different language models.
                   Different move types are also characterised by
                   different intonational tunes. Our overall recognition
                   strategy is first to predict from intonation the type
                   of game move that a test utterance represents, and then
                   to use a bigram language model for that type of move
                   during recognition.},
  categories = {asr, intonation, dialogue, lm,id4s},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/Taylor_King_Isard_Wright_Kowtko_eurospeech1997.pdf},
  year = 1997
}
@inproceedings{shig032,
  author = {Yoshinori Shiga and Simon King},
  title = {Estimation of voice source and vocal tract
                   characteristics based on multi-frame analysis},
  booktitle = {Proc. Eurospeech},
  volume = 3,
  pages = {1749--1752},
  address = {Geneva, Switzerland},
  abstract = {This paper presents a new approach for estimating
                   voice source and vocal tract filter characteristics of
                   voiced speech. When it is required to know the transfer
                   function of a system in signal processing, the input
                   and output of the system are experimentally observed
                   and used to calculate the function. However, in the
                   case of source-filter separation we deal with in this
                   paper, only the output (speech) is observed and the
                   characteristics of the system (vocal tract) and the
                   input (voice source) must simultaneously be estimated.
                   Hence the estimate becomes extremely difficult, and it
                   is usually solved approximately using oversimplified
                   models. We demonstrate that these characteristics are
                   separable under the assumption that they are
                   independently controlled by different factors. The
                   separation is realised using an iterative approximation
                   along with the Multi-frame Analysis method, which we
                   have proposed to find spectral envelopes of voiced
                   speech with minimum interference of the harmonic
                   structure.},
  categories = {artic, lbg, clustering, mocha, source-filter,
                   edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.ps},
  year = 2003
}
@article{king:taylor:csl2000,
  author = {Simon King and Paul Taylor},
  title = {Detection of Phonological Features in Continuous
                   Speech using Neural Networks},
  journal = {Computer {S}peech and {L}anguage},
  volume = 14,
  number = 4,
  pages = {333-353},
  abstract = {We report work on the first component of a two stage
                   speech recognition architecture based on phonological
                   features rather than phones. The paper reports
                   experiments on three phonological feature systems: 1)
                   the Sound Pattern of English (SPE) system which uses
                   binary features, 2) a multi-valued (MV) feature system
                   which uses traditional phonetic categories such as
                   manner, place, etc., and 3) Government Phonology (GP)
                   which uses a set of structured primes. All experiments
                   used recurrent neural networks to perform feature
                   detection. In these networks the input layer is a
                   standard framewise cepstral representation, and the
                   output layer represents the values of the features. The
                   system effectively produces a representation of the
                   most likely phonological features for each input frame.
                   All experiments were carried out on the TIMIT speaker
                   independent database. The networks performed well in
                   all cases, with the average accuracy for a single
                   feature ranging from 86 to 93 percent. We describe
                   these experiments in detail, and discuss the
                   justification and potential advantages of using
                   phonological features rather than phones for the basis
                   of speech recognition.},
  categories = {},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_Taylor_csl2000.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_Taylor_csl2000.ps},
  year = 2000
}
@article{mayo:clark:king:10,
  author = {Mayo, C. and Clark, R. A. J. and King, S.},
  title = {Listeners' Weighting of Acoustic Cues to Synthetic
                   Speech Naturalness: A Multidimensional Scaling Analysis},
  journal = {Speech Communication},
  volume = {53},
  number = {3},
  pages = {311--326},
  abstract = {The quality of current commercial speech synthesis
                   systems is now so high that system improvements are
                   being made at subtle sub- and supra-segmental levels.
                   Human perceptual evaluation of such subtle improvements
                   requires a highly sophisticated level of perceptual
                   attention to specific acoustic characteristics or cues.
                   However, it is not well understood what acoustic cues
                   listeners attend to by default when asked to evaluate
                   synthetic speech. It may, therefore, be potentially
                   quite difficult to design an evaluation method that
                   allows listeners to concentrate on only one dimension
                   of the signal, while ignoring others that are
                   perceptually more important to them. The aim of the
                   current study was to determine which acoustic
                   characteristics of unit-selection synthetic speech are
                   most salient to listeners when evaluating the
                   naturalness of such speech. This study made use of
                   multidimensional scaling techniques to analyse
                   listeners' pairwise comparisons of synthetic speech
                   sentences. Results indicate that listeners place a
                   great deal of perceptual importance on the presence of
                   artifacts and discontinuities in the speech, somewhat
                   less importance on aspects of segmental quality, and
                   very little importance on stress/intonation
                   appropriateness. These relative differences in
                   importance will impact on listeners' ability to attend
                   to these different acoustic characteristics of
                   synthetic speech, and should therefore be taken into
                   account when designing appropriate methods of synthetic
                   speech evaluation.},
  doi = {10.1016/j.specom.2010.10.003},
  keywords = {Speech synthesis; Evaluation; Speech perception;
                   Acoustic cue weighting; Multidimensional scaling},
  year = 2011
}
@inproceedings{bell_king_lineSearch_is2008,
  author = {Bell, Peter and King, Simon},
  title = {Covariance Updates for Discriminative Training by
                   Constrained Line Search},
  booktitle = {Proc. Interspeech},
  address = {Brisbane, Australia},
  abstract = {We investigate the recent Constrained Line Search
                   algorithm for discriminative training of HMMs and
                   propose an alternative formula for variance update. We
                   compare the method to standard techniques on a phone
                   recognition task.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/lineSearch_is2008.pdf},
  year = 2008
}
@article{2012E121001,
  author = {Junichi Yamagishi and Christophe Veaux and Simon King
                   and Steve Renals},
  title = {Speech synthesis technologies for individuals with
                   vocal disabilities: Voice banking and reconstruction},
  journal = {Acoustical Science and Technology},
  volume = {33},
  number = {1},
  pages = {1--5},
  url = {http://www.jstage.jst.go.jp/browse/ast/33/1/_contents},
  year = 2012
}
@inproceedings{king:wrench:icphs1999,
  author = {Simon King and Alan Wrench},
  title = {Dynamical System Modelling of Articulator Movement},
  booktitle = {Proc. {ICPhS} 99},
  pages = {2259-2262},
  address = {San Francisco},
  abstract = {We describe the modelling of articulatory movements
                   using (hidden) dynamical system models trained on
                   Electro-Magnetic Articulograph (EMA) data. These models
                   can be used for automatic speech recognition and to
                   give insights into articulatory behaviour. They belong
                   to a class of continuous-state Markov models, which we
                   believe can offer improved performance over
                   conventional Hidden Markov Models (HMMs) by better
                   accounting for the continuous nature of the underlying
                   speech production process -- that is, the movements of
                   the articulators. To assess the performance of our
                   models, a simple speech recognition task was used, on
                   which the models show promising results.},
  categories = {asr, artic, ema},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/King_Wrench_icphs1999.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/King_Wrench_icphs1999.ps},
  year = 1999
}
@inproceedings{jyamagis:1000sHTS,
  author = {J. Yamagishi and Bela Usabaev and Simon King and
                   Oliver Watts and John Dines and Jilei Tian and Rile Hu
                   and Yong Guan and Keiichiro Oura and Keiichi Tokuda and
                   Reima Karhila and Mikko Kurimo},
  title = {Thousands of voices for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {420--423},
  address = {Brighton, U.K.},
  abstract = {Our recent experiments with HMM-based speech synthesis
                   systems have demonstrated that speaker-adaptive
                   HMM-based speech synthesis (which uses an ``average
                   voice model'' plus model adaptation) is robust to
                   non-ideal speech data that are recorded under various
                   conditions and with varying microphones, that are not
                   perfectly clean, and/or that lack phonetic balance.
                   This enables us to consider building high-quality voices
                   on ``non-TTS'' corpora such as ASR corpora. Since ASR
                   corpora generally include a large number of speakers,
                   this leads to the possibility of producing an enormous
                   number of voices automatically. In this paper we show
                   thousands of voices for HMM-based speech synthesis that
                   we have made from several popular ASR corpora such as
                   the Wall Street Journal databases (WSJ0/WSJ1/WSJCAM0),
                   Resource Management, Globalphone and Speecon. We report
                   some perceptual evaluation results and outline the
                   outstanding issues.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/index.php},
  year = 2009
}
@inproceedings{king_hmm_tutorial:india2010,
  author = {Simon King},
  title = {A tutorial on {HMM} speech synthesis (Invited paper)},
  booktitle = {Sadhana -- Academy Proceedings in Engineering
                   Sciences, Indian Institute of Sciences},
  abstract = {Statistical parametric speech synthesis, based on
                   HMM-like models, has become competitive with
                   established concatenative techniques over the last few
                   years. This paper offers a non-mathematical
                   introduction to this method of speech synthesis. It is
                   intended to be complementary to the wide range of
                   excellent technical publications already available.
                   Rather than offer a comprehensive literature review,
                   this paper instead gives a small number of carefully
                   chosen references which are good starting points for
                   further reading.},
  categories = {speech synthesis, HMM synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/king_hmm_tutorial.pdf},
  year = 2010
}
@article{king07:JASA2007,
  author = {King, S. and Frankel, J. and Livescu, K. and
                   McDermott, E. and Richmond, K. and Wester, M.},
  title = {Speech production knowledge in automatic speech
                   recognition},
  journal = {Journal of the Acoustical Society of America},
  volume = 121,
  number = 2,
  pages = {723--742},
  abstract = {Although much is known about how speech is produced,
                   and research into speech production has resulted in
                   measured articulatory data, feature systems of
                   different kinds and numerous models, speech production
                   knowledge is almost totally ignored in current
                   mainstream approaches to automatic speech recognition.
                   Representations of speech production allow simple
                   explanations for many phenomena observed in speech
                   which cannot be easily analyzed from either acoustic
                   signal or phonetic transcription alone. In this
                   article, we provide a survey of a growing body of work
                   in which such representations are used to improve
                   automatic speech recognition.},
  month = feb,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/King_et_al_review.pdf},
  year = 2007
}
@inproceedings{tejedor_interspeech10,
  author = {Javier Tejedor and Doroteo T. Toledano and Miguel
                   Bautista and Simon King and Dong Wang and Jose Colas},
  title = {Augmented set of features for confidence estimation in
                   spoken term detection},
  booktitle = {Proc. Interspeech},
  abstract = {Discriminative confidence estimation along with
                   confidence normalisation have been shown to construct
                   robust decision maker modules in spoken term detection
                   (STD) systems. Discriminative confidence estimation,
                   making use of termdependent features, has been shown to
                   improve the widely used lattice-based confidence
                   estimation in STD. In this work, we augment the set of
                   these term-dependent features and show a significant
                   improvement in the STD performance both in terms of
                   ATWV and DET curves in experiments conducted on a
                   Spanish geographical corpus. This work also proposes a
                   multiple linear regression analysis to carry out the
                   feature selection. Next, the most informative features
                   derived from it are used within the discriminative
                   confidence on the STD system.},
  categories = {confidence estimation, feature selection, spoken term
                   detection, speech recognition},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/features.pdf},
  year = 2010
}
@inproceedings{letter_based_TTS,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {Letter-based speech synthesis},
  booktitle = {Proc. Speech Synthesis Workshop 2010},
  pages = {317-322},
  address = {Nara, Japan},
  abstract = {Initial attempts at performing text-to-speech
                   conversion based on standard orthographic units are
                   presented, forming part of a larger scheme of training
                   TTS systems on features that can be trivially extracted
                   from text. We evaluate the possibility of using the
                   technique of decision-tree-based context clustering
                   conventionally used in HMM-based systems for
                   parameter-tying to handle letter-to-sound conversion. We
                   present the application of a method of compound-feature
                   discovery to corpus-based speech synthesis. Finally, an
                   evaluation of intelligibility of letter-based systems
                   and more conventional phoneme-based systems is
                   presented.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7.pdf},
  year = 2010
}
@inproceedings{wang_icassp2011a,
  author = {Dong Wang and Nicholas Evans and Raphael Troncy and
                   Simon King},
  title = {Handling overlaps in spoken term detection},
  booktitle = {Proc. International Conference on Acoustics, Speech
                   and Signal Processing},
  pages = {5656--5659},
  abstract = {Spoken term detection (STD) systems usually arrive at
                   many overlapping detections which are often addressed
                   with some pragmatic approaches, e.g. choosing the best
                   detection to represent all the overlaps. In this paper
                   we present a theoretical study based on a concept of
                   acceptance space. In particular, we present two
                   confidence estimation approaches based on Bayesian and
                   evidence perspectives respectively. Analysis shows that
                   both approaches possess respective advantages and
                   shortcomings, and that their combination has the
                   potential to provide an improved confidence estimation.
                   Experiments conducted on meeting data confirm our
                   analysis and show considerable performance improvement
                   with the combined approach, in particular for
                   out-of-vocabulary spoken term detection with stochastic
                   pronunciation modeling.},
  categories = {spoken term detection, speech recognition},
  doi = {10.1109/ICASSP.2011.5947643},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_icassp2011a.pdf},
  year = 2011
}