The Centre for Speech Technology Research, The University of Edinburgh

Publications by Korin Richmond

korin.bib
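These entries can be used directly with BibTeX; a minimal usage sketch follows (the surrounding LaTeX document and the chosen citation key are illustrative; only korin.bib itself is provided here):

  \documentclass{article}
  \begin{document}
  % Cite any key from korin.bib, e.g. the 2003 Computer Speech and Language article.
  Richmond et al.~\cite{richmond2003} compare MLP and MDN inversion mappings.
  \bibliographystyle{plain}
  \bibliography{korin}  % loads korin.bib from the same directory
  \end{document}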

@misc{turk2010,
  author = {Turk, Alice and Scobbie, James and Geng, Christian and
                   Campbell, Barry and Dickie, Catherine and Dubourg,
                   Eddie and Bard, Ellen Gurman and Hardcastle, William
                   and Hartinger, Mariam and King, Simon and Lickley,
                   Robin and Macmartin, Cedric and Nakai, Satsuki and
                   Renals, Steve and Richmond, Korin and Schaeffler, Sonja
                   and White, Kevin and Wiegand, Ronny and Wrench, Alan},
  title = {An {E}dinburgh speech production facility},
  howpublished = {Poster presented at the 12th Conference on Laboratory
                   Phonology, Albuquerque, New Mexico.},
  month = {July},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ESPF.pdf},
  year = 2010
}
@inproceedings{ling:richmond:yamagishi:wang:2008a,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi and Wang, Ren-Hua },
  title = {Articulatory Control of {HMM}-based Parametric Speech
                   Synthesis Driven by Phonetic Knowledge},
  booktitle = {Proc. Interspeech},
  pages = {573--576},
  address = {Brisbane, Australia},
  abstract = {This paper presents a method to control the
                   characteristics of synthetic speech flexibly by
                   integrating articulatory features into a Hidden Markov
                   Model (HMM)-based parametric speech synthesis system.
                   In contrast to model adaptation and interpolation
                   approaches for speaking style control, this method is
                   driven by phonetic knowledge, and target speech samples
                   are not required. The joint distribution of parallel
                   acoustic and articulatory features considering
                   cross-stream feature dependency is estimated. At
                   synthesis time, acoustic and articulatory features are
                   generated simultaneously based on the
                   maximum-likelihood criterion. The synthetic speech can
                   be controlled flexibly by modifying the generated
                   articulatory features according to arbitrary phonetic
                   rules in the parameter generation process. Our
                   experiments show that the proposed method is effective
                   in both changing the overall character of synthesized
                   speech and in controlling the quality of a specific
                   vowel.},
  categories = {speech synthesis, HMM, articulatory features, phonetic
                   knowledge},
  key = {ling:richmond:yamagishi:wang:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080582.PDF},
  year = 2008
}
@article{steiner:EL106,
  author = {Ingmar Steiner and Korin Richmond and Ian Marshall and
                   Calum D. Gray},
  title = {The magnetic resonance imaging subset of the mngu0
                   articulatory corpus},
  journal = {The Journal of the Acoustical Society of America},
  volume = {131},
  number = {2},
  pages = {EL106--EL111},
  abstract = {This paper announces the availability of the magnetic
                   resonance imaging (MRI) subset of the mngu0 corpus, a
                   collection of articulatory speech data from one speaker
                   containing different modalities. This subset comprises
                   volumetric MRI scans of the speaker's vocal tract
                   during sustained production of vowels and consonants,
                   as well as dynamic mid-sagittal scans of repetitive
                   consonant--vowel (CV) syllable production. For
                   reference, high-quality acoustic recordings of the
                   speech material are also available. The raw data are
                   made freely available for research purposes. },
  doi = {10.1121/1.3675459},
  keywords = {audio recording; magnetic resonance imaging; speech
                   processing},
  month = jan,
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/mngu0-mri-2.pdf},
  publisher = {ASA},
  year = 2012
}
@inproceedings{ultraxIS2012,
  author = {Richmond, Korin and Renals, Steve},
  title = {Ultrax: An Animated Midsagittal Vocal Tract Display
                   for Speech Therapy},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {Speech sound disorders (SSD) are the most common
                   communication impairment in childhood, and can hamper
                   social development and learning. Current speech therapy
                   interventions rely predominantly on the auditory skills
                   of the child, as little technology is available to
                   assist in diagnosis and therapy of SSDs. Realtime
                   visualisation of tongue movements has the potential to
                   bring enormous benefit to speech therapy. Ultrasound
                   scanning offers this possibility, although its display
                   may be hard to interpret. Our ultimate goal is to
                   exploit ultrasound to track tongue movement, while
                   displaying a simplified, diagrammatic vocal tract that
                   is easier for the user to interpret. In this paper, we
                   outline a general approach to this problem, combining a
                   latent space model with a dimensionality reducing model
                   of vocal tract shapes. We assess the feasibility of
                   this approach using magnetic resonance imaging (MRI)
                   scans to train a model of vocal tract shapes, which is
                   animated using electromagnetic articulography (EMA)
                   data from the same speaker.},
  categories = {Ultrasound, speech therapy, vocal tract visualisation},
  keywords = {Ultrasound, speech therapy, vocal tract visualisation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/RichmondRenalsIS2012.pdf},
  year = 2012
}
@inproceedings{lingIS2012,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {Vowel Creation by Articulatory Control in {HMM}-based
                   Parametric Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = { This paper presents a method to produce a new vowel
                   by articulatory control in hidden Markov model (HMM)
                   based parametric speech synthesis. A multiple
                   regression HMM (MRHMM) is adopted to model the
                   distribution of acoustic features, with articulatory
                   features used as external auxiliary variables. The
                   dependency between acoustic and articulatory features
                   is modelled by a group of linear transforms that are
                   either estimated context-dependently or determined by
                   the distribution of articulatory features. Vowel
                   identity is removed from the set of context features
                   used to ensure compatibility between the
                   context-dependent model parameters and the articulatory
                   features of a new vowel. At synthesis time, acoustic
                   features are predicted according to the input
                   articulatory features as well as context information.
                   With an appropriate articulatory feature sequence, a
                   new vowel can be generated even when it does not exist
                   in the training set. Experimental results show this
                   method is effective in creating the English vowel /2/
                   by articulatory control without using any acoustic
                   samples of this vowel.},
  categories = {Speech synthesis, articulatory features,
                   multiple-regression hidden Markov model},
  keywords = {Speech synthesis, articulatory features,
                   multiple-regression hidden Markov model},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/LingRichmondYamagishi_IS2012.pdf},
  year = 2012
}
@article{6289354,
  author = {Ling, Z. and Richmond, K. and Yamagishi, J.},
  title = {Articulatory Control of {HMM}-based Parametric Speech
                   Synthesis using Feature-Space-Switched Multiple
                   Regression},
  journal = {IEEE Transactions on Audio, Speech, and Language
                    Processing},
  volume = {21},
  number = {1},
  pages = {207--219},
  abstract = {In previous work we proposed a method to control the
                   characteristics of synthetic speech flexibly by
                   integrating articulatory features into a hidden Markov
                   model (HMM) based parametric speech synthesiser. In
                   this method, a unified acoustic-articulatory model is
                   trained, and context-dependent linear transforms are
                   used to model the dependency between the two feature
                   streams. In this paper, we go significantly further and
                   propose a feature-space-switched multiple regression
                   HMM to improve the performance of articulatory control.
                   A multiple regression HMM (MRHMM) is adopted to model
                   the distribution of acoustic features, with
                   articulatory features used as exogenous explanatory
                   variables. A separate Gaussian mixture model (GMM) is
                   introduced to model the articulatory space, and
                   articulatory-to-acoustic regression matrices are
                   trained for each component of this GMM, instead of for
                   the context-dependent states in the HMM. Furthermore,
                   we propose a task-specific context feature tailoring
                   method to ensure compatibility between state context
                   features and articulatory features that are manipulated
                   at synthesis time. The proposed method is evaluated on
                   two tasks, using a speech database with acoustic
                   waveforms and articulatory movements recorded in
                   parallel by electromagnetic articulography (EMA). In a
                   vowel identity modification task, the new method
                   achieves better performance when reconstructing target
                   vowels by varying articulatory inputs than our previous
                   approach. A second vowel creation task shows our new
                   method is highly effective at producing a new vowel
                   from appropriate articulatory representations which,
                   even though no acoustic samples for this vowel are
                   present in the training data, is shown to sound highly
                   natural.},
  doi = {10.1109/TASL.2012.2215600},
  issn = {1558-7916},
  year = 2013
}
@inproceedings{fitt_richmond_interspeech06,
  author = {Sue Fitt and Korin Richmond},
  title = {Redundancy and productivity in the speech technology
                   lexicon - can we do better?},
  booktitle = {Proc. Interspeech 2006},
  abstract = {Current lexica for speech technology typically contain
                   much redundancy, while omitting useful information. A
                   comparison with lexica in other media and for other
                   purposes is instructive, as it highlights some features
                   we may borrow for text-to-speech and speech recognition
                   lexica. We describe some aspects of the new lexicon we
                   are producing, Combilex, whose structure and
                   implementation is specifically designed to reduce
                   redundancy and improve the representation of productive
                   elements of English. Most importantly, many English
                   words are predictable derivations of baseforms, or
                   compounds. Storing the lexicon as a combination of
                   baseforms and derivational rules speeds up lexicon
                   development, and improves coverage and maintainability.},
  categories = {dictionary, lexicon, pronunciation, English accents,
                   productivity, derivation, redundancy, relational
                   database},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/Fitt_2006.pdf},
  year = 2006
}
@phdthesis{richmond2002,
  author = {Richmond, K.},
  title = {Estimating Articulatory Parameters from the Acoustic
                   Speech Signal},
  school = {The Centre for Speech Technology Research,
                    University of Edinburgh},
  abstract = {A successful method for inferring articulation from
                   the acoustic speech signal would find many
                   applications: low bit-rate speech coding, visual
                   representation of speech, and the possibility of
                   improved automatic speech recognition to name but a
                   few. It is unsurprising, therefore, that researchers
                   have been investigating the acoustic-to-articulatory
                   inversion mapping for several decades now. A great
                   variety of approaches and models have been applied to
                   the problem. Unfortunately, the overwhelming majority
                   of these attempts have faced difficulties in
                   satisfactorily assessing performance in terms of
                   genuine human articulation. However, technologies such
                   as electromagnetic articulography (EMA) mean that
                   measurement of human articulation during speech has
                   become increasingly accessible. Crucially, a large
                   corpus of acoustic-articulatory data during
                   phonetically-diverse, continuous speech has recently
                   been recorded at Queen Margaret College, Edinburgh. One
                   of the primary motivations of this thesis is to exploit
                   the availability of this remarkable resource. Among the
                   data-driven models which have been employed in previous
                   studies, the feedforward multilayer perceptron (MLP) in
                   particular has been used several times with promising
                   results. Researchers have cited advantages in terms of
                   memory requirement and execution speed as a significant
                   factor motivating their use. Furthermore, the MLP is
                   well known as a universal function approximator; an MLP
                   of suitable form can in theory represent any arbitrary
                   mapping function. Therefore, using an MLP in
                   conjunction with the relatively large quantities of
                   acoustic-articulatory data arguably represents a
                   promising and useful first research step for the
                   current thesis, and a significant part of this thesis
                   is occupied with doing this. Having demonstrated an MLP
                   which performs well enough to provide a reasonable
                   baseline, we go on to critically evaluate the
                   suitability of the MLP for the inversion mapping. The
                   aim is to find ways to improve modelling accuracy
                   further. Considering what model of the target
                   articulatory domain is provided in the MLP is key in
                   this respect. It has been shown that the outputs of an
                   MLP trained with the sum-of-squares error function
                   approximate the mean of the target data points
                   conditioned on the input vector. In many situations,
                   this is an appropriate and sufficient solution. In
                   other cases, however, this conditional mean is an
                   inconveniently limiting model of data in the target
                   domain, particularly for ill-posed problems where the
                   mapping may be multi-valued. Substantial evidence
                   exists which shows that multiple articulatory
                   configurations are able to produce the same acoustic
                   signal. This means that a system intended to map from a
                   point in acoustic space can be faced with multiple
                   candidate articulatory configurations. Therefore,
                   despite the impressive ability of the MLP to model
                   mapping functions, it may prove inadequate in certain
                   respects for performing the acoustic-to-articulatory
                   inversion mapping. Mixture density networks (MDN)
                   provide a principled method to model arbitrary
                   probability density functions over the target domain,
                   conditioned on the input vector. In theory, therefore,
                   the MDN offers a superior model of the target domain
                   compared to the MLP. We hypothesise that this advantage
                   will prove beneficial in the case of the
                   acoustic-to-articulatory inversion mapping.
                   Accordingly, this thesis aims to test this hypothesis
                   and directly compare the performance of MDN with MLP on
                   exactly the same acoustic-to-articulatory inversion
                   task.},
  categories = {artic, ann, mlp, mdn, inversion, mocha, edinburgh},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/phd_final_bound.ps},
  year = 2002
}
@inproceedings{ling_interspeech2010,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {{HMM}-based Text-to-Articulatory-Movement Prediction
                   and Analysis of Critical Articulators},
  booktitle = {Proc. Interspeech},
  pages = {2194--2197},
  address = {Makuhari, Japan},
  abstract = {In this paper we present a method to predict the
                   movement of a speaker's mouth from text input using
                   hidden Markov models (HMM). We have used a corpus of
                   human articulatory movements, recorded by
                   electromagnetic articulography (EMA), to train HMMs. To
                   predict articulatory movements from text, a suitable
                   model sequence is selected and the maximum-likelihood
                   parameter generation (MLPG) algorithm is used to
                   generate output articulatory trajectories. In our
                   experiments, we find that fully context-dependent
                   models outperform monophone and quinphone models,
                   achieving an average root mean square (RMS) error of
                   1.945mm when state durations are predicted from text,
                   and 0.872mm when natural state durations are used.
                   Finally, we go on to analyze the prediction error for
                   different EMA dimensions and phone types. We find a
                   clear pattern emerges that the movements of so-called
                   critical articulators can be predicted more accurately
                   than the average performance.},
  keywords = {Hidden Markov model, articulatory features, parameter
                   generation, critical articulators},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100349.pdf},
  year = 2010
}
@inproceedings{uria2011deep,
  author = {Uria, Benigno and Renals, Steve and Richmond, Korin},
  title = {A Deep Neural Network for Acoustic-Articulatory Speech
                   Inversion},
  booktitle = {Proc. NIPS 2011 Workshop on Deep Learning and
                   Unsupervised Feature Learning},
  address = {Sierra Nevada, Spain},
  abstract = {In this work, we implement a deep belief network for
                   the acoustic-articulatory inversion mapping problem. We
                   find that adding up to 3 hidden-layers improves
                   inversion accuracy. We also show that this improvement
                    is due to the higher expressive capability of a deep
                    model and not a consequence of adding more adjustable
                    parameters. Additionally, we show unsupervised
                    pretraining of the system improves its performance in
                   all cases, even for a 1 hidden-layer model. Our
                   implementation obtained an average root mean square
                   error of 0.95 mm on the MNGU0 test dataset, beating all
                   previously published results.},
  month = {December},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/articulatory_inversion.pdf},
  year = 2011
}
@inproceedings{lingvowel,
  author = {Ling, Zhenhua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {Vowel Creation by Articulatory Control in {HMM}-based
                   Parametric Speech Synthesis},
  booktitle = {Proc. The Listening Talker Workshop},
  pages = {72},
  address = {Edinburgh, UK},
  month = {May},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/Ling_etal_LISTA.pdf},
  year = 2012
}
@inproceedings{felps_interspeech2010,
  author = {Felps, Daniel and Geng, Christian and Berger, Michael
                   and Richmond, Korin and Gutierrez-Osuna, Ricardo},
  title = {Relying on critical articulators to estimate vocal
                   tract spectra in an articulatory-acoustic database},
  booktitle = {Proc. Interspeech},
  pages = {1990--1993},
  abstract = {We present a new phone-dependent feature weighting
                   scheme that can be used to map articulatory
                   configurations (e.g. EMA) onto vocal tract spectra
                   (e.g. MFCC) through table lookup. The approach consists
                   of assigning feature weights according to a feature's
                   ability to predict the acoustic distance between
                   frames. Since an articulator's predictive accuracy is
                   phone-dependent (e.g., lip location is a better
                   predictor for bilabial sounds than for palatal sounds),
                   a unique weight vector is found for each phone.
                   Inspection of the weights reveals a correspondence with
                   the expected critical articulators for many phones. The
                   proposed method reduces overall cepstral error by 6\%
                   when compared to a uniform weighting scheme. Vowels
                   show the greatest benefit, though improvements occur
                   for 80\% of the tested phones.},
  keywords = {speech production, speech synthesis},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100076.pdf},
  year = 2010
}
@inproceedings{richmond2011a,
  author = {Richmond, Korin and Hoole, Phil and King, Simon},
  title = {Announcing the Electromagnetic Articulography (Day 1)
                   Subset of the mngu0 Articulatory Corpus},
  booktitle = {Proc. Interspeech},
  pages = {1505--1508},
  address = {Florence, Italy},
  abstract = {This paper serves as an initial announcement of the
                   availability of a corpus of articulatory data called
                   mngu0. This corpus will ultimately consist of a
                   collection of multiple sources of articulatory data
                   acquired from a single speaker: electromagnetic
                   articulography (EMA), audio, video, volumetric MRI
                   scans, and 3D scans of dental impressions. This data
                   will be provided free for research use. In this first
                   stage of the release, we are making available one
                   subset of EMA data, consisting of more than 1,300
                   phonetically diverse utterances recorded with a
                   Carstens AG500 electromagnetic articulograph.
                   Distribution of mngu0 will be managed by a dedicated
                   ``forum-style'' web site. This paper both outlines the
                   general goals motivating the distribution of the data
                   and the creation of the mngu0 web forum, and also
                   provides a description of the EMA data contained in
                   this initial release.},
  categories = {articulography, corpus, EMA},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110767.pdf},
  year = 2011
}
@article{ling2008,
  author = {Ling, Z. and Richmond, K. and Yamagishi, J. and Wang,
                   R.},
  title = {Integrating Articulatory Features into {HMM}-based
                   Parametric Speech Synthesis},
  journal = {IEEE Transactions on Audio, Speech, and Language
                    Processing},
  volume = 17,
  number = 6,
  pages = {1171--1185},
  note = {\textbf{IEEE SPS 2010 Young Author Best Paper Award}},
  abstract = {This paper presents an investigation of ways to
                   integrate articulatory features into Hidden Markov
                   Model (HMM)-based parametric speech synthesis,
                   primarily with the aim of improving the performance of
                   acoustic parameter generation. The joint distribution
                   of acoustic and articulatory features is estimated
                   during training and is then used for parameter
                   generation at synthesis time in conjunction with a
                   maximum-likelihood criterion. Different model
                   structures are explored to allow the articulatory
                   features to influence acoustic modeling: model
                   clustering, state synchrony and cross-stream feature
                   dependency. The results of objective evaluation show
                   that the accuracy of acoustic parameter prediction can
                   be improved when shared clustering and
                   asynchronous-state model structures are adopted for
                   combined acoustic and articulatory features. More
                   significantly, our experiments demonstrate that
                   modeling the dependency between these two feature
                   streams can make speech synthesis more flexible. The
                   characteristics of synthetic speech can be easily
                   controlled by modifying generated articulatory features
                   as part of the process of acoustic parameter
                   generation.},
  categories = {Speech synthesis, articulation, HMM-based synthesis},
  doi = {10.1109/TASL.2009.2014796},
  key = {ling2008},
  month = aug,
  year = 2009
}
@inproceedings{richmond_interspeech2010,
  author = {Richmond, Korin and Clark, Robert and Fitt, Sue},
  title = {On Generating {C}ombilex Pronunciations via
                   Morphological Analysis},
  booktitle = {Proc. Interspeech},
  pages = {1974--1977},
  address = {Makuhari, Japan},
  abstract = {Combilex is a high-quality lexicon that has been
                   developed specifically for speech technology purposes
                   and recently released by CSTR. Combilex benefits from
                   many advanced features. This paper explores one of
                   these: the ability to generate fully-specified
                   transcriptions for morphologically derived words
                   automatically. This functionality was originally
                   implemented to encode the pronunciations of derived
                   words in terms of their constituent morphemes, thus
                   accelerating lexicon development and ensuring a high
                   level of consistency. In this paper, we propose this
                   method of modelling pronunciations can be exploited
                   further by combining it with a morphological parser,
                   thus yielding a method to generate full transcriptions
                   for unknown derived words. Not only could this
                   accelerate adding new derived words to Combilex, but it
                   could also serve as an alternative to conventional
                   letter-to-sound rules. This paper presents preliminary
                   work indicating this is a promising direction.},
  keywords = {combilex lexicon, letter-to-sound rules,
                   grapheme-to-phoneme conversion, morphological
                   decomposition},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100683.pdf},
  year = 2010
}
@inproceedings{Toney2004,
  author = {Toney, D. and Feinberg, D. and Richmond, K.},
  title = {Acoustic Features for Profiling Mobile Users of
                   Conversational Interfaces},
  booktitle = {6th International Symposium on Mobile Human-Computer
                   Interaction - {MobileHCI} 2004},
  editor = {Brewster, S. and Dunlop, M.},
  pages = {394--398},
  address = {Glasgow, Scotland},
  publisher = {Springer},
  abstract = {Conversational interfaces allow human users to use
                   spoken language to interact with computer-based
                   information services. In this paper, we examine the
                   potential for personalizing speech-based human-computer
                   interaction according to the user's gender and age. We
                   describe a system that uses acoustic features of the
                   user's speech to automatically estimate these physical
                   characteristics. We discuss the difficulties of
                   implementing this process in relation to the high level
                   of environmental noise that is typical of mobile
                   human-computer interaction.},
  month = sep,
  year = 2004
}
@inproceedings{clarkrichmondking_interspeech05,
  author = {Robert A.J. Clark and Korin Richmond and Simon King},
  title = {Multisyn voices from {ARCTIC} data for the {B}lizzard
                   challenge},
  booktitle = {Proc. Interspeech 2005},
  abstract = {This paper describes the process of building unit
                   selection voices for the Festival Multisyn engine using
                   four ARCTIC datasets, as part of the Blizzard
                   evaluation challenge. The build process is almost
                   entirely automatic, with very little need for human
                   intervention. We discuss the difference in the
                   evaluation results for each voice and evaluate the
                   suitability of the ARCTIC datasets for building this
                   type of voice.},
  categories = {speech synthesis, festival, evaluation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/clarkrichmondking_interspeech05.pdf},
  year = 2005
}
@inproceedings{qin:perpinan:richmond:wrench:renals:2008a,
  author = {Qin, C. and Carreira-Perpiñán, M. and Richmond, K.
                   and Wrench, A. and Renals, S.},
  title = {Predicting Tongue Shapes from a Few Landmark Locations},
  booktitle = {Proc. Interspeech},
  pages = {2306--2309},
  address = {Brisbane, Australia},
  abstract = {We present a method for predicting the midsagittal
                   tongue contour from the locations of a few landmarks
                   (metal pellets) on the tongue surface, as used in
                   articulatory databases such as MOCHA and the Wisconsin
                   XRDB. Our method learns a mapping using ground-truth
                   tongue contours derived from ultrasound data and
                   drastically improves over spline interpolation. We also
                   determine the optimal locations of the landmarks, and
                   the number of landmarks required to achieve a desired
                   prediction error: 3-4 landmarks are enough to achieve
                   0.3-0.2 mm error per point on the tongue.},
  categories = {ultrasound, tongue contour, articulation},
  key = {qin:perpinan:richmond:wrench:renals:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080929.PDF},
  year = 2008
}
@inproceedings{lei2011a,
  author = {Lei, Ming and Yamagishi, Junichi and Richmond, Korin
                   and Ling, Zhen-Hua and King, Simon and Dai, Li-Rong},
  title = {Formant-controlled {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {2777--2780},
  address = {Florence, Italy},
  abstract = {This paper proposes a novel framework that enables us
                   to manipulate and control formants in HMM-based speech
                   synthesis. In this framework, the dependency between
                   formants and spectral features is modelled by piecewise
                   linear transforms; formant parameters are effectively
                   mapped by these to the means of Gaussian distributions
                   over the spectral synthesis parameters. The spectral
                   envelope features generated under the influence of
                   formants in this way may then be passed to high-quality
                   vocoders to generate the speech waveform. This provides
                   two major advantages over conventional frameworks.
                   First, we can achieve spectral modification by changing
                   formants only in those parts where we want control,
                   whereas the user must specify all formants manually in
                   conventional formant synthesisers (e.g. Klatt). Second,
                   this can produce high-quality speech. Our results show
                   the proposed method can control vowels in the
                    synthesized speech by manipulating F1 and F2 without
                   any degradation in synthesis quality.},
  categories = {speech synthesis, hidden Markov model, formants,
                   controllability},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110769.pdf},
  year = 2011
}
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
  author = {Cabral, J. and Renals, S. and Richmond, K. and
                   Yamagishi, J.},
  title = {Glottal Spectral Separation for Parametric Speech
                   Synthesis},
  booktitle = {Proc. Interspeech},
  pages = {1829--1832},
  address = {Brisbane, Australia},
  categories = {HMM speech synthesis, Glottal Spectral Separation,
                   LF-model},
  key = {cabral:renals:richmond:yamagishi:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
  year = 2008
}
@inproceedings{wrench2000b,
  author = {Wrench, A. and Richmond, K.},
  title = {Continuous Speech Recognition Using Articulatory Data},
  booktitle = {Proc. {ICSLP} 2000},
  address = {Beijing, China},
  abstract = {In this paper we show that there is measurable
                   information in the articulatory system which can help
                   to disambiguate the acoustic signal. We measure
                   directly the movement of the lips, tongue, jaw, velum
                   and larynx and parameterise this articulatory feature
                   space using principle components analysis. The
                   parameterisation is developed and evaluated using a
                   speaker dependent phone recognition task on a specially
                   recorded TIMIT corpus of 460 sentences. The results
                   show that there is useful supplementary information
                   contained in the articulatory data which yields a small
                   but significant improvement in phone recognition
                   accuracy of 2\%. However, preliminary attempts to
                   estimate the articulatory data from the acoustic signal
                   and use this to supplement the acoustic input have not
                   yielded any significant improvement in phone accuracy.},
  categories = {artic, asr, ann, mlp, hmm, inversion, mocha,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Wrench_2000_a.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Wrench_2000_a.ps},
  year = 2000
}
@inproceedings{clarkrichmondking_ssw504,
  author = {Robert A.J. Clark and Korin Richmond and Simon King},
  title = {Festival 2 -- build your own general purpose unit
                   selection speech synthesiser},
  booktitle = {Proc. 5th {ISCA} Workshop on Speech Synthesis},
  abstract = {This paper describes version 2 of the Festival speech
                   synthesis system. Festival 2 provides a development
                   environment for concatenative speech synthesis, and now
                   includes a general purpose unit selection speech
                   synthesis engine. We discuss various aspects of unit
                   selection speech synthesis, focusing on the research
                   issues that relate to voice design and the automation
                   of the voice development process.},
  categories = {synthesis, festival, unitselection},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.ps},
  year = 2004
}
@inproceedings{cabral_ssw7,
  author = {Cabral, Jo{\~a}o and Renals, Steve and Richmond, Korin
                   and Yamagishi, Junichi},
  title = {Transforming Voice Source Parameters in a {HMM}-based
                   Speech Synthesiser with Glottal Post-Filtering},
  booktitle = {Proc. 7th ISCA Speech Synthesis Workshop (SSW7)},
  pages = {365--370},
  address = {NICT/ATR, Kyoto, Japan},
  abstract = {Control over voice quality, e.g. breathy and tense
                    voice, is important for speech synthesis applications.
                    For example, transformations can be used to modify
                    aspects of the voice related to speaker's identity and
                    to improve expressiveness. However, it is hard to
                    modify voice characteristics of the synthetic speech,
                    without degrading speech quality. State-of-the-art
                    statistical speech synthesisers, in particular, do not
                    typically allow control over parameters of the glottal
                    source, which are strongly correlated with voice
                    quality. Consequently, the control of voice
                    characteristics in these systems is limited. In
                    contrast, the HMM-based speech synthesiser proposed in
                    this paper uses an acoustic glottal source model. The
                    system passes the glottal signal through a whitening
                    filter to obtain the excitation of voiced sounds. This
                    technique, called glottal post-filtering, allows to
                    transform voice characteristics of the synthetic speech
                    by modifying the source model parameters. We evaluated
                    the proposed synthesiser in a perceptual experiment,
                    in terms of speech naturalness, intelligibility, and
                    similarity to the original speaker's voice. The results
                    show that it performed as well as a HMM-based
                    synthesiser, which generates the speech signal with a
                    commonly used high-quality speech vocoder.},
  keywords = {HMM-based speech synthesis, voice quality, glottal
                   post-filter},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/cabral_ssw7.pdf},
  year = 2010
}
@article{Ling2010834,
  author = {Zhen-Hua Ling and Korin Richmond and Junichi Yamagishi},
  title = {An Analysis of {HMM}-based prediction of articulatory
                   movements},
  journal = {Speech Communication},
  volume = {52},
  number = {10},
  pages = {834--846},
  abstract = { This paper presents an investigation into predicting
                   the movement of a speaker's mouth from text input using
                   hidden Markov models (HMM). A corpus of human
                   articulatory movements, recorded by electromagnetic
                   articulography (EMA), is used to train HMMs. To predict
                   articulatory movements for input text, a suitable model
                   sequence is selected and a maximum-likelihood parameter
                   generation (MLPG) algorithm is used to generate output
                   articulatory trajectories. Unified
                   acoustic-articulatory HMMs are introduced to integrate
                   acoustic features when an acoustic signal is also
                   provided with the input text. Several aspects of this
                   method are analyzed in this paper, including the
                   effectiveness of context-dependent modeling, the role
                   of supplementary acoustic input, and the
                   appropriateness of certain model structures for the
                   unified acoustic-articulatory models. When text is the
                   sole input, we find that fully context-dependent models
                   significantly outperform monophone and quinphone
                   models, achieving an average root mean square (RMS)
                   error of 1.945 mm and an average correlation
                   coefficient of 0.600. When both text and acoustic
                   features are given as input to the system, the
                   difference between the performance of quinphone models
                   and fully context-dependent models is no longer
                   significant. The best performance overall is achieved
                   using unified acoustic-articulatory quinphone HMMs with
                   separate clustering of acoustic and articulatory model
                   parameters, a synchronous-state sequence, and a
                   dependent-feature model structure, with an RMS error of
                   0.900 mm and a correlation coefficient of 0.855 on
                   average. Finally, we also apply the same quinphone HMMs
                   to the acoustic-articulatory, or inversion, mapping
                   problem, where only acoustic input is available. An
                   average root mean square (RMS) error of 1.076 mm and an
                   average correlation coefficient of 0.812 are achieved.
                   Taken together, our results demonstrate how text and
                   acoustic inputs both contribute to the prediction of
                   articulatory movements in the method used.},
  doi = {10.1016/j.specom.2010.06.006},
  issn = {0167-6393},
  keywords = {Hidden Markov model; Articulatory features; Parameter
                   generation},
  month = {October},
  year = 2010
}
@mastersthesis{richmond1997b,
  author = {Richmond, K.},
  title = {A Proposal for the Compartmental Modelling of Stellate
                   Cells in the Anteroventral Cochlear Nucleus, Using
                   Realistic Auditory Nerve Inputs},
  school = {Centre for Cognitive Science, University of Edinburgh},
  month = sep,
  year = 1997
}
@article{richmond2003,
  author = {Richmond, K. and King, S. and Taylor, P.},
  title = {Modelling the Uncertainty in Recovering Articulation
                   from Acoustics},
  journal = {Computer Speech and Language},
  volume = 17,
  pages = {153--172},
  abstract = {This paper presents an experimental comparison of the
                   performance of the multilayer perceptron (MLP) with
                   that of the mixture density network (MDN) for an
                   acoustic-to-articulatory mapping task. A corpus of
                   acoustic-articulatory data recorded by electromagnetic
                   articulography (EMA) for a single speaker was used as
                   training and test data for this purpose. In theory, the
                   MDN is able to provide a richer, more flexible
                   description of the target variables in response to a
                   given input vector than the least-squares trained MLP.
                   Our results show that the mean likelihoods of the
                   target articulatory parameters for an unseen test set
                   were indeed consistently higher with the MDN than with
                   the MLP. The increase ranged from approximately 3\% to
                   22\%, depending on the articulatory channel in
                   question. On the basis of these results, we argue that
                   using a more flexible description of the target domain,
                   such as that offered by the MDN, can prove beneficial
                   when modelling the acoustic-to-articulatory mapping.},
  categories = {artic, ann, mlp, mdn, inversion, mocha, edinburgh},
  key = {richmond2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/richmond2003.pdf},
  year = 2003
}
@inproceedings{steiner:richmond:2008a,
  author = {Steiner, I. and Richmond, K.},
  title = {Generating gestural timing from {EMA} data using
                   articulatory resynthesis},
  booktitle = {Proc. 8th International Seminar on Speech Production},
  address = {Strasbourg, France},
  abstract = {As part of ongoing work to integrate an articulatory
                   synthesizer into a modular TTS platform, a method is
                   presented which allows gestural timings to be generated
                   automatically from EMA data. Further work is outlined
                   which will adapt the vocal tract model and phoneset to
                   English using new articulatory data, and use
                   statistical trajectory models. },
  categories = {articulatory synthesis, EMA, VocalTractLab },
  key = {steiner:richmond:2008a},
  month = dec,
  year = 2008
}
@inproceedings{clark_blizzard2006,
  author = {Clark, R. and Richmond, K. and Strom, V. and King, S.},
  title = {Multisyn Voices for the {B}lizzard {C}hallenge 2006},
  booktitle = {Proc. Blizzard Challenge Workshop (Interspeech
                   Satellite)},
  address = {Pittsburgh, USA},
  note = {(http://festvox.org/blizzard/blizzard2006.html)},
  abstract = {This paper describes the process of building unit
                   selection voices for the Festival Multisyn engine using
                   the ATR dataset provided for the Blizzard Challenge
                   2006. We begin by discussing recent improvements that
                   we have made to the Multisyn voice building process,
                   prompted by our participation in the Blizzard Challenge
                   2006. We then go on to discuss our interpretation of
                   the results observed. Finally, we conclude with some
                   comments and suggestions for the formulation of future
                   Blizzard Challenges.},
  categories = {tts, blizzard, multisyn, unit selection},
  key = {clark_blizzard2006},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/cstr_blizzard2006.pdf},
  year = 2006
}
@inproceedings{cabral_yrwst,
  author = {J. Cabral and S. Renals and K. Richmond and J.
                   Yamagishi},
  title = {{HMM}-based Speech Synthesis with an Acoustic Glottal
                   Source Model},
  booktitle = {Proc. The First Young Researchers Workshop in Speech
                   Technology},
  abstract = {A major cause of degradation of speech quality in
                   HMM-based speech synthesis is the use of a simple delta
                   pulse signal to generate the excitation of voiced
                   speech. This paper describes a new approach to using an
                   acoustic glottal source model in HMM-based
                   synthesisers. The goal is to improve speech quality and
                   parametric flexibility to better model and transform
                   voice characteristics.},
  categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral
                   Separation},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf},
  year = 2009
}
@misc{Hofer_Berger:sigg2010,
  author = {Gregor Hofer and Korin Richmond and Michael Berger},
  title = {Lip Synchronization by Acoustic Inversion},
  howpublished = {Poster at SIGGRAPH 2010},
  address = {Los Angeles, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/lipsync-sig10.pdf},
  year = 2010
}
@inproceedings{richmond99,
  author = {Richmond, K.},
  title = {Estimating Velum Height from Acoustics During
                   Continuous Speech},
  booktitle = {Proc. Eurospeech},
  volume = 1,
  pages = {149--152},
  address = {Budapest, Hungary},
  abstract = {This paper reports on present work, in which a
                   recurrent neural network is trained to estimate `velum
                   height' during continuous speech. Parallel
                   acoustic-articulatory data comprising more than 400
                   read {TIMIT} sentences is obtained using
                   electromagnetic articulography (EMA). This data is
                   processed and used as training data for a range of
                   neural network sizes. The network demonstrating the
                   highest accuracy is identified. This performance is
                   then evaluated in detail by analysing the network's
                   output for each phonetic segment contained in 50
                   hand-labelled utterances set aside for testing
                   purposes.},
  categories = {artic, ann, mlp, inversion, mocha, edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/Richmond_1999_a.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/Richmond_1999_a.ps},
  year = 1999
}
@inproceedings{richmond2009b,
  author = {Richmond, K.},
  title = {Preliminary Inversion Mapping Results with a New {EMA}
                   Corpus},
  booktitle = {Proc. Interspeech},
  pages = {2835--2838},
  address = {Brighton, UK},
  abstract = {In this paper, we apply our inversion mapping method,
                   the trajectory mixture density network (TMDN), to a new
                   corpus of articulatory data, recorded with a Carstens
                   AG500 electromagnetic articulograph. This new data set,
                   mngu0, is relatively large and phonetically rich, among
                   other beneficial characteristics. We obtain good
                   results, with a root mean square (RMS) error of only
                   0.99mm. This compares very well with our previous
                   lowest result of 1.54mm RMS error for equivalent coils
                   of the MOCHA fsew0 EMA data. We interpret this as
                   showing the mngu0 data set is potentially more
                   consistent than the fsew0 data set, and is very useful
                   for research which calls for articulatory trajectory
                   data. It also supports our view that the TMDN is very
                   much suited to the inversion mapping problem.},
  keywords = {acoustic-articulatory inversion mapping, neural
                   network},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090544.pdf},
  year = 2009
}
@inproceedings{cabral2011a,
  author = {Cabral, J.P. and Renals, S. and Yamagishi, J. and
                   Richmond, K.},
  title = {{HMM}-based speech synthesiser using the {LF}-model of
                   the glottal source},
  booktitle = {Proc. 2011 IEEE International Conference on Acoustics,
                    Speech and Signal Processing (ICASSP)},
  pages = {4704--4707},
  abstract = {A major factor which causes a deterioration in speech
                   quality in {HMM}-based speech synthesis is the use of a
                   simple delta pulse signal to generate the excitation of
                   voiced speech. This paper sets out a new approach to
                   using an acoustic glottal source model in HMM-based
                   synthesisers instead of the traditional pulse signal.
                   The goal is to improve speech quality and to better
                   model and transform voice characteristics. We have
                   found the new method decreases buzziness and also
                   improves prosodic modelling. A perceptual evaluation
                    has supported this finding by showing a 55.6\%
                   preference for the new system, as against the baseline.
                   This improvement, while not being as significant as we
                   had initially expected, does encourage us to work on
                   developing the proposed speech synthesiser further.},
  categories = {HMM-based speech synthesiser;acoustic glottal source
                   model LF-model;delta pulse signal;perceptual
                   evaluation;prosodic modelling;speech quality;voiced
                   speech generation;hidden Markov models;speech
                   synthesis;},
  doi = {10.1109/ICASSP.2011.5947405},
  issn = {1520-6149},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
  year = 2011
}
@inproceedings{richmond2009a,
  author = {Richmond, K. and Clark, R. and Fitt, S.},
  title = {Robust {LTS} rules with the {Combilex} speech
                   technology lexicon},
  booktitle = {Proc. Interspeech},
  pages = {1295--1298},
  address = {Brighton, UK},
  abstract = {Combilex is a high quality pronunciation lexicon aimed
                   at speech technology applications that has recently
                   been released by CSTR. Combilex benefits from several
                   advanced features. This paper evaluates one of these:
                   the explicit alignment of phones to graphemes in a
                   word. This alignment can help to rapidly develop robust
                   and accurate letter-to-sound (LTS) rules, without
                   needing to rely on automatic alignment methods. To
                   evaluate this, we used Festival's LTS module, comparing
                   its standard automatic alignment with Combilex's
                   explicit alignment. Our results show using Combilex's
                   alignment improves LTS accuracy: 86.50\% words correct
                   as opposed to 84.49\%, with our most general form of
                   lexicon. In addition, building LTS models is greatly
                   accelerated, as the need to list allowed alignments is
                   removed. Finally, loose comparison with other studies
                   indicates Combilex is a superior quality lexicon in
                   terms of consistency and size.},
  keywords = {combilex, letter-to-sound rules, grapheme-to-phoneme
                   conversion},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090308.pdf},
  year = 2009
}
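
The alignment-based LTS idea above lends itself to a small illustration. The sketch below is not Festival's LTS module and does not use Combilex data; it assumes a hypothetical lexicon that already provides one phone (or an empty marker) per letter, and simply collects context-dependent letter-to-phone rules by frequency, with a single-letter back-off.

    # Minimal letter-to-sound sketch assuming pre-aligned grapheme/phone
    # pairs, i.e. one (possibly empty) phone per letter, as an explicitly
    # aligned lexicon provides. Purely illustrative; not Festival's LTS.
    from collections import Counter, defaultdict

    def train_lts(aligned_lexicon, width=1):
        """aligned_lexicon: list of (letters, phones) of equal length."""
        rules = defaultdict(Counter)
        for letters, phones in aligned_lexicon:
            padded = ["#"] * width + list(letters) + ["#"] * width
            for i, phone in enumerate(phones):
                context = tuple(padded[i:i + 2 * width + 1])
                rules[context][phone] += 1
                rules[padded[i + width]][phone] += 1  # back-off: letter alone
        return {k: c.most_common(1)[0][0] for k, c in rules.items()}

    def apply_lts(rules, word, width=1):
        padded = ["#"] * width + list(word) + ["#"] * width
        out = []
        for i in range(len(word)):
            context = tuple(padded[i:i + 2 * width + 1])
            phone = rules.get(context, rules.get(padded[i + width], "_"))
            if phone != "_":          # "_" marks a letter with no phone
                out.append(phone)
        return out

    # Toy usage with a hypothetical two-word aligned lexicon:
    lex = [("cat", ["k", "a", "t"]), ("city", ["s", "i", "t", "ii"])]
    rules = train_lts(lex)
    print(apply_lts(rules, "cat"))
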
@inproceedings{frankel00:NN_LDM,
  author = {Frankel, J. and Richmond, K. and King, S. and Taylor,
                   P.},
  title = {An automatic speech recognition system using neural
                   networks and linear dynamic models to recover and model
                   articulatory traces},
  booktitle = {Proc. {ICSLP}},
  abstract = {In this paper we describe a speech recognition system
                   using linear dynamic models and articulatory features.
                   Experiments are reported in which measured articulation
                   from the MOCHA corpus has been used, along with those
                   where the articulatory parameters are estimated from
                   the speech signal using a recurrent neural network.},
  categories = {am,artic,asr,ldm,mocha,edinburgh,inversion,ann},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Frankel_et_al_ICSLP2000.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Frankel_et_al_ICSLP2000.ps},
  year = 2000
}
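
For readers unfamiliar with linear dynamic models, the generic state-space form behind such systems is the following (a standard formulation, not necessarily the exact parameterisation used in the paper):

    \begin{align}
    \mathbf{x}_{t+1} &= F\,\mathbf{x}_t + \mathbf{w}_t, & \mathbf{w}_t &\sim \mathcal{N}(\mathbf{0}, Q),\\
    \mathbf{y}_t     &= H\,\mathbf{x}_t + \mathbf{v}_t, & \mathbf{v}_t &\sim \mathcal{N}(\mathbf{0}, R),
    \end{align}

where y_t is the observed (measured or network-recovered) articulatory frame and x_t a hidden continuous state; typically one model is trained per segment type and recognition compares segment likelihoods computed with Kalman filtering.
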
@inproceedings{richmond2007_nolisp,
  author = {Richmond, K.},
  title = {Trajectory Mixture Density Networks With Multiple
                   Mixtures for Acoustic-Articulatory Inversion},
  booktitle = {Advances in Nonlinear Speech Processing, International
                   Conference on Non-Linear Speech Processing, NOLISP 2007},
  editor = {Chetouani, M. and Hussain, A. and Gas, B. and Milgram,
                   M. and Zarader, J.-L.},
  volume = 4885,
  series = {Lecture Notes in Computer Science},
  pages = {263--272},
  publisher = {Springer-Verlag Berlin Heidelberg},
  abstract = {We have previously proposed a trajectory model which
                   is based on a mixture density network (MDN) trained
                   with target variables augmented with dynamic features
                   together with an algorithm for estimating maximum
                   likelihood trajectories which respects the constraints
                   between those features. In this paper, we have extended
                   that model to allow diagonal covariance matrices and
                   multiple mixture components in the trajectory MDN
                   output probability density functions. We have evaluated
                   this extended model on an inversion mapping task and
                   found the trajectory model works well, outperforming
                   smoothing of equivalent trajectories using low-pass
                   filtering. Increasing the number of mixture components
                   in the TMDN improves results further.},
  categories = {ANN, TMDN, acoustic-articulatory inversion, MOCHA},
  doi = {10.1007/978-3-540-77347-4_23},
  key = {richmond2007_nolisp},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/richmond_nolisp2007.pdf},
  year = 2007
}
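
As a rough illustration of the mixture density component of such a model (the mixture output layer only, not the trajectory extension, and not the paper's implementation), the sketch below converts raw network outputs into the weights, means and diagonal variances of a Gaussian mixture over an articulatory frame and scores a target under it:

    # Illustrative mixture density output layer: converts raw network
    # outputs into mixture weights, means and diagonal variances, then
    # scores a target vector. A numpy sketch only, not the TMDN code.
    import numpy as np

    def mdn_params(z, n_mix, dim):
        """z: raw outputs of length n_mix * (1 + 2 * dim)."""
        logits = z[:n_mix]
        means = z[n_mix:n_mix + n_mix * dim].reshape(n_mix, dim)
        log_var = z[n_mix + n_mix * dim:].reshape(n_mix, dim)
        weights = np.exp(logits - logits.max())
        weights /= weights.sum()             # softmax mixture weights
        var = np.exp(log_var)                # positive variances
        return weights, means, var

    def mdn_neg_log_lik(z, target, n_mix, dim):
        w, mu, var = mdn_params(z, n_mix, dim)
        # log density of each diagonal Gaussian component
        log_comp = -0.5 * (np.log(2 * np.pi * var)
                           + (target - mu) ** 2 / var).sum(axis=1)
        log_mix = np.logaddexp.reduce(np.log(w) + log_comp)
        return -log_mix

    # Toy usage: 2 mixture components over a 3-dimensional frame.
    rng = np.random.default_rng(0)
    z = rng.normal(size=2 * (1 + 2 * 3))
    print(mdn_neg_log_lik(z, np.zeros(3), n_mix=2, dim=3))
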
@inproceedings{hofer_interspeech2010,
  author = {Hofer, Gregor and Richmond, Korin},
  title = {Comparison of {HMM} and {TMDN} Methods for Lip
                   Synchronisation},
  booktitle = {Proc. Interspeech},
  pages = {454--457},
  address = {Makuhari, Japan},
  abstract = {This paper presents a comparison between a hidden
                   Markov model (HMM) based method and a novel artificial
                   neural network (ANN) based method for lip
                   synchronisation. Both model types were trained on
                   motion tracking data, and a perceptual evaluation was
                   carried out comparing the output of the models, both to
                   each other and to the original tracked data. It was
                   found that the ANN-based method was judged
                   significantly better than the HMM based method.
                   Furthermore, the original data was not judged
                   significantly better than the output of the ANN method.},
  keywords = {hidden Markov model (HMM), mixture density network,
                   lip synchronisation, inversion mapping},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100668.pdf},
  year = 2010
}
@article{turk:2429,
  author = {Alice Turk and James Scobbie and Christian Geng and
                   Cedric Macmartin and Ellen Bard and Barry Campbell and
                   Catherine Dickie and Eddie Dubourg and Bill Hardcastle
                   and Phil Hoole and Evia Kanaida and Robin Lickley and
                   Satsuki Nakai and Marianne Pouplier and Simon King and
                   Steve Renals and Korin Richmond and Sonja Schaeffler
                   and Ronnie Wiegand and Kevin White and Alan Wrench},
  title = {The {Edinburgh Speech Production Facility's}
                   articulatory corpus of spontaneous dialogue.},
  journal = {The Journal of the Acoustical Society of America},
  volume = {128},
  number = {4},
  pages = {2429--2429},
  abstract = {The EPSRC-funded Edinburgh Speech Production Facility is
                   built around two synchronized Carstens AG500
                   electromagnetic articulographs (EMAs) in order to
                   capture articulatory/acoustic data from spontaneous
                   dialogue. An initial articulatory corpus was designed
                   with two aims. The first was to elicit a range of
                   speech styles/registers from speakers, and therefore
                   provide an alternative to fully scripted corpora. The
                   second was to extend the corpus beyond monologue, by
                   using tasks that promote natural discourse and
                   interaction. A subsidiary driver was to use dialects
                   from outwith North America: dialogues paired up a
                   Scottish English and a Southern British English
                   speaker. Tasks. Monologue: Story reading of ``Comma
                   Gets a Cure'' [Honorof et al. (2000)], lexical sets
                   [Wells (1982)], spontaneous story telling,
                   diadochokinetic tasks. Dialogue: Map tasks [Anderson et
                   al. (1991)], ``Spot the Difference'' picture tasks
                   [Bradlow et al. (2007)], story-recall. Shadowing of
                   the spontaneous story telling by the second
                   participant. Each dialogue session includes
                   approximately 30 min of speech, and there are
                   acoustics-only baseline materials. We will introduce
                   the corpus and highlight the role of articulatory
                   production data in helping provide a fuller
                   understanding of various spontaneous speech phenomena
                   by presenting examples of naturally occurring covert
                   speech errors, accent accommodation, turn taking
                   negotiation, and shadowing.},
  doi = {10.1121/1.3508679},
  publisher = {ASA},
  year = 2010
}
@inproceedings{cabral07,
  author = {J. Cabral and S. Renals and K. Richmond and J.
                   Yamagishi},
  title = {Towards an Improved Modeling of the Glottal Source in
                   Statistical Parametric Speech Synthesis},
  booktitle = {Proc. of the 6th ISCA Workshop on Speech Synthesis},
  address = {Bonn, Germany},
  abstract = {This paper proposes the use of the Liljencrants-Fant
                   model (LF-model) to represent the glottal source signal
                   in HMM-based speech synthesis systems. These systems
                   generally use a pulse train to model the periodicity of
                   the excitation signal of voiced speech. However, this
                   model produces a strong and uniform harmonic structure
                   throughout the spectrum of the excitation which makes
                   the synthetic speech sound buzzy. The use of a mixed
                   band excitation and phase manipulation reduces this
                   effect but it can result in degradation of the speech
                   quality if the noise component is not weighted
                   carefully. In turn, the LF-waveform has a decaying
                   spectrum at higher frequencies, which is more similar
                   to the real glottal source excitation signal. We
                   conducted a perceptual experiment to test the
                   hypothesis that the LF-model can perform as well as or
                   better than the pulse train in an HMM-based speech
                   synthesizer. In the synthesis, we used the mean values
                   of the LF-parameters, calculated by measurements of the
                   recorded speech. The result of this study is important
                   not only regarding the improvement in speech quality of
                   this type of system, but also because the LF-model
                   can be used to model many characteristics of the
                   glottal source, such as voice quality, which are
                   important for voice transformation and generation of
                   expressive speech.},
  categories = {LF-model, Statistical parametric speech synthesis,
                   HMM-based speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
  year = 2007
}
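
For reference, one common statement of the LF-model of the glottal flow derivative (following Fant, Liljencrants and Lin, 1985) is, within each period:

    \begin{align}
    e(t) &= E_0\, e^{\alpha t} \sin(\omega_g t), & 0 \le t \le t_e,\\
    e(t) &= -\frac{E_e}{\varepsilon t_a}
            \left(e^{-\varepsilon (t - t_e)} - e^{-\varepsilon (t_c - t_e)}\right),
            & t_e < t \le t_c,
    \end{align}

where $\omega_g = \pi / t_p$, $t_p$ is the instant of peak glottal flow, $t_e$ the instant of main excitation, $t_a$ the effective duration of the return phase, $t_c$ the end of the period, and $\varepsilon$ is fixed implicitly by $\varepsilon t_a = 1 - e^{-\varepsilon (t_c - t_e)}$.
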
@inproceedings{ling2011a,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {Feature-space transform tying in unified
                   acoustic-articulatory modelling of articulatory control
                   of {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {117--120},
  address = {Florence, Italy},
  abstract = {In previous work, we have proposed a method to control
                   the characteristics of synthetic speech flexibly by
                   integrating articulatory features into hidden Markov
                   model (HMM) based parametric speech synthesis. A
                   unified acoustic-articulatory model was trained and a
                   piecewise linear transform was adopted to describe the
                   dependency between these two feature streams. The
                   transform matrices were trained for each HMM state and
                   were tied based on each state's context. In this paper,
                   an improved acoustic-articulatory modelling method is
                   proposed. A Gaussian mixture model (GMM) is introduced
                   to model the articulatory space and the cross-stream
                   transform matrices are trained for each Gaussian
                   mixture instead of context-dependently. This means the
                   dependency relationship can vary flexibly as the
                   articulatory features change. Our results show this
                   method improves the effectiveness of control over vowel
                   quality by modifying articulatory trajectories without
                   degrading naturalness.},
  categories = {speech synthesis, articulatory features, hidden Markov
                   model, Gaussian mixture model},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110482.pdf},
  year = 2011
}
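
In rough outline (a hedged paraphrase of the modelling idea rather than the exact model structure), the cross-stream dependency described above amounts to an acoustic distribution whose mean is a linear function of the articulatory features, with the transform chosen by the articulatory-space GMM rather than by phonetic context:

    \[
    p(\mathbf{x}_t \mid \mathbf{y}_t) \approx \sum_{m} P(m \mid \mathbf{y}_t)\,
        \mathcal{N}\!\left(\mathbf{x}_t;\; A_m \mathbf{y}_t + \mathbf{b}_m,\; \Sigma_m\right),
    \]

where x_t and y_t are the acoustic and articulatory frames and m indexes the Gaussian components modelling the articulatory space.
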
@inproceedings{king00:recognition_syll,
  author = {King, S. and Taylor, P. and Frankel, J. and Richmond,
                   K.},
  title = {Speech recognition via phonetically-featured syllables},
  booktitle = {PHONUS},
  volume = {5},
  pages = {15--34},
  address = {Institute of Phonetics, University of the Saarland},
  abstract = {We describe recent work on two new automatic speech
                   recognition systems. The first part of this paper
                   describes the components of a system based on
                   phonological features (which we call EspressoA) in
                   which the values of these features are estimated from
                   the speech signal before being used as the basis for
                   recognition. In the second part of the paper, another
                   system (which we call EspressoB) is described in which
                   articulatory parameters are used instead of
                   phonological features and a linear dynamical system
                   model is used to perform recognition from automatically
                   estimated values of these articulatory parameters.},
  categories = {am,artic,asr,ldm,phonetic_feature,mocha,timit,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_et_al_Phonus2000.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_et_al_Phonus2000.ps},
  year = 2000
}
@inproceedings{richmond2007a,
  author = {Richmond, K.},
  title = {A Multitask Learning Perspective on
                   Acoustic-Articulatory Inversion},
  booktitle = {Proc. Interspeech},
  address = {Antwerp, Belgium},
  abstract = {This paper proposes the idea that by viewing an
                   inversion mapping MLP from a Multitask Learning
                   perspective, we may be able to relax two constraints
                   which are inherent in using electromagnetic
                   articulography as a source of articulatory information
                   for speech technology purposes. As a first step to
                   evaluating this idea, we perform an inversion mapping
                   experiment in an attempt to ascertain whether the
                   hidden layer of a ``multitask'' MLP can act
                   beneficially as a hidden representation that is shared
                   between inversion mapping subtasks for multiple
                   articulatory targets. Our results in the case of the
                   tongue dorsum x-coordinate indicate this is indeed the
                   case and show good promise. Results for the tongue
                   dorsum y-coordinate however are not so clear-cut, and
                   will require further investigation.},
  categories = {acoustic-articulatory inversion, MLP, multitask
                   learning},
  key = {richmond2007a},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/is2007_paper.pdf},
  year = 2007
}
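
The multitask set-up referred to above (a single hidden representation feeding separate output units for different articulatory coordinates) can be sketched generically as follows; the sizes, data and training step are invented, and this is not the network used in the paper.

    # Generic multitask MLP sketch: one shared hidden layer, two output
    # "heads" (e.g. tongue dorsum x and y), trained jointly on a summed
    # squared error. Illustrative only; sizes and data are made up.
    import numpy as np

    rng = np.random.default_rng(1)
    n_in, n_hidden = 40, 50                  # acoustic window -> hidden
    W1 = rng.normal(scale=0.1, size=(n_in, n_hidden))
    W2 = rng.normal(scale=0.1, size=(n_hidden, 2))  # two articulatory targets

    def forward(x):
        h = np.tanh(x.dot(W1))               # shared hidden representation
        return h, h.dot(W2)                  # both tasks read the same layer

    # One joint gradient-descent step on toy data.
    x = rng.normal(size=(8, n_in))           # 8 acoustic frames
    t = rng.normal(size=(8, 2))              # 8 pairs of articulatory targets
    h, y = forward(x)
    err = y - t
    grad_W2 = h.T.dot(err) / len(x)
    grad_W1 = x.T.dot(err.dot(W2.T) * (1 - h ** 2)) / len(x)
    W2 -= 0.1 * grad_W2
    W1 -= 0.1 * grad_W1
    print(float((err ** 2).mean()))
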
@article{clarkrichmondking_specom2007,
  author = {Robert A. J. Clark and Korin Richmond and Simon King},
  title = {Multisyn: Open-domain unit selection for the
                   {F}estival speech synthesis system},
  journal = {Speech Communication},
  volume = 49,
  number = 4,
  pages = {317--330},
  abstract = {We present the implementation and evaluation of an
                   open-domain unit selection speech synthesis engine
                   designed to be flexible enough to encourage further
                   unit selection research and allow rapid voice
                   development by users with minimal speech synthesis
                   knowledge and experience. We address the issues of
                   automatically processing speech data into a usable
                   voice using automatic segmentation techniques and how
                   the knowledge obtained at labelling time can be
                   exploited at synthesis time. We describe target cost
                   and join cost implementation for such a system and
                   describe the outcome of building voices with a number
                   of different sized datasets. We show that, in a
                   competitive evaluation, voices built using this
                   technology compare favourably to other systems.},
  categories = {speech synthesis, festival, multisyn, unitselection},
  doi = {10.1016/j.specom.2007.01.014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/clarkrichmondking_specom2007.pdf},
  year = 2007
}
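
The search at the heart of a unit selection engine of this kind combines a target cost and a join cost under dynamic programming. The sketch below is a generic Viterbi search over candidate units with placeholder cost functions; it is not Multisyn's cost implementation.

    # Generic unit-selection Viterbi sketch: pick one candidate unit per
    # target position minimising total target cost + join cost.
    def select_units(targets, candidates, target_cost, join_cost):
        """candidates[i] is the list of candidate units for targets[i]."""
        # best[i][j] = (cumulative cost, back-pointer) for candidate j at i
        best = [[(target_cost(targets[0], c), None) for c in candidates[0]]]
        for i in range(1, len(targets)):
            row = []
            for c in candidates[i]:
                tc = target_cost(targets[i], c)
                cost, back = min(
                    (best[i - 1][k][0] + join_cost(p, c) + tc, k)
                    for k, p in enumerate(candidates[i - 1]))
                row.append((cost, back))
            best.append(row)
        # trace back the cheapest path
        j = min(range(len(best[-1])), key=lambda k: best[-1][k][0])
        path = []
        for i in range(len(targets) - 1, -1, -1):
            path.append(candidates[i][j])
            j = best[i][j][1]
        return list(reversed(path))

    # Toy usage: "units" are numbers, target cost is distance to the target,
    # join cost penalises jumps between consecutive units.
    targets = [1.0, 2.0, 3.0]
    cands = [[0.8, 1.5], [1.9, 2.6], [2.7, 3.2]]
    print(select_units(targets, cands, lambda t, u: abs(t - u),
                       lambda a, b: 0.5 * abs(b - a)))
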
@inproceedings{uriaIS2012,
  author = {Benigno Uria and Iain Murray and Steve Renals and
                   Korin Richmond},
  title = {Deep Architectures for Articulatory Inversion},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = { We implement two deep architectures for the
                   acoustic-articulatory inversion mapping problem: a deep
                   neural network and a deep trajectory mixture density
                   network. We find that in both cases, deep architectures
                   produce more accurate predictions than shallow
                   architectures and that this is due to the higher
                   expressive capability of a deep model and not a
                   consequence of adding more adjustable parameters. We
                   also find that a deep trajectory mixture density
                   network is able to obtain better inversion accuracies
                   than smoothing the results of a deep neural network.
                   Our best model obtained an average root mean square
                   error of 0.885 mm on the MNGU0 test dataset.},
  categories = {Articulatory inversion, deep neural network, deep
                   belief network, deep regression network, pretraining},
  keywords = {Articulatory inversion, deep neural network, deep
                   belief network, deep regression network, pretraining},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Uria_et_al_IS2012.pdf},
  year = 2012
}
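
Since the entry above concerns deep feed-forward regression for the inversion mapping, a bare-bones illustration of the kind of model involved (forward pass and RMSE only, with made-up sizes; no pretraining, and not the models evaluated in the paper) is:

    # Bare-bones deep feed-forward regression sketch for acoustic-to-
    # articulatory inversion: stacked nonlinear layers mapping an acoustic
    # frame (plus context) to EMA coordinates, evaluated with RMSE.
    import numpy as np

    rng = np.random.default_rng(0)
    layer_sizes = [120, 300, 300, 300, 12]   # acoustic input -> 12 EMA coords
    weights = [rng.normal(scale=0.05, size=(a, b))
               for a, b in zip(layer_sizes[:-1], layer_sizes[1:])]

    def forward(x):
        for W in weights[:-1]:
            x = np.tanh(x.dot(W))            # hidden layers
        return x.dot(weights[-1])            # linear output layer

    # Toy evaluation on random "test" frames against random "measured" EMA.
    acoustic = rng.normal(size=(50, 120))
    ema_true = rng.normal(size=(50, 12))
    rmse = np.sqrt(((forward(acoustic) - ema_true) ** 2).mean())
    print(rmse)
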
@inproceedings{richmond1997,
  author = {Richmond, K. and Smith, A. and Amitay, E.},
  title = {Detecting Subject Boundaries Within Text: A
                   Language-independent Statistical Approach},
  booktitle = {Proc. The Second Conference on Empirical Methods in
                   Natural Language Processing},
  pages = {47--54},
  address = {Brown University, Providence, USA},
  abstract = {We describe here an algorithm for detecting subject
                   boundaries within text based on a statistical lexical
                   similarity measure. Hearst has already tackled this
                   problem with good results (Hearst, 1994). One of her
                   main assumptions is that a change in subject is
                   accompanied by a change in vocabulary. Using this
                   assumption, but by introducing a new measure of word
                   significance, we have been able to build a robust and
                   reliable algorithm which exhibits improved accuracy
                   without sacrificing language independence.},
  categories = {nlp, texttiling, subject detection},
  key = {richmond1997},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/Richmond_1997_a.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/Richmond_1997_a.ps},
  year = 1997
}
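
A minimal version of the underlying idea (lexical similarity between adjacent blocks of text dropping at a subject shift) is sketched below. It uses plain word counts and cosine similarity; the word-significance measure introduced in the paper is not reproduced.

    # Sketch of similarity-based subject boundary detection: compute cosine
    # similarity between word counts of adjacent blocks and propose a
    # boundary wherever similarity falls below a threshold.
    import math
    from collections import Counter

    def cosine(a, b):
        dot = sum(a[w] * b[w] for w in a.keys() & b.keys())
        norm = math.sqrt(sum(v * v for v in a.values())) * \
               math.sqrt(sum(v * v for v in b.values()))
        return dot / norm if norm else 0.0

    def boundaries(sentences, block=2, threshold=0.1):
        """Return indices i such that a boundary is proposed before
        sentence i, based on the two surrounding blocks of sentences."""
        bounds = []
        for i in range(block, len(sentences) - block + 1):
            left = Counter(w for s in sentences[i - block:i] for w in s.split())
            right = Counter(w for s in sentences[i:i + block] for w in s.split())
            if cosine(left, right) < threshold:
                bounds.append(i)
        return bounds

    # Toy usage: four sentences, vocabulary changes after the second one.
    text = ["the cat sat on the mat", "the cat drank milk",
            "stock markets fell sharply", "markets rallied later"]
    print(boundaries(text, block=2, threshold=0.1))
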
@inproceedings{richmond2007b,
  author = {Richmond, K. and Strom, V. and Clark, R. and
                   Yamagishi, J. and Fitt, S.},
  title = {Festival Multisyn Voices for the 2007 Blizzard
                   Challenge},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc. SSW6)},
  address = {Bonn, Germany},
  abstract = {This paper describes selected aspects of the Festival
                   Multisyn entry to the Blizzard Challenge 2007. We
                   provide an overview of the process of building the
                   three required voices from the speech data provided.
                   This paper focuses on new features of Multisyn which
                   are currently under development and which have been
                   employed in the system used for this Blizzard
                   Challenge. These differences are the application of a
                   more flexible phonetic lattice representation during
                   forced alignment labelling and the use of a pitch
                   accent target cost component. Finally, we also examine
                   aspects of the speech data provided for this year's
                   Blizzard Challenge and raise certain issues for
                   discussion concerning the aim of comparing voices made
                   with differing subsets of the data provided.},
  categories = {tts, blizzard, multisyn, unit selection},
  key = {richmond2007b},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007paper.pdf},
  year = 2007
}
@inproceedings{steiner_is2009a,
  author = {Steiner, I. and Richmond, K.},
  title = {Towards Unsupervised Articulatory Resynthesis of
                   {G}erman Utterances using {EMA} data},
  booktitle = {Proc. Interspeech},
  pages = {2055--2058},
  address = {Brighton, UK},
  abstract = {As part of ongoing research towards integrating an
                   articulatory synthesizer into a text-to-speech (TTS)
                   framework, a corpus of German utterances recorded with
                   electromagnetic articulography (EMA) is resynthesized
                   to provide training data for statistical models. The
                   resynthesis is based on a measure of similarity between
                   the original and resynthesized EMA trajectories,
                   weighted by articulatory relevance. Preliminary results
                   are discussed and future work outlined.},
  keywords = {articulatory speech synthesis, copy synthesis,
                   electromagnetic articulography, EMA},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090558.pdf},
  year = 2009
}
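
The similarity measure described (distance between original and resynthesized EMA trajectories, weighted by articulatory relevance) might take roughly the following form; the channel names and weights here are invented for illustration.

    # Sketch of a relevance-weighted distance between an original and a
    # resynthesized set of EMA trajectories. Channels and weights invented;
    # only the shape of the computation is intended to be suggestive.
    import numpy as np

    def weighted_rmse(original, resynth, weights):
        """original, resynth: arrays of shape (frames, channels);
        weights: per-channel articulatory relevance, shape (channels,)."""
        err = (original - resynth) ** 2          # squared error per sample
        per_channel = err.mean(axis=0)           # mean over time
        return float(np.sqrt((weights * per_channel).sum() / weights.sum()))

    # Toy usage: 100 frames of 3 hypothetical channels (tongue tip x/y,
    # lower lip y), with the tongue tip weighted more heavily.
    rng = np.random.default_rng(0)
    orig = rng.normal(size=(100, 3))
    resyn = orig + rng.normal(scale=0.1, size=(100, 3))
    print(weighted_rmse(orig, resyn, np.array([2.0, 2.0, 1.0])))
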
@inproceedings{richmond2006,
  author = {Richmond, K.},
  title = {A Trajectory Mixture Density Network for the
                   Acoustic-Articulatory Inversion Mapping},
  booktitle = {Proc. Interspeech},
  address = {Pittsburgh, USA},
  abstract = {This paper proposes a trajectory model which is based
                   on a mixture density network trained with target
                   features augmented with dynamic features together with
                   an algorithm for estimating maximum likelihood
                   trajectories which respects constraints between the
                   static and derived dynamic features. This model was
                   evaluated on an inversion mapping task. We found the
                   introduction of the trajectory model successfully
                   reduced root mean square error by up to $7.5\%$, as
                   well as increasing correlation scores.},
  categories = {acoustic-articulatory, inversion mapping, MDN, MLPG,
                   trajectory modelling},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/interspeech2006_richmond.pdf},
  year = 2006
}
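
The maximum likelihood trajectory step mentioned in the abstract above follows the standard parameter generation calculation: given per-frame means and variances for a static feature and its delta, solve for the static trajectory most likely under both. The sketch below is a generic single-channel, single-Gaussian version, not the paper's code.

    # Generic MLPG-style sketch for one channel: recover the maximum
    # likelihood static trajectory c by solving (W' P W) c = W' P mu, where
    # W stacks the identity (static) and a central-difference delta window.
    import numpy as np

    def ml_trajectory(mu_static, var_static, mu_delta, var_delta):
        T = len(mu_static)
        W = np.zeros((2 * T, T))
        W[:T, :] = np.eye(T)                  # static rows
        for t in range(T):                    # delta rows: 0.5*(c[t+1]-c[t-1])
            if t > 0:
                W[T + t, t - 1] = -0.5
            if t < T - 1:
                W[T + t, t + 1] = 0.5
        mu = np.concatenate([mu_static, mu_delta])
        prec = np.concatenate([1.0 / var_static, 1.0 / var_delta])
        A = W.T.dot(prec[:, None] * W)
        b = W.T.dot(prec * mu)
        return np.linalg.solve(A, b)

    # Toy usage: noisy static means around a ramp, deltas saying "keep rising".
    T = 20
    mu_s = np.linspace(0, 2, T) + np.random.default_rng(0).normal(scale=0.3, size=T)
    traj = ml_trajectory(mu_s, np.full(T, 0.5), np.full(T, 0.1), np.full(T, 0.01))
    print(np.round(traj, 2))
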
@inproceedings{richmond2001,
  author = {Richmond, K.},
  title = {Mixture Density Networks, Human Articulatory Data and
                   Acoustic-to-Articulatory Inversion of Continuous Speech},
  booktitle = {Proc. Workshop on Innovation in Speech Processing},
  pages = {259--276},
  organization = {Institute of Acoustics},
  categories = {artic, ann, mlp, mdn, inversion, mocha, edinburgh},
  key = {richmond2001},
  month = apr,
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Richmond_2001_a.ps},
  year = 2001
}
@inproceedings{hofer-eurosp05,
  author = {G. Hofer and K. Richmond and R. Clark},
  title = {Informed Blending of Databases for Emotional Speech
                   Synthesis},
  booktitle = {Proc. Interspeech},
  abstract = {The goal of this project was to build a unit selection
                   voice that could portray emotions with varying
                   intensities. A suitable definition of an emotion was
                   developed along with a descriptive framework that
                   supported the work carried out. A single speaker was
                   recorded portraying happy and angry speaking styles.
                   Additionally a neutral database was also recorded. A
                   target cost function was implemented that chose units
                   according to emotion mark-up in the database. The
                   Dictionary of Affect supported the emotional target
                   cost function by providing an emotion rating for words
                   in the target utterance. If a word was particularly
                   'emotional', units from that emotion were favoured. In
                   addition, intensity could be varied, which resulted in
                   a bias towards selecting a greater number of emotional
                   units. A perceptual evaluation was carried out and
                   subjects were able to reliably recognise emotions with
                   varying numbers of emotional units present in the target
                   utterance.},
  categories = {speech synthesis,emotion,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.ps},
  year = 2005
}
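
The emotion-aware target cost described above can be caricatured as follows; the weighting scheme, intensity scaling and word ratings are invented placeholders rather than the cost actually used in the system.

    # Caricature of an emotion-aware target cost: penalise candidate units
    # whose emotion label differs from the requested one, scaled by how
    # "emotional" the current word is (a stand-in for a Dictionary of Affect
    # rating) and by a global intensity setting. Invented weights.
    def emotional_target_cost(base_cost, unit_emotion, target_emotion,
                              word_rating, intensity):
        """word_rating in [0, 1]: how emotional the word is;
        intensity in [0, 1]: how strongly emotion should be expressed."""
        mismatch = 0.0 if unit_emotion == target_emotion else 1.0
        return base_cost + mismatch * word_rating * intensity

    # Toy usage: an 'angry' unit considered for a fairly emotional word
    # while synthesising happy speech at high intensity.
    print(emotional_target_cost(0.3, "angry", "happy",
                                word_rating=0.8, intensity=0.9))
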
@article{king07:JASA2007,
  author = {King, S. and Frankel, J. and Livescu, K. and
                   McDermott, E. and Richmond, K. and Wester, M.},
  title = {Speech production knowledge in automatic speech
                   recognition},
  journal = {Journal of the Acoustical Society of America},
  volume = 121,
  number = 2,
  pages = {723--742},
  abstract = {Although much is known about how speech is produced,
                   and research into speech production has resulted in
                   measured articulatory data, feature systems of
                   different kinds and numerous models, speech production
                   knowledge is almost totally ignored in current
                   mainstream approaches to automatic speech recognition.
                   Representations of speech production allow simple
                   explanations for many phenomena observed in speech
                   which cannot be easily analyzed from either acoustic
                   signal or phonetic transcription alone. In this
                   article, we provide a survey of a growing body of work
                   in which such representations are used to improve
                   automatic speech recognition.},
  month = feb,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/King_et_al_review.pdf},
  year = 2007
}
@article{onnis2005,
  author = {Onnis, L. and Monaghan, P. and Richmond, K. and
                   Chater, N.},
  title = {Phonology impacts segmentation in speech processing.},
  journal = {Journal of Memory and Language},
  volume = {53},
  number = {2},
  pages = {225--237},
  abstract = {Peña, Bonatti, Nespor and Mehler (2002) investigated an
                   artificial language where the structure of words was
                   determined by nonadjacent dependencies between
                   syllables. They found that segmentation of continuous
                   speech could proceed on the basis of these
                   dependencies. However, Peña et al.'s artificial
                   language contained a confound in terms of phonology, in
                   that the dependent syllables began with plosives and
                   the intervening syllables began with continuants. We
                   consider three hypotheses concerning the role of
                   phonology in speech segmentation in this task: (1)
                   participants may recruit probabilistic phonotactic
                   information from their native language to the
                   artificial language learning task; (2) phonetic
                   properties of the stimuli, such as the gaps that
                   precede unvoiced plosives, can influence segmentation;
                   and (3) grouping by phonological similarity between
                   dependent syllables contributes to learning the
                   dependency. In a series of experiments controlling the
                   phonological and statistical structure of the language,
                   we found that segmentation performance is influenced by
                   the three factors in different degrees. Learning of
                   non-adjacent dependencies did not occur when (3) is
                   eliminated. We suggest that phonological processing
                   provides a fundamental contribution to distributional
                   analysis.},
  categories = {artificial language learning, statistical learning,
                   segmentation, phonology, festival},
  key = {onnis2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/jml.pdf},
  year = 2005
}