The Centre for Speech Technology Research, The University of Edinburgh

Publications by Joe Frankel

joe.bib

@inproceedings{frankel07:AF_MLP,
  author = {Frankel, J. and Magimai-Doss, M. and King, S. and
                   Livescu, K. and Çetin, Ö.},
  title = {Articulatory Feature Classifiers Trained on 2000 hours
                   of Telephone Speech},
  booktitle = {Proc. Interspeech},
  address = {Antwerp, Belgium},
  abstract = {This paper is intended to advertise the public
                   availability of the articulatory feature (AF)
                   classification multi-layer perceptrons (MLPs) which
                   were used in the Johns Hopkins 2006 summer workshop. We
                   describe the design choices, data preparation, AF label
                   generation, and the training of MLPs for feature
                   classification on close to 2000 hours of telephone
                   speech. In addition, we present some analysis of the
                   MLPs in terms of classification accuracy and confusions
                   along with a brief summary of the results obtained
                   during the workshop using the MLPs. We invite
                   interested parties to make use of these MLPs.},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/frankel_AF-MLP.pdf},
  year = 2007
}
@inproceedings{vipperla08,
  author = {Ravichander Vipperla and Steve Renals and Joe Frankel},
  title = {Longitudinal study of {ASR} performance on ageing
                   voices},
  booktitle = {Proc. Interspeech},
  address = {Brisbane},
  abstract = {This paper presents the results of a longitudinal
                   study of ASR performance on ageing voices. Experiments
                   were conducted on the audio recordings of the
                   proceedings of the Supreme Court of the United States
                   (SCOTUS). Results show that Automatic Speech
                   Recognition (ASR) Word Error Rates (WERs) for elderly
                   voices are significantly higher than those for adult
                   voices, and that the WER increases gradually with the
                   age of the elderly speakers. Maximum likelihood linear
                   regression (MLLR) based speaker adaptation improves the
                   WER on ageing voices, though performance remains
                   considerably worse than for adult voices. Speaker
                   adaptation does, however, reduce the growth of WER with
                   age during old age.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/vipperla_is08.pdf},
  year = 2008
}
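
The MLLR adaptation mentioned above maps every Gaussian mean through a
shared affine transform. A minimal Python sketch of that mapping;
estimation of the transform itself is not shown, and the function name is
ours:

    # MLLR mean adaptation: mu' = A mu + b, with A and b estimated from the
    # target speaker's data. Sketch of the mapping only; the EM estimation
    # of A and b is not reproduced here.
    import numpy as np

    def mllr_adapt_means(means, A, b):
        """means: n x d array of Gaussian means; A: d x d; b: length d."""
        return means @ A.T + b
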
@article{Wang_JCST2012,
  author = {Dong Wang and Javier Tejedor and Simon King and Joe
                   Frankel},
  title = {Term-dependent Confidence Normalization for
                   Out-of-Vocabulary Spoken Term Detection},
  journal = {Journal of Computer Science and Technology},
  volume = {27},
  number = {2},
  abstract = {Spoken Term Detection (STD) is a fundamental component
                   of spoken information retrieval systems. A key task of
                   an STD system is to determine reliable detections and
                   reject false alarms based on certain confidence
                   measures. The detection posterior probability, which is
                   often computed from lattices, is a widely used
                   confidence measure. However, a potential problem of
                   this confidence measure is that the confidence scores
                   of detections of all search terms are treated
                   uniformly, regardless of how much they may differ in
                   terms of phonetic or linguistic properties. This
                   problem is particularly evident for out-of-vocabulary
                   (OOV) terms which tend to exhibit high intra-term
                   diversity. To address the discrepancy in the confidence
                   levels that the same confidence score may convey for
                   different terms, a term-dependent decision strategy is
                   desirable, for example the term-specific threshold
                   (TST) approach. In this work, we propose a
                   term-dependent normalisation technique which
                   compensates for term diversity on confidence
                   estimation. Particularly, we propose a linear bias
                   compensation and a discriminative compensation to deal
                   with the bias problem that is inherent in lattice-based
                   confidence measuring from which the TST approach
                   suffers. We tested the proposed technique on speech
                   data from the multi-party meeting domain with two
                   state-of-the-art STD systems based on phonemes and
                   words respectively. The experimental results
                   demonstrate that the confidence normalisation approach
                   leads to a significant performance improvement in STD,
                   particularly for OOV terms with phoneme-based systems.},
  doi = {10.1007/s11390-012-1228-x},
  year = 2012
}
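
A minimal sketch of the linear bias compensation described above, assuming
(our choice, not necessarily the paper's) that the bias is predicted from
term length by ordinary least squares:

    # Hypothetical sketch of term-dependent confidence normalisation for STD:
    # fit a linear model of the score bias as a function of a term property,
    # then subtract the predicted bias from each raw lattice posterior.
    import numpy as np

    def fit_bias(term_lengths, score_errors):
        """Least-squares fit of bias ~ a * term_length + c."""
        X = np.column_stack([term_lengths, np.ones(len(term_lengths))])
        w, *_ = np.linalg.lstsq(X, score_errors, rcond=None)
        return w

    def normalise(raw_score, term_length, w):
        """Subtract the predicted term-dependent bias from a raw posterior."""
        return raw_score - (w[0] * term_length + w[1])
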
@inproceedings{wang:frankel:tejedor:king:icassp2008,
  author = {Dong Wang and Joe Frankel and Javier Tejedor and Simon
                   King},
  title = {A comparison of phone and grapheme-based spoken term
                   detection},
  booktitle = {Proc. ICASSP},
  pages = {4969--4972 },
  abstract = {We propose grapheme-based sub-word units for spoken
                   term detection (STD). Compared to phones, graphemes
                   have a number of potential advantages. For
                   out-of-vocabulary search terms, phone-based approaches
                   must generate a pronunciation using letter-to-sound
                   rules. Using graphemes obviates this potentially
                   error-prone hard decision, shifting pronunciation
                   modelling into the statistical models describing the
                   observation space. In addition, long-span grapheme
                   language models can be trained directly from large text
                   corpora. We present experiments on Spanish and English
                   data, comparing phone and grapheme-based STD. For
                   Spanish, where phone and grapheme-based systems give
                   similar transcription word error rates (WERs),
                   grapheme-based STD significantly outperforms a
                   phone-based approach. The converse is found for English,
                   where the phone-based system outperforms a grapheme
                   approach. However, we present additional analysis which
                   suggests that phone-based STD performance levels may be
                   achieved by a grapheme-based approach despite lower
                   transcription accuracy, and that the two approaches may
                   usefully be combined. We propose a number of directions
                   for future development of these ideas, and suggest that
                   if grapheme-based STD can match phone-based
                   performance, the inherent flexibility in dealing with
                   out-of-vocabulary terms makes this a desirable
                   approach.},
  doi = {10.1109/ICASSP.2008.4518773},
  month = {March-April},
  year = 2008
}
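
The appeal of grapheme units is that they come straight from a term's
spelling, so out-of-vocabulary terms need no letter-to-sound step. A small
illustrative sketch; the trigrapheme notation and boundary handling are our
assumptions:

    # Grapheme sub-word units from the spelling alone; context-dependent
    # "trigrapheme" units as in the related Speech Communication article.
    def trigraphemes(word):
        gs = ["#"] + list(word.lower()) + ["#"]  # '#' marks word boundaries
        return [f"{gs[i-1]}-{gs[i]}+{gs[i+1]}" for i in range(1, len(gs) - 1)]

    print(trigraphemes("madrid"))
    # ['#-m+a', 'm-a+d', 'a-d+r', 'd-r+i', 'r-i+d', 'i-d+#']
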
@article{frankel07:factoring,
  author = {Frankel, J. and King, S.},
  title = {Factoring {G}aussian Precision Matrices for Linear
                   Dynamic Models},
  journal = {Pattern Recognition Letters},
  volume = {28},
  number = {16},
  pages = {2264--2272},
  abstract = {The linear dynamic model (LDM), also known as the
                   Kalman filter model, has been the subject of research
                   in the engineering, control, and more recently, machine
                   learning and speech technology communities. The
                   Gaussian noise processes are usually assumed to have
                   diagonal, or occasionally full, covariance matrices. A
                   number of recent papers have considered modelling the
                   precision rather than covariance matrix of a Gaussian
                   distribution, and this work applies such ideas to the
                   LDM. A Gaussian precision matrix $P$ can be factored into
                   the form $P = U^T S U$ where $U$ is a transform and $S$ a
                   diagonal matrix. By varying the form of $U$, the
                   covariance can be specified as being diagonal or full,
                   or used to model a given set of spatial dependencies.
                   Furthermore, the transform and scaling components can
                   be shared between models, allowing richer distributions
                   with only marginally more parameters than required to
                   specify diagonal covariances. The method described in
                   this paper allows the construction of models with an
                   appropriate number of parameters for the amount of
                   available training data. We provide illustrative
                   experimental results on synthetic and real speech data
                   in which models with factored precision matrices and
                   automatically-selected numbers of parameters are as
                   good as or better than models with diagonal covariances
                   on small data sets and as good as models with full
                   covariance matrices on larger data sets.},
  categories = {LDM},
  doi = {10.1016/j.patrec.2007.07.008},
  month = {December},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_LDM_covar.pdf},
  year = 2007
}
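
A numerical sketch of the factorisation $P = U^T S U$ described above: a
shared transform plus a per-model diagonal gives a full covariance for
roughly the parameter cost of a diagonal one. Illustration only, using an
orthogonal U so the determinant simplifies; the paper's estimation
procedure is not reproduced:

    import numpy as np

    p = 4
    rng = np.random.default_rng(0)
    U = np.linalg.qr(rng.standard_normal((p, p)))[0]  # shared transform (orthogonal here)
    s = rng.uniform(0.5, 2.0, size=p)                 # model-specific diagonal scales
    P = U.T @ np.diag(s) @ U                          # factored precision matrix

    def log_gauss(x, mu, U, s):
        # log N(x; mu, P^{-1}) with P = U^T S U; as U is orthogonal,
        # log|P| = sum(log s), so no inversion or determinant is needed
        y = U @ (x - mu)
        return 0.5 * (np.sum(np.log(s)) - len(s) * np.log(2 * np.pi) - np.sum(s * y * y))
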
@inproceedings{cetin07:crosslingual,
  author = {Çetin, Ö. and Magimai-Doss, M. and Kantor, A. and
                   King, S. and Bartels, C. and Frankel, J. and Livescu,
                   K.},
  title = {Monolingual and crosslingual comparison of tandem
                   features derived from articulatory and phone {MLP}s},
  booktitle = {Proc. ASRU},
  address = {Kyoto},
  organization = {IEEE},
  abstract = {In recent years, the features derived from posteriors
                   of a multilayer perceptron (MLP), known as tandem
                   features, have proven to be very effective for
                   automatic speech recognition. Most tandem features to
                   date have relied on MLPs trained for phone
                   classification. We recently showed on a relatively
                   small data set that MLPs trained for articulatory
                   feature classification can be equally effective. In
                   this paper, we provide a similar comparison using MLPs
                   trained on a much larger data set - 2000 hours of
                   English conversational telephone speech. We also
                   explore how portable phone- and articulatory
                   feature-based tandem features are in an entirely different
                   language - Mandarin - without any retraining. We find
                   that while phone-based features perform slightly better
                   in the matched-language condition, they perform
                   significantly better in the cross-language condition.
                   Yet, in the cross-language condition, neither approach
                   is as effective as the tandem features extracted from
                   an MLP trained on a relatively small amount of
                   in-domain data. Beyond feature concatenation, we also
                   explore novel observation modelling schemes that allow
                   for greater flexibility in combining the tandem and
                   standard features at hidden Markov model (HMM) outputs.},
  month = {December},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Cetin_etal_ASRU2007.pdf},
  year = 2007
}
@inproceedings{frankel01:alternative,
  author = {Frankel, J. and King, S.},
  title = {Speech recognition in the articulatory domain:
                   investigating an alternative to acoustic {HMM}s},
  booktitle = {Proc. Workshop on Innovations in Speech Processing},
  abstract = {We describe a speech recognition system which uses a
                   combination of acoustic and articulatory features as
                   input. Linear dynamic models capture the trajectories
                   which characterize each segment type. We describe
                   classification and recognition tasks for systems based
                   on acoustic data in conjunction with both real and
                   automatically recovered articulatory parameters.},
  categories = {am,artic,asr,ldm,mocha,edinburgh},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_WISP2001.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_WISP2001.ps},
  year = 2001
}
@article{5510125,
  author = {Wang, D. and King, S. and Frankel, J.},
  title = {Stochastic Pronunciation Modelling for
                   Out-of-Vocabulary Spoken Term Detection},
  journal = {IEEE Transactions on Audio, Speech, and Language
                   Processing},
  volume = {PP},
  number = {99},
  abstract = {Spoken term detection (STD) is the name given to the
                   task of searching large amounts of audio for
                   occurrences of spoken terms, which are typically single
                   words or short phrases. One reason that STD is a hard
                   task is that search terms tend to contain a
                   disproportionate number of out-of-vocabulary (OOV)
                   words. The most common approach to STD uses subword
                   units. This, in conjunction with some method for
                   predicting pronunciations of OOVs from their written
                   form, enables the detection of OOV terms but
                   performance is considerably worse than for
                   in-vocabulary terms. This performance differential can
                   be largely attributed to the special properties of
                   OOVs. One such property is the high degree of
                   uncertainty in the pronunciation of OOVs. We present a
                   stochastic pronunciation model (SPM) which explicitly
                   deals with this uncertainty. The key insight is to
                   search for all possible pronunciations when detecting
                   an OOV term, explicitly capturing the uncertainty in
                   pronunciation. This requires a probabilistic model of
                   pronunciation, able to estimate a distribution over all
                   possible pronunciations. We use a joint-multigram model
                   (JMM) for this and compare the JMM-based SPM with the
                   conventional soft match approach. Experiments using
                   speech from the meetings domain demonstrate that the
                   SPM performs better than soft match in most operating
                   regions, especially at low false alarm probabilities.
                   Furthermore, SPM and soft match are found to be
                   complementary: their combination provides further
                   performance gains.},
  categories = {confidence estimation, spoken term detection, speech
                   recognition, OOVs},
  doi = {10.1109/TASL.2010.2058800},
  issn = {1558-7916},
  month = jul,
  year = 2010
}
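
A hedged sketch of the SPM search described above: score an OOV term by
summing over its plausible pronunciations, weighting each detection by its
pronunciation probability. `jmm_pronunciations` and `lattice_search` are
hypothetical stand-ins for the joint-multigram model and the lattice search
backend:

    def spm_detect(term, jmm_pronunciations, lattice_search, n_best=10):
        """Accumulate P(pron|term) * P(detection|pron) per time span."""
        detections = {}
        for pron, p_pron in jmm_pronunciations(term, n_best):
            for time_span, score in lattice_search(pron):
                detections[time_span] = detections.get(time_span, 0.0) + p_pron * score
        return sorted(detections.items(), key=lambda kv: -kv[1])
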
@article{frankel07:AF_DBN,
  author = {Frankel, J. and Wester, M. and King, S.},
  title = {Articulatory feature recognition using dynamic
                   {B}ayesian networks},
  journal = {Computer Speech \& Language},
  volume = {21},
  number = {4},
  pages = {620--640},
  abstract = {We describe a dynamic Bayesian network for
                   articulatory feature recognition. The model is intended
                   to be a component of a speech recognizer that avoids
                   the problems of conventional ``beads-on-a-string''
                   phoneme-based models. We demonstrate that the model
                   gives superior recognition of articulatory features
                   from the speech signal compared with a state-of-the-art
                   neural network system. We also introduce a training
                   algorithm that offers two major advances: it does not
                   require time-aligned feature labels and it allows the
                   model to learn a set of asynchronous feature changes in
                   a data-driven manner.},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_etal_CSL2007.pdf},
  year = 2007
}
@inproceedings{dongwang_interspeech09_spm,
  author = {Dong Wang and Simon King and Joe Frankel},
  title = {Stochastic Pronunciation Modelling for Spoken Term
                   Detection},
  booktitle = {Proc. Interspeech},
  pages = {2135--2138},
  address = {Brighton, UK},
  abstract = {A major challenge faced by a spoken term detection
                   (STD) system is the detection of out-of-vocabulary
                   (OOV) terms. Although a subword-based STD system is
                   able to detect OOV terms, performance reduction is
                   always observed compared to in-vocabulary terms.
                   Current approaches to STD do not acknowledge the
                   particular properties of OOV terms, such as
                   pronunciation uncertainty. In this paper, we use a
                   stochastic pronunciation model to deal with the
                   uncertain pronunciations of OOV terms. By considering
                   all possible term pronunciations, predicted by a
                   joint-multigram model, we observe a significant
                   performance improvement. },
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/spm.pdf},
  year = 2009
}
@inproceedings{janin06:rt06s,
  author = {Janin, A. and Stolcke, A. and Anguera, X. and Boakye,
                   K. and Çetin, Ö. and Frankel, J. and Zheng, J.},
  title = {The {ICSI-SRI} Spring 2006 Meeting Recognition System},
  booktitle = {Proc. MLMI},
  address = {Washington DC.},
  abstract = {We describe the development of the ICSI-SRI speech
                   recognition system for the National Institute of
                   Standards and Technology (NIST) Spring 2006 Meeting
                   Rich Transcription (RT-06S) evaluation, highlighting
                   improvements made since last year, including
                   improvements to the delay-and-sum algorithm, the
                   nearfield segmenter, language models, posterior-based
                   features, HMM adaptation methods, and adapting to a
                   small amount of new lecture data. Results are reported
                   on RT-05S and RT-06S meeting data. Compared to the
                   RT-05S conference system, we achieved an overall
                   improvement of 4\% relative in the MDM and SDM
                   conditions, and 11\% relative in the IHM condition. On
                   lecture data, we achieved an overall improvement of 8\%
                   relative in the SDM condition, 12\% on MDM, 14\% on
                   ADM, and 15\% on IHM.},
  categories = {am,asr},
  month = {May},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/Janin_et_al_RT06s.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/Janin_et_al_RT06s.ps},
  year = 2006
}
@inproceedings{Cetin07:tandem,
  author = {Çetin, Ö. and Kantor, A. and King, S. and Bartels,
                   C. and Magimai-Doss, M. and Frankel, J. and Livescu, K.},
  title = {An articulatory feature-based tandem approach and
                   factored observation modeling},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  abstract = {The so-called tandem approach, where the posteriors of
                   a multilayer perceptron (MLP) classifier are used as
                   features in an automatic speech recognition (ASR)
                   system has proven to be a very effective method. Most
                   tandem approaches to date have relied on MLPs trained
                   for phone classification, appending the posterior
                   features to standard features in a hidden Markov model
                   (HMM) system. In this paper, we develop an
                   alternative tandem approach based on MLPs trained for
                   articulatory feature (AF) classification. We also
                   develop a factored observation model for characterizing
                   the posterior and standard features at the HMM outputs,
                   allowing for separate hidden mixture and state-tying
                   structures for each factor. In experiments on a subset
                   of Switchboard, we show that the AF-based tandem
                   approach is as effective as the phone-based approach,
                   and that the factored observation model significantly
                   outperforms the simple feature concatenation approach
                   while using fewer parameters.},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Cetin_icassp07_tandem.pdf},
  year = 2007
}
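
The factored observation model above replaces plain concatenation of tandem
and standard features with separate per-state mixture models whose
log-likelihoods are summed. A schematic sketch; the scorer interface is
assumed:

    # Factored observation likelihood at an HMM state: each feature stream
    # gets its own mixture model (and, in the paper, its own state-tying);
    # conditional independence given the state replaces concatenation.
    def factored_loglik(o_std, o_tandem, state, gmm_std, gmm_tandem):
        return gmm_std[state].score(o_std) + gmm_tandem[state].score(o_tandem)
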
@inproceedings{livescu07:JHU_summary,
  author = {Livescu, K. and Çetin, Ö. and Hasegawa-Johnson, M.
                   and King, S. and Bartels, C. and Borges, N. and Kantor,
                   A. and Lal, P. and Yung, L. and Bezman, A. and
                   Dawson-Haggerty, S. and Woods, B. and Frankel, J. and
                   Magimai-Doss, M. and Saenko, K.},
  title = {Articulatory feature-based methods for acoustic and
                   audio-visual speech recognition: {S}ummary from the
                   2006 {JHU} {S}ummer {W}orkshop},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  abstract = {We report on investigations, conducted at the 2006
                   Johns Hopkins Workshop, into the use of articulatory
                   features (AFs) for observation and pronunciation models
                   in speech recognition. In the area of observation
                   modeling, we use the outputs of AF classifiers both
                   directly, in an extension of hybrid HMM/neural network
                   models, and as part of the observation vector, an
                   extension of the tandem approach. In the area of
                   pronunciation modeling, we investigate a model having
                   multiple streams of AF states with soft synchrony
                   constraints, for both audio-only and audio-visual
                   recognition. The models are implemented as dynamic
                   Bayesian networks, and tested on tasks from the
                   Small-Vocabulary Switchboard (SVitchboard) corpus and
                   the CUAVE audio-visual digits corpus. Finally, we
                   analyze AF classification and forced alignment using a
                   newly collected set of feature-level manual
                   transcriptions.},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/livescu_icassp07_sum.pdf},
  year = 2007
}
@inproceedings{wester04:asynch,
  author = {Wester, M. and Frankel, J. and King, S.},
  title = {Asynchronous Articulatory Feature Recognition Using
                   Dynamic {B}ayesian Networks},
  booktitle = {Proc. IEICE Beyond HMM Workshop},
  address = {Kyoto},
  abstract = {This paper builds on previous work where dynamic
                   Bayesian networks (DBN) were proposed as a model for
                   articulatory feature recognition. Using DBNs makes it
                   possible to model the dependencies between features, an
                   addition to previous approaches which was found to
                   improve feature recognition performance. The DBN
                   results were promising, giving accuracy close to that of
                   artificial neural nets (ANNs). However, the system was
                   trained on canonical labels, leading to an overly
                   strong set of constraints on feature co-occurrence. In
                   this study, we describe an embedded training scheme
                   which learns a set of data-driven asynchronous feature
                   changes where supported in the data. Using a subset of
                   the OGI Numbers corpus, we describe articulatory
                   feature recognition experiments using both
                   canonically-trained and asynchronous DBNs. Performance
                   using DBNs is found to exceed that of ANNs trained on
                   an identical task, giving a higher recognition
                   accuracy. Furthermore, inter-feature dependencies
                   result in a more structured model, giving rise to fewer
                   feature combinations in the recognition output. In
                   addition to an empirical evaluation of this modelling
                   approach, we give a qualitative analysis, comparing
                   asynchrony found through our data-driven methods to the
                   asynchrony which may be expected on the basis of
                   linguistic knowledge.},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.ps},
  year = 2004
}
@inproceedings{dongwang_icassp09,
  author = {Dong Wang and Javier Tejedor and Joe Frankel and
                   Simon King},
  title = {Posterior-based confidence measures for spoken term
                   detection},
  booktitle = {Proc. ICASSP},
  address = {Taipei, Taiwan},
  abstract = {Confidence measures play a key role in spoken term
                   detection (STD) tasks. The confidence measure expresses
                   the posterior probability of the search term appearing
                   in the detection period, given the speech. Traditional
                   approaches are based on the acoustic and language model
                   scores for candidate detections found using automatic
                   speech recognition, with Bayes' rule being used to
                   compute the desired posterior probability. In this
                   paper, we present a novel direct posterior-based
                   confidence measure which, instead of resorting to the
                   Bayesian formula, calculates posterior probabilities
                   from a multi-layer perceptron (MLP) directly. Compared
                   with traditional Bayesian-based methods, the
                   direct-posterior approach is conceptually and
                   mathematically simpler. Moreover, the MLP-based model
                   does not require assumptions to be made about the
                   acoustic features such as their statistical
                   distribution and the independence of static and dynamic
                   coefficients. Our experimental results in both English
                   and Spanish demonstrate that the proposed direct
                   posterior-based confidence measure improves STD
                   performance.},
  categories = {Spoken term detection, confidence measure, posterior
                   probabilities, MLP},
  month = {April},
  pages = {4889--4892},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/posterior.pdf},
  year = 2009
}
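
A toy sketch of the direct posterior-based confidence measure: an MLP maps
per-detection features straight to a posterior, with no Bayes-rule
combination of acoustic and language model scores. The feature choice and
synthetic data here are illustrative only:

    import numpy as np
    from sklearn.neural_network import MLPClassifier

    rng = np.random.default_rng(1)
    X = rng.standard_normal((200, 3))  # e.g. lattice score, duration, term length
    y = (X[:, 0] + 0.1 * rng.standard_normal(200) > 0).astype(int)  # hit vs false alarm

    mlp = MLPClassifier(hidden_layer_sizes=(16,), max_iter=1000).fit(X, y)
    confidence = mlp.predict_proba(X[:5])[:, 1]  # P(correct detection | features)
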
@inproceedings{frankel01:ASR,
  author = {Frankel, J. and King, S.},
  title = {{ASR} - Articulatory Speech Recognition},
  booktitle = {Proc. {E}urospeech},
  pages = {599--602},
  address = {Aalborg, Denmark},
  abstract = {In this paper we report recent work on a speech
                   recognition system using a combination of acoustic and
                   articulatory features as input. Linear dynamic models
                   are used to capture the trajectories which characterize
                   each segment type. We describe classification and
                   recognition tasks for systems based on acoustic data in
                   conjunction with both real and automatically recovered
                   articulatory parameters.},
  categories = {am,artic,asr,ldm,mocha,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_Eurospeech2001.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Frankel_King_Eurospeech2001.ps},
  year = 2001
}
@inproceedings{frankel04:artic_dbn,
  author = {Frankel, J. and Wester, M. and King, S.},
  title = {Articulatory feature recognition using dynamic
                   {B}ayesian networks},
  booktitle = {Proc. {ICSLP}},
  abstract = {This paper describes the use of dynamic Bayesian
                   networks for the task of articulatory feature
                   recognition. We show that by modeling the dependencies
                   between a set of 6 multi-leveled articulatory features,
                   recognition accuracy is increased over an equivalent
                   system in which features are considered independent.
                   Results are compared to those found using artificial
                   neural networks on an identical task.},
  categories = {am,artic,asr,dbn,timit,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.ps},
  year = 2004
}
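
The gain reported above comes from modelling dependencies between the six
feature streams rather than assuming independence. A toy chain-rule scorer
makes the contrast concrete; the tables and ordering are hypothetical, and
the paper's DBN inference is not reproduced:

    def joint_score(values, conditionals):
        """values: {feature: value}; conditionals: [(feature, parents, table)]
        in topological order, with table[parent_values][value] = probability."""
        score = 1.0
        for feat, parents, table in conditionals:
            key = tuple(values[p] for p in parents)
            score *= table[key][values[feat]]
        return score

    # e.g. voicing depends on manner instead of being independent of it:
    conds = [("manner", (), {(): {"stop": 0.3, "vowel": 0.7}}),
             ("voicing", ("manner",), {("stop",): {"voiced": 0.4, "voiceless": 0.6},
                                       ("vowel",): {"voiced": 0.95, "voiceless": 0.05}})]
    print(joint_score({"manner": "stop", "voicing": "voiced"}, conds))  # 0.3 * 0.4
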
@inproceedings{dongwang_interspeech09_conf,
  author = {Dong Wang and Simon King and Joe Frankel and Peter
                   Bell},
  title = {Term-Dependent Confidence for Out-of-Vocabulary Term
                   Detection},
  booktitle = {Proc. Interspeech},
  pages = {2139--2142},
  address = {Brighton, UK},
  abstract = { Within a spoken term detection (STD) system, the
                   decision maker plays an important role in retrieving
                   reliable detections. Most of the state-of-the-art STD
                   systems make decisions based on a confidence measure
                   that is term-independent, which poses a serious problem
                   for out-of-vocabulary (OOV) term detection. In this
                   paper, we study a term-dependent confidence measure
                   based on confidence normalisation and discriminative
                   modelling, particularly focusing on its remarkable
                   effectiveness for detecting OOV terms. Experimental
                   results indicate that the term-dependent confidence
                   provides a much more significant improvement for OOV
                   terms than for in-vocabulary terms.},
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/conf.pdf},
  year = 2009
}
@inproceedings{vipperla2010a,
  author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
                   Joe},
  title = {Augmentation of adaptation data},
  booktitle = {Proc. Interspeech},
  pages = {530--533},
  address = {Makuhari, Japan},
  abstract = {Linear regression based speaker adaptation approaches
                   can improve Automatic Speech Recognition (ASR) accuracy
                   significantly for a target speaker. However, when the
                   available adaptation data is limited to a few seconds,
                   the accuracy of the speaker-adapted models is often
                   worse than that of speaker-independent models. In this
                   paper, we propose an approach to select a set of
                   reference speakers acoustically close to the target
                   speaker whose data can be used to augment the
                   adaptation data. To determine the acoustic similarity
                   of two speakers, we propose a distance metric based on
                   transforming sample points in the acoustic space with
                   the regression matrices of the two speakers. We show
                   the validity of this approach through a speaker
                   identification task. ASR results on SCOTUS and AMI
                   corpora with limited adaptation data of 10 to 15
                   seconds augmented by data from selected reference
                   speakers show a significant improvement in Word Error
                   Rate over speaker independent and speaker adapted
                   models.},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-is2010.pdf},
  year = 2010
}
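
A sketch of the proposed distance metric: transform a common set of sample
points in the acoustic space with each speaker's regression matrix and
measure how far apart the transformed points land. Euclidean distance and
random sample points are our assumptions:

    import numpy as np

    def speaker_distance(W_i, W_j, points):
        """W_i, W_j: d x (d+1) extended MLLR transforms [A b]; points: n x d."""
        ext = np.hstack([points, np.ones((len(points), 1))])  # append bias term
        diff = ext @ W_i.T - ext @ W_j.T
        return np.mean(np.linalg.norm(diff, axis=1))
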
@inproceedings{dong_ivan_joe_simon_interspeech08_marray,
  author = {Dong Wang and Ivan Himawan and Joe Frankel and Simon
                   King},
  title = {A Posterior Approach for Microphone Array Based Speech
                   Recognition},
  booktitle = {Proc. Interspeech},
  pages = {996--999},
  abstract = {Automatic speech recognition (ASR) becomes rather
                   difficult in the meetings domain because of the adverse
                   acoustic conditions, including increased background
                   noise, echo and reverberation, and frequent cross-talk.
                   Microphone arrays, with various beamforming algorithms,
                   have been demonstrated to boost ASR performance
                   dramatically in such noisy and reverberant
                   environments. However, almost all existing beamforming
                   methods work in the acoustic domain, resorting to
                   signal processing theory and geometric explanation.
                   This limits their application, and induces significant
                   performance degradation when the geometric properties
                   are unavailable or hard to estimate, or if
                   heterogeneous channels exist in the audio system. In
                   this paper, we present a new posterior-based approach
                   for array-based speech recognition. The main idea is
                   that, instead of enhancing the speech signals, we try
                   to enhance the posterior probabilities that frames
                   belong to recognition units, e.g., phones. These
                   enhanced posteriors are then converted to
                   posterior-probability-based features and are modelled
                   by HMMs, leading to a tandem ANN-HMM hybrid system as
                   presented by Hermansky et al. Experimental results
                   demonstrated the validity of this posterior approach.
                   With posterior accumulation or enhancement, significant
                   improvement was achieved over the single-channel
                   baseline. Moreover, we can combine acoustic enhancement
                   and posterior enhancement, leading to a hybrid
                   acoustic-posterior beamforming approach which works
                   significantly better than acoustic beamforming alone,
                   especially in scenarios with moving speakers.},
  categories = {speech recognition, microphone array, beamforming,
                   tandem approach},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/marray.a.pdf},
  year = 2008
}
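
A minimal sketch of the posterior accumulation idea: combine each channel's
MLP phone posteriors frame by frame instead of beamforming the waveforms.
Log-domain averaging is one plausible combination rule, not necessarily the
paper's:

    import numpy as np

    def accumulate_posteriors(channel_posteriors):
        """channel_posteriors: list of (frames x phones) arrays, one per channel."""
        logp = np.mean([np.log(p + 1e-10) for p in channel_posteriors], axis=0)
        p = np.exp(logp)
        return p / p.sum(axis=1, keepdims=True)  # renormalise per frame
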
@article{frankel06:adapt,
  author = {Frankel, J. and King, S.},
  title = {Observation Process Adaptation for Linear Dynamic
                   Models},
  journal = {Speech Communication},
  volume = 48,
  number = 9,
  pages = {1192--1199},
  abstract = {This work introduces two methods for adapting the
                   observation process parameters of linear dynamic models
                   (LDM) or other linear-Gaussian models. The first method
                   uses the expectation-maximization (EM) algorithm to
                   estimate transforms for location and covariance
                   parameters, and the second uses a generalized EM (GEM)
                   approach which reduces computation in making updates
                   from $O(p^6)$ to $O(p^3)$, where $p$ is the feature
                   dimension. We present the results of speaker adaptation
                   on TIMIT phone classification and recognition
                   experiments with relative error reductions of up to
                   $6\%$. Importantly, we find minimal differences in the
                   results from EM and GEM. We therefore propose that the
                   GEM approach be applied to adaptation of hidden Markov
                   models which use non-diagonal covariances. We provide
                   the necessary update equations.},
  categories = {am,asr,ldm,timit,edinburgh},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/Frankel_King_SPECOM2006.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/Frankel_King_SPECOM2006.ps},
  year = 2006
}
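
A toy flavour of transform-based observation adaptation: estimate an affine
map from model means to adaptation frames in closed form. Ordinary least
squares is used here for illustration; the paper's EM/GEM updates for full
linear-Gaussian models are not reproduced:

    import numpy as np

    def estimate_transform(obs, means):
        """obs: n x d adaptation frames; means: n x d aligned model means.
        Returns W (d x (d+1)) such that obs ~ W [mean; 1]."""
        ext = np.hstack([means, np.ones((len(means), 1))])
        W, *_ = np.linalg.lstsq(ext, obs, rcond=None)
        return W.T
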
@article{vipperla2010,
  author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
                   Joe},
  title = {Ageing voices: The effect of changes in voice
                   parameters on {ASR} performance},
  journal = {EURASIP Journal on Audio, Speech, and Music Processing},
  abstract = {With ageing, human voices undergo several changes
                   which are typically characterized by increased
                   hoarseness and changes in articulation patterns. In
                   this study, we have examined the effect on Automatic
                   Speech Recognition (ASR) and found that the Word Error
                   Rates (WER) on older voices is about 9\% absolute
                   higher compared to those of adult voices. Subsequently,
                   we compared several voice source parameters including
                   fundamental frequency, jitter, shimmer, harmonicity and
                   cepstral peak prominence of adult and older males.
                   Several of these parameters show statistically
                   significant difference for the two groups. However,
                   artificially increasing jitter and shimmer measures does
                   not affect the ASR accuracy significantly.
                   Artificially lowering the fundamental frequency
                   degrades the ASR performance marginally but this drop
                   in performance can be overcome to some extent using
                   Vocal Tract Length Normalisation (VTLN). Overall, we
                   observe that the changes in the voice source parameters
                   do not have a significant impact on ASR performance.
                   Comparison of the likelihood scores of all the phonemes
                   for the two age groups shows that there is a systematic
                   mismatch in the acoustic space of the two age groups.
                   Comparison of the phoneme recognition rates shows that
                   mid vowels, nasals and phonemes that depend on the
                   ability to create constrictions with the tongue tip for
                   articulation are more affected by ageing than other
                   phonemes.},
  doi = {10.1155/2010/525783},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-eurasip10.pdf},
  url = {http://dx.doi.org/10.1155/2010/525783},
  year = 2010
}
@article{frankel07:ldm,
  author = {Frankel, J. and King, S.},
  title = {Speech Recognition using Linear Dynamic Models},
  journal = {IEEE {T}ransactions on {S}peech and {A}udio
                   {P}rocessing},
  volume = 15,
  number = 1,
  pages = {246--256},
  abstract = {The majority of automatic speech recognition (ASR)
                   systems rely on hidden Markov models, in which Gaussian
                   mixtures model the output distributions associated with
                   sub-phone states. This approach, whilst successful,
                   models consecutive feature vectors (augmented to
                   include derivative information) as statistically
                   independent. Furthermore, spatial correlations present
                   in speech parameters are frequently ignored through the
                   use of diagonal covariance matrices. This paper
                   continues the work of Digalakis and others who proposed
                   instead a first-order linear state-space model which
                   has the capacity to model underlying dynamics, and
                   furthermore give a model of spatial correlations. This
                   paper examines the assumptions made in applying such a
                   model and shows that the addition of a hidden dynamic
                   state leads to increases in accuracy over otherwise
                   equivalent static models. We also propose a
                   time-asynchronous decoding strategy suited to
                   recognition with segment models. We describe
                   implementation of decoding for linear dynamic models
                   and present TIMIT phone recognition results.},
  categories = {am,asr,ldm,timit,search,edinburgh},
  month = {January},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_King_IEEE2007.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_King_IEEE2007.ps},
  year = 2007
}
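
The LDM scores a segment with the standard Kalman-filter forward pass. A
minimal sketch of that likelihood computation; initialisation and parameter
values are illustrative:

    import numpy as np

    def ldm_loglik(obs, F, Q, H, R, x0, P0):
        """x_t = F x_{t-1} + w, w ~ N(0,Q); o_t = H x_t + v, v ~ N(0,R)."""
        x, P, ll = x0, P0, 0.0
        for o in obs:
            x, P = F @ x, F @ P @ F.T + Q          # predict
            S = H @ P @ H.T + R                    # innovation covariance
            e = o - H @ x                          # innovation
            ll -= 0.5 * (np.log(np.linalg.det(2 * np.pi * S)) + e @ np.linalg.solve(S, e))
            K = P @ H.T @ np.linalg.inv(S)         # Kalman gain
            x, P = x + K @ e, P - K @ H @ P        # update
        return ll
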
@inproceedings{wang_std_covariance_icassp2010,
  author = {Wang, Dong and King, Simon and Frankel, Joe and Bell,
                   Peter},
  title = {Stochastic Pronunciation Modelling and Soft Match for
                   Out-of-vocabulary Spoken Term Detection},
  booktitle = {Proc. ICASSP},
  address = {Dallas, Texas, USA},
  abstract = {A major challenge faced by a spoken term detection
                   (STD) system is the detection of out-of-vocabulary
                   (OOV) terms. Although a subword-based STD system is
                   able to detect OOV terms, performance reduction is
                   always observed compared to in-vocabulary terms. One
                   challenge that OOV terms bring to STD is pronunciation
                   uncertainty. One commonly used approach to address this
                   problem is a soft matching procedure; another is the
                   stochastic pronunciation modelling (SPM) previously
                   proposed by the authors. In this paper we compare
                   these two approaches, and combine them using a
                   discriminative decision strategy. Experimental results
                   demonstrated that SPM and soft match are highly
                   complementary, and their combination gives significant
                   performance improvement to OOV term detection.},
  keywords = {confidence estimation, spoken term detection, speech
                   recognition},
  month = mar,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang10_icassp.pdf},
  year = 2010
}
@inproceedings{livescu07:manual,
  author = {Livescu, K. and Bezman, A. and Borges, N. and Yung, L.
                   and Çetin, Ö. and Frankel, J. and King, S. and
                   Magimai-Doss, M. and Chi, X. and Lavoie, L.},
  title = {Manual transcription of conversational speech at the
                   articulatory feature level},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  abstract = {We present an approach for the manual labeling of
                   speech at the articulatory feature level, and a new set
                   of labeled conversational speech collected using this
                   approach. A detailed transcription, including
                   overlapping or reduced gestures, is useful for studying
                   the great pronunciation variability in conversational
                   speech. It also facilitates the testing of feature
                   classifiers, such as those used in articulatory
                   approaches to automatic speech recognition. We describe
                   an effort to transcribe a small set of utterances drawn
                   from the Switchboard database using eight articulatory
                   tiers. Two transcribers have labeled these utterances
                   in a multi-pass strategy, allowing for correction of
                   errors. We describe the data collection methods and
                   analyze the data to determine how quickly and reliably
                   this type of transcription can be done. Finally, we
                   demonstrate one use of the new data set by testing a
                   set of multilayer perceptron feature classifiers against
                   both the manual labels and forced alignments.},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/livescu_icassp07_trans.pdf},
  year = 2007
}
@inproceedings{frankel00:NN_LDM,
  author = {Frankel, J. and Richmond, K. and King, S. and Taylor,
                   P.},
  title = {An automatic speech recognition system using neural
                   networks and linear dynamic models to recover and model
                   articulatory traces},
  booktitle = {Proc. {ICSLP}},
  abstract = {In this paper we describe a speech recognition system
                   using linear dynamic models and articulatory features.
                   Experiments are reported in which measured articulation
                   from the MOCHA corpus has been used, along with those
                   where the articulatory parameters are estimated from
                   the speech signal using a recurrent neural network.},
  categories = {am,artic,asr,ldm,mocha,edinburgh,inversion,ann},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Frankel_et_al_ICSLP2000.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Frankel_et_al_ICSLP2000.ps},
  year = 2000
}
@inproceedings{joe_dong_simon_interspeech08_bottle,
  author = {Joe Frankel and Dong Wang and Simon King},
  title = {Growing bottleneck features for tandem {ASR}},
  booktitle = {Proc. Interspeech},
  pages = {1549},
  abstract = { We present a method for training bottleneck MLPs for
                   use in tandem ASR. Experiments on meetings data show
                   that this approach leads to improved performance
                   compared with training MLPs from a random
                   initialization. },
  categories = {tandem ASR, bottleneck MLP},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bottlenet.a.pdf},
  year = 2008
}
@inproceedings{dongwang_interspeech09_cmb,
  author = {Javier Tejedor and Dong Wang and Simon King and Joe
                   Frankel and Jose Colas},
  title = {A Posterior Probability-based System Hybridisation and
                   Combination for Spoken Term Detection},
  booktitle = {Proc. Interspeech},
  pages = {2131--2134},
  address = {Brighton, UK},
  abstract = {Spoken term detection (STD) is a fundamental task for
                   multimedia information retrieval. To improve the
                   detection performance, we have presented a direct
                   posterior-based confidence measure generated from a
                   neural network. In this paper, we propose a
                   detection-independent confidence estimation based on
                   the direct posterior confidence measure, in which the
                   decision making is totally separated from the term
                   detection. Based on this idea, we first present a
                   hybrid system which conducts the term detection and
                   confidence estimation based on different sub-word
                   units, and then propose a combination method which
                   merges detections from heterogeneous term detectors
                   based on the direct posterior-based confidence.
                   Experimental results demonstrated that the proposed
                   methods improved system performance considerably for
                   both English and Spanish. },
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cmb.pdf},
  year = 2009
}
@inproceedings{toth:frankel:goztolya:king:interspeech2008,
  author = {Laszlo Toth and Joe Frankel and Gabor Gosztolya and
                   Simon King},
  title = {Cross-lingual Portability of MLP-Based Tandem Features
                   -- A Case Study for English and Hungarian},
  booktitle = {Proc. Interspeech},
  pages = {2695--2698},
  address = {Brisbane, Australia},
  abstract = {One promising approach for building ASR systems for
                   less-resourced languages is cross-lingual adaptation.
                   Tandem ASR is particularly well suited to such
                   adaptation, as it includes two cascaded modelling
                   steps: feature extraction using multi-layer perceptrons
                   (MLPs), followed by modelling using a standard HMM. The
                   language-specific tuning can be performed by adjusting
                   the HMM only, leaving the MLP untouched. Here we
                   examine the portability of feature extractor MLPs
                   between an Indo-European (English) and a Finno-Ugric
                   (Hungarian) language. We present experiments which use
                   both conventional phone-posterior and articulatory
                   feature (AF) detector MLPs, both trained on a much
                   larger quantity of (English) data than the monolingual
                   (Hungarian) system. We find that the cross-lingual
                   configurations achieve similar performance to the
                   monolingual system, and that, interestingly, the AF
                   detectors lead to slightly worse performance, despite
                   the expectation that they should be more
                   language-independent than phone-based MLPs. However,
                   the cross-lingual system outperforms all other
                   configurations when the English phone MLP is adapted on
                   the Hungarian data. },
  keywords = {tandem, ASR},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080729.PDF},
  year = 2008
}
@article{tejedor:wang:frankel:king:colas:specom2008,
  author = {Javier Tejedor and Dong Wang and Joe Frankel and Simon
                   King and José Colás},
  title = {A comparison of grapheme and phoneme-based units for
                   {S}panish spoken term detection},
  journal = {Speech Communication},
  volume = {50},
  number = {11-12},
  pages = {980--991},
  abstract = {The ever-increasing volume of audio data available
                   online through the world wide web means that automatic
                   methods for indexing and search are becoming essential.
                   Hidden Markov model (HMM) keyword spotting and lattice
                   search techniques are the two most common approaches
                   used by such systems. In keyword spotting, models or
                   templates are defined for each search term prior to
                   accessing the speech and used to find matches. Lattice
                   search (referred to as spoken term detection), uses a
                   pre-indexing of speech data in terms of word or
                   sub-word units, which can then quickly be searched for
                   arbitrary terms without referring to the original
                   audio. In both cases, the search term can be modelled
                   in terms of sub-word units, typically phonemes. For
                   in-vocabulary words (i.e. words that appear in the
                   pronunciation dictionary), reliable pronunciations are
                   available from the dictionary. However, for
                   out-of-vocabulary (OOV) search terms,
                   letter-to-sound conversion must be used to generate a
                   pronunciation for the search term. This is usually a
                   hard decision (i.e. not probabilistic and with no
                   possibility of backtracking), and errors introduced at
                   this step are difficult to recover from. We therefore
                   propose the direct use of graphemes (i.e., letter-based
                   sub-word units) for acoustic modelling. This is
                   expected to work particularly well in languages such as
                   Spanish, where despite the letter-to-sound mapping
                   being very regular, the correspondence is not
                   one-to-one, and there will be benefits from avoiding
                   hard decisions at early stages of processing. In this
                   article, we compare three approaches for Spanish
                   keyword spotting or spoken term detection, and within
                   each of these we compare acoustic modelling based on
                   phone and grapheme units. Experiments were performed
                   using the Spanish geographical-domain Albayzin corpus.
                   Results achieved with the two approaches proposed for
                   spoken term detection show that trigrapheme units
                   for acoustic modelling match or exceed the performance
                   of phone-based acoustic models. In the method proposed
                   for keyword spotting, the results achieved with each
                   acoustic model are very similar.},
  categories = {Spoken term detection; Keyword spotting; Graphemes;
                   Spanish},
  doi = {10.1016/j.specom.2008.03.005},
  month = {November-December},
  year = 2008
}
@inproceedings{king00:recognition_syll,
  author = {King, S. and Taylor, P. and Frankel, J. and Richmond,
                   K.},
  title = {Speech recognition via phonetically-featured syllables},
  booktitle = {PHONUS},
  volume = {5},
  pages = {15--34},
  address = {Institute of Phonetics, University of the Saarland},
  abstract = {We describe recent work on two new automatic speech
                   recognition systems. The first part of this paper
                   describes the components of a system based on
                   phonological features (which we call EspressoA) in
                   which the values of these features are estimated from
                   the speech signal before being used as the basis for
                   recognition. In the second part of the paper, another
                   system (which we call EspressoB) is described in which
                   articulatory parameters are used instead of
                   phonological features and a linear dynamical system
                   model is used to perform recognition from automatically
                   estimated values of these articulatory parameters.},
  categories = {am,artic,asr,ldm,phonetic_feature,mocha,timit,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_et_al_Phonus2000.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_et_al_Phonus2000.ps},
  year = 2000
}
@inproceedings{frankel05:hybrid,
  author = {Frankel, J. and King, S.},
  title = {A Hybrid {ANN/DBN} Approach to Articulatory Feature
                   Recognition},
  booktitle = {Proc. Eurospeech},
  address = {Lisbon},
  abstract = {Artificial neural networks (ANN) have proven to be
                   well suited to the task of articulatory feature (AF)
                   recognition. Previous studies have taken a cascaded
                   approach where separate ANNs are trained for each
                   feature group, making the assumption that features are
                   statistically independent. We address this by using
                   ANNs to provide virtual evidence to a dynamic Bayesian
                   network (DBN). This gives a hybrid ANN/DBN model and
                   allows modelling of inter-feature dependencies. We
                   demonstrate significant increases in AF recognition
                   accuracy from modelling dependencies between features,
                   and present the results of embedded training
                   experiments in which a set of asynchronous feature
                   changes are learned. Furthermore, we report on the
                   application of a Viterbi training scheme in which we
                   alternate between realigning the AF training labels and
                   retraining the ANNs.},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/Frankel_King_INTER2005.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/Frankel_King_INTER2005.ps},
  year = 2005
}
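The hybrid above feeds ANN outputs to a DBN as virtual evidence. The
usual recipe in hybrid ANN/graphical-model systems, and a reasonable
guess at the mechanism here (the specific arrays below are invented),
is to divide ANN posteriors by the feature priors, giving likelihoods
up to a constant that the DBN can treat as observation evidence:

    # Sketch: convert ANN posteriors P(q|x) into scaled likelihoods
    # P(x|q) proportional to P(q|x) / P(q), the standard hybrid recipe.
    # Numbers below are invented for illustration.
    import numpy as np

    def scaled_likelihoods(posteriors, priors):
        return posteriors / priors

    post = np.array([0.7, 0.2, 0.1])    # ANN posteriors for one frame
    prior = np.array([0.5, 0.3, 0.2])   # feature-value priors from labels
    print(scaled_likelihoods(post, prior))  # [1.4  0.667  0.5]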
@phdthesis{frankel03:thesis,
  author = {Frankel, J.},
  title = {Linear dynamic models for automatic speech recognition},
  school = {The Centre for Speech Technology Research,
                   University of Edinburgh},
  abstract = {The majority of automatic speech recognition (ASR)
                   systems rely on hidden Markov models (HMM), in which
                   the output distribution associated with each state is
                   modelled by a mixture of diagonal covariance Gaussians.
                   Dynamic information is typically included by appending
                   time-derivatives to feature vectors. This approach,
                   whilst successful, makes the false assumption of
                   framewise independence of the augmented feature vectors
                   and ignores the spatial correlations in the
                   parametrised speech signal. This dissertation seeks to
                   address these shortcomings by exploring acoustic
                   modelling for ASR with an application of a form of
                   state-space model, the linear dynamic model (LDM).
                   Rather than modelling individual frames of data, LDMs
                   characterize entire segments of speech. An
                   auto-regressive state evolution through a continuous
                   space gives a Markovian model of the underlying
                   dynamics, and spatial correlations between feature
                   dimensions are absorbed into the structure of the
                   observation process. LDMs have been applied to speech
                   recognition before; however, a smoothed Gauss-Markov
                   form was used, which ignored the potential for subspace
                   modelling. The continuous dynamical state means that
                   information is passed along the length of each segment.
                   Furthermore, if the state is allowed to be continuous
                   across segment boundaries, long range dependencies are
                   built into the system and the assumption of
                   independence of successive segments is loosened. The
                   state provides an explicit model of temporal
                   correlation which sets this approach apart from
                   frame-based and some segment-based models where the
                   ordering of the data is unimportant. The benefits of
                   such a model are examined both within and between
                   segments. LDMs are well suited to modelling smoothly
                   varying, continuous, yet noisy trajectories such as
                   those found in measured articulatory data. Using
                   speaker-dependent data from the MOCHA corpus, the
                   performance of systems which model acoustic,
                   articulatory, and combined acoustic-articulatory
                   features are compared. As well as measured articulatory
                   parameters, experiments use the output of neural
                   networks trained to perform an articulatory inversion
                   mapping. The speaker-independent TIMIT corpus provides
                   the basis for larger scale acoustic-only experiments.
                   Classification tasks provide an ideal means to compare
                   modelling choices without the confounding influence of
                   recognition search errors, and are used to explore
                   issues such as choice of state dimension, front-end
                   acoustic parametrization and parameter initialization.
                   Recognition for segment models is typically more
                   computationally expensive than for frame-based models.
                   Unlike with frame-level models, it is not always
                   possible to share likelihood calculations for
                   observation sequences
                   which occur within hypothesized segments that have
                   different start and end times. Furthermore, the Viterbi
                   criterion is not necessarily applicable at the frame
                   level. This work introduces a novel approach to
                   decoding for segment models in the form of a stack
                   decoder with $A^*$ search. Such a scheme allows
                   flexibility in the choice of acoustic and language
                   models since the Viterbi criterion is not integral to
                   the search, and hypothesis generation is independent of
                   the particular language model. Furthermore, the
                   time-asynchronous ordering of the search means that
                   only likely paths are extended, and so a minimum number
                   of models are evaluated. The decoder is used to give
                   full recognition results for feature-sets derived from
                   the MOCHA and TIMIT corpora. Conventional train/test
                   divisions and choice of language model are used so that
                   results can be directly compared to those in other
                   studies. The decoder is also used to implement Viterbi
                   training, in which model parameters are alternately
                   updated and then used to re-align the training data.},
  categories = {am,artic,asr,ldm,mocha,timit,search,edinburgh},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Frankel_thesis2003.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Frankel_thesis2003.ps},
  year = 2003
}
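The thesis abstract above describes the LDM as an auto-regressive
continuous state with a linear observation process. A minimal sketch of
such a model's segment log-likelihood, computed with the textbook
Kalman-filter prediction-error decomposition (all parameter values are
invented; this is the standard recipe, not code from the thesis):

    # Sketch of an LDM in state-space form,
    #   x_t = F x_{t-1} + w_t,  w_t ~ N(0, Q)   (state evolution)
    #   y_t = H x_t     + v_t,  v_t ~ N(0, R)   (observation process)
    # with the log-likelihood of a segment Y (T x d) computed by
    # Kalman filtering. Parameters below are invented for illustration.
    import numpy as np

    def ldm_loglik(Y, F, Q, H, R, x, P):
        ll, d = 0.0, Y.shape[1]
        for y in Y:
            x, P = F @ x, F @ P @ F.T + Q           # predict
            e = y - H @ x                           # innovation
            S = H @ P @ H.T + R                     # innovation covariance
            ll += -0.5 * (d * np.log(2 * np.pi)
                          + np.linalg.slogdet(S)[1]
                          + e @ np.linalg.solve(S, e))
            K = P @ H.T @ np.linalg.inv(S)          # Kalman gain
            x, P = x + K @ e, P - K @ H @ P         # update
        return ll

    rng = np.random.default_rng(0)
    F, Q = np.array([[0.9, 0.1], [0.0, 0.8]]), 0.1 * np.eye(2)
    H, R = rng.standard_normal((3, 2)), 0.2 * np.eye(3)
    Y = rng.standard_normal((10, 3))
    print(ldm_loglik(Y, F, Q, H, R, np.zeros(2), np.eye(2)))

Carrying x and P over from one segment to the next, rather than
re-initialising them per segment, corresponds to the
continuous-across-boundaries state discussed in the abstract.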
@article{king07:JASA2007,
  author = {King, S. and Frankel, J. and Livescu, K. and
                   McDermott, E. and Richmond, K. and Wester, M.},
  title = {Speech production knowledge in automatic speech
                   recognition},
  journal = {Journal of the Acoustical Society of America},
  volume = 121,
  number = 2,
  pages = {723--742},
  abstract = {Although much is known about how speech is produced,
                   and research into speech production has resulted in
                   measured articulatory data, feature systems of
                   different kinds, and numerous models, speech production
                   knowledge is almost totally ignored in current
                   mainstream approaches to automatic speech recognition.
                   Representations of speech production allow simple
                   explanations for many phenomena observed in speech
                   which cannot be easily analyzed from either the
                   acoustic signal or the phonetic transcription
                   alone. In this
                   article, we provide a survey of a growing body of work
                   in which such representations are used to improve
                   automatic speech recognition.},
  month = feb,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/King_et_al_review.pdf},
  year = 2007
}