Publications by Peter Bell
s0566164.bib
% UEDIN system description for the IWSLT 2012 evaluation (ASR, MT and SLT tracks).
@inproceedings{hasler2012,
  author    = {Hasler, Eva and Bell, Peter and Ghoshal, Arnab and Haddow, Barry and Koehn, Philipp and McInnes, Fergus and Renals, Steve and Swietojanski, Pawel},
  title     = {The {UEDIN} system for the {IWSLT} 2012 evaluation},
  booktitle = {Proc. International Workshop on Spoken Language Translation},
  year      = {2012},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/paper_50.pdf},
  abstract  = {This paper describes the University of Edinburgh (UEDIN) systems for the IWSLT 2012 Evaluation. We participated in the ASR (English), MT (English-French, German-English) and SLT (English-French) tracks.}
}
% ICASSP 2013 paper on grapheme-based KL-HMM speech recognition for Scottish Gaelic.
% Proper nouns in the title are braced as whole words so that styles which
% downcase titles keep their capitals without splitting the word mid-way.
@inproceedings{rasipuram13_gaelic_graphemes,
  author    = {Rasipuram, Ramya and Bell, Peter and Magimai.-Doss, Mathew},
  title     = {Grapheme and multilingual posterior features for under-resourced speech recognition: a study on {Scottish} {Gaelic}},
  booktitle = {Proc. ICASSP},
  address   = {Vancouver, Canada},
  month     = may,
  year      = 2013,
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/gaelic_graphemes_icassp13.pdf},
  abstract  = {Standard automatic speech recognition (ASR) systems use
    phonemes as subword units. Thus, one of the primary
    resources required to build a good ASR system is a
    well developed phoneme pronunciation
    lexicon. However, under-resourced languages
    typically lack such lexical resources. In this
    paper, we investigate recently proposed
    grapheme-based ASR in the framework of
    Kullback-Leibler divergence based hidden Markov
    model (KL-HMM) for under-resourced languages,
    particularly Scottish Gaelic which has no lexical
    resources. More specifically, we study the use of
    grapheme and multilingual phoneme class conditional
    probabilities (posterior features) as feature
    observations in the KL-HMM. ASR studies conducted show
    that the proposed approach yields better system
    compared to the conventional HMM/GMM approach using
    cepstral features. Furthermore, grapheme posterior
    features estimated using both auxiliary data and
    Gaelic data yield the best system.}
}
% PhD thesis (University of Edinburgh, 2010) on full covariance modelling for
% speech recognition.
@phdthesis{bell_phd_thesis,
  author   = {Bell, Peter},
  title    = {Full covariance modelling for speech recognition},
  school   = {University of Edinburgh},
  year     = {2010},
  pdf      = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/thesis.pdf},
  abstract = {HMM-based systems for Automatic Speech Recognition typically model the
    acoustic features using mixtures of multivariate Gaussians. In this
    thesis, we consider the problem of learning a suitable covariance
    matrix for each Gaussian. A variety of schemes have been proposed for
    controlling the number of covariance parameters per Gaussian, and
    studies have shown that in general, the greater the number of
    parameters used in the models, the better the recognition performance.
    We therefore investigate systems with full covariance Gaussians.
    However, in this case, the obvious choice of parameters -- given by
    the sample covariance matrix -- leads to matrices that are
    poorly-conditioned, and do not generalise well to unseen test data.
    The problem is particularly acute when the amount of training data is
    limited.
    We propose two solutions to this problem: firstly, we impose the
    requirement that each matrix should take the form of a Gaussian
    graphical model, and introduce a method for learning the parameters
    and the model structure simultaneously. Secondly, we explain how an
    alternative estimator, the shrinkage estimator, is preferable to the
    standard maximum likelihood estimator, and derive formulae for the
    optimal shrinkage intensity within the context of a Gaussian mixture
    model. We show how this relates to the use of a diagonal covariance
    smoothing prior.
    We compare the effectiveness of these techniques to standard methods
    on a phone recognition task where the quantity of training data is
    artificially constrained. We then investigate the performance of the
    shrinkage estimator on a large-vocabulary conversational telephone
    speech recognition task.
    Discriminative training techniques can be used to compensate for the
    invalidity of the model correctness assumption underpinning maximum
    likelihood estimation. On the large-vocabulary task, we use
    discriminative training of the full covariance models and diagonal
    priors to yield improved recognition performance.}
}
% ICASSP 2013 paper on multi-level adaptive networks (MLAN) in tandem and
% hybrid ASR systems.
@inproceedings{bell13_mlan,
  author    = {Bell, Peter and Swietojanski, Pawel and Renals, Steve},
  title     = {Multi-level adaptive networks in tandem and hybrid {ASR} systems},
  booktitle = {Proc. ICASSP},
  address   = {Vancouver, Canada},
  month     = may,
  year      = {2013},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/mlan_icassp2013.pdf},
  abstract  = {In this paper we investigate the use of Multi-level adaptive networks (MLAN)
    to incorporate out-of-domain data when training large vocabulary speech recognition
    systems. In a set of experiments on multi-genre broadcast data and on
    TED lecture recordings we present results using of out-of-domain features
    in a hybrid DNN system and explore tandem systems using a variety of input
    acoustic features. Our experiments indicate using the MLAN approach in both
    hybrid and tandem systems results in consistent reductions in word error rate of
    5--10\% relative.}
}
% SIGDIAL 2011 demo-session paper describing the Beetle II tutorial dialogue system.
@inproceedings{dzikovskaSIGDIAL20112,
  author    = {Dzikovska, Myroslava and Isard, Amy and Bell, Peter
    and Moore, Johanna and Steinhauser, Natalie and
    Campbell, Gwendolyn},
  title     = {{Beetle II}: an adaptable tutorial dialogue system},
  booktitle = {Proceedings of the SIGDIAL 2011 Conference, demo
    session},
  pages     = {338--340},
  address   = {Portland, Oregon},
  publisher = {Association for Computational Linguistics},
  month     = jun,
  year      = {2011},
  url       = {http://www.aclweb.org/anthology/W11-2041},
  abstract  = {We present Beetle II, a tutorial dialogue system which
    accepts unrestricted language input and supports
    experimentation with different tutorial planning and
    dialogue strategies. Our first system evaluation
    compared two tutorial policies and demonstrated that
    the system can be used to study the impact of different
    approaches to tutoring. The system is also designed to
    allow experimentation with a variety of natural
    language techniques, and discourse and dialogue
    strategies.}
}
% Interspeech 2008 paper on a shrinkage estimator for full-covariance models.
% The acronym in the title is braced as a whole word (including its plural "s")
% so that case protection does not split the word.
@inproceedings{bell_king_shrinkage_is2008,
  author    = {Bell, Peter and King, Simon},
  title     = {A Shrinkage Estimator for Speech Recognition with Full
    Covariance {HMMs}},
  booktitle = {Proc. Interspeech},
  address   = {Brisbane, Australia},
  month     = sep,
  year      = 2008,
  note      = {Shortlisted for best student paper award.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/shrinkage_is2008.pdf},
  abstract  = {We consider the problem of parameter estimation in
    full-covariance Gaussian mixture systems for automatic
    speech recognition. Due to the high dimensionality of
    the acoustic feature vector, the standard sample
    covariance matrix has a high variance and is often
    poorly-conditioned when the amount of training data is
    limited. We explain how the use of a shrinkage
    estimator can solve these problems, and derive a
    formula for the optimal shrinkage intensity. We present
    results of experiments on a phone recognition task,
    showing that the estimator gives a performance
    improvement over a standard full-covariance system.}
}
% AIED 2011 interactive-event paper on the Beetle II system (LNCS volume 6738).
% Author names use the unambiguous "Last, First" form, matching most other
% entries in this file.
@inproceedings{DBLP:conf/aied/DzikovskaIBMSCTCS11,
  author    = {Dzikovska, Myroslava and Isard, Amy and Bell, Peter and
    Moore, Johanna D. and Steinhauser, Natalie B. and
    Campbell, Gwendolyn E. and Taylor, Leanne S. and
    Caine, Simon and Scott, Charlie},
  title     = {Adaptive Intelligent Tutorial Dialogue in the {Beetle
    II} System},
  booktitle = {Artificial Intelligence in Education - 15th
    International Conference (AIED 2011), interactive event},
  series    = {Lecture Notes in Computer Science},
  volume    = {6738},
  pages     = {621},
  address   = {Auckland, New Zealand},
  publisher = {Springer},
  year      = 2011,
  doi       = {10.1007/978-3-642-21869-9_122}
}
% IEEE SLT 2012 paper on transcribing multi-genre media archives with
% out-of-domain data (introduces MLAN). Given names are recorded as initials only.
@inproceedings{bell12_mlan,
  author    = {Bell, P. and Gales, M. and Lanchantin, P. and Liu, X.
    and Long, Y. and Renals, S. and Swietojanski, P. and
    Woodland, P.},
  title     = {Transcription of multi-genre media archives using
    out-of-domain data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address   = {Miami, Florida, USA},
  month     = dec,
  year      = {2012},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/mlan_slt2012.pdf},
  abstract  = {We describe our work on developing a speech
    recognition system for multi-genre media archives. The
    high diversity of the data makes this a challenging
    recognition task, which may benefit from systems
    trained on a combination of in-domain and out-of-domain
    data. Working with tandem HMMs, we present Multi-level
    Adaptive Networks (MLAN), a novel technique for
    incorporating information from out-of-domain posterior
    features using deep neural networks. We show that it
    provides a substantial reduction in WER over other
    systems, with relative WER reductions of 15\% over a
    PLP baseline, 9\% over in-domain tandem features and
    8\% over the best out-of-domain tandem features.}
}
% Interspeech 2009 paper on term-dependent confidence measures for
% out-of-vocabulary spoken term detection.
% The month uses the standard three-letter macro so the bibliography style
% controls how it is printed; authors use the "Last, First" form.
@inproceedings{dongwang_interspeech09_conf,
  author     = {Wang, Dong and King, Simon and Frankel, Joe and Bell,
    Peter},
  title      = {Term-Dependent Confidence for Out-of-Vocabulary Term
    Detection},
  booktitle  = {Proc. Interspeech},
  pages      = {2139--2142},
  address    = {Brighton, UK},
  month      = sep,
  year       = 2009,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/conf.pdf},
  categories = {joint-multigram, pronunciation model, spoken term
    detection, speech recognition},
  abstract   = {Within a spoken term detection (STD) system, the
    decision maker plays an important role in retrieving
    reliable detections. Most of the state-of-the-art STD
    systems make decisions based on a confidence measure
    that is term-independent, which poses a serious problem
    for out-of-vocabulary (OOV) term detection. In this
    paper, we study a term-dependent confidence measure
    based on confidence normalisation and discriminative
    modelling, particularly focusing on its remarkable
    effectiveness for detecting OOV terms. Experimental
    results indicate that the term-dependent confidence
    provides much more significant improvement for OOV
    terms than terms in-vocabulary.}
}
% Interspeech 2012 paper on a spoken language interface for a tutorial dialogue system.
@inproceedings{bell12_tutoring,
  author    = {Bell, Peter and Dzikovska, Myroslava and Isard, Amy},
  title     = {Designing a spoken language interface for a tutorial
    dialogue system},
  booktitle = {Proc. Interspeech},
  address   = {Portland, Oregon, USA},
  month     = sep,
  year      = {2012},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/tutoring_is2012.pdf},
  abstract  = {We describe our work in building a spoken language
    interface for a tutorial dialogue system. Our goal is
    to allow natural, unrestricted student interaction with
    the computer tutor, which has been shown to improve the
    student's learning gain, but presents challenges for
    speech recognition and spoken language understanding.
    We discuss the choice of system components and present
    the results of development experiments in both acoustic
    and language modelling for speech recognition in this
    domain.}
}
% IEEE SLT 2012 paper on grapheme-based automatic alignment of speech with
% imperfect transcripts.
@inproceedings{stan12_grapheme_alignment,
  author    = {Stan, Adriana and Bell, Peter and King, Simon},
  title     = {A Grapheme-based Method for Automatic Alignment of
    Speech and Text Data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address   = {Miami, Florida, USA},
  month     = dec,
  year      = {2012},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/grapheme_alignment_slt2012.pdf},
  abstract  = {This paper introduces a method for automatic alignment
    of speech data with unsynchronised, imperfect
    transcripts, for a domain where no initial acoustic
    models are available. Using grapheme-based acoustic
    models, word skip networks and orthographic speech
    transcripts, we are able to harvest 55\% of the speech
    with a 93\% utterance-level accuracy and 99\% word
    accuracy for the produced transcriptions. The work is
    based on the assumption that there is a high degree of
    correspondence between the speech and text, and that a
    full transcription of all of the speech is not
    required. The method is language independent and the
    only prior knowledge and resources required are the
    speech and text transcripts, and a few minor user
    interventions.}
}
% ICASSP 2010 paper comparing and combining stochastic pronunciation modelling
% and soft match for out-of-vocabulary spoken term detection.
@inproceedings{wang_std_covariance_icassp2010,
  author    = {Wang, Dong and King, Simon and Frankel, Joe and Bell,
    Peter},
  title     = {Stochastic Pronunciation Modelling and Soft Match for
    Out-of-vocabulary Spoken Term Detection},
  booktitle = {Proc. ICASSP},
  address   = {Dallas, Texas, USA},
  month     = mar,
  year      = 2010,
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang10_icassp.pdf},
  keywords  = {confidence estimation, spoken term detection, speech
    recognition},
  abstract  = {A major challenge faced by a spoken term detection
    (STD) system is the detection of out-of-vocabulary
    (OOV) terms. Although a subword-based STD system is
    able to detect OOV terms, performance reduction is
    always observed compared to in-vocabulary terms. One
    challenge that OOV terms bring to STD is the
    pronunciation uncertainty. A commonly used approach to
    address this problem is a soft matching procedure, and
    the other is the stochastic pronunciation modelling
    (SPM) proposed by the authors. In this paper we compare
    these two approaches, and combine them using a
    discriminative decision strategy. Experimental results
    demonstrated that SPM and soft match are highly
    complementary, and their combination gives significant
    performance improvement to OOV term detection.}
}
% Speech Prosody 2006 paper on adapting HMM-based prosodic phrase boundary
% prediction models. Authors use the unambiguous "Last, First" form.
@inproceedings{bell_burrows_taylor_sp2006,
  author    = {Bell, Peter and Burrows, Tina and Taylor, Paul},
  title     = {Adaptation of Prosodic Phrasing Models},
  booktitle = {Proc. Speech Prosody 2006},
  address   = {Dresden, Germany},
  month     = may,
  year      = 2006,
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/phrasing_sp2006.pdf},
  abstract  = {There is considerable variation in the prosodic
    phrasing of speech between different speakers and
    speech styles. Due to the time and cost of obtaining
    large quantities of data to train a model for every
    variation, it is desirable to develop models that can
    be adapted to new conditions with a limited amount of
    training data. We describe a technique for adapting
    HMM-based phrase boundary prediction models which
    alters a statistic distribution of prosodic phrase
    lengths. The adapted models show improved prediction
    performance across different speakers and types of
    spoken material.}
}
% IEEE ASRU 2009 paper on diagonal smoothing priors for full covariance
% speech recognition.
@inproceedings{bell_king_full_covariance_asru2009,
  author    = {Bell, Peter and King, Simon},
  title     = {Diagonal Priors for Full Covariance Speech Recognition},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
    and Understanding},
  address   = {Merano, Italy},
  month     = dec,
  year      = {2009},
  doi       = {10.1109/ASRU.2009.5373344},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/shrinkage_asru2009.pdf},
  abstract  = {We investigate the use of full covariance Gaussians
    for large-vocabulary speech recognition. The large
    number of parameters gives high modelling power, but
    when training data is limited, the standard sample
    covariance matrix is often poorly conditioned, and has
    high variance. We explain how these problems may be
    solved by the use of a diagonal covariance smoothing
    prior, and relate this to the shrinkage estimator, for
    which the optimal shrinkage parameter may itself be
    estimated from the training data. We also compare the
    use of generatively and discriminatively trained
    priors. Results are presented on a large vocabulary
    conversational telephone speech recognition task.}
}
% EACL 2012 paper on evaluating language understanding accuracy against
% objective outcomes in a dialogue system.
% The month uses the standard three-letter macro so the bibliography style
% controls how it is printed.
@inproceedings{dzikovska-EtAl:2012:EACL2012,
  author    = {Dzikovska, Myroslava O. and Bell, Peter and Isard, Amy
    and Moore, Johanna D.},
  title     = {Evaluating language understanding accuracy with
    respect to objective outcomes in a dialogue system},
  booktitle = {Proceedings of the 13th Conference of the European
    Chapter of the Association for Computational
    Linguistics},
  pages     = {471--481},
  address   = {Avignon, France},
  publisher = {Association for Computational Linguistics},
  month     = apr,
  year      = 2012,
  url       = {http://www.aclweb.org/anthology/E12-1048}
}
% Interspeech 2007 paper on learning sparse Gaussian graphical model structure
% for speech recognition.
% "Gaussian" is braced in the title so that styles which downcase titles keep
% its capital letter.
@inproceedings{bell_king_is2007,
  author     = {Bell, Peter and King, Simon},
  title      = {Sparse {Gaussian} Graphical Models for Speech
    Recognition},
  booktitle  = {Proc. Interspeech 2007},
  address    = {Antwerp, Belgium},
  month      = aug,
  year       = 2007,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sparseGM_is2007.pdf},
  categories = {speech recognition, acoustic models, graphical models,
    precision matrix models},
  abstract   = {We address the problem of learning the structure of
    Gaussian graphical models for use in automatic speech
    recognition, a means of controlling the form of the
    inverse covariance matrices of such systems. With
    particular focus on data sparsity issues, we implement
    a method for imposing graphical model structure on a
    Gaussian mixture system, using a convex optimisation
    technique to maximise a penalised likelihood
    expression. The results of initial experiments on a
    phone recognition task show a performance improvement
    over an equivalent full-covariance system.}
}
% Interspeech 2008 paper proposing an alternative variance update for
% Constrained Line Search discriminative training.
@inproceedings{bell_king_lineSearch_is2008,
  author    = {Bell, Peter and King, Simon},
  title     = {Covariance Updates for Discriminative Training by
    Constrained Line Search},
  booktitle = {Proc. Interspeech},
  address   = {Brisbane, Australia},
  month     = sep,
  year      = {2008},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/lineSearch_is2008.pdf},
  abstract  = {We investigate the recent Constrained Line Search
    algorithm for discriminative training of HMMs and
    propose an alternative formula for variance update. We
    compare the method to standard techniques on a phone
    recognition task.}
}