The Centre for Speech Technology Research, The University of Edinburgh

Publications by Peter Bell

s0566164.bib

@inproceedings{dzikovskaSIGDIAL20112,
  author    = {Dzikovska, Myroslava and Isard, Amy and Bell, Peter and
               Moore, Johanna and Steinhauser, Natalie and
               Campbell, Gwendolyn},
  title     = {{Beetle II}: an adaptable tutorial dialogue system},
  booktitle = {Proceedings of the SIGDIAL 2011 Conference, demo session},
  publisher = {Association for Computational Linguistics},
  address   = {Portland, Oregon},
  month     = jun,
  year      = {2011},
  pages     = {338--340},
  url       = {http://www.aclweb.org/anthology/W11-2041},
  abstract  = {We present Beetle II, a tutorial dialogue system which accepts
               unrestricted language input and supports experimentation with
               different tutorial planning and dialogue strategies. Our first
               system evaluation compared two tutorial policies and
               demonstrated that the system can be used to study the impact
               of different approaches to tutoring. The system is also
               designed to allow experimentation with a variety of natural
               language techniques, and discourse and dialogue strategies.}
}
@inproceedings{bell_king_shrinkage_is2008,
  author    = {Bell, Peter and King, Simon},
  title     = {A Shrinkage Estimator for Speech Recognition with Full
               Covariance {HMM}s},
  booktitle = {Proc. Interspeech},
  address   = {Brisbane, Australia},
  month     = sep,
  year      = {2008},
  note      = {Shortlisted for best student paper award.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/shrinkage_is2008.pdf},
  abstract  = {We consider the problem of parameter estimation in
               full-covariance Gaussian mixture systems for automatic speech
               recognition. Due to the high dimensionality of the acoustic
               feature vector, the standard sample covariance matrix has a
               high variance and is often poorly-conditioned when the amount
               of training data is limited. We explain how the use of a
               shrinkage estimator can solve these problems, and derive a
               formula for the optimal shrinkage intensity. We present
               results of experiments on a phone recognition task, showing
               that the estimator gives a performance improvement over a
               standard full-covariance system.}
}
@inproceedings{DBLP:conf/aied/DzikovskaIBMSCTCS11,
  author    = {Dzikovska, Myroslava and Isard, Amy and Bell, Peter and
               Moore, Johanna D. and Steinhauser, Natalie B. and
               Campbell, Gwendolyn E. and Taylor, Leanne S. and
               Caine, Simon and Scott, Charlie},
  title     = {Adaptive Intelligent Tutorial Dialogue in the {Beetle II}
               System},
  booktitle = {Artificial Intelligence in Education - 15th International
               Conference (AIED 2011), interactive event},
  series    = {Lecture Notes in Computer Science},
  volume    = {6738},
  publisher = {Springer},
  address   = {Auckland, New Zealand},
  year      = {2011},
  pages     = {621},
  doi       = {10.1007/978-3-642-21869-9_122}
}
@inproceedings{bell12_mlan,
  author    = {Bell, P. and Gales, M. and Lanchantin, P. and Liu, X. and
               Long, Y. and Renals, S. and Swietojanski, P. and
               Woodland, P.},
  title     = {Transcription of multi-genre media archives using
               out-of-domain data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address   = {Miami, Florida, USA},
  month     = dec,
  year      = {2012},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/mlan_slt2012.pdf},
  abstract  = {We describe our work on developing a speech recognition system
               for multi-genre media archives. The high diversity of the data
               makes this a challenging recognition task, which may benefit
               from systems trained on a combination of in-domain and
               out-of-domain data. Working with tandem HMMs, we present
               Multi-level Adaptive Networks (MLAN), a novel technique for
               incorporating information from out-of-domain posterior
               features using deep neural networks. We show that it provides
               a substantial reduction in WER over other systems, with
               relative WER reductions of 15\% over a PLP baseline, 9\% over
               in-domain tandem features and 8\% over the best out-of-domain
               tandem features.}
}
@inproceedings{dongwang_interspeech09_conf,
  author     = {Wang, Dong and King, Simon and Frankel, Joe and Bell, Peter},
  title      = {Term-Dependent Confidence for Out-of-Vocabulary Term
                Detection},
  booktitle  = {Proc. Interspeech},
  address    = {Brighton, UK},
  month      = sep,
  year       = {2009},
  pages      = {2139--2142},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/conf.pdf},
  categories = {joint-multigram, pronunciation model, spoken term detection,
                speech recognition},
  abstract   = {Within a spoken term detection (STD) system, the decision
                maker plays an important role in retrieving reliable
                detections. Most of the state-of-the-art STD systems make
                decisions based on a confidence measure that is
                term-independent, which poses a serious problem for
                out-of-vocabulary (OOV) term detection. In this paper, we
                study a term-dependent confidence measure based on confidence
                normalisation and discriminative modelling, particularly
                focusing on its remarkable effectiveness for detecting OOV
                terms. Experimental results indicate that the term-dependent
                confidence provides much more significant improvement for OOV
                terms than terms in-vocabulary.}
}
@inproceedings{bell12_tutoring,
  author    = {Bell, Peter and Dzikovska, Myroslava and Isard, Amy},
  title     = {Designing a spoken language interface for a tutorial dialogue
               system},
  booktitle = {Proc. Interspeech},
  address   = {Portland, Oregon, USA},
  month     = sep,
  year      = {2012},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/tutoring_is2012.pdf},
  abstract  = {We describe our work in building a spoken language interface
               for a tutorial dialogue system. Our goal is to allow natural,
               unrestricted student interaction with the computer tutor,
               which has been shown to improve the student's learning gain,
               but presents challenges for speech recognition and spoken
               language understanding. We discuss the choice of system
               components and present the results of development experiments
               in both acoustic and language modelling for speech recognition
               in this domain.}
}
@inproceedings{stan12_grapheme_alignment,
  author    = {Stan, Adriana and Bell, Peter and King, Simon},
  title     = {A Grapheme-based Method for Automatic Alignment of Speech and
               Text Data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address   = {Miami, Florida, USA},
  month     = dec,
  year      = {2012},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/grapheme_alignment_slt2012.pdf},
  abstract  = {This paper introduces a method for automatic alignment of
               speech data with unsynchronised, imperfect transcripts, for a
               domain where no initial acoustic models are available. Using
               grapheme-based acoustic models, word skip networks and
               orthographic speech transcripts, we are able to harvest 55\%
               of the speech with a 93\% utterance-level accuracy and 99\%
               word accuracy for the produced transcriptions. The work is
               based on the assumption that there is a high degree of
               correspondence between the speech and text, and that a full
               transcription of all of the speech is not required. The method
               is language independent and the only prior knowledge and
               resources required are the speech and text transcripts, and a
               few minor user interventions.}
}
@inproceedings{wang_std_covariance_icassp2010,
  author    = {Wang, Dong and King, Simon and Frankel, Joe and Bell, Peter},
  title     = {Stochastic Pronunciation Modelling and Soft Match for
               Out-of-vocabulary Spoken Term Detection},
  booktitle = {Proc. ICASSP},
  address   = {Dallas, Texas, USA},
  month     = mar,
  year      = {2010},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang10_icassp.pdf},
  keywords  = {confidence estimation, spoken term detection, speech
               recognition},
  abstract  = {A major challenge faced by a spoken term detection (STD)
               system is the detection of out-of-vocabulary (OOV) terms.
               Although a subword-based STD system is able to detect OOV
               terms, performance reduction is always observed compared to
               in-vocabulary terms. One challenge that OOV terms bring to STD
               is the pronunciation uncertainty. A commonly used approach to
               address this problem is a soft matching procedure, and the
               other is the stochastic pronunciation modelling (SPM) proposed
               by the authors. In this paper we compare these two approaches,
               and combine them using a discriminative decision strategy.
               Experimental results demonstrated that SPM and soft match are
               highly complementary, and their combination gives significant
               performance improvement to OOV term detection.}
}
@inproceedings{bell_burrows_taylor_sp2006,
  author    = {Bell, Peter and Burrows, Tina and Taylor, Paul},
  title     = {Adaptation of Prosodic Phrasing Models},
  booktitle = {Proc. Speech Prosody 2006},
  address   = {Dresden, Germany},
  month     = may,
  year      = {2006},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/phrasing_sp2006.pdf},
  abstract  = {There is considerable variation in the prosodic phrasing of
               speech between different speakers and speech styles. Due to
               the time and cost of obtaining large quantities of data to
               train a model for every variation, it is desirable to develop
               models that can be adapted to new conditions with a limited
               amount of training data. We describe a technique for adapting
               HMM-based phrase boundary prediction models which alters a
               statistic distribution of prosodic phrase lengths. The adapted
               models show improved prediction performance across different
               speakers and types of spoken material.}
}
@inproceedings{bell_king_full_covariance_asru2009,
  author    = {Bell, Peter and King, Simon},
  title     = {Diagonal Priors for Full Covariance Speech Recognition},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and
               Understanding},
  address   = {Merano, Italy},
  month     = dec,
  year      = {2009},
  doi       = {10.1109/ASRU.2009.5373344},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/shrinkage_asru2009.pdf},
  abstract  = {We investigate the use of full covariance Gaussians for
               large-vocabulary speech recognition. The large number of
               parameters gives high modelling power, but when training data
               is limited, the standard sample covariance matrix is often
               poorly conditioned, and has high variance. We explain how
               these problems may be solved by the use of a diagonal
               covariance smoothing prior, and relate this to the shrinkage
               estimator, for which the optimal shrinkage parameter may
               itself be estimated from the training data. We also compare
               the use of generatively and discriminatively trained priors.
               Results are presented on a large vocabulary conversational
               telephone speech recognition task.}
}
@inproceedings{dzikovska-EtAl:2012:EACL2012,
  author    = {Dzikovska, Myroslava O. and Bell, Peter and Isard, Amy and
               Moore, Johanna D.},
  title     = {Evaluating language understanding accuracy with respect to
               objective outcomes in a dialogue system},
  booktitle = {Proceedings of the 13th Conference of the European Chapter of
               the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Avignon, France},
  month     = apr,
  year      = {2012},
  pages     = {471--481},
  url       = {http://www.aclweb.org/anthology/E12-1048}
}
@inproceedings{bell_king_is2007,
  author     = {Bell, Peter and King, Simon},
  title      = {Sparse {Gaussian} Graphical Models for Speech Recognition},
  booktitle  = {Proc. Interspeech 2007},
  address    = {Antwerp, Belgium},
  month      = aug,
  year       = {2007},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sparseGM_is2007.pdf},
  categories = {speech recognition, acoustic models, graphical models,
                precision matrix models},
  abstract   = {We address the problem of learning the structure of Gaussian
                graphical models for use in automatic speech recognition, a
                means of controlling the form of the inverse covariance
                matrices of such systems. With particular focus on data
                sparsity issues, we implement a method for imposing graphical
                model structure on a Gaussian mixture system, using a convex
                optimisation technique to maximise a penalised likelihood
                expression. The results of initial experiments on a phone
                recognition task show a performance improvement over an
                equivalent full-covariance system.}
}
@inproceedings{bell_king_lineSearch_is2008,
  author    = {Bell, Peter and King, Simon},
  title     = {Covariance Updates for Discriminative Training by Constrained
               Line Search},
  booktitle = {Proc. Interspeech},
  address   = {Brisbane, Australia},
  month     = sep,
  year      = {2008},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/lineSearch_is2008.pdf},
  abstract  = {We investigate the recent Constrained Line Search algorithm
               for discriminative training of HMMs and propose an alternative
               formula for variance update. We compare the method to standard
               techniques on a phone recognition task.}
}