The Centre for Speech Technology Research, The University of Edinburgh

Publications by Steve Renals

srenals.bib

@inproceedings{swi2012_dnn,
  author = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
  title = {Unsupervised cross-lingual knowledge transfer in {DNN-based LVCSR}},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address = {Miami, Florida, USA},
  abstract = {We investigate the use of cross-lingual acoustic data to
                   initialise deep neural network (DNN) acoustic models by
                   means of unsupervised restricted Boltzmann machine (RBM)
                   pretraining. DNNs for German are pretrained using one or
                   all of German, Portuguese, Spanish and Swedish. The DNNs
                   are used in a tandem configuration, where the network
                   outputs are used as features for a hidden Markov model
                   (HMM) whose emission densities are modeled by Gaussian
                   mixture models (GMMs), as well as in a hybrid
                   configuration, where the network outputs are used as the
                   HMM state likelihoods. The experiments show that
                   unsupervised pretraining is more crucial for the hybrid
                   setups, particularly with limited amounts of transcribed
                   training data. More importantly, unsupervised
                   pretraining is shown to be language-independent.},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/ps_slt2012.pdf},
  year = 2012
}
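A minimal sketch (not the authors' code) of the hybrid step described in the abstract above: DNN state posteriors are converted to scaled likelihoods by dividing by state priors before HMM decoding. The shapes and the uniform priors in the demo are illustrative assumptions.

import numpy as np

def scaled_log_likelihoods(posteriors, state_priors, floor=1e-10):
    """Turn DNN state posteriors p(s|x) into scaled likelihoods
    p(x|s)/p(x) = p(s|x)/p(s), as used in hybrid DNN/HMM decoding.

    posteriors:   (T, S) array, rows sum to 1 (softmax outputs).
    state_priors: (S,) array, e.g. relative state frequencies in the
                  force-aligned training data.
    Returns (T, S) log scaled likelihoods for the HMM decoder.
    """
    posteriors = np.maximum(posteriors, floor)
    state_priors = np.maximum(state_priors, floor)
    return np.log(posteriors) - np.log(state_priors)

rng = np.random.default_rng(0)
post = rng.dirichlet(np.ones(5), size=3)   # 3 frames, 5 states (toy)
priors = np.full(5, 0.2)                   # uniform priors for the demo
print(scaled_log_likelihoods(post, priors))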
@incollection{gotoh-lm03,
  author = {Y.~Gotoh and S.~Renals},
  title = {Language Modelling},
  booktitle = {Text and Speech Triggered Information Access},
  editor = {S.~Renals and G.~Grefenstette},
  pages = {78--105},
  abstract = {This is a preprint of a tutorial on statistical
                   language modelling, based on Yoshi Gotoh's course at
                   the \href{http://www.ilsp.gr/testia/testia2000.html}
                   {ELSNET-2000 Summer School} on Text and Speech
                   Triggered Information Access. },
  categories = {ie,lm,bnews,sheffield},
  crossref = {renals-book03},
  year = 2003
}
@misc{turk2010,
  author = {Turk, Alice and Scobbie, James and Geng, Christian and
                   Campbell, Barry and Dickie, Catherine and Dubourg,
                   Eddie and Bard, Ellen Gurman and Hardcastle, William
                   and Hartinger, Mariam and King, Simon and Lickley,
                   Robin and Macmartin, Cedric and Nakai, Satsuki and
                   Renals, Steve and Richmond, Korin and Schaeffler, Sonja
                   and White, Kevin and Wiegand, Ronny and Wrench, Alan},
  title = {An {E}dinburgh speech production facility},
  howpublished = {Poster presented at the 12th Conference on Laboratory
                   Phonology, Albuquerque, New Mexico.},
  month = {July},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ESPF.pdf},
  year = 2010
}
@inproceedings{renals2008,
  author = {Renals, Steve and Hain, Thomas and Bourlard, Hervé},
  title = {Interpretation of Multiparty Meetings: The {AMI} and
                   {AMIDA} Projects},
  booktitle = {IEEE Workshop on Hands-Free Speech Communication and
                   Microphone Arrays, 2008. HSCMA 2008},
  pages = {115--118},
  abstract = {The AMI and AMIDA projects are collaborative EU
                   projects concerned with the automatic recognition and
                   interpretation of multiparty meetings. This paper
                   provides an overview of the advances we have made in
                   these projects with a particular focus on the
                   multimodal recording infrastructure, the publicly
                   available AMI corpus of annotated meeting recordings,
                   and the speech recognition framework that we have
                   developed for this domain.},
  doi = {10.1109/HSCMA.2008.4538700},
  keywords = {AMI corpus; Meetings; evaluation; speech recognition},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/renals2008.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4538666&arnumber=4538700&count=68&index=33},
  year = 2008
}
@article{murray2009,
  author = {Murray, Gabriel and Kleinbauer, Thomas and Poller,
                   Peter and Becker, Tilman and Renals, Steve and Kilgour,
                   Jonathan},
  title = {Extrinsic Summarization Evaluation: A Decision Audit
                   Task},
  journal = {ACM Transactions on Speech and Language Processing},
  volume = {6},
  number = {2},
  pages = {1--29},
  abstract = {In this work we describe a large-scale extrinsic
                   evaluation of automatic speech summarization
                   technologies for meeting speech. The particular task is
                   a decision audit, wherein a user must satisfy a complex
                   information need, navigating several meetings in order
                   to gain an understanding of how and why a given
                   decision was made. We compare the usefulness of
                   extractive and abstractive technologies in satisfying
                   this information need, and assess the impact of
                   automatic speech recognition (ASR) errors on user
                   performance. We employ several evaluation methods for
                   participant performance, including post-questionnaire
                   data, human subjective and objective judgments, and a
                   detailed analysis of participant browsing behavior. We
                   find that while ASR errors affect user satisfaction on
                   an information retrieval task, users can adapt their
                   browsing behavior to complete the task satisfactorily.
                   Results also indicate that users consider extractive
                   summaries to be intuitive and useful tools for browsing
                   multimodal meeting data. We discuss areas in which
                   automatic summarization techniques can be improved in
                   comparison with gold-standard meeting abstracts.},
  doi = {10.1145/1596517.1596518},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/murray-acm09.pdf},
  url = {http://doi.acm.org/10.1145/1596517.1596518},
  year = 2009
}
@inproceedings{hochberg-arpa94,
  author = {M.~Hochberg and S.~Renals and T.~Robinson},
  title = {{Abbot}: The {CUED} hybrid {connectionist/HMM} large
                   vocabulary recognition system},
  booktitle = {Proc. ARPA Spoken Language Technology Workshop},
  pages = {102--105},
  year = 1994
}
@inproceedings{vipperla08,
  author = {Ravichander Vipperla and Steve Renals and Joe Frankel},
  title = {Longitudinal study of {ASR} performance on ageing
                   voices},
  booktitle = {Proc.~Interspeech},
  address = {Brisbane},
  abstract = {This paper presents the results of a longitudinal
                   study of ASR performance on ageing voices. Experiments
                   were conducted on the audio recordings of the
                    proceedings of the Supreme Court of the United States
                    (SCOTUS). Results show that the automatic speech
                    recognition (ASR) word error rates (WERs) for elderly
                    voices are significantly higher than those for adult
                    voices, and that the word error rate increases
                    gradually with the age of the elderly speakers. Maximum
                    likelihood linear regression (MLLR) based speaker
                    adaptation on ageing voices improves the WER, although
                    performance remains considerably worse than for adult
                    voices. Speaker adaptation does, however, reduce the
                    rate at which WER increases during old age.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/vipperla_is08.pdf},
  year = 2008
}
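The WERs reported in this study are the standard Levenshtein-alignment metric. A self-contained sketch of that computation, with an invented example rather than the SCOTUS transcripts:

def word_error_rate(ref, hyp):
    """WER = (substitutions + deletions + insertions) / reference length,
    computed by dynamic-programming edit distance over word sequences."""
    r, h = ref.split(), hyp.split()
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            cost = 0 if r[i - 1] == h[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution or match
    return d[len(r)][len(h)] / max(len(r), 1)

print(word_error_rate("the court will hear the case",
                      "the court here the case"))  # 2 errors / 6 words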
@inproceedings{christensen-icassp05,
  author = {H. Christensen and B. Kolluru and Y. Gotoh and S.
                   Renals},
  title = {Maximum entropy segmentation of broadcast news},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {This paper presents an automatic system for
                   structuring and preparing a news broadcast for
                   applications such as speech summarization, browsing,
                   archiving and information retrieval. This process
                   comprises transcribing the audio using an automatic
                   speech recognizer and subsequently segmenting the text
                   into utterances and topics. A maximum entropy approach
                   is used to build statistical models for both utterance
                   and topic segmentation. The experimental work addresses
                   the effect on performance of the topic boundary
                   detector of three factors: the information sources
                   used, the quality of the ASR transcripts, and the
                   quality of the utterance boundary detector. The results
                    show that the topic segmentation is not affected
                    severely by transcript errors, whereas errors in the
                    utterance segmentation are more devastating.},
  categories = {s3l,summarization,bnews,edinburgh,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/christensen-icassp05.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/christensen-icassp05.ps.gz},
  year = 2005
}
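A maximum entropy model over binary and real-valued boundary features, as used in the paper above, is equivalent to logistic regression. A toy scikit-learn sketch; the feature names and values below are invented, not the paper's feature set.

import numpy as np
from sklearn.linear_model import LogisticRegression

# Stand-in features per candidate boundary: pause duration before the
# word, an LM sentence-start cue, and a cue-word flag (all invented).
X = np.array([
    [1.2, 1, 0],
    [0.1, 0, 0],
    [0.9, 1, 1],
    [0.0, 0, 1],
    [1.5, 1, 0],
    [0.2, 0, 0],
])
y = np.array([1, 0, 1, 0, 1, 0])  # 1 = utterance/topic boundary

# Maximum entropy with these features reduces to logistic regression:
# p(boundary | x) = sigmoid(w.x + b), trained to maximise likelihood.
clf = LogisticRegression().fit(X, y)
print(clf.predict_proba([[1.0, 1, 0]])[:, 1])  # P(boundary)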
@article{zhang-spl2008,
  author = {Le Zhang and Steve Renals},
  title = {Acoustic-Articulatory Modelling with the Trajectory
                   {HMM}},
  journal = {IEEE Signal Processing Letters},
  volume = 15,
  pages = {245--248},
  abstract = {In this letter, we introduce a hidden Markov model
                    (HMM)-based inversion system to recover articulatory
                    movements from speech acoustics. Trajectory HMMs are
                   used as generative models for modelling articulatory
                   data. Experiments on the MOCHA-TIMIT corpus indicate
                   that the jointly trained acoustic-articulatory models
                   are more accurate (lower RMS error) than the separately
                   trained ones, and that trajectory HMM training results
                   in greater accuracy compared with conventional maximum
                   likelihood HMM training. Moreover, the system has the
                   ability to synthesize articulatory movements directly
                   from a textual representation. },
  key = {articulatory inversion},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/zhang-spl.pdf},
  year = 2008
}
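Trajectory-HMM generation rests on solving for the static feature trajectory that best explains static-plus-delta Gaussian parameters. A numpy sketch of that maximum-likelihood solve, simplified to one static dimension and a single delta window (an assumption, not the letter's exact configuration):

import numpy as np

def mlpg(means, variances):
    """Maximum-likelihood trajectory generation under dynamic-feature
    constraints. means/variances: (T, 2) arrays of per-frame
    [static, delta] Gaussian parameters. Returns the static trajectory
    c maximising the likelihood of o = W c, where W stacks identity
    rows and centred-difference delta rows."""
    T = means.shape[0]
    W = np.zeros((2 * T, T))
    for t in range(T):
        W[2 * t, t] = 1.0                      # static row: c_t
        W[2 * t + 1, max(t - 1, 0)] -= 0.5     # delta: (c_{t+1}-c_{t-1})/2
        W[2 * t + 1, min(t + 1, T - 1)] += 0.5
    P = np.diag(1.0 / variances.reshape(-1))   # diagonal precision
    A = W.T.dot(P).dot(W)
    b = W.T.dot(P).dot(means.reshape(-1))
    return np.linalg.solve(A, b)

mu = np.array([[0.0, 0.0], [1.0, 0.5], [2.0, 0.5], [2.0, 0.0]])
print(mlpg(mu, np.ones_like(mu)))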
@inproceedings{ultraxIS2012,
  author = {Richmond, Korin and Renals, Steve},
  title = {Ultrax: An Animated Midsagittal Vocal Tract Display
                   for Speech Therapy},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {Speech sound disorders (SSD) are the most common
                   communication impairment in childhood, and can hamper
                   social development and learning. Current speech therapy
                   interventions rely predominantly on the auditory skills
                   of the child, as little technology is available to
                   assist in diagnosis and therapy of SSDs. Realtime
                   visualisation of tongue movements has the potential to
                   bring enormous benefit to speech therapy. Ultrasound
                   scanning offers this possibility, although its display
                   may be hard to interpret. Our ultimate goal is to
                   exploit ultrasound to track tongue movement, while
                   displaying a simplified, diagrammatic vocal tract that
                   is easier for the user to interpret. In this paper, we
                   outline a general approach to this problem, combining a
                   latent space model with a dimensionality reducing model
                   of vocal tract shapes. We assess the feasibility of
                   this approach using magnetic resonance imaging (MRI)
                   scans to train a model of vocal tract shapes, which is
                   animated using electromagnetic articulography (EMA)
                   data from the same speaker.},
  categories = {Ultrasound, speech therapy, vocal tract visualisation},
  keywords = {Ultrasound, speech therapy, vocal tract visualisation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/RichmondRenalsIS2012.pdf},
  year = 2012
}
@article{gotoh-roysoc00,
  author = {Y.~Gotoh and S.~Renals},
  title = {Information Extraction from Broadcast News},
  journal = {Philosophical Transactions of the Royal Society of
                   London, Series A},
  volume = {358},
  pages = {1295--1310},
  abstract = {This paper discusses the development of trainable
                   statistical models for extracting content from
                   television and radio news broadcasts. In particular we
                   concentrate on statistical finite state models for
                   identifying proper names and other named entities in
                   broadcast speech. Two models are presented: the first
                   models name class information as a word attribute; the
                   second explicitly models both word-word and class-class
                   transitions. A common n-gram based formulation is used
                   for both models. The task of named entity
                   identification is characterized by relatively sparse
                   training data and issues related to smoothing are
                   discussed. Experiments are reported using the
                   DARPA/NIST Hub-4E evaluation for North American
                   Broadcast News.},
  categories = {stobs,ie,lm,bnews,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/rs00-preprint.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/rs00-preprint.ps.gz},
  year = 2000
}
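The first model above treats name-class information as a word attribute, so a smoothed n-gram can be estimated over (word, class) pairs. A toy sketch with invented data and simple linear interpolation standing in for the smoothing schemes the paper discusses:

from collections import Counter

tagged = [("yesterday", "O"), ("president", "O"), ("clinton", "PER"),
          ("visited", "O"), ("london", "LOC"), ("clinton", "PER"),
          ("spoke", "O")]

unigrams = Counter(tagged)
bigrams = Counter(zip(tagged, tagged[1:]))
N = len(tagged)
lam = 0.7  # interpolation weight toward the bigram estimate (invented)

def prob(prev, cur):
    """P(cur | prev) over (word, class) tokens, interpolated with the
    unigram to cope with sparse counts."""
    big = bigrams[(prev, cur)] / max(unigrams[prev], 1)
    uni = unigrams[cur] / N
    return lam * big + (1 - lam) * uni

print(prob(("president", "O"), ("clinton", "PER")))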
@incollection{vipperla2009a,
  author = {Vipperla, Ravi Chander and Wolters, Maria and
                   Georgila, Kallirroi and Renals, Steve},
  title = {Speech Input from Older Users in Smart Environments:
                   Challenges and Perspectives},
  booktitle = {Proc. HCI International: Universal Access in
                   Human-Computer Interaction. Intelligent and Ubiquitous
                   Interaction Environments},
  publisher = {Springer},
  number = {5615},
  series = {Lecture Notes in Computer Science},
  abstract = {Although older people are an important user group for
                   smart environments, there has been relatively little
                   work on adapting natural language interfaces to their
                   requirements. In this paper, we focus on a particularly
                   thorny problem: processing speech input from older
                   users. Our experiments on the MATCH corpus show clearly
                   that we need age-specific adaptation in order to
                   recognize older users' speech reliably. Language models
                   need to cover typical interaction patterns of older
                   people, and acoustic models need to accommodate older
                   voices. Further research is needed into intelligent
                   adaptation techniques that will allow existing large,
                   robust systems to be adapted with relatively small
                   amounts of in-domain, age appropriate data. In
                   addition, older users need to be supported with
                   adequate strategies for handling speech recognition
                   errors.},
  doi = {10.1007/978-3-642-02710-9},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/vipperla2009a.pdf},
  url = {http://www.springerlink.com/content/27r01345r1683251/?p=ad2394d646814db59cf9868b0f74b11e&pi=13},
  year = 2009
}
@incollection{morgan-guyonbook94,
  author = {N.~Morgan and H.~Bourlard and S.~Renals and M.~Cohen
                   and H.~Franco},
  title = {Hybrid neural network/hidden {Markov} model systems
                   for continuous speech recognition},
  booktitle = {Advances in Pattern Recognition Systems using Neural
                   Networks Technologies},
  publisher = {World Scientific Publications},
  editor = {I.~Guyon and P.~S.~P.~Wang},
  volume = {7},
  series = {Series in Machine Perception and Artificial
                   Intelligence},
  year = 1994
}
@inproceedings{koumpis-eurospeech01,
  author = {K.~Koumpis and S.~Renals and M.~Niranjan},
  title = {Extractive Summarization of Voicemail using Lexical
                   and Prosodic Feature Subset Selection},
  booktitle = {Proc. Eurospeech},
  pages = {2377--2380},
  address = {Aalborg, Denmark},
  abstract = {This paper presents a novel data-driven approach to
                   summarizing spoken audio transcripts utilizing lexical
                   and prosodic features. The former are obtained from a
                   speech recognizer and the latter are extracted
                   automatically from speech waveforms. We employ a
                   feature subset selection algorithm, based on ROC
                   curves, which examines different combinations of
                   features at different target operating conditions. The
                   approach is evaluated on the IBM Voicemail corpus,
                   demonstrating that it is possible and desirable to
                   avoid complete commitment to a single best classifier
                   or feature set.},
  categories = {voicemail,summarization,prosody,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/eurospeech01.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/eurospeech01.ps.gz},
  year = 2001
}
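The subset selection described above ranks features by their ROC behaviour. A small sketch that scores each candidate feature by single-feature ROC-AUC on synthetic data; the real lexical and prosodic feature set and operating-point analysis are richer than this.

import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(1)
n = 200
y = rng.integers(0, 2, n)  # 1 = summary-worthy utterance (toy labels)

# Invented stand-ins for lexical/prosodic features.
features = {
    "f0_range":   y * 1.0 + rng.normal(0, 1.0, n),  # informative
    "duration":   y * 0.3 + rng.normal(0, 1.0, n),  # weakly informative
    "noise_feat": rng.normal(0, 1.0, n),            # uninformative
}

# Keep features whose single-feature ROC-AUC clears a chosen threshold.
aucs = {name: roc_auc_score(y, x) for name, x in features.items()}
selected = [name for name, a in sorted(aucs.items(), key=lambda kv: -kv[1])
            if a > 0.6]
print(aucs, selected)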
@inproceedings{cuayahuitletal_interspeech06,
  author = {Heriberto Cuayáhuitl and Steve Renals and Oliver
                   Lemon and Hiroshi Shimodaira},
  title = {Learning Multi-Goal Dialogue Strategies Using
                   Reinforcement Learning With Reduced State-Action Spaces},
  booktitle = {Proc. Interspeech},
  abstract = {Learning dialogue strategies using the reinforcement
                    learning framework is problematic due to its high
                    computational cost. In this paper we propose an
                   algorithm that reduces a state-action space to one
                   which includes only valid state-actions. We performed
                   experiments on full and reduced spaces using three
                   systems (with 5, 9 and 20 slots) in the travel domain
                   using a simulated environment. The task was to learn
                   multi-goal dialogue strategies optimizing single and
                   multiple confirmations. Average results using
                   strategies learnt on reduced spaces reveal the
                    following benefits over full spaces: 1) less computer
                    memory (94\% reduction), 2) faster learning (93\%
                    faster convergence), and 3) better performance (8.4\%
                    fewer time steps and 7.7\% higher reward).},
  categories = {reinforcement learning, spoken dialogue systems},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/rss-icslp2006.pdf},
  year = 2006
}
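A minimal tabular Q-learning sketch in which the Q-table is built only over valid state-actions, in the spirit of the reduction described above. The dialogue states, actions, validity rule and rewards are all invented for illustration.

import random

STATES = range(6)

def valid_actions(s):
    # e.g. closing is only valid once all slots are confirmed (toy rule)
    return ["ask", "confirm"] if s < 5 else ["close"]

def step(s, a):
    if a == "close":
        return s, 10.0, True  # successful completion
    return min(s + (a == "confirm"), 5), -1.0, False

# Q-table over valid state-actions only, not the full product space.
Q = {(s, a): 0.0 for s in STATES for a in valid_actions(s)}
alpha, gamma, eps = 0.5, 0.95, 0.1

for _ in range(2000):
    s, done = 0, False
    while not done:
        acts = valid_actions(s)
        a = random.choice(acts) if random.random() < eps else \
            max(acts, key=lambda a_: Q[(s, a_)])
        s2, r, done = step(s, a)
        target = r if done else r + gamma * max(
            Q[(s2, a_)] for a_ in valid_actions(s2))
        Q[(s, a)] += alpha * (target - Q[(s, a)])
        s = s2
print(sorted(Q.items()))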
@inproceedings{hochberg-icslp94,
  author = {M.~Hochberg and S.~Renals and T.~Robinson and
                   D.~Kershaw},
  title = {Large vocabulary continuous speech recognition using a
                   hybrid {connectionist/HMM} system},
  booktitle = {Proc. ICSLP},
  pages = {1499--1502},
  address = {Yokohama},
  year = 1994
}
@inproceedings{zwyssig2010,
  author = {Zwyssig, Erich and Lincoln, Mike and Renals, Steve},
  title = {A Digital Microphone Array for Distant Speech
                   Recognition},
  booktitle = {Proc. IEEE ICASSP--10},
  pages = {5106--5109},
  abstract = {In this paper, the design, implementation and testing
                   of a digital microphone array is presented. The array
                   uses digital MEMS microphones which integrate the
                   microphone, amplifier and analogue to digital converter
                   on a single chip in place of the analogue microphones
                   and external audio interfaces currently used. The
                   device has the potential to be smaller, cheaper and
                   more flexible than typical analogue arrays, however the
                   effect on speech recognition performance of using
                   digital microphones is as yet unknown. In order to
                   evaluate the effect, an analogue array and the new
                   digital array are used to simultaneously record test
                   data for a speech recognition experiment. Initial
                   results employing no adaptation show that performance
                   using the digital array is significantly worse (14\%
                   absolute WER) than the analogue device. Subsequent
                   experiments using MLLR and CMLLR channel adaptation
                   reduce this gap, and employing MLLR for both channel
                   and speaker adaptation reduces the difference between
                   the arrays to 4.5\% absolute WER.},
  doi = {10.1109/ICASSP.2010.5495040},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/zwyssig-icassp10.pdf},
  year = 2010
}
@inproceedings{garau-interspeech05,
  author = {G. Garau and S. Renals and T. Hain},
  title = {Applying Vocal Tract Length Normalization to Meeting
                   Recordings},
  booktitle = {Proc. Interspeech},
  abstract = {Vocal Tract Length Normalisation (VTLN) is a commonly
                   used technique to normalise for inter-speaker
                   variability. It is based on the speaker-specific
                   warping of the frequency axis, parameterised by a
                   scalar warp factor. This factor is typically estimated
                   using maximum likelihood. We discuss how VTLN may be
                   applied to multiparty conversations, reporting a
                   substantial decrease in word error rate in experiments
                   using the ICSI meetings corpus. We investigate the
                   behaviour of the VTLN warping factor and show that a
                   stable estimate is not obtained. Instead it appears to
                   be influenced by the context of the meeting, in
                   particular the current conversational partner. These
                   results are consistent with predictions made by the
                   psycholinguistic interactive alignment account of
                   dialogue, when applied at the acoustic and phonological
                   levels.},
  categories = {ami,asr,edinburgh,vtln,speaker
                   adaptation,lvcsr,meetings},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/giuliagarau_eurospeech05.pdf},
  year = 2005
}
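VTLN as used above estimates a scalar warp factor per speaker by maximum likelihood, typically via grid search. A toy numpy sketch with a stand-in warping function and scorer; a real front end warps the filterbank, not a raw spectrum as here.

import numpy as np

def warp_spectrum(spec, alpha):
    """Toy frequency warping: resample the frequency axis of a
    magnitude spectrum by factor alpha."""
    n = len(spec)
    src = np.clip(np.arange(n) * alpha, 0, n - 1)
    return np.interp(src, np.arange(n), spec)

def best_warp(frames, log_likelihood, grid=np.arange(0.88, 1.13, 0.02)):
    """Grid-search the warp factor, keeping the one whose warped
    features score highest under the acoustic model."""
    scores = {a: sum(log_likelihood(warp_spectrum(f, a)) for f in frames)
              for a in grid}
    return max(scores, key=scores.get)

rng = np.random.default_rng(2)
frames = [np.abs(rng.normal(size=64)) * np.linspace(2, 0.1, 64)
          for _ in range(20)]
toy_ll = lambda s: -float(np.sum(s * np.linspace(0, 1, 64)))  # stand-in model
print(best_warp(frames, toy_ll))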
@article{morgan-ijprai93,
  author = {N.~Morgan and H.~Bourlard and S.~Renals and M.~Cohen
                   and H.~Franco},
  title = {Hybrid neural network/hidden {Markov} model systems
                   for continuous speech recognition},
  journal = {Intl. J. Pattern Recog. and Artific. Intell.},
  volume = {7},
  pages = {899--916},
  year = 1993
}
@inproceedings{hochberg-arpa95,
  author = {M.~Hochberg and G.~Cook and S.~Renals and T.~Robinson
                   and R.~Schechtman},
  title = {The 1994 {Abbot} hybrid {connectionist--HMM} large
                   vocabulary recognition system},
  booktitle = {Proc. ARPA Spoken Language Technology Workshop},
  pages = {170--175},
  categories = {wernicke,recognition,wsj,am,hybrid,abbot,eval,search,sheffield,cambridge},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/slt95.ps.gz},
  year = 1995
}
@incollection{alhames-mlmi05,
  author = {M. Al-Hames and A. Dielmann and D. Gatica-Perez and S.
                   Reiter and S. Renals and G. Rigoll and D. Zhang},
  title = {Multimodal Integration for Meeting Group Action
                   Segmentation and Recognition},
  booktitle = {Proc. Multimodal Interaction and Related Machine
                   Learning Algorithms Workshop (MLMI--05)},
  publisher = {Springer},
  editor = {S. Renals and S. Bengio},
  pages = {52--63},
  abstract = {We address the problem of segmentation and recognition
                   of sequences of multimodal human interactions in
                   meetings. These interactions can be seen as a rough
                   structure of a meeting, and can be used either as input
                   for a meeting browser or as a first step towards a
                   higher semantic analysis of the meeting. A common
                   lexicon of multimodal group meeting actions, a shared
                   meeting data set, and a common evaluation procedure
                   enable us to compare the different approaches. We
                   compare three different multimodal feature sets and our
                   modelling infrastructures: a higher semantic feature
                   approach, multi-layer HMMs, a multistream DBN, as well
                   as a multi-stream mixed-state DBN for disturbed data.},
  categories = {m4,ami,multimodal,dbn,meetings,edinburgh,IDIAP,munich},
  year = 2006
}
@inproceedings{jyamagis07:avss2006,
  author = {Junichi Yamagishi and Takao Kobayashi and Steve Renals
                   and Simon King and Heiga Zen and Tomoki Toda and
                   Keiichi Tokuda },
  title = {Improved Average-Voice-based Speech Synthesis Using
                   Gender-Mixed Modeling and a Parameter Generation
                   Algorithm Considering {GV}},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  abstract = {For constructing a speech synthesis system which can
                   achieve diverse voices, we have been developing a
                   speaker independent approach of HMM-based speech
                   synthesis in which statistical average voice models are
                   adapted to a target speaker using a small amount of
                   speech data. In this paper, we incorporate a
                   high-quality speech vocoding method STRAIGHT and a
                   parameter generation algorithm with global variance
                   into the system for improving quality of synthetic
                   speech. Furthermore, we introduce a feature-space
                   speaker adaptive training algorithm and a gender mixed
                   modeling technique for conducting further normalization
                   of the average voice model. We build an English
                   text-to-speech system using these techniques and show
                   the performance of the system.},
  categories = {HMM, speech synthesis, speaker adaptation, HTS},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6-yamagishi.pdf},
  year = 2007
}
@inproceedings{renals2010b,
  author = {Renals, Steve},
  title = {Recognition and Understanding of Meetings},
  booktitle = {Proc. NAACL/HLT},
  pages = {1--9},
  abstract = {This paper is about interpreting human communication
                   in meetings using audio, video and other signals.
                   Automatic meeting recognition and understanding is
                   extremely challenging, since communication in a meeting
                   is spontaneous and conversational, and involves
                   multiple speakers and multiple modalities. This leads
                   to a number of significant research problems in signal
                   processing, in speech recognition, and in discourse
                   interpretation, taking account of both individual and
                   group behaviours. Addressing these problems requires an
                   interdisciplinary effort. In this paper, I discuss the
                   capture and annotation of multimodal meeting recordings
                   - resulting in the AMI meeting corpus - and how we have
                   built on this to develop techniques and applications
                   for the recognition and interpretation of meetings.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/renals-naacl10.pdf},
  year = 2010
}
@incollection{renals-sesimbra90,
  author = {S.~Renals},
  title = {Chaos in neural networks},
  booktitle = {Neural Networks},
  publisher = {Springer-Verlag},
  editor = {L.~B.~Almeida and C.~J.~Wellekens},
  number = {412},
  series = {Lecture Notes in Computer Science},
  pages = {90--99},
  year = 1990
}
@inproceedings{renals-ijcnn89,
  author = {S.~Renals and R.~Rohwer},
  title = {Phoneme classification experiments using radial basis
                   functions},
  booktitle = {Proc. IJCNN},
  pages = {461--468},
  address = {Washington DC},
  year = 1989
}
@inproceedings{kilgour2010,
  author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
  title = {The {Ambient Spotlight}: Queryless desktop search from
                   meeting speech},
  booktitle = {Proc ACM Multimedia 2010 Workshop SSCS 2010},
  abstract = {It has recently become possible to record any small
                   meeting using a laptop equipped with a plug-and-play
                   USB microphone array. We show the potential for such
                   recordings in a personal aid that allows project
                   managers to record their meetings and, when reviewing
                   them afterwards through a standard calendar interface,
                   to find relevant documents on their computer. This
                   interface is intended to supplement or replace the
                   textual searches that managers typically perform. The
                   prototype, which relies on meeting speech recognition
                   and topic segmentation, formulates and runs desktop
                   search queries in order to present its results.},
  doi = {10.1145/1878101.1878112},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/AmbientSpot.pdf},
  year = 2010
}
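One simple way to formulate queryless desktop searches from a meeting segment is to pick its highest tf-idf terms; a toy sketch of that idea (the prototype's actual query formulation is not detailed in the abstract):

import math
from collections import Counter

def top_query_terms(segment, background, k=5):
    """Rank terms in a topic segment by tf-idf against a background
    collection and return the top k as a search query."""
    N = len(background)
    tf = Counter(segment)
    def idf(t):
        df = sum(1 for d in background if t in d)
        return math.log((N + 1) / (df + 1))
    return sorted(tf, key=lambda t: -tf[t] * idf(t))[:k]

background = [["budget", "meeting", "notes"], ["holiday", "plans"],
              ["meeting", "agenda", "budget"]]
segment = ["prototype", "interface", "budget", "prototype", "design"]
print(top_query_terms(segment, background))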
@incollection{murray2008c,
  author = {Murray, Gabriel and Kleinbauer, Thomas and Poller,
                   Peter and Renals, Steve and Kilgour, Jonathan},
  title = {Extrinsic Summarization Evaluation: A Decision Audit
                   Task},
  booktitle = {Machine Learning for Multimodal Interaction (Proc.
                   MLMI '08)},
  publisher = {Springer},
  number = {5237},
  series = {Lecture Notes in Computer Science},
  pages = {349--361},
  abstract = {In this work we describe a large-scale extrinsic
                   evaluation of automatic speech summarization
                   technologies for meeting speech. The particular task is
                   a decision audit, wherein a user must satisfy a complex
                   information need, navigating several meetings in order
                   to gain an understanding of how and why a given
                   decision was made. We compare the usefulness of
                   extractive and abstractive technologies in satisfying
                   this information need, and assess the impact of
                   automatic speech recognition (ASR) errors on user
                   performance. We employ several evaluation methods for
                   participant performance, including post-questionnaire
                   data, human subjective and objective judgments, and an
                   analysis of participant browsing behaviour.},
  doi = {10.1007/978-3-540-85853-9_32},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008c.pdf},
  year = 2008
}
@article{renals-splett96,
  author = {S.~Renals},
  title = {Phone deactivation pruning in large vocabulary
                   continuous speech recognition},
  journal = {IEEE Signal Processing Letters},
  volume = {3},
  pages = {4--6},
  abstract = {In this letter we introduce a new pruning strategy for
                   large vocabulary continuous speech recognition based on
                   direct estimates of local posterior phone
                   probabilities. This approach is well suited to hybrid
                   connectionist/hidden Markov model systems. Experiments
                   on the Wall Street Journal task using a 20,000 word
                   vocabulary and a trigram language model have
                   demonstrated that phone deactivation pruning can
                   increase the speed of recognition-time search by up to
                   a factor of 10, with a relative increase in error rate
                   of less than 2\%.},
  categories = {wernicke,sprach,recognition,search,wsj,sheffield},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/splett96.ps.gz},
  year = 1996
}
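Phone deactivation pruning amounts to thresholding locally estimated phone posteriors each frame and masking the search. A short numpy sketch; the threshold value is illustrative, not the letter's operating point.

import numpy as np

def active_phones(phone_posteriors, threshold=1e-4):
    """Return a boolean mask of phones the decoder may consider at
    each frame: phones whose local posterior falls below the threshold
    are deactivated and never extended during search.

    phone_posteriors: (T, P) array of per-frame phone posteriors."""
    return phone_posteriors >= threshold

rng = np.random.default_rng(3)
post = rng.dirichlet(np.ones(40), size=5)   # 5 frames, 40 phones (toy)
mask = active_phones(post, threshold=0.01)
print(mask.sum(axis=1))  # number of surviving phones per frame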
@incollection{dielmann-mlmi06,
  author = {A. Dielmann and S. Renals},
  title = {Automatic Dialogue Act Recognition using a Dynamic
                   {Bayesian} Network},
  booktitle = {Proc. Multimodal Interaction and Related Machine
                   Learning Algorithms Workshop (MLMI--06)},
  publisher = {Springer},
  editor = {S. Renals and S. Bengio and J. Fiscus},
  pages = {178--189},
  abstract = {We propose a joint segmentation and classification
                   approach for the dialogue act recognition task on
                   natural multi-party meetings ({ICSI} Meeting Corpus).
                   Five broad DA categories are automatically recognised
                   using a generative Dynamic {Bayesian} Network based
                   infrastructure. Prosodic features and a switching
                   graphical model are used to estimate DA boundaries, in
                   conjunction with a factored language model which is
                   used to relate words and DA categories. This easily
                   generalizable and extensible system promotes a rational
                   approach to the joint DA segmentation and recognition
                   task, and is capable of good recognition performance.},
  categories = {ami,dialogue act,dbn,factored language
                   model,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-mlmi06.pdf},
  year = 2007
}
@inproceedings{hain-interspeech05,
  author = {T. Hain and J. Dines and G. Garau and M. Karafiat and
                   D. Moore and V. Wan and R. Ordelman and S. Renals},
  title = {Transcription of Conference Room Meetings: an
                   Investigation},
  booktitle = {Proc. Interspeech},
  abstract = {The automatic processing of speech collected in
                   conference style meetings has attracted considerable
                   interest with several large scale projects devoted to
                   this area. In this paper we explore the use of various
                   meeting corpora for the purpose of automatic speech
                   recognition. In particular we investigate the
                   similarity of these resources and how to efficiently
                   use them in the construction of a meeting transcription
                    system. The analysis shows distinctive features for
                    each resource; however, the benefit from pooling data
                    suggests that the resources are sufficiently similar
                    to speak of a generic conference meeting domain. In
                    this context
                   this paper also presents work on development for the
                   AMI meeting transcription system, a joint effort by
                   seven sites working on the AMI (augmented multi-party
                   interaction) project.},
  categories = {ami,asr,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hain-eurospeech05.pdf},
  year = 2005
}
@inproceedings{huang2008-is,
  author = {Songfang Huang and Steve Renals},
  title = {Unsupervised Language Model Adaptation Based on Topic
                   and Role Information in Multiparty Meetings},
  booktitle = {Proc. Interspeech'08},
  pages = {833--836},
  address = {Brisbane, Australia},
  abstract = {We continue our previous work on the modeling of topic
                   and role information from multiparty meetings using a
                   hierarchical Dirichlet process (HDP), in the context of
                   language model adaptation. In this paper we focus on
                   three problems: 1) an empirical analysis of the HDP as
                   a nonparametric topic model; 2) the mismatch problem of
                   vocabularies of the baseline n-gram model and the HDP;
                   and 3) an automatic speech recognition experiment to
                   further verify the effectiveness of our adaptation
                   framework. Experiments on a large meeting corpus of
                   more than 70 hours speech data show consistent and
                   significant improvements in terms of word error rate
                   for language model adaptation based on the topic and
                   role information.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/interspeech08.pdf},
  year = 2008
}
@inproceedings{hennebert-eurospeech97,
  author = {J.~Hennebert and C.~Ris and H.~Bourlard and S.~Renals
                   and N.~Morgan},
  title = {Estimation of global posteriors and forward-backward
                   training of hybrid {HMM/ANN} systems},
  booktitle = {Proc. Eurospeech},
  pages = {1951--1954},
  address = {Rhodes},
  abstract = {The results of our research presented in this paper
                   are two-fold. First, an estimation of global
                    posteriors is formalized in the framework of hybrid
                    HMM/ANN systems. It is shown that hybrid HMM/ANN
                    systems, in which the ANN part estimates local
                    posteriors, can be used to model global posteriors. This
                   formalization provides us with a clear theory in which
                   both REMAP and ``classical'' Viterbi trained hybrid
                   systems are unified. Second, a new forward-backward
                   training of hybrid HMM/ANN systems is derived from the
                   previous formulation. Comparisons of performance
                   between Viterbi and forward-backward hybrid systems are
                   presented and discussed.},
  categories = {sprach,am,hybrid,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-remap.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-remap.ps.gz},
  year = 1997
}
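The training scheme above rests on the standard forward-backward (alpha-beta) recursion for state occupation posteriors. A log-domain numpy/scipy sketch; scaled-likelihood and REMAP details from the paper are omitted.

import numpy as np
from scipy.special import logsumexp

def forward_backward(log_b, log_A, log_pi):
    """Compute HMM state occupation posteriors gamma.

    log_b:  (T, S) per-frame state log emission scores.
    log_A:  (S, S) log transition matrix; log_pi: (S,) log initial probs.
    """
    T, S = log_b.shape
    alpha = np.zeros((T, S))
    beta = np.zeros((T, S))
    alpha[0] = log_pi + log_b[0]
    for t in range(1, T):
        alpha[t] = log_b[t] + logsumexp(alpha[t-1][:, None] + log_A, axis=0)
    for t in range(T - 2, -1, -1):
        beta[t] = logsumexp(log_A + (log_b[t+1] + beta[t+1])[None, :], axis=1)
    gamma = alpha + beta
    return np.exp(gamma - logsumexp(gamma, axis=1, keepdims=True))

rng = np.random.default_rng(4)
T, S = 4, 3
print(forward_backward(np.log(rng.dirichlet(np.ones(S), T)),
                       np.log(np.full((S, S), 1.0 / S)),
                       np.log(np.full(S, 1.0 / S))))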
@inproceedings{cook-darpa99,
  author = {G.~Cook and K.~Al-Ghoneim and D.~Ellis and
                   E.~Fosler-Lussier and Y.~Gotoh and B.~Kingsbury and
                   N.~Morgan and S.~Renals and T.~Robinson and G.~Williams},
  title = {The {SPRACH} system for the transcription of broadcast
                   news},
  booktitle = {Proc. DARPA Broadcast News Workshop},
  pages = {161--166},
  abstract = {This paper describes the SPRACH system developed for
                   the 1998 Hub-4E broadcast news evaluation. The system
                   is based on the connectionist-HMM framework and uses
                   both recurrent neural network and multi-layer
                   perceptron acoustic models. We describe both a system
                   designed for the primary transcription hub, and a
                   system for the less-than 10 times real-time spoke. We
                   then describe recent developments to CHRONOS, a
                   time-first stack decoder. We show how these
                   developments have simplified the evaluation system, and
                   led to significant reductions in the error rate of the
                   10x real-time system. We also present a system designed
                   to operate in real-time with negligible search error.},
  categories = {sprach,bnews,recognition,am,hybrid,abbot,search,eval,sheffield},
  http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/darpa99-sprach.html},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-sprach.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-sprach.ps.gz},
  year = 1999
}
@inproceedings{abberley-icassp98,
  author = {D.~Abberley and S.~Renals and G.~Cook},
  title = {Retrieval of broadcast news documents with the {THISL}
                   system},
  booktitle = {Proc. IEEE ICASSP},
  pages = {3781--3784},
  address = {Seattle},
  abstract = {This paper describes a spoken document retrieval
                   system, combining the Abbot large vocabulary continuous
                   speech recognition (LVCSR) system developed by
                   Cambridge University, Sheffield University and
                   SoftSound, and the PRISE information retrieval engine
                   developed by NIST. The system was constructed to enable
                   us to participate in the TREC 6 Spoken Document
                   Retrieval experimental evaluation. Our key aims in this
                    work were to produce a complete system for the SDR
                   task, to investigate the effect of a word error rate of
                   30-50\% on retrieval performance and to investigate the
                   integration of LVCSR and word spotting in a retrieval
                   task.},
  categories = {thisl,bnews,trec,ir,recognition,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icassp98.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icassp98.ps.gz},
  year = 1998
}
@inproceedings{uria2011deep,
  author = {Uria, Benigno and Renals, Steve and Richmond, Korin},
  title = {A Deep Neural Network for Acoustic-Articulatory Speech
                   Inversion},
  booktitle = {Proc. NIPS 2011 Workshop on Deep Learning and
                   Unsupervised Feature Learning},
  address = {Sierra Nevada, Spain},
  abstract = {In this work, we implement a deep belief network for
                   the acoustic-articulatory inversion mapping problem. We
                    find that adding up to 3 hidden layers improves
                    inversion accuracy. We also show that this improvement
                    is due to the higher expressive capability of a deep
                    model and not a consequence of adding more adjustable
                    parameters. Additionally, we show unsupervised
                    pretraining of the system improves its performance in
                    all cases, even for a 1 hidden-layer model. Our
                   implementation obtained an average root mean square
                   error of 0.95 mm on the MNGU0 test dataset, beating all
                   previously published results.},
  month = {December},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/articulatory_inversion.pdf},
  year = 2011
}
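The unsupervised pretraining referred to above greedily stacks restricted Boltzmann machines. A compact one-step contrastive divergence (CD-1) sketch for a single Bernoulli RBM, with toy data and illustrative hyperparameters, not the paper's configuration:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def train_rbm(data, n_hidden, epochs=10, lr=0.1, seed=0):
    """CD-1 training of a Bernoulli RBM, the building block of greedy
    layerwise pretraining for deep networks."""
    rng = np.random.default_rng(seed)
    n_vis = data.shape[1]
    W = rng.normal(0, 0.01, (n_vis, n_hidden))
    a = np.zeros(n_vis)     # visible biases
    b = np.zeros(n_hidden)  # hidden biases
    for _ in range(epochs):
        for v0 in data:
            ph0 = sigmoid(v0.dot(W) + b)
            h0 = (rng.random(n_hidden) < ph0).astype(float)  # sample hidden
            v1 = sigmoid(h0.dot(W.T) + a)                    # reconstruction
            ph1 = sigmoid(v1.dot(W) + b)
            W += lr * (np.outer(v0, ph0) - np.outer(v1, ph1))
            a += lr * (v0 - v1)
            b += lr * (ph0 - ph1)
    return W, a, b

X = (np.random.default_rng(1).random((50, 6)) < 0.3).astype(float)
W, a, b = train_rbm(X, n_hidden=4)
print(W.shape)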
@inproceedings{robinson-icassp95,
  author = {T.~Robinson and J.~Fransen and D.~Pye and J.~Foote and
                   S.~Renals},
  title = {{WSJCAM0}: A {British English} speech corpus for large
                   vocabulary continuous speech recognition},
  booktitle = {Proc. IEEE ICASSP},
  pages = {81--84},
  address = {Detroit},
  year = 1995
}
@article{cuayahuitl2009,
  author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon,
                   Oliver and Shimodaira, Hiroshi},
  title = {Evaluation of a hierarchical reinforcement learning
                   spoken dialogue system},
  journal = {Computer Speech and Language},
  volume = {24},
  number = {2},
  pages = {395--429},
  abstract = {We describe an evaluation of spoken dialogue
                   strategies designed using hierarchical reinforcement
                   learning agents. The dialogue strategies were learnt in
                   a simulated environment and tested in a laboratory
                   setting with 32 users. These dialogues were used to
                   evaluate three types of machine dialogue behaviour:
                   hand-coded, fully-learnt and semi-learnt. These
                   experiments also served to evaluate the realism of
                   simulated dialogues using two proposed metrics
                   contrasted with ‘Precision-Recall’. The learnt
                   dialogue behaviours used the Semi-Markov Decision
                   Process (SMDP) model, and we report the first
                   evaluation of this model in a realistic conversational
                   environment. Experimental results in the travel
                   planning domain provide evidence to support the
                   following claims: (a) hierarchical semi-learnt dialogue
                   agents are a better alternative (with higher overall
                   performance) than deterministic or fully-learnt
                   behaviour; (b) spoken dialogue strategies learnt with
                   highly coherent user behaviour and conservative
                   recognition error rates (keyword error rate of 20\%)
                   can outperform a reasonable hand-coded strategy; and
                   (c) hierarchical reinforcement learning dialogue agents
                   are feasible and promising for the (semi) automatic
                   design of optimized dialogue behaviours in larger-scale
                   systems.},
  doi = {10.1016/j.csl.2009.07.001},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cuayahuitl-csl09.pdf},
  year = 2009
}
@article{renals-specom00,
  author = {S.~Renals and D.~Abberley and D.~Kirby and T.~Robinson},
  title = {Indexing and Retrieval of Broadcast News},
  journal = {Speech Communication},
  volume = {32},
  pages = {5--20},
  abstract = {This paper describes a spoken document retrieval (SDR)
                   system for British and North American Broadcast News.
                   The system is based on a connectionist large vocabulary
                   speech recognizer and a probabilistic information
                   retrieval system. We discuss the development of a
                   realtime Broadcast News speech recognizer, and its
                   integration into an SDR system. Two advances were made
                   for this task: automatic segmentation and statistical
                   query expansion using a secondary corpus. Precision and
                   recall results using the Text Retrieval Conference
                   (TREC) SDR evaluation infrastructure are reported
                   throughout the paper, and we discuss the application of
                   these developments to a large scale SDR task based on
                   an archive of British English broadcast news.},
  categories = {thisl,bnews,trec,ir,recognition,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/specom00-preprint.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/specom00-preprint.ps.gz},
  year = 2000
}
@inproceedings{renals-twente98,
  author = {S.~Renals and D.~Abberley},
  title = {The {THISL} spoken document retrieval system},
  booktitle = {Proc. 14th Twente Workshop on Language Technology},
  pages = {129--140},
  abstract = {THISL is an ESPRIT Long Term Research Project focused
                    on the development and construction of a system to
                    retrieve items from an archive of television and radio
                    news broadcasts. In this paper we outline our spoken
                    document retrieval system, based on the Abbot speech
                    recognizer and a text retrieval system using Okapi
                    term-weighting. The system has been evaluated as part
                   of the TREC-6 and TREC-7 spoken document retrieval
                   evaluations and we report on the results of the TREC-7
                   evaluation based on a document collection of 100 hours
                   of North American broadcast news.},
  categories = {thisl,bnews,trec,ir,recognition,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/twente98.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/twente98.ps.gz},
  year = 1998
}
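The Okapi term-weighting mentioned above is the BM25 family. A self-contained BM25 scoring sketch over a toy collection; k1 and b are conventional defaults, not the system's tuned values.

import math
from collections import Counter

def bm25_score(query, doc, docs, k1=1.2, b=0.75):
    """Okapi BM25 score of one document for a query; 'docs' is the
    collection, used for document frequencies and average length."""
    N = len(docs)
    avgdl = sum(len(d) for d in docs) / N
    tf = Counter(doc)
    score = 0.0
    for term in query:
        df = sum(1 for d in docs if term in d)
        idf = math.log(1 + (N - df + 0.5) / (df + 0.5))
        f = tf[term]
        score += idf * f * (k1 + 1) / (f + k1 * (1 - b + b * len(doc) / avgdl))
    return score

docs = [["clinton", "visits", "london"],
        ["weather", "report", "london"],
        ["sports", "news"]]
print(bm25_score(["london", "clinton"], docs[0], docs))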
@inproceedings{christensen-ecir04,
  author = {H. Christensen and B. Kolluru and Y. Gotoh and S.
                   Renals},
  title = {From text summarisation to style-specific
                   summarisation for broadcast news},
  booktitle = {Proc. ECIR--2004},
  abstract = {In this paper we report on a series of experiments
                   investigating the path from text-summarisation to
                   style-specific summarisation of spoken news stories. We
                   show that the portability of traditional text
                   summarisation features to broadcast news is dependent
                   on the diffusiveness of the information in the
                   broadcast news story. An analysis of two categories of
                   news stories (containing only read speech or some
                   spontaneous speech) demonstrates the importance of the
                   style and the quality of the transcript, when
                   extracting the summary-worthy information content.
                   Further experiments indicate the advantages of doing
                   style-specific summarisation of broadcast news.},
  categories = {s3l,summarization,bnews,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ecir04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ecir04.ps.gz},
  year = 2004
}
@inproceedings{robinson-eurospeech99,
  author = {T.~Robinson and D.~Abberley and D.~Kirby and S.~Renals},
  title = {Recognition, indexing and retrieval of {British}
                   broadcast news with the {THISL} SYSTEM},
  booktitle = {Proc. Eurospeech},
  pages = {1067--1070},
  address = {Budapest},
  abstract = {This paper describes the THISL spoken document
                   retrieval system for British and North American
                   Broadcast News. The system is based on the Abbot large
                   vocabulary speech recognizer and a probabilistic text
                   retrieval system. We discuss the development of a
                   realtime British English Broadcast News system, and its
                   integration into a spoken document retrieval system.
                   Detailed evaluation is performed using a similar North
                   American Broadcast News system, to take advantage of
                   the TREC SDR evaluation methodology. We report results
                   on this evaluation, with particular reference to the
                   effect of query expansion and of automatic segmentation
                   algorithms.},
  categories = {thisl,bnews,trec,ir,recognition,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-thisl.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-thisl.ps.gz},
  year = 1999
}
@inproceedings{carreira-nnsp98,
  author = {M.~Carreira-Perpiñán and S.~Renals},
  title = {Experimental evaluation of latent variable models for
                   dimensionality reduction},
  booktitle = {IEEE Proc. Neural Networks for Signal Processing},
  volume = {8},
  pages = {165--173},
  address = {Cambridge},
  abstract = {We use electropalatographic (EPG) data as a test bed
                    for dimensionality reduction methods based on latent
                    variable modelling, in which an underlying lower
                    dimensional representation is inferred directly from the
                   data. Several models (and mixtures of them) are
                   investigated, including factor analysis and the
                   generative topographic mapping (GTM). Experiments
                   indicate that nonlinear latent variable modelling
                   reveals a low-dimensional structure in the data
                   inaccessible to the investigated linear models.},
  categories = {ml,lv,artic,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/nnsp98.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/nnsp98.ps.gz},
  year = 1998
}
@inproceedings{koumpis-icoin01,
  author = {K.~Koumpis and C.~Ladas and S. Renals},
  title = {An Advanced Integrated Architecture for Wireless
                   Voicemail Retrieval},
  booktitle = {Proc. 15th IEEE International Conference on
                   Information Networking},
  pages = {403--410},
  abstract = {This paper describes an alternative architecture for
                    voicemail data retrieval on the move. It comprises
                    three distinct components: a speech recognizer, a
                   text summarizer and a WAP push service initiator,
                   enabling mobile users to receive a text summary of
                   their voicemail in realtime without an explicit
                   request. Our approach overcomes the cost and usability
                   limitations of the conventional voicemail retrieval
                   paradigm which requires a connection establishment in
                    order to listen to spoken messages. We report
                    performance results for all components of the system,
                    which has been trained on a database containing 1843
                    North American English messages, as well as on the
                    duration of the corresponding data path. The proposed
                   architecture can be further customized to meet the
                   requirements of a complete voicemail value-added
                   service.},
  categories = {voicemail,summarization,sheffield},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/icoin01.ps.gz},
  year = 2001
}
@article{renals-sap94,
  author = {S.~Renals and N.~Morgan and H.~Bourlard and M.~Cohen
                   and H.~Franco},
  title = {Connectionist probability estimators in {HMM} speech
                   recognition},
  journal = {IEEE Trans. on Speech and Audio Processing},
  volume = {2},
  pages = {161--175},
  abstract = {We are concerned with integrating connectionist
                   networks into a hidden Markov model (HMM) speech
                   recognition system. This is achieved through a
                   statistical interpretation of connectionist networks as
                   probability estimators. We review the basis of HMM
                   speech recognition and point out the possible benefits
                   of incorporating connectionist networks. Issues
                   necessary to the construction of a connectionist HMM
                   recognition system are discussed, including choice of
                   connectionist probability estimator. We describe the
                   performance of such a system, using a multi-layer
                   perceptron probability estimator, evaluated on the
                   speaker-independent DARPA Resource Management database.
                   In conclusion, we show that a connectionist component
                   improves a state-of-the-art HMM system.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/sap94.ps.gz},
  year = 1994
}
@inproceedings{renals-trec01,
  author = {S.~Renals and D.~Abberley},
  title = {The {THISL} {SDR} system at {TREC}--9},
  booktitle = {Proc. Ninth Text Retrieval Conference (TREC--9)},
  abstract = {This paper describes our participation in the TREC-9
                   Spoken Document Retrieval (SDR) track. The THISL SDR
                   system consists of a realtime version of a hybrid
                   connectionist/HMM large vocabulary speech recognition
                   system and a probabilistic text retrieval system. This
                   paper describes the configuration of the speech
                   recognition and text retrieval systems, including
                   segmentation and query expansion. We report our results
                   for development tests using the TREC-8 queries, and for
                   the TREC-9 evaluation.},
  categories = {thisl,bnews,trec,ir,recognition,eval,abbot,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/trec9-proc.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/trec9-proc.ps.gz},
  year = 2001
}
@article{carreira-nc00,
  author = {M.~Carreira-Perpiñán and S.~Renals},
  title = {Practical identifiability of finite mixtures of
                   multivariate {Bernoulli} distributions},
  journal = {Neural Computation},
  volume = {12},
  pages = {141--152},
  abstract = {The class of finite mixtures of multivariate Bernoulli
                   distributions is known to be nonidentifiable, i.e.,
                   different values of the mixture parameters can
                   correspond to exactly the same probability
                   distribution. In principle, this would mean that sample
                   estimates using this model would give rise to different
                   interpretations. We give empirical support to the fact
                   that estimation of this class of mixtures can still
                   produce meaningful results in practice, thus lessening
                   the importance of the identifiability problem. We also
                   show that the EM algorithm is guaranteed to converge to
                   a proper maximum likelihood estimate, owing to a
                   property of the log-likelihood surface. Experiments
                   with synthetic data sets show that an original
                   generating distribution can be estimated from a sample.
                   Experiments with an electropalatography (EPG) data set
                   show important structure in the data.},
  categories = {ml,lv,artic,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/nc00-preprint.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/nc00-preprint.ps.gz},
  year = 2000
}
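
For readers who want the shape of the estimation procedure referred to above, here is a minimal EM sketch for a finite mixture of multivariate Bernoulli distributions; it is a textbook implementation, not the authors' code, and all names and data are illustrative.

import numpy as np

def em_bernoulli_mixture(X, K, n_iter=50, rng=np.random.default_rng(0)):
    """EM for a K-component mixture of multivariate Bernoullis.
    X: (N, D) binary data matrix. Returns mixing weights pi and
    per-component Bernoulli parameters theta."""
    N, D = X.shape
    pi = np.full(K, 1.0 / K)                 # mixing proportions
    theta = rng.uniform(0.25, 0.75, (K, D))  # Bernoulli parameters
    for _ in range(n_iter):
        # E-step: responsibilities r[n, k], computed in the log domain
        log_p = X @ np.log(theta).T + (1 - X) @ np.log(1 - theta).T
        log_r = np.log(pi) + log_p
        log_r -= log_r.max(axis=1, keepdims=True)
        r = np.exp(log_r)
        r /= r.sum(axis=1, keepdims=True)
        # M-step: closed-form updates of pi and theta
        Nk = r.sum(axis=0)
        pi = Nk / N
        theta = np.clip((r.T @ X) / Nk[:, None], 1e-6, 1 - 1e-6)
    return pi, theta

X = (np.random.default_rng(1).random((200, 8)) < 0.3).astype(float)
pi, theta = em_bernoulli_mixture(X, K=2)

The convergence guarantee discussed in the abstract concerns exactly this standard iteration; nothing beyond vanilla EM is involved.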
@article{bourlard-specom92,
  author = {H.~Bourlard and N.~Morgan and S.~Renals},
  title = {Neural nets and hidden {Markov} models: Review and
                   generalizations},
  journal = {Speech Communication},
  volume = {11},
  pages = {237--246},
  categories = {},
  year = 1992
}
@incollection{renals-nips94,
  author = {S.~Renals and M.~Hochberg and T.~Robinson},
  title = {Learning temporal dependencies in connectionist speech
                   recognition},
  booktitle = {Advances in Neural Information Processing Systems},
  publisher = {Morgan Kaufmann},
  editor = {J.~D.~Cowan and G.~Tesauro and J.~Alspector},
  volume = {6},
  pages = {1051--1058},
  categories = {},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/1051.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/1051.ps.gz},
  year = 1994
}
@inproceedings{zhang-icslp2006,
  author = {Le Zhang and Steve Renals},
  title = {Phone Recognition Analysis for Trajectory {HMM}},
  booktitle = {Proc. Interspeech 2006},
  address = {Pittsburgh, USA},
  abstract = { The trajectory {HMM} has been shown to be useful for
                   model-based speech synthesis where a smoothed
                   trajectory is generated using temporal constraints
                   imposed by dynamic features. To evaluate the
                   performance of such a model on an ASR task, we present
                   a trajectory decoder based on tree search with delayed
                   path merging. An experiment on a speaker-dependent
                   phone recognition task using the MOCHA-TIMIT database
                   shows that the MLE-trained trajectory model, while
                   retaining the attractive properties of a proper
                   generative model, tends to favour over-smoothed
                   trajectories among competing hypotheses, and does not
                   perform better than a conventional {HMM}. We use this
                   to argue that models giving a better fit to the
                   training data may lose discrimination by being too
                   faithful to it. This partially explains why
                   alternative acoustic models that try to explicitly
                   model temporal constraints do not achieve significant
                   improvements in ASR. },
  key = {asr},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/zhang-icslp2006.pdf},
  year = 2006
}
@inproceedings{dielmann-icassp04,
  author = {A. Dielmann and S. Renals},
  title = {Dynamic {Bayesian} Networks for Meeting Structuring},
  booktitle = {Proc. IEEE ICASSP},
  pages = {},
  abstract = {This paper is about the automatic structuring of
                   multiparty meetings using audio information. We have
                   used a corpus of 53 meetings, recorded using a
                   microphone array and lapel microphones for each
                   participant. The task was to segment meetings into a
                   sequence of meeting actions, or phases. We have adopted
                   a statistical approach using dynamic Bayesian networks
                   (DBNs). Two DBN architectures were investigated: a
                   two-level hidden Markov model (HMM) in which the
                   acoustic observations were concatenated; and a
                   multistream DBN in which two separate observation
                   sequences were modelled. Additionally we have also
                   explored the use of counter variables to constrain the
                   number of action transitions. Experimental results
                   indicate that the DBN architectures are an improvement
                   over a simple baseline HMM, with the multistream DBN
                   with counter constraints producing an action error rate
                   of 6\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.ps.gz},
  year = 2004
}
@inproceedings{gotoh-esca99,
  author = {Y.~Gotoh and S.~Renals},
  title = {Statistical annotation of named entities in spoken
                   audio},
  booktitle = {Proc. ESCA Workshop on Accessing Information In Spoken
                   Audio},
  pages = {43--48},
  address = {Cambridge},
  abstract = {In this paper we describe a stochastic finite state
                   model for named entity (NE) identification, based on
                   explicit word-level n-gram relations. NE categories are
                   incorporated in the model as word attributes. We
                   present an overview of the approach, describing how the
                   extensible vocabulary model may be used for NE
                   identification. We report development and evaluation
                   results on a North American Broadcast News task. This
                   approach resulted in average precision and recall
                   scores of around 83\% on hand transcribed data, and
                   73\% on the SPRACH recogniser output. We also present
                   an error analysis and a comparison of our approach with
                   an alternative statistical approach.},
  categories = {sprach,stobs,ie,lm,bnews,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-ne.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-ne.ps.gz},
  year = 1999
}
@inproceedings{wolters-is:09,
  author = {Wolters, Maria and Vipperla, Ravichander and Renals,
                   Steve},
  title = {Age Recognition for Spoken Dialogue Systems: Do We
                   Need It?},
  booktitle = {Proc. Interspeech},
  abstract = {When deciding whether to adapt relevant aspects of the
                   system to the particular needs of older users, spoken
                   dialogue systems often rely on automatic detection of
                   chronological age. In this paper, we show that vocal
                   ageing as measured by acoustic features is an
                   unreliable indicator of the need for adaptation. Simple
                   lexical features greatly improve the prediction of both
                   relevant aspects of cognition and interaction style.
                   Lexical features also boost age group prediction. We
                   suggest that adaptation should be based on observed
                   behaviour, not on chronological age, unless it is not
                   feasible to build classifiers for relevant adaptation
                   decisions.},
  categories = {age recognition, spoken dialogue systems},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/is09.pdf},
  year = 2009
}
@inproceedings{christensen-prosody01,
  author = {H.~Christensen and Y.~Gotoh and S.~Renals},
  title = {Punctuation Annotation using Statistical Prosody
                   Models},
  booktitle = {Proc. ISCA Workshop on Prosody in Speech Recognition
                   and Understanding},
  pages = {},
  address = {Red Bank, NJ, USA},
  abstract = {This paper is about the development of statistical
                   models of prosodic features to generate linguistic
                   meta-data for spoken language. In particular, we are
                   concerned with automatically punctuating the output of
                   a broadcast news speech recogniser. We present a
                   statistical finite state model that combines prosodic,
                   linguistic and punctuation class features. Experimental
                   results are presented using the Hub-4 Broadcast News
                   corpus, and in the light of our results we discuss the
                   issue of a suitable method of evaluating the present
                   task.},
  categories = {stobs,ie,lm,prosody,bnews,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-punc.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-punc.ps.gz},
  year = 2001
}
@inproceedings{huang2009-is,
  author = {Songfang Huang and Steve Renals},
  title = {A Parallel Training Algorithm for Hierarchical
                   {P}itman-{Y}or Process Language Models},
  booktitle = {Proc. Interspeech'09},
  pages = {2695--2698},
  address = {Brighton, UK},
  abstract = {The Hierarchical Pitman Yor Process Language Model
                   (HPYLM) is a Bayesian language model based on a
                   non-parametric prior, the Pitman-Yor Process. It has
                   been demonstrated, both theoretically and practically,
                   that the HPYLM can provide better smoothing for
                   language modeling, compared with state-of-the-art
                   approaches such as interpolated Kneser-Ney and modified
                   Kneser-Ney smoothing. However, estimation of Bayesian
                   language models is expensive in terms of both
                   computation time and memory; the inference is
                   approximate and requires a number of iterations to
                   converge. In this paper, we present a parallel training
                   algorithm for the HPYLM, which enables the approach to
                   be applied in the context of automatic speech
                   recognition, using large training corpora with large
                   vocabularies. We demonstrate the effectiveness of the
                   proposed algorithm by estimating language models from
                   corpora for meeting transcription containing over 200
                   million words, and observe significant reductions in
                   perplexity and word error rate.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/sh_interspeech09.pdf},
  year = 2009
}
@incollection{murray2008b,
  author = {Murray, Gabriel and Renals, Steve},
  title = {Detecting Action Items in Meetings},
  booktitle = {Machine Learning for Multimodal Interaction (Proc.
                   MLMI '08)},
  publisher = {Springer},
  number = {5237},
  series = {Lecture Notes in Computer Science},
  pages = {208--213},
  abstract = {We present a method for detecting action items in
                   spontaneous meeting speech. Using a supervised approach
                   incorporating prosodic, lexical and structural
                   features, we can classify such items with a high degree
                   of accuracy. We also examine how well various feature
                   subclasses can perform this task on their own.},
  doi = {10.1007/978-3-540-85853-9_19},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008b.pdf},
  url = {http://dx.doi.org/10.1007/978-3-540-85853-9_19},
  year = 2008
}
@inproceedings{rohwer-neuro88,
  author = {R.~Rohwer and S.~Renals},
  title = {Training Recurrent Networks},
  booktitle = {Neural networks from models to applications (Proc.
                   nEuro '88)},
  editor = {L.~Personnaz and G.~Dreyfus},
  pages = {207--216},
  address = {Paris},
  publisher = {I.D.S.E.T.},
  categories = {},
  year = 1988
}
@article{huang2010,
  author = {Huang, Songfang and Renals, Steve},
  title = {Hierarchical {Bayesian} Language Models for
                   Conversational Speech Recognition},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {18},
  number = {8},
  pages = {1941--1954},
  abstract = {Traditional n-gram language models are widely used in
                   state-of-the-art large vocabulary speech recognition
                   systems. This simple model suffers from some
                   limitations, such as overfitting of maximum-likelihood
                   estimation and the lack of rich contextual knowledge
                   sources. In this paper, we exploit a hierarchical
                   Bayesian interpretation for language modeling, based on
                   a nonparametric prior called the Pitman--Yor process.
                   This offers a principled approach to language model
                   smoothing, embedding the power-law distribution for
                   natural language. Experiments on the recognition of
                   conversational speech in multiparty meetings
                   demonstrate that by using hierarchical Bayesian
                   language models, we are able to achieve significant
                   reductions in perplexity and word error rate.},
  doi = {10.1109/TASL.2010.2040782},
  keywords = {AMI corpus , conversational speech recognition ,
                   hierarchical Bayesian model , language model (LM) ,
                   meetings , smoothing},
  month = {January},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-taslp10.pdf},
  url = {http://dx.doi.org/10.1109/TASL.2010.2040782},
  year = 2010
}
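
The Pitman-Yor smoothing described above has a simple recursive predictive form: counts for a context are discounted, and the leftover mass is passed to the shorter-context (parent) distribution. The sketch below uses plain counts with a fixed discount d and strength theta at every level; the full HPYLM replaces counts with sampled seating arrangements and per-level hyperparameters, so treat this only as a shape-of-the-formula illustration.

def py_prob(w, u, counts, vocab_size, d=0.75, theta=1.0):
    """Simplified Pitman-Yor predictive probability of word w after
    context tuple u. counts maps a context tuple to a {word: count}
    dict; the empty context backs off to a uniform distribution."""
    parent = (py_prob(w, u[1:], counts, vocab_size, d, theta)
              if u else 1.0 / vocab_size)
    table = counts.get(u, {})
    c_u = sum(table.values())        # total count of context u
    if c_u == 0:
        return parent
    t_u = len(table)                 # number of distinct continuations
    c_uw = table.get(w, 0)
    return (max(c_uw - d, 0.0) / (theta + c_u)
            + (theta + d * t_u) / (theta + c_u) * parent)

counts = {(): {"the": 3, "cat": 1}, ("the",): {"cat": 2}}
print(py_prob("cat", ("the",), counts, vocab_size=10))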
@inproceedings{williams-eurospeech97,
  author = {G.~Williams and S.~Renals},
  title = {Confidence measures for hybrid {HMM/ANN} speech
                   recognition},
  booktitle = {Proc. Eurospeech},
  pages = {1955--1958},
  address = {Rhodes},
  abstract = {In this paper we introduce four acoustic confidence
                   measures which are derived from the output of a hybrid
                   HMM/ANN large vocabulary continuous speech recognition
                   system. These confidence measures, based on local
                   posterior probability estimates computed by an ANN, are
                   evaluated at both phone and word levels, using the
                   North American Business News corpus.},
  categories = {recognition,conf,hybrid,wsj,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-conf.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-conf.ps.gz},
  year = 1997
}
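
Confidence measures of the kind introduced above are functionals of the frame-level posterior estimates inside a hypothesis. The paper's four measures are not reproduced here; the snippet shows one representative and widely used choice, the duration-normalised log posterior (all values invented).

import numpy as np

def mean_log_posterior(frame_posteriors):
    """Acoustic confidence for one hypothesised word or phone: the
    average log posterior of the decoded unit over the frames it
    spans (values in (0, 1])."""
    p = np.asarray(frame_posteriors, dtype=float)
    return float(np.mean(np.log(p)))

# Illustrative values: a confident word vs. a doubtful one
print(mean_log_posterior([0.95, 0.90, 0.97]))  # close to 0 => confident
print(mean_log_posterior([0.50, 0.30, 0.60]))  # more negative => doubtful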
@inproceedings{carreira-icphs99,
  author = {M.~Carreira-Perpiñán and S.~Renals},
  title = {A latent-variable modelling approach to the
                   acoustic-to-articulatory mapping problem},
  booktitle = {Proc. 14th Int. Congress of Phonetic Sciences},
  pages = {2013--2016},
  address = {San Francisco},
  abstract = {We present a latent variable approach to the
                   acoustic-to-articulatory mapping problem, where
                   different vocal tract configurations can give rise to
                   the same acoustics. In latent variable modelling, the
                   combined acoustic and articulatory data are assumed to
                   have been generated by an underlying low-dimensional
                   process. A parametric probabilistic model is estimated
                   and mappings are derived from the respective
                   conditional distributions. This has the advantage over
                   other methods, such as articulatory codebooks or neural
                   networks, of directly addressing the nonuniqueness
                   problem. We demonstrate our approach with
                   electropalatographic and acoustic data from the ACCOR
                   database.},
  categories = {ml,lv,artic,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icphs99.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icphs99.ps.gz},
  year = 1999
}
@inproceedings{barker-icslp98,
  author = {J.~Barker and G.~Williams and S.~Renals},
  title = {Acoustic confidence measures for segmenting broadcast
                   news},
  booktitle = {Proc. ICSLP},
  pages = {2719--2722},
  address = {Sydney},
  abstract = {In this paper we define an acoustic confidence measure
                   based on the estimates of local posterior probabilities
                   produced by an HMM/ANN large vocabulary continuous
                   speech recognition system. We use this measure to
                   segment continuous audio into regions where it is and
                   is not appropriate to expend recognition effort. The
                   segmentation is computationally inexpensive and
                   provides reductions in both overall word error rate and
                   decoding time. The technique is evaluated using
                   material from the Broadcast News corpus.},
  categories = {recognition,conf,hybrid,bnews,segmentation,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-seg.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-seg.ps.gz},
  year = 1998
}
@inproceedings{renals-icassp92,
  author = {S.~Renals and N.~Morgan and M.~Cohen and H.~Franco},
  title = {Connectionist probability estimation in the {Decipher}
                   speech recognition system},
  booktitle = {Proc IEEE ICASSP},
  pages = {601--604},
  address = {San Francisco},
  categories = {},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1992/icassp92.ps.gz},
  year = 1992
}
@incollection{huang2007-mlmi,
  author = {Huang, Songfang and Renals, Steve},
  title = {Modeling Prosodic Features in Language Models for
                   Meetings},
  booktitle = {Machine Learning for Multimodal Interaction IV},
  publisher = {Springer},
  editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
  volume = {4892},
  series = {Lecture Notes in Computer Science},
  pages = {191--202},
  abstract = {Prosody has been actively studied as an important
                   knowledge source for speech recognition and
                   understanding. In this paper, we are concerned with the
                   question of exploiting prosody for language models to
                   aid automatic speech recognition in the context of
                   meetings. Using an automatic syllable detection
                   algorithm, the syllable-based prosodic features are
                   extracted to form the prosodic representation for each
                   word. Two modeling approaches are then investigated.
                   One is based on a factored language model, which
                   directly uses the prosodic representation and treats it
                   as a `word'. Instead of direct association, the second
                   approach provides a richer probabilistic structure
                   within a hierarchical Bayesian framework by introducing
                   an intermediate latent variable to represent similar
                   prosodic patterns shared by groups of words. Four-fold
                   cross-validation experiments on the ICSI Meeting Corpus
                   show that exploiting prosody for language modeling can
                   significantly reduce the perplexity, and also have
                   marginal reductions in word error rate.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/mlmi07.pdf},
  year = 2007
}
@article{renals-sap99,
  author = {S.~Renals and M.~Hochberg},
  title = {Start-synchronous search for large vocabulary
                   continuous speech recognition},
  journal = {IEEE Trans. on Speech and Audio Processing},
  volume = {7},
  pages = {542--553},
  abstract = {In this paper, we present a novel, efficient search
                   strategy for large vocabulary continuous speech
                   recognition. The search algorithm, based on a stack
                   decoder framework, utilizes phone-level posterior
                   probability estimates (produced by a connectionist/HMM
                   acoustic model) as a basis for phone deactivation
                   pruning - a highly efficient method of reducing the
                   required computation. The single-pass algorithm is
                   naturally factored into the time-asynchronous
                   processing of the word sequence and the
                   time-synchronous processing of the HMM state sequence.
                   This enables the search to be decoupled from the
                   language model while still maintaining the
                   computational benefits of time-synchronous processing.
                   The incorporation of the language model in the search
                   is discussed and computationally cheap approximations
                   to the full language model are introduced. Experiments
                   were performed on the North American Business News task
                   using a 60,000 word vocabulary and a trigram language
                   model. Results indicate that the computational cost of
                   the search may be reduced by more than a factor of 40
                   with a relative search error of less than 2\% using the
                   techniques discussed in the paper.},
  categories = {sprach,recognition,search,bnews,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/sap99-preprint.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/sap99-preprint.ps.gz},
  year = 1999
}
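
Phone deactivation pruning, the key efficiency device in the abstract above, uses the connectionist model's phone posteriors directly: a phone is explored at a frame only if its posterior clears a threshold. A schematic version (threshold value and names are illustrative):

import numpy as np

def active_phones(phone_posteriors, threshold=1e-4):
    """Per frame, the set of phones kept active by phone deactivation
    pruning: phone j survives at frame t only if p(phone_j | x_t)
    exceeds the threshold; all others are removed from the search."""
    return [set(np.flatnonzero(frame > threshold)) for frame in phone_posteriors]

posts = np.array([[0.90, 0.09, 0.00005, 0.00995],
                  [0.10, 0.85, 0.04, 0.01]])
for t, phones in enumerate(active_phones(posts)):
    print(f"frame {t}: active phones {sorted(phones)}")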
@article{garau2008,
  author = {Garau, Giulia and Renals, Steve},
  title = {Combining Spectral Representations for Large
                   Vocabulary Continuous Speech Recognition},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {16},
  number = {3},
  pages = {508--518},
  abstract = {In this paper we investigate the combination of
                   complementary acoustic feature streams in large
                   vocabulary continuous speech recognition (LVCSR). We
                   have explored the use of acoustic features obtained
                   using a pitch-synchronous analysis, STRAIGHT, in
                   combination with conventional features such as mel
                   frequency cepstral coefficients. Pitch-synchronous
                   acoustic features are of particular interest when used
                   with vocal tract length normalisation (VTLN) which is
                   known to be affected by the fundamental frequency. We
                   have combined these spectral representations directly
                   at the acoustic feature level using heteroscedastic
                   linear discriminant analysis (HLDA) and at the system
                   level using ROVER. We evaluated this approach on three
                   LVCSR tasks: dictated newspaper text (WSJCAM0),
                   conversational telephone speech (CTS), and multiparty
                   meeting transcription. The CTS and meeting
                   transcription experiments were both evaluated using
                   standard NIST test sets and evaluation protocols. Our
                   results indicate that combining conventional and
                   pitch-synchronous acoustic feature sets using HLDA
                   results in a consistent, significant decrease in word
                   error rate across all three tasks. Combining at the
                   system level using ROVER resulted in a further
                   significant decrease in word error rate.},
  doi = {10.1109/TASL.2008.916519},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/garau-taslp08.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4443886},
  year = 2008
}
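
Feature-level combination as evaluated above amounts to concatenating the two spectral representations frame by frame and learning a dimension-reducing linear projection. The paper uses HLDA, whose maximum-likelihood estimation is iterative; the sketch below substitutes standard Fisher LDA (the equal-covariance special case) to show the pipeline shape, with all data and names invented.

import numpy as np

def lda_projection(X, y, out_dim):
    """Fisher LDA projection. X: (N, D) concatenated frames,
    y: (N,) class labels (e.g. HMM states). Returns (D, out_dim)."""
    mu = X.mean(axis=0)
    D = X.shape[1]
    Sw, Sb = np.zeros((D, D)), np.zeros((D, D))
    for c in np.unique(y):
        Xc = X[y == c]
        mc = Xc.mean(axis=0)
        Sw += (Xc - mc).T @ (Xc - mc)               # within-class scatter
        Sb += len(Xc) * np.outer(mc - mu, mc - mu)  # between-class scatter
    # Leading eigenvectors of Sw^{-1} Sb span the discriminant subspace
    evals, evecs = np.linalg.eig(np.linalg.solve(Sw, Sb))
    order = np.argsort(-evals.real)
    return evecs.real[:, order[:out_dim]]

rng = np.random.default_rng(0)
mfcc = rng.normal(size=(500, 13))        # stand-in for MFCC frames
straight = rng.normal(size=(500, 5))     # stand-in for pitch-synchronous features
X = np.hstack([mfcc, straight])          # frame-level concatenation
y = rng.integers(0, 3, size=500)
combined = X @ lda_projection(X, y, out_dim=8)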
@inproceedings{llu2012map,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {{Maximum a posteriori adaptation of subspace Gaussian
                   mixture models for cross-lingual speech recognition}},
  booktitle = {Proc. ICASSP},
  abstract = {This paper concerns cross-lingual acoustic modeling in
                   the case when there are limited target language
                   resources. We build on an approach in which a subspace
                   Gaussian mixture model (SGMM) is adapted to the target
                   language by reusing the globally shared parameters
                   estimated from out-of-language training data. In
                   current cross-lingual systems, these parameters are
                   fixed when training the target system, which can give
                   rise to a mismatch between the source and target
                   systems. We investigate a maximum a posteriori (MAP)
                   adaptation approach to alleviate the potential
                   mismatch. In particular, we focus on the adaptation of
                   phonetic subspace parameters using a matrix variate
                   Gaussian prior distribution. Experiments on the
                   GlobalPhone corpus using the MAP adaptation approach
                   result in word error rate reductions, compared with
                   the cross-lingual baseline systems and systems updated
                   using maximum likelihood, for training conditions with
                   1 hour and 5 hours of target language data.},
  keywords = {Subspace Gaussian Mixture Model, Maximum a Posteriori
                   Adaptation, Cross-lingual Speech Recognition},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-icassp-2012.pdf},
  year = 2012
}
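
MAP adaptation with a Gaussian prior, as investigated above, has a convenient closed form: the estimate interpolates between the prior (cross-lingual) parameter and the target-language ML estimate, weighted by the amount of adaptation data. The snippet is a generic illustration of that interpolation; tau and the matrices are invented, and the paper's matrix variate prior over phonetic subspace parameters is richer than this.

import numpy as np

def map_update(prior_param, ml_param, data_count, tau=10.0):
    """MAP estimate under a Gaussian prior centred on prior_param:
    with little data the prior dominates; as data_count grows the
    ML estimate takes over. tau sets the crossover scale."""
    w = data_count / (data_count + tau)
    return w * np.asarray(ml_param) + (1.0 - w) * np.asarray(prior_param)

M_prior = np.eye(2)                        # parameter shared from source languages
M_ml = np.array([[1.5, 0.2], [0.1, 0.8]])  # ML estimate from limited target data
print(map_update(M_prior, M_ml, data_count=50.0))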
@inproceedings{murray06,
  author = {G. Murray and S. Renals and J. Moore and J. Carletta},
  title = {Incorporating Speaker and Discourse Features into
                   Speech Summarization},
  booktitle = {Proceedings of the Human Language Technology
                   Conference - North American Chapter of the Association
                   for Computational Linguistics Meeting (HLT-NAACL) 2006,
                   New York City, USA},
  abstract = {The research presented herein explores the usefulness
                   of incorporating speaker and discourse features in an
                   automatic speech summarization system applied to
                   meeting recordings from the ICSI Meetings corpus. By
                   analyzing speaker activity, turn-taking and discourse
                   cues, it is hypothesized that a system can outperform
                   solely text-based methods inherited from the field of
                   text summarization. The summarization methods are
                   described, two evaluation methods are applied and
                   compared, and the results clearly show that utilizing
                   such features is advantageous and efficient. Even
                   simple methods relying on discourse cues and speaker
                   activity can outperform text summarization approaches.},
  categories = {summarization, speech summarization, prosody, latent
                   semantic analysis},
  month = jun,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/hlt2006-final.pdf},
  year = 2006
}
@inproceedings{qin:perpinan:richmond:wrench:renals:2008a,
  author = {Qin, C. and Carreira-Perpiñán, M. and Richmond, K.
                   and Wrench, A. and Renals, S.},
  title = {Predicting Tongue Shapes from a Few Landmark Locations},
  booktitle = {Proc. Interspeech},
  pages = {2306--2309},
  address = {Brisbane, Australia},
  abstract = {We present a method for predicting the midsagittal
                   tongue contour from the locations of a few landmarks
                   (metal pellets) on the tongue surface, as used in
                   articulatory databases such as MOCHA and the Wisconsin
                   XRDB. Our method learns a mapping using ground-truth
                   tongue contours derived from ultrasound data and
                   drastically improves over spline interpolation. We also
                   determine the optimal locations of the landmarks, and
                   the number of landmarks required to achieve a desired
                   prediction error: 3-4 landmarks are enough to achieve
                   0.3-0.2 mm error per point on the tongue.},
  categories = {ultrasound, tongue contour, articulation},
  key = {qin:perpinan:richmond:wrench:renals:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080929.PDF},
  year = 2008
}
@inproceedings{koumpis-msdr03,
  author = {K.~Koumpis and S.~Renals},
  title = {Evaluation of extractive voicemail summarization},
  booktitle = {Proc. ISCA Workshop on Multilingual Spoken Document
                   Retrieval},
  pages = {19--24},
  abstract = {This paper is about the evaluation of a system that
                   generates short text summaries of voicemail messages,
                   suitable for transmission as text messages. Our
                   approach to summarization is based on a
                   speech-recognized transcript of the voicemail message,
                   from which a set of summary words is extracted. The
                   system uses a classifier to identify the summary words,
                   with each word being identified by a vector of lexical
                   and prosodic features. The features are selected using
                   Parcel, an ROC-based algorithm. Our evaluations of the
                   system, using a slot error rate metric, have compared
                   manual and automatic summarization, and manual and
                   automatic recognition (using two different
                   recognizers). We also report on two subjective
                   evaluations using mean opinion score of summaries, and
                   a set of comprehension tests. The main results from
                   these experiments were that the perceived difference in
                   quality of summarization was affected more by errors
                   resulting from automatic transcription, than by the
                   automatic summarization process.},
  categories = {voicemail,summarization,prosody,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/msdr03.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/msdr03.ps.gz},
  year = 2003
}
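
The slot error rate metric used in the evaluation above scores a hypothesis summary against a reference by counting slot-level insertions, deletions and substitutions, normalised by the number of reference slots. The real metric aligns slots; the sketch below is a simplified, order-free (bag-of-slots) variant for illustration only.

from collections import Counter

def slot_error_rate(reference_slots, hypothesis_slots):
    """Bag-of-slots error rate: (deletions + insertions) / |reference|.
    A substitution surfaces as one deletion plus one insertion in
    this order-free simplification."""
    ref, hyp = Counter(reference_slots), Counter(hypothesis_slots)
    matches = sum((ref & hyp).values())
    deletions = sum(ref.values()) - matches
    insertions = sum(hyp.values()) - matches
    return (deletions + insertions) / sum(ref.values())

ref = ["meeting", "tuesday", "3pm", "call", "john"]
hyp = ["meeting", "tuesday", "4pm", "john"]
print(slot_error_rate(ref, hyp))  # (2 deletions + 1 insertion) / 5 = 0.6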
@inproceedings{robinson-eurospeech93,
  author = {A.~J.~Robinson and L.~Almeida and J.-M.~Boite and
                   H.~Bourlard and F.~Fallside and M.~Hochberg and
                   D.~Kershaw and P.~Kohn and Y.~Konig and N.~Morgan and
                   J.~P.~Neto and S.~Renals and M.~Saerens and C.~Wooters},
  title = {A neural network based, speaker independent, large
                   vocabulary, continuous speech recognition system: the
                   {Wernicke} project},
  booktitle = {Proc. Eurospeech},
  pages = {1941--1944},
  address = {Berlin},
  categories = {},
  year = 1993
}
@inproceedings{jaimes2007,
  author = {Jaimes, Alejandro and Bourlard, Hervé and Renals,
                   Steve and Carletta, Jean},
  title = {Recording, Indexing, Summarizing, and Accessing
                   Meeting Videos: An Overview of the {AMI} Project},
  booktitle = {Proc IEEE ICIAPW},
  pages = {59--64},
  abstract = {In this paper we give an overview of the AMI project.
                   AMI developed the following: (1) an infrastructure for
                   recording meetings using multiple microphones and
                   cameras; (2) a one hundred hour, manually annotated
                   meeting corpus; (3) a number of techniques for
                   indexing, and summarizing of meeting videos using
                   automatic speech recognition and computer vision, and
                   (4) an extensible framework for browsing, and searching
                   of meeting videos. We give an overview of the various
                   techniques developed in AMI, their integration into our
                   meeting browser framework, and future plans for AMIDA
                   (Augmented Multiparty Interaction with Distant Access),
                   the follow-up project to AMI.},
  doi = {10.1109/ICIAPW.2007.36},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/jaimes2007.pdf},
  url = {http://ieeexplore.ieee.org/search/srchabstract.jsp?arnumber=4427477&isnumber=4427459&punumber=4427458&k2dockey=4427477@ieeecnfs&query=%28+%28%28renals%29%3Cin%3Eau+%29+%29+%3Cand%3E+%28pyr+%3E%3D+2006+%3Cand%3E+pyr+%3C%3D+2008%29&pos=6&access=no},
  year = 2007
}
@inproceedings{abberley-trec98,
  author = {D.~Abberley and S.~Renals and G.~Cook and T.~Robinson},
  title = {The 1997 {THISL} spoken document retrieval system},
  booktitle = {Proc. Sixth Text Retrieval Conference (TREC--6)},
  pages = {747--752},
  abstract = {The THISL spoken document retrieval system is based on
                   the Abbot Large Vocabulary Continuous Speech
                   Recognition (LVCSR) system developed by Cambridge
                   University, Sheffield University and SoftSound, and
                   uses PRISE (NIST) for indexing and retrieval. We
                   participated in full SDR mode. Our approach was to
                   transcribe the spoken documents at the word level using
                   Abbot, indexing the resulting text transcriptions using
                   PRISE. The LVCSR system uses a recurrent network-based
                   acoustic model (with no adaptation to different
                   conditions) trained on the 50 hour Broadcast News
                   training set, a 65,000 word vocabulary and a trigram
                   language model derived from Broadcast News text. Words
                   in queries which were out-of-vocabulary (OOV) were word
                   spotted at query time (utilizing the posterior phone
                   probabilities output by the acoustic model), added to
                   the transcriptions of the relevant documents and the
                   collection was then re-indexed. We generated
                   pronunciations at run-time for OOV words using the
                   Festival TTS system (University of Edinburgh).},
  categories = {thisl,bnews,trec,ir,recognition,eval,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/trec6.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/trec6.ps.gz},
  year = 1998
}
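
The retrieval half of the pipeline above indexes the recognised transcripts like ordinary text. As a toy stand-in for what PRISE does with far more sophistication, here is a bare tf.idf scorer over transcribed documents (all data invented):

import math
from collections import Counter

def tf_idf_score(query_terms, doc_terms, doc_freq, n_docs):
    """Score one transcribed document against a query with a simple
    tf.idf weighting: rarer query terms contribute more, and repeated
    occurrences contribute with diminishing returns."""
    tf = Counter(doc_terms)
    score = 0.0
    for q in query_terms:
        if tf[q] > 0 and doc_freq.get(q, 0) > 0:
            idf = math.log(n_docs / doc_freq[q])
            score += (1 + math.log(tf[q])) * idf
    return score

docs = [["talks", "resume", "in", "geneva"], ["storm", "hits", "coast"]]
df = Counter(t for d in docs for t in set(d))
print([tf_idf_score(["geneva", "talks"], d, df, len(docs)) for d in docs])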
@article{robinson-specom02,
  author = {A.~J.~Robinson and G.~D.~Cook and D.~P.~W.~Ellis and
                   E.~Fosler-Lussier and S.~J.~Renals and
                   D.~A.~G.~Williams},
  title = {Connectionist Speech Recognition of Broadcast News},
  journal = {Speech Communication},
  volume = {37},
  pages = {27--45},
  abstract = {This paper describes connectionist techniques for
                   recognition of Broadcast News. The fundamental
                   difference between connectionist systems and more
                   conventional mixture-of-Gaussian systems is that
                   connectionist models directly estimate posterior
                   probabilities as opposed to likelihoods. Access to
                   posterior probabilities has enabled us to develop a
                   number of novel approaches to confidence estimation,
                   pronunciation modelling and search. In addition we have
                   investigated a new feature extraction technique based
                   on the modulation-filtered spectrogram, and methods for
                   combining multiple information sources. We have
                   incorporated all of these techniques into a system for
                   the transcription of Broadcast News, and we present
                   results on the 1998 DARPA Hub-4E Broadcast News
                   evaluation data.},
  categories = {sprach,bnews,recognition,am,hybrid,abbot,lm,search,pron,eval,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/specom02-preprint.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/specom02-preprint.ps.gz},
  year = 2002
}
@inproceedings{renals-eurospeech93,
  author = {S.~Renals and D.~MacKay},
  title = {Bayesian regularisation methods in a hybrid {MLP--HMM}
                   system},
  booktitle = {Proc. Eurospeech},
  pages = {1719--1722},
  address = {Berlin},
  categories = {},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1993/eurosp93-bayes.ps.gz},
  year = 1993
}
@incollection{renals-nips92,
  author = {S.~Renals and H.~Bourlard and N.~Morgan and H.~Franco
                   and M.~Cohen},
  title = {Connectionist optimisation of tied mixture hidden
                   {Markov} models},
  booktitle = {Advances in Neural Information Processing Systems},
  publisher = {Morgan-Kaufmann},
  editor = {J.~E.~Moody and S.~J.~Hanson and R.~P.~Lippmann},
  volume = {4},
  pages = {167--174},
  categories = {},
  year = 1992
}
@inproceedings{renals-icassp03,
  author = {S.~Renals and D.~Ellis},
  title = {Audio information access from meeting rooms},
  booktitle = {Proc. IEEE ICASSP},
  volume = {4},
  pages = {744--747},
  abstract = {We investigate approaches to accessing information
                   from the streams of audio data that result from
                   multi-channel recordings of meetings. The methods
                   investigated use word-level transcriptions, and
                   information derived from models of speaker activity and
                   speaker turn patterns. Our experiments include spoken
                   document retrieval for meetings, automatic structuring
                   of meetings based on self-similarity matrices of
                   speaker turn patterns and a simple model of speaker
                   activity. Meeting recordings are rich in both lexical
                   and non-lexical information; our results illustrate
                   some novel kinds of analysis made possible by a
                   transcribed corpus of natural meetings.},
  categories = {m4,multimodal,ir,meetings,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-mtg.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-mtg.ps.gz},
  year = 2003
}
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
  author = {Cabral, J. and Renals, S. and Richmond, K. and
                   Yamagishi, J.},
  title = {Glottal Spectral Separation for Parametric Speech
                   Synthesis},
  booktitle = {Proc. Interspeech},
  pages = {1829--1832},
  address = {Brisbane, Australia},
  abstract = {This paper presents a method to control the
                   characteristics of synthetic speech flexibly by
                   integrating articulatory features into a Hidden Markov
                   Model (HMM)-based parametric speech synthesis system.
                   In contrast to model adaptation and interpolation
                   approaches for speaking style control, this method is
                   driven by phonetic knowledge, and target speech samples
                   are not required. The joint distribution of parallel
                   acoustic and articulatory features considering
                   cross-stream feature dependency is estimated. At
                   synthesis time, acoustic and articulatory features are
                   generated simultaneously based on the
                   maximum-likelihood criterion. The synthetic speech can
                   be controlled flexibly by modifying the generated
                   articulatory features according to arbitrary phonetic
                   rules in the parameter generation process. Our
                   experiments show that the proposed method is effective
                   in both changing the overall character of synthesized
                   speech and in controlling the quality of a specific
                   vowel. },
  categories = {HMM speech synthesis, Glottal Spectral Separation,
                   LF-model},
  key = {cabral:renals:richmond:yamagishi:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
  year = 2008
}
@inproceedings{pietquin-icassp02,
  author = {O.~Pietquin and S.~Renals},
  title = {{ASR} system modeling for automatic evaluation and
                   optimization of dialogue systems},
  booktitle = {Proc IEEE ICASSP},
  pages = {46--49},
  abstract = {Though the field of spoken dialogue systems has
                   developed quickly in the last decade, the rapid design
                   of dialogue strategies remains difficult. Several
                   approaches to the problem of automatic strategy
                   learning have been proposed, and the use of
                   Reinforcement Learning introduced by Levin and
                   Pieraccini is becoming part of the state of the art in
                   this area. However, the quality of the strategy learned
                   by the system depends on the definition of the
                   optimization criterion and on the accuracy of the
                   environment model. In this paper, we propose to bring a
                   model of an ASR system into the simulated environment
                   in order to enhance the learned strategy. To do so, we
                   introduce the recognition error rates and confidence
                   levels produced by ASR systems into the optimization
                   criterion.},
  categories = {dialog,rl,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/icassp02-rl.pdf},
  year = 2002
}
@inproceedings{kershaw-icslp96,
  author = {D.~Kershaw and T.~Robinson and S.~Renals},
  title = {The 1995 {Abbot} {LVCSR} system for multiple unknown
                   microphones},
  booktitle = {Proc. ICSLP},
  pages = {1325--1328},
  address = {Philadelphia PA},
  categories = {wernicke,sprach,wsj,recognition,am,hybrid,abbot,search,eval,sheffield},
  year = 1996
}
@inproceedings{NistevalAMI05,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and I. McCowan and D. Moore and
                   V. Wan and R. Ordelman and S. Renals},
  title = {The 2005 {AMI} System for the transcription of Speech
                   in Meetings},
  booktitle = {Proceedings of the Rich Transcription 2005 Spring
                   Meeting Recognition Evaluation},
  abstract = {In this paper we describe the 2005 AMI system for the
                   transcription of speech in meetings used in the 2005
                   NIST RT evaluations. The system was designed for
                   participation in the speech to text part of the
                   evaluations, in particular for transcription of speech
                   recorded with multiple distant microphones and
                   independent headset microphones. System performance was
                   tested on both conference room and lecture style
                   meetings. Although input sources are processed using
                   different frontends, the recognition process is based
                   on a unified system architecture. The system operates
                   in multiple passes and makes use of state-of-the-art
                   technologies such as discriminative training, vocal
                   tract length normalisation, heteroscedastic linear
                   discriminant analysis, speaker adaptation with maximum
                   likelihood linear regression and minimum word error
                   rate decoding. In this paper we describe the system
                   performance on the official development and test sets
                   for the NIST RT05s evaluations. The system was jointly
                   developed in less than 10 months by a multi-site team
                   and was shown to achieve competitive performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT05S},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIasr.nist2005.pdf},
  year = 2005
}
@inproceedings{cuayahuitletal_slt06,
  author = {Heriberto Cuayáhuitl and Steve Renals and Oliver
                   Lemon and Hiroshi Shimodaira},
  title = {Reinforcement Learning of Dialogue Strategies With
                   Hierarchical Abstract Machines},
  booktitle = {Proc. of IEEE/ACL Workshop on Spoken Language
                   Technology (SLT)},
  abstract = {In this paper we propose partially specified dialogue
                   strategies for dialogue strategy optimization, where
                   part of the strategy is specified deterministically and
                   the rest optimized with Reinforcement Learning (RL). To
                   do this we apply RL with Hierarchical Abstract Machines
                   (HAMs). We also propose to build simulated users using
                   HAMs, incorporating a combination of hierarchical
                   deterministic and probabilistic behaviour. We performed
                   experiments using a single-goal flight booking dialogue
                   system, and compared two dialogue strategies
                   (deterministic and optimized) using three types of
                   simulated user (novice, experienced and expert). Our
                   results show that HAMs are promising for both dialogue
                   optimization and simulation, and provide evidence that
                   indeed partially specified dialogue strategies can
                   outperform deterministic ones (on average 4.7 fewer
                   system turns) with faster learning than the traditional
                   RL framework.},
  categories = {reinforcement learning, spoken dialogue systems},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/ham-slt2006.pdf},
  year = 2006
}
@inproceedings{bell12_mlan,
  author = {Bell, P. and Gales, M. and Lanchantin, P. and Liu, X.
                   and Long, Y. and Renals, S. and Swietojanski, P. and
                   Woodland, P.},
  title = {Transcription of multi-genre media archives using
                   out-of-domain data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address = {Miami, Florida, USA},
  abstract = {We describe our work on developing a speech
                   recognition system for multi-genre media archives. The
                   high diversity of the data makes this a challenging
                   recognition task, which may benefit from systems
                   trained on a combination of in-domain and out-of-domain
                   data. Working with tandem HMMs, we present Multi-level
                   Adaptive Networks (MLAN), a novel technique for
                   incorporating information from out-of-domain posterior
                   features using deep neural networks. We show that it
                   provides a substantial reduction in WER over other
                   systems, with relative WER reductions of 15\% over a
                   PLP baseline, 9\% over in-domain tandem features and
                   8\% over the best out-of-domain tandem features.},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/mlan_slt2012.pdf},
  year = 2012
}
@inproceedings{renals2007,
  author = {Renals, Steve and Hain, Thomas and Bourlard, Hervé},
  title = {Recognition and interpretation of meetings: The {AMI}
                   and {AMIDA} projects},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
                   and Understanding (ASRU '07)},
  abstract = {The AMI and AMIDA projects are concerned with the
                   recognition and interpretation of multiparty meetings.
                   Within these projects we have: developed an
                   infrastructure for recording meetings using multiple
                   microphones and cameras; released a 100 hour annotated
                   corpus of meetings; developed techniques for the
                   recognition and interpretation of meetings based
                   primarily on speech recognition and computer vision;
                   and developed an evaluation framework at both component
                   and system levels. In this paper we present an overview
                   of these projects, with an emphasis on speech
                   recognition and content extraction. },
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ami-asru2007.pdf},
  year = 2007
}
@inproceedings{dielmann-icassp07,
  author = {A. Dielmann and S. Renals},
  title = {{DBN} based joint Dialogue Act recognition of
                   multiparty meetings},
  booktitle = {Proc. IEEE ICASSP},
  volume = 4,
  pages = {133--136},
  abstract = {Joint Dialogue Act segmentation and classification of
                   the new {AMI} meeting corpus has been performed through
                   an integrated framework based on a switching dynamic
                   {Bayesian} network and a set of continuous features and
                   language models. The recognition process is based on a
                   dictionary of 15 {DA} classes tailored for group
                   decision-making. Experimental results show that a novel
                   interpolated Factored Language Model results in a low
                   error rate on the automatic segmentation task, and thus
                   good recognition results can be achieved on {AMI}
                   multiparty conversational speech.},
  categories = {ami,dialogue act,dbn,factored language
                   model,meetings,edinburgh},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-icassp07.pdf},
  year = 2007
}
@inproceedings{kolluru-asru03,
  author = {B. Kolluru and H. Christensen and Y. Gotoh and S.
                   Renals},
  title = {Exploring the style-technique interaction in
                   extractive summarization of broadcast news},
  booktitle = {Proc. IEEE Automatic Speech Recognition and
                   Understanding Workshop},
  pages = {},
  abstract = {In this paper we seek to explore the interaction
                   between the style of a broadcast news story and its
                   summarization technique. We report the performance of
                   three different summarization techniques on broadcast
                   news stories, which are split into planned speech and
                   spontaneous speech. The initial results indicate that
                   some summarization techniques work better for the
                   documents with spontaneous speech than for those with
                   planned speech. Even for human beings some documents
                   are inherently difficult to summarize. We observe a
                   correlation between the degree of difficulty in
                   summarizing and the performance of the three automatic
                   summarizers. Given the high frequency of named entities
                   in broadcast news, and the even greater number of
                   references to these entities, we also gauge the effect
                   of named entity and coreference resolution in a news
                   story on the performance of these summarizers.},
  categories = {s3l,summarization,bnews,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-style.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-style.ps.gz},
  year = 2003
}
@inproceedings{cabral_ssw7,
  author = {Cabral, Jo{\~a}o and Renals, Steve and Richmond, Korin
                   and Yamagishi, Junichi},
  title = {Transforming Voice Source Parameters in a {HMM}-based
                   Speech Synthesiser with Glottal Post-Filtering},
  booktitle = {Proc. 7th ISCA Speech Synthesis Workshop (SSW7)},
  pages = {365--370},
  address = {NICT/ATR, Kyoto, Japan},
  abstract = {Control over voice quality, e.g. breathy and tense
                   voice, is important for speech synthesis applications.
                   For example, transformations can be used to modify
                   aspects of the voice related to the speaker's identity
                   and to improve expressiveness. However, it is hard to
                   modify voice characteristics of the synthetic speech
                   without degrading speech quality. State-of-the-art
                   statistical speech synthesisers, in particular, do not
                   typically allow control over parameters of the glottal
                   source, which are strongly correlated with voice
                   quality. Consequently, the control of voice
                   characteristics in these systems is limited. In
                   contrast, the HMM-based speech synthesiser proposed in
                   this paper uses an acoustic glottal source model. The
                   system passes the glottal signal through a whitening
                   filter to obtain the excitation of voiced sounds. This
                   technique, called glottal post-filtering, allows the
                   voice characteristics of the synthetic speech to be
                   transformed by modifying the source model parameters.
                   We evaluated the proposed synthesiser in a perceptual
                   experiment, in terms of speech naturalness,
                   intelligibility, and similarity to the original
                   speaker's voice. The results show that it performed as
                   well as an HMM-based synthesiser, which generates the
                   speech signal with a commonly used high-quality speech
                   vocoder.},
  keywords = {HMM-based speech synthesis, voice quality, glottal
                   post-filter},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/cabral_ssw7.pdf},
  year = 2010
}
@inproceedings{gotoh-eurospeech97,
  author = {Y.~Gotoh and S.~Renals},
  title = {Document space models using latent semantic analysis},
  booktitle = {Proc. Eurospeech},
  pages = {1443--1446},
  address = {Rhodes},
  abstract = {In this paper, an approach for constructing mixture
                   language models (LMs) based on some notion of semantics
                   is discussed. To this end, a technique known as latent
                   semantic analysis (LSA) is used. The approach
                   encapsulates corpus-derived semantic information and is
                   able to model the varying style of the text. Using such
                   information, the corpus texts are clustered in an
                   unsupervised manner and mixture LMs are automatically
                   created. This work builds on previous work in the field
                   of information retrieval which was recently applied by
                   Bellegarda et al. to the problem of clustering words
                   by semantic categories. The principal contribution of
                   this work is to characterize the document space
                   resulting from the LSA modeling and to demonstrate the
                   approach for mixture LM application. Comparison is made
                   between manual and automatic clustering in order to
                   elucidate how the semantic information is expressed in
                   the space. It is shown that, using semantic
                   information, mixture LMs perform better than a
                   conventional single LM with a slight increase in
                   computational cost.},
  categories = {sprach,lm,bnc,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-lsa.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-lsa.ps.gz},
  year = 1997
}
@inproceedings{vipperla2010a,
  author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
                   Joe},
  title = {Augmentation of adaptation data},
  booktitle = {Proc. Interspeech},
  pages = {530--533},
  address = {Makuhari, Japan},
  abstract = {Linear regression based speaker adaptation approaches
                   can improve Automatic Speech Recognition (ASR) accuracy
                   significantly for a target speaker. However, when the
                   available adaptation data is limited to a few seconds,
                   the accuracy of the speaker adapted models is often
                   worse than that of speaker independent models. In this
                   paper, we propose an approach to select a set of
                   reference speakers acoustically close to the target
                   speaker whose data can be used to augment the
                   adaptation data. To determine the acoustic similarity
                   of two speakers, we propose a distance metric based on
                   transforming sample points in the acoustic space with
                   the regression matrices of the two speakers. We show
                   the validity of this approach through a speaker
                   identification task. ASR results on SCOTUS and AMI
                   corpora with limited adaptation data of 10 to 15
                   seconds augmented by data from selected reference
                   speakers show a significant improvement in Word Error
                   Rate over speaker independent and speaker adapted
                   models.},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-is2010.pdf},
  year = 2010
}
@inproceedings{terry-icassp88,
  author = {M.~Terry and S.~Renals and R.~Rohwer and J.~Harrington},
  title = {A connectionist approach to speech recognition using
                   peripheral auditory modelling},
  booktitle = {Proc IEEE ICASSP},
  pages = {699--702},
  address = {New York},
  categories = {},
  year = 1988
}
@inproceedings{williams-icslp98,
  author = {G.~Williams and S.~Renals},
  title = {Confidence measures derived from an acceptor {HMM}},
  booktitle = {Proc. ICSLP},
  pages = {831--834},
  address = {Sydney},
  abstract = {In this paper we define a number of confidence
                   measures derived from an acceptor HMM and evaluate
                   their performance for the task of utterance
                   verification using the North American Business News
                   (NAB) and Broadcast News (BN) corpora. Results are
                   presented for decodings made at both the word and phone
                   level which show the relative profitability of
                   rejection provided by the diverse set of confidence
                   measures. The results indicate that language model
                   dependent confidence measures have reduced performance
                   on BN data relative to that for the more grammatically
                   constrained NAB data. An explanation linking the
                   observations that rejection is more profitable for
                   noisy acoustics, for a reduced vocabulary and at the
                   phone level is also given.},
  categories = {recognition,conf,hybrid,bnews,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-conf.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-conf.ps.gz},
  year = 1998
}
@inproceedings{renals-icassp91,
  author = {S.~Renals and D.~McKelvie and F.~McInnes},
  title = {A comparative study of continuous speech recognition
                   using neural networks and hidden {Markov} models},
  booktitle = {Proc IEEE ICASSP},
  pages = {369--372},
  address = {Toronto},
  categories = {},
  year = 1991
}
@inproceedings{koumpis-icslp00,
  author = {K.~Koumpis and S. Renals},
  title = {Transcription and Summarization of Voicemail Speech},
  booktitle = {Proc. ICSLP},
  volume = {2},
  pages = {688--691},
  address = {Beijing},
  abstract = {This paper describes the development of a system to
                   transcribe and summarize voicemail messages. The
                   results of the research presented in this paper are
                   two-fold. First, a hybrid connectionist approach to the
                   Voicemail transcription task shows that competitive
                   performance can be achieved using a context-independent
                   system with fewer parameters than those based on
                   mixtures of Gaussian likelihoods. Second, an effective
                   and robust combination of statistical with prior
                   knowledge sources for term weighting is used to extract
                   information from the decoder's output in order to
                   deliver summaries to the message recipients via a GSM
                   Short Message Service (SMS) gateway.},
  categories = {voicemail,summarization,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icslp00.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icslp00.ps.gz},
  year = 2000
}
@inproceedings{gotoh-icassp99,
  author = {Y.~Gotoh and S.~Renals and G.~Williams},
  title = {Named entity tagged language models},
  booktitle = {Proc IEEE ICASSP},
  pages = {513--516},
  address = {Phoenix AZ},
  abstract = {We introduce Named Entity (NE) Language Modelling, a
                   stochastic finite state machine approach to identifying
                   both words and NE categories from a stream of spoken
                   data. We provide an overview of our approach to NE
                   tagged language model (LM) generation together with
                   results of the application of such an LM to the task of
                   out-of-vocabulary (OOV) word reduction in large
                   vocabulary speech recognition. Using the Wall Street
                   Journal and Broadcast News corpora, it is shown that
                   the tagged LM was able to reduce the overall word error
                   rate by 14\%, detecting up to 70\% of previously OOV
                   words. We also describe an example of the direct
                   tagging of spoken data with NE categories.},
  categories = {sprach,ie,lm,bnews,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icassp99.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icassp99.ps.gz},
  year = 1999
}
@inproceedings{huang2007-asru,
  author = {Huang, Songfang and Renals, Steve},
  title = {Hierarchical {Pitman-Yor} Language Models for {ASR} in
                   Meetings},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
                   and Understanding (ASRU'07)},
  pages = {124--129},
  address = {Kyoto, Japan},
  abstract = {In this paper we investigate the application of a
                   novel technique for language modeling --- a
                   hierarchical Bayesian language model (LM) based on the
                   Pitman-Yor process --- on automatic speech recognition
                   (ASR) for multiparty meetings. The hierarchical
                   Pitman-Yor language model (HPYLM), which was originally
                   proposed in the machine learning field, provides a
                   Bayesian interpretation to language modeling. An
                   approximation to the HPYLM recovers the exact
                   formulation of the interpolated Kneser-Ney smoothing
                   method in n-gram models. This paper focuses on the
                   application and scalability of HPYLM on a practical
                   large vocabulary ASR system. Experimental results on
                   NIST RT06s evaluation meeting data verify that HPYLM is
                   a competitive and promising language modeling
                   technique, which consistently performs better than
                   interpolated Kneser-Ney and modified Kneser-Ney n-gram
                   LMs in terms of both perplexity (PPL) and word error
                   rate (WER).},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/asru07.pdf},
  year = 2007
}
@inproceedings{huang2010a,
  author = {Huang, Songfang and Renals, Steve},
  title = {Power Law Discounting for N-Gram Language Models},
  booktitle = {Proc. IEEE ICASSP--10},
  pages = {5178--5181},
  abstract = {We present an approximation to the Bayesian
                   hierarchical Pitman-Yor process language model which
                   maintains the power law distribution over word tokens,
                   while not requiring a computationally expensive
                   approximate inference process. This approximation,
                   which we term power law discounting, has a similar
                   computational complexity to interpolated and modified
                   Kneser-Ney smoothing. We performed experiments on
                   meeting transcription using the NIST RT06s evaluation
                   data and the AMI corpus, with a vocabulary of 50,000
                   words and a language model training set of up to 211
                   million words. Our results indicate that power law
                   discounting results in statistically significant
                   reductions in perplexity and word error rate compared
                   to both interpolated and modified Kneser-Ney smoothing,
                   while producing similar results to the hierarchical
                   Pitman-Yor process language model.},
  doi = {10.1109/ICASSP.2010.5495007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-icassp10.pdf},
  url = {http://dx.doi.org/10.1109/ICASSP.2010.5495007},
  year = 2010
}
@inproceedings{renals-eurospeech99,
  author = {S.~Renals and Y.~Gotoh},
  title = {Integrated transcription and identification of named
                   entities in broadcast speech},
  booktitle = {Proc. Eurospeech},
  pages = {1039--1042},
  address = {Budapest},
  abstract = {This paper presents an approach to integrating
                   functions for both transcription and named entity (NE)
                   identification into a large vocabulary continuous
                   speech recognition system. It builds on the NE tagged
                   language modelling approach, which was recently applied
                   to the development of a statistical NE annotation
                   system. We also present results for a proper name
                   identification experiment using the Hub-4E open
                   evaluation data.},
  categories = {sprach,stobs,ie,lm,bnews,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-ne.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-ne.ps.gz},
  year = 1999
}
@inproceedings{renals-icassp95,
  author = {S.~Renals and M.~Hochberg},
  title = {Efficient search using posterior phone probability
                   estimates},
  booktitle = {Proc IEEE ICASSP},
  pages = {596--599},
  address = {Detroit},
  abstract = {In this paper we present a novel, efficient search
                   strategy for large vocabulary continuous speech
                   recognition (LVCSR). The search algorithm, based on
                   stack decoding, uses posterior phone probability
                   estimates to substantially increase its efficiency with
                   minimal effect on accuracy. In particular, the search
                   space is dramatically reduced by phone deactivation
                   pruning where phones with a small local posterior
                   probability are deactivated. This approach is
                   particularly well-suited to hybrid connectionist/hidden
                   Markov model systems because posterior phone
                   probabilities are directly computed by the acoustic
                   model. On large vocabulary tasks, using a trigram
                   language model, this increased the search speed by an
                   order of magnitude, with 2\% or less relative search
                   error. Results from a hybrid system are presented using
                   the Wall Street Journal LVCSR database for a 20,000
                   word task using a backed-off trigram language model.
                   For this task, our single-pass decoder took around 15
                   times realtime on an HP735 workstation. At the cost of
                   7\% relative search error, decoding time can be speeded
                   up to approximately realtime.},
  categories = {wernicke,recognition,wsj,search,sheffield,cambridge},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/icassp95-search.ps.gz},
  year = 1995
}
@inproceedings{wolters2010,
  author = {Wolters, Maria K. and Isaac, Karl B. and Renals, Steve},
  title = {Evaluating speech synthesis intelligibility using
                   {Amazon Mechanical Turk}},
  booktitle = {Proc. 7th Speech Synthesis Workshop (SSW7)},
  pages = {136--141},
  abstract = {Microtask platforms such as Amazon Mechanical Turk
                   (AMT) are increasingly used to create speech and
                   language resources. AMT in particular allows
                   researchers to quickly recruit a large number of fairly
                   demographically diverse participants. In this study, we
                   investigated whether AMT can be used for comparing the
                   intelligibility of speech synthesis systems. We
                   conducted two experiments in the lab and via AMT, one
                   comparing US English diphone to US English
                   speaker-adaptive HTS synthesis and one comparing UK
                   English unit selection to UK English speaker-dependent
                   HTS synthesis. While AMT word error rates were worse
                   than lab error rates, AMT results were more sensitive
                   to relative differences between systems. This is mainly
                   due to the larger number of listeners. Boxplots and
                   multilevel modelling allowed us to identify listeners
                   who performed particularly badly, while thresholding
                   was sufficient to eliminate rogue workers. We conclude
                   that AMT is a viable platform for synthetic speech
                   intelligibility comparisons.},
  categories = {intelligibility, evaluation, semantically
                   unpredictable sentences, diphone, unit selection,
                   crowdsourcing, Mechanical Turk, HMM-based synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wolters-ssw2010.pdf},
  year = 2010
}
@inproceedings{gotoh-icassp00,
  author = {Y.~Gotoh and S.~Renals},
  title = {Variable word rate n-grams},
  booktitle = {Proc IEEE ICASSP},
  pages = {1591--1594},
  address = {Istanbul},
  abstract = {The rate of occurrence of words is not uniform but
                   varies from document to document. Despite this
                   observation, parameters for conventional n-gram
                   language models are usually derived using the
                   assumption of a constant word rate. In this paper we
                   investigate the use of a variable word rate assumption,
                   modelled by a Poisson distribution or a continuous
                   mixture of Poissons. We present an approach to
                   estimating the relative frequencies of words or n-grams
                   taking prior information of their occurrences into
                   account. Discounting and smoothing schemes are also
                   considered. Using the Broadcast News task, the approach
                   demonstrates a reduction in perplexity of up to 10\%.},
  categories = {stobs,lm,bnews,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icassp2000.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icassp2000.ps.gz},
  year = 2000
}
@inproceedings{renals-fase88,
  author = {S.~Renals and R.~Rohwer and M.~Terry},
  title = {A comparison of speech recognition front ends using a
                   connectionist classifier},
  booktitle = {Proc. FASE Speech '88},
  pages = {1381--1388},
  address = {Edinburgh},
  categories = {},
  year = 1988
}
@article{wrigley-sap05,
  author = {S. J. Wrigley and G. J. Brown and V. Wan and S. Renals},
  title = {Speech and crosstalk detection in multi-channel audio},
  journal = {IEEE Trans. on Speech and Audio Processing},
  volume = {13},
  pages = {84--91},
  abstract = {The analysis of scenarios in which a number of
                   microphones record the activity of speakers, such as in
                   a roundtable meeting, presents a number of
                   computational challenges. For example, if each
                   participant wears a microphone, it can receive speech
                   from both the microphone's wearer (local speech) and
                   from other participants (crosstalk). The recorded audio
                   can be broadly classified in four ways: local speech,
                   crosstalk plus local speech, crosstalk alone and
                   silence. We describe two experiments related to the
                   automatic classification of audio into these four
                   classes. The first experiment attempted to optimise a
                   set of acoustic features for use with a Gaussian
                   mixture model (GMM) classifier. A large set of
                   potential acoustic features was considered, some of
                   which have been employed in previous studies. The
                   best-performing features were found to be kurtosis,
                   fundamentalness and cross-correlation metrics. The
                   second experiment used these features to train an
                   ergodic hidden Markov model classifier. Tests performed
                   on a large corpus of recorded meetings show
                   classification accuracies of up to 96\%, and automatic
                   speech recognition performance close to that obtained
                   using ground truth segmentation.},
  categories = {m4,meetings,edinburgh,asr,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap04-xtalk.pdf},
  year = 2005
}
@article{yamagishi2009,
  author = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga
                   and Ling, Zhenhua and Toda, Tomoki and Tokuda, Keiichi
                   and King, Simon and Renals, Steve},
  title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech
                   Synthesis},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {17},
  number = {6},
  pages = {1208--1230},
  abstract = {This paper describes a speaker-adaptive HMM-based
                   speech synthesis system. The new system, called
                   ``HTS-2007,'' employs speaker adaptation (CSMAPLR+MAP),
                   feature-space adaptive training, mixed-gender modeling,
                   and full-covariance modeling using CSMAPLR transforms,
                   in addition to several other techniques that have
                   proved effective in our previous systems. Subjective
                   evaluation results show that the new system generates
                   significantly better quality synthetic speech than
                   speaker-dependent approaches with realistic amounts of
                   speech data, and that it bears comparison with
                   speaker-dependent approaches even when large amounts of
                   speech data are available. In addition, a comparison
                   study with several speech synthesis techniques shows
                   the new system is very robust: It is able to build
                   voices from less-than-ideal speech data and synthesize
                   good-quality speech even for out-of-domain sentences.},
  pdf = {},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5109758&arnumber=5153555&count=14&index=12},
  year = 2009
}
@inproceedings{zwyssig2012determining,
  author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
  title = {Determining the number of speakers in a meeting using
                   microphone array features},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2012
                   IEEE International Conference on},
  pages = {4765--4768},
  year = 2012
}
@inproceedings{neto-eurospeech95,
  author = {J.~Neto and L.~Almeida and M.~Hochberg and C.~Martins
                   and L.~Nunes and S.~Renals and T.~Robinson},
  title = {Speaker adaptation for hybrid {HMM--ANN} continuous
                   speech recognition system},
  booktitle = {Proc. Eurospeech},
  pages = {2171--2174},
  address = {Madrid},
  abstract = {It is well known that recognition performance degrades
                   significantly when moving from a speaker-dependent to
                   a speaker-independent system. Traditional hidden Markov
                   model (HMM) systems have successfully applied
                   speaker-adaptation approaches to reduce this
                   degradation. In this paper we present and evaluate some
                   techniques for speaker-adaptation of a hybrid
                   HMM-artificial neural network (ANN) continuous speech
                   recognition system. These techniques are applied to a
                   well trained, speaker-independent, hybrid HMM-ANN
                   system and the recognizer parameters are adapted to a
                   new speaker through off-line procedures. The techniques
                   are evaluated on the DARPA RM corpus using varying
                   amounts of adaptation material and different ANN
                   architectures. The results show that speaker-adaptation
                   within the hybrid framework can substantially improve
                   system performance.},
  categories = {wernicke,rm,recognition,am,hybrid,adaptation,sheffield,cambridge},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/eurosp95.ps.gz},
  year = 1995
}
@incollection{renals2010,
  author = {Renals, Steve and King, Simon},
  title = {Automatic Speech Recognition},
  booktitle = {Handbook of Phonetic Sciences},
  publisher = {Wiley Blackwell},
  editor = {Hardcastle, William J. and Laver, John and Gibbon,
                   Fiona E.},
  chapter = {22},
  year = 2010
}
@article{christensen2008,
  author = {Christensen, Heidi and Gotoh, Yoshihiko and Renals,
                   Steve},
  title = {A Cascaded Broadcast News Highlighter},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {16},
  pages = {151--161},
  abstract = {This paper presents a fully automatic news skimming
                   system which takes a broadcast news audio stream and
                   provides the user with the segmented, structured and
                   highlighted transcript. This constitutes a system with
                   three different, cascading stages: converting the audio
                   stream to text using an automatic speech recogniser,
                   segmenting into utterances and stories and finally
                   determining which utterance should be highlighted using
                   a saliency score. Each stage must operate on the
                   erroneous output from the previous stage in the system,
                   an effect which is naturally amplified as the data
                   progresses through the processing stages. We present a
                   large corpus of transcribed broadcast news data
                   enabling us to investigate to what degree information
                   worth highlighting survives this cascading of
                   processes. Both extrinsic and intrinsic experimental
                   results indicate that mistakes in the story boundary
                   detection have a strong impact on the quality of
                   highlights, whereas erroneous utterance boundaries
                   cause only minor problems. Further, the difference in
                   transcription quality does not affect the overall
                   performance greatly.},
  doi = {10.1109/TASL.2007.910746},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/christensen-tasl08.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4407525&arnumber=4383075&count=28&index=16},
  year = 2008
}
@inproceedings{koumpis-eurospeech03,
  author = {K.~Koumpis and S.~Renals},
  title = {Multi-class Extractive Voicemail Summarization},
  booktitle = {Proc. Eurospeech},
  pages = {2785--2788},
  abstract = {This paper is about a system that extracts principal
                   content words from speech-recognized transcripts of
                   voicemail messages and classifies them into proper
                   names, telephone numbers, dates/times and `other'. The
                   short text summaries generated are suitable for mobile
                   messaging applications. The system uses a set of
                   classifiers to identify the summary words, with each
                   word being identified by a vector of lexical and
                   prosodic features. The features are selected using
                   Parcel, an ROC-based algorithm. We visually compare the
                   role of a large number of individual features and
                   discuss effective ways to combine them. We finally
                   evaluate their performance on manual and automatic
                   transcriptions derived from two different speech
                   recognition systems.},
  categories = {voicemail,summarization,prosody,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/eurospeech03-voicemail.pdf},
  year = 2003
}
@incollection{huang2008-mlmi,
  author = {Songfang Huang and Steve Renals},
  title = {Modeling Topic and Role Information in Meetings using
                   the Hierarchical {D}irichlet Process},
  booktitle = {Machine Learning for Multimodal Interaction V},
  publisher = {Springer},
  editor = {Popescu-Belis, A. and Stiefelhagen, R.},
  volume = {5237},
  series = {Lecture Notes in Computer Science},
  pages = {214--225},
  abstract = {In this paper, we address the modeling of topic and
                   role information in multiparty meetings, via a
                   nonparametric Bayesian model called the hierarchical
                   Dirichlet process. This model provides a powerful
                   solution to topic modeling and a flexible framework for
                   the incorporation of other cues such as speaker role
                   information. We present our modeling framework for
                   topic and role on the AMI Meeting Corpus, and
                   illustrate the effectiveness of the approach in the
                   context of adapting a baseline language model in a
                   large-vocabulary automatic speech recognition system
                   for multiparty meetings. The adapted LM produces
                   significant improvements in terms of both perplexity
                   and word error rate.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/mlmi08.pdf},
  year = 2008
}
@inproceedings{wan-icassp02,
  author = {V.~Wan and S.~Renals},
  title = {Evaluation of Kernel Methods for Speaker Verification
                   and Identification},
  booktitle = {Proc IEEE ICASSP},
  pages = {669--672},
  abstract = {Support vector machines are evaluated on speaker
                   verification and speaker identification tasks. We
                   compare the polynomial kernel, the Fisher kernel, a
                   likelihood ratio kernel and the pair hidden Markov
                   model kernel with baseline systems based on a
                   discriminative polynomial classifier and generative
                   Gaussian mixture model classifiers. Simulations were
                   carried out on the YOHO database and some promising
                   results were obtained.},
  categories = {verification,kernel,svm,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/icassp02-svm.pdf},
  year = 2002
}
@article{vipperla2010,
  author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
                   Joe},
  title = {Ageing voices: The effect of changes in voice
                   parameters on {ASR} performance},
  journal = {EURASIP Journal on Audio, Speech, and Music Processing},
  abstract = {With ageing, human voices undergo several changes
                   which are typically characterized by increased
                   hoarseness and changes in articulation patterns. In
                   this study, we have examined the effect on Automatic
                   Speech Recognition (ASR) and found that the Word Error
                   Rates (WER) on older voices are about 9\% absolute
                   higher than those of adult voices. Subsequently,
                   we compared several voice source parameters including
                   fundamental frequency, jitter, shimmer, harmonicity and
                   cepstral peak prominence of adult and older males.
                   Several of these parameters show statistically
                   significant difference for the two groups. However,
                   artificially increasing jitter and shimmer measures does
                   not affect ASR accuracy significantly.
                   Artificially lowering the fundamental frequency
                   degrades the ASR performance marginally but this drop
                   in performance can be overcome to some extent using
                   Vocal Tract Length Normalisation (VTLN). Overall, we
                   observe that the changes in the voice source parameters
                   do not have a significant impact on ASR performance.
                   Comparison of the likelihood scores of all the phonemes
                   for the two age groups shows that there is a systematic
                   mismatch in the acoustic space of the two age groups.
                   Comparison of the phoneme recognition rates shows that
                   mid vowels, nasals and phonemes that depend on the
                   ability to create constrictions with the tongue tip for
                   articulation are more affected by ageing than other
                   phonemes.},
  doi = {10.1155/2010/525783},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-eurasip10.pdf},
  url = {http://dx.doi.org/10.1155/2010/525783},
  year = 2010
}
@inproceedings{cabral_yrwst,
  author = {J. Cabral and S. Renals and K. Richmond and J.
                   Yamagishi},
  title = {{HMM}-based Speech Synthesis with an Acoustic Glottal
                   Source Model},
  booktitle = {Proc. The First Young Researchers Workshop in Speech
                   Technology},
  abstract = {A major cause of degradation of speech quality in
                   HMM-based speech synthesis is the use of a simple delta
                   pulse signal to generate the excitation of voiced
                   speech. This paper describes a new approach to using an
                   acoustic glottal source model in HMM-based
                   synthesisers. The goal is to improve speech quality and
                   parametric flexibility to better model and transform
                   voice characteristics.},
  categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral
                   Separation},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf},
  year = 2009
}
@inproceedings{renals-icslp94,
  author = {S.~Renals and M.~Hochberg},
  title = {Using {Gamma} filters to model temporal dependencies
                   in speech},
  booktitle = {Proc. ICSLP},
  pages = {1491--1494},
  address = {Yokohama},
  categories = {},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/icslp94-gamma.ps.gz},
  year = 1994
}
@inproceedings{murray2007-interspeech,
  author = {Murray, Gabriel and Renals, Steve},
  title = {Towards online speech summarization},
  booktitle = {Proc. Interspeech '07},
  abstract = {The majority of speech summarization research has
                   focused on extracting the most informative dialogue
                   acts from recorded, archived data. However, a
                   potential use case for speech summarization in the
                   meetings domain is to facilitate a meeting in progress
                   by providing the participants - whether they are
                   attending in-person or remotely - with an indication
                   of the most important parts of the discussion so far.
                   This requires being able to determine whether a
                   dialogue act is extract-worthy before the global
                   meeting context is available. This paper introduces a
                   novel method for weighting dialogue acts using only
                   very limited local context, and shows that high
                   summary precision is possible even when information
                   about the meeting as a whole is lacking. A new
                   evaluation framework consisting of weighted precision,
                   recall and f-score is detailed, and the novel online
                   summarization method is shown to significantly increase
                   recall and f-score compared with a method using no
                   contextual information.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/IS070966.PDF},
  year = 2007
}
@incollection{renals2010a,
  author = {Renals, Steve and Hain, Thomas},
  title = {Speech Recognition},
  booktitle = {Handbook of Computational Linguistics and Natural
                   Language Processing},
  publisher = {Wiley Blackwell},
  editor = {Clark, Alex and Fox, Chris and Lappin, Shalom},
  year = 2010
}
@incollection{karlsen-casa97,
  author = {B.~L.~Karlsen and G.~J.~Brown and M.~Cooke and
                   P.~Green and S.~Renals},
  title = {Analysis of a simultaneous speaker sound corpus},
  booktitle = {Computational Auditory Scene Analysis},
  publisher = {Lawrence Erlbaum Associates},
  editor = {D.~F.~Rosenthal and H.~G.~Okuno},
  pages = {321--334},
  categories = {},
  year = 1997
}
@article{lu_spl_2011,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Regularized Subspace Gaussian Mixture Models for Speech
                   Recognition},
  journal = {IEEE Signal Processing Letters},
  volume = {18},
  number = {7},
  pages = {419--422},
  abstract = {Subspace Gaussian mixture models (SGMMs) provide a
                   compact representation of the Gaussian parameters in an
                   acoustic model, but may still suffer from over-fitting
                   with insufficient training data. In this letter, the
                   SGMM state parameters are estimated using a penalized
                   maximum-likelihood objective, based on $\ell_1$ and
                   $\ell_2$ regularization, as well as their combination,
                   referred to as the elastic net, for robust model
                   estimation. Experiments on the 5000-word Wall Street
                   Journal transcription task show word error rate
                   reduction and improved model robustness with
                   regularization.},
  categories = {Acoustic Modelling, Regularization, Sparsity, Subspace
                   Gaussian Mixture Model},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-spl-2011.pdf},
  year = 2011
}
@article{goldman2005,
  author = {Jerry Goldman and Steve Renals and Steven Bird and
                   Franciska {de Jong} and Marcello Federico and Carl
                   Fleischhauer and Mark Kornbluh and Lori Lamel and Doug
                   Oard and Clare Stewart and Richard Wright},
  title = {Accessing the spoken word},
  journal = {International Journal of Digital Libraries},
  volume = 5,
  number = 4,
  pages = {287--298},
  abstract = {Spoken word audio collections cover many domains,
                   including radio and television broadcasts, oral
                   narratives, governmental proceedings, lectures, and
                   telephone conversations. The collection, access and
                   preservation of such data are stimulated by political,
                   economic, cultural and educational needs. This paper
                   outlines the major issues in the field, reviews the
                   current state of technology, examines the rapidly
                   changing policy issues relating to privacy and
                   copyright, and presents issues relating to the
                   collection and preservation of spoken audio content.},
  categories = {swag,asr,ir,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/swag-ijdl05.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/swag-ijdl05.ps.gz},
  year = 2005
}
@inproceedings{hifny-interspeech05,
  author = {Y. Hifny and S. Renals and N. Lawrence},
  title = {A Hybrid {MaxEnt/HMM} based {ASR} System},
  booktitle = {Proc. Interspeech},
  abstract = {The aim of this work is to develop a practical
                   framework, which extends the classical Hidden Markov
                   Models (HMM) for continuous speech recognition based on
                   the Maximum Entropy (MaxEnt) principle. The MaxEnt
                   models can estimate the posterior probabilities
                   directly as with Hybrid NN/HMM connectionist speech
                   recognition systems. In particular, a new acoustic
                   modelling approach based on discriminative MaxEnt models is
                   formulated and is being developed to replace the
                   generative Gaussian Mixture Models (GMM) commonly used
                   to model acoustic variability. Initial experimental
                   results using the TIMIT phone task are reported.},
  categories = {ml,asr,edinburgh,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hifny-eurospeech05.pdf},
  year = 2005
}
@incollection{dielmann-mlmi04,
  author = {A. Dielmann and S. Renals},
  title = {Multistream dynamic {Bayesian} network for meeting
                   segmentation},
  booktitle = {Proc. Multimodal Interaction and Related Machine
                   Learning Algorithms Workshop (MLMI--04)},
  publisher = {Springer},
  editor = {S. Bengio and H. Bourlard},
  pages = {76--86},
  abstract = {This paper investigates the automatic analysis and
                   segmentation of meetings. A meeting is analysed in
                   terms of individual behaviours and group interactions,
                   in order to decompose each meeting into a sequence of
                   relevant phases, named meeting actions. Three feature
                   families are extracted from multimodal recordings:
                   prosody from individual lapel microphone signals,
                   speaker activity from microphone array data and lexical
                   features from textual transcripts. A statistical
                   approach is then used to relate low-level features with
                   a set of abstract categories. In order to provide a
                   flexible and powerful framework, we have employed a
                   dynamic Bayesian network based model, characterized by
                   multiple stream processing and flexible state duration
                   modelling. Experimental results demonstrate the
                   strength of this system, providing a meeting action
                   error rate of 9\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.ps.gz},
  year = 2005
}
@inproceedings{zwyssig2012effect,
  author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
  title = {{On the effect of SNR and superdirective beamforming
                   in speaker diarisation in meetings}},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2012
                   IEEE International Conference on},
  pages = {4177--4180},
  year = 2012
}
@inproceedings{wan-icassp03,
  author = {V.~Wan and S.~Renals},
  title = {{SVMSVM}: Support vector machine speaker verification
                   methodology},
  booktitle = {Proc. IEEE ICASSP},
  volume = {2},
  pages = {221--224},
  abstract = {Support vector machines with the Fisher and
                   score-space kernels are used for text independent
                   speaker verification to provide direct discrimination
                   between complete utterances. This is unlike approaches
                   such as discriminatively trained Gaussian mixture
                   models or other discriminative classifiers that
                   discriminate at the frame-level only. Using the
                   sequence-level discrimination approach we are able to
                   achieve error-rates that are significantly better than
                   the current state-of-the-art on the PolyVar database.},
  categories = {verification,kernel,svm,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-svm.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-svm.ps.gz},
  year = 2003
}
@inproceedings{dielmann-mmsp04,
  author = {A. Dielmann and S. Renals},
  title = {Multi-stream segmentation of meetings},
  booktitle = {Proc. IEEE Workshop on Multimedia Signal Processing},
  pages = {},
  abstract = {This paper investigates the automatic segmentation of
                   meetings into a sequence of group actions or phases.
                   Our work is based on a corpus of multiparty meetings
                   collected in a meeting room instrumented with video
                   cameras, lapel microphones and a microphone array. We
                   have extracted a set of feature streams, in this case
                   extracted from the audio data, based on speaker turns,
                   prosody and a transcript of what was spoken. We have
                   related these signals to the higher level semantic
                   categories via a multistream statistical model based on
                   dynamic Bayesian networks (DBNs). We report on a set of
                   experiments in which different DBN architectures are
                   compared, together with the different feature streams.
                   The resultant system has an action error rate of 9\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.ps.gz},
  year = 2004
}
@inproceedings{christensen-asru03,
  author = {H. Christensen and Y. Gotoh and B. Kolluru and S.
                   Renals},
  title = {Are extractive text summarisation techniques portable
                   to broadcast news?},
  booktitle = {Proc. IEEE Automatic Speech Recognition and
                   Understanding Workshop},
  pages = {},
  abstract = {In this paper we report on a series of experiments
                   which compare the effect of individual features on both
                   text and speech summarisation, the effect of basing the
                   speech summaries on automatic speech recognition
                   transcripts with varying word error rates, and the
                   effect of summarisation approach and transcript source
                   on summary quality. We show that classical text
                   summarisation features (based on stylistic and content
                   information) are portable to broadcast news. However,
                   the quality of the speech transcripts as well as the
                   difference in information structure between broadcast
                   and newspaper news affects the usability of the
                   individual features.},
  categories = {s3l,summarization,bnews,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-portable.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-portable.ps.gz},
  year = 2003
}
@inproceedings{cabral2011a,
  author = {Cabral, J.P. and Renals, S. and Yamagishi, J. and
                   Richmond, K.},
  title = {{HMM}-based speech synthesiser using the {LF}-model of
                   the glottal source},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
                   IEEE International Conference on},
  pages = {4704--4707},
  abstract = {A major factor which causes a deterioration in speech
                   quality in {HMM}-based speech synthesis is the use of a
                   simple delta pulse signal to generate the excitation of
                   voiced speech. This paper sets out a new approach to
                   using an acoustic glottal source model in HMM-based
                   synthesisers instead of the traditional pulse signal.
                   The goal is to improve speech quality and to better
                   model and transform voice characteristics. We have
                   found the new method decreases buzziness and also
                   improves prosodic modelling. A perceptual evaluation
                   has supported this finding by showing a 55.6\%
                   preference for the new system, as against the baseline.
                   This improvement, while not being as significant as we
                   had initially expected, does encourage us to work on
                   developing the proposed speech synthesiser further.},
  categories = {HMM-based speech synthesiser;acoustic glottal source
                   model LF-model;delta pulse signal;perceptual
                   evaluation;prosodic modelling;speech quality;voiced
                   speech generation;hidden Markov models;speech
                   synthesis;},
  doi = {10.1109/ICASSP.2011.5947405},
  issn = {1520-6149},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
  year = 2011
}
@incollection{robinson-yellowbook96,
  author = {T.~Robinson and M.~Hochberg and S.~Renals},
  title = {The use of recurrent networks in continuous speech
                   recognition},
  booktitle = {Automatic Speech and Speaker Recognition -- Advanced
                   Topics},
  publisher = {Kluwer Academic Publishers},
  editor = {C.-H.~Lee and K.~K.~Paliwal and F.~K.~Soong},
  pages = {233--258},
  abstract = {This chapter describes a use of recurrent neural
                   networks (i.e., feedback is incorporated in the
                   computation) as an acoustic model for continuous speech
                   recognition. The form of the recurrent neural network
                   is described, along with an appropriate parameter
                   estimation procedure. For each frame of acoustic data,
                   the recurrent network generates an estimate of the
                   posterior probability of the possible phones given the
                   observed acoustic signal. The posteriors are then
                   converted into scaled likelihoods and used as the
                   observation probabilities within a conventional
                   decoding paradigm (e.g., Viterbi decoding). The
                   advantages of using recurrent networks are that
                   they require a small number of parameters and provide a
                   fast decoding capability (relative to conventional
                   large vocabulary HMM systems).},
  categories = {wernicke,sprach,wsj,recognition,am,hybrid,abbot,search,sheffield},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/rnn4csr96.ps.gz},
  year = 1996
}
@inproceedings{renals-ieeann89,
  author = {S.~Renals and R.~Rohwer},
  title = {Neural networks for speech pattern classification},
  booktitle = {IEE Conference Publication 313, 1st IEE Conference on
                   Artificial Neural Networks},
  pages = {292--296},
  address = {London},
  categories = {},
  year = 1989
}
@incollection{murray2007-mlmi,
  author = {Murray, Gabriel and Renals, Steve},
  title = {Term-weighting for summarization of multi-party spoken
                   dialogues},
  booktitle = {Machine Learning for Multimodal Interaction IV},
  publisher = {Springer},
  editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
  volume = {4892},
  series = {Lecture Notes in Computer Science},
  pages = {155--166},
  abstract = {This paper explores the issue of term-weighting in the
                   genre of spontaneous, multi-party spoken dialogues,
                   with the intent of using such term-weights in the
                   creation of extractive meeting summaries. The field of
                   text information retrieval has yielded many
                   term-weighting techniques to import for our purposes;
                   this paper implements and compares several of these,
                   namely tf.idf, Residual IDF and Gain. We propose that
                   term-weighting for multi-party dialogues can exploit
                   patterns in word usage among participant speakers,
                   and introduce the su.idf metric as one attempt to do
                   so. Results for all metrics are reported on both manual
                   and automatic speech recognition (ASR) transcripts, and
                   on both the ICSI and AMI meeting corpora.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/48920155.pdf},
  year = 2007
}
@article{wan-sap05,
  author = {V. Wan and S. Renals},
  title = {Speaker verification using sequence discriminant
                   support vector machines},
  journal = {IEEE Trans. on Speech and Audio Processing},
  volume = {13},
  pages = {203--210},
  abstract = {This paper presents a text-independent speaker
                   verification system using support vector machines
                   (SVMs) with score-space kernels. Score-space kernels
                   generalize Fisher kernels and are based on an
                   underlying generative model, such as a Gaussian mixture
                   model (GMM). This approach provides direct
                   discrimination between whole sequences, in contrast to
                   the frame-level approaches at the heart of most current
                   systems. The resultant SVMs have a very high
                   dimensionality, since it is related to the number of
                   parameters in the underlying generative model. To
                   ameliorate problems that can arise in the resultant
                   optimization, we introduce a technique called spherical
                   normalization that preconditions the Hessian matrix. We
                   have performed speaker verification experiments using
                   the PolyVar database. The SVM system presented here
                   reduces the relative error rates by 34\% compared to a
                   GMM likelihood ratio system.},
  categories = {verification,kernel,svm,edinburgh,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap05-svm.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap05-svm.ps.gz},
  year = 2005
}
@article{williams-csl99,
  author = {G.~Williams and S.~Renals},
  title = {Confidence measures from local posterior probability
                   estimates},
  journal = {Computer Speech and Language},
  volume = {13},
  pages = {395--411},
  abstract = {In this paper we introduce a set of related confidence
                   measures for large vocabulary continuous speech
                   recognition (LVCSR) based on local phone posterior
                   probability estimates output by an acceptor HMM
                   acoustic model. In addition to their computational
                   efficiency, these confidence measures are attractive as
                   they may be applied at the state-, phone-, word- or
                   utterance-levels, potentially enabling discrimination
                   between different causes of low confidence recognizer
                   output, such as unclear acoustics or mismatched
                   pronunciation models. We have evaluated these
                   confidence measures for utterance verification using a
                   number of different metrics. Experiments reveal several
                   trends in `profitability of rejection', as measured by
                   the unconditional error rate of a hypothesis test.
                   These trends suggest that crude pronunciation models
                   can mask the relatively subtle reductions in confidence
                   caused by out-of-vocabulary (OOV) words and
                   disfluencies, but not the gross model mismatches
                   elicited by non-speech sounds. The observation that a
                   purely acoustic confidence measure can provide improved
                   performance over a measure based upon both acoustic and
                   language model information for data drawn from the
                   Broadcast News corpus, but not for data drawn from the
                   North American Business News corpus suggests that the
                   quality of model fit offered by a trigram language
                   model is reduced for Broadcast News data. We also argue
                   that acoustic confidence measures may be used to inform
                   the search for improved pronunciation models.},
  categories = {recognition,conf,hybrid,bnews,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/csl99-preprint.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/csl99-preprint.ps.gz},
  year = 1999
}
@inproceedings{abdelhaleem-icassp04,
  author = {Y. H. Abdel-Haleem and S. Renals and N. D. Lawrence},
  title = {Acoustic space dimensionality selection and
                   combination using the maximum entropy principle},
  booktitle = {Proc. IEEE ICASSP},
  pages = {},
  abstract = {In this paper we propose a discriminative approach to
                   acoustic space dimensionality selection based on
                   maximum entropy modelling. We form a set of constraints
                   by composing the acoustic space with the space of phone
                   classes, and use a continuous feature formulation of
                   maximum entropy modelling to select an optimal feature
                   set. The suggested approach has two steps: (1) the
                   selection of the best acoustic space that efficiently
                   and economically represents the acoustic data and its
                   variability; (2) the combination of selected acoustic
                   features in the maximum entropy framework to estimate
                   the posterior probabilities over the phonetic labels
                   given the acoustic input. Specific contributions of
                   this paper include a parameter estimation algorithm
                   (generalized improved iterative scaling) that enables
                   the use of negative features, the parameterization of
                   constraint functions using Gaussian mixture models, and
                   experimental results using the TIMIT database.},
  categories = {ml,maxent,am,recognition,edinburgh,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-me.pdf},
  year = 2004
}
@inproceedings{hsueh2006asm,
  author = {Hsueh, P. and Moore, J. and Renals, S.},
  title = {Automatic Segmentation of Multiparty Dialogue},
  booktitle = {Proc. EACL06},
  abstract = {In this paper, we investigate the problem of
                   automatically predicting segment boundaries in spoken
                   multiparty dialogue. We extend prior work in two ways.
                   We first apply approaches that have been proposed for
                   predicting top-level topic shifts to the problem of
                   identifying subtopic boundaries. We then explore the
                   impact on performance of using ASR output as opposed
                   to human transcription. Examination of the effect of
                   features shows that predicting top-level and
                   predicting subtopic boundaries are two distinct tasks:
                   (1) for predicting subtopic boundaries, the lexical
                   cohesion-based approach alone can achieve competitive
                   results, (2) for predicting top-level boundaries, the
                   machine learning approach that combines
                   lexical-cohesion and conversational features performs
                   best, and (3) conversational cues, such as cue phrases
                   and overlapping speech, are better indicators for the
                   top-level prediction task. We also find that the
                   transcription errors inevitable in ASR output have a
                   negative impact on models that combine
                   lexical-cohesion and conversational features, but do
                   not change the general preference of approach for the
                   two tasks.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/21_1_hsuehmoorerenals.pdf},
  year = 2006
}
@inproceedings{cuayahuitletal_interspeech07,
  author = {Heriberto Cuayáhuitl and Steve Renals and Oliver
                   Lemon and Hiroshi Shimodaira},
  title = {Hierarchical Dialogue Optimization Using Semi-Markov
                   Decision Processes},
  booktitle = {Proc. of INTERSPEECH},
  abstract = {This paper addresses the problem of dialogue
                   optimization on large search spaces. For such a
                   purpose, in this paper we propose to learn dialogue
                   strategies using multiple Semi-Markov Decision
                   Processes and hierarchical reinforcement learning. This
                   approach factorizes state variables and actions in
                   order to learn a hierarchy of policies. Our experiments
                   are based on a simulated flight booking dialogue system
                   and compare flat versus hierarchical reinforcement
                   learning. Experimental results show that the proposed
                   approach produced a dramatic search space reduction
                   (99.36\%), and converged four orders of magnitude
                   faster than flat reinforcement learning with a very
                   small loss in optimality (on average 0.3 system turns).
                   The results also show that the learnt policies
                   outperformed a hand-crafted one under three different
                   ASR confidence conditions. This approach is
                   appealing to dialogue optimization due to faster
                   learning, reusable subsolutions, and scalability to
                   larger problems.},
  categories = {Spoken dialogue systems, semi-Markov decision
                   processes, hierarchical reinforcement learning.},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/SMDPs-interspeech2007.pdf},
  year = 2007
}
@inproceedings{rohwer-icassp88,
  author = {R.~Rohwer and S.~Renals and M.~Terry},
  title = {Unstable connectionist networks in speech recognition},
  booktitle = {Proc. IEEE ICASSP},
  pages = {426--428},
  address = {New York},
  year = 1988
}
@article{renals-jstatphys90,
  author = {S.~Renals and R.~Rohwer},
  title = {A study of network dynamics},
  journal = {J. Stat. Phys.},
  volume = {58},
  pages = {825--847},
  year = 1990
}
@article{carreira-specom98,
  author = {M.~Carreira-Perpiñán and S.~Renals},
  title = {Dimensionality reduction of electropalatographic data
                   using latent variable models},
  journal = {Speech Communication},
  volume = {26},
  pages = {259--282},
  abstract = {We consider the problem of obtaining a reduced
                   dimension representation of electropalatographic (EPG)
                   data. An unsupervised learning approach based on latent
                   variable modelling is adopted, in which an underlying
                   lower dimension representation is inferred directly
                   from the data. Several latent variable models are
                   investigated, including factor analysis and the
                   generative topographic mapping (GTM). Experiments were
                   carried out using a subset of the EUR-ACCOR database,
                   and the results indicate that these automatic methods
                   capture important, adaptive structure in the EPG data.
                   Nonlinear latent variable modelling clearly outperforms
                   the investigated linear models in terms of
                   log-likelihood and reconstruction error and suggests a
                   substantially smaller intrinsic dimensionality for the
                   EPG data than that claimed by previous studies. A
                   two-dimensional representation is produced with
                   applications to speech therapy, language learning and
                   articulatory dynamics.},
  categories = {ml,lv,artic,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/specom98.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/specom98.ps.gz},
  year = 1998
}
@inproceedings{wrigley-eurospeech03,
  author = {S.~Wrigley and G.~Brown and V.~Wan and S. Renals},
  title = {Feature Selection for the Classification of Crosstalk
                   in Multi-Channel Audio},
  booktitle = {Proc. Eurospeech},
  pages = {469--472},
  abstract = {An extension to the conventional speech / nonspeech
                   classification framework is presented for a scenario in
                   which a number of microphones record the activity of
                   speakers present at a meeting (one microphone per
                   speaker). Since each microphone can receive speech from
                   both the participant wearing the microphone (local
                   speech) and other participants (crosstalk), the
                   recorded audio can be broadly classified in four ways:
                   local speech, crosstalk plus local speech, crosstalk
                   alone and silence. We describe a classifier in which a
                   Gaussian mixture model (GMM) is used to model each
                   class. A large set of potential acoustic features are
                   considered, some of which have been employed in
                   previous speech / nonspeech classifiers. A combination
                   of two feature selection algorithms is used to identify
                   the optimal feature set for each class. Results from
                   the GMM classifier using the selected features are
                   superior to those of a previously published approach.},
  categories = {m4,crosstalk,meetings,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/eurospeech03-xtalk.pdf},
  year = 2003
}
@article{hifny2009,
  author = {Hifny, Y. and Renals, S.},
  title = {Speech Recognition Using Augmented Conditional Random
                   Fields},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {17},
  number = {2},
  pages = {354--365},
  abstract = {Acoustic modeling based on hidden Markov models (HMMs)
                   is employed by state-of-the-art stochastic speech
                   recognition systems. Although HMMs are a natural choice
                   to warp the time axis and model the temporal phenomena
                   in the speech signal, their conditional independence
                   properties limit their ability to model spectral
                   phenomena well. In this paper, a new acoustic modeling
                   paradigm based on augmented conditional random fields
                   (ACRFs) is investigated and developed. This paradigm
                   addresses some limitations of HMMs while maintaining
                   many of the aspects which have made them successful. In
                   particular, the acoustic modeling problem is
                   reformulated in a data driven, sparse, augmented space
                   to increase discrimination. Acoustic context modeling
                   is explicitly integrated to handle the sequential
                   phenomena of the speech signal. We present an efficient
                   framework for estimating these models that ensures
                   scalability and generality. In the TIMIT phone
                   recognition task, a phone error rate of 23.0\% was
                   recorded on the full test set, a significant
                   improvement over comparable HMM-based systems.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/hifny2009.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4749447&arnumber=4749472&count=25&index=15},
  year = 2009
}
@inproceedings{gotoh-asr2000,
  author = {Y.~Gotoh and S.~Renals},
  title = {Sentence Boundary Detection in Broadcast Speech
                   Transcripts},
  booktitle = {ISCA ITRW: ASR2000},
  pages = {228--235},
  address = {Paris},
  abstract = {This paper presents an approach to identifying
                   sentence boundaries in broadcast speech transcripts. We
                   describe finite state models that extract sentence
                   boundary information statistically from text and audio
                   sources. An n-gram language model is constructed from a
                   collection of British English news broadcasts and
                   scripts. An alternative model is estimated from pause
                   duration information in speech recogniser outputs
                   aligned with their programme script counterparts.
                   Experimental results show that the pause duration model
                   alone outperforms the language modelling approach, and
                   that combining the two models improves performance
                   further, attaining precision and recall scores of over
                   70\% for the task.},
  categories = {stobs,ie,lm,prosody,bnews,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/asr2000.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/asr2000.ps.gz},
  year = 2000
}
@inproceedings{Murray05b,
  author = {G. Murray and S. Renals and J. Carletta and J. Moore},
  title = {Evaluating Automatic Summaries of Meeting Recordings},
  booktitle = {Proceedings of the 43rd Annual Meeting of the
                   Association for Computational Linguistics, Ann Arbor,
                   MI, USA},
  abstract = {The research below explores schemes for evaluating
                   automatic summaries of business meetings, using the
                   ICSI Meeting Corpus. Both automatic and subjective
                   evaluations were carried out, with a central interest
                   being whether or not the two types of evaluations
                   correlate with each other. The evaluation metrics were
                   used to compare and contrast differing approaches to
                   automatic summarization, the deterioration of summary
                   quality on ASR output versus manual transcripts, and to
                   determine whether manual extracts are rated
                   significantly higher than automatic extracts. },
  categories = {ami,summarization, speech summarization, prosody,
                   latent semantic analysis, summarization evaluation,
                   edinburgh},
  month = jun,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/murray-renals-carletta-moore.pdf},
  year = 2005
}
@inproceedings{williams-escapron98,
  author = {G.~Williams and S.~Renals},
  title = {Confidence measures for evaluating pronunciation
                   models},
  booktitle = {ESCA Workshop on Modeling pronunciation variation for
                   automatic speech recognition},
  pages = {151--155},
  address = {Kerkrade, Netherlands},
  abstract = {In this paper, we investigate the use of confidence
                   measures for the evaluation of pronunciation models and
                   the employment of these evaluations in an automatic
                   baseform learning process. The confidence measures and
                   pronunciation models are obtained from the Abbot hybrid
                   Hidden Markov Model/Artificial Neural Network Large
                   Vocabulary Continuous Speech Recognition system.
                   Experiments were carried out for a number of baseform
                   learning schemes using the ARPA North American Business
                   News and the Broadcast News corpora from which it was
                   found that a confidence measure based scheme provided
                   the largest reduction in Word Error Rate.},
  categories = {recognition,conf,hybrid,abbot,wsj,bnews,pron,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/esca98.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/esca98.ps.gz},
  year = 1998
}
@incollection{murray2008a,
  author = {Murray, Gabriel and Renals, Steve},
  title = {Meta Comments for Summarizing Meeting Speech},
  booktitle = {Machine Learning for Multimodal Interaction (Proc.
                   MLMI '08)},
  publisher = {Springer},
  number = {5237},
  series = {Lecture Notes in Computer Science},
  pages = {236--247},
  abstract = {This paper is about the extractive summarization of
                   meeting speech, using the ICSI and AMI corpora. In the
                   first set of experiments we use prosodic, lexical,
                   structural and speaker-related features to select the
                   most informative dialogue acts from each meeting, with
                   the hypothesis being that such a rich mixture of
                   features will yield the best results. In the second
                   part, we present an approach in which the
                   identification of ``meta-comments'' is used to create
                   more informative summaries that provide an increased
                   level of abstraction. We find that the inclusion of
                   these meta comments improves summarization performance
                   according to several evaluation metrics.},
  doi = {10.1007/978-3-540-85853-9_22},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008a.pdf},
  url = {http://dx.doi.org/10.1007/978-3-540-85853-9_22},
  year = 2008
}
@incollection{al-hames2006-mlmi06,
  author = {Marc Al-Hames and Thomas Hain and Jan Cernocky and
                   Sascha Schreiber and Mannes Poel and Ronald Mueller and
                   Sebastien Marcel and David {van Leeuwen} and Jean-Marc
                   Odobez and Sileye Ba and Hervé Bourlard and Fabien
                   Cardinaux and Daniel Gatica-Perez and Adam Janin and
                   Petr Motlicek and Stephan Reiter and Steve Renals and
                   Jeroen {van Rest} and Rutger Rienks and Gerhard Rigoll
                   and Kevin Smith and Andrew Thean and Pavel Zemcik},
  title = {Audio-video processing in meetings: Seven questions
                   and current {AMI} answers},
  booktitle = {Machine Learning for Multimodal Interaction (Proc.
                   MLMI '06)},
  publisher = {Springer},
  editor = {S. Renals and S. Bengio and J. G. Fiscus},
  volume = {4299},
  series = {Lecture Notes in Computer Science},
  pages = {24--35},
  year = 2006
}
@inproceedings{renals-nnsp91,
  author = {S.~Renals and N.~Morgan and H.~Bourlard},
  title = {Probability estimation by feed-forward networks in
                   continuous speech recognition},
  booktitle = {Proc. IEEE Workshop on Neural Networks for Signal
                   Processing},
  pages = {309--318},
  address = {Princeton NJ},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1991/nnsp91.ps.gz},
  year = 1991
}
@inproceedings{lu2012jud,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {{Joint uncertainty decoding with unscented transform
                   for noise robust subspace Gaussian mixture model}},
  booktitle = {Proc. SAPA-SCALE Workshop},
  abstract = {Common noise compensation techniques use vector Taylor
                   series (VTS) to approximate the mismatch function.
                   Recent work shows that the approximation accuracy may
                   be improved by sampling. One such sampling technique
                   is the unscented transform (UT), which draws samples
                   deterministically from the clean speech and noise
                   models to derive the noise-corrupted speech
                   parameters. This paper applies UT to noise
                   compensation of the subspace Gaussian mixture model
                   (SGMM). Since UT requires a relatively small number of
                   samples for accurate estimation, it has significantly
                   lower computational cost compared to other random
                   sampling techniques. However, the number of surface
                   Gaussians in an SGMM is typically very large, making
                   the direct application of UT, for compensating
                   individual Gaussian components, computationally
                   impractical. In this paper, we avoid the computational
                   burden by employing UT in the framework of joint
                   uncertainty decoding (JUD), which groups all the
                   Gaussian components into a small number of classes,
                   sharing the compensation parameters by class. We
                   evaluate the JUD-UT technique for an SGMM system using
                   the Aurora 4 corpus. Experimental results indicate
                   that UT can lead to increased accuracy compared to VTS
                   approximation if the JUD phase factor is untuned, and
                   to similar accuracy if the phase factor is tuned
                   empirically.},
  keywords = {noise compensation, SGMM, JUD, UT},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-sapa2012.pdf},
  year = 2012
}
@inproceedings{renals-ijcnn92,
  author = {S.~Renals and N.~Morgan and M.~Cohen and H.~Franco and
                   H.~Bourlard},
  title = {Improving statistical speech recognition},
  booktitle = {Proc. IJCNN},
  volume = {2},
  pages = {301--307},
  address = {Baltimore MD},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1992/ijcnn92.ps.gz},
  year = 1992
}
@article{turk:2429,
  author = {Alice Turk and James Scobbie and Christian Geng and
                   Cedric Macmartin and Ellen Bard and Barry Campbell and
                   Catherine Dickie and Eddie Dubourg and Bill Hardcastle
                   and Phil Hoole and Evia Kanaida and Robin Lickley and
                   Satsuki Nakai and Marianne Pouplier and Simon King and
                   Steve Renals and Korin Richmond and Sonja Schaeffler
                   and Ronnie Wiegand and Kevin White and Alan Wrench},
  title = {The {Edinburgh Speech Production Facility's}
                   articulatory corpus of spontaneous dialogue},
  journal = {The Journal of the Acoustical Society of America},
  volume = {128},
  number = {4},
  pages = {2429--2429},
  abstract = {The EPSRC-funded Edinburgh Speech Production Facility
                   is built around two synchronized Carstens AG500
                   electromagnetic articulographs (EMAs) in order to
                   capture articulatory/acoustic data from spontaneous
                   dialogue. An initial articulatory corpus was designed
                   with two aims. The first was to elicit a range of
                   speech styles/registers from speakers, and therefore
                   provide an alternative to fully scripted corpora. The
                   second was to extend the corpus beyond monologue, by
                   using tasks that promote natural discourse and
                   interaction. A subsidiary driver was to use dialects
                   from outwith North America: dialogues paired up a
                   Scottish English and a Southern British English
                   speaker. Tasks. Monologue: story reading of ``Comma
                   Gets a Cure'' [Honorof et al. (2000)], lexical sets
                   [Wells (1982)], spontaneous story telling, and
                   diadochokinetic tasks. Dialogue: map tasks [Anderson
                   et al. (1991)], ``Spot the Difference'' picture tasks
                   [Bradlow et al. (2007)], story recall, and shadowing
                   of the spontaneous story telling by the second
                   participant. Each dialogue session includes
                   approximately 30 min of speech, and there are
                   acoustics-only baseline materials. We will introduce
                   the corpus and highlight the role of articulatory
                   production data in helping provide a fuller
                   understanding of various spontaneous speech phenomena
                   by presenting examples of naturally occurring covert
                   speech errors, accent accommodation, turn-taking
                   negotiation, and shadowing.},
  doi = {10.1121/1.3508679},
  publisher = {ASA},
  year = 2010
}
@inproceedings{cabral07,
  author = {J. Cabral and S. Renals and K. Richmond and J.
                   Yamagishi},
  title = {Towards an Improved Modeling of the Glottal Source in
                   Statistical Parametric Speech Synthesis},
  booktitle = {Proc. of the 6th ISCA Workshop on Speech Synthesis},
  address = {Bonn, Germany},
  abstract = {This paper proposes the use of the Liljencrants-Fant
                   model (LF-model) to represent the glottal source signal
                   in HMM-based speech synthesis systems. These systems
                   generally use a pulse train to model the periodicity of
                   the excitation signal of voiced speech. However, this
                   model produces a strong and uniform harmonic structure
                   throughout the spectrum of the excitation which makes
                   the synthetic speech sound buzzy. The use of a mixed
                   band excitation and phase manipulation reduces this
                   effect but it can result in degradation of the speech
                   quality if the noise component is not weighted
                   carefully. In turn, the LF-waveform has a decaying
                   spectrum at higher frequencies, which is more similar
                   to the real glottal source excitation signal. We
                   conducted a perceptual experiment to test the
                   hypothesis that the LF-model can perform as well as or
                   better than the pulse train in an HMM-based speech
                   synthesizer. In the synthesis, we used the mean values
                   of the LF-parameters, calculated by measurements of the
                   recorded speech. The result of this study is important
                   not only regarding the improvement in speech quality of
                   these types of systems, but also because the LF-model
                   can be used to model many characteristics of the
                   glottal source, such as voice quality, which are
                   important for voice transformation and generation of
                   expressive speech.},
  categories = {LF-model, Statistical parametric speech synthesis,
                   HMM-based speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
  year = 2007
}
@inproceedings{renals-icassp96,
  author = {S.~Renals and M.~Hochberg},
  title = {Efficient evaluation of the {LVCSR} search space using
                   the {NOWAY} decoder},
  booktitle = {Proc. IEEE ICASSP},
  pages = {149--152},
  address = {Atlanta},
  abstract = {This work further develops and analyses the large
                   vocabulary continuous speech recognition search
                   strategy reported at ICASSP-95. In particular, the
                   posterior-based phone deactivation pruning approach has
                   been extended to include phone-dependent thresholds and
                   an improved estimate of the least upper bound on the
                   utterance log-probability has been developed. Analysis
                   of the pruning procedures and of the search's
                   interaction with the language model has also been
                   performed. Experiments were carried out using the ARPA
                   North American Business News task with a 20,000 word
                   vocabulary and a trigram language model. As a result of
                   these improvements and analyses, the computational cost
                   of the recognition process performed by the Noway
                   decoder has been substantially reduced.},
  categories = {wernicke,sprach,recognition,wsj,search,sheffield},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/icassp96.ps.gz},
  year = 1996
}
@inproceedings{koumpis-prosody01,
  author = {K.~Koumpis and S.~Renals},
  title = {The role of prosody in a voicemail summarization
                   system},
  booktitle = {Proc. ISCA Workshop on Prosody in Speech Recognition
                   and Understanding},
  address = {Red Bank, NJ, USA},
  abstract = {When a speaker leaves a voicemail message there are
                   prosodic cues that emphasize the important points in
                   the message, in addition to lexical content. In this
                   paper we compare and visualize the relative
                   contribution of these two types of features within a
                   voicemail summarization system. We describe the
                   system's ability to generate summaries of two test
                   sets, having trained and validated using 700 messages
                   from the IBM Voicemail corpus. Results measuring the
                   quality of summary artifacts show that combined lexical
                   and prosodic features are at least as robust as
                   combined lexical features alone across all operating
                   conditions.},
  categories = {voicemail,summarization,prosody,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-vm.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-vm.ps.gz},
  year = 2001
}
@inproceedings{garau2008a,
  author = {Garau, Giulia and Renals, Steve},
  title = {Pitch adaptive features for {LVCSR}},
  booktitle = {Proc. Interspeech '08},
  abstract = {We have investigated the use of a pitch adaptive
                   spectral representation on large vocabulary speech
                   recognition, in conjunction with speaker normalisation
                   techniques. We have compared the effect of a smoothed
                   spectrogram to the pitch adaptive spectral analysis by
                   decoupling these two components of STRAIGHT.
                   Experiments performed on a large vocabulary meeting
                   speech recognition task highlight the importance of
                   combining a pitch adaptive spectral representation with
                   a conventional fixed window spectral analysis. We found
                   evidence that STRAIGHT pitch adaptive features are more
                   speaker independent than conventional MFCCs without
                   pitch adaptation; thus they also provide better
                   performance when combined using feature combination
                   techniques such as Heteroscedastic Linear Discriminant
                   Analysis.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/garau2008a.pdf},
  year = 2008
}
@article{gotoh-nle99,
  author = {Y.~Gotoh and S.~Renals},
  title = {Topic-based mixture language modelling},
  journal = {Journal of Natural Language Engineering},
  volume = {5},
  pages = {355--375},
  abstract = {This paper describes an approach for constructing a
                   mixture of language models based on simple statistical
                   notions of semantics using probabilistic models
                   developed for information retrieval. The approach
                   encapsulates corpus-derived semantic information and is
                   able to model varying styles of text. Using such
                   information, the corpus texts are clustered in an
                   unsupervised manner and a mixture of topic-specific
                   language models is automatically created. The principal
                   contribution of this work is to characterise the
                   document space resulting from information retrieval
                   techniques and to demonstrate the approach for mixture
                   language modelling. A comparison is made between manual
                   and automatic clustering in order to elucidate how the
                   global content information is expressed in the space.
                   We also compare (in terms of association with manual
                   clustering and language modelling accuracy) alternative
                   term-weighting schemes and the effect of singular
                   value decomposition dimension reduction (latent
                   semantic analysis). Test set perplexity results using
                   the British National Corpus indicate that the approach
                   can improve the potential of statistical language
                   modelling. Using an adaptive procedure, the
                   conventional model may be tuned to track text data with
                   a slight increase in computational cost.},
  categories = {sprach,stobs,lm,bnc,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/jnle99-preprint.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/jnle99-preprint.ps.gz},
  year = 1999
}
@inproceedings{murray-interspeech05,
  author = {G. Murray and S. Renals and J. Carletta},
  title = {Extractive Summarization of Meeting Recordings},
  booktitle = {Proc. Interspeech},
  abstract = {Several approaches to automatic speech summarization
                   are discussed below, using the ICSI Meetings corpus. We
                   contrast feature-based approaches using prosodic and
                   lexical features with maximal marginal relevance and
                   latent semantic analysis approaches to summarization.
                   While the latter two techniques are borrowed directly
                   from the field of text summarization, feature-based
                   approaches using prosodic information are able to
                   utilize characteristics unique to speech data. We also
                   investigate how the summarization results might
                   deteriorate when carried out on ASR output as opposed
                   to manual transcripts. All of the summaries are of an
                   extractive variety, and are compared using the software
                   ROUGE.},
  categories = {ami,summarization,prosody, latent semantic
                   analysis,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/murray-eurospeech05.pdf},
  year = 2005
}
@inproceedings{cuayahuitletal_asru05,
  author = {Heriberto Cuayáhuitl and Steve Renals and Oliver
                   Lemon and Hiroshi Shimodaira},
  title = {Human-Computer Dialogue Simulation Using Hidden Markov
                   Models},
  booktitle = {Proc. of IEEE Workshop on Automatic Speech Recognition
                   and Understanding (ASRU)},
  abstract = {This paper presents a probabilistic method to simulate
                   task-oriented human-computer dialogues at the intention
                   level, that may be used to improve or to evaluate the
                   performance of spoken dialogue systems. Our method uses
                   a network of Hidden Markov Models (HMMs) to predict
                   system and user intentions, where a ``language model''
                   predicts sequences of goals and the component HMMs
                   predict sequences of intentions. We compare standard
                   HMMs, Input HMMs and Input-Output HMMs in an effort to
                   better predict sequences of intentions. In addition, we
                   propose a dialogue similarity measure to evaluate the
                   realism of the simulated dialogues. We performed
                   experiments using the DARPA Communicator corpora and
                   report results with three different metrics: dialogue
                   length, dialogue similarity and precision-recall.},
  categories = {dialogue simulation, hidden markov models},
  month = nov,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hcp-asru2005.pdf},
  year = 2005
}
@inproceedings{kilgour2011,
  author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
  title = {The {Ambient Spotlight}: Personal meeting capture with
                   a microphone array},
  booktitle = {Proc. HSCMA},
  abstract = {We present the Ambient Spotlight system for personal
                   meeting capture based on a portable USB microphone
                   array and a laptop. The system combines distant speech
                   recognition and content linking with personal
                   productivity tools, and enables recognised meeting
                   recordings to be integrated with desktop search,
                   calendar, and email.},
  doi = {10.1109/HSCMA.2011.5942389},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/ambientDemo.pdf},
  year = 2011
}
@inproceedings{renals-icassp89,
  author = {S.~Renals and R.~Rohwer},
  title = {Learning phoneme recognition using neural networks},
  booktitle = {Proc. IEEE ICASSP},
  pages = {413--416},
  address = {Glasgow},
  year = 1989
}
@inproceedings{kershaw-arpa96,
  author = {D.~Kershaw and T.~Robinson and S.~Renals},
  title = {The 1995 {Abbot} hybrid {connectionist--HMM} large
                   vocabulary recognition system},
  booktitle = {Proc. ARPA Spoken Language Technology Conference},
  pages = {93--99},
  categories = {wernicke,sprach,wsj,recognition,am,hybrid,abbot,search,eval,sheffield},
  year = 1996
}
@inproceedings{hochberg-icassp95,
  author = {M.~Hochberg and S.~Renals and T.~Robinson and G.~Cook},
  title = {Recent improvements to the {Abbot} large vocabulary
                   {CSR} system},
  booktitle = {Proc. IEEE ICASSP},
  pages = {69--72},
  address = {Detroit},
  abstract = {ABBOT is the hybrid connectionist-hidden Markov model
                   (HMM) large-vocabulary continuous speech recognition
                   (CSR) system developed at Cambridge University. This
                   system uses a recurrent network to estimate the
                   acoustic observation probabilities within an HMM
                   framework. A major advantage of this approach is that
                   good performance is achieved using context-independent
                   acoustic models and requiring many fewer parameters
                   than comparable HMM systems. This paper presents
                   substantial performance improvements gained from new
                   approaches to connectionist model combination and
                   phone-duration modeling. Additional capability has also
                   been achieved by extending the decoder to handle larger
                   vocabulary tasks (20,000 words and greater) with a
                   trigram language model. This paper describes the recent
                   modifications to the system and experimental results
                   are reported for various test and development sets from
                   the November 1992, 1993, and 1994 ARPA evaluations of
                   spoken language systems.},
  categories = {wernicke,recognition,wsj,am,hybrid,abbot,eval,search,sheffield,cambridge},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/icassp95-abbot.ps.gz},
  year = 1995
}
@article{renals-elett88,
  author = {S.~Renals},
  title = {Radial basis function network for speech pattern
                   classification},
  journal = {Electronics Letters},
  volume = {25},
  pages = {437--439},
  year = 1988
}
@article{koumpis2005-acmslp,
  author = {Konstantinos Koumpis and Steve Renals},
  title = {Automatic summarization of voicemail messages using
                   lexical and prosodic features},
  journal = {ACM Transactions on Speech and Language Processing},
  volume = 2,
  number = 1,
  pages = {1--24},
  abstract = {This paper presents trainable methods for extracting
                   principal content words from voicemail messages. The
                   short text summaries generated are suitable for mobile
                   messaging applications. The system uses a set of
                   classifiers to identify the summary words, with each
                   word being identified by a vector of lexical and
                   prosodic features. We use an ROC-based algorithm,
                   Parcel, to select input features (and classifiers). We
                   have performed a series of objective and subjective
                   evaluations using unseen data from two different speech
                   recognition systems, as well as human transcriptions of
                   voicemail speech.},
  categories = {voicemail,summarization,prosody,sheffield,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/tslp05.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/tslp05.ps.gz},
  year = 2005
}
@inproceedings{huang2008-ptkl,
  author = {Songfang Huang and Steve Renals},
  title = {Using Participant Role in Multiparty Meetings as Prior
                   Knowledge for Nonparametric Topic Modeling},
  booktitle = {Proc. ICML/UAI/COLT Workshop on Prior Knowledge for
                   Text and Language Processing},
  pages = {21--24},
  address = {Helsinki, Finland},
  abstract = {In this paper we introduce our attempts to incorporate
                   the participant role information in multiparty meetings
                   for document modeling using the hierarchical Dirichlet
                   process. The perplexity and automatic speech
                   recognition results demonstrate that the participant
                   role information is a promising prior knowledge source
                   to be combined with language models for automatic
                   speech recognition and interaction modeling for
                   multiparty meetings.},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ptkl.pdf},
  year = 2008
}
@inproceedings{lu2012noise,
  author = {Lu, L. and Chin, KK and Ghoshal, A. and Renals, S.},
  title = {{Noise compensation for subspace Gaussian mixture
                   models}},
  booktitle = {Proc. INTERSPEECH},
  abstract = {Joint uncertainty decoding (JUD) is an effective
                   model-based noise compensation technique for
                   conventional Gaussian mixture model (GMM) based speech
                   recognition systems. In this paper, we apply JUD to
                   subspace Gaussian mixture model (SGMM) based acoustic
                   models. The total number of Gaussians in the SGMM
                   acoustic model is usually much larger than for
                   conventional GMMs, which limits the application of
                   approaches which explicitly compensate each Gaussian,
                   such as vector Taylor series (VTS). However, by
                   clustering the Gaussian components into a number of
                   regression classes, JUD-based noise compensation can be
                   successfully applied to SGMM systems. We evaluate the
                   JUD/SGMM technique using the Aurora 4 corpus, and the
                   experimental results indicate that it is more accurate
                   than conventional GMM-based systems using either VTS or
                   JUD noise compensation.},
  keywords = {acoustic modelling, noise compensation, SGMM, JUD},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-is2012.pdf},
  year = 2012
}
@article{dielmann2007-tmm,
  author = {Dielmann, Alfred and Renals, Steve},
  title = {Automatic meeting segmentation using dynamic
                   {Bayesian} networks},
  journal = {IEEE Transactions on Multimedia},
  volume = {9},
  number = {1},
  pages = {25--36},
  abstract = {Multiparty meetings are a ubiquitous feature of
                   organizations, and there are considerable economic
                   benefits that would arise from their automatic analysis
                   and structuring. In this paper, we are concerned with
                   the segmentation and structuring of meetings (recorded
                   using multiple cameras and microphones) into sequences
                   of group meeting actions such as monologue, discussion
                   and presentation. We outline four families of
                   multimodal features based on speaker turns, lexical
                   transcription, prosody, and visual motion that are
                   extracted from the raw audio and video recordings. We
                   relate these low-level features to more complex group
                   behaviors using a multistream modelling framework based
                   on multistream dynamic Bayesian networks (DBNs). This
                   results in an effective approach to the segmentation
                   problem, achieving an action error rate of 12.2\%,
                   compared with 43\% using an approach based on hidden
                   Markov models. Moreover, the multistream DBN developed
                   here leaves scope for many further improvements and
                   extensions.},
  doi = {10.1109/TMM.2006.886337},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2007/dielmann2007-tmm.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4032598&arnumber=4032608&count=23&index=3},
  year = 2007
}
@inproceedings{robinson-icassp94,
  author = {T.~Robinson and M.~Hochberg and S.~Renals},
  title = {{IPA}: Improved phone modelling with recurrent neural
                   networks},
  booktitle = {Proc. IEEE ICASSP},
  pages = {37--40},
  address = {Adelaide},
  year = 1994
}
@inproceedings{renals-mmsp99,
  author = {S.~Renals and D.~Abberley and D.~Kirby and T.~Robinson},
  title = {The {THISL} System for Indexing and Retrieval of
                   Broadcast News},
  booktitle = {Proc. IEEE Workshop on Multimedia Signal Processing},
  pages = {77--82},
  address = {Copenhagen},
  abstract = {This paper describes the THISL news retrieval system
                   which maintains an archive of BBC radio and television
                   news recordings. The system uses the Abbot large
                   vocabulary continuous speech recognition system to
                   transcribe news broadcasts, and the thislIR text
                   retrieval system to index and access the transcripts.
                   Decoding and indexing is performed automatically, and
                   the archive is updated with three hours of new material
                   every day. A web-based interface to the retrieval
                   system has been devised to facilitate access to the
                   archive.},
  categories = {thisl,bnews,trec,ir,recognition,sheffield},
  http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/mmsp99-54/},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/mmsp99.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/mmsp99.ps.gz},
  year = 1999
}
@article{koumpis2005-spmag,
  author = {Koumpis, Konstantinos and Renals, Steve},
  title = {Content-based access to spoken audio},
  journal = {IEEE Signal Processing Magazine},
  volume = 22,
  number = 5,
  pages = {61--69},
  abstract = {"How analysis, retrieval and delivery phases make
                   spoken audio content more accessible"},
  categories = {asr,ir,summarization,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/koumpis-spm05.pdf},
  year = 2005
}
@inproceedings{kilgour2010a,
  author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
  title = {The {Ambient Spotlight}: Personal multimodal search
                   without query},
  booktitle = {Proc. ICMI-MLMI},
  abstract = {The Ambient Spotlight is a prototype system based on
                   personal meeting capture using a laptop and a portable
                   microphone array. The system automatically recognises
                   and structures the meeting content using automatic
                   speech recognition, topic segmentation and extractive
                   summarisation. The recognised speech in the meeting is
                   used to construct queries to automatically link meeting
                   segments to other relevant material, both multimodal
                   and textual. The interface to the system is constructed
                   around a standard calendar interface, and it is
                   integrated with the laptop's standard indexing, search
                   and retrieval.},
  doi = {10.1145/1891903.1891919},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/ambientDemo-icmi.pdf},
  url = {http://dx.doi.org/10.1145/1891903.1891919},
  year = 2010
}
@inproceedings{AMIMLMI05,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and I. McCowan and D. Moore and
                   V. Wan and R. Ordelman and S. Renals},
  title = {The Development of the {AMI} System for the
                   Transcription of Speech in Meetings},
  booktitle = {2nd Joint Workshop on Multimodal Interaction and
                   Related Machine Learning Algorithms},
  abstract = {The automatic processing of speech collected in
                   conference style meetings has attracted considerable
                   interest with several large scale projects devoted to
                   this area. This paper describes the development of a
                   baseline automatic speech transcription system for
                   meetings in the context of the AMI (Augmented
                   Multiparty Interaction) project. We present several
                   techniques important to the processing of this data and
                   show the performance in terms of word error rates
                   (WERs). An important aspect of transcription of this
                   data is the necessary flexibility in terms of audio
                   pre-processing. Real world systems have to deal with
                   flexible input, for example by using microphone arrays
                   or randomly placed microphones in a room. Automatic
                   segmentation and microphone array processing techniques
                   are described and the effect on WERs is discussed. The
                   system and its components presented in this paper yield
                   competitive performance and form a baseline for future
                   research in this domain.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIsystemMLMI05.pdf},
  year = 2005
}
@inproceedings{uriaIS2012,
  author = {Benigno Uria and Iain Murray and Steve Renals and
                   Korin Richmond},
  title = {Deep Architectures for Articulatory Inversion},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = { We implement two deep architectures for the
                   acoustic-articulatory inversion mapping problem: a deep
                   neural network and a deep trajectory mixture density
                   network. We find that in both cases, deep architectures
                   produce more accurate predictions than shallow
                   architectures and that this is due to the higher
                   expressive capability of a deep model and not a
                   consequence of adding more adjustable parameters. We
                   also find that a deep trajectory mixture density
                   network is able to obtain better inversion accuracies
                   than smoothing the results of a deep neural network.
                   Our best model obtained an average root mean square
                   error of 0.885 mm on the MNGU0 test dataset.},
  categories = {Articulatory inversion, deep neural network, deep
                   belief network, deep regression network, pretraining},
  keywords = {Articulatory inversion, deep neural network, deep
                   belief network, deep regression network, pretraining},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Uria_et_al_IS2012.pdf},
  year = 2012
}
@inproceedings{renals-eurospeech89,
  author = {S.~Renals and J.~Dalby},
  title = {Analysis of a neural network model for speech
                   recognition},
  booktitle = {Proc. Eurospeech},
  volume = {1},
  pages = {333--336},
  address = {Paris},
  year = 1989
}
@inproceedings{murray06b,
  author = {G. Murray and S. Renals and M. Taboada},
  title = {Prosodic Correlates of Rhetorical Relations},
  booktitle = {Proceedings of HLT/NAACL ACTS Workshop, 2006, New York
                   City, USA},
  abstract = {This paper investigates the usefulness of prosodic
                   features in classifying rhetorical relations between
                   utterances in meeting recordings. Five rhetorical
                   relations of \textit{contrast}, \textit{elaboration},
                   \textit{summary}, \textit{question} and \textit{cause}
                   are explored. Three training methods - supervised,
                   unsupervised, and combined - are compared, and
                   classification is carried out using support vector
                   machines. The results of this pilot study are
                   encouraging but mixed, with pairwise classification
                   achieving an average of 68\% accuracy in discerning
                   between relation pairs using only prosodic features,
                   but multi-class classification performing only slightly
                   better than chance.},
  categories = {rhetorical structure theory, prosody, unsupervised
                   learning},
  month = jun,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/dacts-hlt.pdf},
  year = 2006
}
@article{2012E121001,
  author = {Junichi Yamagishi and Christophe Veaux and Simon King
                   and Steve Renals},
  title = {Speech synthesis technologies for individuals with
                   vocal disabilities: Voice banking and reconstruction},
  journal = {Acoustical Science and Technology},
  volume = {33},
  number = {1},
  pages = {1--5},
  url = {http://www.jstage.jst.go.jp/browse/ast/33/1/_contents},
  year = 2012
}
@inproceedings{abberley-esca99,
  author = {D.~Abberley and D.~Kirby and S.~Renals and T.~Robinson},
  title = {The {THISL} broadcast news retrieval system},
  booktitle = {Proc. ESCA Workshop on Accessing Information In Spoken
                   Audio},
  pages = {19--24},
  address = {Cambridge},
  abstract = {This paper describes the THISL spoken document
                   retrieval system for British and North American
                   Broadcast News. The system is based on the
                   \textsc{Abbot} large vocabulary speech recognizer,
                   using a recurrent network acoustic model, and a
                   probabilistic text retrieval system. We discuss the
                   development of a realtime British English Broadcast
                   News system, and its integration into a spoken document
                   retrieval system. Detailed evaluation is performed
                   using a similar North American Broadcast News system,
                   to take advantage of the TREC SDR evaluation
                   methodology. We report results on this evaluation, with
                   particular reference to the effect of query expansion
                   and of automatic segmentation algorithms.},
  categories = {thisl,bnews,trec,ir,recognition,sheffield},
  http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/esca99-thisl/},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-thisl.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-thisl.ps.gz},
  year = 1999
}
@inproceedings{bourlard-icassp92,
  author = {H.~Bourlard and N.~Morgan and C.~Wooters and S.~Renals},
  title = {{CDNN}: A context-dependent neural network for
                   continuous speech recognition},
  booktitle = {Proc. IEEE ICASSP},
  pages = {349--352},
  address = {San Francisco},
  year = 1992
}
@article{dielmann2008,
  author = {Dielmann, Alfred and Renals, Steve},
  title = {Recognition of Dialogue Acts in Multiparty Meetings
                   using a Switching {DBN}},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {16},
  number = {7},
  pages = {1303--1314},
  abstract = {This paper is concerned with the automatic recognition
                   of dialogue acts (DAs) in multiparty conversational
                   speech. We present a joint generative model for DA
                   recognition in which segmentation and classification of
                   DAs are carried out in parallel. Our approach to DA
                   recognition is based on a switching dynamic Bayesian
                   network (DBN) architecture. This generative approach
                   models a set of features, related to lexical content
                   and prosody, and incorporates a weighted interpolated
                   factored language model. The switching DBN coordinates
                   the recognition process by integrating the component
                   models. The factored language model, which is estimated
                   from multiple conversational data corpora, is used in
                   conjunction with additional task-specific language
                   models. In conjunction with this joint generative
                   model, we have also investigated the use of a
                   discriminative approach, based on conditional random
                   fields, to perform a reclassification of the segmented
                   DAs. We have carried out experiments on the AMI corpus
                   of multimodal meeting recordings, using both manually
                   transcribed speech, and the output of an automatic
                   speech recognizer, and using different configurations
                   of the generative model. Our results indicate that the
                   system performs well both on reference and fully
                   automatic transcriptions. A further significant
                   improvement in recognition accuracy is obtained by the
                   application of the discriminative reranking approach
                   based on conditional random fields.},
  doi = {10.1109/TASL.2008.922463},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/dielmann2008.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4599391&arnumber=4497831&count=18&index=9},
  year = 2008
}
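The weighted interpolated language model described above combines estimates from several corpora. A minimal sketch of the interpolation step alone, assuming a simple linear mixture; the factored backoff structure of the actual model is omitted.

def interpolated_prob(word, history, models, weights):
    """Linearly interpolate p(word | history) across several component
    language models (e.g., models estimated from different
    conversational corpora plus task-specific models).

    models:  callables returning p(word | history)
    weights: mixture weights summing to one, typically tuned by EM
             on held-out data (an assumption; the paper's weighting
             procedure is not given in the abstract)
    """
    assert abs(sum(weights) - 1.0) < 1e-9
    return sum(w * m(word, history) for w, m in zip(models, weights))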
@inproceedings{lu_asru_2011,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Regularized Subspace {G}aussian Mixture Models for
                   Cross-lingual Speech Recognition},
  booktitle = {Proc. ASRU},
  abstract = {We investigate cross-lingual acoustic modelling for
                   low resource languages using the subspace Gaussian
                   mixture model (SGMM). We assume the presence of
                   acoustic models trained on multiple source languages,
                   and use the global subspace parameters from those
                   models for improved modelling in a target language with
                   limited amounts of transcribed speech. Experiments on
                   the GlobalPhone corpus using Spanish, Portuguese, and
                   Swedish as source languages and German as target
                   language (with 1 hour and 5 hours of transcribed audio)
                   show that multilingually trained SGMM shared parameters
                   result in lower word error rates (WERs) than using
                   those from a single source language. We also show that
                   regularizing the estimation of the SGMM state vectors
                   by penalizing their $\ell_1$-norm helps to overcome
                   numerical instabilities and leads to a lower WER.},
  categories = {Subspace Gaussian Mixture Model, Cross-lingual, model
                   regularization},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-asru-2011.pdf},
  year = 2011
}
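The $\ell_1$-penalized estimation of the state vectors can be illustrated with a generic proximal-gradient (ISTA) update: each gradient step on the smooth part of the objective is followed by elementwise soft-thresholding. This sketches the general technique only, not the paper's actual optimizer.

import numpy as np

def soft_threshold(v, t):
    """Proximal operator of t * ||.||_1: elementwise soft-thresholding."""
    return np.sign(v) * np.maximum(np.abs(v) - t, 0.0)

def ista(grad_f, v0, lam, step=0.1, n_iter=100):
    """Minimise f(v) + lam * ||v||_1 by proximal gradient descent,
    where grad_f is the gradient of the smooth term f (for SGMMs,
    a negative log-likelihood in the state vector v). Generic
    sketch; step size and iteration count are arbitrary."""
    v = v0.copy()
    for _ in range(n_iter):
        v = soft_threshold(v - step * grad_f(v), step * lam)
    return v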
@inproceedings{abberley-trec00,
  author = {D.~Abberley and S.~Renals and D.~Ellis and T.~Robinson},
  title = {The {THISL} {SDR} system at {TREC}--8},
  booktitle = {Proc. Eighth Text Retrieval Conference (TREC--8)},
  abstract = {This paper describes the participation of the THISL
                   group at the TREC-8 Spoken Document Retrieval (SDR)
                   track. The THISL SDR system consists of the realtime
                   version of the Abbot large vocabulary speech
                   recognition system and the thislIR text retrieval
                   system. The TREC-8 evaluation assessed SDR performance
                   on a corpus of 500 hours of broadcast news material
                   collected over a five month period. The main test
                   condition involved retrieval of stories defined by
                   manual segmentation of the corpus, from which non-news
                   material, such as commercials, was excluded. An
                   optional test condition required retrieval of
                   the same stories from the unsegmented audio stream. The
                   THISL SDR system participated at both test conditions.
                   The results show that a system such as THISL can
                   produce respectable information retrieval performance
                   on a realistically-sized corpus of unsegmented audio
                   material.},
  categories = {thisl,bnews,trec,ir,recognition,eval,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/trec8.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/trec8.ps.gz},
  year = 2000
}
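For the unsegmented condition, a standard approach at TREC-8 SDR was to index overlapping fixed-length windows of the running transcription as pseudo-documents, merging overlapping hits after retrieval; whether THISL used exactly this scheme is not stated in the abstract. A toy sketch:

def window_transcript(words, window=30, overlap=15):
    """Split a continuous word stream into overlapping pseudo-documents
    for retrieval from unsegmented audio. Window and overlap sizes
    are illustrative assumptions."""
    step = window - overlap
    return [words[i:i + window]
            for i in range(0, max(len(words) - overlap, 1), step)]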
@inproceedings{hochberg-nnsp94,
  author = {M.~Hochberg and G.~Cook and S.~Renals and T.~Robinson},
  title = {Connectionist model combination for large vocabulary
                   speech recognition},
  booktitle = {IEEE Proc. Neural Networks for Signal Processing},
  volume = {4},
  pages = {269--278},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/nnsp94.ps.gz},
  year = 1994
}
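The title refers to combining frame-level phone posterior estimates from several connectionist acoustic models. One common combination rule is a weighted average in the log-probability domain followed by renormalisation; treating this as the paper's exact rule is an assumption.

import numpy as np

def combine_posteriors(posteriors, weights=None):
    """Combine per-frame phone posteriors from several connectionist
    acoustic models by weighted log-domain averaging, then
    renormalise each frame.

    posteriors: list of arrays, each of shape (n_frames, n_phones)
    """
    if weights is None:
        weights = np.full(len(posteriors), 1.0 / len(posteriors))
    # Small floor avoids log(0) for near-zero posterior estimates.
    log_avg = sum(w * np.log(p + 1e-10) for w, p in zip(weights, posteriors))
    combined = np.exp(log_avg)
    return combined / combined.sum(axis=1, keepdims=True)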
@inproceedings{renals-darpa99,
  author = {S.~Renals and Y.~Gotoh and R.~Gaizauskas and
                   M.~Stevenson},
  title = {The {SPRACH/LaSIE} system for named entity
                   identification in broadcast news},
  booktitle = {Proc. DARPA Broadcast News Workshop},
  pages = {47--50},
  abstract = {We have developed two conceptually different systems
                   that are able to identify named entities from spoken
                   audio. One (referred to as SPRACH-S) has a stochastic
                   finite state machine structure for use with an acoustic
                   model that identifies both words and named entities
                   from speech data. The other (referred to as SPRACH-R)
                   is a rule-based system which uses matching against
                   stored name lists, part-of-speech tagging, and light
                   phrasal parsing with specialised named entity grammars.
                   We provide an overview of the two approaches and
                   present results on the Hub-4E IE-NE evaluation task.},
  categories = {sprach,stobs,ie,lm,bnews,sheffield},
  http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/darpa99-ne.html},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-ne.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-ne.ps.gz},
  year = 1999
}
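SPRACH-S decodes words and named entity tokens jointly with a stochastic finite state machine. As a much-reduced stand-in, the sketch below runs Viterbi decoding over a first-order tagging model whose states are NE categories; the real system works from speech data rather than a fixed word sequence, so this is illustrative only.

import numpy as np

def viterbi(obs, states, log_trans, log_emit, log_init):
    """Most likely tag sequence for a word sequence obs.

    states:    list of NE categories (e.g., PERSON, LOCATION, NONE)
    log_trans: (n_states, n_states) array of log transition probs
    log_emit:  per-state dicts mapping word -> log emission prob
    log_init:  log initial-state probabilities
    """
    n, s = len(obs), len(states)
    delta = np.full((n, s), -np.inf)
    back = np.zeros((n, s), dtype=int)
    for j in range(s):
        delta[0, j] = log_init[j] + log_emit[j][obs[0]]
    for t in range(1, n):
        for j in range(s):
            scores = delta[t - 1] + log_trans[:, j]
            back[t, j] = int(np.argmax(scores))
            delta[t, j] = scores[back[t, j]] + log_emit[j][obs[t]]
    path = [int(np.argmax(delta[-1]))]
    for t in range(n - 1, 0, -1):
        path.append(int(back[t, path[-1]]))
    return [states[j] for j in reversed(path)]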
@inproceedings{abberley-trec99,
  author = {D.~Abberley and S.~Renals and G.~Cook and T.~Robinson},
  title = {Retrieval of broadcast news documents with the {THISL}
                   system},
  booktitle = {Proc. Seventh Text Retrieval Conference (TREC--7)},
  pages = {181--190},
  abstract = {This paper describes the THISL system that
                   participated in the TREC-7 evaluation, Spoken Document
                   Retrieval (SDR) Track, and presents the results
                   obtained, together with some analysis. The THISL system
                   is based on the {\sc Abbot} speech recognition system
                   and the thislIR text retrieval system. In this
                   evaluation we were concerned with investigating the
                   suitability for SDR of a recognizer running at less
                   than ten times realtime, the use of multiple
                   transcriptions and word graphs, the effect of simple
                   query expansion algorithms and the effect of varying
                   standard IR parameters.},
  categories = {thisl,bnews,trec,ir,recognition,eval,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/trec7.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/trec7.ps.gz},
  year = 1999
}
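One family of simple query expansion algorithms is blind relevance feedback: run a first retrieval pass, then add salient terms from the top-ranked documents to the query. The raw-frequency term scoring below is a deliberately crude assumption; the abstract does not detail the algorithms actually investigated.

from collections import Counter

def expand_query(query_terms, top_docs, n_terms=5):
    """Blind relevance feedback sketch: append the n_terms most
    frequent new terms from the top-ranked documents (each a list
    of words) to the original query."""
    counts = Counter()
    for doc in top_docs:
        counts.update(t for t in doc if t not in query_terms)
    return list(query_terms) + [t for t, _ in counts.most_common(n_terms)]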
@book{renals-book03,
  editor = {S.~Renals and G.~Grefenstette},
  title = {Text and Speech Triggered Information Access},
  publisher = {Springer-Verlag},
  number = {2705},
  series = {Lecture Notes in Computer Science},
  abstract = {Edited collection of revised lectures from the
                   \href{http://www.ilsp.gr/testia/testia2000.html}
                   {ELSNET-2000 Summer School} on Text and Speech
                   Triggered Information Access. },
  categories = {recognition,ir,ie,lm,multimodal,sheffield},
  url = {http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=2705&issue=preprint},
  year = 2003
}
@inproceedings{murray06c,
  author = {G. Murray and S. Renals},
  title = {Dialogue Act Compression Via Pitch Contour
                   Preservation},
  booktitle = {Proceedings of the 9th International Conference on
                   Spoken Language Processing, Pittsburgh, USA},
  abstract = {This paper explores the usefulness of prosody in
                   automatically compressing dialogue acts from meeting
                   speech. Specifically, this work attempts to compress
                   utterances by preserving the pitch contour of the
                   original whole utterance. Two methods of doing this are
                   described in detail and are evaluated
                   \textit{subjectively} using human annotators and
                   \textit{objectively} using edit distance with a
                   human-authored gold-standard. Both metrics show that
                   such a prosodic approach is much better than the random
                   baseline approach and significantly better than a
                   simple text compression method.},
  categories = {automatic compression, prosody, summarization},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/inter2006.pdf},
  year = 2006
}
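The contour-preservation idea can be cast as subset selection: drop words so that the pitch contour of the retained words, resampled to the original length, stays close to the original. The greedy search and mean-squared cost below are illustrative assumptions, not the two methods evaluated in the paper; per-word mean F0 values are assumed as input.

import numpy as np

def contour_cost(f0, keep):
    """Distance between the original contour and the contour of the
    retained words, linearly resampled to the original length."""
    kept = f0[keep]
    resampled = np.interp(np.linspace(0, 1, len(f0)),
                          np.linspace(0, 1, len(kept)), kept)
    return float(np.mean((f0 - resampled) ** 2))

def compress(f0, n_keep):
    """Greedily drop the word whose removal least distorts the pitch
    contour until n_keep words remain; returns retained indices."""
    keep = list(range(len(f0)))
    while len(keep) > n_keep:
        best = min(range(len(keep)),
                   key=lambda i: contour_cost(f0, keep[:i] + keep[i + 1:]))
        del keep[best]
    return keep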
@inproceedings{bourlard2008,
  author = {Bourlard, Hervé and Renals, Steve},
  title = {Recognition and Understanding of Meetings: Overview of
                   the {European} {AMI} and {AMIDA} Projects},
  booktitle = {Proc. LangTech 2008},
  abstract = {The AMI and AMIDA projects are concerned with the
                   recognition and interpretation of multiparty
                   (face-to-face and remote) meetings. Within these
                   projects we have developed the following: (1) an
                   infrastructure for recording meetings using multiple
                   microphones and cameras; (2) a one hundred hour,
                   manually annotated meeting corpus; (3) a number of
                   techniques for indexing and summarizing meeting
                   videos using automatic speech recognition and computer
                   vision; and (4) an extensible framework for browsing
                   and searching meeting videos. We give an overview of
                   the various techniques developed in AMI (mainly
                   involving face-to-face meetings), their integration
                   into our meeting browser framework, and future plans
                   for AMIDA (Augmented Multiparty Interaction with
                   Distant Access), the follow-up project to AMI.
                   Technical and business information related to these two
                   projects can be found at www.amiproject.org, on the
                   Scientific and Business portals respectively.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bourlard2008.pdf},
  year = 2008
}