The Centre for Speech Technology Research, The University of Edinburgh

Publications by Steve Renals

srenals.bib

@article{lu2013,
  author = {Lu, Liang and Chin, KK and Ghoshal, Arnab and Renals, Steve},
  doi = {10.1109/TASL.2013.2248718},
  title = {Joint Uncertainty Decoding for Noise Robust Subspace {Gaussian} Mixture Models},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {9},
  abstract = {Joint uncertainty decoding (JUD) is a model-based noise compensation technique for conventional Gaussian Mixture Model (GMM) based speech recognition systems. Unlike vector Taylor series (VTS) compensation which operates on the individual Gaussian components in an acoustic model, JUD clusters the Gaussian components into a smaller number of classes, sharing the compensation parameters for the set of Gaussians in a given class. This significantly reduces the computational cost. In this paper, we investigate noise compensation for subspace Gaussian mixture model (SGMM) based speech recognition systems using JUD. The total number of Gaussian components in an SGMM is typically very large. Therefore direct compensation of the individual Gaussian components, as performed by VTS, is computationally expensive. In this paper we show that JUD-based noise compensation can be successfully applied to SGMMs in a computationally efficient way. We evaluate the JUD/SGMM technique on the standard Aurora 4 corpus. Our experimental results indicate that the JUD/SGMM system results in lower word error rates compared with a conventional GMM system with either VTS-based or JUD-based noise compensation.},
  volume = {21},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/liang-taslp12-noise.pdf},
  pages = {1791--1804}
}
@inproceedings{Swietojanski:ICASSP13,
  author = {Swietojanski, Pawel and Ghoshal, Arnab and Renals, Steve},
  doi = {10.1109/ICASSP.2013.6638967},
  title = {Revisiting Hybrid and {GMM-HMM} system combination techniques},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Swietojanski_ICASSP2013.pdf},
  abstract = {In this paper we investigate techniques to combine hybrid HMM-DNN (hidden Markov model -- deep neural network) and tandem HMM-GMM (hidden Markov model -- Gaussian mixture model) acoustic models using: (1) model averaging, and (2) lattice combination with Minimum Bayes Risk decoding. We have performed experiments on the ``TED Talks'' task following the protocol of the IWSLT-2012 evaluation. Our experimental results suggest that DNN-based and GMM-based acoustic models are complementary, with error rates being reduced by up to 8\% relative when the DNN and GMM systems are combined at model-level in a multi-pass automatic speech recognition (ASR) system. Additionally, further gains were obtained by combining model-averaged lattices with the one obtained from baseline systems.},
  categories = {deep neural networks, tandem, hybrid, system combination, TED}
}
@inproceedings{Ghoshal:ICASSP13,
  author = {Ghoshal, Arnab and Swietojanski, Pawel and Renals, Steve},
  doi = {10.1109/ICASSP.2013.6639084},
  title = {Multilingual training of deep neural networks},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Ghoshal_ICASSP2013.pdf},
  abstract = {We investigate multilingual modeling in the context of a deep neural network (DNN) -- hidden Markov model (HMM) hybrid, where the DNN outputs are used as the HMM state likelihoods. By viewing neural networks as a cascade of feature extractors followed by a logistic regression classifier, we hypothesise that the hidden layers, which act as feature extractors, will be transferable between languages. As a corollary, we propose that training the hidden layers on multiple languages makes them more suitable for such cross-lingual transfer. We experimentally confirm these hypotheses on the GlobalPhone corpus using seven languages from three different language families: Germanic, Romance, and Slavic. The experiments demonstrate substantial improvements over a monolingual DNN-HMM hybrid baseline, and hint at avenues of further exploration.},
  categories = {Speech recognition, deep learning, neural networks, multilingual modeling}
}
@inproceedings{hasler2012,
  author = {Hasler, Eva and Bell, Peter and Ghoshal, Arnab and Haddow, Barry and Koehn, Philipp and McInnes, Fergus and Renals, Steve and Swietojanski, Pawel},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/paper_50.pdf},
  abstract = {This paper describes the University of Edinburgh (UEDIN) systems for the IWSLT 2012 Evaluation. We participated in the ASR (English), MT (English-French, German-English) and SLT (English-French) tracks.},
  year = {2012},
  booktitle = {Proc. International Workshop on Spoken Language Translation},
  title = {The {UEDIN} system for the {IWSLT} 2012 evaluation}
}
@incollection{vipperla2012,
  editor = {Turner, Kenneth J.},
  author = {Vipperla, Ravichander and Wolters, Maria and Renals, Steve},
  publisher = {IOS Press},
  title = {Spoken dialogue interfaces for older people},
  abstract = {Although speech is a highly natural mode of communication, building robust and usable speech-based interfaces is still a challenge, even if the target user group is restricted to younger users. When designing for older users, there are added complications due to cognitive, physiological, and anatomical ageing. Users may also find it difficult to adapt to the interaction style required by the speech interface. In this chapter, we summarise the work on spoken dialogue interfaces that was carried out during the MATCH project. After a brief overview of relevant aspects of ageing and previous work on spoken dialogue interfaces for older people, we summarise our work on managing spoken interactions (dialogue management), understanding older people's speech (speech recognition), and generating spoken messages that older people can understand (speech synthesis). We conclude with suggestions for design guidelines that have emerged from our work and suggest directions for future research.},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/08-vipperla-2013.pdf},
  booktitle = {Advances in Home Care Technologies}
}
@inproceedings{zwyssig2012effect,
  author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
  doi = {10.1109/ICASSP.2012.6288839},
  title = {{On the effect of SNR and superdirective beamforming in speaker diarisation in meetings}},
  abstract = {This paper examines the effect of sensor performance on speaker diarisation in meetings and investigates the use of more advanced beamforming techniques, beyond the typically employed delay-sum beamformer, for mitigating the effects of poorer sensor performance. We present super-directive beamforming and investigate how different time difference of arrival (TDOA) smoothing and beamforming techniques influence the performance of state-of-the-art diarisation systems. We produced and transcribed a new corpus of meetings recorded in the instrumented meeting room using a high SNR analogue and a newly developed low SNR digital MEMS microphone array (DMMA.2). This research demonstrates that TDOA smoothing has a significant effect on the diarisation error rate and that simple noise reduction and beamforming schemes suffice to overcome audio signal degradation due to the lower SNR of modern MEMS microphones.},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/zwyssig-dmma2-icassp12.pdf},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  pages = {4177--4180}
}
@inproceedings{zwyssig2012determining,
  author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
  doi = {10.1109/ICASSP.2012.6288984},
  title = {Determining the number of speakers in a meeting using microphone array features},
  abstract = {The accuracy of speaker diarisation in meetings relies heavily on determining the correct number of speakers. In this paper we present a novel algorithm based on time difference of arrival (TDOA) features that aims to find the correct number of active speakers in a meeting and thus aid the speaker segmentation and clustering process. With our proposed method the microphone array TDOA values and known geometry of the array are used to calculate a speaker matrix from which we determine the correct number of active speakers with the aid of the Bayesian information criterion (BIC). In addition, we analyse several well-known voice activity detection (VAD) algorithms and verify their fitness for meeting recordings. Experiments were performed using the NIST RT06, RT07 and RT09 data sets, and resulted in reduced error rates compared with BIC-based approaches.},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/zwyssig-dia-icassp12.pdf},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  pages = {4765--4768}
}
@inproceedings{bell13_mlan,
  author = {Bell, Peter and Swietojanski, Pawel and Renals, Steve},
  doi = {10.1109/ICASSP.2013.6639014},
  title = {Multi-level adaptive networks in tandem and hybrid {ASR} systems},
  abstract = {In this paper we investigate the use of Multi-level adaptive networks (MLAN) to incorporate out-of-domain data when training large vocabulary speech recognition systems. In a set of experiments on multi-genre broadcast data and on TED lecture recordings we present results using out-of-domain features in a hybrid DNN system and explore tandem systems using a variety of input acoustic features. Our experiments indicate that using the MLAN approach in both hybrid and tandem systems results in consistent reductions in word error rate of 5--10\% relative.},
  address = {Vancouver, Canada},
  month = {May},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/mlan_icassp2013.pdf},
  booktitle = {Proc. ICASSP}
}
@inproceedings{swi2012_dnn,
  author = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
  doi = {10.1109/SLT.2012.6424230},
  title = {Unsupervised Cross-lingual knowledge transfer in {DNN-based LVCSR}},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  year = {2012},
  abstract = {We investigate the use of cross-lingual acoustic data to initialise deep neural network (DNN) acoustic models by means of unsupervised restricted Boltzmann machine (RBM) pretraining. DNNs for German are pretrained using one or all of German, Portuguese, Spanish and Swedish. The DNNs are used in a tandem configuration, where the network outputs are used as features for a hidden Markov model (HMM) whose emission densities are modeled by Gaussian mixture models (GMMs), as well as in a hybrid configuration, where the network outputs are used as the HMM state likelihoods. The experiments show that unsupervised pretraining is more crucial for the hybrid setups, particularly with limited amounts of transcribed training data. More importantly, unsupervised pretraining is shown to be language-independent.},
  month = {December},
  address = {Miami, Florida, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/ps_slt2012.pdf},
  pages = {246--251}
}
@incollection{gotoh-lm03,
  editor = {Renals, S. and Grefenstette, G.},
  author = {Gotoh, Y. and Renals, S.},
  title = {Language Modelling},
  booktitle = {Text and Speech Triggered Information Access},
  abstract = {This is a preprint of a tutorial on statistical language modelling, based on Yoshi Gotoh's course at the \href{http://www.ilsp.gr/testia/testia2000.html}{ELSNET-2000 Summer School} on Text and Speech Triggered Information Access.},
  year = {2003},
  pages = {78--105},
  categories = {ie,lm,bnews,sheffield},
  crossref = {renals-book03}
}
@misc{turk2010,
  author = {Turk, Alice and Scobbie, James and Geng, Christian and Campbell, Barry and Dickie, Catherine and Dubourg, Eddie and Bard, Ellen Gurman and Hardcastle, William and Hartinger, Mariam and King, Simon and Lickley, Robin and Macmartin, Cedric and Nakai, Satsuki and Renals, Steve and Richmond, Korin and Schaeffler, Sonja and White, Kevin and Wiegand, Ronny and Wrench, Alan},
  howpublished = {Poster presented at the 12th Conference on Laboratory Phonology, Albuquerque, New Mexico.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ESPF.pdf},
  year = {2010},
  title = {An {E}dinburgh speech production facility},
  month = {July}
}
@inproceedings{renals2008,
  author = {Renals, Steve and Hain, Thomas and Bourlard, Hervé},
  doi = {10.1109/HSCMA.2008.4538700},
  title = {Interpretation of Multiparty Meetings: The {AMI} and {AMIDA} Projects},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4538666&arnumber=4538700&count=68&index=33},
  booktitle = {IEEE Workshop on Hands-Free Speech Communication and Microphone Arrays, 2008. HSCMA 2008},
  abstract = {The AMI and AMIDA projects are collaborative EU projects concerned with the automatic recognition and interpretation of multiparty meetings. This paper provides an overview of the advances we have made in these projects with a particular focus on the multimodal recording infrastructure, the publicly available AMI corpus of annotated meeting recordings, and the speech recognition framework that we have developed for this domain.},
  year = {2008},
  keywords = {AMI corpus; Meetings; evaluation; speech recognition},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/renals2008.pdf},
  pages = {115--118}
}
@article{murray2009,
  author = {Murray, Gabriel and Kleinbauer, Thomas and Poller, Peter and Becker, Tilman and Renals, Steve and Kilgour, Jonathan},
  doi = {10.1145/1596517.1596518},
  title = {Extrinsic Summarization Evaluation: A Decision Audit Task},
  url = {http://doi.acm.org/10.1145/1596517.1596518},
  journal = {ACM Transactions on Speech and Language Processing},
  number = {2},
  abstract = {In this work we describe a large-scale extrinsic evaluation of automatic speech summarization technologies for meeting speech. The particular task is a decision audit, wherein a user must satisfy a complex information need, navigating several meetings in order to gain an understanding of how and why a given decision was made. We compare the usefulness of extractive and abstractive technologies in satisfying this information need, and assess the impact of automatic speech recognition (ASR) errors on user performance. We employ several evaluation methods for participant performance, including post-questionnaire data, human subjective and objective judgments, and a detailed analysis of participant browsing behavior. We find that while ASR errors affect user satisfaction on an information retrieval task, users can adapt their browsing behavior to complete the task satisfactorily. Results also indicate that users consider extractive summaries to be intuitive and useful tools for browsing multimodal meeting data. We discuss areas in which automatic summarization techniques can be improved in comparison with gold-standard meeting abstracts.},
  volume = {6},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/murray-acm09.pdf},
  pages = {1--29}
}
@inproceedings{hochberg-arpa94,
  author = {Hochberg, M. and Renals, S. and Robinson, T.},
  booktitle = {Proc. ARPA Spoken Language Technology Workshop},
  year = {1994},
  pages = {102--105},
  categories = {},
  title = {{Abbot}: The {CUED} hybrid {connectionist/HMM} large vocabulary recognition system}
}
@inproceedings{vipperla08,
  author = {Vipperla, Ravichander and Renals, Steve and Frankel, Joe},
  title = {Longitudinal study of {ASR} performance on ageing voices},
  booktitle = {Proc.~Interspeech},
  year = {2008},
  address = {Brisbane},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/vipperla_is08.pdf},
  abstract = {This paper presents the results of a longitudinal study of ASR performance on ageing voices. Experiments were conducted on the audio recordings of the proceedings of the Supreme Court of the United States (SCOTUS). Results show that the Automatic Speech Recognition (ASR) Word Error Rates (WERs) for elderly voices are significantly higher than those of adult voices. The word error rate increases gradually as the age of the elderly speakers increases. Use of maximum likelihood linear regression (MLLR) based speaker adaptation on ageing voices improves the WER, though the performance is still considerably lower compared to adult voices. Speaker adaptation however reduces the increase in WER with age during old age.}
}
@inproceedings{christensen-icassp05,
  author = {Christensen, H. and Kolluru, B. and Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/christensen-icassp05.ps.gz},
  title = {Maximum entropy segmentation of broadcast news},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {This paper presents an automatic system for structuring and preparing a news broadcast for applications such as speech summarization, browsing, archiving and information retrieval. This process comprises transcribing the audio using an automatic speech recognizer and subsequently segmenting the text into utterances and topics. A maximum entropy approach is used to build statistical models for both utterance and topic segmentation. The experimental work addresses the effect on performance of the topic boundary detector of three factors: the information sources used, the quality of the ASR transcripts, and the quality of the utterance boundary detector. The results show that the topic segmentation is not affected severely by transcripts errors, whereas errors in the utterance segmentation are more devastating.},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/christensen-icassp05.pdf},
  pages = {},
  categories = {s3l,summarization,bnews,edinburgh,sheffield}
}
@article{zhang-spl2008,
  author = {Zhang, Le and Renals, Steve},
  title = {Acoustic-Articulatory Modelling with the Trajectory {HMM}},
  journal = {IEEE Signal Processing Letters},
  abstract = {In this letter, we introduce a hidden Markov model (HMM)-based inversion system to recover articulatory movements from speech acoustics. Trajectory HMMs are used as generative models for modelling articulatory data. Experiments on the MOCHA-TIMIT corpus indicate that the jointly trained acoustic-articulatory models are more accurate (lower RMS error) than the separately trained ones, and that trajectory HMM training results in greater accuracy compared with conventional maximum likelihood HMM training. Moreover, the system has the ability to synthesize articulatory movements directly from a textual representation.},
  volume = {15},
  key = {articulatory inversion},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/zhang-spl.pdf},
  pages = {245-248}
}
@inproceedings{ultraxIS2012,
  author = {Richmond, Korin and Renals, Steve},
  title = {Ultrax: An Animated Midsagittal Vocal Tract Display for Speech Therapy},
  booktitle = {Proc. Interspeech},
  year = {2012},
  month = {September},
  address = {Portland, Oregon, USA},
  keywords = {Ultrasound, speech therapy, vocal tract visualisation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/RichmondRenalsIS2012.pdf},
  abstract = {Speech sound disorders (SSD) are the most common communication impairment in childhood, and can hamper social development and learning. Current speech therapy interventions rely predominantly on the auditory skills of the child, as little technology is available to assist in diagnosis and therapy of SSDs. Realtime visualisation of tongue movements has the potential to bring enormous benefit to speech therapy. Ultrasound scanning offers this possibility, although its display may be hard to interpret. Our ultimate goal is to exploit ultrasound to track tongue movement, while displaying a simplified, diagrammatic vocal tract that is easier for the user to interpret. In this paper, we outline a general approach to this problem, combining a latent space model with a dimensionality reducing model of vocal tract shapes. We assess the feasibility of this approach using magnetic resonance imaging (MRI) scans to train a model of vocal tract shapes, which is animated using electromagnetic articulography (EMA) data from the same speaker.},
  categories = {Ultrasound, speech therapy, vocal tract visualisation}
}
@article{gotoh-roysoc00,
  author = {Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/rs00-preprint.ps.gz},
  title = {Information Extraction from Broadcast News},
  journal = {Philosophical Transactions of the Royal Society of London, Series A},
  abstract = {This paper discusses the development of trainable statistical models for extracting content from television and radio news broadcasts. In particular we concentrate on statistical finite state models for identifying proper names and other named entities in broadcast speech. Two models are presented: the first models name class information as a word attribute; the second explicitly models both word-word and class-class transitions. A common n-gram based formulation is used for both models. The task of named entity identification is characterized by relatively sparse training data and issues related to smoothing are discussed. Experiments are reported using the DARPA/NIST Hub-4E evaluation for North American Broadcast News.},
  volume = {358},
  year = {2000},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/rs00-preprint.pdf},
  pages = {1295--1310},
  categories = {stobs,ie,lm,bnews,sheffield}
}
@incollection{vipperla2009a,
  author = {Vipperla, Ravi Chander and Wolters, Maria and Georgila, Kallirroi and Renals, Steve},
  publisher = {Springer},
  doi = {10.1007/978-3-642-02710-9},
  title = {Speech Input from Older Users in Smart Environments: Challenges and Perspectives},
  url = {http://www.springerlink.com/content/27r01345r1683251/?p=ad2394d646814db59cf9868b0f74b11e&pi=13},
  series = {Lecture Notes in Computer Science},
  booktitle = {Proc. HCI International: Universal Access in Human-Computer Interaction. Intelligent and Ubiquitous Interaction Environments},
  number = {5615},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/vipperla2009a.pdf},
  abstract = {Although older people are an important user group for smart environments, there has been relatively little work on adapting natural language interfaces to their requirements. In this paper, we focus on a particularly thorny problem: processing speech input from older users. Our experiments on the MATCH corpus show clearly that we need age-specific adaptation in order to recognize older users' speech reliably. Language models need to cover typical interaction patterns of older people, and acoustic models need to accommodate older voices. Further research is needed into intelligent adaptation techniques that will allow existing large, robust systems to be adapted with relatively small amounts of in-domain, age appropriate data. In addition, older users need to be supported with adequate strategies for handling speech recognition errors.}
}
@incollection{morgan-guyonbook94,
  editor = {Guyon, I. and Wang, P. S. P.},
  author = {Morgan, N. and Bourlard, H. and Renals, S. and Cohen, M. and Franco, H.},
  publisher = {World Scientific Publications},
  title = {Hybrid neural network/hidden {Markov} model systems for continuous speech recognition},
  series = {Series in Machine Perception and Artificial Intelligence},
  booktitle = {Advances in Pattern Recognition Systems using Neural Networks Technologies},
  volume = {7},
  year = {1994},
  categories = {}
}
@inproceedings{koumpis-eurospeech01,
  author = {Koumpis, K. and Renals, S. and Niranjan, M.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/eurospeech01.ps.gz},
  title = {Extractive Summarization of Voicemail using Lexical and Prosodic Feature Subset Selection},
  booktitle = {Proc. Eurospeech},
  year = {2001},
  abstract = {This paper presents a novel data-driven approach to summarizing spoken audio transcripts utilizing lexical and prosodic features. The former are obtained from a speech recognizer and the latter are extracted automatically from speech waveforms. We employ a feature subset selection algorithm, based on ROC curves, which examines different combinations of features at different target operating conditions. The approach is evaluated on the IBM Voicemail corpus, demonstrating that it is possible and desirable to avoid complete commitment to a single best classifier or feature set.},
  address = {Aalborg, Denmark},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/eurospeech01.pdf},
  pages = {2377--2380},
  categories = {voicemail,summarization,prosody,sheffield}
}
@inproceedings{cuayahuitletal_interspeech06,
  author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  title = {Learning Multi-Goal Dialogue Strategies Using Reinforcement Learning With Reduced State-Action Spaces},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/rss-icslp2006.pdf},
  abstract = {Learning dialogue strategies using the reinforcement learning framework is problematic due to its expensive computational cost. In this paper we propose an algorithm that reduces a state-action space to one which includes only valid state-actions. We performed experiments on full and reduced spaces using three systems (with 5, 9 and 20 slots) in the travel domain using a simulated environment. The task was to learn multi-goal dialogue strategies optimizing single and multiple confirmations. Average results using strategies learnt on reduced spaces reveal the following benefits against full spaces: 1) less computer memory (94\% reduction), 2) faster learning (93\% faster convergence) and 3) better performance (8.4\% less time steps and 7.7\% higher reward).},
  categories = {reinforcement learning, spoken dialogue systems}
}
@inproceedings{hochberg-icslp94,
  author = {Hochberg, M. and Renals, S. and Robinson, T. and Kershaw, D.},
  title = {Large vocabulary continuous speech recognition using a hybrid {connectionist/HMM} system},
  booktitle = {Proc. ICSLP},
  year = {1994},
  address = {Yokohama},
  pages = {1499--1502},
  categories = {}
}
@inproceedings{zwyssig2010,
  author = {Zwyssig, Erich and Lincoln, Mike and Renals, Steve},
  doi = {10.1109/ICASSP.2010.5495040},
  title = {A Digital Microphone Array for Distant Speech Recognition},
  booktitle = {Proc. IEEE ICASSP--10},
  abstract = {In this paper, the design, implementation and testing of a digital microphone array is presented. The array uses digital MEMS microphones which integrate the microphone, amplifier and analogue to digital converter on a single chip in place of the analogue microphones and external audio interfaces currently used. The device has the potential to be smaller, cheaper and more flexible than typical analogue arrays, however the effect on speech recognition performance of using digital microphones is as yet unknown. In order to evaluate the effect, an analogue array and the new digital array are used to simultaneously record test data for a speech recognition experiment. Initial results employing no adaptation show that performance using the digital array is significantly worse (14\% absolute WER) than the analogue device. Subsequent experiments using MLLR and CMLLR channel adaptation reduce this gap, and employing MLLR for both channel and speaker adaptation reduces the difference between the arrays to 4.5\% absolute WER.},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/zwyssig-icassp10.pdf},
  pages = {5106--5109}
}
@inproceedings{garau-interspeech05,
  author = {Garau, G. and Renals, S. and Hain, T.},
  title = {Applying Vocal Tract Length Normalization to Meeting Recordings},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/giuliagarau_eurospeech05.pdf},
  abstract = {Vocal Tract Length Normalisation (VTLN) is a commonly used technique to normalise for inter-speaker variability. It is based on the speaker-specific warping of the frequency axis, parameterised by a scalar warp factor. This factor is typically estimated using maximum likelihood. We discuss how VTLN may be applied to multiparty conversations, reporting a substantial decrease in word error rate in experiments using the ICSI meetings corpus. We investigate the behaviour of the VTLN warping factor and show that a stable estimate is not obtained. Instead it appears to be influenced by the context of the meeting, in particular the current conversational partner. These results are consistent with predictions made by the psycholinguistic interactive alignment account of dialogue, when applied at the acoustic and phonological levels.},
  categories = {ami,asr,edinburgh,vtln,speaker adaptation,lvcsr,meetings}
}
@article{morgan-ijprai93,
  author = {Morgan, N. and Bourlard, H. and Renals, S. and Cohen, M. and Franco, H.},
  title = {Hybrid neural network/hidden {Markov} model systems for continuous speech recognition},
  journal = {Intl. J. Pattern Recog. and Artific. Intell.},
  volume = {7},
  year = {1993},
  pages = {899--916},
  categories = {}
}
@inproceedings{hochberg-arpa95,
  author = {Hochberg, M. and Cook, G. and Renals, S. and Robinson, T. and Schechtman, R.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/slt95.ps.gz},
  title = {The 1994 {Abbot} hybrid {connectionist--HMM} large vocabulary recognition system},
  booktitle = {Proc. ARPA Spoken Language Technology Workshop},
  year = {1995},
  pages = {170--175},
  categories = {wernicke,recognition,wsj,am,hybrid,abbot,eval,search,sheffield,cambridge}
}
@incollection{alhames-mlmi05,
  editor = {Renals, S. and Bengio, S.},
  author = {Al-Hames, M. and Dielmann, A. and Gatica-Perez, D. and Reiter, S. and Renals, S. and Rigoll, G. and Zhang, D.},
  publisher = {Springer},
  title = {Multimodal Integration for Meeting Group Action Segmentation and Recognition},
  booktitle = {Proc. Multimodal Interaction and Related Machine Learning Algorithms Workshop (MLMI--05)},
  abstract = {We address the problem of segmentation and recognition of sequences of multimodal human interactions in meetings. These interactions can be seen as a rough structure of a meeting, and can be used either as input for a meeting browser or as a first step towards a higher semantic analysis of the meeting. A common lexicon of multimodal group meeting actions, a shared meeting data set, and a common evaluation procedure enable us to compare the different approaches. We compare three different multimodal feature sets and four modelling infrastructures: a higher semantic feature approach, multi-layer HMMs, a multistream DBN, as well as a multi-stream mixed-state DBN for disturbed data.},
  year = {2006},
  pages = {52--63},
  categories = {m4,ami,multimodal,dbn,meetings,edinburgh,IDIAP,munich}
}
@inproceedings{jyamagis07:avss2006,
  author = {Yamagishi, Junichi and Kobayashi, Takao and Renals, Steve and King, Simon and Zen, Heiga and Toda, Tomoki and Tokuda, Keiichi},
  title = {Improved Average-Voice-based Speech Synthesis Using Gender-Mixed Modeling and a Parameter Generation Algorithm Considering {GV}},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  month = {August},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6-yamagishi.pdf},
  abstract = {For constructing a speech synthesis system which can achieve diverse voices, we have been developing a speaker independent approach of HMM-based speech synthesis in which statistical average voice models are adapted to a target speaker using a small amount of speech data. In this paper, we incorporate a high-quality speech vocoding method STRAIGHT and a parameter generation algorithm with global variance into the system for improving quality of synthetic speech. Furthermore, we introduce a feature-space speaker adaptive training algorithm and a gender mixed modeling technique for conducting further normalization of the average voice model. We build an English text-to-speech system using these techniques and show the performance of the system.},
  categories = {HMM, speech synthesis, speaker adaptation, HTS}
}
@inproceedings{renals2010b,
  author = {Renals, Steve},
  title = {Recognition and Understanding of Meetings},
  booktitle = {Proc. NAACL/HLT},
  abstract = {This paper is about interpreting human communication in meetings using audio, video and other signals. Automatic meeting recognition and understanding is extremely challenging, since communication in a meeting is spontaneous and conversational, and involves multiple speakers and multiple modalities. This leads to a number of significant research problems in signal processing, in speech recognition, and in discourse interpretation, taking account of both individual and group behaviours. Addressing these problems requires an interdisciplinary effort. In this paper, I discuss the capture and annotation of multimodal meeting recordings - resulting in the AMI meeting corpus - and how we have built on this to develop techniques and applications for the recognition and interpretation of meetings.},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/renals-naacl10.pdf},
  pages = {1--9}
}
@incollection{renals-sesimbra90,
  editor = {Almeida, L. B. and Wellekens, C. J.},
  author = {Renals, S.},
  publisher = {Springer-Verlag},
  title = {Chaos in neural networks},
  series = {Lecture Notes in Computer Science},
  booktitle = {Neural Networks},
  number = {412},
  year = {1990},
  pages = {90--99},
  categories = {}
}
@inproceedings{renals-ijcnn89,
  author = {Renals, S. and Rohwer, R.},
  title = {Phoneme classification experiments using radial basis functions},
  booktitle = {Proc. IJCNN},
  year = {1989},
  address = {Washington DC},
  pages = {461--468},
  categories = {}
}
@inproceedings{kilgour2010,
  author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
  doi = {10.1145/1878101.1878112},
  title = {The {Ambient Spotlight}: Queryless desktop search from meeting speech},
  booktitle = {Proc ACM Multimedia 2010 Workshop SSCS 2010},
  year = {2010},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/AmbientSpot.pdf},
  abstract = {It has recently become possible to record any small meeting using a laptop equipped with a plug-and-play USB microphone array. We show the potential for such recordings in a personal aid that allows project managers to record their meetings and, when reviewing them afterwards through a standard calendar interface, to find relevant documents on their computer. This interface is intended to supplement or replace the textual searches that managers typically perform. The prototype, which relies on meeting speech recognition and topic segmentation, formulates and runs desktop search queries in order to present its results.}
}
@incollection{murray2008c,
  author = {Murray, Gabriel and Kleinbauer, Thomas and Poller, Peter and Renals, Steve and Kilgour, Jonathan},
  publisher = {Springer},
  doi = {10.1007/978-3-540-85853-9_32},
  title = {Extrinsic Summarization Evaluation: A Decision Audit Task},
  series = {Lecture Notes in Computer Science},
  booktitle = {Machine Learning for Multimodal Interaction (Proc. MLMI '08)},
  number = {5237},
  abstract = {In this work we describe a large-scale extrinsic evaluation of automatic speech summarization technologies for meeting speech. The particular task is a decision audit, wherein a user must satisfy a complex information need, navigating several meetings in order to gain an understanding of how and why a given decision was made. We compare the usefulness of extractive and abstractive technologies in satisfying this information need, and assess the impact of automatic speech recognition (ASR) errors on user performance. We employ several evaluation methods for participant performance, including post-questionnaire data, human subjective and objective judgments, and an analysis of participant browsing behaviour.},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008c.pdf},
  pages = {349--361}
}
@article{renals-splett96,
  author = {Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/splett96.ps.gz},
  title = {Phone deactivation pruning in large vocabulary continuous speech recognition},
  journal = {IEEE Signal Processing Letters},
  abstract = {In this letter we introduce a new pruning strategy for large vocabulary continuous speech recognition based on direct estimates of local posterior phone probabilities. This approach is well suited to hybrid connectionist/hidden Markov model systems. Experiments on the Wall Street Journal task using a 20,000 word vocabulary and a trigram language model have demonstrated that phone deactivation pruning can increase the speed of recognition-time search by up to a factor of 10, with a relative increase in error rate of less than 2\%.},
  volume = {3},
  year = {1996},
  pages = {4--6},
  categories = {wernicke,sprach,recognition,search,wsj,sheffield}
}
@incollection{dielmann-mlmi06,
  editor = {Renals, S. and Bengio, S. and Fiscus, J.},
  author = {Dielmann, A. and Renals, S.},
  publisher = {Springer},
  title = {Automatic Dialogue Act Recognition using a Dynamic {Bayesian} Network},
  booktitle = {Proc. Multimodal Interaction and Related Machine Learning Algorithms Workshop (MLMI--06)},
  abstract = {We propose a joint segmentation and classification approach for the dialogue act recognition task on natural multi-party meetings ({ICSI} Meeting Corpus). Five broad DA categories are automatically recognised using a generative Dynamic {Bayesian} Network based infrastructure. Prosodic features and a switching graphical model are used to estimate DA boundaries, in conjunction with a factored language model which is used to relate words and DA categories. This easily generalizable and extensible system promotes a rational approach to the joint DA segmentation and recognition task, and is capable of good recognition performance.},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-mlmi06.pdf},
  pages = {178--189},
  categories = {ami,dialogue act,dbn,factored language model,meetings,edinburgh}
}
@inproceedings{hain-interspeech05,
  author = {Hain, T. and Dines, J. and Garau, G. and Karafiat, M. and Moore, D. and Wan, V. and Ordelman, R. and Renals, S.},
  title = {Transcription of Conference Room Meetings: an Investigation},
  booktitle = {Proc. Interspeech},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hain-eurospeech05.pdf},
  abstract = {The automatic processing of speech collected in conference style meetings has attracted considerable interest with several large scale projects devoted to this area. In this paper we explore the use of various meeting corpora for the purpose of automatic speech recognition. In particular we investigate the similarity of these resources and how to efficiently use them in the construction of a meeting transcription system. The analysis shows distinctive features for each resource. However the benefit in pooling data and hence the similarity seems sufficient to speak of a generic conference meeting domain. In this context this paper also presents work on development for the AMI meeting transcription system, a joint effort by seven sites working on the AMI (augmented multi-party interaction) project.},
  categories = {ami,asr,edinburgh}
}
@inproceedings{huang2008-is,
  author = {Huang, Songfang and Renals, Steve},
  title = {Unsupervised Language Model Adaptation Based on Topic and Role Information in Multiparty Meetings},
  booktitle = {Proc. Interspeech'08},
  year = {2008},
  abstract = {We continue our previous work on the modeling of topic and role information from multiparty meetings using a hierarchical Dirichlet process (HDP), in the context of language model adaptation. In this paper we focus on three problems: 1) an empirical analysis of the HDP as a nonparametric topic model; 2) the mismatch problem of vocabularies of the baseline n-gram model and the HDP; and 3) an automatic speech recognition experiment to further verify the effectiveness of our adaptation framework. Experiments on a large meeting corpus of more than 70 hours speech data show consistent and significant improvements in terms of word error rate for language model adaptation based on the topic and role information.},
  month = {September},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/interspeech08.pdf},
  pages = {833--836}
}
@inproceedings{hennebert-eurospeech97,
  author = {Hennebert, J. and Ris, C. and Bourlard, H. and Renals, S. and Morgan, N.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-remap.ps.gz},
  title = {Estimation of global posteriors and forward-backward training of hybrid {HMM/ANN} systems},
  booktitle = {Proc. Eurospeech},
  year = {1997},
  abstract = {The results of our research presented in this paper are two-fold. First, an estimation of global posteriors is formalized in the framework of hybrid HMM/ANN systems. It is shown that hybrid HMM/ANN systems, in which the ANN part estimates local posteriors, can be used to model global posteriors. This formalization provides us with a clear theory in which both REMAP and ``classical'' Viterbi trained hybrid systems are unified. Second, a new forward-backward training of hybrid HMM/ANN systems is derived from the previous formulation. Comparisons of performance between Viterbi and forward-backward hybrid systems are presented and discussed.},
  address = {Rhodes},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-remap.pdf},
  pages = {1951--1954},
  categories = {sprach,am,hybrid,sheffield}
}
@inproceedings{cook-darpa99,
  author = {Cook, G. and Al-Ghoneim, K. and Ellis, D. and Fosler-Lussier, E. and Gotoh, Y. and Kingsbury, B. and Morgan, N. and Renals, S. and Robinson, T. and Williams, G.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-sprach.ps.gz},
  http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/darpa99-sprach.html},
  title = {The {SPRACH} system for the transcription of broadcast news},
  booktitle = {Proc. DARPA Broadcast News Workshop},
  abstract = {This paper describes the SPRACH system developed for the 1998 Hub-4E broadcast news evaluation. The system is based on the connectionist-HMM framework and uses both recurrent neural network and multi-layer perceptron acoustic models. We describe both a system designed for the primary transcription hub, and a system for the less-than 10 times real-time spoke. We then describe recent developments to CHRONOS, a time-first stack decoder. We show how these developments have simplified the evaluation system, and led to significant reductions in the error rate of the 10x real-time system. We also present a system designed to operate in real-time with negligible search error.},
  year = {1999},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-sprach.pdf},
  pages = {161--166},
  categories = {sprach,bnews,recognition,am,hybrid,abbot,search,eval,sheffield}
}
@inproceedings{abberley-icassp98,
  author = {Abberley, D. and Renals, S. and Cook, G.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icassp98.ps.gz},
  title = {Retrieval of broadcast news documents with the {THISL} system},
  booktitle = {Proc IEEE ICASSP},
  year = {1998},
  abstract = {This paper describes a spoken document retrieval system, combining the Abbot large vocabulary continuous speech recognition (LVCSR) system developed by Cambridge University, Sheffield University and SoftSound, and the PRISE information retrieval engine developed by NIST. The system was constructed to enable us to participate in the TREC 6 Spoken Document Retrieval experimental evaluation. Our key aims in this work were to produce a complete system for the SDR task, to investigate the effect of a word error rate of 30--50\% on retrieval performance and to investigate the integration of LVCSR and word spotting in a retrieval task.},
  address = {Seattle},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icassp98.pdf},
  pages = {3781--3784},
  categories = {thisl,bnews,trec,ir,recognition,sheffield}
}
@inproceedings{uria2011deep,
  author = {Uria, Benigno and Renals, Steve and Richmond, Korin},
  title = {A Deep Neural Network for Acoustic-Articulatory Speech Inversion},
  booktitle = {Proc. NIPS 2011 Workshop on Deep Learning and Unsupervised Feature Learning},
  year = {2011},
  month = {December},
  address = {Sierra Nevada, Spain},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/articulatory_inversion.pdf},
  abstract = {In this work, we implement a deep belief network for the acoustic-articulatory inversion mapping problem. We find that adding up to 3 hidden layers improves inversion accuracy. We also show that this improvement is due to the higher expressive capability of a deep model and not a consequence of adding more adjustable parameters. Additionally, we show unsupervised pretraining of the system improves its performance in all cases, even for a 1 hidden-layer model. Our implementation obtained an average root mean square error of 0.95 mm on the MNGU0 test dataset, beating all previously published results.}
}
@inproceedings{robinson-icassp95,
  author = {Robinson, T. and Fransen, J. and Pye, D. and Foote, J. and Renals, S.},
  title = {{WSJCAM0}: A {British English} speech corpus for large vocabulary continuous speech recognition},
  booktitle = {Proc IEEE ICASSP},
  year = {1995},
  address = {Detroit},
  pages = {81--84},
  categories = {}
}
@article{cuayahuitl2009,
  author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  doi = {10.1016/j.csl.2009.07.001},
  title = {Evaluation of a hierarchical reinforcement learning spoken dialogue system},
  journal = {Computer Speech and Language},
  number = {2},
  abstract = {We describe an evaluation of spoken dialogue strategies designed using hierarchical reinforcement learning agents. The dialogue strategies were learnt in a simulated environment and tested in a laboratory setting with 32 users. These dialogues were used to evaluate three types of machine dialogue behaviour: hand-coded, fully-learnt and semi-learnt. These experiments also served to evaluate the realism of simulated dialogues using two proposed metrics contrasted with ‘Precision-Recall’. The learnt dialogue behaviours used the Semi-Markov Decision Process (SMDP) model, and we report the first evaluation of this model in a realistic conversational environment. Experimental results in the travel planning domain provide evidence to support the following claims: (a) hierarchical semi-learnt dialogue agents are a better alternative (with higher overall performance) than deterministic or fully-learnt behaviour; (b) spoken dialogue strategies learnt with highly coherent user behaviour and conservative recognition error rates (keyword error rate of 20\%) can outperform a reasonable hand-coded strategy; and (c) hierarchical reinforcement learning dialogue agents are feasible and promising for the (semi) automatic design of optimized dialogue behaviours in larger-scale systems.},
  volume = {24},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cuayahuitl-csl09.pdf},
  pages = {395-429}
}
@article{renals-specom00,
  author = {Renals, S. and Abberley, D. and Kirby, D. and Robinson, T.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/specom00-preprint.ps.gz},
  title = {Indexing and Retrieval of Broadcast News},
  journal = {Speech Communication},
  abstract = {This paper describes a spoken document retrieval (SDR) system for British and North American Broadcast News. The system is based on a connectionist large vocabulary speech recognizer and a probabilistic information retrieval system. We discuss the development of a realtime Broadcast News speech recognizer, and its integration into an SDR system. Two advances were made for this task: automatic segmentation and statistical query expansion using a secondary corpus. Precision and recall results using the Text Retrieval Conference (TREC) SDR evaluation infrastructure are reported throughout the paper, and we discuss the application of these developments to a large scale SDR task based on an archive of British English broadcast news.},
  volume = {32},
  year = {2000},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/specom00-preprint.pdf},
  pages = {5--20},
  categories = {thisl,bnews,trec,ir,recognition,sheffield}
}
@inproceedings{renals-twente98,
  author = {Renals, S. and Abberley, D.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/twente98.ps.gz},
  title = {The {THISL} spoken document retrieval system},
  booktitle = {Proc. 14th Twente Workshop on Language Technology},
  abstract = {THISL is an ESPRIT Long Term Research Project focused on the development and construction of a system to retrieve items from an archive of television and radio news broadcasts. In this paper we outline our spoken document retrieval system based on the Abbot speech recognizer and a text retrieval system based on Okapi term-weighting. The system has been evaluated as part of the TREC-6 and TREC-7 spoken document retrieval evaluations and we report on the results of the TREC-7 evaluation based on a document collection of 100 hours of North American broadcast news.},
  year = {1998},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/twente98.pdf},
  pages = {129--140},
  categories = {thisl,bnews,trec,ir,recognition,sheffield}
}
@inproceedings{christensen-ecir04,
  author = {Christensen, H. and Kolluru, B. and Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ecir04.ps.gz},
  title = {From text summarisation to style-specific summarisation for broadcast news},
  booktitle = {Proc. ECIR--2004},
  abstract = {In this paper we report on a series of experiments investigating the path from text-summarisation to style-specific summarisation of spoken news stories. We show that the portability of traditional text summarisation features to broadcast news is dependent on the diffusiveness of the information in the broadcast news story. An analysis of two categories of news stories (containing only read speech or some spontaneous speech) demonstrates the importance of the style and the quality of the transcript, when extracting the summary-worthy information content. Further experiments indicate the advantages of doing style-specific summarisation of broadcast news.},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ecir04.pdf},
  pages = {},
  categories = {s3l,summarization,bnews,edinburgh}
}
@book{renals2006-mlmi06,
  editor = {Renals, Steve and Bengio, Samy and Fiscus, Jonathan},
  volume = {4299},
  publisher = {Springer-Verlag},
  year = {2006},
  series = {Lecture Notes in Computer Science},
  title = {Machine learning for multimodal interaction (Proceedings of {MLMI} '06)}
}
@inproceedings{robinson-eurospeech99,
  author = {Robinson, T. and Abberley, D. and Kirby, D. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-thisl.ps.gz},
  title = {Recognition, indexing and retrieval of {British} broadcast news with the {THISL} system},
  booktitle = {Proc. Eurospeech},
  year = {1999},
  abstract = {This paper describes the THISL spoken document retrieval system for British and North American Broadcast News. The system is based on the Abbot large vocabulary speech recognizer and a probabilistic text retrieval system. We discuss the development of a realtime British English Broadcast News system, and its integration into a spoken document retrieval system. Detailed evaluation is performed using a similar North American Broadcast News system, to take advantage of the TREC SDR evaluation methodology. We report results on this evaluation, with particular reference to the effect of query expansion and of automatic segmentation algorithms.},
  address = {Budapest},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-thisl.pdf},
  pages = {1067--1070},
  categories = {thisl,bnews,trec,ir,recognition,sheffield}
}
@inproceedings{carreira-nnsp98,
  author = {Carreira-Perpiñán, M. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/nnsp98.ps.gz},
  title = {Experimental evaluation of latent variable models for dimensionality reduction},
  booktitle = {IEEE Proc. Neural Networks for Signal Processing},
  year = {1998},
  abstract = {We use electropalatographic (EPG) data as a test bed for dimensionality reduction methods based on latent variable modelling, in which an underlying lower dimension representation is inferred directly from the data. Several models (and mixtures of them) are investigated, including factor analysis and the generative topographic mapping (GTM). Experiments indicate that nonlinear latent variable modelling reveals a low-dimensional structure in the data inaccessible to the investigated linear models.},
  volume = {8},
  address = {Cambridge},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/nnsp98.pdf},
  pages = {165--173},
  categories = {ml,lv,artic,sheffield}
}
@inproceedings{koumpis-icoin01,
  author = {Koumpis, K. and Ladas, C. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/icoin01.ps.gz},
  title = {An Advanced Integrated Architecture for Wireless Voicemail Retrieval},
  booktitle = {Proc. 15th IEEE International Conference on Information Networking},
  abstract = {This paper describes an alternative architecture for voicemail data retrieval on the move. It is comprised of three distinct components: a speech recognizer, a text summarizer and a WAP push service initiator, enabling mobile users to receive a text summary of their voicemail in realtime without an explicit request. Our approach overcomes the cost and usability limitations of the conventional voicemail retrieval paradigm which requires a connection establishment in order to listen to spoken messages. We report performance results on all different components of the system which has been trained on a database containing 1843 North American English messages as well as on the duration of the corresponding data path. The proposed architecture can be further customized to meet the requirements of a complete voicemail value-added service.},
  year = {2001},
  pages = {403--410},
  categories = {voicemail,summarization,sheffield}
}
@article{renals-sap94,
  author = {Renals, S. and Morgan, N. and Bourlard, H. and Cohen, M. and Franco, H.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/sap94.ps.gz},
  title = {Connectionist probability estimators in {HMM} speech recognition},
  journal = {IEEE Trans. on Speech and Audio Processing},
  abstract = {We are concerned with integrating connectionist networks into a hidden Markov model (HMM) speech recognition system. This is achieved through a statistical interpretation of connectionist networks as probability estimators. We review the basis of HMM speech recognition and point out the possible benefits of incorporating connectionist networks. Issues necessary to the construction of a connectionist HMM recognition system are discussed, including choice of connectionist probability estimator. We describe the performance of such a system, using a multi-layer perceptron probability estimator, evaluated on the speaker-independent DARPA Resource Management database. In conclusion, we show that a connectionist component improves a state-of-the-art HMM system.},
  volume = {2},
  year = {1994},
  pages = {161--175},
  categories = {}
}
@inproceedings{renals-trec01,
  author = {Renals, S. and Abberley, D.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/trec9-proc.ps.gz},
  title = {The {THISL} {SDR} system at {TREC}--9},
  booktitle = {Proc. Ninth Text Retrieval Conference (TREC--9)},
  abstract = {This paper describes our participation in the TREC-9 Spoken Document Retrieval (SDR) track. The THISL SDR system consists of a realtime version of a hybrid connectionist/HMM large vocabulary speech recognition system and a probabilistic text retrieval system. This paper describes the configuration of the speech recognition and text retrieval systems, including segmentation and query expansion. We report our results for development tests using the TREC-8 queries, and for the TREC-9 evaluation.},
  year = {2001},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/trec9-proc.pdf},
  pages = {},
  categories = {thisl,bnews,trec,ir,recognition,eval,abbot,sheffield}
}
@article{carreira-nc00,
  author = {Carreira-Perpiñán, M. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/nc00-preprint.ps.gz},
  title = {Practical identifiability of finite mixtures of multivariate {Bernoulli} distributions},
  journal = {Neural Computation},
  abstract = {The class of finite mixtures of multivariate Bernoulli distributions is known to be nonidentifiable, i.e., different values of the mixture parameters can correspond to exactly the same probability distribution. In principle, this would mean that sample estimates using this model would give rise to different interpretations. We give empirical support to the fact that estimation of this class of mixtures can still produce meaningful results in practice, thus lessening the importance of the identifiability problem. We also show that the EM algorithm is guaranteed to converge to a proper maximum likelihood estimate, owing to a property of the log-likelihood surface. Experiments with synthetic data sets show that an original generating distribution can be estimated from a sample. Experiments with an electropalatography (EPG) data set show important structure in the data.},
  volume = {12},
  year = {2000},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/nc00-preprint.pdf},
  pages = {141--152},
  categories = {ml,lv,artic,sheffield}
}
@article{bourlard-specom92,
  author = {Bourlard, H. and Morgan, N. and Renals, S.},
  title = {Neural nets and hidden {Markov} models: Review and generalizations},
  journal = {Speech Communication},
  volume = {11},
  year = {1992},
  pages = {237--246},
  categories = {}
}
@incollection{renals-nips94,
  editor = {Cowan, J. D. and Tesauro, G. and Alspector, J.},
  author = {Renals, S. and Hochberg, M. and Robinson, T.},
  publisher = {Morgan Kaufmann},
  title = {Learning temporal dependencies in connectionist speech recognition},
  booktitle = {Advances in Neural Information Processing Systems},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/1051.ps.gz},
  volume = {6},
  year = {1994},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/1051.pdf},
  pages = {1051--1058},
  categories = {}
}
@inproceedings{zhang-icslp2006,
  author = {Zhang, Le and Renals, Steve},
  title = {Phone Recognition Analysis for Trajectory {HMM}},
  booktitle = {Proc. Interspeech 2006},
  year = {2006},
  month = {September},
  key = {asr},
  address = {Pittsburgh, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/zhang-icslp2006.pdf},
  abstract = {The trajectory {HMM} has been shown to be useful for model-based speech synthesis, where a smoothed trajectory is generated using temporal constraints imposed by dynamic features. To evaluate the performance of such a model on an ASR task, we present a trajectory decoder based on tree search with delayed path merging. An experiment on a speaker-dependent phone recognition task using the MOCHA-TIMIT database shows that the MLE-trained trajectory model, while retaining the attractive properties of a proper generative model, tends to favour over-smoothed trajectories among competing hypotheses, and does not perform better than a conventional {HMM}. We use this to argue that models giving a better fit to the training data may suffer a reduction in discrimination by being too faithful to that data. This partially explains why alternative acoustic models that try to explicitly model temporal constraints do not achieve significant improvements in ASR.}
}
@inproceedings{dielmann-icassp04,
  author = {Dielmann, A. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.ps.gz},
  title = {Dynamic {Bayesian} Networks for Meeting Structuring},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {This paper is about the automatic structuring of multiparty meetings using audio information. We have used a corpus of 53 meetings, recorded using a microphone array and lapel microphones for each participant. The task was to segment meetings into a sequence of meeting actions, or phases. We have adopted a statistical approach using dynamic Bayesian networks (DBNs). Two DBN architectures were investigated: a two-level hidden Markov model (HMM) in which the acoustic observations were concatenated; and a multistream DBN in which two separate observation sequences were modelled. Additionally we have also explored the use of counter variables to constrain the number of action transitions. Experimental results indicate that the DBN architectures are an improvement over a simple baseline HMM, with the multistream DBN with counter constraints producing an action error rate of 6\%.},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.pdf},
  pages = {},
  categories = {m4,multimodal,dbn,meetings,edinburgh}
}
@inproceedings{gotoh-esca99,
  author = {Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-ne.ps.gz},
  title = {Statistical annotation of named entities in spoken audio},
  booktitle = {Proc. ESCA Workshop on Accessing Information In Spoken Audio},
  year = {1999},
  abstract = {In this paper we describe a stochastic finite state model for named entity (NE) identification, based on explicit word-level n-gram relations. NE categories are incorporated in the model as word attributes. We present an overview of the approach, describing how the extensible vocabulary model may be used for NE identification. We report development and evaluation results on a North American Broadcast News task. This approach resulted in average precision and recall scores of around 83\% on hand transcribed data, and 73\% on the SPRACH recogniser output. We also present an error analysis and a comparison of our approach with an alternative statistical approach.},
  address = {Cambridge},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-ne.pdf},
  pages = {43--48},
  categories = {sprach,stobs,ie,lm,bnews,sheffield}
}
@inproceedings{wolters-is:09,
  author = {Wolters, Maria and Vipperla, Ravichander and Renals, Steve},
  title = {Age Recognition for Spoken Dialogue Systems: Do We Need It?},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/is09.pdf},
  abstract = {When deciding whether to adapt relevant aspects of the system to the particular needs of older users, spoken dialogue systems often rely on automatic detection of chronological age. In this paper, we show that vocal ageing as measured by acoustic features is an unreliable indicator of the need for adaptation. Simple lexical features greatly improve the prediction of both relevant aspects of cognition and interaction style. Lexical features also boost age group prediction. We suggest that adaptation should be based on observed behaviour, not on chronological age, unless it is not feasible to build classifiers for relevant adaptation decisions.},
  categories = {age recognition, spoken dialogue systems}
}
@inproceedings{christensen-prosody01,
  author = {Christensen, H. and Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-punc.ps.gz},
  title = {Punctuation Annotation using Statistical Prosody Models},
  booktitle = {Proc. ISCA Workshop on Prosody in Speech Recognition and Understanding},
  year = {2001},
  abstract = {This paper is about the development of statistical models of prosodic features to generate linguistic meta-data for spoken language. In particular, we are concerned with automatically punctuating the output of a broadcast news speech recogniser. We present a statistical finite state model that combines prosodic, linguistic and punctuation class features. Experimental results are presented using the Hub-4 Broadcast News corpus, and in the light of our results we discuss the issue of a suitable method of evaluating the present task.},
  address = {Red Bank, NJ, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-punc.pdf},
  pages = {},
  categories = {stobs,ie,lm,prosody,bnews,sheffield}
}
@inproceedings{huang2009-is,
  author = {Huang, Songfang and Renals, Steve},
  title = {A Parallel Training Algorithm for Hierarchical {P}itman-{Y}or Process Language Models},
  booktitle = {Proc. Interspeech'09},
  year = {2009},
  abstract = {The Hierarchical Pitman-Yor Process Language Model (HPYLM) is a Bayesian language model based on a non-parametric prior, the Pitman-Yor Process. It has been demonstrated, both theoretically and practically, that the HPYLM can provide better smoothing for language modeling, compared with state-of-the-art approaches such as interpolated Kneser-Ney and modified Kneser-Ney smoothing. However, estimation of Bayesian language models is expensive in terms of both computation time and memory; the inference is approximate and requires a number of iterations to converge. In this paper, we present a parallel training algorithm for the HPYLM, which enables the approach to be applied in the context of automatic speech recognition, using large training corpora with large vocabularies. We demonstrate the effectiveness of the proposed algorithm by estimating language models from corpora for meeting transcription containing over 200 million words, and observe significant reductions in perplexity and word error rate.},
  month = {September},
  address = {Brighton, UK},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/sh_interspeech09.pdf},
  pages = {2695--2698}
}
@incollection{murray2008b,
  author = {Murray, Gabriel and Renals, Steve},
  publisher = {Springer},
  doi = {10.1007/978-3-540-85853-9_19},
  title = {Detecting Action Items in Meetings},
  url = {http://dx.doi.org/10.1007/978-3-540-85853-9_19},
  series = {Lecture Notes in Computer Science},
  booktitle = {Machine Learning for Multimodal Interaction (Proc. MLMI '08)},
  number = {5237},
  abstract = {We present a method for detecting action items in spontaneous meeting speech. Using a supervised approach incorporating prosodic, lexical and structural features, we can classify such items with a high degree of accuracy. We also examine how well various feature subclasses can perform this task on their own.},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008b.pdf},
  pages = {208--213}
}
@inproceedings{rohwer-neuro88,
  editor = {Personnaz, L. and Dreyfus, G.},
  author = {Rohwer, R. and Renals, S.},
  publisher = {I.D.S.E.T.},
  title = {Training Recurrent Networks},
  booktitle = {Neural networks from models to applications (Proc. nEuro '88)},
  year = {1988},
  address = {Paris},
  pages = {207--216},
  categories = {}
}
@article{huang2010,
  author = {Huang, Songfang and Renals, Steve},
  doi = {10.1109/TASL.2010.2040782},
  title = {Hierarchical {Bayesian} Language Models for Conversational Speech Recognition},
  url = {http://dx.doi.org/10.1109/TASL.2010.2040782},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {8},
  abstract = {Traditional n-gram language models are widely used in state-of-the-art large vocabulary speech recognition systems. This simple model suffers from some limitations, such as overfitting of maximum-likelihood estimation and the lack of rich contextual knowledge sources. In this paper, we exploit a hierarchical Bayesian interpretation for language modeling, based on a nonparametric prior called the Pitman--Yor process. This offers a principled approach to language model smoothing, embedding the power-law distribution for natural language. Experiments on the recognition of conversational speech in multiparty meetings demonstrate that by using hierarchical Bayesian language models, we are able to achieve significant reductions in perplexity and word error rate.},
  month = {January},
  volume = {18},
  year = {2010},
  keywords = {AMI corpus, conversational speech recognition, hierarchical Bayesian model, language model (LM), meetings, smoothing},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-taslp10.pdf},
  pages = {1941--1954}
}
@inproceedings{williams-eurospeech97,
  author = {Williams, G. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-conf.ps.gz},
  title = {Confidence measures for hybrid {HMM/ANN} speech recognition},
  booktitle = {Proc. Eurospeech},
  year = {1997},
  abstract = {In this paper we introduce four acoustic confidence measures which are derived from the output of a hybrid HMM/ANN large vocabulary continuous speech recognition system. These confidence measures, based on local posterior probability estimates computed by an ANN, are evaluated at both phone and word levels, using the North American Business News corpus.},
  address = {Rhodes},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-conf.pdf},
  pages = {1955--1958},
  categories = {recognition,conf,hybrid,wsj,sheffield}
}
@inproceedings{carreira-icphs99,
  author = {Carreira-Perpiñán, M. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icphs99.ps.gz},
  title = {A latent-variable modelling approach to the acoustic-to-articulatory mapping problem},
  booktitle = {Proc. 14th Int. Congress of Phonetic Sciences},
  year = {1999},
  abstract = {We present a latent variable approach to the acoustic-to-articulatory mapping problem, where different vocal tract configurations can give rise to the same acoustics. In latent variable modelling, the combined acoustic and articulatory data are assumed to have been generated by an underlying low-dimensional process. A parametric probabilistic model is estimated and mappings are derived from the respective conditional distributions. This has the advantage over other methods, such as articulatory codebooks or neural networks, of directly addressing the nonuniqueness problem. We demonstrate our approach with electropalatographic and acoustic data from the ACCOR database.},
  address = {San Francisco},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icphs99.pdf},
  pages = {2013--2016},
  categories = {ml,lv,artic,sheffield}
}
@inproceedings{barker-icslp98,
  author = {Barker, J. and Williams, G. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-seg.ps.gz},
  title = {Acoustic confidence measures for segmenting broadcast news},
  booktitle = {Proc. ICSLP},
  year = {1998},
  abstract = {In this paper we define an acoustic confidence measure based on the estimates of local posterior probabilities produced by a HMM/ANN large vocabulary continuous speech recognition system. We use this measure to segment continuous audio into regions where it is and is not appropriate to expend recognition effort. The segmentation is computationally inexpensive and provides reductions in both overall word error rate and decoding time. The technique is evaluated using material from the Broadcast News corpus.},
  address = {Sydney},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-seg.pdf},
  pages = {2719--2722},
  categories = {recognition,conf,hybrid,bnews,segmentation,sheffield}
}
@inproceedings{renals-icassp92,
  author = {Renals, S. and Morgan, N. and Cohen, M. and Franco, H.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1992/icassp92.ps.gz},
  title = {Connectionist probability estimation in the {Decipher} speech recognition system},
  booktitle = {Proc IEEE ICASSP},
  year = {1992},
  address = {San Francisco},
  pages = {601--604},
  categories = {}
}
@incollection{huang2007-mlmi,
  editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
  author = {Huang, Songfang and Renals, Steve},
  publisher = {Springer},
  title = {Modeling Prosodic Features in Language Models for Meetings},
  series = {Lecture Notes in Computer Science},
  booktitle = {Machine Learning for Multimodal Interaction IV},
  abstract = {Prosody has been actively studied as an important knowledge source for speech recognition and understanding. In this paper, we are concerned with the question of exploiting prosody for language models to aid automatic speech recognition in the context of meetings. Using an automatic syllable detection algorithm, the syllable-based prosodic features are extracted to form the prosodic representation for each word. Two modeling approaches are then investigated. One is based on a factored language model, which directly uses the prosodic representation and treats it as a `word'. Instead of direct association, the second approach provides a richer probabilistic structure within a hierarchical Bayesian framework by introducing an intermediate latent variable to represent similar prosodic patterns shared by groups of words. Four-fold cross-validation experiments on the ICSI Meeting Corpus show that exploiting prosody for language modeling can significantly reduce the perplexity, and also yields marginal reductions in word error rate.},
  volume = {4892},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/mlmi07.pdf},
  pages = {191--202}
}
@article{renals-sap99,
  author = {Renals, S. and Hochberg, M.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/sap99-preprint.ps.gz},
  title = {Start-synchronous search for large vocabulary continuous speech recognition},
  journal = {IEEE Trans. on Speech and Audio Processing},
  abstract = {In this paper, we present a novel, efficient search strategy for large vocabulary continuous speech recognition. The search algorithm, based on a stack decoder framework, utilizes phone-level posterior probability estimates (produced by a connectionist/HMM acoustic model) as a basis for phone deactivation pruning - a highly efficient method of reducing the required computation. The single-pass algorithm is naturally factored into the time-asynchronous processing of the word sequence and the time-synchronous processing of the HMM state sequence. This enables the search to be decoupled from the language model while still maintaining the computational benefits of time-synchronous processing. The incorporation of the language model in the search is discussed and computationally cheap approximations to the full language model are introduced. Experiments were performed on the North American Business News task using a 60,000 word vocabulary and a trigram language model. Results indicate that the computational cost of the search may be reduced by more than a factor of 40 with a relative search error of less than 2\% using the techniques discussed in the paper.},
  volume = {7},
  year = {1999},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/sap99-preprint.pdf},
  pages = {542--553},
  categories = {sprach,recognition,search,bnews,sheffield}
}
@article{garau2008,
  author = {Garau, Giulia and Renals, Steve},
  doi = {10.1109/TASL.2008.916519},
  title = {Combining Spectral Representations for Large Vocabulary Continuous Speech Recognition},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4443886},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {3},
  abstract = {In this paper we investigate the combination of complementary acoustic feature streams in large vocabulary continuous speech recognition (LVCSR). We have explored the use of acoustic features obtained using a pitch-synchronous analysis, STRAIGHT, in combination with conventional features such as mel frequency cepstral coefficients. Pitch-synchronous acoustic features are of particular interest when used with vocal tract length normalisation (VTLN) which is known to be affected by the fundamental frequency. We have combined these spectral representations directly at the acoustic feature level using heteroscedastic linear discriminant analysis (HLDA) and at the system level using ROVER. We evaluated this approach on three LVCSR tasks: dictated newspaper text (WSJCAM0), conversational telephone speech (CTS), and multiparty meeting transcription. The CTS and meeting transcription experiments were both evaluated using standard NIST test sets and evaluation protocols. Our results indicate that combining conventional and pitch-synchronous acoustic feature sets using HLDA results in a consistent, significant decrease in word error rate across all three tasks. Combining at the system level using ROVER resulted in a further significant decrease in word error rate.},
  volume = {16},
  year = {2008},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/garau-taslp08.pdf},
  pages = {508--518}
}
@inproceedings{llu2012map,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  doi = {10.1109/ICASSP.2012.6289012},
  title = {Maximum a posteriori adaptation of subspace {Gaussian} mixture models for cross-lingual speech recognition},
  booktitle = {Proc. ICASSP},
  abstract = {This paper concerns cross-lingual acoustic modeling in the case when there are limited target language resources. We build on an approach in which a subspace Gaussian mixture model (SGMM) is adapted to the target language by reusing the globally shared parameters estimated from out-of-language training data. In current cross-lingual systems, these parameters are fixed when training the target system, which can give rise to a mismatch between the source and target systems. We investigate a maximum a posteriori (MAP) adaptation approach to alleviate the potential mismatch. In particular, we focus on the adaptation of phonetic subspace parameters using a matrix variate Gaussian prior distribution. Experiments on the GlobalPhone corpus using the MAP adaptation approach results in word error rate reductions, compared with the cross-lingual baseline systems and systems updated using maximum likelihood, for training conditions with 1 hour and 5 hours of target language data.},
  year = {2012},
  keywords = {Subspace Gaussian Mixture Model, Maximum a Posteriori Adaptation, Cross-lingual Speech Recognition},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-icassp-2012.pdf},
  pages = {4877--4880}
}
@inproceedings{murray06,
  author = {Murray, G. and Renals, S. and Moore, J. and Carletta, J.},
  title = {Incorporating Speaker and Discourse Features into Speech Summarization},
  booktitle = {Proceedings of the Human Language Technology Conference - North American Chapter of the Association for Computational Linguistics Meeting (HLT-NAACL) 2006, New York City, USA},
  month = {June},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/hlt2006-final.pdf},
  abstract = {The research presented herein explores the usefulness of incorporating speaker and discourse features in an automatic speech summarization system applied to meeting recordings from the ICSI Meetings corpus. By analyzing speaker activity, turn-taking and discourse cues, it is hypothesized that a system can outperform solely text-based methods inherited from the field of text summarization. The summarization methods are described, two evaluation methods are applied and compared, and the results clearly show that utilizing such features is advantageous and efficient. Even simple methods relying on discourse cues and speaker activity can outperform text summarization approaches.},
  categories = {summarization, speech summarization, prosody, latent semantic analysis}
}
@inproceedings{qin:perpinan:richmond:wrench:renals:2008a,
  author = {Qin, C. and Carreira-Perpiñán, M. and Richmond, K. and Wrench, A. and Renals, S.},
  title = {Predicting Tongue Shapes from a Few Landmark Locations},
  booktitle = {Proc. Interspeech},
  year = {2008},
  abstract = {We present a method for predicting the midsagittal tongue contour from the locations of a few landmarks (metal pellets) on the tongue surface, as used in articulatory databases such as MOCHA and the Wisconsin XRDB. Our method learns a mapping using ground-truth tongue contours derived from ultrasound data and drastically improves over spline interpolation. We also determine the optimal locations of the landmarks, and the number of landmarks required to achieve a desired prediction error: 3-4 landmarks are enough to achieve 0.3-0.2 mm error per point on the tongue.},
  month = {September},
  key = {qin:perpinan:richmond:wrench:renals:2008a},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080929.PDF},
  pages = {2306--2309},
  categories = {ultrasound, tongue contour, articulation}
}
@inproceedings{koumpis-msdr03,
  author = {Koumpis, K. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/msdr03.ps.gz},
  title = {Evaluation of extractive voicemail summarization},
  booktitle = {Proc. ISCA Workshop on Multilingual Spoken Document Retrieval},
  abstract = {This paper is about the evaluation of a system that generates short text summaries of voicemail messages, suitable for transmission as text messages. Our approach to summarization is based on a speech-recognized transcript of the voicemail message, from which a set of summary words is extracted. The system uses a classifier to identify the summary words, with each word being identified by a vector of lexical and prosodic features. The features are selected using Parcel, an ROC-based algorithm. Our evaluations of the system, using a slot error rate metric, have compared manual and automatic summarization, and manual and automatic recognition (using two different recognizers). We also report on two subjective evaluations using mean opinion score of summaries, and a set of comprehension tests. The main results from these experiments were that the perceived difference in quality of summarization was affected more by errors resulting from automatic transcription, than by the automatic summarization process.},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/msdr03.pdf},
  pages = {19--24},
  categories = {voicemail,summarization,prosody,sheffield}
}
@inproceedings{robinson-eurospeech93,
  author = {Robinson, A. J. and Almeida, L. and Boite, J.-M. and Bourlard, H. and Fallside, F. and Hochberg, M. and Kershaw, D. and Kohn, P. and Konig, Y. and Morgan, N. and Neto, J. P. and Renals, S. and Saerens, M. and Wooters, C.},
  title = {A neural network based, speaker independent, large vocabulary, continuous speech recognition system: the {Wernicke} project},
  booktitle = {Proc. Eurospeech},
  year = {1993},
  address = {Berlin},
  pages = {1941--1944},
  categories = {}
}
@inproceedings{jaimes2007,
  author = {Jaimes, Alejandro and Bourlard, Hervé and Renals, Steve and Carletta, Jean},
  doi = {10.1109/ICIAPW.2007.36},
  title = {Recording, Indexing, Summarizing, and Accessing Meeting Videos: An Overview of the {AMI} Project},
  url = {http://ieeexplore.ieee.org/search/srchabstract.jsp?arnumber=4427477&isnumber=4427459&punumber=4427458&k2dockey=4427477@ieeecnfs&query=%28+%28%28renals%29%3Cin%3Eau+%29+%29+%3Cand%3E+%28pyr+%3E%3D+2006+%3Cand%3E+pyr+%3C%3D+2008%29&pos=6&access=no},
  booktitle = {Proc IEEE ICIAPW},
  abstract = {In this paper we give an overview of the AMI project. AMI developed the following: (1) an infrastructure for recording meetings using multiple microphones and cameras; (2) a one hundred hour, manually annotated meeting corpus; (3) a number of techniques for indexing and summarizing meeting videos using automatic speech recognition and computer vision; and (4) an extensible framework for browsing and searching meeting videos. We give an overview of the various techniques developed in AMI, their integration into our meeting browser framework, and future plans for AMIDA (Augmented Multiparty Interaction with Distant Access), the follow-up project to AMI.},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/jaimes2007.pdf},
  pages = {59--64}
}
@inproceedings{abberley-trec98,
  author = {Abberley, D. and Renals, S. and Cook, G. and Robinson, T.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/trec6.ps.gz},
  title = {The 1997 {THISL} spoken document retrieval system},
  booktitle = {Proc. Sixth Text Retrieval Conference (TREC--6)},
  abstract = {The THISL spoken document retrieval system is based on the Abbot Large Vocabulary Continuous Speech Recognition (LVCSR) system developed by Cambridge University, Sheffield University and SoftSound, and uses PRISE (NIST) for indexing and retrieval. We participated in full SDR mode. Our approach was to transcribe the spoken documents at the word level using Abbot, indexing the resulting text transcriptions using PRISE. The LVCSR system uses a recurrent network-based acoustic model (with no adaptation to different conditions) trained on the 50 hour Broadcast News training set, a 65,000 word vocabulary and a trigram language model derived from Broadcast News text. Words in queries which were out-of-vocabulary (OOV) were word spotted at query time (utilizing the posterior phone probabilities output by the acoustic model), added to the transcriptions of the relevant documents and the collection was then re-indexed. We generated pronunciations at run-time for OOV words using the Festival TTS system (University of Edinburgh).},
  year = {1998},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/trec6.pdf},
  pages = {747--752},
  categories = {thisl,bnews,trec,ir,recognition,eval,sheffield}
}
@article{robinson-specom02,
  author = {Robinson, A. J. and Cook, G. D. and Ellis, D. P. W. and Fosler-Lussier, E. and Renals, S. J. and Williams, D. A. G.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/specom02-preprint.ps.gz},
  title = {Connectionist Speech Recognition of Broadcast News},
  journal = {Speech Communication},
  abstract = {This paper describes connectionist techniques for recognition of Broadcast News. The fundamental difference between connectionist systems and more conventional mixture-of-Gaussian systems is that connectionist models directly estimate posterior probabilities as opposed to likelihoods. Access to posterior probabilities has enabled us to develop a number of novel approaches to confidence estimation, pronunciation modelling and search. In addition we have investigated a new feature extraction technique based on the modulation-filtered spectrogram, and methods for combining multiple information sources. We have incorporated all of these techniques into a system for the transcription of Broadcast News, and we present results on the 1998 DARPA Hub-4E Broadcast News evaluation data.},
  volume = {37},
  year = {2002},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/specom02-preprint.pdf},
  pages = {27--45},
  categories = {sprach,bnews,recognition,am,hybrid,abbot,lm,search,pron,eval,sheffield}
}
@inproceedings{renals-eurospeech93,
  author = {Renals, S. and MacKay, D.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1993/eurosp93-bayes.ps.gz},
  title = {Bayesian regularisation methods in a hybrid {MLP--HMM} system},
  booktitle = {Proc. Eurospeech},
  year = {1993},
  address = {Berlin},
  pages = {1719--1722},
  categories = {}
}
@incollection{renals-nips92,
  editor = {Moody, J. E. and Hanson, S. J. and Lippmann, R. P.},
  author = {Renals, S. and Bourlard, H. and Morgan, N. and Franco, H. and Cohen, M.},
  publisher = {Morgan-Kaufmann},
  title = {Connectionist optimisation of tied mixture hidden {Markov} models},
  booktitle = {Advances in Neural Information Processing Systems},
  volume = {4},
  year = {1992},
  pages = {167--174},
  categories = {}
}
@inproceedings{renals-icassp03,
  author = {Renals, S. and Ellis, D.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-mtg.ps.gz},
  title = {Audio information access from meeting rooms},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {We investigate approaches to accessing information from the streams of audio data that result from multi-channel recordings of meetings. The methods investigated use word-level transcriptions, and information derived from models of speaker activity and speaker turn patterns. Our experiments include spoken document retrieval for meetings, automatic structuring of meetings based on self-similarity matrices of speaker turn patterns and a simple model of speaker activity. Meeting recordings are rich in both lexical and non-lexical information; our results illustrate some novel kinds of analysis made possible by a transcribed corpus of natural meetings.},
  volume = {4},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-mtg.pdf},
  pages = {744--747},
  categories = {m4,multimodal,ir,meetings,sheffield}
}
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
  author = {Cabral, J. and Renals, S. and Richmond, K. and Yamagishi, J.},
  title = {Glottal Spectral Separation for Parametric Speech Synthesis},
  booktitle = {Proc. Interspeech},
  year = {2008},
  abstract = {This paper presents a method to control the characteristics of synthetic speech flexibly by integrating articulatory features into a Hidden Markov Model (HMM)-based parametric speech synthesis system. In contrast to model adaptation and interpolation approaches for speaking style control, this method is driven by phonetic knowledge, and target speech samples are not required. The joint distribution of parallel acoustic and articulatory features considering cross-stream feature dependency is estimated. At synthesis time, acoustic and articulatory features are generated simultaneously based on the maximum-likelihood criterion. The synthetic speech can be controlled flexibly by modifying the generated articulatory features according to arbitrary phonetic rules in the parameter generation process. Our experiments show that the proposed method is effective in both changing the overall character of synthesized speech and in controlling the quality of a specific vowel.},
  month = {September},
  key = {cabral:renals:richmond:yamagishi:2008a},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
  pages = {1829--1832},
  categories = {HMM speech synthesis, Glottal Spectral Separation, LF-model}
}
@inproceedings{pietquin-icassp02,
  author = {Pietquin, O. and Renals, S.},
  title = {{ASR} system modeling for automatic evaluation and optimization of dialogue systems},
  booktitle = {Proc IEEE ICASSP},
  abstract = {Though the field of spoken dialogue systems has developed quickly in the last decade, rapid design of dialogue strategies remains difficult. Several approaches to the problem of automatic strategy learning have been proposed, and the use of Reinforcement Learning introduced by Levin and Pieraccini is becoming part of the state of the art in this area. However, the quality of the strategy learned by the system depends on the definition of the optimization criterion and on the accuracy of the environment model. In this paper, we propose to bring a model of an ASR system into the simulated environment in order to enhance the learned strategy. To do so, we introduce recognition error rates and confidence levels produced by ASR systems into the optimization criterion.},
  year = {2002},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/icassp02-rl.pdf},
  pages = {46--49},
  categories = {dialog,rl,sheffield}
}
@inproceedings{kershaw-icslp96,
  author = {Kershaw, D. and Robinson, T. and Renals, S.},
  title = {The 1995 {Abbot} {LVCSR} system for multiple unknown microphones},
  booktitle = {Proc. ICSLP},
  year = {1996},
  address = {Philadelphia PA},
  pages = {1325--1328},
  categories = {wernicke,sprach,wsj,recognition,am,hybrid,abbot,search,eval,sheffield}
}
@inproceedings{NistevalAMI05,
  author = {Hain, T. and Burget, L. and Dines, J. and Garau, G. and Karafiat, M. and Lincoln, M. and McCowan, I. and Moore, D. and Wan, V. and Ordelman, R. and Renals, S.},
  title = {The 2005 {AMI} System for the transcription of Speech in Meetings},
  booktitle = {Proceedings of the Rich Transcription 2005 Spring Meeting Recognition Evaluation},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIasr.nist2005.pdf},
  abstract = {In this paper we describe the 2005 AMI system for the transcription of speech in meetings used in the 2005 NIST RT evaluations. The system was designed for participation in the speech to text part of the evaluations, in particular for transcription of speech recorded with multiple distant microphones and independent headset microphones. System performance was tested on both conference room and lecture style meetings. Although input sources are processed using different frontends, the recognition process is based on a unified system architecture. The system operates in multiple passes and makes use of state of the art technologies such as discriminative training, vocal tract length normalisation, heteroscedastic linear discriminant analysis, speaker adaptation with maximum likelihood linear regression and minimum word error rate decoding. In this paper we describe the system performance on the official development and test sets for the NIST RT05s evaluations. The system was jointly developed in less than 10 months by a multi-site team and was shown to achieve competitive performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT05S}
}
@inproceedings{cuayahuitletal_slt06,
  author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  title = {Reinforcement Learning of Dialogue Strategies With Hierarchical Abstract Machines},
  booktitle = {Proc. IEEE/ACL Workshop on Spoken Language Technology (SLT)},
  month = {December},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/ham-slt2006.pdf},
  abstract = {In this paper we propose partially specified dialogue strategies for dialogue strategy optimization, where part of the strategy is specified deterministically and the rest optimized with Reinforcement Learning (RL). To do this we apply RL with Hierarchical Abstract Machines (HAMs). We also propose to build simulated users using HAMs, incorporating a combination of hierarchical deterministic and probabilistic behaviour. We performed experiments using a single-goal flight booking dialogue system, and compare two dialogue strategies (deterministic and optimized) using three types of simulated user (novice, experienced and expert). Our results show that HAMs are promising for both dialogue optimization and simulation, and provide evidence that indeed partially specified dialogue strategies can outperform deterministic ones (on average 4.7 fewer system turns) with faster learning than the traditional RL framework.},
  categories = {reinforcement learning, spoken dialogue systems}
}
@inproceedings{bell12_mlan,
  author = {Bell, P. and Gales, M. and Lanchantin, P. and Liu, X. and Long, Y. and Renals, S. and Swietojanski, P. and Woodland, P.},
  doi = {10.1109/SLT.2012.6424244},
  title = {Transcription of multi-genre media archives using out-of-domain data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  year = {2012},
  abstract = {We describe our work on developing a speech recognition system for multi-genre media archives. The high diversity of the data makes this a challenging recognition task, which may benefit from systems trained on a combination of in-domain and out-of-domain data. Working with tandem HMMs, we present Multi-level Adaptive Networks (MLAN), a novel technique for incorporating information from out-of-domain posterior features using deep neural networks. We show that it provides a substantial reduction in WER over other systems, with relative WER reductions of 15\% over a PLP baseline, 9\% over in-domain tandem features and 8\% over the best out-of-domain tandem features.},
  month = {December},
  address = {Miami, Florida, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/mlan_slt2012.pdf},
  pages = {324--329}
}
@inproceedings{renals2007,
  author = {Renals, Steve and Hain, Thomas and Bourlard, Hervé},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ami-asru2007.pdf},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU '07)},
  year = {2007},
  abstract = {The AMI and AMIDA projects are concerned with the recognition and interpretation of multiparty meetings. Within these projects we have: developed an infrastructure for recording meetings using multiple microphones and cameras; released a 100 hour annotated corpus of meetings; developed techniques for the recognition and interpretation of meetings based primarily on speech recognition and computer vision; and developed an evaluation framework at both component and system levels. In this paper we present an overview of these projects, with an emphasis on speech recognition and content extraction.},
  title = {Recognition and interpretation of meetings: The {AMI} and {AMIDA} projects}
}
@inproceedings{dielmann-icassp07,
  author = {Dielmann, A. and Renals, S.},
  title = {{DBN} based joint Dialogue Act recognition of multiparty meetings},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {Joint Dialogue Act segmentation and classification of the new {AMI} meeting corpus has been performed through an integrated framework based on a switching dynamic {Bayesian} network and a set of continuous features and language models. The recognition process is based on a dictionary of 15 {DA} classes tailored for group decision-making. Experimental results show that a novel interpolated Factored Language Model results in a low error rate on the automatic segmentation task, and thus good recognition results can be achieved on {AMI} multiparty conversational speech.},
  month = {April},
  volume = {4},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-icassp07.pdf},
  pages = {133--136},
  categories = {ami,dialogue act,dbn,factored language model,meetings,edinburgh}
}
@inproceedings{kolluru-asru03,
  author = {Kolluru, B. and Christensen, H. and Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-style.ps.gz},
  title = {Exploring the style-technique interaction in extractive summarization of broadcast news},
  booktitle = {Proc. IEEE Automatic Speech Recognition and Understanding Workshop},
  abstract = {In this paper we seek to explore the interaction between the style of a broadcast news story and its summarization technique. We report the performance of three different summarization techniques on broadcast news stories, which are split into planned speech and spontaneous speech. The initial results indicate that some summarization techniques work better for the documents with spontaneous speech than for those with planned speech. Even for human beings some documents are inherently difficult to summarize. We observe this correlation between the degree of difficulty in summarizing and the performance of the three automatic summarizers. Given the high frequency of named entities in broadcast news and the even greater number of references to these named entities, we also gauge the effect of named entity and coreference resolution in a news story on the performance of these summarizers.},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-style.pdf},
  pages = {},
  categories = {s3l,summarization,bnews,edinburgh}
}
@inproceedings{cabral_ssw7,
  author = {Cabral, Jo{\~a}o and Renals, Steve and Richmond, Korin and Yamagishi, Junichi},
  title = {Transforming Voice Source Parameters in a {HMM}-based Speech Synthesiser with Glottal Post-Filtering},
  booktitle = {Proc. 7th ISCA Speech Synthesis Workshop (SSW7)},
  year = {2010},
  abstract = {Control over voice quality, e.g. breathy and tense voice, is important for speech synthesis applications. For example, transformations can be used to modify aspects of the voice related to speaker's identity and to improve expressiveness. However, it is hard to modify voice characteristics of the synthetic speech without degrading speech quality. State-of-the-art statistical speech synthesisers, in particular, do not typically allow control over parameters of the glottal source, which are strongly correlated with voice quality. Consequently, the control of voice characteristics in these systems is limited. In contrast, the HMM-based speech synthesiser proposed in this paper uses an acoustic glottal source model. The system passes the glottal signal through a whitening filter to obtain the excitation of voiced sounds. This technique, called glottal post-filtering, allows the voice characteristics of the synthetic speech to be transformed by modifying the source model parameters. We evaluated the proposed synthesiser in a perceptual experiment, in terms of speech naturalness, intelligibility, and similarity to the original speaker's voice. The results show that it performed as well as an HMM-based synthesiser which generates the speech signal with a commonly used high-quality speech vocoder.},
  month = {September},
  address = {NICT/ATR, Kyoto, Japan},
  keywords = {HMM-based speech synthesis, voice quality, glottal post-filter},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/cabral_ssw7.pdf},
  pages = {365--370}
}
@inproceedings{gotoh-eurospeech97,
  author = {Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-lsa.ps.gz},
  title = {Document space models using latent semantic analysis},
  booktitle = {Proc. Eurospeech},
  year = {1997},
  abstract = {In this paper, an approach for constructing mixture language models (LMs) based on some notion of semantics is discussed. To this end, a technique known as latent semantic analysis (LSA) is used. The approach encapsulates corpus-derived semantic information and is able to model the varying style of the text. Using such information, the corpus texts are clustered in an unsupervised manner and mixture LMs are automatically created. This work builds on previous work in the field of information retrieval which was recently applied by Bellegarda et al. to the problem of clustering words by semantic categories. The principal contribution of this work is to characterize the document space resulting from the LSA modeling and to demonstrate the approach for mixture LM application. Comparison is made between manual and automatic clustering in order to elucidate how the semantic information is expressed in the space. It is shown that, using semantic information, mixture LMs perform better than a conventional single LM with a slight increase in computational cost.},
  address = {Rhodes},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-lsa.pdf},
  pages = {1443--1446},
  categories = {sprach,lm,bnc,sheffield}
}
@inproceedings{vipperla2010a,
  author = {Vipperla, Ravi Chander and Renals, Steve and Frankel, Joe},
  title = {Augmentation of adaptation data},
  booktitle = {Proc. Interspeech},
  year = {2010},
  abstract = {Linear regression based speaker adaptation approaches can improve Automatic Speech Recognition (ASR) accuracy significantly for a target speaker. However, when the available adaptation data is limited to a few seconds, the accuracy of the speaker adapted models is often worse compared with speaker independent models. In this paper, we propose an approach to select a set of reference speakers acoustically close to the target speaker whose data can be used to augment the adaptation data. To determine the acoustic similarity of two speakers, we propose a distance metric based on transforming sample points in the acoustic space with the regression matrices of the two speakers. We show the validity of this approach through a speaker identification task. ASR results on SCOTUS and AMI corpora with limited adaptation data of 10 to 15 seconds augmented by data from selected reference speakers show a significant improvement in Word Error Rate over speaker independent and speaker adapted models.},
  month = {September},
  address = {Makuhari, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-is2010.pdf},
  pages = {530--533}
}
@inproceedings{terry-icassp88,
  author = {Terry, M. and Renals, S. and Rohwer, R. and Harrington, J.},
  title = {A connectionist approach to speech recognition using peripheral auditory modelling},
  booktitle = {Proc IEEE ICASSP},
  year = {1988},
  address = {New York},
  pages = {699--702},
  categories = {}
}
@inproceedings{williams-icslp98,
  author = {Williams, G. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-conf.ps.gz},
  title = {Confidence measures derived from an acceptor {HMM}},
  booktitle = {Proc. ICSLP},
  year = {1998},
  abstract = {In this paper we define a number of confidence measures derived from an acceptor HMM and evaluate their performance for the task of utterance verification using the North American Business News (NAB) and Broadcast News (BN) corpora. Results are presented for decodings made at both the word and phone level which show the relative profitability of rejection provided by the diverse set of confidence measures. The results indicate that language model dependent confidence measures have reduced performance on BN data relative to that for the more grammatically constrained NAB data. An explanation linking the observations that rejection is more profitable for noisy acoustics, for a reduced vocabulary and at the phone level is also given.},
  address = {Sydney},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-conf.pdf},
  pages = {831--834},
  categories = {recognition,conf,hybrid,bnews,sheffield}
}
@inproceedings{renals-icassp91,
  author = {Renals, S. and McKelvie, D. and McInnes, F.},
  title = {A comparative study of continuous speech recognition using neural networks and hidden {Markov} models},
  booktitle = {Proc IEEE ICASSP},
  year = {1991},
  address = {Toronto},
  pages = {369--372},
  categories = {}
}
@inproceedings{koumpis-icslp00,
  author = {Koumpis, K. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icslp00.ps.gz},
  title = {Transcription and Summarization of Voicemail Speech},
  booktitle = {Proc. ICSLP},
  year = {2000},
  abstract = {This paper describes the development of a system to transcribe and summarize voicemail messages. The results of the research presented in this paper are two-fold. First, a hybrid connectionist approach to the Voicemail transcription task shows that competitive performance can be achieved using a context-independent system with fewer parameters than those based on mixtures of Gaussian likelihoods. Second, an effective and robust combination of statistical and prior knowledge sources for term weighting is used to extract information from the decoder's output in order to deliver summaries to the message recipients via a GSM Short Message Service (SMS) gateway.},
  volume = {2},
  address = {Beijing},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icslp00.pdf},
  pages = {688--691},
  categories = {voicemail,summarization,sheffield}
}
@inproceedings{gotoh-icassp99,
  author = {Gotoh, Y. and Renals, S. and Williams, G.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icassp99.ps.gz},
  title = {Named entity tagged language models},
  booktitle = {Proc IEEE ICASSP},
  year = {1999},
  abstract = {We introduce Named Entity (NE) Language Modelling, a stochastic finite state machine approach to identifying both words and NE categories from a stream of spoken data. We provide an overview of our approach to NE tagged language model (LM) generation together with results of the application of such a LM to the task of out-of-vocabulary (OOV) word reduction in large vocabulary speech recognition. Using the Wall Street Journal and Broadcast News corpora, it is shown that the tagged LM was able to reduce the overall word error rate by 14\%, detecting up to 70\% of previously OOV words. We also describe an example of the direct tagging of spoken data with NE categories.},
  address = {Phoenix AZ},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icassp99.pdf},
  pages = {513--516},
  categories = {sprach,ie,lm,bnews,sheffield}
}
@inproceedings{huang2007-asru,
  author = {Huang, Songfang and Renals, Steve},
  title = {Hierarchical {Pitman-Yor} Language Models for {ASR} in Meetings},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU'07)},
  year = {2007},
  abstract = {In this paper we investigate the application of a novel technique for language modeling --- a hierarchical Bayesian language model (LM) based on the Pitman-Yor process --- on automatic speech recognition (ASR) for multiparty meetings. The hierarchical Pitman-Yor language model (HPYLM), which was originally proposed in the machine learning field, provides a Bayesian interpretation to language modeling. An approximation to the HPYLM recovers the exact formulation of the interpolated Kneser-Ney smoothing method in n-gram models. This paper focuses on the application and scalability of HPYLM on a practical large vocabulary ASR system. Experimental results on NIST RT06s evaluation meeting data verify that HPYLM is a competitive and promising language modeling technique, which consistently performs better than interpolated Kneser-Ney and modified Kneser-Ney n-gram LMs in terms of both perplexity (PPL) and word error rate (WER).},
  month = {December},
  address = {Kyoto, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/asru07.pdf},
  pages = {124--129}
}
@inproceedings{huang2010a,
  author = {Huang, Songfang and Renals, Steve},
  doi = {10.1109/ICASSP.2010.5495007},
  title = {Power Law Discounting for N-Gram Language Models},
  url = {http://dx.doi.org/10.1109/ICASSP.2010.5495007},
  booktitle = {Proc. IEEE ICASSP--10},
  abstract = {We present an approximation to the Bayesian hierarchical Pitman-Yor process language model which maintains the power law distribution over word tokens, while not requiring a computationally expensive approximate inference process. This approximation, which we term power law discounting, has a similar computational complexity to interpolated and modified Kneser-Ney smoothing. We performed experiments on meeting transcription using the NIST RT06s evaluation data and the AMI corpus, with a vocabulary of 50,000 words and a language model training set of up to 211 million words. Our results indicate that power law discounting results in statistically significant reductions in perplexity and word error rate compared to both interpolated and modified Kneser-Ney smoothing, while producing similar results to the hierarchical Pitman-Yor process language model.},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-icassp10.pdf},
  pages = {5178--5181}
}
@inproceedings{renals-eurospeech99,
  author = {Renals, S. and Gotoh, Y.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-ne.ps.gz},
  title = {Integrated transcription and identification of named entities in broadcast speech},
  booktitle = {Proc. Eurospeech},
  year = {1999},
  abstract = {This paper presents an approach to integrating functions for both transcription and named entity (NE) identification into a large vocabulary continuous speech recognition system. It builds on the NE tagged language modelling approach, which was recently applied to the development of the statistical NE annotation system. We also present results for a proper name identification experiment using the Hub-4E open evaluation data.},
  address = {Budapest},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-ne.pdf},
  pages = {1039--1042},
  categories = {sprach,stobs,ie,lm,bnews,sheffield}
}
@inproceedings{renals-icassp95,
  author = {Renals, S. and Hochberg, M.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/icassp95-search.ps.gz},
  title = {Efficient search using posterior phone probability estimates},
  booktitle = {Proc IEEE ICASSP},
  year = {1995},
  abstract = {In this paper we present a novel, efficient search strategy for large vocabulary continuous speech recognition (LVCSR). The search algorithm, based on stack decoding, uses posterior phone probability estimates to substantially increase its efficiency with minimal effect on accuracy. In particular, the search space is dramatically reduced by phone deactivation pruning where phones with a small local posterior probability are deactivated. This approach is particularly well-suited to hybrid connectionist/hidden Markov model systems because posterior phone probabilities are directly computed by the acoustic model. On large vocabulary tasks, using a trigram language model, this increased the search speed by an order of magnitude, with 2\% or less relative search error. Results from a hybrid system are presented using the Wall Street Journal LVCSR database for a 20,000 word task using a backed-off trigram language model. For this task, our single-pass decoder took around 15 times realtime on an HP735 workstation. At the cost of 7\% relative search error, decoding time can be speeded up to approximately realtime.},
  address = {Detroit},
  pages = {596--599},
  categories = {wernicke,recognition,wsj,search,sheffield,cambridge}
}
@inproceedings{wolters2010,
  author = {Wolters, Maria K. and Isaac, Karl B. and Renals, Steve},
  title = {Evaluating speech synthesis intelligibility using {Amazon Mechanical Turk}},
  booktitle = {Proc. 7th Speech Synthesis Workshop (SSW7)},
  abstract = {Microtask platforms such as Amazon Mechanical Turk (AMT) are increasingly used to create speech and language resources. AMT in particular allows researchers to quickly recruit a large number of fairly demographically diverse participants. In this study, we investigated whether AMT can be used for comparing the intelligibility of speech synthesis systems. We conducted two experiments in the lab and via AMT, one comparing US English diphone to US English speaker-adaptive HTS synthesis and one comparing UK English unit selection to UK English speaker-dependent HTS synthesis. While AMT word error rates were worse than lab error rates, AMT results were more sensitive to relative differences between systems. This is mainly due to the larger number of listeners. Boxplots and multilevel modelling allowed us to identify listeners who performed particularly badly, while thresholding was sufficient to eliminate rogue workers. We conclude that AMT is a viable platform for synthetic speech intelligibility comparisons.},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wolters-ssw2010.pdf},
  pages = {136--141},
  categories = {intelligibility, evaluation, semantically unpredictable sentences, diphone, unit selection, crowdsourcing, Mechanical Turk, HMM-based synthesis}
}
@inproceedings{gotoh-icassp00,
  author = {Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icassp2000.ps.gz},
  title = {Variable word rate n-grams},
  booktitle = {Proc IEEE ICASSP},
  year = {2000},
  abstract = {The rate of occurrence of words is not uniform but varies from document to document. Despite this observation, parameters for conventional n-gram language models are usually derived using the assumption of a constant word rate. In this paper we investigate the use of a variable word rate assumption, modelled by a Poisson distribution or a continuous mixture of Poissons. We present an approach to estimating the relative frequencies of words or n-grams taking prior information about their occurrences into account. Discounting and smoothing schemes are also considered. Using the Broadcast News task, the approach demonstrates a reduction in perplexity of up to 10\%.},
  address = {Istanbul},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icassp2000.pdf},
  pages = {1591--1594},
  categories = {stobs,lm,bnews,sheffield}
}
@inproceedings{renals-fase88,
  author = {Renals, S. and Rohwer, R. and Terry, M.},
  title = {A comparison of speech recognition front ends using a connectionist classifier},
  booktitle = {Proc. FASE Speech '88},
  year = {1988},
  address = {Edinburgh},
  pages = {1381--1388},
  categories = {}
}
@article{wrigley-sap05,
  author = {Wrigley, S. J. and Brown, G. J. and Wan, V. and Renals, S.},
  title = {Speech and crosstalk detection in multi-channel audio},
  journal = {IEEE Trans. on Speech and Audio Processing},
  abstract = {The analysis of scenarios in which a number of microphones record the activity of speakers, such as in a roundtable meeting, presents a number of computational challenges. For example, if each participant wears a microphone, it can receive speech from both the microphone's wearer (local speech) and from other participants (crosstalk). The recorded audio can be broadly classified in four ways: local speech, crosstalk plus local speech, crosstalk alone and silence. We describe two experiments related to the automatic classification of audio into these four classes. The first experiment attempted to optimise a set of acoustic features for use with a Gaussian mixture model (GMM) classifier. A large set of potential acoustic features were considered, some of which have been employed in previous studies. The best-performing features were found to be kurtosis, fundamentalness and cross-correlation metrics. The second experiment used these features to train an ergodic hidden Markov model classifier. Tests performed on a large corpus of recorded meetings show classification accuracies of up to 96\%, and automatic speech recognition performance close to that obtained using ground truth segmentation.},
  volume = {13},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap04-xtalk.pdf},
  pages = {84--91},
  categories = {m4,meetings,edinburgh,asr,sheffield}
}
@article{yamagishi2009,
  author = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga and Ling, Zhenhua and Toda, Tomoki and Tokuda, Keiichi and King, Simon and Renals, Steve},
  title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech Synthesis},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5109758&arnumber=5153555&count=14&index=12},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {6},
  abstract = {This paper describes a speaker-adaptive HMM-based speech synthesis system. The new system, called ``HTS-2007,'' employs speaker adaptation (CSMAPLR+MAP), feature-space adaptive training, mixed-gender modeling, and full-covariance modeling using CSMAPLR transforms, in addition to several other techniques that have proved effective in our previous systems. Subjective evaluation results show that the new system generates significantly better quality synthetic speech than speaker-dependent approaches with realistic amounts of speech data, and that it bears comparison with speaker-dependent approaches even when large amounts of speech data are available. In addition, a comparison study with several speech synthesis techniques shows the new system is very robust: It is able to build voices from less-than-ideal speech data and synthesize good-quality speech even for out-of-domain sentences.},
  volume = {17},
  year = {2009},
  pdf = {},
  pages = {1208--1230}
}
@inproceedings{neto-eurospeech95,
  author = {Neto, J. and Almeida, L. and Hochberg, M. and Martins, C. and Nunes, L. and Renals, S. and Robinson, T.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/eurosp95.ps.gz},
  title = {Speaker adaptation for hybrid {HMM--ANN} continuous speech recognition system},
  booktitle = {Proc. Eurospeech},
  year = {1995},
  abstract = {It is well known that recognition performance degrades significantly when moving from a speaker- dependent to a speaker-independent system. Traditional hidden Markov model (HMM) systems have successfully applied speaker-adaptation approaches to reduce this degradation. In this paper we present and evaluate some techniques for speaker-adaptation of a hybrid HMM-artificial neural network (ANN) continuous speech recognition system. These techniques are applied to a well trained, speaker-independent, hybrid HMM-ANN system and the recognizer parameters are adapted to a new speaker through off-line procedures. The techniques are evaluated on the DARPA RM corpus using varying amounts of adaptation material and different ANN architectures. The results show that speaker-adaptation within the hybrid framework can substantially improve system performance.},
  address = {Madrid},
  pages = {2171--2174},
  categories = {wernicke,rm,recognition,am,hybrid,adaptation,sheffield,cambridge}
}
@incollection{renals2010,
  editor = {Hardcastle, William J. and Laver, John and Gibbon, Fiona E.},
  author = {Renals, Steve and King, Simon},
  chapter = {22},
  publisher = {Wiley Blackwell},
  booktitle = {Handbook of Phonetic Sciences},
  year = {2010},
  title = {Automatic Speech Recognition}
}
@article{christensen2008,
  author = {Christensen, Heidi and Gotoh, Yoshihiko and Renals, Steve},
  doi = {10.1109/TASL.2007.910746},
  title = {A Cascaded Broadcast News Highlighter},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4407525&arnumber=4383075&count=28&index=16},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  abstract = {This paper presents a fully automatic news skimming system which takes a broadcast news audio stream and provides the user with the segmented, structured and highlighted transcript. This constitutes a system with three different, cascading stages: converting the audio stream to text using an automatic speech recogniser, segmenting into utterances and stories and finally determining which utterance should be highlighted using a saliency score. Each stage must operate on the erroneous output from the previous stage in the system; an effect which is naturally amplified as the data progresses through the processing stages. We present a large corpus of transcribed broadcast news data enabling us to investigate to what degree information worth highlighting survives this cascading of processes. Both extrinsic and intrinsic experimental results indicate that mistakes in the story boundary detection have a strong impact on the quality of highlights, whereas erroneous utterance boundaries cause only minor problems. Further, the difference in transcription quality does not affect the overall performance greatly.},
  volume = {16},
  year = {2008},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/christensen-tasl08.pdf},
  pages = {151--161}
}
@inproceedings{koumpis-eurospeech03,
  author = {Koumpis, K. and Renals, S.},
  title = {Multi-class Extractive Voicemail Summarization},
  booktitle = {Proc. Eurospeech},
  abstract = {This paper is about a system that extracts principal content words from speech-recognized transcripts of voicemail messages and classifies them into proper names, telephone numbers, dates/times and `other'. The short text summaries generated are suitable for mobile messaging applications. The system uses a set of classifiers to identify the summary words, with each word being identified by a vector of lexical and prosodic features. The features are selected using Parcel, an ROC-based algorithm. We visually compare the role of a large number of individual features and discuss effective ways to combine them. We finally evaluate their performance on manual and automatic transcriptions derived from two different speech recognition systems.},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/eurospeech03-voicemail.pdf},
  pages = {2785--2788},
  categories = {voicemail,summarization,prosody,sheffield}
}
@incollection{huang2008-mlmi,
  editor = {Popescu-Belis, A. and Stiefelhagen, R.},
  author = {Huang, Songfang and Renals, Steve},
  publisher = {Springer},
  title = {Modeling Topic and Role Information in Meetings using the Hierarchical {D}irichlet Process},
  series = {Lecture Notes in Computer Science},
  booktitle = {Machine Learning for Multimodal Interaction V},
  abstract = {In this paper, we address the modeling of topic and role information in multiparty meetings, via a nonparametric Bayesian model called the hierarchical Dirichlet process. This model provides a powerful solution to topic modeling and a flexible framework for the incorporation of other cues such as speaker role information. We present our modeling framework for topic and role on the AMI Meeting Corpus, and illustrate the effectiveness of the approach in the context of adapting a baseline language model in a large-vocabulary automatic speech recognition system for multiparty meetings. The adapted LM produces significant improvements in terms of both perplexity and word error rate.},
  volume = {5237},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/mlmi08.pdf},
  pages = {214--225}
}
@inproceedings{wan-icassp02,
  author = {Wan, V. and Renals, S.},
  title = {Evaluation of Kernel Methods for Speaker Verification and Identification},
  booktitle = {Proc IEEE ICASSP},
  abstract = {Support vector machines are evaluated on speaker verification and speaker identification tasks. We compare the polynomial kernel, the Fisher kernel, a likelihood ratio kernel and the pair hidden Markov model kernel with baseline systems based on a discriminative polynomial classifier and generative Gaussian mixture model classifiers. Simulations were carried out on the YOHO database and some promising results were obtained.},
  year = {2002},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/icassp02-svm.pdf},
  pages = {669--672},
  categories = {verification,kernel,svm,sheffield}
}
@article{vipperla2010,
  author = {Vipperla, Ravi Chander and Renals, Steve and Frankel, Joe},
  doi = {10.1155/2010/525783},
  title = {Ageing voices: The effect of changes in voice parameters on {ASR} performance},
  url = {http://dx.doi.org/10.1155/2010/525783},
  journal = {EURASIP Journal on Audio, Speech, and Music Processing},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-eurasip10.pdf},
  abstract = {With ageing, human voices undergo several changes which are typically characterized by increased hoarseness and changes in articulation patterns. In this study, we have examined the effect on Automatic Speech Recognition (ASR) and found that the Word Error Rates (WER) on older voices are about 9\% absolute higher compared to those of adult voices. Subsequently, we compared several voice source parameters including fundamental frequency, jitter, shimmer, harmonicity and cepstral peak prominence of adult and older males. Several of these parameters show a statistically significant difference for the two groups. However, artificially increasing jitter and shimmer measures does not affect the ASR accuracies significantly. Artificially lowering the fundamental frequency degrades the ASR performance marginally but this drop in performance can be overcome to some extent using Vocal Tract Length Normalisation (VTLN). Overall, we observe that the changes in the voice source parameters do not have a significant impact on ASR performance. Comparison of the likelihood scores of all the phonemes for the two age groups shows that there is a systematic mismatch in the acoustic space of the two age groups. Comparison of the phoneme recognition rates shows that mid vowels, nasals and phonemes that depend on the ability to create constrictions with the tongue tip for articulation are more affected by ageing than other phonemes.}
}
@inproceedings{cabral_yrwst,
  author = {Cabral, J. and Renals, S. and Richmond, K. and Yamagishi, J.},
  title = {{HMM}-based Speech Synthesis with an Acoustic Glottal Source Model},
  booktitle = {Proc. The First Young Researchers Workshop in Speech Technology},
  month = {April},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf},
  abstract = {A major cause of degradation of speech quality in HMM-based speech synthesis is the use of a simple delta pulse signal to generate the excitation of voiced speech. This paper describes a new approach to using an acoustic glottal source model in HMM-based synthesisers. The goal is to improve speech quality and parametric flexibility to better model and transform voice characteristics.},
  categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral Separation}
}
@inproceedings{renals-icslp94,
  author = {Renals, S. and Hochberg, M.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/icslp94-gamma.ps.gz},
  title = {Using {Gamma} filters to model temporal dependencies in speech},
  booktitle = {Proc. ICSLP},
  year = {1994},
  address = {Yokohama},
  pages = {1491--1494},
  categories = {}
}
@inproceedings{murray2007-interspeech,
  author = {Murray, Gabriel and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/IS070966.PDF},
  booktitle = {Proc. Interspeech '07},
  year = {2007},
  abstract = {The majority of speech summarization research has focused on extracting the most informative dialogue acts from recorded, archived data. However, a potential use case for speech summarization in the meetings domain is to facilitate a meeting in progress by providing the participants - whether they are attending in-person or remotely - with an indication of the most important parts of the discussion so far. This requires being able to determine whether a dialogue act is extract-worthy before the global meeting context is available. This paper introduces a novel method for weighting dialogue acts using only very limited local context, and shows that high summary precision is possible even when information about the meeting as a whole is lacking. A new evaluation framework consisting of weighted precision, recall and f-score is detailed, and the novel online summarization method is shown to significantly increase recall and f-score compared with a method using no contextual information.},
  title = {Towards online speech summarization}
}
@incollection{renals2010a,
  editor = {Clark, Alex and Fox, Chris and Lappin, Shalom},
  author = {Renals, Steve and Hain, Thomas},
  publisher = {Wiley Blackwell},
  booktitle = {Handbook of Computational Linguistics and Natural Language Processing},
  year = {2010},
  title = {Speech Recognition}
}
@incollection{karlsen-casa97,
  editor = {Rosenthal, D. F. and Okuno, H. G.},
  author = {Karlsen, B. L. and Brown, G. J. and Cooke, M. and Green, P. and Renals, S.},
  publisher = {Lawrence Erlbaum Associates},
  title = {Analysis of a simultaneous speaker sound corpus},
  booktitle = {Computational Auditory Scene Analysis},
  year = {1997},
  pages = {321--334},
  categories = {}
}
@article{lu_spl_2011,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Regularized Subspace Gaussian Mixture Models for Speech Recognition},
  journal = {IEEE Signal Processing Letters},
  number = {7},
  abstract = {Subspace Gaussian mixture models (SGMMs) provide a compact representation of the Gaussian parameters in an acoustic model, but may still suffer from over-fitting with insufficient training data. In this letter, the SGMM state parameters are estimated using a penalized maximum-likelihood objective, based on $\ell_1$ and $\ell_2$ regularization, as well as their combination, referred to as the elastic net, for robust model estimation. Experiments on the 5000-word Wall Street Journal transcription task show word error rate reduction and improved model robustness with regularization.},
  volume = {18},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-spl-2011.pdf},
  pages = {419--422},
  categories = {Acoustic Modelling, Regularization, Sparsity, Subspace Gaussian Mixture Model}
}
@article{goldman2005,
  author = {Goldman, Jerry and Renals, Steve and Bird, Steven and {de Jong}, Franciska and Federico, Marcello and Fleischhauer, Carl and Kornbluh, Mark and Lamel, Lori and Oard, Doug and Stewart, Clare and Wright, Richard},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/swag-ijdl05.ps.gz},
  title = {Accessing the spoken word},
  journal = {International Journal of Digital Libraries},
  number = {4},
  abstract = {Spoken word audio collections cover many domains, including radio and television broadcasts, oral narratives, governmental proceedings, lectures, and telephone conversations. The collection, access and preservation of such data is stimulated by political, economic, cultural and educational needs. This paper outlines the major issues in the field, reviews the current state of technology, examines the rapidly changing policy issues relating to privacy and copyright, and presents issues relating to the collection and preservation of spoken audio content.},
  volume = {5},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/swag-ijdl05.pdf},
  pages = {287--298},
  categories = {swag,asr,ir,edinburgh}
}
@inproceedings{hifny-interspeech05,
  author = {Hifny, Y. and Renals, S. and Lawrence, N.},
  title = {A Hybrid {MaxEnt/HMM} based {ASR} System},
  booktitle = {Proc. Interspeech},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hifny-eurospeech05.pdf},
  abstract = {The aim of this work is to develop a practical framework, which extends the classical Hidden Markov Models (HMM) for continuous speech recognition based on the Maximum Entropy (MaxEnt) principle. The MaxEnt models can estimate the posterior probabilities directly as with Hybrid NN/HMM connectionist speech recognition systems. In particular, a new acoustic modelling based on discriminative MaxEnt models is formulated and is being developed to replace the generative Gaussian Mixture Models (GMM) commonly used to model acoustic variability. Initial experimental results using the TIMIT phone task are reported.},
  categories = {ml,asr,edinburgh,sheffield}
}
@incollection{dielmann-mlmi04,
  editor = {Bengio, S. and Bourlard, H.},
  author = {Dielmann, A. and Renals, S.},
  publisher = {Springer},
  title = {Multistream dynamic {Bayesian} network for meeting segmentation},
  booktitle = {Proc. Multimodal Interaction and Related Machine Learning Algorithms Workshop (MLMI--04)},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.ps.gz},
  abstract = {This paper investigates the automatic analysis and segmentation of meetings. A meeting is analysed in terms of individual behaviours and group interactions, in order to decompose each meeting into a sequence of relevant phases, named meeting actions. Three feature families are extracted from multimodal recordings: prosody from individual lapel microphone signals, speaker activity from microphone array data and lexical features from textual transcripts. A statistical approach is then used to relate low-level features with a set of abstract categories. In order to provide a flexible and powerful framework, we have employed a dynamic Bayesian network based model, characterized by multiple stream processing and flexible state duration modelling. Experimental results demonstrate the strength of this system, providing a meeting action error rate of 9\%.},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.pdf},
  pages = {76--86},
  categories = {m4,multimodal,dbn,meetings,edinburgh}
}
@inproceedings{wan-icassp03,
  author = {Wan, V. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-svm.ps.gz},
  title = {{SVMSVM}: Support vector machine speaker verification methodology},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {Support vector machines with the Fisher and score-space kernels are used for text independent speaker verification to provide direct discrimination between complete utterances. This is unlike approaches such as discriminatively trained Gaussian mixture models or other discriminative classifiers that discriminate at the frame-level only. Using the sequence-level discrimination approach we are able to achieve error-rates that are significantly better than the current state-of-the-art on the PolyVar database.},
  volume = {2},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-svm.pdf},
  pages = {221--224},
  categories = {verification,kernel,svm,sheffield}
}
@inproceedings{dielmann-mmsp04,
  author = {Dielmann, A. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.ps.gz},
  title = {Multi-stream segmentation of meetings},
  booktitle = {Proc. IEEE Workshop on Multimedia Signal Processing},
  abstract = {This paper investigates the automatic segmentation of meetings into a sequence of group actions or phases. Our work is based on a corpus of multiparty meetings collected in a meeting room instrumented with video cameras, lapel microphones and a microphone array. We have extracted a set of feature streams, in this case extracted from the audio data, based on speaker turns, prosody and a transcript of what was spoken. We have related these signals to the higher level semantic categories via a multistream statistical model based on dynamic Bayesian networks (DBNs). We report on a set of experiments in which different DBN architectures are compared, together with the different feature streams. The resultant system has an action error rate of 9\%.},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.pdf},
  pages = {},
  categories = {m4,multimodal,dbn,meetings,edinburgh}
}
@inproceedings{christensen-asru03,
  author = {Christensen, H. and Gotoh, Y. and Kolluru, B. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-portable.ps.gz},
  title = {Are extractive text summarisation techniques portable to broadcast news?},
  booktitle = {Proc. IEEE Automatic Speech Recognition and Understanding Workshop},
  abstract = {In this paper we report on a series of experiments which compare the effect of individual features on both text and speech summarisation, the effect of basing the speech summaries on automatic speech recognition transcripts with varying word error rates, and the effect of summarisation approach and transcript source on summary quality. We show that classical text summarisation features (based on stylistic and content information) are portable to broadcast news. However, the quality of the speech transcripts as well as the difference in information structure between broadcast and newspaper news affect the usability of the individual features.},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-portable.pdf},
  pages = {},
  categories = {s3l,summarization,bnews,edinburgh}
}
@inproceedings{cabral2011a,
  author = {Cabral, J.P. and Renals, S. and Yamagishi, J. and Richmond, K.},
  doi = {10.1109/ICASSP.2011.5947405},
  title = {{HMM}-based speech synthesiser using the {LF}-model of the glottal source},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011 IEEE International Conference on},
  issn = {1520-6149},
  abstract = {A major factor which causes a deterioration in speech quality in {HMM}-based speech synthesis is the use of a simple delta pulse signal to generate the excitation of voiced speech. This paper sets out a new approach to using an acoustic glottal source model in HMM-based synthesisers instead of the traditional pulse signal. The goal is to improve speech quality and to better model and transform voice characteristics. We have found that the new method decreases buzziness and also improves prosodic modelling. A perceptual evaluation has supported this finding by showing a 55.6\% preference for the new system, as against the baseline. This improvement, while not being as significant as we had initially expected, does encourage us to work on developing the proposed speech synthesiser further.},
  month = {May},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
  pages = {4704--4707},
  categories = {HMM-based speech synthesiser;acoustic glottal source model LF-model;delta pulse signal;perceptual evaluation;prosodic modelling;speech quality;voiced speech generation;hidden Markov models;speech synthesis;}
}
@incollection{robinson-yellowbook96,
  editor = {Lee, C.-H. and Paliwal, K. K. and Soong, F. K.},
  author = {Robinson, T. and Hochberg, M. and Renals, S.},
  publisher = {Kluwer Academic Publishers},
  title = {The use of recurrent networks in continuous speech recognition},
  booktitle = {Automatic Speech and Speaker Recognition -- Advanced Topics},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/rnn4csr96.ps.gz},
  abstract = {This chapter describes the use of recurrent neural networks (i.e., feedback is incorporated in the computation) as an acoustic model for continuous speech recognition. The form of the recurrent neural network is described, along with an appropriate parameter estimation procedure. For each frame of acoustic data, the recurrent network generates an estimate of the posterior probability of the possible phones given the observed acoustic signal. The posteriors are then converted into scaled likelihoods and used as the observation probabilities within a conventional decoding paradigm (e.g., Viterbi decoding). The advantages of using recurrent networks are that they require a small number of parameters and provide a fast decoding capability (relative to conventional large vocabulary HMM systems).},
  year = {1996},
  pages = {233--258},
  categories = {wernicke,sprach,wsj,recognition,am,hybrid,abbot,search,sheffield}
}
@inproceedings{renals-ieeann89,
  author = {Renals, S. and Rohwer, R.},
  title = {Neural networks for speech pattern classification},
  booktitle = {IEE Conference Publication 313, 1st IEE Conference on Artificial Neural Networks},
  year = {1989},
  address = {London},
  pages = {292--296},
  categories = {}
}
@incollection{murray2007-mlmi,
  editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
  author = {Murray, Gabriel and Renals, Steve},
  publisher = {Springer},
  title = {Term-weighting for summarization of multi-party spoken dialogues},
  series = {Lecture Notes in Computer Science},
  booktitle = {Machine Learning for Multimodal Interaction IV},
  abstract = {This paper explores the issue of term-weighting in the genre of spontaneous, multi-party spoken dialogues, with the intent of using such term-weights in the creation of extractive meeting summaries. The field of text information retrieval has yielded many term-weighting techniques to import for our purposes; this paper implements and compares several of these, namely tf.idf, Residual IDF and Gain. We propose that term-weighting for multi-party dialogues can exploit patterns in word usage among participant speakers, and introduce the su.idf metric as one attempt to do so. Results for all metrics are reported on both manual and automatic speech recognition (ASR) transcripts, and on both the ICSI and AMI meeting corpora.},
  volume = {4892},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/48920155.pdf},
  pages = {155--166}
}
@article{wan-sap05,
  author = {Wan, V. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap05-svm.ps.gz},
  title = {Speaker verification using sequence discriminant support vector machines},
  journal = {IEEE Trans. on Speech and Audio Processing},
  abstract = {This paper presents a text-independent speaker verification system using support vector machines (SVMs) with score-space kernels. Score-space kernels generalize Fisher kernels, and are based on an underlying generative model, such as a Gaussian mixture model (GMM). This approach provides direct discrimination between whole sequences, in contrast to the frame-level approaches at the heart of most current systems. The resultant SVMs have a very high dimensionality, since it is related to the number of parameters in the underlying generative model. To ameliorate problems that can arise in the resultant optimization, we introduce a technique called spherical normalization that preconditions the Hessian matrix. We have performed speaker verification experiments using the PolyVar database. The SVM system presented here reduces the relative error rates by 34\% compared to a GMM likelihood ratio system.},
  volume = {13},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap05-svm.pdf},
  pages = {203--210},
  categories = {verification,kernel,svm,edinburgh,sheffield}
}
@article{williams-csl99,
  author = {Williams, G. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/csl99-preprint.ps.gz},
  title = {Confidence measures from local posterior probability estimates},
  journal = {Computer Speech and Language},
  abstract = {In this paper we introduce a set of related confidence measures for large vocabulary continuous speech recognition (LVCSR) based on local phone posterior probability estimates output by an acceptor HMM acoustic model. In addition to their computational efficiency, these confidence measures are attractive as they may be applied at the state-, phone-, word- or utterance-levels, potentially enabling discrimination between different causes of low confidence recognizer output, such as unclear acoustics or mismatched pronunciation models. We have evaluated these confidence measures for utterance verification using a number of different metrics. Experiments reveal several trends in `profitability of rejection', as measured by the unconditional error rate of a hypothesis test. These trends suggest that crude pronunciation models can mask the relatively subtle reductions in confidence caused by out-of-vocabulary (OOV) words and disfluencies, but not the gross model mismatches elicited by non-speech sounds. The observation that a purely acoustic confidence measure can provide improved performance over a measure based upon both acoustic and language model information for data drawn from the Broadcast News corpus, but not for data drawn from the North American Business News corpus suggests that the quality of model fit offered by a trigram language model is reduced for Broadcast News data. We also argue that acoustic confidence measures may be used to inform the search for improved pronunciation models.},
  volume = {13},
  year = {1999},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/csl99-preprint.pdf},
  pages = {395--411},
  categories = {recognition,conf,hybrid,bnews,sheffield}
}
@inproceedings{abdelhaleem-icassp04,
  author = {Abdel-Haleem, Y. H. and Renals, S. and Lawrence, N. D.},
  title = {Acoustic space dimensionality selection and combination using the maximum entropy principle},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {In this paper we propose a discriminative approach to acoustic space dimensionality selection based on maximum entropy modelling. We form a set of constraints by composing the acoustic space with the space of phone classes, and use a continuous feature formulation of maximum entropy modelling to select an optimal feature set. The suggested approach has two steps: (1) the selection of the best acoustic space that efficiently and economically represents the acoustic data and its variability; (2) the combination of selected acoustic features in the maximum entropy framework to estimate the posterior probabilities over the phonetic labels given the acoustic input. Specific contributions of this paper include a parameter estimation algorithm (generalized improved iterative scaling) that enables the use of negative features, the parameterization of constraint functions using Gaussian mixture models, and experimental results using the TIMIT database.},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-me.pdf},
  pages = {},
  categories = {ml,maxent,am,recognition,edinburgh,sheffield}
}
@inproceedings{hsueh2006asm,
  author = {Hsueh, P. and Moore, J. and Renals, S.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/21_1_hsuehmoorerenals.pdf},
  booktitle = {Proc. EACL06},
  year = {2006},
  abstract = {In this paper, we investigate the problem of automatically predicting segment boundaries in spoken multiparty dialogue. We extend prior work in two ways. We first apply approaches that have been proposed for predicting top-level topic shifts to the problem of identifying subtopic boundaries. We then explore the impact on performance of using ASR output as opposed to human transcription. Examination of the effect of features shows that predicting top-level and predicting subtopic boundaries are two distinct tasks: (1) for predicting subtopic boundaries, the lexical cohesion-based approach alone can achieve competitive results, (2) for predicting top-level boundaries, the machine learning approach that combines lexical-cohesion and conversational features performs best, and (3) conversational cues, such as cue phrases and overlapping speech, are better indicators for the top-level prediction task. We also find that the transcription errors inevitable in ASR output have a negative impact on models that combine lexical-cohesion and conversational features, but do not change the general preference of approach for the two tasks.},
  title = {Automatic Segmentation of Multiparty Dialogue}
}
@inproceedings{cuayahuitletal_interspeech07,
  author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  title = {Hierarchical Dialogue Optimization Using Semi-Markov Decision Processes},
  booktitle = {Proc. Interspeech},
  month = {August},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/SMDPs-interspeech2007.pdf},
  abstract = {This paper addresses the problem of dialogue optimization on large search spaces. For such a purpose, in this paper we propose to learn dialogue strategies using multiple Semi-Markov Decision Processes and hierarchical reinforcement learning. This approach factorizes state variables and actions in order to learn a hierarchy of policies. Our experiments are based on a simulated flight booking dialogue system and compare flat versus hierarchical reinforcement learning. Experimental results show that the proposed approach produced a dramatic search space reduction (99.36\%), and converged four orders of magnitude faster than flat reinforcement learning with a very small loss in optimality (on average 0.3 system turns). Results also report that the learnt policies outperformed a hand-crafted one under three different conditions of ASR confidence levels. This approach is appealing to dialogue optimization due to faster learning, reusable subsolutions, and scalability to larger problems.},
  categories = {Spoken dialogue systems, semi-Markov decision processes, hierarchical reinforcement learning.}
}
@inproceedings{rohwer-icassp88,
  author = {Rohwer, R. and Renals, S. and Terry, M.},
  title = {Unstable connectionist networks in speech recognition},
  booktitle = {Proc IEEE ICASSP},
  year = {1988},
  address = {New York},
  pages = {426--428},
  categories = {}
}
@article{renals-jstatphys90,
  author = {Renals, S. and Rohwer, R.},
  title = {A study of network dynamics},
  journal = {J. Stat. Phys.},
  volume = {58},
  year = {1990},
  pages = {825--847},
  categories = {}
}
@article{carreira-specom98,
  author = {Carreira-Perpiñán, M. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/specom98.ps.gz},
  title = {Dimensionality reduction of electropalatographic data using latent variable models},
  journal = {Speech Communication},
  abstract = {We consider the problem of obtaining a reduced dimension representation of electropalatographic (EPG) data. An unsupervised learning approach based on latent variable modelling is adopted, in which an underlying lower dimension representation is inferred directly from the data. Several latent variable models are investigated, including factor analysis and the generative topographic mapping (GTM). Experiments were carried out using a subset of the EUR-ACCOR database, and the results indicate that these automatic methods capture important, adaptive structure in the EPG data. Nonlinear latent variable modelling clearly outperforms the investigated linear models in terms of log-likelihood and reconstruction error and suggests a substantially smaller intrinsic dimensionality for the EPG data than that claimed by previous studies. A two-dimensional representation is produced with applications to speech therapy, language learning and articulatory dynamics.},
  volume = {26},
  year = {1998},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/specom98.pdf},
  pages = {259--282},
  categories = {ml,lv,artic,sheffield}
}
@inproceedings{wrigley-eurospeech03,
  author = {Wrigley, S. and Brown, G. and Wan, V. and Renals, S.},
  title = {Feature Selection for the Classification of Crosstalk in Multi-Channel Audio},
  booktitle = {Proc. Eurospeech},
  abstract = {An extension to the conventional speech / nonspeech classification framework is presented for a scenario in which a number of microphones record the activity of speakers present at a meeting (one microphone per speaker). Since each microphone can receive speech from both the participant wearing the microphone (local speech) and other participants (crosstalk), the recorded audio can be broadly classified in four ways: local speech, crosstalk plus local speech, crosstalk alone and silence. We describe a classifier in which a Gaussian mixture model (GMM) is used to model each class. A large set of potential acoustic features are considered, some of which have been employed in previous speech / nonspeech classifiers. A combination of two feature selection algorithms is used to identify the optimal feature set for each class. Results from the GMM classifier using the selected features are superior to those of a previously published approach.},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/eurospeech03-xtalk.pdf},
  pages = {469--472},
  categories = {m4,crosstalk,meetings,sheffield}
}
@article{hifny2009,
  author = {Hifny, Y. and Renals, S.},
  title = {Speech Recognition Using Augmented Conditional Random Fields},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4749447&arnumber=4749472&count=25&index=15},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {2},
  abstract = {Acoustic modeling based on hidden Markov models (HMMs) is employed by state-of-the-art stochastic speech recognition systems. Although HMMs are a natural choice to warp the time axis and model the temporal phenomena in the speech signal, their conditional independence properties limit their ability to model spectral phenomena well. In this paper, a new acoustic modeling paradigm based on augmented conditional random fields (ACRFs) is investigated and developed. This paradigm addresses some limitations of HMMs while maintaining many of the aspects which have made them successful. In particular, the acoustic modeling problem is reformulated in a data driven, sparse, augmented space to increase discrimination. Acoustic context modeling is explicitly integrated to handle the sequential phenomena of the speech signal. We present an efficient framework for estimating these models that ensures scalability and generality. In the TIMIT phone recognition task, a phone error rate of 23.0\% was recorded on the full test set, a significant improvement over comparable HMM-based systems.},
  volume = {17},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/hifny2009.pdf},
  pages = {354--365}
}
@inproceedings{gotoh-asr2000,
  author = {Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/asr2000.ps.gz},
  title = {Sentence Boundary Detection in Broadcast Speech Transcripts},
  booktitle = {ISCA ITRW: ASR2000},
  year = {2000},
  abstract = {This paper presents an approach to identifying sentence boundaries in broadcast speech transcripts. We describe finite state models that extract sentence boundary information statistically from text and audio sources. An n-gram language model is constructed from a collection of British English news broadcasts and scripts. An alternative model is estimated from pause duration information in speech recogniser outputs aligned with their programme script counterparts. Experimental results show that the pause duration model alone outperforms the language modelling approach and that, by combining these two models, performance can be improved further, with precision and recall scores of over 70\% attained for the task.},
  address = {Paris},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/asr2000.pdf},
  pages = {228--235},
  categories = {stobs,ie,lm,prosody,bnews,sheffield}
}
@inproceedings{Murray05b,
  author = {Murray, G. and Renals, S. and Carletta, J. and Moore, J.},
  title = {Evaluating Automatic Summaries of Meeting Recordings},
  booktitle = {Proceedings of the 43rd Annual Meeting of the Association for Computational Linguistics, Ann Arbor, MI, USA},
  month = {June},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/murray-renals-carletta-moore.pdf},
  abstract = {The research below explores schemes for evaluating automatic summaries of business meetings, using the ICSI Meeting Corpus. Both automatic and subjective evaluations were carried out, with a central interest being whether or not the two types of evaluations correlate with each other. The evaluation metrics were used to compare and contrast differing approaches to automatic summarization, the deterioration of summary quality on ASR output versus manual transcripts, and to determine whether manual extracts are rated significantly higher than automatic extracts.},
  categories = {ami,summarization, speech summarization, prosody, latent semantic analysis, summarization evaluation, edinburgh}
}
@inproceedings{williams-escapron98,
  author = {Williams, G. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/esca98.ps.gz},
  title = {Confidence measures for evaluating pronunciation models},
  booktitle = {ESCA Workshop on Modeling pronunciation variation for automatic speech recognition},
  year = {1998},
  abstract = {In this paper, we investigate the use of confidence measures for the evaluation of pronunciation models and the employment of these evaluations in an automatic baseform learning process. The confidence measures and pronunciation models are obtained from the Abbot hybrid Hidden Markov Model/Artificial Neural Network Large Vocabulary Continuous Speech Recognition system. Experiments were carried out for a number of baseform learning schemes using the ARPA North American Business News and the Broadcast News corpora from which it was found that a confidence measure based scheme provided the largest reduction in Word Error Rate.},
  address = {Kerkrade, Netherlands},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/esca98.pdf},
  pages = {151--155},
  categories = {recognition,conf,hybrid,abbot,wsj,bnews,pron,sheffield}
}
@incollection{murray2008a,
  author = {Murray, Gabriel and Renals, Steve},
  publisher = {Springer},
  doi = {10.1007/978-3-540-85853-9_22},
  title = {Meta Comments for Summarizing Meeting Speech},
  url = {http://dx.doi.org/10.1007/978-3-540-85853-9_22},
  series = {Lecture Notes in Computer Science},
  booktitle = {Machine Learning for Multimodal Interaction (Proc. MLMI '08)},
  number = {5237},
  abstract = {This paper is about the extractive summarization of meeting speech, using the ICSI and AMI corpora. In the first set of experiments we use prosodic, lexical, structural and speaker-related features to select the most informative dialogue acts from each meeting, with the hypothesis being that such a rich mixture of features will yield the best results. In the second part, we present an approach in which the identification of ``meta-comments'' is used to create more informative summaries that provide an increased level of abstraction. We find that the inclusion of these meta comments improves summarization performance according to several evaluation metrics.},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008a.pdf},
  pages = {236--247}
}
@incollection{al-hames2006-mlmi06,
  editor = {Renals, S. and Bengio, S. and Fiscus, J. G.},
  author = {Al-Hames, Marc and Hain, Thomas and Cernocky, Jan and Schreiber, Sascha and Poel, Mannes and Mueller, Ronald and Marcel, Sebastien and {van Leeuwen}, David and Odobez, Jean-Marc and Ba, Sileye and Bourlard, Hervé and Cardinaux, Fabien and Gatica-Perez, Daniel and Janin, Adam and Motlicek, Petr and Reiter, Stephan and Renals, Steve and {van Rest}, Jeroen and Rienks, Rutger and Rigoll, Gerhard and Smith, Kevin and Thean, Andrew and Zemcik, Pavel},
  publisher = {Springer},
  title = {Audio-video processing in meetings: Seven questions and current {AMI} answers},
  series = {Lecture Notes in Computer Science},
  booktitle = {Machine Learning for Multimodal Interaction (Proc. MLMI '06)},
  volume = {4299},
  year = {2006},
  pages = {24--35}
}
@inproceedings{renals-nnsp91,
  author = {Renals, S. and Morgan, N. and Bourlard, H.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1991/nnsp91.ps.gz},
  title = {Probability estimation by feed-forward networks in continuous speech recognition},
  booktitle = {IEEE Proc. Neural Networks for Signal Processing},
  year = {1991},
  address = {Princeton NJ},
  pages = {309--318},
  categories = {}
}
@inproceedings{lu2012jud,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {{Joint uncertainty decoding with unscented transform for noise robust subspace Gaussian mixture model}},
  booktitle = {Proc. Sapa-Scale workshop},
  year = {2012},
  keywords = {noise compensation, SGMM, JUD, UT},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-sapa2012.pdf},
  abstract = {Common noise compensation techniques use vector Taylor series (VTS) to approximate the mismatch function. Recent work shows that the approximation accuracy may be improved by sampling. One such sampling technique is the unscented transform (UT), which draws samples deterministically from the clean speech and noise models to derive the noise corrupted speech parameters. This paper applies UT to noise compensation of the subspace Gaussian mixture model (SGMM). Since UT requires a relatively small number of samples for accurate estimation, it has significantly lower computational cost compared to other random sampling techniques. However, the number of surface Gaussians in an SGMM is typically very large, making the direct application of UT, for compensating individual Gaussian components, computationally impractical. In this paper, we avoid the computational burden by employing UT in the framework of joint uncertainty decoding (JUD), which groups all the Gaussian components into a small number of classes, sharing the compensation parameters by class. We evaluate the JUD-UT technique for an SGMM system using the Aurora 4 corpus. Experimental results indicate that UT can lead to increased accuracy compared to VTS approximation if the JUD phase factor is untuned, and to similar accuracy if the phase factor is tuned empirically.}
}
@inproceedings{renals-ijcnn92,
  author = {Renals, S. and Morgan, N. and Cohen, M. and Franco, H. and Bourlard, H.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1992/ijcnn92.ps.gz},
  title = {Improving statistical speech recognition},
  booktitle = {Proc. IJCNN},
  year = {1992},
  volume = {2},
  address = {Baltimore MD},
  pages = {301--307},
  categories = {}
}
@article{turk:2429,
  author = {Turk, Alice and Scobbie, James and Geng, Christian and Macmartin, Cedric and Bard, Ellen and Campbell, Barry and Dickie, Catherine and Dubourg, Eddie and Hardcastle, Bill and Hoole, Phil and Kanaida, Evia and Lickley, Robin and Nakai, Satsuki and Pouplier, Marianne and King, Simon and Renals, Steve and Richmond, Korin and Schaeffler, Sonja and Wiegand, Ronnie and White, Kevin and Wrench, Alan},
  publisher = {ASA},
  doi = {10.1121/1.3508679},
  title = {The {Edinburgh Speech Production Facility's} articulatory corpus of spontaneous dialogue.},
  journal = {The Journal of the Acoustical Society of America},
  number = {4},
  abstract = {The EPSRC-funded Edinburgh Speech Production Facility is built around two synchronized Carstens AG500 electromagnetic articulographs (EMAs) in order to capture articulatory/acoustic data from spontaneous dialogue. An initial articulatory corpus was designed with two aims. The first was to elicit a range of speech styles/registers from speakers, and therefore provide an alternative to fully scripted corpora. The second was to extend the corpus beyond monologue, by using tasks that promote natural discourse and interaction. A subsidiary driver was to use dialects from outwith North America: dialogues paired up a Scottish English and a Southern British English speaker. Tasks. Monologue: Story reading of ``Comma Gets a Cure'' [Honorof et al. (2000)], lexical sets [Wells (1982)], spontaneous story telling, diadochokinetic tasks. Dialogue: Map tasks [Anderson et al. (1991)], ``Spot the Difference'' picture tasks [Bradlow et al. (2007)], story-recall. Shadowing of the spontaneous story telling by the second participant. Each dialogue session includes approximately 30 min of speech, and there are acoustics-only baseline materials. We will introduce the corpus and highlight the role of articulatory production data in helping provide a fuller understanding of various spontaneous speech phenomena by presenting examples of naturally occurring covert speech errors, accent accommodation, turn taking negotiation, and shadowing.},
  volume = {128},
  year = {2010},
  pages = {2429-2429}
}
@inproceedings{cabral07,
  author = {Cabral, J. and Renals, S. and Richmond, K. and Yamagishi, J.},
  title = {Towards an Improved Modeling of the Glottal Source in Statistical Parametric Speech Synthesis},
  booktitle = {Proc. of the 6th ISCA Workshop on Speech Synthesis},
  year = {2007},
  address = {Bonn, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
  abstract = {This paper proposes the use of the Liljencrants-Fant model (LF-model) to represent the glottal source signal in HMM-based speech synthesis systems. These systems generally use a pulse train to model the periodicity of the excitation signal of voiced speech. However, this model produces a strong and uniform harmonic structure throughout the spectrum of the excitation which makes the synthetic speech sound buzzy. The use of a mixed band excitation and phase manipulation reduces this effect but it can result in degradation of the speech quality if the noise component is not weighted carefully. In turn, the LF-waveform has a decaying spectrum at higher frequencies, which is more similar to the real glottal source excitation signal. We conducted a perceptual experiment to test the hypothesis that the LF-model can perform as well as or better than the pulse train in an HMM-based speech synthesizer. In the synthesis, we used the mean values of the LF-parameters, calculated by measurements of the recorded speech. The result of this study is important not only regarding the improvement in speech quality of these types of systems, but also because the LF-model can be used to model many characteristics of the glottal source, such as voice quality, which are important for voice transformation and generation of expressive speech.},
  categories = {LF-model, Statistical parametric speech synthesis, HMM-based speech synthesis}
}
@book{renals2006-mlmi05,
  editor = {Renals, Steve and Bengio, Samy},
  volume = {3869},
  publisher = {Springer-Verlag},
  year = {2006},
  series = {Lecture Notes in Computer Science},
  title = {Machine learning for multimodal interaction (Proceedings of {MLMI} '05)}
}
@inproceedings{renals-icassp96,
  author = {Renals, S. and Hochberg, M.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/icassp96.ps.gz},
  title = {Efficient evaluation of the {LVCSR} search space using the {NOWAY} decoder},
  booktitle = {Proc IEEE ICASSP},
  year = {1996},
  abstract = {This work further develops and analyses the large vocabulary continuous speech recognition search strategy reported at ICASSP-95. In particular, the posterior-based phone deactivation pruning approach has been extended to include phone-dependent thresholds and an improved estimate of the least upper bound on the utterance log-probability has been developed. Analysis of the pruning procedures and of the search's interaction with the language model has also been performed. Experiments were carried out using the ARPA North American Business News task with a 20,000 word vocabulary and a trigram language model. As a result of these improvements and analyses, the computational cost of the recognition process performed by the Noway decoder has been substantially reduced.},
  address = {Atlanta},
  pages = {149--152},
  categories = {wernicke,sprach,recognition,wsj,search,sheffield}
}
@inproceedings{koumpis-prosody01,
  author = {Koumpis, K. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-vm.ps.gz},
  title = {The role of prosody in a voicemail summarization system},
  booktitle = {Proc. ISCA Workshop on Prosody in Speech Recognition and Understanding},
  year = {2001},
  address = {Red Bank, NJ, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-vm.pdf},
  abstract = {When a speaker leaves a voicemail message there are prosodic cues that emphasize the important points in the message, in addition to lexical content. In this paper we compare and visualize the relative contribution of these two types of features within a voicemail summarization system. We describe the system's ability to generate summaries of two test sets, having trained and validated using 700 messages from the IBM Voicemail corpus. Results measuring the quality of summary artifacts show that combined lexical and prosodic features are at least as robust as combined lexical features alone across all operating conditions.},
  categories = {voicemail,summarization,prosody,sheffield}
}
@inproceedings{garau2008a,
  author = {Garau, Giulia and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/garau2008a.pdf},
  booktitle = {Proc. Interspeech '08},
  year = {2008},
  abstract = {We have investigated the use of a pitch adaptive spectral representation for large vocabulary speech recognition, in conjunction with speaker normalisation techniques. We have compared the effect of a smoothed spectrogram to the pitch adaptive spectral analysis by decoupling these two components of STRAIGHT. Experiments performed on a large vocabulary meeting speech recognition task highlight the importance of combining a pitch adaptive spectral representation with a conventional fixed window spectral analysis. We found evidence that STRAIGHT pitch adaptive features are more speaker independent than conventional MFCCs without pitch adaptation, and thus also provide better performance when combined using feature combination techniques such as Heteroscedastic Linear Discriminant Analysis.},
  title = {Pitch adaptive features for {LVCSR}}
}
@article{gotoh-nle99,
  author = {Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/jnle99-preprint.ps.gz},
  title = {Topic-based mixture language modelling},
  journal = {Journal of Natural Language Engineering},
  abstract = {This paper describes an approach for constructing a mixture of language models based on simple statistical notions of semantics using probabilistic models developed for information retrieval. The approach encapsulates corpus-derived semantic information and is able to model varying styles of text. Using such information, the corpus texts are clustered in an unsupervised manner and a mixture of topic-specific language models is automatically created. The principal contribution of this work is to characterise the document space resulting from information retrieval techniques and to demonstrate the approach for mixture language modelling. A comparison is made between manual and automatic clustering in order to elucidate how the global content information is expressed in the space. We also compare (in terms of association with manual clustering and language modelling accuracy) alternative term-weighting schemes and the effect of singular value decomposition dimension reduction (latent semantic analysis). Test set perplexity results using the British National Corpus indicate that the approach can improve the potential of statistical language modelling. Using an adaptive procedure, the conventional model may be tuned to track text data with a slight increase in computational cost.},
  volume = {5},
  year = {1999},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/jnle99-preprint.pdf},
  pages = {355--375},
  categories = {sprach,stobs,lm,bnc,sheffield}
}
@inproceedings{murray-interspeech05,
  author = {Murray, G. and Renals, S. and Carletta, J.},
  title = {Extractive Summarization of Meeting Recordings},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/murray-eurospeech05.pdf},
  abstract = {Several approaches to automatic speech summarization are discussed below, using the ICSI Meetings corpus. We contrast feature-based approaches using prosodic and lexical features with maximal marginal relevance and latent semantic analysis approaches to summarization. While the latter two techniques are borrowed directly from the field of text summarization, feature-based approaches using prosodic information are able to utilize characteristics unique to speech data. We also investigate how the summarization results might deteriorate when carried out on ASR output as opposed to manual transcripts. All of the summaries are of an extractive variety, and are compared using the software ROUGE.},
  categories = {ami,summarization,prosody, latent semantic analysis,edinburgh}
}
@inproceedings{cuayahuitletal_asru05,
  author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  title = {Human-Computer Dialogue Simulation Using Hidden Markov Models},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
  month = {November},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hcp-asru2005.pdf},
  abstract = {This paper presents a probabilistic method to simulate task-oriented human-computer dialogues at the intention level, that may be used to improve or to evaluate the performance of spoken dialogue systems. Our method uses a network of Hidden Markov Models (HMMs) to predict system and user intentions, where a ``language model'' predicts sequences of goals and the component HMMs predict sequences of intentions. We compare standard HMMs, Input HMMs and Input-Output HMMs in an effort to better predict sequences of intentions. In addition, we propose a dialogue similarity measure to evaluate the realism of the simulated dialogues. We performed experiments using the DARPA Communicator corpora and report results with three different metrics: dialogue length, dialogue similarity and precision-recall.},
  categories = {dialogue simulation, hidden markov models}
}
@inproceedings{kilgour2011,
  author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
  doi = {10.1109/HSCMA.2011.5942389},
  title = {The {Ambient Spotlight}: Personal meeting capture with a microphone array},
  booktitle = {Proc. HSCMA},
  year = {2011},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/ambientDemo.pdf},
  abstract = {We present the Ambient Spotlight system for personal meeting capture based on a portable USB microphone array and a laptop. The system combines distant speech recognition and content linking with personal productivity tools, and enables recognised meeting recordings to be integrated with desktop search, calendar, and email.}
}
@inproceedings{renals-icassp89,
  author = {Renals, S. and Rohwer, R.},
  title = {Learning phoneme recognition using neural networks},
  booktitle = {Proc IEEE ICASSP},
  year = {1989},
  address = {Glasgow},
  pages = {413--416},
  categories = {}
}
@inproceedings{kershaw-arpa96,
  author = {Kershaw, D. and Robinson, T. and Renals, S.},
  booktitle = {Proc. ARPA Spoken Language Technology Conference},
  year = {1996},
  pages = {93--99},
  categories = {wernicke,sprach,wsj,recognition,am,hybrid,abbot,search,eval,sheffield},
  title = {The 1995 {Abbot} hybrid {connectionist--HMM} large vocabulary recognition system}
}
@inproceedings{hochberg-icassp95,
  author = {Hochberg, M. and Renals, S. and Robinson, T. and Cook, G.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/icassp95-abbot.ps.gz},
  title = {Recent improvements to the {Abbot} large vocabulary {CSR} system},
  booktitle = {Proc IEEE ICASSP},
  year = {1995},
  abstract = {ABBOT is the hybrid connectionist-hidden Markov model (HMM) large-vocabulary continuous speech recognition (CSR) system developed at Cambridge University. This system uses a recurrent network to estimate the acoustic observation probabilities within an HMM framework. A major advantage of this approach is that good performance is achieved using context-independent acoustic models and requiring many fewer parameters than comparable HMM systems. This paper presents substantial performance improvements gained from new approaches to connectionist model combination and phone-duration modeling. Additional capability has also been achieved by extending the decoder to handle larger vocabulary tasks (20,000 words and greater) with a trigram language model. This paper describes the recent modifications to the system and experimental results are reported for various test and development sets from the November 1992, 1993, and 1994 ARPA evaluations of spoken language systems.},
  address = {Detroit},
  pages = {69--72},
  categories = {wernicke,recognition,wsj,am,hybrid,abbot,eval,search,sheffield,cambridge}
}
@article{renals-elett88,
  author = {Renals, S.},
  title = {Radial basis functions network for speech pattern classification},
  journal = {Electronics Letters},
  volume = {25},
  year = {1989},
  pages = {437--439},
  categories = {}
}
@article{koumpis2005-acmslp,
  author = {Koumpis, Konstantinos and Renals, Steve},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/tslp05.ps.gz},
  title = {Automatic summarization of voicemail messages using lexical and prosodic features},
  journal = {ACM Transactions on Speech and Language Processing},
  number = {1},
  abstract = {This paper presents trainable methods for extracting principal content words from voicemail messages. The short text summaries generated are suitable for mobile messaging applications. The system uses a set of classifiers to identify the summary words, with each word being identified by a vector of lexical and prosodic features. We use an ROC-based algorithm, Parcel, to select input features (and classifiers). We have performed a series of objective and subjective evaluations using unseen data from two different speech recognition systems, as well as human transcriptions of voicemail speech.},
  volume = {2},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/tslp05.pdf},
  pages = {1--24},
  categories = {voicemail,summarization,prosody,sheffield,edinburgh}
}
@inproceedings{huang2008-ptkl,
  author = {Huang, Songfang and Renals, Steve},
  title = {Using Participant Role in Multiparty Meetings as Prior Knowledge for Nonparametric Topic Modeling},
  booktitle = {Proc. ICML/UAI/COLT Workshop on Prior Knowledge for Text and Language Processing},
  year = {2008},
  abstract = {In this paper we introduce our attempts to incorporate the participant role information in multiparty meetings for document modeling using the hierarchical Dirichlet process. The perplexity and automatic speech recognition results demonstrate that the participant role information is a promising prior knowledge source to be combined with language models for automatic speech recognition and interaction modeling for multiparty meetings.},
  month = {July},
  address = {Helsinki, Finland},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ptkl.pdf},
  pages = {21--24}
}
@inproceedings{lu2012noise,
  author = {Lu, L. and Chin, KK and Ghoshal, A. and Renals, S.},
  title = {Noise compensation for subspace {Gaussian} mixture models},
  booktitle = {Proc. Interspeech},
  year = {2012},
  keywords = {acoustic modelling, noise compensation, SGMM, JUD},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-is2012.pdf},
  abstract = {Joint uncertainty decoding (JUD) is an effective model-based noise compensation technique for conventional Gaussian mixture model (GMM) based speech recognition systems. In this paper, we apply JUD to subspace Gaussian mixture model (SGMM) based acoustic models. The total number of Gaussians in the SGMM acoustic model is usually much larger than for conventional GMMs, which limits the application of approaches which explicitly compensate each Gaussian, such as vector Taylor series (VTS). However, by clustering the Gaussian components into a number of regression classes, JUD-based noise compensation can be successfully applied to SGMM systems. We evaluate the JUD/SGMM technique using the Aurora 4 corpus, and the experimental results indicated that it is more accurate than conventional GMM-based systems using either VTS or JUD noise compensation.}
}
@article{dielmann2007-tmm,
  author = {Dielmann, Alfred and Renals, Steve},
  doi = {10.1109/TMM.2006.886337},
  title = {Automatic meeting segmentation using dynamic {Bayesian} networks},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4032598&arnumber=4032608&count=23&index=3},
  journal = {IEEE Transactions on Multimedia},
  number = {1},
  abstract = {Multiparty meetings are a ubiquitous feature of organizations, and there are considerable economic benefits that would arise from their automatic analysis and structuring. In this paper, we are concerned with the segmentation and structuring of meetings (recorded using multiple cameras and microphones) into sequences of group meeting actions such as monologue, discussion and presentation. We outline four families of multimodal features based on speaker turns, lexical transcription, prosody, and visual motion that are extracted from the raw audio and video recordings. We relate these low-level features to more complex group behaviors using a multistream modelling framework based on multistream dynamic Bayesian networks (DBNs). This results in an effective approach to the segmentation problem, resulting in an action error rate of 12.2\%, compared with 43\% using an approach based on hidden Markov models. Moreover, the multistream DBN developed here leaves scope for many further improvements and extensions.},
  volume = {9},
  year = {2007},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2007/dielmann2007-tmm.pdf},
  pages = {25--36}
}
@inproceedings{robinson-icassp94,
  author = {Robinson, T. and Hochberg, M. and Renals, S.},
  title = {{IPA}: Improved phone modelling with recurrent neural networks},
  booktitle = {Proc IEEE ICASSP},
  year = {1994},
  address = {Adelaide},
  pages = {37--40},
  categories = {}
}
@inproceedings{renals-mmsp99,
  author = {Renals, S. and Abberley, D. and Kirby, D. and Robinson, T.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/mmsp99.ps.gz},
  http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/mmsp99-54/},
  title = {The {THISL} System for Indexing and Retrieval of Broadcast News},
  booktitle = {Proc. IEEE Workshop on Multimedia Signal Processing},
  year = {1999},
  abstract = {This paper describes the THISL news retrieval system which maintains an archive of BBC radio and television news recordings. The system uses the Abbot large vocabulary continuous speech recognition system to transcribe news broadcasts, and the thislIR text retrieval system to index and access the transcripts. Decoding and indexing is performed automatically, and the archive is updated with three hours of new material every day. A web-based interface to the retrieval system has been devised to facilitate access to the archive.},
  address = {Copenhagen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/mmsp99.pdf},
  pages = {77--82},
  categories = {thisl,bnews,trec,ir,recognition,sheffield}
}
@article{koumpis2005-spmag,
  author = {Koumpis, Konstantinos and Renals, Steve},
  title = {Content-based access to spoken audio},
  journal = {IEEE Signal Processing Magazine},
  number = {5},
  abstract = {How analysis, retrieval and delivery phases make spoken audio content more accessible.},
  volume = {22},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/koumpis-spm05.pdf},
  pages = {61--69},
  categories = {asr,ir,summarization,edinburgh}
}
@inproceedings{kilgour2010a,
  author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
  doi = {10.1145/1891903.1891919},
  title = {The {Ambient Spotlight}: Personal multimodal search without query},
  url = {http://dx.doi.org/10.1145/1891903.1891919},
  booktitle = {Proc. ICMI-MLMI},
  year = {2010},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/ambientDemo-icmi.pdf},
  abstract = {The Ambient Spotlight is a prototype system based on personal meeting capture using a laptop and a portable microphone array. The system automatically recognises and structures the meeting content using automatic speech recognition, topic segmentation and extractive summarisation. The recognised speech in the meeting is used to construct queries to automatically link meeting segments to other relevant material, both multimodal and textual. The interface to the system is constructed around a standard calendar interface, and it is integrated with the laptop's standard indexing, search and retrieval.}
}
@inproceedings{AMIMLMI05,
  author = {Hain, T. and Burget, L. and Dines, J. and Garau, G. and Karafiat, M. and Lincoln, M. and McCowan, I. and Moore, D. and Wan, V. and Ordelman, R. and Renals, S.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIsystemMLMI05.pdf},
  booktitle = {2nd Joint Workshop on Multimodal Interaction and Related Machine Learning Algorithms},
  year = {2005},
  abstract = {The automatic processing of speech collected in conference style meetings has attracted considerable interest with several large scale projects devoted to this area. This paper describes the development of a baseline automatic speech transcription system for meetings in the context of the AMI (Augmented Multiparty Interaction) project. We present several techniques important to processing of this data and show the performance in terms of word error rates (WERs). An important aspect of transcription of this data is the necessary flexibility in terms of audio pre-processing. Real world systems have to deal with flexible input, for example by using microphone arrays or randomly placed microphones in a room. Automatic segmentation and microphone array processing techniques are described and the effect on WERs is discussed. The system and its components presented in this paper yield competitive performance and form a baseline for future research in this domain.},
  title = {The Development of the {AMI} System for the Transcription of Speech in Meetings}
}
@inproceedings{uriaIS2012,
  author = {Uria, Benigno and Murray, Iain and Renals, Steve and Richmond, Korin},
  title = {Deep Architectures for Articulatory Inversion},
  booktitle = {Proc. Interspeech},
  year = {2012},
  month = {September},
  address = {Portland, Oregon, USA},
  keywords = {Articulatory inversion, deep neural network, deep belief network, deep regression network, pretraining},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Uria_et_al_IS2012.pdf},
  abstract = {We implement two deep architectures for the acoustic-articulatory inversion mapping problem: a deep neural network and a deep trajectory mixture density network. We find that in both cases, deep architectures produce more accurate predictions than shallow architectures and that this is due to the higher expressive capability of a deep model and not a consequence of adding more adjustable parameters. We also find that a deep trajectory mixture density network is able to obtain better inversion accuracies than smoothing the results of a deep neural network. Our best model obtained an average root mean square error of 0.885 mm on the MNGU0 test dataset.},
  categories = {Articulatory inversion, deep neural network, deep belief network, deep regression network, pretraining}
}
@inproceedings{renals-eurospeech89,
  author = {Renals, S. and Dalby, J.},
  title = {Analysis of a neural network model for speech recognition},
  booktitle = {Proc. Eurospeech},
  year = {1989},
  volume = {1},
  address = {Paris},
  pages = {333--336},
  categories = {}
}
@inproceedings{murray06b,
  author = {Murray, G. and Renals, S. and Taboada, M.},
  title = {Prosodic Correlates of Rhetorical Relations},
  booktitle = {Proceedings of HLT/NAACL ACTS Workshop, 2006, New York City, USA},
  month = {June},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/dacts-hlt.pdf},
  abstract = {This paper investigates the usefulness of prosodic features in classifying rhetorical relations between utterances in meeting recordings. Five rhetorical relations of \textit{contrast}, \textit{elaboration}, \textit{summary}, \textit{question} and \textit{cause} are explored. Three training methods - supervised, unsupervised, and combined - are compared, and classification is carried out using support vector machines. The results of this pilot study are encouraging but mixed, with pairwise classification achieving an average of 68\% accuracy in discerning between relation pairs using only prosodic features, but multi-class classification performing only slightly better than chance.},
  categories = {rhetorical structure theory, prosody, unsupervised learning}
}
@article{2012E121001,
  author = {Yamagishi, Junichi and Veaux, Christophe and King, Simon and Renals, Steve},
  doi = {10.1250/ast.33.1},
  title = {Speech synthesis technologies for individuals with vocal disabilities: Voice banking and reconstruction},
  url = {http://www.jstage.jst.go.jp/browse/ast/33/1/_contents},
  journal = {Acoustical Science and Technology},
  number = {1},
  pages = {1--5},
  volume = {33},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/AST-33_1.pdf},
  abstract = {In this invited paper, we overview the clinical applications of speech synthesis technologies and describe a few selected research projects. We also introduce the University of Edinburgh’s new project ``Voice Banking and reconstruction'' for patients with degenerative diseases, such as motor neurone disease and Parkinson's disease, and show how speech synthesis technologies can improve the quality of life for the patients.}
}
@inproceedings{abberley-esca99,
  author = {Abberley, D. and Kirby, D. and Renals, S. and Robinson, T.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-thisl.ps.gz},
  http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/esca99-thisl/},
  title = {The {THISL} broadcast news retrieval system},
  booktitle = {Proc. ESCA Workshop on Accessing Information In Spoken Audio},
  year = {1999},
  abstract = {This paper describes the THISL spoken document retrieval system for British and North American Broadcast News. The system is based on the \textsc{Abbot} large vocabulary speech recognizer, using a recurrent network acoustic model, and a probabilistic text retrieval system. We discuss the development of a realtime British English Broadcast News system, and its integration into a spoken document retrieval system. Detailed evaluation is performed using a similar North American Broadcast News system, to take advantage of the TREC SDR evaluation methodology. We report results on this evaluation, with particular reference to the effect of query expansion and of automatic segmentation algorithms.},
  address = {Cambridge},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-thisl.pdf},
  pages = {19--24},
  categories = {thisl,bnews,trec,ir,recognition,sheffield}
}
@inproceedings{bourlard-icassp92,
  author = {Bourlard, H. and Morgan, N. and Wooters, C. and Renals, S.},
  title = {{CDNN}: A context-dependent neural network for continuous speech recognition},
  booktitle = {Proc IEEE ICASSP},
  year = {1992},
  address = {San Francisco},
  pages = {349--352},
  categories = {}
}
@article{dielmann2008,
  author = {Dielmann, Alfred and Renals, Steve},
  doi = {10.1109/TASL.2008.922463},
  title = {Recognition of Dialogue Acts in Multiparty Meetings using a Switching {DBN}},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4599391&arnumber=4497831&count=18&index=9},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {7},
  abstract = {This paper is concerned with the automatic recognition of dialogue acts (DAs) in multiparty conversational speech. We present a joint generative model for DA recognition in which segmentation and classification of DAs are carried out in parallel. Our approach to DA recognition is based on a switching dynamic Bayesian network (DBN) architecture. This generative approach models a set of features, related to lexical content and prosody, and incorporates a weighted interpolated factored language model. The switching DBN coordinates the recognition process by integrating the component models. The factored language model, which is estimated from multiple conversational data corpora, is used in conjunction with additional task-specific language models. In conjunction with this joint generative model, we have also investigated the use of a discriminative approach, based on conditional random fields, to perform a reclassification of the segmented DAs. We have carried out experiments on the AMI corpus of multimodal meeting recordings, using both manually transcribed speech, and the output of an automatic speech recognizer, and using different configurations of the generative model. Our results indicate that the system performs well both on reference and fully automatic transcriptions. A further significant improvement in recognition accuracy is obtained by the application of the discriminative reranking approach based on conditional random fields.},
  volume = {16},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/dielmann2008.pdf},
  pages = {1303--1314}
}
@inproceedings{lu_asru_2011,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Regularized Subspace {G}aussian Mixture Models for Cross-lingual Speech Recognition},
  booktitle = {Proc. ASRU},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-asru-2011.pdf},
  abstract = {We investigate cross-lingual acoustic modelling for low resource languages using the subspace Gaussian mixture model (SGMM). We assume the presence of acoustic models trained on multiple source languages, and use the global subspace parameters from those models for improved modelling in a target language with limited amounts of transcribed speech. Experiments on the GlobalPhone corpus using Spanish, Portuguese, and Swedish as source languages and German as target language (with 1 hour and 5 hours of transcribed audio) show that multilingually trained SGMM shared parameters result in lower word error rates (WERs) than using those from a single source language. We also show that regularizing the estimation of the SGMM state vectors by penalizing their $\ell_1$-norm helps to overcome numerical instabilities and leads to lower WER.},
  categories = {Subspace Gaussian Mixture Model, Cross-lingual, model regularization}
}
@inproceedings{abberley-trec00,
  author = {Abberley, D. and Renals, S. and Ellis, D. and Robinson, T.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/trec8.ps.gz},
  title = {The {THISL} {SDR} system at {TREC}--8},
  booktitle = {Proc. Eighth Text Retrieval Conference (TREC--8)},
  year = {2000},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/trec8.pdf},
  abstract = {This paper describes the participation of the THISL group at the TREC-8 Spoken Document Retrieval (SDR) track. The THISL SDR system consists of the realtime version of the Abbot large vocabulary speech recognition system and the thislIR text retrieval system. The TREC-8 evaluation assessed SDR performance on a corpus of 500 hours of broadcast news material collected over a five month period. The main test condition involved retrieval of stories defined by manual segmentation of the corpus in which non-news material, such as commercials, was excluded. An optional test condition required retrieval of the same stories from the unsegmented audio stream. The THISL SDR system participated in both test conditions. The results show that a system such as THISL can produce respectable information retrieval performance on a realistically-sized corpus of unsegmented audio material.},
  categories = {thisl,bnews,trec,ir,recognition,eval,sheffield}
}
@inproceedings{hochberg-nnsp94,
  author = {Hochberg, M. and Cook, G. and Renals, S. and Robinson, T.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/nnsp94.ps.gz},
  title = {Connectionist model combination for large vocabulary speech recognition},
  booktitle = {IEEE Proc. Neural Networks for Signal Processing},
  volume = {4},
  year = {1994},
  pages = {269--278},
  categories = {}
}
@inproceedings{renals-darpa99,
  author = {Renals, S. and Gotoh, Y. and Gaizauskas, R. and Stevenson, M.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-ne.ps.gz},
  http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/darpa99-ne.html},
  title = {The {SPRACH/LaSIE} system for named entity identification in broadcast news},
  booktitle = {Proc. DARPA Broadcast News Workshop},
  abstract = {We have developed two conceptually different systems that are able to identify named entities from spoken audio. One (referred to as SPRACH-S) has a stochastic finite state machine structure for use with an acoustic model that identifies both words and named entities from speech data. The other (referred to as SPRACH-R) is a rule-based system which uses matching against stored name lists, part-of-speech tagging, and light phrasal parsing with specialised named entity grammars. We provide an overview of the two approaches and present results on the Hub-4E IE-NE evaluation task.},
  year = {1999},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-ne.pdf},
  pages = {47--50},
  categories = {sprach,stobs,ie,lm,bnews,sheffield}
}
@inproceedings{abberley-trec99,
  author = {Abberley, D. and Renals, S. and Cook, G. and Robinson, T.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/trec7.ps.gz},
  title = {Retrieval of broadcast news documents with the {THISL} system},
  booktitle = {Proc. Seventh Text Retrieval Conference (TREC--7)},
  abstract = {This paper describes the THISL system that participated in the TREC-7 evaluation, Spoken Document Retrieval (SDR) Track, and presents the results obtained, together with some analysis. The THISL system is based on the {\sc Abbot} speech recognition system and the thislIR text retrieval system. In this evaluation we were concerned with investigating the suitability for SDR of a recognizer running at less than ten times realtime, the use of multiple transcriptions and word graphs, the effect of simple query expansion algorithms and the effect of varying standard IR parameters.},
  year = {1999},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/trec7.pdf},
  pages = {181--190},
  categories = {thisl,bnews,trec,ir,recognition,eval,sheffield}
}
@book{renals-book03,
  editor = {Renals, S. and Grefenstette, G.},
  publisher = {Springer-Verlag},
  title = {Text and Speech Triggered Information Access},
  url = {http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=2705&issue=preprint},
  series = {Lecture Notes in Computer Science},
  abstract = {Edited collection of revised lectures from the \href{http://www.ilsp.gr/testia/testia2000.html} {ELSNET-2000 Summer School} on Text and Speech Triggered Information Access.},
  number = {2705},
  year = {2003},
  categories = {recognition,ir,ie,lm,multimodal,sheffield}
}
@inproceedings{murray06c,
  author = {Murray, G. and Renals, S.},
  title = {Dialogue Act Compression Via Pitch Contour Preservation},
  booktitle = {Proceedings of the 9th International Conference on Spoken Language Processing, Pittsburgh, USA},
  month = {September},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/inter2006.pdf},
  abstract = {This paper explores the usefulness of prosody in automatically compressing dialogue acts from meeting speech. Specifically, this work attempts to compress utterances by preserving the pitch contour of the original whole utterance. Two methods of doing this are described in detail and are evaluated \textit{subjectively} using human annotators and \textit{objectively} using edit distance with a human-authored gold-standard. Both metrics show that such a prosodic approach is much better than the random baseline approach and significantly better than a simple text compression method.},
  categories = {automatic compression, prosody, summarization}
}
@inproceedings{bourlard2008,
  author = {Bourlard, Herve and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bourlard2008.pdf},
  booktitle = {Proc. LangTech 2008},
  year = {2008},
  abstract = {The AMI and AMIDA projects are concerned with the recognition and interpretation of multiparty (face-to-face and remote) meetings. Within these projects we have developed the following: (1) an infrastructure for recording meetings using multiple microphones and cameras; (2) a one hundred hour, manually annotated meeting corpus; (3) a number of techniques for indexing and summarizing meeting videos using automatic speech recognition and computer vision; and (4) an extensible framework for browsing and searching meeting videos. We give an overview of the various techniques developed in AMI (mainly involving face-to-face meetings), their integration into our meeting browser framework, and future plans for AMIDA (Augmented Multiparty Interaction with Distant Access), the follow-up project to AMI. Technical and business information related to these two projects can be found at www.amiproject.org, on the Scientific and Business portals respectively.},
  title = {Recognition and Understanding of Meetings: Overview of the {European} {AMI} and {AMIDA} Projects}
}
@inproceedings{lai2013summarize,
  author = {Lai, Catherine and Carletta, Jean and Renals, Steve},
  title = {Detecting Summarization Hot Spots in Meetings Using Group Level Involvement and Turn-Taking Features},
  booktitle = {Proc. Interspeech 2013, Lyon, France},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/laic2013summarization.pdf},
  abstract = {In this paper we investigate how participant involvement and turn-taking features relate to extractive summarization of meeting dialogues. In particular, we examine whether automatically derived measures of group level involvement, like participation equality and turn-taking freedom, can help detect where summarization relevant meeting segments will be. Results show that classification using turn-taking features performed better than the majority class baseline for data from both AMI and ICSI meeting corpora in identifying whether meeting segments contain extractive summary dialogue acts. The feature based approach also provided better recall than using manual ICSI involvement hot spot annotations. Turn-taking features were additionally found to be predictive of the amount of extractive summary content in a segment. In general, we find that summary content decreases with higher participation equality and overlap, while it increases with the number of very short utterances. Differences in results between the AMI and ICSI data sets suggest how group participatory structure can be used to understand what makes meetings easy or difficult to summarize.},
  categories = {summarization, turn-taking, involvement, social signals}
}
@inproceedings{lai2013affect,
  author = {Lai, Catherine and Carletta, Jean and Renals, Steve},
  title = {Modelling Participant Affect in Meetings with Turn-Taking Features},
  booktitle = {Proceedings of WASSS 2013, Grenoble, France},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/laic2013affect.pdf},
  abstract = {This paper explores the relationship between turn-taking and meeting affect. To investigate this, we model post-meeting ratings of satisfaction, cohesion and leadership from participants of AMI corpus meetings using group and individual turn-taking features. The results indicate that participants gave higher satisfaction and cohesiveness ratings to meetings with greater group turn-taking freedom and individual very short utterance rates, while lower ratings were associated with more silence and speaker overlap. Besides broad applicability to satisfaction ratings, turn-taking freedom was found to be a better predictor than equality of speaking time when considering whether participants felt that everyone had a chance to contribute. If we include dialogue act information, we see that substantive feedback-type turns like assessments are more predictive of meeting affect than information giving acts or backchannels. This work highlights the importance of feedback turns and modelling group level activity in multiparty dialogue for understanding the social aspects of speech.},
  categories = {turn-taking, meetings, affect, involvement, social signals}
}
@inproceedings{Swietojanski:ASRU13,
  author = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
  doi = {10.1109/ASRU.2013.6707744},
  title = {Hybrid acoustic models for distant and multichannel large vocabulary speech recognition},
  abstract = {We investigate the application of deep neural network (DNN)-hidden Markov model (HMM) hybrid acoustic models for far-field speech recognition of meetings recorded using microphone arrays. We show that the hybrid models achieve significantly better accuracy than conventional systems based on Gaussian mixture models (GMMs). We observe up to 8% absolute word error rate (WER) reduction from a discriminatively trained GMM baseline when using a single distant microphone, and between 4–6% absolute WER reduction when using beamforming on various combinations of array channels. By training the networks on audio from multiple channels, we find that the networks can recover a significant part of the accuracy difference between the single distant microphone and beamformed configurations. Finally, we show that the accuracy of a network recognising speech from a single distant microphone can approach that of a multi-microphone setup by training with data from other microphones.},
  month = {December},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Swietojanski_ASRU2013.pdf},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
  categories = {Distant Speech Recognition, Deep Neural Networks, Microphone Arrays, Beamforming, Meeting recognition}
}
@inproceedings{bell13_lecture_transcription,
  author = {Bell, Peter and Yamamoto, Hitoshi and Swietojanski, Pawel and Wu, Youzheng and McInnes, Fergus and Hori, Chiori and Renals, Steve},
  title = {A lecture transcription system combining neural network acoustic and language models},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lecture_transcription_is2013.pdf},
  abstract = {This paper presents a new system for automatic transcription of lectures. The system combines a number of novel features, including deep neural network acoustic models using multi-level adaptive networks to incorporate out-of-domain information, and factored recurrent neural network language models. We demonstrate that the system achieves large improvements on the TED lecture transcription task from the 2012 IWSLT evaluation -- our results are currently the best reported on this task, showing a relative WER reduction of more than 16\% compared to the closest competing system from the evaluation.}
}
@inproceedings{lanchantin13_multigenre_transcription,
  author = {Lanchantin, P. and Bell, P. and Gales, M. and Hain, T. and Liu, X. and Long, Y. and Quinnell, J. and Renals, S. and Saz, O. and Seigel, M. and Swietojanski, P. and Woodland, P.},
  title = {Automatic Transcription of Multi-genre Media Archives},
  booktitle = {Proc. Workshop on Speech, Language and Audio in Multimedia},
  year = {2013},
  address = {Marseille, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lanchantin13_multigenre_transcription.pdf},
  abstract = {This paper describes some recent results of our collaborative work on developing a speech recognition system for the automatic transcription of media archives from the British Broadcasting Corporation (BBC). Material includes a high diversity of shows with their associated transcriptions. The latter are highly diverse in terms of completeness, reliability and accuracy. First, we investigate how to improve lightly supervised acoustic training when time-stamp information is inaccurate or when speech deviates significantly from the transcription. To address the latter issue, word and segment level combination approaches are used between the lightly supervised transcripts and the original programme scripts, which yield improved transcriptions. Experimental results show that systems trained using these improved transcriptions consistently outperform those trained using only the original lightly supervised decoding hypotheses. Secondly, we show that the recognition task may benefit from systems trained on a combination of in-domain and out-of-domain data. Working with tandem HMMs, we present Multi-level Adaptive Networks, a novel technique for incorporating information from out-of-domain posterior features using a deep neural network. We show that it provides a substantial reduction in WER over other systems, including a PLP baseline, in-domain tandem features and the best out-of-domain tandem features.}
}
@inproceedings{jdriesen_asru13,
  author = {Driesen, Joris and Renals, Steve},
  doi = {10.1109/ASRU.2013.6707772},
  title = {Lightly Supervised Automatic Subtitling of Weather Forecasts},
  booktitle = {Proc. Automatic Speech Recognition and Understanding Workshop},
  address = {Olomouc, Czech Republic},
  month = {December},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/asru13.pdf},
  abstract = {Since subtitling television content is a costly process, there are large potential advantages to automating it, using automatic speech recognition (ASR). However, training the necessary acoustic models can be a challenge, since the available training data usually lacks verbatim orthographic transcriptions. If there are approximate transcriptions, this problem can be overcome using light supervision methods. In this paper, we perform speech recognition on broadcasts of Weatherview, the BBC's daily weather report, as a first step towards automatic subtitling. For training, we use a large set of past broadcasts, using their manually created subtitles as approximate transcriptions. We discuss and compare two different light supervision methods, applying them to this data. The best training set finally obtained with these methods is used to create a hybrid deep neural network-based recognition system, which yields high recognition accuracies on three separate Weatherview evaluation sets.}
}
@inproceedings{bourlard_slam2013,
  author = {Bourlard, H. and Ferras, M. and Pappas, N. and Popescu-Belis, A. and Renals, S. and McInnes, F. and Bell, P. and Ingram, S. and Guillemot, M.},
  title = {Processing and Linking Audio Events in Large Multimedia Archives: The {EU} {inEvent} Project},
  booktitle = {Proceedings of SLAM 2013 (First Workshop on Speech, Language and Audio in Multimedia)},
  year = {2013},
  month = {August},
  address = {Marseille, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/bourlard_slam2013.pdf},
  abstract = {In the inEvent EU project, we aim at structuring, retrieving, and sharing large archives of networked, and dynamically changing, multimedia recordings, mainly consisting of meetings, videoconferences, and lectures. More specifically, we are developing an integrated system that performs audiovisual processing of multimedia recordings, and labels them in terms of interconnected "hyper-events" (a notion inspired from hyper-texts). Each hyper-event is composed of simpler facets, including audio-video recordings and metadata, which are then easier to search, retrieve and share. In the present paper, we mainly cover the audio processing aspects of the system, including speech recognition, speaker diarization and linking (across recordings), the use of these features for hyper-event indexing and recommendation, and the search portal. We present initial results for feature extraction from lecture recordings using the TED talks.},
  categories = {networked multimedia events, audio processing: speech recognition, speaker diarization and linking, multimedia indexing and searching, hyper-events}
}
@inproceedings{lu2013_nat,
  author = {Lu, Liang and Ghoshal, Arnab and Renals, Steve},
  title = {Noise adaptive training for subspace {Gaussian} mixture models},
  abstract = {Noise adaptive training (NAT) is an effective approach to normalise environmental distortions when training a speech recogniser on noise-corrupted speech. This paper investigates the model-based NAT scheme using joint uncertainty decoding (JUD) for subspace Gaussian mixture models (SGMMs). A typical SGMM acoustic model has a much larger number of surface Gaussian components, which makes it computationally infeasible to compensate each Gaussian explicitly. JUD tackles this problem by sharing the compensation parameters among the Gaussians and hence reduces the computational and memory demands. For noise adaptive training, JUD is reformulated into a generative model, which leads to an efficient expectation-maximisation (EM) based algorithm to update the SGMM acoustic model parameters. We evaluated the SGMMs with NAT on the Aurora 4 database, and obtained higher recognition accuracy compared to systems without adaptive training.},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lu2013noise.pdf},
  booktitle = {Proc. Interspeech},
  categories = {adaptive training, noise robustness, joint uncertainty decoding, subspace Gaussian mixture models}
}
@inproceedings{lu2013_pronunciation,
  author = {Lu, Liang and Ghoshal, Arnab and Renals, Steve},
  doi = {10.1109/ASRU.2013.6707759},
  title = {Acoustic Data-driven Pronunciation Lexicon for Large Vocabulary Speech Recognition},
  abstract = {Speech recognition systems normally use handcrafted pronunciation lexicons designed by linguistic experts. Building and maintaining such a lexicon is expensive and time consuming. This paper concerns automatically learning a pronunciation lexicon for speech recognition. We assume the availability of a small seed lexicon and then learn the pronunciations of new words directly from speech that is transcribed at word-level. We present two implementations for refining the putative pronunciations of new words based on acoustic evidence. The first one is an expectation maximization (EM) algorithm based on weighted finite state transducers (WFSTs) and the other is its Viterbi approximation. We carried out experiments on the Switchboard corpus of conversational telephone speech. The expert lexicon has a size of more than 30,000 words, from which we randomly selected 5,000 words to form the seed lexicon. By using the proposed lexicon learning method, we have significantly improved the accuracy compared with a lexicon learned using a grapheme-to-phoneme transformation, and have obtained a word error rate that approaches that achieved using a fully handcrafted lexicon.},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/liang_asru13.pdf},
  booktitle = {Proc. ASRU},
  categories = {Lexical modelling, Probabilistic pronunciation model, Automatic speech recognition}
}
@article{lu2013cross,
  author = {Lu, Liang and Ghoshal, Arnab and Renals, Steve},
  doi = {10.1109/TASL.2013.2281575},
  title = {{Cross-lingual subspace {Gaussian} mixture model for low-resource speech recognition}},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {1},
  abstract = {This paper studies cross-lingual acoustic modelling in the context of subspace Gaussian mixture models (SGMMs). SGMMs factorize the acoustic model parameters into a set that is globally shared between all the states of a hidden Markov model (HMM) and another that is specific to the HMM states. We demonstrate that the SGMM global parameters are transferable between languages, particularly when the parameters are trained multilingually. As a result, acoustic models may be trained using limited amounts of transcribed audio by borrowing the SGMM global parameters from one or more source languages, and only training the state-specific parameters on the target language audio. Model regularization using $\ell_1$-norm penalty is shown to be particularly effective at avoiding overtraining and leading to lower word error rates. We investigate maximum a posteriori (MAP) adaptation of subspace parameters in order to reduce the mismatch between the SGMM global parameters of the source and target languages. In addition, monolingual and cross-lingual speaker adaptive training is used to reduce the model variance introduced by speakers. We have systematically evaluated these techniques by experiments on the GlobalPhone corpus.},
  volume = {22},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/lu_crosslingual13.pdf},
  pages = {17--27},
  categories = {acoustic modelling, subspace Gaussian mixture model, cross-lingual speech recognition, regularization, adaptation}
}
@article{Geng2013421,
  author = {Geng, Christian and Turk, Alice and Scobbie, James M. and Macmartin, Cedric and Hoole, Philip and Richmond, Korin and Wrench, Alan and Pouplier, Marianne and Bard, Ellen Gurman and Campbell, Ziggy and Dickie, Catherine and Dubourg, Eddie and Hardcastle, William and Kainada, Evia and King, Simon and Lickley, Robin and Nakai, Satsuki and Renals, Steve and White, Kevin and Wiegand, Ronny},
  doi = {http://dx.doi.org/10.1016/j.wocn.2013.07.002},
  title = {Recording speech articulation in dialogue: Evaluating a synchronized double electromagnetic articulography setup},
  url = {http://www.sciencedirect.com/science/article/pii/S0095447013000375},
  journal = {Journal of Phonetics},
  issn = {0095-4470},
  number = {6},
  abstract = {We demonstrate the workability of an experimental facility that is geared towards the acquisition of articulatory data from a variety of speech styles common in language use, by means of two synchronized electromagnetic articulography (EMA) devices. This approach synthesizes the advantages of real dialogue settings for speech research with a detailed description of the physiological reality of speech production. We describe the facility's method for acquiring synchronized audio streams of two speakers and the system that enables communication among control room technicians, experimenters and participants. Further, we demonstrate the feasibility of the approach by evaluating problems inherent to this specific setup: The first problem is the accuracy of temporal synchronization of the two EMA machines, the second is the severity of electromagnetic interference between the two machines. Our results suggest that the synchronization method used yields an accuracy of approximately 1 ms. Electromagnetic interference was derived from the complex-valued signal amplitudes. This dependent variable was analyzed as a function of the recording status -- i.e. on/off -- of the interfering machine's transmitters. The intermachine distance was varied between 1 m and 8.5 m. Results suggest that a distance of approximately 6.5 m is appropriate to achieve data quality comparable to that of single speaker recordings.},
  volume = {41},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Geng2013421.pdf},
  pages = {421 - 431}
}
@inproceedings{jdriesen:iwslt_german,
  author = {Driesen, Joris and Bell, Peter and Sinclair, Mark and Renals, Steve},
  title = {Description of the {UEDIN} system for {German ASR}},
  booktitle = {Proc IWSLT},
  year = {2013},
  month = {December},
  address = {Heidelberg, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/german_iwslt.pdf},
  abstract = {In this paper we describe the ASR system for German built at the University of Edinburgh (UEDIN) for the 2013 IWSLT evaluation campaign. For ASR, the major challenge to overcome was to find suitable acoustic training data. Due to the lack of expertly transcribed acoustic speech data for German, acoustic model training had to be performed on publicly available data crawled from the internet. For evaluation, the lack of a manual segmentation into utterances was handled in two different ways: by generating an automatic segmentation, and by treating entire input files as a single segment. The latter method proved superior on this task, giving a WER of 28.16% on the dev set and 36.21% on the test set.}
}
@article{cabral2014a,
  author = {Cabral, J.P. and Richmond, K. and Yamagishi, J. and Renals, S.},
  doi = {10.1109/JSTSP.2014.2307274},
  title = {Glottal Spectral Separation for Speech Synthesis},
  journal = {IEEE Journal of Selected Topics in Signal Processing},
  issn = {1932-4553},
  number = {2},
  abstract = {This paper proposes an analysis method to separate the glottal source and vocal tract components of speech that is called Glottal Spectral Separation (GSS). This method can produce high-quality synthetic speech using an acoustic glottal source model. In the source-filter models commonly used in speech technology applications it is assumed that the source is a spectrally flat excitation signal and that the vocal tract filter can be represented by the spectral envelope of speech. Although this model can produce high-quality speech, it has limitations for voice transformation because it does not allow control over glottal parameters which are correlated with voice quality. The main problem with using a speech model that better represents the glottal source and the vocal tract filter is that current analysis methods for separating these components are not robust enough to produce the same speech quality as using a model based on the spectral envelope of speech. The proposed GSS method is an attempt to overcome this problem, and consists of the following three steps. Initially, the glottal source signal is estimated from the speech signal. Then, the speech spectrum is divided by the spectral envelope of the glottal source signal in order to remove the glottal source effects from the speech signal. Finally, the vocal tract transfer function is obtained by computing the spectral envelope of the resulting signal. In this work, the glottal source signal is represented using the Liljencrants-Fant model (LF-model). The experiments we present here show that the analysis-synthesis technique based on GSS can produce speech comparable to that of a high-quality vocoder that is based on the spectral envelope representation. However, it also permits control over voice qualities, namely transforming a modal voice into breathy and tense voices, by modifying the glottal parameters.},
  month = {April},
  volume = {8},
  year = {2014},
  keywords = {Analytical models;Computational modeling;Estimation;Hidden Markov models;Mathematical model;Speech;Speech synthesis;Glottal spectral separation;LF-model;parametric speech synthesis;voice quality transformation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/gss-ieee-2014-final.pdf},
  pages = {195-208}
}
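As an editorial illustration only (not taken from the paper), the three GSS analysis steps summarised in the abstract above can be written schematically, where S(f) is the speech spectrum, G(f) the spectrum of the estimated glottal source (modelled with the LF-model), env(.) denotes spectral-envelope estimation, and the notation is assumed rather than the authors':

\[
  S(f) \;\approx\; G(f)\,V(f), \qquad
  V(f) \;\approx\; \operatorname{env}\!\left(\frac{S(f)}{\operatorname{env}\big(G(f)\big)}\right)
\]

Synthesis then recombines an LF-model glottal source with the recovered vocal tract transfer function V(f).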
@inproceedings{watts-2014,
  author = {Watts, Oliver and Gangireddy, Siva and Yamagishi, Junichi and King, Simon and Renals, Steve and Stan, Adriana and Giurgiu, Mircea},
  title = {Neural Net Word Representations for Phrase-Break Prediction Without a Part of Speech Tagger},
  booktitle = {Proc. ICASSP},
  year = {2014},
  month = {May},
  pages = {2618--2622},
  address = {Florence, Italy},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/watts-2014.pdf},
  abstract = {The use of shared projection neural nets of the sort used in language modelling is proposed as a way of sharing parameters between multiple text-to-speech system components. We experiment with pretraining the weights of such a shared projection on an auxiliary language modelling task and then apply the resulting word representations to the task of phrase-break prediction. Doing so allows us to build phrase-break predictors that rival conventional systems without any reliance on conventional knowledge-based resources such as part of speech taggers.},
  categories = {Speech synthesis, TTS, unsupervised learning, neural net language modelling, multitask learning}
}
@inproceedings{bell13_iwslt,
  author = {Bell, Peter and McInnes, Fergus and Gangireddy, Siva Reddy and Sinclair, Mark and Birch, Alexandra and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/bell13_iwslt_system.pdf},
  booktitle = {Proc. International Workshop on Spoken Language Translation},
  title = {The {UEDIN} English {ASR} System for the {IWSLT} 2013 Evaluation},
  abstract = {This paper describes the University of Edinburgh (UEDIN) English ASR system for the IWSLT 2013 Evaluation. Notable features of the system include deep neural network acoustic models in both tandem and hybrid configuration, cross-domain adaptation with multi-level adaptive networks, and the use of a recurrent neural network language model. Improvements to our system since the 2012 evaluation -- which include the use of a significantly improved n-gram language model -- result in a 19\% relative WER reduction on the \tstD set.},
  year = {2013}
}
@article{Swietojanski:SPL14,
  author = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
  doi = {10.1109/LSP.2014.2325781},
  title = {Convolutional Neural Networks for Distant Speech Recognition},
  journal = {IEEE Signal Processing Letters},
  issn = {1070-9908},
  number = {9},
  month = {September},
  volume = {21},
  pages = {1120-1124},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Swietojanski_SPL14.pdf},
  abstract = {We investigate convolutional neural networks (CNNs) for large vocabulary distant speech recognition, trained using speech recorded from a single distant microphone (SDM) and multiple distant microphones (MDM). In the MDM case we explore a beamformed signal input representation compared with the direct use of multiple acoustic channels as a parallel input to the CNN. We have explored different weight sharing approaches, and propose a channel-wise convolution with two-way pooling. Our experiments, using the AMI meeting corpus, found that CNNs improve the word error rate (WER) by 6.5% relative compared to conventional deep neural network (DNN) models and 15.7% over a discriminatively trained Gaussian mixture model (GMM) baseline. For cross-channel CNN training, the WER improves by 3.5% relative over the comparable DNN structure. Compared with the best beamformed GMM system, cross-channel convolution reduces the WER by 9.7% relative, and matches the accuracy of a beamformed DNN.},
  categories = {distant speech recognition, deep neural networks, convolutional neural networks, meetings, AMI corpus}
}
@inproceedings{Renals:HSCMA14,
  author = {Renals, S. and Swietojanski, P.},
  title = {Neural Networks for Distant Speech Recognition},
  booktitle = {The 4th Joint Workshop on Hands-free Speech Communication and Microphone Arrays (HSCMA)},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/srenals-hscma2014.pdf},
  abstract = {Distant conversational speech recognition is challenging owing to the presence of multiple, overlapping talkers, additional non-speech acoustic sources, and the effects of reverberation. In this paper we review work on distant speech recognition, with an emphasis on approaches which combine multichannel signal processing with acoustic modelling, and investigate the use of hybrid neural network / hidden Markov model acoustic models for distant speech recognition of meetings recorded using microphone arrays. In particular we investigate the use of convolutional and fully-connected neural networks with different activation functions (sigmoid, rectified linear, and maxout). We performed experiments on the AMI and ICSI meeting corpora, with results indicating that neural network models are capable of significant improvements in accuracy compared with discriminatively trained Gaussian mixture models.},
  categories = {convolutional neural networks, distant speech recognition, rectifier unit, maxout networks, beamforming, meetings, AMI corpus, ICSI corpus}
}
@inproceedings{zwyssig2013-overlap_SS_MEMS,
  author = {Zwyssig, E. and Faubel, F. and Renals, S. and Lincoln, M.},
  doi = {10.1109/ICASSP.2013.6639033},
  title = {Recognition of overlapping speech using digital {MEMS} microphone arrays},
  booktitle = {Proc IEEE ICASSP},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/zwyssig3344-final.pdf},
  abstract = {This paper presents a new corpus comprising single and overlapping speech recorded using digital MEMS and analogue microphone arrays. In addition to this, the paper presents results from speech separation and recognition experiments on this data. The corpus is a reproduction of the multi-channel Wall Street Journal audio-visual corpus (MC-WSJ-AV), containing recorded speech in both a meeting room and an anechoic chamber using two different microphone types as well as two different array geometries. The speech separation and speech recognition experiments were performed using SRP-PHAT-based speaker localisation, superdirective beamforming and multiple post-processing schemes, such as residual echo suppression and binary masking. Our simple, cMLLR-based recognition system matches the performance of state-of-the-art ASR systems on the single speaker task and outperforms them on overlapping speech. The corpus will be made publicly available via the LDC in spring 2013.}
}
@book{renals2012,
  editor = {Renals, Steve and Bourlard, Herv{\'e} and Carletta, Jean and {Popescu-Belis}, Andrei},
  publisher = {Cambridge University Press},
  title = {Multimodal Signal Processing: Human Interactions in Meetings},
  isbn = {9781107022294},
  year = {2012}
}
@article{lu2014a,
  author = {Lu, Liang and Renals, Steve},
  doi = {10.1109/LSP.2014.2313410},
  title = {Probabilistic Linear Discriminant Analysis for Acoustic Modelling},
  journal = {IEEE Signal Processing Letters},
  number = {6},
  pages = {702--706},
  volume = {21},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/plda-spl2014.pdf},
  abstract = {In this letter, we propose a new acoustic modelling approach for automatic speech recognition based on probabilistic linear discriminant analysis (PLDA), which is used to model the state density function for the standard hidden Markov models (HMMs). Unlike the conventional Gaussian mixture models (GMMs) where the correlations are weakly modelled by using the diagonal covariance matrices, PLDA captures the correlations of the feature vector in subspaces without vastly expanding the model. It also allows the use of high-dimensional feature input, and is therefore more flexible in making use of different types of acoustic features. We performed preliminary experiments on the Switchboard corpus, and demonstrated the feasibility of this acoustic model.}
}
@inproceedings{Swietojanski2014_lhuc,
  author = {Swietojanski, P. and Renals, S.},
  title = {Learning Hidden Unit Contributions for Unsupervised Speaker Adaptation of Neural Network Acoustic Models},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  year = {2014},
  month = {December},
  address = {Lake Tahoe, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/ps-slt14.pdf},
  abstract = {This paper proposes a simple yet effective model-based neural network speaker adaptation technique that learns speaker-specific hidden unit contributions given adaptation data, without requiring any form of speaker-adaptive training, or labelled adaptation data. An additional amplitude parameter is defined for each hidden unit; the amplitude parameters are tied for each speaker, and are learned using unsupervised adaptation. We conducted experiments on the TED talks data, as used in the International Workshop on Spoken Language Translation (IWSLT) evaluations. Our results indicate that the approach can reduce word error rates on standard IWSLT test sets by about 8–15% relative compared to unadapted systems, with a further reduction of 4–6% relative when combined with feature-space maximum likelihood linear regression (fMLLR). The approach can be employed in most existing feed-forward neural network architectures, and we report results using various hidden unit activation functions: sigmoid, maxout, and rectifying linear units (ReLU).},
  categories = {Speaker Adaptation, Deep Neural Networks, TED, IWSLT, LHUC}
}
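A minimal sketch of the re-scaling mechanism described in the abstract above: one learnable amplitude per hidden unit, tied per speaker, learned from unsupervised adaptation data. This is illustrative NumPy only, not the authors' implementation; the 2*sigmoid(.) range constraint and all names are assumptions.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lhuc_forward(h, theta):
    # h: (batch, n_hidden) speaker-independent hidden activations
    # theta: (n_hidden,) speaker-specific LHUC parameters, one per hidden unit
    r = 2.0 * sigmoid(theta)   # amplitudes constrained to (0, 2); this parameterisation is an assumption
    return h * r               # re-scale each hidden unit's contribution

# During unsupervised adaptation only theta is updated (e.g. by gradient descent
# against first-pass transcripts); all speaker-independent weights stay fixed.
h = np.random.randn(4, 8)      # toy activations
theta = np.zeros(8)            # r = 1 everywhere: the unadapted network
print(lhuc_forward(h, theta))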
@inproceedings{lai2014,
  author = {Lai, Catherine and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/lai2014incorporating.pdf},
  booktitle = {Proc. Interspeech 2014},
  year = {2014},
  abstract = {This paper investigates how prosodic features can be used to augment lexical features for meeting summarization. Automatic detection of summary-worthy content using non-lexical features, like prosody, has generally focused on features calculated over dialogue acts. However, a salient role of prosody is to distinguish important words within utterances. To examine whether including more fine grained prosodic information can help extractive summarization, we perform experiments incorporating lexical and prosodic features at different levels. For ICSI and AMI meeting corpora, we find that combining prosodic and lexical features at a lower level has better AUROC performance than adding in prosodic features derived over dialogue acts. ROUGE F-scores also show the same pattern for the ICSI data. However, the differences are less clear for the AMI data where the range of scores is much more compressed. In order to understand the relationship between the generated summaries and differences in standard measures, we look at the distribution of extracted content over meeting as well as summary redundancy. We find that summaries based on dialogue act level prosody better reflect the amount of human annotated summary content in meeting segments, while summaries derived from prosodically augmented lexical features exhibit less redundancy.},
  title = {Incorporating Lexical and Prosodic Information at Different Levels for Meeting Summarization}
}
@inproceedings{llu_IS2014,
  author = {Lu, Liang and Renals, Steve},
  title = {Probabilistic linear discriminant analysis with bottleneck features for speech recognition},
  abstract = {We have recently proposed a new acoustic model based on probabilistic linear discriminant analysis (PLDA) which enjoys the flexibility of using higher dimensional acoustic features, and is better able to capture the intra-frame feature correlations. In this paper, we investigate the use of bottleneck features obtained from a deep neural network (DNN) for the PLDA-based acoustic model. Experiments were performed on the Switchboard dataset --- a large vocabulary conversational telephone speech corpus. We observe a significant word error rate reduction by using the bottleneck features. In addition, we have also compared the PLDA-based acoustic model to three others using Gaussian mixture models (GMMs), subspace GMMs and hybrid deep neural networks (DNNs); in our experiments PLDA achieves comparable or slightly higher recognition accuracy.},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/llu_is14.pdf},
  booktitle = {Proc. Interspeech},
  categories = {speech recognition, bottleneck features, probabilistic linear discriminant analysis}
}
@inproceedings{bell14_xling_mlan,
  author = {Bell, P. and Driesen, J. and Renals, S.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell_xling_mlan_is2014.pdf},
  booktitle = {Proc. Interspeech},
  year = {2014},
  abstract = {Posterior-based or bottleneck features derived from neural networks trained on out-of-domain data may be successfully applied to improve speech recognition performance when data is scarce for the target domain or language. In this paper we combine this approach with the use of a hierarchical deep neural network (DNN) network structure -- which we term a multi-level adaptive network (MLAN) -- and the use of multitask learning. We have applied the technique to cross-lingual speech recognition experiments on recordings of TED talks and European Parliament sessions in English (source language) and German (target language). We demonstrate that the proposed method can lead to improvements over standard methods, even when the quantity of training data for the target language is relatively high. When the complete method is applied, we achieve relative WER reductions of around 13\% compared to a monolingual hybrid DNN baseline.},
  title = {Cross-lingual adaptation with multi-task adaptive networks}
}
@inproceedings{sgangireddy_interspeech14,
  author = {Gangireddy, Siva Reddy and McInnes, Fergus and Renals, Steve},
  title = {Feed Forward Pre-Training for Recurrent Neural Network Language Models},
  booktitle = {Proc. Interspeech},
  abstract = {The recurrent neural network language model (RNNLM) has been demonstrated to consistently reduce perplexities and automatic speech recognition (ASR) word error rates across a variety of domains. In this paper we propose a pre-training method for the RNNLM, by sharing the output weights of the feed forward neural network language model (NNLM) with the RNNLM. This is accomplished by first fine-tuning the weights of the NNLM, which are then used to initialise the output weights of an RNNLM with the same number of hidden units. We have carried out text-based experiments on the Penn Treebank Wall Street Journal data, and ASR experiments on the TED talks data used in the International Workshop on Spoken Language Translation (IWSLT) evaluation campaigns. Across the experiments, we observe small improvements in perplexity and ASR word error rate.},
  month = {September},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/srg-interspeech14.pdf},
  pages = {2620-2624},
  categories = {Language Modelling, Recurrent Neural Network, Pre-training, Automatic Speech Recognition, TED talks}
}
@inproceedings{SwietojanskiICASSP15,
  author = {Swietojanski, P. and Renals, S.},
  title = {Differentiable Pooling for Unsupervised Speaker Adaptation},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/Swietojanski_ICASSP2015.pdf},
  abstract = {This paper proposes a differentiable pooling mechanism to perform model-based neural network speaker adaptation. The proposed technique learns a speaker-dependent combination of activations within pools of hidden units, works well unsupervised, and does not require speaker-adaptive training. We have conducted a set of experiments on the TED talks data, as used in the IWSLT evaluations. Our results indicate that the approach can reduce word error rates (WERs) on standard IWSLT test sets by about 5–11% relative compared to speaker-independent systems and was found complementary to the recently proposed learning hidden units contribution (LHUC) approach, reducing WER by 6–13% relative. Both methods were also found to work well when adapting with small amounts of unsupervised data: 10 seconds of adaptation data is enough to decrease the WER by 5% relative compared to the baseline speaker-independent system.},
  categories = {Differentiable pooling, Speaker Adaptation, Deep Neural Networks, TED, LHUC}
}
@inproceedings{Swietojanski2015,
  author = {Swietojanski, P. and Bell, P. and Renals, S.},
  title = {Structured Output Layer with Auxiliary Targets for Context-Dependent Acoustic Modelling},
  booktitle = {Proc. Interspeech},
  year = {2015},
  abstract = {In previous work we have introduced a multi-task training technique for neural network acoustic modelling, in which context-dependent and context-independent targets are jointly learned. In this paper, we extend the approach by structuring the output layer such that the context-dependent outputs are dependent on the context-independent outputs, thus using the context-independent predictions at run-time. We have also investigated the applicability of this idea to unsupervised speaker adaptation as an approach to overcome the data sparsity issues that come to the fore when estimating systems with a large number of context-dependent states, when data is limited. We have experimented with various amounts of training material (from 10 to 300 hours) and find the proposed techniques are particularly well suited to data-constrained conditions, allowing better use of large context-dependent state-clustered trees. Experimental results are reported for large vocabulary speech recognition using the Switchboard and TED corpora.},
  month = {September},
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/Swietojanski_Interspeech2015.pdf},
  categories = {multitask learning, structured output layer, adaptation, deep neural networks}
}
@inproceedings{rnade_ICASSP15,
  author = {Uria, B. and Murray, I. and Renals, S. and Valentini-Botinhao, C. and Bridle, J.},
  title = {{Modelling acoustic feature dependencies with artificial neural networks: Trajectory-RNADE}},
  booktitle = {Proc. ICASSP},
  year = {2015},
  abstract = {Given a transcription, sampling from a good model of acoustic feature trajectories should result in plausible realizations of an utterance. However, samples from current probabilistic speech synthesis systems result in low quality synthetic speech. Henter et al. have demonstrated the need to capture the dependencies between acoustic features conditioned on the phonetic labels in order to obtain high quality synthetic speech. These dependencies are often ignored in neural network based acoustic models. We tackle this deficiency by introducing a probabilistic neural network model of acoustic trajectories, trajectory RNADE, able to capture these dependencies.},
  month = {April},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/Uria2015.pdf},
  pages = {4465-4469}
}
@inproceedings{bell15_cd_multitask,
  author = {Bell, P. and Renals, S.},
  title = {Regularization of context-dependent deep neural networks with context-independent multi-task training},
  booktitle = {Proc. ICASSP},
  address = {Brisbane, Australia},
  month = {April},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_cd_multitask.pdf},
  abstract = {The use of context-dependent targets has become standard in hybrid DNN systems for automatic speech recognition. However, we argue that despite the use of state-tying, optimising to context-dependent targets can lead to over-fitting, and that discriminating between arbitrary tied context-dependent targets may not be optimal. We propose a multitask learning method where the network jointly predicts context-dependent and monophone targets. We evaluate the method on a large-vocabulary lecture recognition task and show that it yields relative improvements of 3--10\% over baseline systems.}
}
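A rough sketch of the joint objective described in the abstract above: a shared hidden representation feeds two softmax output layers, one over tied context-dependent states and one over monophones, and the training loss sums the two cross-entropies. The layer sizes, equal task weighting, and all names are illustrative assumptions, not details taken from the paper.

import numpy as np

rng = np.random.default_rng(0)

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def cross_entropy(probs, targets):
    # mean negative log-probability of the target class
    return -np.log(probs[np.arange(len(targets)), targets] + 1e-12).mean()

n_hidden, n_cd, n_mono = 32, 2000, 40                     # illustrative sizes only
W_cd = rng.normal(scale=0.01, size=(n_hidden, n_cd))      # context-dependent (tied-state) output layer
W_mono = rng.normal(scale=0.01, size=(n_hidden, n_mono))  # monophone output layer (secondary task)

h = rng.normal(size=(8, n_hidden))                        # shared hidden representation for a minibatch
cd_targets = rng.integers(0, n_cd, size=8)
mono_targets = rng.integers(0, n_mono, size=8)

# Joint multitask objective: both losses back-propagate into the shared layers.
loss = cross_entropy(softmax(h @ W_cd), cd_targets) + cross_entropy(softmax(h @ W_mono), mono_targets)
print(loss)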
@inproceedings{bell14_iwslt,
  author = {Bell, Peter and Swietojanski, Pawel and Driesen, Joris and Sinclair, Mark and McInnes, Fergus and Renals, Steve},
  title = {The {UEDIN} {ASR} Systems for the {IWSLT} 2014 Evaluation},
  booktitle = {Proc. IWSLT},
  address = {South Lake Tahoe, USA},
  month = {December},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell14_iwslt.pdf},
  abstract = {This paper describes the University of Edinburgh (UEDIN) ASR systems for the 2014 IWSLT Evaluation. Notable features of the English system include deep neural network acoustic models in both tandem and hybrid configuration with the use of multi-level adaptive networks, LHUC adaptation and Maxout units. The German system includes lightly supervised training and a new method for dictionary generation. Our voice activity detection system now uses a semi-Markov model to incorporate a prior on utterance lengths. There are improvements of up to 30\% relative WER on the tst2013 English test set.}
}
@inproceedings{bell15_complementary_task_mt,
  author = {Bell, Peter and Renals, Steve},
  title = {Complementary tasks for context-dependent deep neural network acoustic models},
  booktitle = {Proc. Interspeech},
  address = {Dresden, Germany},
  month = {September},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_complementary_task_mt.pdf},
  abstract = {We have previously found that context-dependent DNN models for automatic speech recognition can be improved with the use of monophone targets as a secondary task for the network. This paper asks whether the improvements derive from the regularising effect of having a much smaller number of monophone outputs -- compared to the typical number of tied states -- or from the use of targets that are not tied to an arbitrary state-clustering. We investigate the use of factorised targets for left and right context, and targets motivated by articulatory properties of the phonemes. We present results on a large-vocabulary lecture recognition task. Although the regularising effect of monophones seems to be important, all schemes give substantial improvements over the baseline single task system, even though the cardinality of the outputs is relatively high.}
}
@inproceedings{wu2015adaptation,
  author = {Wu, Zhizheng and Swietojanski, Pawel and Veaux, Christophe and Renals, Steve and King, Simon},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/is2015_dnn_adaptation.pdf},
  booktitle = {Interspeech},
  year = {2015},
  title = {A study of speaker adaptation for {DNN}-based speech synthesis}
}
@inproceedings{llu_is2015b,
  author = {Lu, Liang and Zhang, Xingxing and Cho, KyungHyun and Renals, Steve},
  title = {A Study of the Recurrent Neural Network Encoder-Decoder for Large Vocabulary Speech Recognition},
  abstract = {Deep neural networks have advanced the state-of-the-art in automatic speech recognition, when combined with hidden Markov models (HMMs). Recently there has been interest in using systems based on recurrent neural networks (RNNs) to perform sequence modelling directly, without the requirement of an HMM superstructure. In this paper, we study the RNN encoder-decoder approach for large vocabulary end-to-end speech recognition, whereby an encoder transforms a sequence of acoustic vectors into a sequence of feature representations, from which a decoder recovers a sequence of words. We investigated this approach on the Switchboard corpus using a training set of around 300 hours of transcribed audio data. Without the use of an explicit language model or pronunciation lexicon, we achieved promising recognition accuracy, demonstrating that this approach warrants further investigation.},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/liang_is15a.pdf},
  booktitle = {Proc. Interspeech},
  categories = {end-to-end speech recognition, deep neural networks, recurrent neural networks, encoder-decoder},
}
@inproceedings{llu_is2015a,
  author = {Lu, Liang and Renals, Steve},
  title = {Feature-space Speaker Adaptation for Probabilistic Linear Discriminant Analysis Acoustic Models},
  abstract = {Probabilistic linear discriminant analysis (PLDA) acoustic models extend Gaussian mixture models by factorizing the acoustic variability using state-dependent and observation-dependent variables. This enables the use of higher dimensional acoustic features, and the capture of intra-frame feature correlations. In this paper, we investigate the estimation of speaker adaptive feature-space (constrained) maximum likelihood linear regression transforms from PLDA-based acoustic models. This feature-space speaker transformation estimation approach is potentially very useful due to the ability of PLDA acoustic models to use different types of acoustic features, for example applying these transforms to deep neural network (DNN) acoustic models for cross adaptation. We evaluated the approach on the Switchboard corpus, and observe significant word error reduction by using both the mel-frequency cepstral coefficients and DNN bottleneck features.},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/liang_is15b.pdf},
  booktitle = {Proc. Interspeech},
  categories = {speech recognition, probabilistic linear discriminant analysis, speaker adaptation, fMLLR, PLDA},
}
@inproceedings{llu_icassp15,
  author = {Lu, Liang and Renals, Steve},
  title = {Multi-frame factorisation for long-span acoustic modelling},
  abstract = {Acoustic models based on Gaussian mixture models (GMMs) typically use short span acoustic feature inputs. This does not capture long-term temporal information from speech owing to the conditional independence assumption of hidden Markov models. In this paper, we present an implicit approach that approximates the joint distribution of long span features by product of factorized models, in contrast to deep neural networks (DNNs) that model feature correlations directly. The approach is applicable to a broad range of acoustic models. We present experiments using GMM and probabilistic linear discriminant analysis (PLDA) based models on Switchboard, observing consistent word error rate reductions.},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/llu_icassp15.pdf},
  booktitle = {Proc. ICASSP},
  categories = {Acoustic modelling, long span features, multi-frame factorisation},
}
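Schematically, the factorisation described in the abstract above approximates the joint likelihood of a (2k+1)-frame window given HMM state s by a product of per-offset models; this is an editorial restatement in assumed notation, not the paper's own formulation:

\[
  p(\mathbf{y}_{t-k}, \ldots, \mathbf{y}_{t+k} \mid s) \;\approx\; \prod_{i=-k}^{k} p_i(\mathbf{y}_{t+i} \mid s)
\]

where each factor p_i may itself be a GMM or a PLDA acoustic model.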
@inproceedings{Swietojanski_ICASSP2016,
  author = {Swietojanski, P. and Renals, S.},
  title = {SAT-LHUC: Speaker Adaptive Training for Learning Hidden Unit Contributions},
  booktitle = {Proc. IEEE ICASSP},
  address = {Shanghai, China},
  month = {March},
  year = {2016},
  pages = {5010--5014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Swietojanski_ICASSP2016.pdf},
  abstract = {This paper extends learning hidden unit contributions (LHUC) unsupervised speaker adaptation with speaker adaptive training (SAT). Contrary to other SAT approaches, the proposed technique does not require speaker-dependent features, the generation of auxiliary generative models to estimate or extract speaker-dependent information, or any changes to the speaker-independent model structure. SAT-LHUC is directly integrated into the objective and jointly learns speaker-independent and speaker-dependent representations. We demonstrate that the SAT-LHUC technique can match feature-space regression transforms for matched narrow-band data and outperform it on wide-band data when the runtime distribution differs significantly from training one. We have obtained 6.5%, 10% and 18.5% relative word error rate reductions compared to speaker-independent models on Switchboard, AMI meetings and TED lectures, respectively. This corresponds to relative gains of 2%, 4% and 6% compared with non-SAT LHUC adaptation. SAT-LHUC was also found to be complementary to SAT with feature-space maximum likelihood linear regression transforms.},
  categories = {SAT, Deep Neural Networks, LHUC}
}
@inproceedings{bell15_alignment,
  author = {Bell, Peter and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_alignment.pdf},
  booktitle = {Proc. ASRU},
  title = {A system for automatic alignment of broadcast media captions using weighted finite-state transducers},
  abstract = {We describe our system for alignment of broadcast media captions in the 2015 MGB Challenge. A precise time alignment of previously-generated subtitles to media data is important in the process of caption generation by broadcasters. However, this task is challenging due to the highly diverse, often noisy content of the audio, and because the subtitles are frequently not a verbatim representation of the actual words spoken. Our system employs a two-pass approach with appropriately constrained weighted finite state transducers (WFSTs) to enable good alignment even when the audio quality would be challenging for conventional ASR. The system achieves an f-score of 0.8965 on the MGB Challenge development set.},
  year = {2015}
}
@inproceedings{ali15_multi_wer_asr,
  author = {Ali, Ahmed and Magdy, Walid and Bell, Peter and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/asru2015-multi-reference.pdf},
  booktitle = {Proc. ASRU},
  title = {Multi-reference {WER} for evaluating {ASR} for languages with no orthographic rules},
  year = {2015}
}
@inproceedings{bell15_mgb_challenge,
  author = {Bell, Peter and Gales, Mark and Hain, Thomas and Kilgour, Jonathan and Lanchantin, Pierre and Liu, Xunying and McParland, Andrew and Renals, Steve and Saz, Oscar and Wester, Mirjam and Woodland, Phil},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_mgb_challenge.pdf},
  booktitle = {Proc. ASRU},
  title = {The {MGB} challenge: Evaluating multi-genre broadcast media recognition},
  abstract = {This paper describes the Multi-Genre Broadcast (MGB) Challenge at ASRU~2015, an evaluation focused on speech recognition, speaker diarization, and ``lightly supervised'' alignment of BBC TV recordings. The challenge training data covered the whole range of seven weeks BBC TV output across four channels, resulting in about 1,600 hours of broadcast audio. In addition several hundred million words of BBC subtitle text was provided for language modelling. A novel aspect of the evaluation was the exploration of speech recognition and speaker diarization in a longitudinal setting -- i.e. recognition of several episodes of the same show, and speaker diarization across these episodes, linking speakers. The longitudinal tasks also offered the opportunity for systems to make use of supplied metadata including show title, genre tag, and date/time of transmission. This paper describes the task data and evaluation process used in the MGB challenge, and summarises the results obtained.},
  year = {2015}
}
@inproceedings{joachim_fainberg_improving_2016,
  author = {Fainberg, Joachim and Bell, Peter and Lincoln, Mike and Renals, Steve},
  title = {Improving Children's Speech Recognition through Out-of-Domain Data Augmentation},
  abstract = {Children’s speech poses challenges to speech recognition due to strong age-dependent anatomical variations and a lack of large, publicly-available corpora. In this paper we explore data augmentation for children’s speech recognition using stochastic feature mapping (SFM) to transform out-of-domain adult data for both GMM-based and DNN-based acoustic models. We performed experiments on the English PF-STAR corpus, augmenting using WSJCAM0 and ABI. Our experimental results indicate that a DNN acoustic model for children’s speech can make use of adult data, and that out-of-domain SFM is more accurate than in-domain SFM.},
  address = {San Francisco, USA},
  month = {September},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/master.pdf},
  booktitle = {Proc. Interspeech},
  categories = {speech recognition, data augmentation, children’s speech}
}
@inproceedings{ali16_dialect_detection,
  author = {Ali, Ahmed and Dehak, Najim and Cardinal, Patrick and Khurana, Sameer and Yella, Sree Harsha and Glass, James and Bell, Peter and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/is2016-automatic-dialect-detection.pdf},
  booktitle = {Proc. Interspeech},
  title = {Automatic dialect detection in arabic broadcast speech},
  abstract = {In this paper, we investigate different approaches for dialect identification in Arabic broadcast speech. These methods are based on phonetic and lexical features obtained from a speech recognition system, and bottleneck features using the i-vector framework. We studied both generative and discriminative classifiers, and we combined these features using a multi-class Support Vector Machine (SVM). We validated our results on an Arabic/English language identification task, with an accuracy of 100\%. We also evaluated these features in a binary classifier to discriminate between Modern Standard Arabic (MSA) and Dialectal Arabic, with an accuracy of 100\%. We further reported results using the proposed methods to discriminate between the five most widely used dialects of Arabic: namely Egyptian, Gulf, Levantine, North African, and MSA, with an accuracy of 59.2\%. We discuss dialect identification errors in the context of dialect code-switching between Dialectal Arabic and MSA, and compare the error pattern between manually labeled data, and the output from our classifier. All the data used on our experiments have been released to the public as a language identification corpus.},
  year = {2016}
}
@article{swietojanski2016lhuc,
  author = {Swietojanski, P. and Li, J. and Renals, S.},
  doi = {10.1109/TASLP.2016.2560534},
  title = {Learning Hidden Unit Contributions for Unsupervised Acoustic Model Adaptation},
  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  issn = {2329-9290},
  number = {8},
  month = {August},
  volume = {24},
  pages = {1450-1463},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/swietojanski2016lhuc.pdf},
  abstract = {This work presents a broad study on the adaptation of neural network acoustic models by means of learning hidden unit contributions (LHUC) -- a method that linearly re-combines hidden units in a speaker- or environment-dependent manner using small amounts of unsupervised adaptation data. We also extend LHUC to a speaker adaptive training (SAT) framework that leads to a more adaptable DNN acoustic model, working both in a speaker-dependent and a speaker-independent manner, without the requirements to maintain auxiliary speaker-dependent feature extractors or to introduce significant speaker-dependent changes to the DNN structure. Through a series of experiments on four different speech recognition benchmarks (TED talks, Switchboard, AMI meetings, and Aurora4) comprising 270 test speakers, we show that LHUC in both its test-only and SAT variants results in consistent word error rate reductions ranging from 5\% to 23\% relative depending on the task and the degree of mismatch between training and test data. In addition, we have investigated the effect of the amount of adaptation data per speaker, the quality of unsupervised adaptation targets, the complementarity to other adaptation techniques, one-shot adaptation, and an extension to adapting DNNs trained in a sequence discriminative manner.}
}
@article{swietojanski2016diffp,
  author = {{Swietojanski}, P. and {Renals}, S.},
  doi = {10.1109/TASLP.2016.2584700},
  title = {{Differentiable Pooling for Unsupervised Acoustic Model Adaptation}},
  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  issn = {2329-9290},
  number = {10},
  month = {October},
  volume = {24},
  pages = {1773-1784},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/swietojanski2016diffp.pdf},
  abstract = {We present a deep neural network (DNN) acoustic model that includes parametrised and differentiable pooling operators. Unsupervised acoustic model adaptation is cast as the problem of updating the decision boundaries implemented by each pooling operator. In particular, we experiment with two types of pooling parametrisations: learned $L_p$-norm pooling and weighted Gaussian pooling, in which the weights of both operators are treated as speaker-dependent. We perform investigations using three different large vocabulary speech recognition corpora: AMI meetings, TED talks and Switchboard conversational telephone speech. We demonstrate that differentiable pooling operators provide a robust and relatively low-dimensional way to adapt acoustic models, with relative word error rates reductions ranging from 5--20\% with respect to unadapted systems, which themselves are better than the baseline fully-connected DNN-based acoustic models. We also investigate how the proposed techniques work under various adaptation conditions including the quality of adaptation data and complementarity to other feature- and model-space adaptation methods, as well as providing an analysis of the characteristics of each of the proposed approaches.}
}
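An illustrative NumPy sketch of the learned L_p-norm pooling mentioned in the abstract above, with one norm order per pool treated as the speaker-dependent parameter at adaptation time. Whether the norm averages or sums within a pool, and all names, are assumptions rather than details from the paper.

import numpy as np

def lp_pool(h, p, pool_size):
    # h: (batch, n_units) hidden activations, n_units divisible by pool_size
    # p: (n_pools,) learnable norm orders, one per pool; these would be
    #    updated per speaker during unsupervised adaptation
    batch, n_units = h.shape
    groups = np.abs(h).reshape(batch, -1, pool_size)              # (batch, n_pools, pool_size)
    return (groups ** p[None, :, None]).mean(axis=-1) ** (1.0 / p[None, :])

h = np.random.randn(4, 12)
p = np.full(4, 2.0)          # p = 2 gives root-mean-square pooling
print(lp_pool(h, p, pool_size=3).shape)   # -> (4, 4)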
@inproceedings{sgangireddy_interspeech15,
  author = {Gangireddy, Siva Reddy and Renals, Steve and Nankaku, Yoshihiko and Lee, Akinobu},
  title = {Prosodically-enhanced Recurrent Neural Network Language Models},
  booktitle = {Proc. Interspeech},
  address = {Dresden, Germany},
  abstract = {Recurrent neural network language models have been shown to consistently reduce the word error rates (WERs) of large vocabulary speech recognition tasks. In this work we propose to enhance the RNNLMs with prosodic features computed using the context of the current word. Since it is plausible to compute the prosody features at the word and syllable level we have trained the models on prosody features computed at both these levels. To investigate the effectiveness of proposed models we report perplexity and WER for two speech recognition tasks, Switchboard and TED. We observed substantial improvements in perplexity and small improvements in WER.},
  month = {September},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/ispeech15.pdf},
  pages = {2390--2394},
  categories = {RNNLMs, 3-gram, prosody features, pause duration, duration of the word, syllable duration, syllable F0, GMM-HMM, DNN-HMM, Switchboard conversations and TED lectures}
}
@inproceedings{sgangireddy_interspeech16,
  author = {Gangireddy, Siva Reddy and Swietojanski, Pawel and Bell, Peter and Renals, Steve},
  title = {{Unsupervised adaptation of Recurrent Neural Network Language Models}},
  booktitle = {Proc. Interspeech},
  year = {2016},
  month = {September},
  address = {San Francisco, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ispeech16.pdf},
  abstract = {Recurrent neural network language models (RNNLMs) have been shown to consistently improve Word Error Rates (WERs) of large vocabulary speech recognition systems employing n-gram LMs. In this paper we investigate supervised and unsupervised discriminative adaptation of RNNLMs in a broadcast transcription task to target domains defined by either genre or show. We have explored two approaches based on (1) scaling forward-propagated hidden activations (Learning Hidden Unit Contributions (LHUC) technique) and (2) direct fine-tuning of the parameters of the whole RNNLM. To investigate the effectiveness of the proposed methods we carry out experiments on multi-genre broadcast (MGB) data following the MGB-2015 challenge protocol. We observe small but significant improvements in WER compared to a strong unadapted RNNLM model.},
  categories = {RNNLM, LHUC, unsupervised adaptation, fine-tuning, MGB-Challenge}
}
@inproceedings{fainberg2017factorised,
  author = {Fainberg, Joachim and Renals, Steve and Bell, Peter},
  title = {Factorised Representations for Neural Network Adaptation to Diverse Acoustic Environments},
  booktitle = {Proc. Interspeech},
  abstract = {Adapting acoustic models jointly to both speaker and environment has been shown to be effective. In many realistic scenarios, however, either the speaker or environment at test time might be unknown, or there may be insufficient data to learn a joint transform. Generating independent speaker and environment transforms improves the match of an acoustic model to unseen combinations. Using i-vectors, we demonstrate that it is possible to factorise speaker or environment information using multi-condition training with neural networks. Specifically, we extract bottleneck features from networks trained to classify either speakers or environments. We perform experiments on the Wall Street Journal corpus combined with environment noise from the Diverse Environments Multichannel Acoustic Noise Database. Using the factorised i-vectors we show improvements in word error rates on perturbed versions of the eval92 and dev93 test sets, both when one factor is missing and when the factors are seen but not in the desired combination.},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/joachimIS2017.pdf},
  pages = {749--753},
  categories = {speech recognition, adaptation, acoustic factorisation, i-vectors, deep neural networks}
}
@article{bell2017multitask,
  author = {Bell, Peter and Swietojanski, Pawel and Renals, Steve},
  publisher = {IEEE},
  title = {Multitask Learning of Context-Dependent Targets in Deep Neural Network Acoustic Models},
  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  number = {2},
  abstract = {This paper investigates the use of multitask learning to improve context-dependent deep neural network (DNN) acoustic models. The use of hybrid DNN systems with clustered triphone targets is now standard in automatic speech recognition. However, we suggest that using a single set of DNN targets in this manner may not be the most effective choice, since the targets are the result of a somewhat arbitrary clustering process that may not be optimal for discrimination. We propose to remedy this problem through the addition of secondary tasks predicting alternative context-dependent or context-independent targets. We present a comprehensive set of experiments on a lecture recognition task showing that DNNs trained through multitask learning in this manner give consistently improved performance compared to standard hybrid DNNs. The technique is evaluated across a range of data and output sizes. Improvements are seen when training uses the cross entropy criterion and also when sequence training is applied.},
  volume = {25},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/master_final_1.pdf},
  pages = {238--247}
}
@inproceedings{ali17_werd,
  author = {Ali, Ahmed and Nakov, Preslav and Bell, Peter and Renals, Steve},
  publisher = {IEEE},
  title = {WERd: Using Social Text Spelling Variants for Evaluating Dialectal Speech Recognition},
  booktitle = {Proc. ASRU},
  month = {December},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/WERd_ASRU_2017.pdf},
  abstract = {We study the problem of evaluating automatic speech recognition (ASR) systems that target dialectal speech input. A major challenge in this case is that the orthography of dialects is typically not standardized. From an ASR evaluation perspective, this means that there is no clear gold standard for the expected output, and several possible outputs could be considered correct according to different human annotators, which makes standard word error rate (WER) inadequate as an evaluation metric. Such a situation is typical for machine translation (MT), and thus we borrow ideas from an MT evaluation metric, namely TERp, an extension of translation error rate which is closely-related to WER. In particular, in the process of comparing a hypothesis to a reference, we make use of spelling variants for words and phrases, which we mine from Twitter in an unsupervised fashion. Our experiments with evaluating ASR output for Egyptian Arabic, and further manual analysis, show that the resulting WERd (i.e., WER for dialects) metric, a variant of TERp, is more adequate than WER for evaluating dialectal ASR.}
}
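A toy sketch of the idea behind WERd as described in the abstract above: word error rate computed with an edit distance in which substituting a word for one of its known spelling variants costs nothing. In the paper the variants are mined from Twitter and may cover phrases; here a hand-written, single-word variant dictionary stands in, and all names are assumptions rather than the authors' implementation.

import numpy as np

def werd(ref, hyp, variants):
    # Word error rate where substituting a word for one of its known spelling
    # variants is free; 'variants' maps a word to a set of acceptable spellings.
    def match(r, h):
        return h == r or h in variants.get(r, set()) or r in variants.get(h, set())

    d = np.zeros((len(ref) + 1, len(hyp) + 1), dtype=int)
    d[:, 0] = np.arange(len(ref) + 1)
    d[0, :] = np.arange(len(hyp) + 1)
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            sub = 0 if match(ref[i - 1], hyp[j - 1]) else 1
            d[i, j] = min(d[i - 1, j] + 1,         # deletion
                          d[i, j - 1] + 1,         # insertion
                          d[i - 1, j - 1] + sub)   # (variant-aware) substitution
    return d[-1, -1] / max(len(ref), 1)

# Hand-written variants stand in for the Twitter-mined ones used in the paper.
variants = {"inshallah": {"insha'allah", "inshaallah"}}
print(werd("he said inshallah twice".split(), "he said insha'allah twice".split(), variants))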
@inproceedings{liepins17_summa_platform,
  author = {Liepins, Renars and Germann, Ulrich and Barzdins, Guntis and Birch, Alexandra and Renals, Steve and Weber, Susanne and Kreeft, {Peggy van der} and Bourlard, Hervé and Prieto, João and Klejch, Ondřej and Bell, Peter and Lazaridis, Alexandros and Mendes, Alfonso and Riedel, Sebastian and Almeida, {Mariana S. C.} and Balage, Pedro and Cohen, Shay and Dwojak, Tomasz and Garner, Phil and Giefer, Andreas and Junczys-Dowmunt, Marcin and Imrani, Hina and Nogueira, David and Ali, Ahmed and Miranda, Sebastião and Popescu-Belis, Andrei and Werlen, {Lesly Miculicich} and Papasarantopoulos, Nikos and Obamuyide, Abiola and Jones, Clive and Dalvi, Fahim and Vlachos, Andreas and Wang, Yang and Tong, Sibo and Sennrich, Rico and Pappas, Nikolaos and Narayan, Shashi and Damonte, Marco and Durrani, Nadir and Khurana, Sameer and Abdelali, Ahmed and Sajjad, Hassan and Vogel, Stephan and Sheppey, David and Hernon, Chris},
  publisher = {Association for Computational Linguistics (ACL)},
  isbn = {978-1-945626-34-0},
  title = {The SUMMA Platform Prototype},
  booktitle = {Proceedings of the EACL 2017 Software Demonstrations},
  abstract = {We present the first prototype of the SUMMA Platform: an integrated platform for multilingual media monitoring. The platform contains a rich suite of low-level and high-level natural language processing technologies: automatic speech recognition of broadcast media, machine translation, automated tagging and classification of named entities, semantic parsing to detect relationships between entities, and automatic construction / augmentation of factual knowledge bases. Implemented on the Docker platform, it can easily be deployed, customised, and scaled to large volumes of incoming media streams.},
  month = {April},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/E17_3029.pdf},
  pages = {116--119}
}
@inproceedings{ali16_mgb2,
  author = {Ali, A. and Bell, P. and Glass, J. and Messaoui, Y. and Mubarak, H. and Renals, S. and Zhang, Y.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/mgb_2_challenge_1.pdf},
  booktitle = {Proc. SLT},
  year = {2016},
  abstract = {This paper describes the Arabic Multi-Genre Broadcast (MGB-2) Challenge for SLT-2016. Unlike last year’s English MGB Challenge, which focused on recognition of diverse TV genres, this year, the challenge has an emphasis on handling the diversity in dialect in Arabic speech. Audio data comes from 19 distinct programmes from the Aljazeera Arabic TV channel between March 2005 and December 2015. Programmes are split into three groups: conversations, interviews, and reports. A total of 1,200 hours have been released with lightly supervised transcriptions for the acoustic modelling. For language modelling, we made available over 110M words crawled from Aljazeera Arabic website Aljazeera.net for a 10 year duration 2000-2011. Two lexicons have been provided, one phoneme based and one grapheme based. Finally, two tasks were proposed for this year’s challenge: standard speech transcription, and word alignment. This paper describes the task data and evaluation process used in the MGB challenge, and summarises the results obtained.},
  title = {The {MGB-2} {C}hallenge: {A}rabic multi-dialect broadcast media recognition}
}
@inproceedings{tsunoo17_rnn,
  author = {Tsunoo, Emiru and Bell, Peter and Renals, Steve},
  title = {Hierarchical Recurrent Neural Network for Story Segmentation},
  booktitle = {Proc. Interspeech},
  month = {August},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/Interspeech2017Tsunoo_v7.pdf},
  abstract = {A broadcast news stream consists of a number of stories and each story consists of several sentences. We capture this structure using a hierarchical model based on a word-level Recurrent Neural Network (RNN) sentence modeling layer and a sentence-level bidirectional Long Short-Term Memory (LSTM) topic modeling layer. First, the word-level RNN layer extracts a vector embedding the sentence information from the given transcribed lexical tokens of each sentence. These sentence embedding vectors are fed into a bidirectional LSTM that models the sentence and topic transitions. A topic posterior for each sentence is estimated discriminatively and a Hidden Markov model (HMM) follows to decode the story sequence and identify story boundaries. Experiments on the topic detection and tracking (TDT2) task indicate that the hierarchical RNN topic modeling achieves the best story segmentation performance with a higher F1-measure compared to conventional state-of-the-art methods. We also compare variations of our model to infer the optimal structure for the story segmentation task.}
}
@inproceedings{rownicka17,
  author = {Rownicka, Joanna and Renals, Steve and Bell, Peter},
  title = {Simplifying very deep convolutional neural network architectures for robust speech recognition},
  booktitle = {Proc. 2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), Okinawa, Japan},
  month = {December},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/rownicka_asru17.pdf},
  abstract = {Very deep convolutional neural networks (VDCNNs) have been successfully used in computer vision. More recently VDCNNs have been applied to speech recognition, using architectures adopted from computer vision. In this paper, we experimentally analyse the role of the components in VDCNN architectures for robust speech recognition. We have proposed a number of simplified VDCNN architectures, taking into account the use of fully-connected layers and down-sampling approaches. We have investigated three ways to down-sample feature maps: max-pooling, average-pooling, and convolution with increased stride. Our proposed model consisting solely of convolutional (conv) layers, and without any fully-connected layers, achieves a lower word error rate on Aurora 4 compared to other VDCNN architectures typically used in speech recognition. We have also extended our experiments to the MGB-3 task of multi-genre broadcast recognition using BBC TV recordings. The MGB-3 results indicate that the same architecture achieves the best result among our VDCNNs on this task as well.},
  categories = {Robust Speech Recognition, Very Deep Convolutional Neural Networks, Aurora 4, MGB Challenge}
}
@inproceedings{klejch2016slt,
  author = {Klejch, Ondrej and Bell, Peter and Renals, Steve},
  title = {Punctuated transcription of multi-genre broadcasts using acoustic and lexical approaches},
  abstract = {In this paper we investigate the punctuated transcription of multi-genre broadcast media. We examine four systems, three of which are based on lexical features, the fourth of which uses acoustic features by integrating punctuation into the speech recognition acoustic models. We also explore the combination of these component systems using voting and log-linear interpolation. We performed experiments on the English language MGB Challenge data, which comprises about 1,600h of BBC television recordings. Our results indicate that a lexical system, based on a neural machine translation approach is significantly better than other systems achieving an F-Measure of 62.6% on reference text, with a relative degradation of 19% on ASR output. Our analysis of the results in terms of specific punctuation indicated that using longer context improves the prediction of question marks and acoustic information improves prediction of exclamation marks. Finally, we show that even though the systems are complementary, their straightforward combination does not yield better F-measures than a single system using neural machine translation.},
  year = {2016},
  month = {December},
  address = {San Diego, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/slt-2016.pdf},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  categories = {punctuation, speech recognition, neural machine translation, rich transcription}
}
@inproceedings{klejch2017icassp,
  author = {Klejch, Ondrej and Bell, Peter and Renals, Steve},
  title = {Sequence-to-sequence models for punctuated transcription combining lexical and acoustic features},
  abstract = {In this paper we present an extension of our previously described neural machine translation based system for punctuated transcription. This extension allows the system to map from per frame acoustic features to word level representations by replacing the traditional encoder in the encoder-decoder architecture with a hierarchical encoder. Furthermore, we show that a system combining lexical and acoustic features significantly outperforms systems using only a single source of features on all measured punctuation marks. The combination of lexical and acoustic features achieves a significant improvement in F-Measure of 1.5 absolute over the purely lexical neural machine translation based system.},
  year = {2017},
  month = {March},
  address = {New Orleans, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/icassp-2017.pdf},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  categories = {punctuation, speech recognition, neural machine translation, rich transcription}
}
@inproceedings{tsunoo2017asru,
  author = {Tsunoo, Emiru and Klejch, Ondrej and Bell, Peter and Renals, Steve},
  title = {Hierarchical recurrent neural network for story segmentation using fusion of lexical and acoustic features},
  abstract = {A broadcast news stream consists of a number of stories and it is an important task to find the boundaries of stories automatically in news analysis. We capture the topic structure using a hierarchical model based on a Recurrent Neural Network (RNN) sentence modeling layer and a bidirectional Long Short-Term Memory (LSTM) topic modeling layer, with a fusion of acoustic and lexical features. Both features are accumulated with RNNs and trained jointly within the model to be fused at the sentence level. We conduct experiments on the topic detection and tracking (TDT4) task comparing combinations of two modalities trained with limited amount of parallel data. Further we utilize additional sufficient text data for training to polish our model. Experimental results indicate that the hierarchical RNN topic modeling takes advantage of the fusion scheme, especially with additional text training data, with a higher F1-measure compared to conventional state-of-the-art methods.},
  year = {2017},
  month = {December},
  address = {Okinawa, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/ASRU2017Tsunoo_v7.pdf},
  booktitle = {Proc. IEEE Automatic Speech Recognition and Understanding Workshop},
  categories = {spoken document processing, recurrent neural network, topic modeling, story segmentation, multimodal features}
}