The Centre for Speech Technology Research, The University of Edinburgh

Publications by Peter Bell

s0566164.bib

@inproceedings{hasler2012,
  author = {Hasler, Eva and Bell, Peter and Ghoshal, Arnab and Haddow, Barry and Koehn, Philipp and McInnes, Fergus and Renals, Steve and Swietojanski, Pawel},
  title = {The {UEDIN} system for the {IWSLT} 2012 evaluation},
  booktitle = {Proc. International Workshop on Spoken Language Translation},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/paper_50.pdf},
  abstract = {This paper describes the University of Edinburgh (UEDIN) systems for the IWSLT 2012 Evaluation. We participated in the ASR (English), MT (English-French, German-English) and SLT (English-French) tracks.}
}
@inproceedings{rasipuram13_gaelic_graphemes,
  author = {Rasipuram, Ramya and Bell, Peter and Magimai.-Doss, Mathew},
  title = {Grapheme and multilingual posterior features for under-resourced speech recognition: a study on {S}cottish {G}aelic},
  abstract = {Standard automatic speech recognition (ASR) systems use phonemes as subword units. Thus, one of the primary resources required to build a good ASR system is a well developed phoneme pronunciation lexicon. However, under-resourced languages typically lack such lexical resources. In this paper, we investigate recently proposed grapheme-based ASR in the framework of Kullback-Leibler divergence based hidden Markov model (KL-HMM) for under-resourced languages, particularly Scottish Gaelic which has no lexical resources. More specifically, we study the use of grapheme and multilingual phoneme class conditional probabilities (posterior features) as feature observations in the KL-HMM. ASR studies conducted show that the proposed approach yields better system compared to the conventional HMM/GMM approach using cepstral features. Furthermore, grapheme posterior features estimated using both auxiliary data and Gaelic data yield the best system.},
  address = {Vancouver, Canada},
  month = may,
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/gaelic_graphemes_icassp13.pdf},
  booktitle = {Proc. ICASSP}
}
@phdthesis{bell_phd_thesis,
  author = {Bell, Peter},
  title = {Full covariance modelling for speech recognition},
  school = {University of Edinburgh},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/thesis.pdf},
  abstract = {HMM-based systems for Automatic Speech Recognition typically model the acoustic features using mixtures of multivariate Gaussians. In this thesis, we consider the problem of learning a suitable covariance matrix for each Gaussian. A variety of schemes have been proposed for controlling the number of covariance parameters per Gaussian, and studies have shown that in general, the greater the number of parameters used in the models, the better the recognition performance. We therefore investigate systems with full covariance Gaussians. However, in this case, the obvious choice of parameters -- given by the sample covariance matrix -- leads to matrices that are poorly-conditioned, and do not generalise well to unseen test data. The problem is particularly acute when the amount of training data is limited. We propose two solutions to this problem: firstly, we impose the requirement that each matrix should take the form of a Gaussian graphical model, and introduce a method for learning the parameters and the model structure simultaneously. Secondly, we explain how an alternative estimator, the shrinkage estimator, is preferable to the standard maximum likelihood estimator, and derive formulae for the optimal shrinkage intensity within the context of a Gaussian mixture model. We show how this relates to the use of a diagonal covariance smoothing prior. We compare the effectiveness of these techniques to standard methods on a phone recognition task where the quantity of training data is artificially constrained. We then investigate the performance of the shrinkage estimator on a large-vocabulary conversational telephone speech recognition task. Discriminative training techniques can be used to compensate for the invalidity of the model correctness assumption underpinning maximum likelihood estimation. On the large-vocabulary task, we use discriminative training of the full covariance models and diagonal priors to yield improved recognition performance.}
}
@inproceedings{bell13_mlan,
  author = {Bell, Peter and Swietojanski, Pawel and Renals, Steve},
  doi = {10.1109/ICASSP.2013.6639014},
  title = {Multi-level adaptive networks in tandem and hybrid {ASR} systems},
  abstract = {In this paper we investigate the use of Multi-level adaptive networks (MLAN) to incorporate out-of-domain data when training large vocabulary speech recognition systems. In a set of experiments on multi-genre broadcast data and on TED lecture recordings we present results using of out-of-domain features in a hybrid DNN system and explore tandem systems using a variety of input acoustic features. Our experiments indicate using the MLAN approach in both hybrid and tandem systems results in consistent reductions in word error rate of 5--10\% relative.},
  address = {Vancouver, Canada},
  month = may,
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/mlan_icassp2013.pdf},
  booktitle = {Proc. ICASSP}
}
@inproceedings{dzikovskaSIGDIAL20112,
  author = {Dzikovska, Myroslava and Isard, Amy and Bell, Peter and Moore, Johanna and Steinhauser, Natalie and Campbell, Gwendolyn},
  publisher = {Association for Computational Linguistics},
  title = {{Beetle II}: an adaptable tutorial dialogue system},
  url = {http://www.aclweb.org/anthology/W11-2041},
  booktitle = {Proceedings of the SIGDIAL 2011 Conference, demo session},
  address = {Portland, Oregon},
  month = jun,
  pages = {338--340},
  year = {2011},
  abstract = {We present Beetle II, a tutorial dialogue system which accepts unrestricted language input and supports experimentation with different tutorial planning and dialogue strategies. Our first system evaluation compared two tutorial policies and demonstrated that the system can be used to study the impact of different approaches to tutoring. The system is also designed to allow experimentation with a variety of natural language techniques, and discourse and dialogue strategies.}
}
@inproceedings{bell_king_shrinkage_is2008,
  author = {Bell, Peter and King, Simon},
  title = {A Shrinkage Estimator for Speech Recognition with Full Covariance {HMM}s},
  booktitle = {Proc. Interspeech},
  address = {Brisbane, Australia},
  month = sep,
  note = {Shortlisted for best student paper award.},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/shrinkage_is2008.pdf},
  abstract = {We consider the problem of parameter estimation in full-covariance Gaussian mixture systems for automatic speech recognition. Due to the high dimensionality of the acoustic feature vector, the standard sample covariance matrix has a high variance and is often poorly-conditioned when the amount of training data is limited. We explain how the use of a shrinkage estimator can solve these problems, and derive a formula for the optimal shrinkage intensity. We present results of experiments on a phone recognition task, showing that the estimator gives a performance improvement over a standard full-covariance system.}
}
@inproceedings{DBLP:conf/aied/DzikovskaIBMSCTCS11,
  author = {Dzikovska, Myroslava and Isard, Amy and Bell, Peter and Moore, Johanna D. and Steinhauser, Natalie B. and Campbell, Gwendolyn E. and Taylor, Leanne S. and Caine, Simon and Scott, Charlie},
  title = {Adaptive Intelligent Tutorial Dialogue in the {Beetle II} System},
  booktitle = {Artificial Intelligence in Education - 15th International Conference (AIED 2011), interactive event},
  series = {Lecture Notes in Computer Science},
  volume = {6738},
  pages = {621},
  address = {Auckland, New Zealand},
  publisher = {Springer},
  year = {2011},
  doi = {10.1007/978-3-642-21869-9_122}
}
@inproceedings{bell12_mlan,
  author = {Bell, P. and Gales, M. and Lanchantin, P. and Liu, X. and Long, Y. and Renals, S. and Swietojanski, P. and Woodland, P.},
  doi = {10.1109/SLT.2012.6424244},
  title = {Transcription of multi-genre media archives using out-of-domain data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address = {Miami, Florida, USA},
  month = dec,
  pages = {324--329},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/mlan_slt2012.pdf},
  abstract = {We describe our work on developing a speech recognition system for multi-genre media archives. The high diversity of the data makes this a challenging recognition task, which may benefit from systems trained on a combination of in-domain and out-of-domain data. Working with tandem HMMs, we present Multi-level Adaptive Networks (MLAN), a novel technique for incorporating information from out-of-domain posterior features using deep neural networks. We show that it provides a substantial reduction in WER over other systems, with relative WER reductions of 15\% over a PLP baseline, 9\% over in-domain tandem features and 8\% over the best out-of-domain tandem features.}
}
@inproceedings{dongwang_interspeech09_conf,
  author = {Wang, Dong and King, Simon and Frankel, Joe and Bell, Peter},
  title = {Term-Dependent Confidence for Out-of-Vocabulary Term Detection},
  booktitle = {Proc. Interspeech},
  address = {Brighton, UK},
  month = sep,
  pages = {2139--2142},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/conf.pdf},
  abstract = {Within a spoken term detection (STD) system, the decision maker plays an important role in retrieving reliable detections. Most of the state-of-the-art STD systems make decisions based on a confidence measure that is term-independent, which poses a serious problem for out-of-vocabulary (OOV) term detection. In this paper, we study a term-dependent confidence measure based on confidence normalisation and discriminative modelling, particularly focusing on its remarkable effectiveness for detecting OOV terms. Experimental results indicate that the term-dependent confidence provides much more significant improvement for OOV terms than terms in-vocabulary.},
  categories = {joint-multigram, pronunciation model, spoken term detection, speech recognition}
}
@inproceedings{bell12_tutoring,
  author = {Bell, Peter and Dzikovska, Myroslava and Isard, Amy},
  title = {Designing a spoken language interface for a tutorial dialogue system},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  month = sep,
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/tutoring_is2012.pdf},
  abstract = {We describe our work in building a spoken language interface for a tutorial dialogue system. Our goal is to allow natural, unrestricted student interaction with the computer tutor, which has been shown to improve the student's learning gain, but presents challenges for speech recognition and spoken language understanding. We discuss the choice of system components and present the results of development experiments in both acoustic and language modelling for speech recognition in this domain.}
}
@inproceedings{stan12_grapheme_alignment,
  author = {Stan, Adriana and Bell, Peter and King, Simon},
  title = {A Grapheme-based Method for Automatic Alignment of Speech and Text Data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address = {Miami, Florida, USA},
  month = dec,
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/grapheme_alignment_slt2012.pdf},
  abstract = {This paper introduces a method for automatic alignment of speech data with unsynchronised, imperfect transcripts, for a domain where no initial acoustic models are available. Using grapheme-based acoustic models, word skip networks and orthographic speech transcripts, we are able to harvest 55\% of the speech with a 93\% utterance-level accuracy and 99\% word accuracy for the produced transcriptions. The work is based on the assumption that there is a high degree of correspondence between the speech and text, and that a full transcription of all of the speech is not required. The method is language independent and the only prior knowledge and resources required are the speech and text transcripts, and a few minor user interventions.}
}
@inproceedings{wang_std_covariance_icassp2010,
  author = {Wang, Dong and King, Simon and Frankel, Joe and Bell, Peter},
  title = {Stochastic Pronunciation Modelling and Soft Match for Out-of-vocabulary Spoken Term Detection},
  booktitle = {Proc. ICASSP},
  address = {Dallas, Texas, USA},
  month = mar,
  year = {2010},
  keywords = {confidence estimation, spoken term detection, speech recognition},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang10_icassp.pdf},
  abstract = {A major challenge faced by a spoken term detection (STD) system is the detection of out-of-vocabulary (OOV) terms. Although a subword-based STD system is able to detect OOV terms, performance reduction is always observed compared to in-vocabulary terms. One challenge that OOV terms bring to STD is the pronunciation uncertainty. A commonly used approach to address this problem is a soft matching procedure, and the other is the stochastic pronunciation modelling (SPM) proposed by the authors. In this paper we compare these two approaches, and combine them using a discriminative decision strategy. Experimental results demonstrated that SPM and soft match are highly complementary, and their combination gives significant performance improvement to OOV term detection.}
}
@inproceedings{bell_burrows_taylor_sp2006,
  author = {Bell, Peter and Burrows, Tina and Taylor, Paul},
  title = {Adaptation of Prosodic Phrasing Models},
  booktitle = {Proc. Speech Prosody 2006},
  address = {Dresden, Germany},
  month = may,
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/phrasing_sp2006.pdf},
  abstract = {There is considerable variation in the prosodic phrasing of speech between different speakers and speech styles. Due to the time and cost of obtaining large quantities of data to train a model for every variation, it is desirable to develop models that can be adapted to new conditions with a limited amount of training data. We describe a technique for adapting HMM-based phrase boundary prediction models which alters a statistic distribution of prosodic phrase lengths. The adapted models show improved prediction performance across different speakers and types of spoken material.}
}
@inproceedings{bell_king_full_covariance_asru2009,
  author = {Bell, Peter and King, Simon},
  doi = {10.1109/ASRU.2009.5373344},
  title = {Diagonal Priors for Full Covariance Speech Recognition},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding},
  address = {Merano, Italy},
  month = dec,
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/shrinkage_asru2009.pdf},
  abstract = {We investigate the use of full covariance Gaussians for large-vocabulary speech recognition. The large number of parameters gives high modelling power, but when training data is limited, the standard sample covariance matrix is often poorly conditioned, and has high variance. We explain how these problems may be solved by the use of a diagonal covariance smoothing prior, and relate this to the shrinkage estimator, for which the optimal shrinkage parameter may itself be estimated from the training data. We also compare the use of generatively and discriminatively trained priors. Results are presented on a large vocabulary conversational telephone speech recognition task.}
}
@inproceedings{dzikovska-EtAl:2012:EACL2012,
  author = {Dzikovska, Myroslava O. and Bell, Peter and Isard, Amy and Moore, Johanna D.},
  publisher = {Association for Computational Linguistics},
  title = {Evaluating language understanding accuracy with respect to objective outcomes in a dialogue system},
  url = {http://www.aclweb.org/anthology/E12-1048},
  booktitle = {Proceedings of the 13th Conference of the European Chapter of the Association for Computational Linguistics},
  address = {Avignon, France},
  month = apr,
  year = {2012},
  pages = {471--481}
}
@inproceedings{bell_king_is2007,
  author = {Bell, Peter and King, Simon},
  title = {Sparse Gaussian Graphical Models for Speech Recognition},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  month = aug,
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sparseGM_is2007.pdf},
  abstract = {We address the problem of learning the structure of Gaussian graphical models for use in automatic speech recognition, a means of controlling the form of the inverse covariance matrices of such systems. With particular focus on data sparsity issues, we implement a method for imposing graphical model structure on a Gaussian mixture system, using a convex optimisation technique to maximise a penalised likelihood expression. The results of initial experiments on a phone recognition task show a performance improvement over an equivalent full-covariance system.},
  categories = {speech recognition, acoustic models, graphical models, precision matrix models}
}
@inproceedings{bell_king_lineSearch_is2008,
  author = {Bell, Peter and King, Simon},
  title = {Covariance Updates for Discriminative Training by Constrained Line Search},
  booktitle = {Proc. Interspeech},
  address = {Brisbane, Australia},
  month = sep,
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/lineSearch_is2008.pdf},
  abstract = {We investigate the recent Constrained Line Search algorithm for discriminative training of HMMs and propose an alternative formula for variance update. We compare the method to standard techniques on a phone recognition task.}
}
@inproceedings{bell13_lecture_transcription,
  author = {Bell, Peter and Yamamoto, Hitoshi and Swietojanski, Pawel and Wu, Youzheng and McInnes, Fergus and Hori, Chiori and Renals, Steve},
  title = {A lecture transcription system combining neural network acoustic and language models},
  booktitle = {Proc. Interspeech},
  year = {2013},
  month = aug,
  address = {Lyon, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lecture_transcription_is2013.pdf},
  abstract = {This paper presents a new system for automatic transcription of lectures. The system combines a number of novel features, including deep neural network acoustic models using multi-level adaptive networks to incorporate out-of-domain information, and factored recurrent neural network language models. We demonstrate that the system achieves large improvements on the TED lecture transcription task from the 2012 IWSLT evaluation -- our results are currently the best reported on this task, showing a relative WER reduction of more than 16\% compared to the closest competing system from the evaluation.}
}
@inproceedings{stan13_lightly_supervised_discriminative,
  author = {Stan, Adriana and Bell, Peter and Yamagishi, Junichi and King, Simon},
  title = {Lightly Supervised Discriminative Training of Grapheme Models for Improved Sentence-level Alignment of Speech and Text Data},
  booktitle = {Proc. Interspeech},
  year = {2013},
  month = aug,
  address = {Lyon, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lightly_supervised_discriminative_is2013.pdf},
  abstract = {This paper introduces a method for lightly supervised discriminative training using MMI to improve the alignment of speech and text data for use in training HMM-based TTS systems for low-resource languages. In TTS applications, due to the use of long-span contexts, it is important to select training utterances which have wholly correct transcriptions. In a low-resource setting, when using poorly trained grapheme models, we show that the use of MMI discriminative training at the grapheme-level enables us to increase the amount of correctly aligned data by 40\%, while maintaining a 7\% sentence error rate and 0.8\% word error rate. We present the procedure for lightly supervised discriminative training with regard to the objective of minimising sentence error rate.}
}
@inproceedings{christensen13_disordered,
  author = {Christensen, H. and Aniol, M. and Bell, P. and Green, P. and Hain, T. and King, S. and Swietojanski, P.},
  title = {Combining in-domain and out-of-domain speech data for automatic recognition of disordered speech},
  booktitle = {Proc. Interspeech},
  year = {2013},
  month = aug,
  address = {Lyon, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/christensen_is13_2_final.pdf},
  abstract = {Recently there has been increasing interest in ways of using out-of-domain (OOD) data to improve automatic speech recognition performance in domains where only limited data is available. This paper focuses on one such domain, namely that of disordered speech for which only very small databases exist, but where normal speech can be considered OOD. Standard approaches for handling small data domains use adaptation from OOD models into the target domain, but here we investigate an alternative approach with its focus on the feature extraction stage: OOD data is used to train feature-generating deep belief neural networks. Using AMI meeting and TED talk datasets, we investigate various tandem-based speaker independent systems as well as maximum a posteriori adapted speaker dependent systems. Results on the UAspeech isolated word task of disordered speech are very promising with our overall best system (using a combination of AMI and TED data) giving a correctness of 62.5\%; an increase of 15\% on previously best published results based on conventional model adaptation. We show that the relative benefit of using OOD data varies considerably from speaker to speaker and is only loosely correlated with the severity of a speaker's impairments.}
}
@inproceedings{lanchantin13_multigenre_transcription,
  author = {Lanchantin, P. and Bell, P. and Gales, M. and Hain, T. and Liu, X. and Long, Y. and Quinnell, J. and Renals, S. and Saz, O. and Seigel, M. and Swietojanski, P. and Woodland, P.},
  title = {Automatic Transcription of Multi-genre Media Archives},
  booktitle = {Proc. Workshop on Speech, Language and Audio in Multimedia},
  address = {Marseille, France},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lanchantin13_multigenre_transcription.pdf},
  abstract = {This paper describes some recent results of our collaborative work on developing a speech recognition system for the automatic transcription of media archives from the British Broadcasting Corporation (BBC). Material includes a high diversity of shows with their associated transcriptions. The latter are highly diverse in terms of completeness, reliability and accuracy. First, we investigate how to improve lightly supervised acoustic training when time-stamps information is inaccurate or when speech deviates significantly from the transcription. To address the last issue, word and segment level combination approaches are used between the lightly supervised transcripts and the original programme scripts which yield improved transcriptions. Experimental results show that systems trained using these improved transcriptions consistently outperform those trained using only the original lightly supervised decoding hypotheses. Secondly, we show that the recognition task may benefit from systems trained on a combination of in-domain and out-of-domain data. Working with tandem HMMs, we present Multi-level Adaptive Networks, a novel technique for incorporating information from out-of domain posterior features using deep neural network. We show that it provides a substantial reduction in WER over other systems including PLP baseline, in-domain tandem features and best out-of-domain tandem features.}
}
@inproceedings{bourlard_slam2013,
  author = {Bourlard, H. and Ferras, M. and Pappas, N. and Popescu-Belis, A. and Renals, S. and McInnes, F. and Bell, P. and Ingram, S. and Guillemot, M.},
  title = {Processing and Linking Audio Events in Large Multimedia Archives: The {EU} {inEvent} Project},
  booktitle = {Proceedings of SLAM 2013 (First Workshop on Speech, Language and Audio in Multimedia)},
  address = {Marseille, France},
  month = aug,
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/bourlard_slam2013.pdf},
  abstract = {In the inEvent EU project, we aim at structuring, retrieving, and sharing large archives of networked, and dynamically changing, multimedia recordings, mainly consisting of meetings, videoconferences, and lectures. More specifically, we are developing an integrated system that performs audiovisual processing of multimedia recordings, and labels them in terms of interconnected "hyper-events" (a notion inspired from hyper-texts). Each hyper-event is composed of simpler facets, including audio-video recordings and metadata, which are then easier to search, retrieve and share. In the present paper, we mainly cover the audio processing aspects of the system, including speech recognition, speaker diarization and linking (across recordings), the use of these features for hyper-event indexing and recommendation, and the search portal. We present initial results for feature extraction from lecture recordings using the TED talks.},
  categories = {networked multimedia events, audio processing: speech recognition, speaker diarization and linking, multimedia indexing and searching, hyper-events}
}
@inproceedings{Mamiya_SSW8,
  author = {Mamiya, Yoshitaka and Stan, Adriana and Yamagishi, Junichi and Bell, Peter and Watts, Oliver and Clark, Robert and King, Simon},
  title = {Using Adaptation to Improve Speech Transcription Alignment in Noisy and Reverberant Environments},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  month = aug,
  pages = {61--66},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS1-4_Mamiya.pdf},
  abstract = {When using data retrieved from the internet to create new speech databases, the recording conditions can often be highly variable within and between sessions. This variance influences the overall performance of any automatic speech and text alignment techniques used to process this data. In this paper we discuss the use of speaker adaptation methods to address this issue. Starting from a baseline system for automatic sentence-level segmentation and speech and text alignment based on GMMs and grapheme HMMs, respectively, we employ Maximum A Posteriori (MAP) and Constrained Maximum Likelihood Linear Regression (CMLLR) techniques to model the variation in the data in order to increase the amount of confidently aligned speech. We tested 29 different scenarios, which include reverberation, 8 talker babble noise and white noise, each in various combinations and SNRs. Results show that the MAP-based segmentation's performance is very much influenced by the noise type, as well as the presence or absence of reverberation. On the other hand, the CMLLR adaptation of the acoustic models gives an average 20\% increase in the aligned data percentage for the majority of the studied scenarios.}
}
@inproceedings{jdriesen:iwslt_german,
  author = {Driesen, Joris and Bell, Peter and Sinclair, Mark and Renals, Steve},
  title = {Description of the {UEDIN} system for {German ASR}},
  booktitle = {Proc. IWSLT},
  address = {Heidelberg, Germany},
  month = dec,
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/german_iwslt.pdf},
  abstract = {In this paper we describe the ASR system for German built at the University of Edinburgh (UEDIN) for the 2013 IWSLT evaluation campaign. For ASR, the major challenge to overcome, was to find suitable acoustic training data. Due to the lack of expertly transcribed acoustic speech data for German, acoustic model training had to be performed on publicly available data crawled from the internet. For evaluation, lack of a manual segmentation into utterances was handled in two different ways: by generating an automatic segmentation, and by treating entire input files as a single segment. Demonstrating the latter method is superior in the current task, we obtained a WER of 28.16\% on the dev set and 36.21\% on the test set.}
}
@inproceedings{bell13_iwslt,
  author = {Bell, Peter and McInnes, Fergus and Gangireddy, Siva Reddy and Sinclair, Mark and Birch, Alexandra and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/bell13_iwslt_system.pdf},
  booktitle = {Proc. International Workshop on Spoken Language Translation},
  year = {2013},
  abstract = {This paper describes the University of Edinburgh (UEDIN) English ASR system for the IWSLT 2013 Evaluation. Notable features of the system include deep neural network acoustic models in both tandem and hybrid configuration, cross-domain adaptation with multi-level adaptive networks, and the use of a recurrent neural network language model. Improvements to our system since the 2012 evaluation -- which include the use of a significantly improved n-gram language model -- result in a 19\% relative WER reduction on the tst2013 set.},
  title = {The {UEDIN} English {ASR} System for the {IWSLT} 2013 Evaluation}
}
@inproceedings{bell14_xling_mlan,
  author = {Bell, P. and Driesen, J. and Renals, S.},
  title = {Cross-lingual adaptation with multi-task adaptive networks},
  booktitle = {Proc. Interspeech},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell_xling_mlan_is2014.pdf},
  abstract = {Posterior-based or bottleneck features derived from neural networks trained on out-of-domain data may be successfully applied to improve speech recognition performance when data is scarce for the target domain or language. In this paper we combine this approach with the use of a hierarchical deep neural network (DNN) network structure -- which we term a multi-level adaptive network (MLAN) -- and the use of multitask learning. We have applied the technique to cross-lingual speech recognition experiments on recordings of TED talks and European Parliament sessions in English (source language) and German (target language). We demonstrate that the proposed method can lead to improvements over standard methods, even when the quantity of training data for the target language is relatively high. When the complete method is applied, we achieve relative WER reductions of around 13\% compared to a monolingual hybrid DNN baseline.}
}
@inproceedings{cervone14_attribution_relations,
  author = {Cervone, A. and Pareti, S. and Bell, P. and Prodanof, I. and Caselli, T.},
  title = {Detecting attribution relations in speech: a corpus study},
  booktitle = {Proc. Italian Conference on Computational Linguistics},
  address = {Pisa, Italy},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/CLiCCervoneDetectingAttribution.pdf},
  abstract = {In this work we present a methodology for the annotation of Attribution Relations (ARs) in speech which we apply to create a pilot corpus of spoken informal dialogues. This represents the first step towards the creation of a resource for the analysis of ARs in speech and the development of automatic extraction systems. Despite its relevance for speech recognition systems and spoken language understanding, the relation holding between quotations and opinions and their source has been studied and extracted only in written corpora, characterized by a formal register (news, literature, scientific articles). The shift to the informal register and to a spoken corpus widens our view of this relation and poses new challenges. Our hypothesis is that the decreased reliability of the linguistic cues found for written corpora in the fragmented structure of speech could be overcome by including prosodic clues in the system. The analysis of SARC confirms the hypothesis showing the crucial role played by the acoustic level in providing the missing lexical clues.}
}
@inproceedings{sinclairbell_interspeech14,
  author = {Sinclair, Mark and Bell, Peter and Birch, Alexandra and McInnes, Fergus},
  title = {A semi-Markov model for speech segmentation with an utterance-break prior},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/interspeech2014.pdf},
  abstract = {Speech segmentation is the problem of finding the end points of a speech utterance for passing to an automatic speech recognition (ASR) system. The quality of this segmentation can have a large impact on the accuracy of the ASR system; in this paper we demonstrate that it can have an even larger impact on downstream natural language processing tasks -- in this case, machine translation. We develop a novel semi-Markov model which allows the segmentation of audio streams into speech utterances which are optimised for the desired distribution of sentence lengths for the target domain. We compare this with existing state-of-the-art methods and show that it is able to achieve not only improved ASR performance, but also to yield significant benefits to a speech translation task.},
  categories = {speech activity detection, speech segmentation, machine translation, speech recognition}
}
@inproceedings{Swietojanski2015,
  author = {Swietojanski, P. and Bell, P. and Renals, S.},
  title = {Structured Output Layer with Auxiliary Targets for Context-Dependent Acoustic Modelling},
  booktitle = {Proc. Interspeech},
  address = {Dresden, Germany},
  month = sep,
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/Swietojanski_Interspeech2015.pdf},
  abstract = {In previous work we have introduced a multi-task training technique for neural network acoustic modelling, in which context-dependent and context-independent targets are jointly learned. In this paper, we extend the approach by structuring the output layer such that the context-dependent outputs are dependent on the context-independent outputs, thus using the context-independent predictions at run-time. We have also investigated the applicability of this idea to unsupervised speaker adaptation as an approach to overcome the data sparsity issues that comes to the fore when estimating systems with a large number of context-dependent states, when data is limited. We have experimented with various amounts of training material (from 10 to 300 hours) and find the proposed techniques are particularly well suited to data-constrained conditions allowing to better utilise large context-dependent state-clustered trees. Experimental results are reported for large vocabulary speech recognition using the Switchboard and TED corpora.},
  categories = {multitask learning, structured output layer, adaptation, deep neural networks}
}
@inproceedings{bell15_cd_multitask,
  author = {Bell, P. and Renals, S.},
  title = {Regularization of context-dependent deep neural networks with context-independent multi-task training},
  booktitle = {Proc. ICASSP},
  year = {2015},
  month = apr,
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_cd_multitask.pdf},
  abstract = {The use of context-dependent targets has become standard in hybrid DNN systems for automatic speech recognition. However, we argue that despite the use of state-tying, optimising to context-dependent targets can lead to over-fitting, and that discriminating between arbitrary tied context-dependent targets may not be optimal. We propose a multitask learning method where the network jointly predicts context-dependent and monophone targets. We evaluate the method on a large-vocabulary lecture recognition task and show that it yields relative improvements of 3-10\% over baseline systems.}
}
@inproceedings{bell14_iwslt,
  author = {Bell, Peter and Swietojanski, Pawel and Driesen, Joris and Sinclair, Mark and McInnes, Fergus and Renals, Steve},
  title = {The {UEDIN} {ASR} Systems for the {IWSLT} 2014 Evaluation},
  booktitle = {Proc. IWSLT},
  year = {2014},
  month = dec,
  address = {South Lake Tahoe, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell14_iwslt.pdf},
  abstract = {This paper describes the University of Edinburgh (UEDIN) ASR systems for the 2014 IWSLT Evaluation. Notable features of the English system include deep neural network acoustic models in both tandem and hybrid configuration with the use of multi-level adaptive networks, LHUC adaptation and Maxout units. The German system includes lightly supervised training and a new method for dictionary generation. Our voice activity detection system now uses a semi-Markov model to incorporate a prior on utterance lengths. There are improvements of up to 30\% relative WER on the tst2013 English test set.}
}
@inproceedings{bell15_complementary_task_mt,
  author = {Bell, Peter and Renals, Steve},
  title = {Complementary tasks for context-dependent deep neural network acoustic models},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = sep,
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_complementary_task_mt.pdf},
  abstract = {We have previously found that context-dependent DNN models for automatic speech recognition can be improved with the use of monophone targets as a secondary task for the network. This paper asks whether the improvements derive from the regularising effect of having a much smaller number of monophone outputs -- compared to the typical number of tied states -- or from the use of targets that are not tied to an arbitrary state-clustering. We investigate the use of factorised targets for left and right context, and targets motivated by articulatory properties of the phonemes. We present results on a large-vocabulary lecture recognition task. Although the regularising effect of monophones seems to be important, all schemes give substantial improvements over the baseline single task system, even though the cardinality of the outputs is relatively high.}
}
@inproceedings{bell15_news_summarisation,
  author = {Bell, Peter and Lai, Catherine and Llewellyn, Clare and Birch, Alexandra and Sinclair, Mark},
  title = {A system for automatic broadcast news summarisation, geolocation and translation},
  booktitle = {Proc. Interspeech (demo session)},
  year = {2015},
  month = sep,
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_news_summarisation.pdf},
  abstract = {An increasing amount of news content is produced in audio-video form every day. To effectively analyse and monitor this multilingual data stream, we require methods to extract and present audio content in accessible ways. In this paper, we describe an end-to-end system for processing and browsing audio news data. This fully automated system brings together our recent research on audio scene analysis, speech recognition, summarisation, named entity detection, geolocation, and machine translation. The graphical interface allows users to visualise the distribution of news content by entity names and story location. Browsing of news events is facilitated through extractive summaries and the ability to view transcripts in multiple languages.}
}
@inproceedings{cervone15_reported_speech_prosody,
  author = {Cervone, Alessandra and Lai, Catherine and Pareti, Silvia and Bell, Peter},
  title = {Towards automatic detection of reported speech in dialogue using prosodic cues},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = sep,
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/cervone15_reported_speech_prosody.pdf},
  abstract = {The phenomenon of reported speech -- whereby we quote the words, thoughts and opinions of others, or recount past dialogue -- is widespread in conversational speech. Detecting such quotations automatically has numerous applications: for example, in enhancing automatic transcription or spoken language understanding applications. However, the task is challenging, not least because lexical cues of quotations are frequently ambiguous or not present in spoken language. The aim of this paper is to identify potential prosodic cues of reported speech which could be used, along with the lexical ones, to automatically detect quotations and ascribe them to their rightful source, that is reconstructing their Attribution Relations. In order to do so we analyze SARC, a small corpus of telephone conversations that we have annotated with Attribution Relations. The results of the statistical analysis performed on the data show how variations in pitch, intensity, and timing features can be exploited as cues of quotations. Furthermore, we build a SVM classifier which integrates lexical and prosodic cues to automatically detect quotations in speech that performs significantly better than chance.}
}
@article{stan-2016,
  author = {Stan, Adriana and Mamiya, Yoshitaka and Yamagishi, Junichi and Bell, Peter and Watts, Oliver and Clark, Rob and King, Simon},
  doi = {10.1016/j.csl.2015.06.006},
  title = {{ALISA}: An automatic lightly supervised speech segmentation and alignment tool},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230815000650},
  journal = {Computer Speech and Language},
  issn = {0885-2308},
  pages = {116--133},
  volume = {35},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/stan-2016.pdf},
  abstract = {This paper describes the ALISA tool, which implements a lightly supervised method for sentence-level alignment of speech with imperfect transcripts. Its intended use is to enable the creation of new speech corpora from a multitude of resources in a language-independent fashion, thus avoiding the need to record or transcribe speech data. The method is designed so that it requires minimum user intervention and expert knowledge, and it is able to align data in languages which employ alphabetic scripts. It comprises a GMM-based voice activity detector and a highly constrained grapheme-based speech aligner. The method is evaluated objectively against a gold standard segmentation and transcription, as well as subjectively through building and testing speech synthesis systems from the retrieved data. Results show that on average, 70\% of the original data is correctly aligned, with a word error rate of less than 0.5\%. In one case, subjective listening tests show a statistically significant preference for voices built on the gold transcript, but this is small and in other tests, no statistically significant differences between the systems built from the fully supervised training data and the one which uses the proposed method are found.},
  categories = {Speech segmentation, speech and text alignment, grapheme acoustic models, lightly supervised system, imperfect transcripts}
}
@inproceedings{bell15_alignment,
  author = {Bell, Peter and Renals, Steve},
  title = {A system for automatic alignment of broadcast media captions using weighted finite-state transducers},
  booktitle = {Proc. ASRU},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_alignment.pdf},
  abstract = {We describe our system for alignment of broadcast media captions in the 2015 MGB Challenge. A precise time alignment of previously-generated subtitles to media data is important in the process of caption generation by broadcasters. However, this task is challenging due to the highly diverse, often noisy content of the audio, and because the subtitles are frequently not a verbatim representation of the actual words spoken. Our system employs a two-pass approach with appropriately constrained weighted finite state transducers (WFSTs) to enable good alignment even when the audio quality would be challenging for conventional ASR. The system achieves an f-score of 0.8965 on the MGB Challenge development set.}
}
@inproceedings{ali15_multi_wer_asr,
  author = {Ali, Ahmed and Magdy, Walid and Bell, Peter and Renals, Steve},
  title = {Multi-reference {WER} for evaluating {ASR} for languages with no orthographic rules},
  booktitle = {Proc. ASRU},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/asru2015-multi-reference.pdf},
  internal-note = {NOTE(review): the abstract previously stored here was an exact copy of the bell15_alignment abstract (broadcast caption alignment) and did not describe this multi-reference WER paper; it has been removed -- restore the correct abstract from the published paper.}
}
@inproceedings{bell15_mgb_challenge,
  author = {Bell, Peter and Gales, Mark and Hain, Thomas and Kilgour, Jonathan and Lanchantin, Pierre and Liu, Xunying and McParland, Andrew and Renals, Steve and Saz, Oscar and Wester, Mirjam and Woodland, Phil},
  title = {The {MGB} challenge: Evaluating multi-genre broadcast media recognition},
  booktitle = {Proc. ASRU},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_mgb_challenge.pdf},
  abstract = {This paper describes the Multi-Genre Broadcast (MGB) Challenge at ASRU~2015, an evaluation focused on speech recognition, speaker diarization, and ``lightly supervised'' alignment of BBC TV recordings. The challenge training data covered the whole range of seven weeks BBC TV output across four channels, resulting in about 1,600 hours of broadcast audio. In addition several hundred million words of BBC subtitle text was provided for language modelling. A novel aspect of the evaluation was the exploration of speech recognition and speaker diarization in a longitudinal setting -- i.e. recognition of several episodes of the same show, and speaker diarization across these episodes, linking speakers. The longitudinal tasks also offered the opportunity for systems to make use of supplied metadata including show title, genre tag, and date/time of transmission. This paper describes the task data and evaluation process used in the MGB challenge, and summarises the results obtained.}
}
@inproceedings{joachim_fainberg_improving_2016,
  author = {Fainberg, Joachim and Bell, Peter and Lincoln, Mike and Renals, Steve},
  title = {Improving Children's Speech Recognition through Out-of-Domain Data Augmentation},
  booktitle = {Proc. Interspeech},
  year = {2016},
  month = sep,
  address = {San Francisco, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/master.pdf},
  abstract = {Children's speech poses challenges to speech recognition due to strong age-dependent anatomical variations and a lack of large, publicly-available corpora. In this paper we explore data augmentation for children's speech recognition using stochastic feature mapping (SFM) to transform out-of-domain adult data for both GMM-based and DNN-based acoustic models. We performed experiments on the English PF-STAR corpus, augmenting using WSJCAM0 and ABI. Our experimental results indicate that a DNN acoustic model for children's speech can make use of adult data, and that out-of-domain SFM is more accurate than in-domain SFM.},
  categories = {speech recognition, data augmentation, children's speech}
}
@inproceedings{ali16_dialect_detection,
  author = {Ali, Ahmed and Dehak, Najim and Cardinal, Patrick and Khurana, Sameer and Yella, Sree Harsha and Glass, James and Bell, Peter and Renals, Steve},
  title = {Automatic dialect detection in {Arabic} broadcast speech},
  booktitle = {Proc. Interspeech},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/is2016-automatic-dialect-detection.pdf},
  abstract = {In this paper, we investigate different approaches for dialect identification in Arabic broadcast speech. These methods are based on phonetic and lexical features obtained from a speech recognition system, and bottleneck features using the i-vector framework. We studied both generative and discriminative classifiers, and we combined these features using a multi-class Support Vector Machine (SVM). We validated our results on an Arabic/English language identification task, with an accuracy of 100\%. We also evaluated these features in a binary classifier to discriminate between Modern Standard Arabic (MSA) and Dialectal Arabic, with an accuracy of 100\%. We further reported results using the proposed methods to discriminate between the five most widely used dialects of Arabic: namely Egyptian, Gulf, Levantine, North African, and MSA, with an accuracy of 59.2\%. We discuss dialect identification errors in the context of dialect code-switching between Dialectal Arabic and MSA, and compare the error pattern between manually labeled data, and the output from our classifier. All the data used on our experiments have been released to the public as a language identification corpus.}
}
@inproceedings{sgangireddy_interspeech16,
  author = {Gangireddy, Siva Reddy and Swietojanski, Pawel and Bell, Peter and Renals, Steve},
  title = {Unsupervised adaptation of recurrent neural network language models},
  booktitle = {Proc. Interspeech},
  address = {San Francisco, USA},
  month = sep,
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ispeech16.pdf},
  abstract = {Recurrent neural network language models (RNNLMs) have been shown to consistently improve Word Error Rates (WERs) of large vocabulary speech recognition systems employing n-gram LMs. In this paper we investigate supervised and unsupervised discriminative adaptation of RNNLMs in a broadcast transcription task to target domains defined by either genre or show. We have explored two approaches based on (1) scaling forward-propagated hidden activations (Learning Hidden Unit Contributions (LHUC) technique) and (2) direct fine-tuning of the parameters of the whole RNNLM. To investigate the effectiveness of the proposed methods we carry out experiments on multi-genre broadcast (MGB) data following the MGB-2015 challenge protocol. We observe small but significant improvements in WER compared to a strong unadapted RNNLM model.},
  categories = {RNNLM, LHUC, unsupervised adaptation, fine-tuning, MGB-Challenge}
}