Publications by Steve Renals
srenals.bib
@inproceedings{swi2012_dnn,
  author    = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
  title     = {Unsupervised Cross-lingual knowledge transfer in {DNN-based LVCSR}},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address   = {Miami, Florida, USA},
  month     = dec,
  year      = 2012,
  abstract  = {We investigate the use of cross-lingual acoustic data to
    initialise deep neural network (DNN) acoustic models by means of
    unsupervised restricted Boltzmann machine (RBM) pretraining. DNNs for
    German are pretrained using one or all of German, Portuguese, Spanish
    and Swedish. The DNNs are used in a tandem configuration, where the
    network outputs are used as features for a hidden Markov model (HMM)
    whose emission densities are modeled by Gaussian mixture models (GMMs),
    as well as in a hybrid configuration, where the network outputs are
    used as the HMM state likelihoods. The experiments show that
    unsupervised pretraining is more crucial for the hybrid setups,
    particularly with limited amounts of transcribed training data. More
    importantly, unsupervised pretraining is shown to be
    language-independent.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/ps_slt2012.pdf},
}
@incollection{gotoh-lm03,
author = {Gotoh, Y. and Renals, S.},
title = {Language Modelling},
booktitle = {Text and Speech Triggered Information Access},
editor = {Renals, S. and Grefenstette, G.},
pages = {78--105},
abstract = {This is a preprint of a tutorial on statistical
language modelling, based on Yoshi Gotoh's course at
the \href{http://www.ilsp.gr/testia/testia2000.html}
{ELSNET-2000 Summer School} on Text and Speech
Triggered Information Access. },
categories = {ie,lm,bnews,sheffield},
crossref = {renals-book03},
year = 2003
}
@misc{turk2010,
author = {Turk, Alice and Scobbie, James and Geng, Christian and
Campbell, Barry and Dickie, Catherine and Dubourg,
Eddie and Bard, Ellen Gurman and Hardcastle, William
and Hartinger, Mariam and King, Simon and Lickley,
Robin and Macmartin, Cedric and Nakai, Satsuki and
Renals, Steve and Richmond, Korin and Schaeffler, Sonja
and White, Kevin and Wiegand, Ronny and Wrench, Alan},
title = {An {Edinburgh} speech production facility},
howpublished = {Poster presented at the 12th Conference on Laboratory
Phonology, Albuquerque, New Mexico.},
month = jul,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ESPF.pdf},
year = 2010
}
@inproceedings{renals2008,
author = {Renals, Steve and Hain, Thomas and Bourlard, Herv{\'e}},
title = {Interpretation of Multiparty Meetings: The {AMI} and
{AMIDA} Projects},
booktitle = {IEEE Workshop on Hands-Free Speech Communication and
Microphone Arrays, 2008. HSCMA 2008},
pages = {115--118},
abstract = {The AMI and AMIDA projects are collaborative EU
projects concerned with the automatic recognition and
interpretation of multiparty meetings. This paper
provides an overview of the advances we have made in
these projects with a particular focus on the
multimodal recording infrastructure, the publicly
available AMI corpus of annotated meeting recordings,
and the speech recognition framework that we have
developed for this domain.},
doi = {10.1109/HSCMA.2008.4538700},
keywords = {AMI corpus; Meetings; evaluation; speech recognition},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/renals2008.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4538666&arnumber=4538700&count=68&index=33},
year = 2008
}
@article{murray2009,
  author   = {Murray, Gabriel and Kleinbauer, Thomas and Poller, Peter and
    Becker, Tilman and Renals, Steve and Kilgour, Jonathan},
  title    = {Extrinsic Summarization Evaluation: A Decision Audit Task},
  journal  = {ACM Transactions on Speech and Language Processing},
  volume   = {6},
  number   = {2},
  pages    = {1--29},
  year     = 2009,
  abstract = {In this work we describe a large-scale extrinsic evaluation
    of automatic speech summarization technologies for meeting speech. The
    particular task is a decision audit, wherein a user must satisfy a
    complex information need, navigating several meetings in order to gain
    an understanding of how and why a given decision was made. We compare
    the usefulness of extractive and abstractive technologies in satisfying
    this information need, and assess the impact of automatic speech
    recognition (ASR) errors on user performance. We employ several
    evaluation methods for participant performance, including
    post-questionnaire data, human subjective and objective judgments, and
    a detailed analysis of participant browsing behavior. We find that
    while ASR errors affect user satisfaction on an information retrieval
    task, users can adapt their browsing behavior to complete the task
    satisfactorily. Results also indicate that users consider extractive
    summaries to be intuitive and useful tools for browsing multimodal
    meeting data. We discuss areas in which automatic summarization
    techniques can be improved in comparison with gold-standard meeting
    abstracts.},
  doi      = {10.1145/1596517.1596518},
  pdf      = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/murray-acm09.pdf},
  url      = {http://doi.acm.org/10.1145/1596517.1596518},
}
@inproceedings{hochberg-arpa94,
author = {M.~Hochberg and S.~Renals and T.~Robinson},
title = {{Abbot}: The {CUED} hybrid {connectionist/HMM} large
vocabulary recognition system},
booktitle = {Proc. ARPA Spoken Language Technology Workshop},
pages = {102--105},
year = 1994
}
@inproceedings{vipperla08,
author = {Ravichander Vipperla and Steve Renals and Joe Frankel},
title = {Longitudinal study of {ASR} performance on ageing
voices},
booktitle = {Proc. Interspeech},
address = {Brisbane},
abstract = {This paper presents the results of a longitudinal
study of ASR performance on ageing voices. Experiments
were conducted on the audio recordings of the
proceedings of the Supreme Court Of The United States
(SCOTUS). Results show that the Automatic Speech
Recognition (ASR) Word Error Rates (WERs) for elderly
voices are significantly higher than those of adult
voices. The word error rate increases gradually as the
age of the elderly speakers increase. Use of maximum
likelihood linear regression (MLLR) based speaker
adaptation on ageing voices improves the WER though the
performance is still considerably lower compared to
adult voices. Speaker adaptation however reduces the
increase in WER with age during old age.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/vipperla_is08.pdf},
year = 2008
}
@inproceedings{christensen-icassp05,
author = {H. Christensen and B. Kolluru and Y. Gotoh and S.
Renals},
title = {Maximum entropy segmentation of broadcast news},
booktitle = {Proc. IEEE ICASSP},
abstract = {This paper presents an automatic system for
structuring and preparing a news broadcast for
applications such as speech summarization, browsing,
archiving and information retrieval. This process
comprises transcribing the audio using an automatic
speech recognizer and subsequently segmenting the text
into utterances and topics. A maximum entropy approach
is used to build statistical models for both utterance
and topic segmentation. The experimental work addresses
the effect on performance of the topic boundary
detector of three factors: the information sources
used, the quality of the ASR transcripts, and the
quality of the utterance boundary detector. The results
show that the topic segmentation is not affected
severely by transcripts errors, whereas errors in the
utterance segmentation are more devastating. },
categories = {s3l,summarization,bnews,edinburgh,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/christensen-icassp05.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/christensen-icassp05.ps.gz},
year = 2005
}
@article{zhang-spl2008,
author = {Le Zhang and Steve Renals},
title = {Acoustic-Articulatory Modelling with the Trajectory
{HMM}},
journal = {IEEE Signal Processing Letters},
volume = 15,
pages = {245--248},
abstract = { In this letter, we introduce a hidden Markov model
(HMM)-based inversion system to recover articulatory
movements from speech acoustics. Trajectory HMMs are
used as generative models for modelling articulatory
data. Experiments on the MOCHA-TIMIT corpus indicate
that the jointly trained acoustic-articulatory models
are more accurate (lower RMS error) than the separately
trained ones, and that trajectory HMM training results
in greater accuracy compared with conventional maximum
likelihood HMM training. Moreover, the system has the
ability to synthesize articulatory movements directly
from a textual representation. },
keywords = {articulatory inversion},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/zhang-spl.pdf},
year = 2008
}
@inproceedings{ultraxIS2012,
  author     = {Richmond, Korin and Renals, Steve},
  title      = {Ultrax: An Animated Midsagittal Vocal Tract Display for
    Speech Therapy},
  booktitle  = {Proc. Interspeech},
  address    = {Portland, Oregon, USA},
  month      = sep,
  year       = 2012,
  abstract   = {Speech sound disorders (SSD) are the most common
    communication impairment in childhood, and can hamper social
    development and learning. Current speech therapy interventions rely
    predominantly on the auditory skills of the child, as little
    technology is available to assist in diagnosis and therapy of SSDs.
    Realtime visualisation of tongue movements has the potential to bring
    enormous benefit to speech therapy. Ultrasound scanning offers this
    possibility, although its display may be hard to interpret. Our
    ultimate goal is to exploit ultrasound to track tongue movement, while
    displaying a simplified, diagrammatic vocal tract that is easier for
    the user to interpret. In this paper, we outline a general approach to
    this problem, combining a latent space model with a dimensionality
    reducing model of vocal tract shapes. We assess the feasibility of
    this approach using magnetic resonance imaging (MRI) scans to train a
    model of vocal tract shapes, which is animated using electromagnetic
    articulography (EMA) data from the same speaker.},
  categories = {Ultrasound, speech therapy, vocal tract visualisation},
  keywords   = {Ultrasound, speech therapy, vocal tract visualisation},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/RichmondRenalsIS2012.pdf},
}
@article{gotoh-roysoc00,
author = {Gotoh, Y. and Renals, S.},
title = {Information Extraction from Broadcast News},
journal = {Philosophical Transactions of the Royal Society of
London, Series A},
volume = {358},
pages = {1295--1310},
abstract = {This paper discusses the development of trainable
statistical models for extracting content from
television and radio news broadcasts. In particular we
concentrate on statistical finite state models for
identifying proper names and other named entities in
broadcast speech. Two models are presented: the first
models name class information as a word attribute; the
second explicitly models both word-word and class-class
transitions. A common n-gram based formulation is used
for both models. The task of named entity
identification is characterized by relatively sparse
training data and issues related to smoothing are
discussed. Experiments are reported using the
DARPA/NIST Hub-4E evaluation for North American
Broadcast News.},
categories = {stobs,ie,lm,bnews,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/rs00-preprint.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/rs00-preprint.ps.gz},
year = 2000
}
@incollection{vipperla2009a,
  author    = {Vipperla, Ravi Chander and Wolters, Maria and Georgila,
    Kallirroi and Renals, Steve},
  title     = {Speech Input from Older Users in Smart Environments:
    Challenges and Perspectives},
  booktitle = {Proc. HCI International: Universal Access in Human-Computer
    Interaction. Intelligent and Ubiquitous Interaction Environments},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  number    = {5615},
  year      = 2009,
  abstract  = {Although older people are an important user group for smart
    environments, there has been relatively little work on adapting
    natural language interfaces to their requirements. In this paper, we
    focus on a particularly thorny problem: processing speech input from
    older users. Our experiments on the MATCH corpus show clearly that we
    need age-specific adaptation in order to recognize older users' speech
    reliably. Language models need to cover typical interaction patterns
    of older people, and acoustic models need to accommodate older voices.
    Further research is needed into intelligent adaptation techniques that
    will allow existing large, robust systems to be adapted with
    relatively small amounts of in-domain, age appropriate data. In
    addition, older users need to be supported with adequate strategies
    for handling speech recognition errors.},
  doi       = {10.1007/978-3-642-02710-9},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/vipperla2009a.pdf},
  url       = {http://www.springerlink.com/content/27r01345r1683251/?p=ad2394d646814db59cf9868b0f74b11e&pi=13},
}
@incollection{morgan-guyonbook94,
author = {N.~Morgan and H.~Bourlard and S.~Renals and M.~Cohen
and H.~Franco},
title = {Hybrid neural network/hidden {Markov} model systems
for continuous speech recognition},
booktitle = {Advances in Pattern Recognition Systems using Neural
Networks Technologies},
publisher = {World Scientific Publications},
editor = {I.~Guyon and P.~S.~P.~Wang},
volume = {7},
series = {Series in Machine Perception and Artificial
Intelligence},
year = 1994
}
@inproceedings{koumpis-eurospeech01,
  author     = {K.~Koumpis and S.~Renals and M.~Niranjan},
  title      = {Extractive Summarization of Voicemail using Lexical and
    Prosodic Feature Subset Selection},
  booktitle  = {Proc. Eurospeech},
  address    = {Aalborg, Denmark},
  pages      = {2377--2380},
  year       = 2001,
  abstract   = {This paper presents a novel data-driven approach to
    summarizing spoken audio transcripts utilizing lexical and prosodic
    features. The former are obtained from a speech recognizer and the
    latter are extracted automatically from speech waveforms. We employ a
    feature subset selection algorithm, based on ROC curves, which
    examines different combinations of features at different target
    operating conditions. The approach is evaluated on the IBM Voicemail
    corpus, demonstrating that it is possible and desirable to avoid
    complete commitment to a single best classifier or feature set.},
  categories = {voicemail,summarization,prosody,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/eurospeech01.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/eurospeech01.ps.gz},
}
@inproceedings{cuayahuitletal_interspeech06,
author = {Heriberto Cuay{\'a}huitl and Steve Renals and Oliver
Lemon and Hiroshi Shimodaira},
title = {Learning Multi-Goal Dialogue Strategies Using
Reinforcement Learning With Reduced State-Action Spaces},
booktitle = {Proc. Interspeech},
abstract = {Learning dialogue strategies using the reinforcement
learning framework is problematic due to its expensive
computational cost. In this paper we propose an
algorithm that reduces a state-action space to one
which includes only valid state-actions. We performed
experiments on full and reduced spaces using three
systems (with 5, 9 and 20 slots) in the travel domain
using a simulated environment. The task was to learn
multi-goal dialogue strategies optimizing single and
multiple confirmations. Average results using
strategies learnt on reduced spaces reveal the
following benefits against full spaces: 1) less
computer memory (94\% reduction), 2) faster learning
(93\% faster convergence) and better performance (8.4\%
less time steps and 7.7\% higher reward).},
categories = {reinforcement learning, spoken dialogue systems},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/rss-icslp2006.pdf},
year = 2006
}
@inproceedings{hochberg-icslp94,
author = {M.~Hochberg and S.~Renals and T.~Robinson and
D.~Kershaw},
title = {Large vocabulary continuous speech recognition using a
hybrid {connectionist/HMM} system},
booktitle = {Proc. ICSLP},
pages = {1499--1502},
address = {Yokohama},
year = 1994
}
@inproceedings{zwyssig2010,
  author    = {Zwyssig, Erich and Lincoln, Mike and Renals, Steve},
  title     = {A Digital Microphone Array for Distant Speech Recognition},
  booktitle = {Proc. IEEE ICASSP--10},
  pages     = {5106--5109},
  year      = 2010,
  abstract  = {In this paper, the design, implementation and testing of a
    digital microphone array is presented. The array uses digital MEMS
    microphones which integrate the microphone, amplifier and analogue to
    digital converter on a single chip in place of the analogue
    microphones and external audio interfaces currently used. The device
    has the potential to be smaller, cheaper and more flexible than
    typical analogue arrays, however the effect on speech recognition
    performance of using digital microphones is as yet unknown. In order
    to evaluate the effect, an analogue array and the new digital array
    are used to simultaneously record test data for a speech recognition
    experiment. Initial results employing no adaptation show that
    performance using the digital array is significantly worse (14\%
    absolute WER) than the analogue device. Subsequent experiments using
    MLLR and CMLLR channel adaptation reduce this gap, and employing MLLR
    for both channel and speaker adaptation reduces the difference between
    the arrays to 4.5\% absolute WER.},
  doi       = {10.1109/ICASSP.2010.5495040},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/zwyssig-icassp10.pdf},
}
@inproceedings{garau-interspeech05,
  author     = {G. Garau and S. Renals and T. Hain},
  title      = {Applying Vocal Tract Length Normalization to Meeting
    Recordings},
  booktitle  = {Proc. Interspeech},
  month      = sep,
  year       = 2005,
  abstract   = {Vocal Tract Length Normalisation (VTLN) is a commonly used
    technique to normalise for inter-speaker variability. It is based on
    the speaker-specific warping of the frequency axis, parameterised by a
    scalar warp factor. This factor is typically estimated using maximum
    likelihood. We discuss how VTLN may be applied to multiparty
    conversations, reporting a substantial decrease in word error rate in
    experiments using the ICSI meetings corpus. We investigate the
    behaviour of the VTLN warping factor and show that a stable estimate
    is not obtained. Instead it appears to be influenced by the context of
    the meeting, in particular the current conversational partner. These
    results are consistent with predictions made by the psycholinguistic
    interactive alignment account of dialogue, when applied at the
    acoustic and phonological levels.},
  categories = {ami,asr,edinburgh,vtln,speaker adaptation,lvcsr,meetings},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/giuliagarau_eurospeech05.pdf},
}
@article{morgan-ijprai93,
author = {N.~Morgan and H.~Bourlard and S.~Renals and M.~Cohen
and H.~Franco},
title = {Hybrid neural network/hidden {Markov} model systems
for continuous speech recognition},
journal = {International Journal of Pattern Recognition and
Artificial Intelligence},
volume = {7},
pages = {899--916},
year = 1993
}
@inproceedings{hochberg-arpa95,
  author     = {M.~Hochberg and G.~Cook and S.~Renals and T.~Robinson and
    R.~Schechtman},
  title      = {The 1994 {Abbot} hybrid {connectionist--HMM} large
    vocabulary recognition system},
  booktitle  = {Proc. ARPA Spoken Language Technology Workshop},
  pages      = {170--175},
  year       = 1995,
  categories = {wernicke,recognition,wsj,am,hybrid,abbot,eval,search,sheffield,cambridge},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/slt95.ps.gz},
}
@incollection{alhames-mlmi05,
  author     = {M. Al-Hames and A. Dielmann and D. Gatica-Perez and S.
    Reiter and S. Renals and G. Rigoll and D. Zhang},
  title      = {Multimodal Integration for Meeting Group Action
    Segmentation and Recognition},
  booktitle  = {Proc. Multimodal Interaction and Related Machine Learning
    Algorithms Workshop (MLMI--05)},
  publisher  = {Springer},
  editor     = {S. Renals and S. Bengio},
  pages      = {52--63},
  year       = 2006,
  abstract   = {We address the problem of segmentation and recognition of
    sequences of multimodal human interactions in meetings. These
    interactions can be seen as a rough structure of a meeting, and can be
    used either as input for a meeting browser or as a first step towards
    a higher semantic analysis of the meeting. A common lexicon of
    multimodal group meeting actions, a shared meeting data set, and a
    common evaluation procedure enable us to compare the different
    approaches. We compare three different multimodal feature sets and our
    modelling infrastructures: a higher semantic feature approach,
    multi-layer HMMs, a multistream DBN, as well as a multi-stream
    mixed-state DBN for disturbed data.},
  categories = {m4,ami,multimodal,dbn,meetings,edinburgh,IDIAP,munich},
}
@inproceedings{jyamagis07:avss2006,
  author     = {Junichi Yamagishi and Takao Kobayashi and Steve Renals and
    Simon King and Heiga Zen and Tomoki Toda and Keiichi Tokuda},
  title      = {Improved Average-Voice-based Speech Synthesis Using
    Gender-Mixed Modeling and a Parameter Generation Algorithm
    Considering {GV}},
  booktitle  = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  month      = aug,
  year       = 2007,
  abstract   = {For constructing a speech synthesis system which can
    achieve diverse voices, we have been developing a speaker independent
    approach of HMM-based speech synthesis in which statistical average
    voice models are adapted to a target speaker using a small amount of
    speech data. In this paper, we incorporate a high-quality speech
    vocoding method STRAIGHT and a parameter generation algorithm with
    global variance into the system for improving quality of synthetic
    speech. Furthermore, we introduce a feature-space speaker adaptive
    training algorithm and a gender mixed modeling technique for
    conducting further normalization of the average voice model. We build
    an English text-to-speech system using these techniques and show the
    performance of the system.},
  categories = {HMM, speech synthesis, speaker adaptation, HTS},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6-yamagishi.pdf},
}
@inproceedings{renals2010b,
  author    = {Renals, Steve},
  title     = {Recognition and Understanding of Meetings},
  booktitle = {Proc. NAACL/HLT},
  pages     = {1--9},
  year      = 2010,
  abstract  = {This paper is about interpreting human communication in
    meetings using audio, video and other signals. Automatic meeting
    recognition and understanding is extremely challenging, since
    communication in a meeting is spontaneous and conversational, and
    involves multiple speakers and multiple modalities. This leads to a
    number of significant research problems in signal processing, in
    speech recognition, and in discourse interpretation, taking account of
    both individual and group behaviours. Addressing these problems
    requires an interdisciplinary effort. In this paper, I discuss the
    capture and annotation of multimodal meeting recordings - resulting in
    the AMI meeting corpus - and how we have built on this to develop
    techniques and applications for the recognition and interpretation of
    meetings.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/renals-naacl10.pdf},
}
@incollection{renals-sesimbra90,
author = {S.~Renals},
title = {Chaos in neural networks},
booktitle = {Neural Networks},
publisher = {Springer-Verlag},
editor = {L.~B.~Almeida and C.~J.~Wellekens},
number = {412},
series = {Lecture Notes in Computer Science},
pages = {90--99},
year = 1990
}
@inproceedings{renals-ijcnn89,
author = {S.~Renals and R.~Rohwer},
title = {Phoneme classification experiments using radial basis
functions},
booktitle = {Proc. IJCNN},
pages = {461--468},
address = {Washington DC},
year = 1989
}
@inproceedings{kilgour2010,
author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
title = {The {Ambient Spotlight}: Queryless desktop search from
meeting speech},
booktitle = {Proc. ACM Multimedia 2010 Workshop SSCS 2010},
abstract = {It has recently become possible to record any small
meeting using a laptop equipped with a plug-and-play
USB microphone array. We show the potential for such
recordings in a personal aid that allows project
managers to record their meetings and, when reviewing
them afterwards through a standard calendar interface,
to find relevant documents on their computer. This
interface is intended to supplement or replace the
textual searches that managers typically perform. The
prototype, which relies on meeting speech recognition
and topic segmentation, formulates and runs desktop
search queries in order to present its results.},
doi = {10.1145/1878101.1878112},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/AmbientSpot.pdf},
year = 2010
}
@incollection{murray2008c,
  author    = {Murray, Gabriel and Kleinbauer, Thomas and Poller, Peter and
    Renals, Steve and Kilgour, Jonathan},
  title     = {Extrinsic Summarization Evaluation: A Decision Audit Task},
  booktitle = {Machine Learning for Multimodal Interaction (Proc. MLMI '08)},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  number    = {5237},
  pages     = {349--361},
  year      = 2008,
  abstract  = {In this work we describe a large-scale extrinsic evaluation
    of automatic speech summarization technologies for meeting speech. The
    particular task is a decision audit, wherein a user must satisfy a
    complex information need, navigating several meetings in order to gain
    an understanding of how and why a given decision was made. We compare
    the usefulness of extractive and abstractive technologies in
    satisfying this information need, and assess the impact of automatic
    speech recognition (ASR) errors on user performance. We employ several
    evaluation methods for participant performance, including
    post-questionnaire data, human subjective and objective judgments, and
    an analysis of participant browsing behaviour.},
  doi       = {10.1007/978-3-540-85853-9_32},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008c.pdf},
}
@article{renals-splett96,
  author     = {S.~Renals},
  title      = {Phone deactivation pruning in large vocabulary continuous
    speech recognition},
  journal    = {IEEE Signal Processing Letters},
  volume     = {3},
  pages      = {4--6},
  year       = 1996,
  abstract   = {In this letter we introduce a new pruning strategy for
    large vocabulary continuous speech recognition based on direct
    estimates of local posterior phone probabilities. This approach is
    well suited to hybrid connectionist/hidden Markov model systems.
    Experiments on the Wall Street Journal task using a 20,000 word
    vocabulary and a trigram language model have demonstrated that phone
    deactivation pruning can increase the speed of recognition-time search
    by up to a factor of 10, with a relative increase in error rate of
    less than 2\%.},
  categories = {wernicke,sprach,recognition,search,wsj,sheffield},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/splett96.ps.gz},
}
@incollection{dielmann-mlmi06,
  author     = {A. Dielmann and S. Renals},
  title      = {Automatic Dialogue Act Recognition using a Dynamic
    {Bayesian} Network},
  booktitle  = {Proc. Multimodal Interaction and Related Machine Learning
    Algorithms Workshop (MLMI--06)},
  publisher  = {Springer},
  editor     = {S. Renals and S. Bengio and J. Fiscus},
  pages      = {178--189},
  year       = 2007,
  abstract   = {We propose a joint segmentation and classification
    approach for the dialogue act recognition task on natural multi-party
    meetings ({ICSI} Meeting Corpus). Five broad DA categories are
    automatically recognised using a generative Dynamic {Bayesian} Network
    based infrastructure. Prosodic features and a switching graphical
    model are used to estimate DA boundaries, in conjunction with a
    factored language model which is used to relate words and DA
    categories. This easily generalizable and extensible system promotes a
    rational approach to the joint DA segmentation and recognition task,
    and is capable of good recognition performance.},
  categories = {ami,dialogue act,dbn,factored language model,meetings,edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-mlmi06.pdf},
}
@inproceedings{hain-interspeech05,
  author     = {T. Hain and J. Dines and G. Garau and M. Karafiat and
    D. Moore and V. Wan and R. Ordelman and S. Renals},
  title      = {Transcription of Conference Room Meetings: an
    Investigation},
  booktitle  = {Proc. Interspeech},
  year       = 2005,
  abstract   = {The automatic processing of speech collected in conference
    style meetings has attracted considerable interest with several large
    scale projects devoted to this area. In this paper we explore the use
    of various meeting corpora for the purpose of automatic speech
    recognition. In particular we investigate the similarity of these
    resources and how to efficiently use them in the construction of a
    meeting transcription system. The analysis shows distinctive features
    for each resource. However the benefit in pooling data and hence the
    similarity seems sufficient to speak of a generic conference meeting
    domain . In this context this paper also presents work on development
    for the AMI meeting transcription system, a joint effort by seven
    sites working on the AMI (augmented multi-party interaction) project.},
  categories = {ami,asr,edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hain-eurospeech05.pdf},
}
@inproceedings{huang2008-is,
  author    = {Songfang Huang and Steve Renals},
  title     = {Unsupervised Language Model Adaptation Based on Topic and
    Role Information in Multiparty Meetings},
  booktitle = {Proc. Interspeech'08},
  address   = {Brisbane, Australia},
  pages     = {833--836},
  month     = sep,
  year      = 2008,
  abstract  = {We continue our previous work on the modeling of topic and
    role information from multiparty meetings using a hierarchical
    Dirichlet process (HDP), in the context of language model adaptation.
    In this paper we focus on three problems: 1) an empirical analysis of
    the HDP as a nonparametric topic model; 2) the mismatch problem of
    vocabularies of the baseline n-gram model and the HDP; and 3) an
    automatic speech recognition experiment to further verify the
    effectiveness of our adaptation framework. Experiments on a large
    meeting corpus of more than 70 hours speech data show consistent and
    significant improvements in terms of word error rate for language
    model adaptation based on the topic and role information.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/interspeech08.pdf},
}
@inproceedings{hennebert-eurospeech97,
author = {J.~Hennebert and C.~Ris and H.~Bourlard and S.~Renals
and N.~Morgan},
title = {Estimation of global posteriors and forward-backward
training of hybrid {HMM/ANN} systems},
booktitle = {Proc. Eurospeech},
pages = {1951--1954},
address = {Rhodes},
abstract = {The results of our research presented in this paper
are two-fold. First, an estimation of global
posteriors is formalized in the framework of hybrid
HMM/ANN systems. It is shown that hybrid HMM/ANN
systems, in which the ANN part estimates local
posteriors can be used to model global posteriors. This
formalization provides us with a clear theory in which
both REMAP and ``classical'' Viterbi trained hybrid
systems are unified. Second, a new forward-backward
training of hybrid HMM/ANN systems is derived from the
previous formulation. Comparisons of performance
between Viterbi and forward-backward hybrid systems are
presented and discussed.},
categories = {sprach,am,hybrid,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-remap.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-remap.ps.gz},
year = 1997
}
@inproceedings{cook-darpa99,
author = {G.~Cook and K.~Al-Ghoneim and D.~Ellis and
E.~Fosler-Lussier and Y.~Gotoh and B.~Kingsbury and
N.~Morgan and S.~Renals and T.~Robinson and G.~Williams},
title = {The {SPRACH} system for the transcription of broadcast
news},
booktitle = {Proc. DARPA Broadcast News Workshop},
pages = {161--166},
abstract = {This paper describes the SPRACH system developed for
the 1998 Hub-4E broadcast news evaluation. The system
is based on the connectionist-HMM framework and uses
both recurrent neural network and multi-layer
perceptron acoustic models. We describe both a system
designed for the primary transcription hub, and a
system for the less-than 10 times real-time spoke. We
then describe recent developments to CHRONOS, a
time-first stack decoder. We show how these
developments have simplified the evaluation system, and
led to significant reductions in the error rate of the
10x real-time system. We also present a system designed
to operate in real-time with negligible search error.},
categories = {sprach,bnews,recognition,am,hybrid,abbot,search,eval,sheffield},
http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/darpa99-sprach.html},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-sprach.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-sprach.ps.gz},
year = 1999
}
@inproceedings{abberley-icassp98,
author = {D.~Abberley and S.~Renals and G.~Cook},
title = {Retrieval of broadcast news documents with the {THISL}
system},
booktitle = {Proc IEEE ICASSP},
pages = {3781--3784},
address = {Seattle},
abstract = {This paper describes a spoken document retrieval
system, combining the Abbot large vocabulary continuous
speech recognition (LVCSR) system developed by
Cambridge University, Sheffield University and
SoftSound, and the PRISE information retrieval engine
developed by NIST. The system was constructed to enable
us to participate in the TREC 6 Spoken Document
Retrieval experimental evaluation. Our key aims in this
work were to produce a complete system for the SDR
task, to investigate the effect of a word error rate of
30-50\% on retrieval performance and to investigate the
integration of LVCSR and word spotting in a retrieval
task.},
categories = {thisl,bnews,trec,ir,recognition,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icassp98.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icassp98.ps.gz},
year = 1998
}
@inproceedings{uria2011deep,
author = {Uria, Benigno and Renals, Steve and Richmond, Korin},
title = {A Deep Neural Network for Acoustic-Articulatory Speech
Inversion},
booktitle = {Proc. NIPS 2011 Workshop on Deep Learning and
Unsupervised Feature Learning},
address = {Sierra Nevada, Spain},
abstract = {In this work, we implement a deep belief network for
the acoustic-articulatory inversion mapping problem. We
find that adding up to 3 hidden-layers improves
inversion accuracy. We also show that this improvement
is due to the higher expressive capability of a deep
model and not a consequence of adding more adjustable
parameters. Additionally, we show unsupervised
pretraining of the system improves its performance in
all cases, even for a 1 hidden-layer model. Our
implementation obtained an average root mean square
error of 0.95 mm on the MNGU0 test dataset, beating all
previously published results.},
month = dec,
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/articulatory_inversion.pdf},
year = 2011
}
@inproceedings{robinson-icassp95,
author = {T.~Robinson and J.~Fransen and D.~Pye and J.~Foote and
S.~Renals},
title = {{WSJCAM0}: A {British English} speech corpus for large
vocabulary continuous speech recognition},
booktitle = {Proc IEEE ICASSP},
pages = {81--84},
address = {Detroit},
categories = {},
year = 1995
}
@article{cuayahuitl2009,
author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon,
Oliver and Shimodaira, Hiroshi},
title = {Evaluation of a hierarchical reinforcement learning
spoken dialogue system},
journal = {Computer Speech and Language},
volume = {24},
number = {2},
pages = {395--429},
abstract = {We describe an evaluation of spoken dialogue
strategies designed using hierarchical reinforcement
learning agents. The dialogue strategies were learnt in
a simulated environment and tested in a laboratory
setting with 32 users. These dialogues were used to
evaluate three types of machine dialogue behaviour:
hand-coded, fully-learnt and semi-learnt. These
experiments also served to evaluate the realism of
simulated dialogues using two proposed metrics
contrasted with ‘Precision-Recall’. The learnt
dialogue behaviours used the Semi-Markov Decision
Process (SMDP) model, and we report the first
evaluation of this model in a realistic conversational
environment. Experimental results in the travel
planning domain provide evidence to support the
following claims: (a) hierarchical semi-learnt dialogue
agents are a better alternative (with higher overall
performance) than deterministic or fully-learnt
behaviour; (b) spoken dialogue strategies learnt with
highly coherent user behaviour and conservative
recognition error rates (keyword error rate of 20\%)
can outperform a reasonable hand-coded strategy; and
(c) hierarchical reinforcement learning dialogue agents
are feasible and promising for the (semi) automatic
design of optimized dialogue behaviours in larger-scale
systems.},
doi = {10.1016/j.csl.2009.07.001},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cuayahuitl-csl09.pdf},
year = 2009
}
@article{renals-specom00,
author = {S.~Renals and D.~Abberley and D.~Kirby and T.~Robinson},
title = {Indexing and Retrieval of Broadcast News},
journal = {Speech Communication},
volume = {32},
pages = {5--20},
abstract = {This paper describes a spoken document retrieval (SDR)
system for British and North American Broadcast News.
The system is based on a connectionist large vocabulary
speech recognizer and a probabilistic information
retrieval system. We discuss the development of a
realtime Broadcast News speech recognizer, and its
integration into an SDR system. Two advances were made
for this task: automatic segmentation and statistical
query expansion using a secondary corpus. Precision and
recall results using the Text Retrieval Conference
(TREC) SDR evaluation infrastructure are reported
throughout the paper, and we discuss the application of
these developments to a large scale SDR task based on
an archive of British English broadcast news.},
categories = {thisl,bnews,trec,ir,recognition,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/specom00-preprint.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/specom00-preprint.ps.gz},
year = 2000
}
@inproceedings{renals-twente98,
author = {S.~Renals and D.~Abberley},
title = {The {THISL} spoken document retrieval system},
booktitle = {Proc. 14th Twente Workshop on Language Technology},
pages = {129--140},
abstract = {THISL is an ESPRIT Long Term Research Project focused
on the development and construction of a system to
retrieve items from an archive of television and radio
news broadcasts. In this paper we outline our spoken
document retrieval system based on the Abbot speech
recognizer and a text retrieval system based on Okapi
term-weighting. The system has been evaluated as part
of the TREC-6 and TREC-7 spoken document retrieval
evaluations and we report on the results of the TREC-7
evaluation based on a document collection of 100 hours
of North American broadcast news.},
categories = {thisl,bnews,trec,ir,recognition,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/twente98.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/twente98.ps.gz},
year = 1998
}
@inproceedings{christensen-ecir04,
author = {H. Christensen and B. Kolluru and Y. Gotoh and S.
Renals},
title = {From text summarisation to style-specific
summarisation for broadcast news},
booktitle = {Proc. ECIR--2004},
pages = {},
abstract = {In this paper we report on a series of experiments
investigating the path from text-summarisation to
style-specific summarisation of spoken news stories. We
show that the portability of traditional text
summarisation features to broadcast news is dependent
on the diffusiveness of the information in the
broadcast news story. An analysis of two categories of
news stories (containing only read speech or some
spontaneous speech) demonstrates the importance of the
style and the quality of the transcript, when
extracting the summary-worthy information content.
Further experiments indicate the advantages of doing
style-specific summarisation of broadcast news.},
categories = {s3l,summarization,bnews,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ecir04.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ecir04.ps.gz},
year = 2004
}
@inproceedings{robinson-eurospeech99,
author = {T.~Robinson and D.~Abberley and D.~Kirby and S.~Renals},
title = {Recognition, indexing and retrieval of {British}
broadcast news with the {THISL} system},
booktitle = {Proc. Eurospeech},
pages = {1067--1070},
address = {Budapest},
abstract = {This paper described the THISL spoken document
retrieval system for British and North American
Broadcast News. The system is based on the Abbot large
vocabulary speech recognizer and a probabilistic text
retrieval system. We discuss the development of a
realtime British English Broadcast News system, and its
integration into a spoken document retrieval system.
Detailed evaluation is performed using a similar North
American Broadcast News system, to take advantage of
the TREC SDR evaluation methodology. We report results
on this evaluation, with particular reference to the
effect of query expansion and of automatic segmentation
algorithms.},
categories = {thisl,bnews,trec,ir,recognition,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-thisl.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-thisl.ps.gz},
year = 1999
}
@inproceedings{carreira-nnsp98,
author = {M.~Carreira-Perpiñán and S.~Renals},
title = {Experimental evaluation of latent variable models for
dimensionality reduction},
booktitle = {IEEE Proc. Neural Networks for Signal Processing},
volume = {8},
pages = {165--173},
address = {Cambridge},
abstract = {We use electropalatographic (EPG) data as a test bed
for dimensionality reduction methods based in latent
variable modelling, in which an underlying lower
dimension representation is inferred directly from the
data. Several models (and mixtures of them) are
investigated, including factor analysis and the
generative topographic mapping (GTM). Experiments
indicate that nonlinear latent variable modelling
reveals a low-dimensional structure in the data
inaccessible to the investigated linear models.},
categories = {ml,lv,artic,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/nnsp98.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/nnsp98.ps.gz},
year = 1998
}
@inproceedings{koumpis-icoin01,
author = {K.~Koumpis and C.~Ladas and S. Renals},
title = {An Advanced Integrated Architecture for Wireless
Voicemail Retrieval},
booktitle = {Proc. 15th IEEE International Conference on
Information Networking},
pages = {403--410},
abstract = {This paper describes an alternative architecture for
voicemail data retrieval on the move. It is comprised
of three distinct components: a speech recognizer, a
text summarizer and a WAP push service initiator,
enabling mobile users to receive a text summary of
their voicemail in realtime without an explicit
request. Our approach overcomes the cost and usability
limitations of the conventional voicemail retrieval
paradigm which requires a connection establishment in
order to listen to spoken messages. We report
performance results on all different components of the
system which has been trained on a database containing
1843 North American English messages as well as on the
duration of the corresponding data path. The proposed
architecture can be further customized to meet the
requirements of a complete voicemail value-added
service.},
categories = {voicemail,summarization,sheffield},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/icoin01.ps.gz},
year = 2001
}
@article{renals-sap94,
author = {S.~Renals and N.~Morgan and H.~Bourlard and M.~Cohen
and H.~Franco},
title = {Connectionist probability estimators in {HMM} speech
recognition},
journal = {IEEE Trans. on Speech and Audio Processing},
volume = {2},
pages = {161--175},
abstract = {We are concerned with integrating connectionist
networks into a hidden Markov model (HMM) speech
recognition system. This is achieved through a
statistical interpretation of connectionist networks as
probability estimators. We review the basis of HMM
speech recognition and point out the possible benefits
of incorporating connectionist networks. Issues
necessary to the construction of a connectionist HMM
recognition system are discussed, including choice of
connectionist probability estimator. We describe the
performance of such a system, using a multi-layer
perceptron probability estimator, evaluated on the
speaker-independent DARPA Resource Management database.
In conclusion, we show that a connectionist component
improves a state-of-the-art HMM system.},
categories = {},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/sap94.ps.gz},
year = 1994
}
@inproceedings{renals-trec01,
author = {S.~Renals and D.~Abberley},
title = {The {THISL} {SDR} system at {TREC}--9},
booktitle = {Proc. Ninth Text Retrieval Conference (TREC--9)},
pages = {},
abstract = {This paper describes our participation in the TREC-9
Spoken Document Retrieval (SDR) track. The THISL SDR
system consists of a realtime version of a hybrid
connectionist/HMM large vocabulary speech recognition
system and a probabilistic text retrieval system. This
paper describes the configuration of the speech
recognition and text retrieval systems, including
segmentation and query expansion. We report our results
for development tests using the TREC-8 queries, and for
the TREC-9 evaluation.},
categories = {thisl,bnews,trec,ir,recognition,eval,abbot,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/trec9-proc.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/trec9-proc.ps.gz},
year = 2001
}
@article{carreira-nc00,
author = {M.~Carreira-Perpiñán and S.~Renals},
title = {Practical identifiability of finite mixtures of
multivariate {Bernoulli} distributions},
journal = {Neural Computation},
volume = {12},
pages = {141--152},
abstract = {The class of finite mixtures of multivariate Bernoulli
distributions is known to be nonidentifiable, i.e.,
different values of the mixture parameters can
correspond to exactly the same probability
distribution. In principle, this would mean that sample
estimates using this model would give rise to different
interpretations. We give empirical support to the fact
that estimation of this class of mixtures can still
produce meaningful results in practice, thus lessening
the importance of the identifiability problem. We also
show that the EM algorithm is guaranteed to converge to
a proper maximum likelihood estimate, owing to a
property of the log-likelihood surface. Experiments
with synthetic data sets show that an original
generating distribution can be estimated from a sample.
Experiments with an electropalatography (EPG) data set
show important structure in the data.},
categories = {ml,lv,artic,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/nc00-preprint.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/nc00-preprint.ps.gz},
year = 2000
}
@article{bourlard-specom92,
author = {H.~Bourlard and N.~Morgan and S.~Renals},
title = {Neural nets and hidden {Markov} models: Review and
generalizations},
journal = {Speech Communication},
volume = {11},
pages = {237--246},
categories = {},
year = 1992
}
@incollection{renals-nips94,
author = {S.~Renals and M.~Hochberg and T.~Robinson},
title = {Learning temporal dependencies in connectionist speech
recognition},
booktitle = {Advances in Neural Information Processing Systems},
publisher = {Morgan Kaufmann},
editor = {J.~D.~Cowan and G.~Tesauro and J.~Alspector},
volume = {6},
pages = {1051--1058},
categories = {},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/1051.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/1051.ps.gz},
year = 1994
}
@inproceedings{zhang-icslp2006,
author = {Le Zhang and Steve Renals},
title = {Phone Recognition Analysis for Trajectory {HMM}},
booktitle = {Proc. Interspeech 2006},
address = {Pittsburgh, USA},
abstract = { The trajectory {HMM} has been shown to be useful for
model-based speech synthesis where a smoothed
trajectory is generated using temporal constraints
imposed by dynamic features. To evaluate the
performance of such model on an ASR task, we present a
trajectory decoder based on tree search with delayed
path merging. Experiment on a speaker-dependent phone
recognition task using the MOCHA-TIMIT database shows
that the MLE-trained trajectory model, while retaining
attractive properties of being a proper generative
model, tends to favour over-smoothed trajectory among
competing hypotheses, and does not perform better
than a conventional {HMM}. We use this to build an
argument that models giving better fit on training data
may suffer a reduction of discrimination by being too
faithful to training data. This partially explains why
alternative acoustic models that try to explicitly
model temporal constraints do not achieve significant
improvements in ASR. },
key = {asr},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/zhang-icslp2006.pdf},
year = 2006
}
@inproceedings{dielmann-icassp04,
author = {A. Dielmann and S. Renals},
title = {Dynamic {Bayesian} Networks for Meeting Structuring},
booktitle = {Proc. IEEE ICASSP},
pages = {},
abstract = {This paper is about the automatic structuring of
multiparty meetings using audio information. We have
used a corpus of 53 meetings, recorded using a
microphone array and lapel microphones for each
participant. The task was to segment meetings into a
sequence of meeting actions, or phases. We have adopted
a statistical approach using dynamic Bayesian networks
(DBNs). Two DBN architectures were investigated: a
two-level hidden Markov model (HMM) in which the
acoustic observations were concatenated; and a
multistream DBN in which two separate observation
sequences were modelled. Additionally we have also
explored the use of counter variables to constrain the
number of action transitions. Experimental results
indicate that the DBN architectures are an improvement
over a simple baseline HMM, with the multistream DBN
with counter constraints producing an action error rate
of 6\%.},
categories = {m4,multimodal,dbn,meetings,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.ps.gz},
year = 2004
}
@inproceedings{gotoh-esca99,
author = {Y.~Gotoh and S.~Renals},
title = {Statistical annotation of named entities in spoken
audio},
booktitle = {Proc. ESCA Workshop on Accessing Information In Spoken
Audio},
pages = {43--48},
address = {Cambridge},
abstract = {In this paper we describe a stochastic finite state
model for named entity (NE) identification, based on
explicit word-level n-gram relations. NE categories are
incorporated in the model as word attributes. We
present an overview of the approach, describing how the
extensible vocabulary model may be used for NE
identification. We report development and evaluation
results on a North American Broadcast News task. This
approach resulted in average precision and recall
scores of around 83\% on hand transcribed data, and
73\% on the SPRACH recogniser output. We also present
an error analysis and a comparison of our approach with
an alternative statistical approach.},
categories = {sprach,stobs,ie,lm,bnews,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-ne.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-ne.ps.gz},
year = 1999
}
@inproceedings{wolters-is:09,
author = {Wolters, Maria and Vipperla, Ravichander and Renals,
Steve},
title = {Age Recognition for Spoken Dialogue Systems: Do We
Need It?},
booktitle = {Proc. Interspeech},
abstract = {When deciding whether to adapt relevant aspects of the
system to the particular needs of older users, spoken
dialogue systems often rely on automatic detection of
chronological age. In this paper, we show that vocal
ageing as measured by acoustic features is an
unreliable indicator of the need for adaptation. Simple
lexical features greatly improve the prediction of both
relevant aspects of cognition and interactions style.
Lexical features also boost age group prediction. We
suggest that adaptation should be based on observed
behaviour, not on chronological age, unless it is not
feasible to build classifiers for relevant adaptation
decisions.},
categories = {age recognition, spoken dialogue systems},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/is09.pdf},
year = 2009
}
@inproceedings{christensen-prosody01,
author = {H.~Christensen and Y.~Gotoh and S.~Renals},
title = {Punctuation Annotation using Statistical Prosody
Models},
booktitle = {Proc. ISCA Workshop on Prosody in Speech Recognition
and Understanding},
pages = {},
address = {Red Bank, NJ, USA},
abstract = {This paper is about the development of statistical
models of prosodic features to generate linguistic
meta-data for spoken language. In particular, we are
concerned with automatically punctuating the output of
a broadcast news speech recogniser. We present a
statistical finite state model that combines prosodic,
linguistic and punctuation class features. Experimental
results are presented using the Hub-4 Broadcast News
corpus, and in the light of our results we discuss the
issue of a suitable method of evaluating the present
task.},
categories = {stobs,ie,lm,prosody,bnews,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-punc.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-punc.ps.gz},
year = 2001
}
@inproceedings{huang2009-is,
author = {Songfang Huang and Steve Renals},
title = {A Parallel Training Algorithm for Hierarchical
{P}itman-{Y}or Process Language Models},
booktitle = {Proc. Interspeech'09},
pages = {2695--2698},
address = {Brighton, UK},
abstract = {The Hierarchical Pitman Yor Process Language Model
(HPYLM) is a Bayesian language model based on a
non-parametric prior, the Pitman-Yor Process. It has
been demonstrated, both theoretically and practically,
that the HPYLM can provide better smoothing for
language modeling, compared with state-of-the-art
approaches such as interpolated Kneser-Ney and modified
Kneser-Ney smoothing. However, estimation of Bayesian
language models is expensive in terms of both
computation time and memory; the inference is
approximate and requires a number of iterations to
converge. In this paper, we present a parallel training
algorithm for the HPYLM, which enables the approach to
be applied in the context of automatic speech
recognition, using large training corpora with large
vocabularies. We demonstrate the effectiveness of the
proposed algorithm by estimating language models from
corpora for meeting transcription containing over 200
million words, and observe significant reductions in
perplexity and word error rate.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/sh_interspeech09.pdf},
year = 2009
}
@incollection{murray2008b,
author = {Murray, Gabriel and Renals, Steve},
title = {Detecting Action Items in Meetings},
booktitle = {Machine Learning for Multimodal Interaction (Proc.
MLMI '08)},
publisher = {Springer},
number = {5237},
series = {Lecture Notes in Computer Science},
pages = {208--213},
abstract = {We present a method for detecting action items in
spontaneous meeting speech. Using a supervised approach
incorporating prosodic, lexical and structural
features, we can classify such items with a high degree
of accuracy. We also examine how well various feature
subclasses can perform this task on their own.},
doi = {10.1007/978-3-540-85853-9_19},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008b.pdf},
url = {http://dx.doi.org/10.1007/978-3-540-85853-9_19},
year = 2008
}
@inproceedings{rohwer-neuro88,
author = {R.~Rohwer and S.~Renals},
title = {Training Recurrent Networks},
booktitle = {Neural networks from models to applications (Proc.
nEuro '88)},
editor = {L.~Personnaz and G.~Dreyfus},
pages = {207--216},
address = {Paris},
publisher = {I.D.S.E.T.},
categories = {},
year = 1988
}
@article{huang2010,
author = {Huang, Songfang and Renals, Steve},
title = {Hierarchical {Bayesian} Language Models for
Conversational Speech Recognition},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {18},
number = {8},
pages = {1941--1954},
abstract = {Traditional n-gram language models are widely used in
state-of-the-art large vocabulary speech recognition
systems. This simple model suffers from some
limitations, such as overfitting of maximum-likelihood
estimation and the lack of rich contextual knowledge
sources. In this paper, we exploit a hierarchical
Bayesian interpretation for language modeling, based on
a nonparametric prior called the Pitman--Yor process.
This offers a principled approach to language model
smoothing, embedding the power-law distribution for
natural language. Experiments on the recognition of
conversational speech in multiparty meetings
demonstrate that by using hierarchical Bayesian
language models, we are able to achieve significant
reductions in perplexity and word error rate.},
doi = {10.1109/TASL.2010.2040782},
keywords = {AMI corpus , conversational speech recognition ,
hierarchical Bayesian model , language model (LM) ,
meetings , smoothing},
month = jan,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-taslp10.pdf},
url = {http://dx.doi.org/10.1109/TASL.2010.2040782},
year = 2010
}
@inproceedings{williams-eurospeech97,
author = {G.~Williams and S.~Renals},
title = {Confidence measures for hybrid {HMM/ANN} speech
recognition},
booktitle = {Proc. Eurospeech},
pages = {1955--1958},
address = {Rhodes},
abstract = {In this paper we introduce four acoustic confidence
measures which are derived from the output of a hybrid
HMM/ANN large vocabulary continuous speech recognition
system. These confidence measures, based on local
posterior probability estimates computed by an ANN, are
evaluated at both phone and word levels, using the
North American Business News corpus.},
categories = {recognition,conf,hybrid,wsj,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-conf.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-conf.ps.gz},
year = 1997
}
@inproceedings{carreira-icphs99,
author = {M.~Carreira-Perpiñán and S.~Renals},
title = {A latent-variable modelling approach to the
acoustic-to-articulatory mapping problem},
booktitle = {Proc. 14th Int. Congress of Phonetic Sciences},
pages = {2013--2016},
address = {San Francisco},
abstract = {We present a latent variable approach to the
acoustic-to-articulatory mapping problem, where
different vocal tract configurations can give rise to
the same acoustics. In latent variable modelling, the
combined acoustic and articulatory data are assumed to
have been generated by an underlying low-dimensional
process. A parametric probabilistic model is estimated
and mappings are derived from the respective
conditional distributions. This has the advantage over
other methods, such as articulatory codebooks or neural
networks, of directly addressing the nonuniqueness
problem. We demonstrate our approach with
electropalatographic and acoustic data from the ACCOR
database.},
categories = {ml,lv,artic,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icphs99.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icphs99.ps.gz},
year = 1999
}
@inproceedings{barker-icslp98,
author = {J.~Barker and G.~Williams and S.~Renals},
title = {Acoustic confidence measures for segmenting broadcast
news},
booktitle = {Proc. ICSLP},
pages = {2719--2722},
address = {Sydney},
abstract = {In this paper we define an acoustic confidence measure
based on the estimates of local posterior probabilities
produced by a HMM/ANN large vocabulary continuous
speech recognition system. We use this measure to
segment continuous audio into regions where it is and
is not appropriate to expend recognition effort. The
segmentation is computationally inexpensive and
provides reductions in both overall word error rate and
decoding time. The technique is evaluated using
material from the Broadcast News corpus.},
categories = {recognition,conf,hybrid,bnews,segmentation,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-seg.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-seg.ps.gz},
year = 1998
}
@inproceedings{renals-icassp92,
author = {S.~Renals and N.~Morgan and M.~Cohen and H.~Franco},
title = {Connectionist probability estimation in the {Decipher}
speech recognition system},
booktitle = {Proc IEEE ICASSP},
pages = {601--604},
address = {San Francisco},
categories = {},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1992/icassp92.ps.gz},
year = 1992
}
@incollection{huang2007-mlmi,
author = {Huang, Songfang and Renals, Steve},
title = {Modeling Prosodic Features in Language Models for
Meetings},
booktitle = {Machine Learning for Multimodal Interaction IV},
publisher = {Springer},
editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
volume = {4892},
series = {Lecture Notes in Computer Science},
pages = {191--202},
abstract = {Prosody has been actively studied as an important
knowledge source for speech recognition and
understanding. In this paper, we are concerned with the
question of exploiting prosody for language models to
aid automatic speech recognition in the context of
meetings. Using an automatic syllable detection
algorithm, the syllable-based prosodic features are
extracted to form the prosodic representation for each
word. Two modeling approaches are then investigated.
One is based on a factored language model, which
directly uses the prosodic representation and treats it
as a `word'. Instead of direct association, the second
approach provides a richer probabilistic structure
within a hierarchical Bayesian framework by introducing
an intermediate latent variable to represent similar
prosodic patterns shared by groups of words. Four-fold
cross-validation experiments on the ICSI Meeting Corpus
show that exploiting prosody for language modeling can
significantly reduce the perplexity, and also have
marginal reductions in word error rate.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/mlmi07.pdf},
year = 2007
}
@article{renals-sap99,
author = {S.~Renals and M.~Hochberg},
title = {Start-synchronous search for large vocabulary
continuous speech recognition},
journal = {IEEE Transactions on Speech and Audio Processing},
volume = {7},
pages = {542--553},
abstract = {In this paper, we present a novel, efficient search
strategy for large vocabulary continuous speech
recognition. The search algorithm, based on a stack
decoder framework, utilizes phone-level posterior
probability estimates (produced by a connectionist/HMM
acoustic model) as a basis for phone deactivation
pruning - a highly efficient method of reducing the
required computation. The single-pass algorithm is
naturally factored into the time-asynchronous
processing of the word sequence and the
time-synchronous processing of the HMM state sequence.
This enables the search to be decoupled from the
language model while still maintaining the
computational benefits of time-synchronous processing.
The incorporation of the language model in the search
is discussed and computationally cheap approximations
to the full language model are introduced. Experiments
were performed on the North American Business News task
using a 60,000 word vocabulary and a trigram language
model. Results indicate that the computational cost of
the search may be reduced by more than a factor of 40
with a relative search error of less than 2\% using the
techniques discussed in the paper.},
categories = {sprach,recognition,search,bnews,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/sap99-preprint.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/sap99-preprint.ps.gz},
year = 1999
}
@article{garau2008,
author = {Garau, Giulia and Renals, Steve},
title = {Combining Spectral Representations for Large
Vocabulary Continuous Speech Recognition},
journal = {IEEE Transactions on Audio, Speech, and Language
Processing},
volume = {16},
number = {3},
pages = {508--518},
abstract = {In this paper we investigate the combination of
complementary acoustic feature streams in large
vocabulary continuous speech recognition (LVCSR). We
have explored the use of acoustic features obtained
using a pitch-synchronous analysis, STRAIGHT, in
combination with conventional features such as mel
frequency cepstral coefficients. Pitch-synchronous
acoustic features are of particular interest when used
with vocal tract length normalisation (VTLN) which is
known to be affected by the fundamental frequency. We
have combined these spectral representations directly
at the acoustic feature level using heteroscedastic
linear discriminant analysis (HLDA) and at the system
level using ROVER. We evaluated this approach on three
LVCSR tasks: dictated newspaper text (WSJCAM0),
conversational telephone speech (CTS), and multiparty
meeting transcription. The CTS and meeting
transcription experiments were both evaluated using
standard NIST test sets and evaluation protocols. Our
results indicate that combining conventional and
pitch-synchronous acoustic feature sets using HLDA
results in a consistent, significant decrease in word
error rate across all three tasks. Combining at the
system level using ROVER resulted in a further
significant decrease in word error rate.},
doi = {10.1109/TASL.2008.916519},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/garau-taslp08.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4443886},
year = 2008
}
@inproceedings{llu2012map,
author = {Lu, L. and Ghoshal, A. and Renals, S.},
title = {Maximum a posteriori adaptation of subspace {Gaussian}
mixture models for cross-lingual speech recognition},
booktitle = {Proc. ICASSP},
abstract = {This paper concerns cross-lingual acoustic modeling in
the case when there are limited target language
resources. We build on an approach in which a subspace
Gaussian mixture model (SGMM) is adapted to the target
language by reusing the globally shared parameters
estimated from out-of-language training data. In
current cross-lingual systems, these parameters are
fixed when training the target system, which can give
rise to a mismatch between the source and target
systems. We investigate a maximum a posteriori (MAP)
adaptation approach to alleviate the potential
mismatch. In particular, we focus on the adaptation of
phonetic subspace parameters using a matrix variate
Gaussian prior distribution. Experiments on the
GlobalPhone corpus using the MAP adaptation approach
results in word error rate reductions, compared with
the cross-lingual baseline systems and systems updated
using maximum likelihood, for training conditions with
1 hour and 5 hours of target language data.},
keywords = {Subspace Gaussian Mixture Model, Maximum a Posteriori
Adaptation, Cross-lingual Speech Recognition},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-icassp-2012.pdf},
year = 2012
}
@inproceedings{murray06,
author = {G. Murray and S. Renals and J. Moore and J. Carletta},
title = {Incorporating Speaker and Discourse Features into
Speech Summarization},
booktitle = {Proceedings of the Human Language Technology
Conference - North American Chapter of the Association
for Computational Linguistics Meeting (HLT-NAACL) 2006},
address = {New York City, USA},
abstract = {The research presented herein explores the usefulness
of incorporating speaker and discourse features in an
automatic speech summarization system applied to
meeting recordings from the ICSI Meetings corpus. By
analyzing speaker activity, turn-taking and discourse
cues, it is hypothesized that a system can outperform
solely text-based methods inherited from the field of
text summarization. The summarization methods are
described, two evaluation methods are applied and
compared, and the results clearly show that utilizing
such features is advantageous and efficient. Even
simple methods relying on discourse cues and speaker
activity can outperform text summarization approaches.},
categories = {summarization, speech summarization, prosody, latent
semantic analysis},
month = jun,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/hlt2006-final.pdf},
year = 2006
}
@inproceedings{qin:perpinan:richmond:wrench:renals:2008a,
author = {Qin, C. and Carreira-Perpi{\~n}{\'a}n, M. and Richmond, K.
and Wrench, A. and Renals, S.},
title = {Predicting Tongue Shapes from a Few Landmark Locations},
booktitle = {Proc. Interspeech},
pages = {2306--2309},
address = {Brisbane, Australia},
abstract = {We present a method for predicting the midsagittal
tongue contour from the locations of a few landmarks
(metal pellets) on the tongue surface, as used in
articulatory databases such as MOCHA and the Wisconsin
XRDB. Our method learns a mapping using ground-truth
tongue contours derived from ultrasound data and
drastically improves over spline interpolation. We also
determine the optimal locations of the landmarks, and
the number of landmarks required to achieve a desired
prediction error: 3-4 landmarks are enough to achieve
0.3-0.2 mm error per point on the tongue.},
categories = {ultrasound, tongue contour, articulation},
key = {qin:perpinan:richmond:wrench:renals:2008a},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080929.PDF},
year = 2008
}
@inproceedings{koumpis-msdr03,
  author     = {K.~Koumpis and S.~Renals},
  title      = {Evaluation of extractive voicemail summarization},
  booktitle  = {Proc. ISCA Workshop on Multilingual Spoken Document
                Retrieval},
  pages      = {19--24},
  abstract   = {This paper is about the evaluation of a system that
                generates short text summaries of voicemail messages,
                suitable for transmission as text messages. Our
                approach to summarization is based on a
                speech-recognized transcript of the voicemail message,
                from which a set of summary words is extracted. The
                system uses a classifier to identify the summary words,
                with each word being identified by a vector of lexical
                and prosodic features. The features are selected using
                Parcel, an ROC-based algorithm. Our evaluations of the
                system, using a slot error rate metric, have compared
                manual and automatic summarization, and manual and
                automatic recognition (using two different
                recognizers). We also report on two subjective
                evaluations using mean opinion score of summaries, and
                a set of comprehension tests. The main results from
                these experiments were that the perceived difference in
                quality of summarization was affected more by errors
                resulting from automatic transcription, than by the
                automatic summarization process.},
  categories = {voicemail,summarization,prosody,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/msdr03.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/msdr03.ps.gz},
  year       = 2003
}
@inproceedings{robinson-eurospeech93,
author = {A.~J.~Robinson and L.~Almeida and J.-M.~Boite and
H.~Bourlard and F.~Fallside and M.~Hochberg and
D.~Kershaw and P.~Kohn and Y.~Konig and N.~Morgan and
J.~P.~Neto and S.~Renals and M.~Saerens and C.~Wooters},
title = {A neural network based, speaker independent, large
vocabulary, continuous speech recognition system: the
{Wernicke} project},
booktitle = {Proc. Eurospeech},
pages = {1941--1944},
address = {Berlin},
year = 1993
}
@inproceedings{jaimes2007,
author = {Jaimes, Alejandro and Bourlard, Herv{\'e} and Renals,
Steve and Carletta, Jean},
title = {Recording, Indexing, Summarizing, and Accessing
Meeting Videos: An Overview of the {AMI} Project},
booktitle = {Proc. IEEE ICIAPW},
pages = {59--64},
abstract = {In this paper we give an overview of the AMI project.
AMI developed the following: (1) an infrastructure for
recording meetings using multiple microphones and
cameras; (2) a one hundred hour, manually annotated
meeting corpus; (3) a number of techniques for
indexing, and summarizing of meeting videos using
automatic speech recognition and computer vision, and
(4) an extensible framework for browsing, and searching
of meeting videos. We give an overview of the various
techniques developed in AMI, their integration into our
meeting browser framework, and future plans for AMIDA
(Augmented Multiparty Interaction with Distant Access),
the follow-up project to AMI.},
doi = {10.1109/ICIAPW.2007.36},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/jaimes2007.pdf},
url = {http://ieeexplore.ieee.org/search/srchabstract.jsp?arnumber=4427477&isnumber=4427459&punumber=4427458&k2dockey=4427477@ieeecnfs&query=%28+%28%28renals%29%3Cin%3Eau+%29+%29+%3Cand%3E+%28pyr+%3E%3D+2006+%3Cand%3E+pyr+%3C%3D+2008%29&pos=6&access=no},
year = 2007
}
@inproceedings{abberley-trec98,
  author     = {D.~Abberley and S.~Renals and G.~Cook and T.~Robinson},
  title      = {The 1997 {THISL} spoken document retrieval system},
  booktitle  = {Proc. Sixth Text Retrieval Conference (TREC--6)},
  pages      = {747--752},
  abstract   = {The THISL spoken document retrieval system is based on
                the Abbot Large Vocabulary Continuous Speech
                Recognition (LVCSR) system developed by Cambridge
                University, Sheffield University and SoftSound, and
                uses PRISE (NIST) for indexing and retrieval. We
                participated in full SDR mode. Our approach was to
                transcribe the spoken documents at the word level using
                Abbot, indexing the resulting text transcriptions using
                PRISE. The LVCSR system uses a recurrent network-based
                acoustic model (with no adaptation to different
                conditions) trained on the 50 hour Broadcast News
                training set, a 65,000 word vocabulary and a trigram
                language model derived from Broadcast News text. Words
                in queries which were out-of-vocabulary (OOV) were word
                spotted at query time (utilizing the posterior phone
                probabilities output by the acoustic model), added to
                the transcriptions of the relevant documents and the
                collection was then re-indexed. We generated
                pronunciations at run-time for OOV words using the
                Festival TTS system (University of Edinburgh).},
  categories = {thisl,bnews,trec,ir,recognition,eval,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/trec6.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/trec6.ps.gz},
  year       = 1998
}
@article{robinson-specom02,
  author     = {A.~J.~Robinson and G.~D.~Cook and D.~P.~W.~Ellis and
                E.~Fosler-Lussier and S.~J.~Renals and
                D.~A.~G.~Williams},
  title      = {Connectionist Speech Recognition of Broadcast News},
  journal    = {Speech Communication},
  volume     = {37},
  pages      = {27--45},
  abstract   = {This paper describes connectionist techniques for
                recognition of Broadcast News. The fundamental
                difference between connectionist systems and more
                conventional mixture-of-Gaussian systems is that
                connectionist models directly estimate posterior
                probabilities as opposed to likelihoods. Access to
                posterior probabilities has enabled us to develop a
                number of novel approaches to confidence estimation,
                pronunciation modelling and search. In addition we have
                investigated a new feature extraction technique based
                on the modulation-filtered spectrogram, and methods for
                combining multiple information sources. We have
                incorporated all of these techniques into a system for
                the transcription of Broadcast News, and we present
                results on the 1998 DARPA Hub-4E Broadcast News
                evaluation data.},
  categories = {sprach,bnews,recognition,am,hybrid,abbot,lm,search,pron,eval,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/specom02-preprint.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/specom02-preprint.ps.gz},
  year       = 2002
}
@inproceedings{renals-eurospeech93,
author = {S.~Renals and D.~MacKay},
title = {Bayesian regularisation methods in a hybrid {MLP--HMM}
system},
booktitle = {Proc. Eurospeech},
pages = {1719--1722},
address = {Berlin},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1993/eurosp93-bayes.ps.gz},
year = 1993
}
@incollection{renals-nips92,
author = {S.~Renals and H.~Bourlard and N.~Morgan and H.~Franco
and M.~Cohen},
title = {Connectionist optimisation of tied mixture hidden
{Markov} models},
booktitle = {Advances in Neural Information Processing Systems},
publisher = {Morgan-Kaufmann},
editor = {J.~E.~Moody and S.~J.~Hanson and R.~P.~Lippmann},
volume = {4},
pages = {167--174},
year = 1992
}
@inproceedings{renals-icassp03,
  author     = {S.~Renals and D.~Ellis},
  title      = {Audio information access from meeting rooms},
  booktitle  = {Proc. IEEE ICASSP},
  volume     = {4},
  pages      = {744--747},
  abstract   = {We investigate approaches to accessing information
                from the streams of audio data that result from
                multi-channel recordings of meetings. The methods
                investigated use word-level transcriptions, and
                information derived from models of speaker activity and
                speaker turn patterns. Our experiments include spoken
                document retrieval for meetings, automatic structuring
                of meetings based on self-similarity matrices of
                speaker turn patterns and a simple model of speaker
                activity. Meeting recordings are rich in both lexical
                and non-lexical information; our results illustrate
                some novel kinds of analysis made possible by a
                transcribed corpus of natural meetings.},
  categories = {m4,multimodal,ir,meetings,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-mtg.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-mtg.ps.gz},
  year       = 2003
}
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
author = {Cabral, J. and Renals, S. and Richmond, K. and
Yamagishi, J.},
title = {Glottal Spectral Separation for Parametric Speech
Synthesis},
booktitle = {Proc. Interspeech},
pages = {1829--1832},
address = {Brisbane, Australia},
abstract = {This paper presents a method to control the
characteristics of synthetic speech flexibly by
integrating articulatory features into a Hidden Markov
Model (HMM)-based parametric speech synthesis system.
In contrast to model adaptation and interpolation
approaches for speaking style control, this method is
driven by phonetic knowledge, and target speech samples
are not required. The joint distribution of parallel
acoustic and articulatory features considering
cross-stream feature dependency is estimated. At
synthesis time, acoustic and articulatory features are
generated simultaneously based on the
maximum-likelihood criterion. The synthetic speech can
be controlled flexibly by modifying the generated
articulatory features according to arbitrary phonetic
rules in the parameter generation process. Our
experiments show that the proposed method is effective
in both changing the overall character of synthesized
speech and in controlling the quality of a specific
vowel.},
categories = {HMM speech synthesis, Glottal Spectral Separation,
LF-model},
internal-note = {NOTE(review): the abstract describes articulatory-feature
control of an HMM synthesiser, which does not obviously match the
glottal spectral separation title -- verify against the published paper},
key = {cabral:renals:richmond:yamagishi:2008a},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
year = 2008
}
@inproceedings{pietquin-icassp02,
author = {O.~Pietquin and S.~Renals},
title = {{ASR} system modeling for automatic evaluation and
optimization of dialogue systems},
booktitle = {Proc. IEEE ICASSP},
pages = {46--49},
abstract = {Though the field of spoken dialogue systems has
developed quickly in the last decade, rapid design of
dialogue strategies remains uneasy. Several approaches
to the problem of automatic strategy learning have been
proposed and the use of Reinforcement Learning
introduced by Levin and Pieraccini is becoming part of
the state of the art in this area. However, the quality
of the strategy learned by the system depends on the
definition of the optimization criterion and on the
accuracy of the environment model. In this paper, we
propose to bring a model of an ASR system in the
simulated environment in order to enhance the learned
strategy. To do so, we introduced recognition error
rates and confidence levels produced by ASR systems in
the optimization criterion.},
categories = {dialog,rl,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/icassp02-rl.pdf},
year = 2002
}
@inproceedings{kershaw-icslp96,
author = {D.~Kershaw and T.~Robinson and S.~Renals},
title = {The 1995 {Abbot} {LVCSR} system for multiple unknown
microphones},
booktitle = {Proc. ICSLP},
pages = {1325--1328},
address = {Philadelphia PA},
categories = {wernicke,sprach,wsj,recognition,am,hybrid,abbot,search,eval,sheffield},
year = 1996
}
@inproceedings{NistevalAMI05,
  author     = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                Karafiat and M. Lincoln and I. McCowan and D. Moore and
                V. Wan and R. Ordelman and S. Renals},
  title      = {The 2005 {AMI} System for the transcription of Speech
                in Meetings},
  booktitle  = {Proceedings of the Rich Transcription 2005 Spring
                Meeting Recognition Evaluation},
  abstract   = {In this paper we describe the 2005 AMI system for the
                transcription of speech in meetings used in the 2005
                NIST RT evaluations. The system was designed for
                participation in the speech to text part of the
                evaluations, in particular for transcription of speech
                recorded with multiple distant microphones and
                independent headset microphones. System performance was
                tested on both conference room and lecture style
                meetings. Although input sources are processed using
                different frontends, the recognition process is based
                on a unified system architecture. The system operates
                in multiple passes and makes use of state of the art
                technologies such as discriminative training, vocal
                tract length normalisation, heteroscedastic linear
                discriminant analysis, speaker adaptation with maximum
                likelihood linear regression and minimum word error
                rate decoding. In this paper we describe the system
                performance on the official development and test sets
                for the NIST RT05s evaluations. The system was jointly
                developed in less than 10 months by a multi-site team
                and was shown to achieve competitive performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT05S},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIasr.nist2005.pdf},
  year       = 2005
}
@inproceedings{cuayahuitletal_slt06,
author = {Heriberto Cuay{\'a}huitl and Steve Renals and Oliver
Lemon and Hiroshi Shimodaira},
title = {Reinforcement Learning of Dialogue Strategies With
Hierarchical Abstract Machines},
booktitle = {Proc. of IEEE/ACL Workshop on Spoken Language
Technology (SLT)},
abstract = {In this paper we propose partially specified dialogue
strategies for dialogue strategy optimization, where
part of the strategy is specified deterministically and
the rest optimized with Reinforcement Learning (RL). To
do this we apply RL with Hierarchical Abstract Machines
(HAMs). We also propose to build simulated users using
HAMs, incorporating a combination of hierarchical
deterministic and probabilistic behaviour. We performed
experiments using a single-goal flight booking dialogue
system, and compare two dialogue strategies
(deterministic and optimized) using three types of
simulated user (novice, experienced and expert). Our
results show that HAMs are promising for both dialogue
optimization and simulation, and provide evidence that
indeed partially specified dialogue strategies can
outperform deterministic ones (on average 4.7 fewer
system turns) with faster learning than the traditional
RL framework.},
categories = {reinforcement learning, spoken dialogue systems},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/ham-slt2006.pdf},
year = 2006
}
@inproceedings{bell12_mlan,
  author    = {Bell, P. and Gales, M. and Lanchantin, P. and Liu, X.
               and Long, Y. and Renals, S. and Swietojanski, P. and
               Woodland, P.},
  title     = {Transcription of multi-genre media archives using
               out-of-domain data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address   = {Miami, Florida, USA},
  abstract  = {We describe our work on developing a speech
               recognition system for multi-genre media archives. The
               high diversity of the data makes this a challenging
               recognition task, which may benefit from systems
               trained on a combination of in-domain and out-of-domain
               data. Working with tandem HMMs, we present Multi-level
               Adaptive Networks (MLAN), a novel technique for
               incorporating information from out-of-domain posterior
               features using deep neural networks. We show that it
               provides a substantial reduction in WER over other
               systems, with relative WER reductions of 15\% over a
               PLP baseline, 9\% over in-domain tandem features and
               8\% over the best out-of-domain tandem features.},
  month     = dec,
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/mlan_slt2012.pdf},
  year      = 2012
}
@inproceedings{renals2007,
author = {Renals, Steve and Hain, Thomas and Bourlard, Herv{\'e}},
title = {Recognition and interpretation of meetings: The {AMI}
and {AMIDA} projects},
booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
and Understanding (ASRU '07)},
abstract = {The AMI and AMIDA projects are concerned with the
recognition and interpretation of multiparty meetings.
Within these projects we have: developed an
infrastructure for recording meetings using multiple
microphones and cameras; released a 100 hour annotated
corpus of meetings; developed techniques for the
recognition and interpretation of meetings based
primarily on speech recognition and computer vision;
and developed an evaluation framework at both component
and system levels. In this paper we present an overview
of these projects, with an emphasis on speech
recognition and content extraction.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ami-asru2007.pdf},
year = 2007
}
@inproceedings{dielmann-icassp07,
author = {A. Dielmann and S. Renals},
title = {{DBN} based joint Dialogue Act recognition of
multiparty meetings},
booktitle = {Proc. IEEE ICASSP},
volume = 4,
pages = {133--136},
abstract = {Joint Dialogue Act segmentation and classification of
the new {AMI} meeting corpus has been performed through
an integrated framework based on a switching dynamic
{Bayesian} network and a set of continuous features and
language models. The recognition process is based on a
dictionary of 15 {DA} classes tailored for group
decision-making. Experimental results show that a novel
interpolated Factored Language Model results in a low
error rate on the automatic segmentation task, and thus
good recognition results can be achieved on {AMI}
multiparty conversational speech.},
categories = {ami,dialogue act,dbn,factored language
model,meetings,edinburgh},
month = apr,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-icassp07.pdf},
year = 2007
}
@inproceedings{kolluru-asru03,
author = {B. Kolluru and H. Christensen and Y. Gotoh and S.
Renals},
title = {Exploring the style-technique interaction in
extractive summarization of broadcast news},
booktitle = {Proc. IEEE Automatic Speech Recognition and
Understanding Workshop},
abstract = {In this paper we seek to explore the interaction
between the style of a broadcast news story and its
summarization technique. We report the performance of
three different summarization techniques on broadcast
news stories, which are split into planned speech and
spontaneous speech. The initial results indicate that
some summarization techniques work better for the
documents with spontaneous speech than for those with
planned speech. Even for human beings some documents
are inherently difficult to summarize. We observe this
correlation between degree of difficulty in summarizing
and performance of the three automatic summarizers.
Given the high frequency of named entities in broadcast
news and even greater number of references to these
named entities, we also gauge the effect of named
entity and coreference resolution in a news story, on
the performance of these summarizers.},
categories = {s3l,summarization,bnews,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-style.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-style.ps.gz},
year = 2003
}
@inproceedings{cabral_ssw7,
author = {Cabral, Jo{\~a}o and Renals, Steve and Richmond, Korin
and Yamagishi, Junichi},
title = {Transforming Voice Source Parameters in a {HMM}-based
Speech Synthesiser with Glottal Post-Filtering},
booktitle = {Proc. 7th ISCA Speech Synthesis Workshop (SSW7)},
pages = {365--370},
address = {NICT/ATR, Kyoto, Japan},
abstract = {Control over voice quality, e.g. breathy and tense
voice, is important for speech synthesis applications.
For example, transformations can be used to modify
aspects of the voice related to speaker's identity
and to improve expressiveness. However, it is hard to
modify voice characteristics of the synthetic speech,
without degrading speech quality. State-of-the-art
statistical speech synthesisers, in particular, do not
typically allow control over parameters of the
glottal source, which are strongly correlated with
voice quality. Consequently, the control of voice
characteristics in these systems is limited. In
contrast, the HMM-based speech synthesiser proposed in
this paper uses an acoustic glottal source model. The
system passes the glottal signal through a whitening
filter to obtain the excitation of voiced sounds. This
technique, called glottal post-filtering, allows to
transform voice characteristics of the synthetic speech
by modifying the source model parameters. We evaluated
the proposed synthesiser in a perceptual experiment,
in terms of speech naturalness, intelligibility, and
similarity to the original speaker's voice. The results
show that it performed as well as a HMM-based
synthesiser, which generates the speech signal with a
commonly used high-quality speech vocoder.},
keywords = {HMM-based speech synthesis, voice quality, glottal
post-filter},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/cabral_ssw7.pdf},
year = 2010
}
@inproceedings{gotoh-eurospeech97,
author = {Y.~Gotoh and S.~Renals},
title = {Document space models using latent semantic analysis},
booktitle = {Proc. Eurospeech},
pages = {1443--1446},
address = {Rhodes},
abstract = {In this paper, an approach for constructing mixture
language models (LMs) based on some notion of semantics
is discussed. To this end, a technique known as latent
semantic analysis (LSA) is used. The approach
encapsulates corpus-derived semantic information and is
able to model the varying style of the text. Using such
information, the corpus texts are clustered in an
unsupervised manner and mixture LMs are automatically
created. This work builds on previous work in the field
of information retrieval which was recently applied by
Bellegarda et al. to the problem of clustering words
by semantic categories. The principal contribution of
this work is to characterize the document space
resulting from the LSA modeling and to demonstrate the
approach for mixture LM application. Comparison is made
between manual and automatic clustering in order to
elucidate how the semantic information is expressed in
the space. It is shown that, using semantic
information, mixture LMs perform better than a
conventional single LM with slight increase of
computational cost.},
categories = {sprach,lm,bnc,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-lsa.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/eurosp97-lsa.ps.gz},
year = 1997
}
@inproceedings{vipperla2010a,
author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
Joe},
title = {Augmentation of adaptation data},
booktitle = {Proc. Interspeech},
pages = {530--533},
address = {Makuhari, Japan},
abstract = {Linear regression based speaker adaptation approaches
can improve Automatic Speech Recognition (ASR) accuracy
significantly for a target speaker. However, when the
available adaptation data is limited to a few seconds,
the accuracy of the speaker adapted models is often
worse compared with speaker independent models. In this
paper, we propose an approach to select a set of
reference speakers acoustically close to the target
speaker whose data can be used to augment the
adaptation data. To determine the acoustic similarity
of two speakers, we propose a distance metric based on
transforming sample points in the acoustic space with
the regression matrices of the two speakers. We show
the validity of this approach through a speaker
identification task. ASR results on SCOTUS and AMI
corpora with limited adaptation data of 10 to 15
seconds augmented by data from selected reference
speakers show a significant improvement in Word Error
Rate over speaker independent and speaker adapted
models.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-is2010.pdf},
year = 2010
}
@inproceedings{terry-icassp88,
author = {M.~Terry and S.~Renals and R.~Rohwer and J.~Harrington},
title = {A connectionist approach to speech recognition using
peripheral auditory modelling},
booktitle = {Proc. IEEE ICASSP},
pages = {699--702},
address = {New York},
year = 1988
}
@inproceedings{williams-icslp98,
  author     = {G.~Williams and S.~Renals},
  title      = {Confidence measures derived from an acceptor {HMM}},
  booktitle  = {Proc. ICSLP},
  pages      = {831--834},
  address    = {Sydney},
  abstract   = {In this paper we define a number of confidence
                measures derived from an acceptor HMM and evaluate
                their performance for the task of utterance
                verification using the North American Business News
                (NAB) and Broadcast News (BN) corpora. Results are
                presented for decodings made at both the word and phone
                level which show the relative profitability of
                rejection provided by the diverse set of confidence
                measures. The results indicate that language model
                dependent confidence measures have reduced performance
                on BN data relative to that for the more grammatically
                constrained NAB data. An explanation linking the
                observations that rejection is more profitable for
                noisy acoustics, for a reduced vocabulary and at the
                phone level is also given.},
  categories = {recognition,conf,hybrid,bnews,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-conf.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/icslp98-conf.ps.gz},
  year       = 1998
}
@inproceedings{renals-icassp91,
author = {S.~Renals and D.~McKelvie and F.~McInnes},
title = {A comparative study of continuous speech recognition
using neural networks and hidden {Markov} models},
booktitle = {Proc. IEEE ICASSP},
pages = {369--372},
address = {Toronto},
year = 1991
}
@inproceedings{koumpis-icslp00,
author = {K.~Koumpis and S.~Renals},
title = {Transcription and Summarization of Voicemail Speech},
booktitle = {Proc. ICSLP},
volume = {2},
pages = {688--691},
address = {Beijing},
abstract = {This paper describes the development of a system to
transcribe and summarize voicemail messages. The
results of the research presented in this paper are
two-fold. First, a hybrid connectionist approach to the
Voicemail transcription task shows that competitive
performance can be achieved using a context-independent
system with fewer parameters than those based on
mixtures of Gaussian likelihoods. Second, an effective
and robust combination of statistical with prior
knowledge sources for term weighting is used to extract
information from the decoder's output in order to
deliver summaries to the message recipients via a GSM
Short Message Service (SMS) gateway.},
categories = {voicemail,summarization,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icslp00.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icslp00.ps.gz},
year = 2000
}
@inproceedings{gotoh-icassp99,
author = {Y.~Gotoh and S.~Renals and G.~Williams},
title = {Named entity tagged language models},
booktitle = {Proc IEEE ICASSP},
pages = {513--516},
address = {Phoenix AZ},
abstract = {We introduce Named Entity (NE) Language Modelling, a
stochastic finite state machine approach to identifying
both words and NE categories from a stream of spoken
data. We provide an overview of our approach to NE
tagged language model (LM) generation together with
results of the application of such a LM to the task of
out-of-vocabulary (OOV) word reduction in large
vocabulary speech recognition. Using the Wall Street
Journal and Broadcast News corpora, it is shown that
the tagged LM was able to reduce the overall word error
rate by 14\%, detecting up to 70\% of previously OOV
words. We also describe an example of the direct
tagging of spoken data with NE categories.},
categories = {sprach,ie,lm,bnews,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icassp99.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/icassp99.ps.gz},
year = 1999
}
@inproceedings{huang2007-asru,
author = {Huang, Songfang and Renals, Steve},
title = {Hierarchical {Pitman-Yor} Language Models for {ASR} in
Meetings},
booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
and Understanding (ASRU'07)},
pages = {124--129},
address = {Kyoto, Japan},
abstract = {In this paper we investigate the application of a
novel technique for language modeling --- a
hierarchical Bayesian language model (LM) based on the
Pitman-Yor process --- on automatic speech recognition
(ASR) for multiparty meetings. The hierarchical
Pitman-Yor language model (HPYLM), which was originally
proposed in the machine learning field, provides a
Bayesian interpretation to language modeling. An
approximation to the HPYLM recovers the exact
formulation of the interpolated Kneser-Ney smoothing
method in n-gram models. This paper focuses on the
application and scalability of HPYLM on a practical
large vocabulary ASR system. Experimental results on
NIST RT06s evaluation meeting data verify that HPYLM is
a competitive and promising language modeling
technique, which consistently performs better than
interpolated Kneser-Ney and modified Kneser-Ney n-gram
LMs in terms of both perplexity (PPL) and word error
rate (WER).},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/asru07.pdf},
year = 2007
}
@inproceedings{huang2010a,
author = {Huang, Songfang and Renals, Steve},
title = {Power Law Discounting for N-Gram Language Models},
booktitle = {Proc. IEEE ICASSP--10},
pages = {5178--5181},
abstract = {We present an approximation to the Bayesian
hierarchical Pitman-Yor process language model which
maintains the power law distribution over word tokens,
while not requiring a computationally expensive
approximate inference process. This approximation,
which we term power law discounting, has a similar
computational complexity to interpolated and modified
Kneser-Ney smoothing. We performed experiments on
meeting transcription using the NIST RT06s evaluation
data and the AMI corpus, with a vocabulary of 50,000
words and a language model training set of up to 211
million words. Our results indicate that power law
discounting results in statistically significant
reductions in perplexity and word error rate compared
to both interpolated and modified Kneser-Ney smoothing,
while producing similar results to the hierarchical
Pitman-Yor process language model.},
doi = {10.1109/ICASSP.2010.5495007},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-icassp10.pdf},
url = {http://dx.doi.org/10.1109/ICASSP.2010.5495007},
year = 2010
}
@inproceedings{renals-eurospeech99,
author = {S.~Renals and Y.~Gotoh},
title = {Integrated transcription and identification of named
entities in broadcast speech},
booktitle = {Proc. Eurospeech},
pages = {1039--1042},
address = {Budapest},
abstract = {This paper presents an approach to integrating
functions for both transcription and named entity (NE)
identification into a large vocabulary continuous
speech recognition system. It builds on NE tagged
language modelling approach, which was recently applied
for development of the statistical NE annotation
system. We also present results for proper name
identification experiment using the Hub-4E open
evaluation data.},
categories = {sprach,stobs,ie,lm,bnews,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-ne.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/eurospeech99-ne.ps.gz},
year = 1999
}
@inproceedings{renals-icassp95,
author = {S.~Renals and M.~Hochberg},
title = {Efficient search using posterior phone probability
estimates},
booktitle = {Proc IEEE ICASSP},
pages = {596--599},
address = {Detroit},
abstract = {In this paper we present a novel, efficient search
strategy for large vocabulary continuous speech
recognition (LVCSR). The search algorithm, based on
stack decoding, uses posterior phone probability
estimates to substantially increase its efficiency with
minimal effect on accuracy. In particular, the search
space is dramatically reduced by phone deactivation
pruning where phones with a small local posterior
probability are deactivated. This approach is
particularly well-suited to hybrid connectionist/hidden
Markov model systems because posterior phone
probabilities are directly computed by the acoustic
model. On large vocabulary tasks, using a trigram
language model, this increased the search speed by an
order of magnitude, with 2\% or less relative search
error. Results from a hybrid system are presented using
the Wall Street Journal LVCSR database for a 20,000
word task using a backed-off trigram language model.
For this task, our single-pass decoder took around 15
times realtime on an HP735 workstation. At the cost of
7\% relative search error, decoding time can be speeded
up to approximately realtime.},
categories = {wernicke,recognition,wsj,search,sheffield,cambridge},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/icassp95-search.ps.gz},
year = 1995
}
@inproceedings{wolters2010,
author = {Wolters, Maria K. and Isaac, Karl B. and Renals, Steve},
title = {Evaluating speech synthesis intelligibility using
{Amazon Mechanical Turk}},
booktitle = {Proc. 7th Speech Synthesis Workshop (SSW7)},
pages = {136--141},
abstract = {Microtask platforms such as Amazon Mechanical Turk
(AMT) are increasingly used to create speech and
language resources. AMT in particular allows
researchers to quickly recruit a large number of fairly
demographically diverse participants. In this study, we
investigated whether AMT can be used for comparing the
intelligibility of speech synthesis systems. We
conducted two experiments in the lab and via AMT, one
comparing US English diphone to US English
speaker-adaptive HTS synthesis and one comparing UK
English unit selection to UK English speaker-dependent
HTS synthesis. While AMT word error rates were worse
than lab error rates, AMT results were more sensitive
to relative differences between systems. This is mainly
due to the larger number of listeners. Boxplots and
multilevel modelling allowed us to identify listeners
who performed particularly badly, while thresholding
was sufficient to eliminate rogue workers. We conclude
that AMT is a viable platform for synthetic speech
intelligibility comparisons.},
categories = {intelligibility, evaluation, semantically
unpredictable sentences, diphone, unit selection,
crowd-sourcing, Mechanical Turk, HMM-based synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wolters-ssw2010.pdf},
year = 2010
}
@inproceedings{gotoh-icassp00,
author = {Y.~Gotoh and S.~Renals},
title = {Variable word rate n-grams},
booktitle = {Proc IEEE ICASSP},
pages = {1591--1594},
address = {Istanbul},
abstract = {The rate of occurrence of words is not uniform but
varies from document to document. Despite this
observation, parameters for conventional n-gram
language models are usually derived using the
assumption of a constant word rate. In this paper we
investigate the use of variable word rate assumption,
modelled by a Poisson distribution or a continuous
mixture of Poissons. We present an approach to
estimating the relative frequencies of words or n-grams
taking prior information of their occurrences into
account. Discounting and smoothing schemes are also
considered. Using the Broadcast News task, the approach
demonstrates a reduction of perplexity up to 10\%.},
categories = {stobs,lm,bnews,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icassp2000.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icassp2000.ps.gz},
year = 2000
}
@inproceedings{renals-fase88,
author = {S.~Renals and R.~Rohwer and M.~Terry},
title = {A comparison of speech recognition front ends using a
connectionist classifier},
booktitle = {Proc. FASE Speech '88},
pages = {1381--1388},
address = {Edinburgh},
year = 1988
}
@article{wrigley-sap05,
author = {S. J. Wrigley and G. J. Brown and V. Wan and S. Renals},
title = {Speech and crosstalk detection in multi-channel audio},
journal = {IEEE Trans. on Speech and Audio Processing},
volume = {13},
pages = {84--91},
abstract = {The analysis of scenarios in which a number of
microphones record the activity of speakers, such as in
a roundtable meeting, presents a number of
computational challenges. For example, if each
participant wears a microphone, it can receive speech
from both the microphone's wearer (local speech) and
from other participants (crosstalk). The recorded audio
can be broadly classified in four ways: local speech,
crosstalk plus local speech, crosstalk alone and
silence. We describe two experiments related to the
automatic classification of audio into these four
classes. The first experiment attempted to optimise a
set of acoustic features for use with a Gaussian
mixture model (GMM) classifier. A large set of
potential acoustic features were considered, some of
which have been employed in previous studies. The
best-performing features were found to be kurtosis,
fundamentalness and cross-correlation metrics. The
second experiment used these features to train an
ergodic hidden Markov model classifier. Tests performed
on a large corpus of recorded meetings show
classification accuracies of up to 96\%, and automatic
speech recognition performance close to that obtained
using ground truth segmentation.},
categories = {m4,meetings,edinburgh,asr,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap04-xtalk.pdf},
year = 2005
}
@article{yamagishi2009,
author = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga
and Ling, Zhenhua and Toda, Tomoki and Tokuda, Keiichi
and King, Simon and Renals, Steve},
title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech
Synthesis},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {17},
number = {6},
pages = {1208--1230},
abstract = {This paper describes a speaker-adaptive HMM-based
speech synthesis system. The new system, called
``HTS-2007,'' employs speaker adaptation (CSMAPLR+MAP),
feature-space adaptive training, mixed-gender modeling,
and full-covariance modeling using CSMAPLR transforms,
in addition to several other techniques that have
proved effective in our previous systems. Subjective
evaluation results show that the new system generates
significantly better quality synthetic speech than
speaker-dependent approaches with realistic amounts of
speech data, and that it bears comparison with
speaker-dependent approaches even when large amounts of
speech data are available. In addition, a comparison
study with several speech synthesis techniques shows
the new system is very robust: It is able to build
voices from less-than-ideal speech data and synthesize
good-quality speech even for out-of-domain sentences.},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5109758&arnumber=5153555&count=14&index=12},
year = 2009
}
@inproceedings{zwyssig2012determining,
author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
title = {Determining the number of speakers in a meeting using
microphone array features},
booktitle = {2012 IEEE International Conference on Acoustics,
Speech and Signal Processing (ICASSP)},
pages = {4765--4768},
year = 2012
}
@inproceedings{neto-eurospeech95,
author = {J.~Neto and L.~Almeida and M.~Hochberg and C.~Martins
and L.~Nunes and S.~Renals and T.~Robinson},
title = {Speaker adaptation for hybrid {HMM--ANN} continuous
speech recognition system},
booktitle = {Proc. Eurospeech},
pages = {2171--2174},
address = {Madrid},
abstract = {It is well known that recognition performance degrades
significantly when moving from a speaker- dependent to
a speaker-independent system. Traditional hidden Markov
model (HMM) systems have successfully applied
speaker-adaptation approaches to reduce this
degradation. In this paper we present and evaluate some
techniques for speaker-adaptation of a hybrid
HMM-artificial neural network (ANN) continuous speech
recognition system. These techniques are applied to a
well trained, speaker-independent, hybrid HMM-ANN
system and the recognizer parameters are adapted to a
new speaker through off-line procedures. The techniques
are evaluated on the DARPA RM corpus using varying
amounts of adaptation material and different ANN
architectures. The results show that speaker-adaptation
within the hybrid framework can substantially improve
system performance.},
categories = {wernicke,rm,recognition,am,hybrid,adaptation,sheffield,cambridge},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/eurosp95.ps.gz},
year = 1995
}
@incollection{renals2010,
author = {Renals, Steve and King, Simon},
title = {Automatic Speech Recognition},
booktitle = {Handbook of Phonetic Sciences},
publisher = {Wiley Blackwell},
editor = {Hardcastle, William J. and Laver, John and Gibbon,
Fiona E.},
chapter = {22},
year = 2010
}
@article{christensen2008,
author = {Christensen, Heidi and Gotoh, Yoshihiko and Renals,
Steve},
title = {A Cascaded Broadcast News Highlighter},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {16},
pages = {151--161},
abstract = {This paper presents a fully automatic news skimming
system which takes a broadcast news audio stream and
provides the user with the segmented, structured and
highlighted transcript. This constitutes a system with
three different, cascading stages: converting the audio
stream to text using an automatic speech recogniser,
segmenting into utterances and stories and finally
determining which utterance should be highlighted using
a saliency score. Each stage must operate on the
erroneous output from the previous stage in the system;
an effect which is naturally amplified as the data
progresses through the processing stages. We present a
large corpus of transcribed broadcast news data
enabling us to investigate to which degree information
worth highlighting survives this cascading of
processes. Both extrinsic and intrinsic experimental
results indicate that mistakes in the story boundary
detection have a strong impact on the quality of
highlights, whereas erroneous utterance boundaries
cause only minor problems. Further, the difference in
transcription quality does not affect the overall
performance greatly.},
doi = {10.1109/TASL.2007.910746},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/christensen-tasl08.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4407525&arnumber=4383075&count=28&index=16},
year = 2008
}
@inproceedings{koumpis-eurospeech03,
author = {K.~Koumpis and S.~Renals},
title = {Multi-class Extractive Voicemail Summarization},
booktitle = {Proc. Eurospeech},
pages = {2785--2788},
abstract = {This paper is about a system that extracts principal
content words from speech-recognized transcripts of
voicemail messages and classifies them into proper
names, telephone numbers, dates/times and `other'. The
short text summaries generated are suitable for mobile
messaging applications. The system uses a set of
classifiers to identify the summary words, with each
word being identified by a vector of lexical and
prosodic features. The features are selected using
Parcel, an ROC-based algorithm. We visually compare the
role of a large number of individual features and
discuss effective ways to combine them. We finally
evaluate their performance on manual and automatic
transcriptions derived from two different speech
recognition systems.},
categories = {voicemail,summarization,prosody,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/eurospeech03-voicemail.pdf},
year = 2003
}
@incollection{huang2008-mlmi,
author = {Huang, Songfang and Renals, Steve},
title = {Modeling Topic and Role Information in Meetings using
the Hierarchical {D}irichlet Process},
booktitle = {Machine Learning for Multimodal Interaction V},
publisher = {Springer},
editor = {Popescu-Belis, A. and Stiefelhagen, R.},
volume = {5237},
series = {Lecture Notes in Computer Science},
pages = {214--225},
abstract = {In this paper, we address the modeling of topic and
role information in multiparty meetings, via a
nonparametric Bayesian model called the hierarchical
Dirichlet process. This model provides a powerful
solution to topic modeling and a flexible framework for
the incorporation of other cues such as speaker role
information. We present our modeling framework for
topic and role on the AMI Meeting Corpus, and
illustrate the effectiveness of the approach in the
context of adapting a baseline language model in a
large-vocabulary automatic speech recognition system
for multiparty meetings. The adapted LM produces
significant improvements in terms of both perplexity
and word error rate.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/mlmi08.pdf},
year = 2008
}
@inproceedings{wan-icassp02,
author = {V.~Wan and S.~Renals},
title = {Evaluation of Kernel Methods for Speaker Verification
and Identification},
booktitle = {Proc IEEE ICASSP},
pages = {669--672},
abstract = {Support vector machines are evaluated on speaker
verification and speaker identification tasks. We
compare the polynomial kernel, the Fisher kernel, a
likelihood ratio kernel and the pair hidden Markov
model kernel with baseline systems based on a
discriminative polynomial classifier and generative
Gaussian mixture model classifiers. Simulations were
carried out on the YOHO database and some promising
results were obtained.},
categories = {verification,kernel,svm,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/icassp02-svm.pdf},
year = 2002
}
@article{vipperla2010,
author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
Joe},
title = {Ageing voices: The effect of changes in voice
parameters on {ASR} performance},
journal = {EURASIP Journal on Audio, Speech, and Music Processing},
abstract = {With ageing, human voices undergo several changes
which are typically characterized by increased
hoarseness and changes in articulation patterns. In
this study, we have examined the effect on Automatic
Speech Recognition (ASR) and found that the Word Error
Rates (WER) on older voices is about 9\% absolute
higher compared to those of adult voices. Subsequently,
we compared several voice source parameters including
fundamental frequency, jitter, shimmer, harmonicity and
cepstral peak prominence of adult and older males.
Several of these parameters show statistically
significant difference for the two groups. However,
artificially increasing jitter and shimmer measures do
not affect the ASR accuracies significantly.
Artificially lowering the fundamental frequency
degrades the ASR performance marginally but this drop
in performance can be overcome to some extent using
Vocal Tract Length Normalisation (VTLN). Overall, we
observe that the changes in the voice source parameters
do not have a significant impact on ASR performance.
Comparison of the likelihood scores of all the phonemes
for the two age groups show that there is a systematic
mismatch in the acoustic space of the two age groups.
Comparison of the phoneme recognition rates show that
mid vowels, nasals and phonemes that depend on the
ability to create constrictions with tongue tip for
articulation are more affected by ageing than other
phonemes.},
doi = {10.1155/2010/525783},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-eurasip10.pdf},
url = {http://dx.doi.org/10.1155/2010/525783},
year = 2010
}
@inproceedings{cabral_yrwst,
author = {J. Cabral and S. Renals and K. Richmond and J.
Yamagishi},
title = {{HMM}-based Speech Synthesis with an Acoustic Glottal
Source Model},
booktitle = {Proc. The First Young Researchers Workshop in Speech
Technology},
abstract = {A major cause of degradation of speech quality in
HMM-based speech synthesis is the use of a simple delta
pulse signal to generate the excitation of voiced
speech. This paper describes a new approach to using an
acoustic glottal source model in HMM-based
synthesisers. The goal is to improve speech quality and
parametric flexibility to better model and transform
voice characteristics.},
categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral
Separation},
month = apr,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf},
year = 2009
}
@inproceedings{renals-icslp94,
author = {S.~Renals and M.~Hochberg},
title = {Using {Gamma} filters to model temporal dependencies
in speech},
booktitle = {Proc. ICSLP},
pages = {1491--1494},
address = {Yokohama},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/icslp94-gamma.ps.gz},
year = 1994
}
@inproceedings{murray2007-interspeech,
author = {Murray, Gabriel and Renals, Steve},
title = {Towards online speech summarization},
booktitle = {Proc. Interspeech '07},
abstract = {The majority of speech summarization research has
focused on extracting the most informative dialogue
acts from recorded, archived data. However, a
potential use case for speech summarization in the
meetings domain is to facilitate a meeting in progress
by providing the participants - whether they are
attending in-person or remotely - with an indication
of the most important parts of the discussion so far.
This requires being able to determine whether a
dialogue act is extract-worthy before the global
meeting context is available. This paper introduces a
novel method for weighting dialogue acts using only
very limited local context, and shows that high
summary precision is possible even when information
about the meeting as a whole is lacking. A new
evaluation framework consisting of weighted precision,
recall and f-score is detailed, and the novel online
summarization method is shown to significantly increase
recall and f-score compared with a method using no
contextual information.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/IS070966.PDF},
year = 2007
}
@incollection{renals2010a,
author = {Renals, Steve and Hain, Thomas},
title = {Speech Recognition},
booktitle = {Handbook of Computational Linguistics and Natural
Language Processing},
publisher = {Wiley Blackwell},
editor = {Clark, Alex and Fox, Chris and Lappin, Shalom},
year = 2010
}
@incollection{karlsen-casa97,
author = {B.~L.~Karlsen and G.~J.~Brown and M.~Cooke and
P.~Green and S.~Renals},
title = {Analysis of a simultaneous speaker sound corpus},
booktitle = {Computational Auditory Scene Analysis},
publisher = {Lawrence Erlbaum Associates},
editor = {D.~F.~Rosenthal and H.~G.~Okuno},
pages = {321--334},
year = 1997
}
@article{lu_spl_2011,
author = {Lu, L. and Ghoshal, A. and Renals, S.},
title = {Regularized Subspace {Gaussian} Mixture Models for Speech
Recognition},
journal = {IEEE Signal Processing Letters},
volume = {18},
number = {7},
pages = {419--422},
abstract = {Subspace Gaussian mixture models (SGMMs) provide a
compact representation of the Gaussian parameters in an
acoustic model, but may still suffer from over-fitting
with insufficient training data. In this letter, the
SGMM state parameters are estimated using a penalized
maximum-likelihood objective, based on $\ell_1$ and
$\ell_2$ regularization, as well as their combination,
referred to as the elastic net, for robust model
estimation. Experiments on the 5000-word Wall Street
Journal transcription task show word error rate
reduction and improved model robustness with
regularization.},
categories = {Acoustic Modelling, Regularization, Sparsity, Subspace
Gaussian Mixture Model},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-spl-2011.pdf},
year = 2011
}
@article{goldman2005,
author = {Jerry Goldman and Steve Renals and Steven Bird and
Franciska {de Jong} and Marcello Federico and Carl
Fleischhauer and Mark Kornbluh and Lori Lamel and Doug
Oard and Clare Stewart and Richard Wright},
title = {Accessing the spoken word},
journal = {International Journal of Digital Libraries},
volume = 5,
number = 4,
pages = {287--298},
abstract = {Spoken word audio collections cover many domains,
including radio and television broadcasts, oral
narratives, governmental proceedings, lectures, and
telephone conversations. The collection, access and
preservation of such data is stimulated by political,
economic, cultural and educational needs. This paper
outlines the major issues in the field, reviews the
current state of technology, examines the rapidly
changing policy issues relating to privacy and
copyright, and presents issues relating to the
collection and preservation of spoken audio content.},
categories = {swag,asr,ir,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/swag-ijdl05.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/swag-ijdl05.ps.gz},
year = 2005
}
@inproceedings{hifny-interspeech05,
author = {Y. Hifny and S. Renals and N. Lawrence},
title = {A Hybrid {MaxEnt/HMM} based {ASR} System},
booktitle = {Proc. Interspeech},
abstract = {The aim of this work is to develop a practical
framework, which extends the classical Hidden Markov
Models (HMM) for continuous speech recognition based on
the Maximum Entropy (MaxEnt) principle. The MaxEnt
models can estimate the posterior probabilities
directly as with Hybrid NN/HMM connectionist speech
recognition systems. In particular, a new acoustic
modelling based on discriminative MaxEnt models is
formulated and is being developed to replace the
generative Gaussian Mixture Models (GMM) commonly used
to model acoustic variability. Initial experimental
results using the TIMIT phone task are reported.},
categories = {ml,asr,edinburgh,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hifny-eurospeech05.pdf},
year = 2005
}
@incollection{dielmann-mlmi04,
author = {A. Dielmann and S. Renals},
title = {Multistream dynamic {Bayesian} network for meeting
segmentation},
booktitle = {Proc. Multimodal Interaction and Related Machine
Learning Algorithms Workshop (MLMI--04)},
publisher = {Springer},
editor = {S. Bengio and H. Bourlard},
pages = {76--86},
abstract = {This paper investigates the automatic analysis and
segmentation of meetings. A meeting is analysed in
terms of individual behaviours and group interactions,
in order to decompose each meeting in a sequence of
relevant phases, named meeting actions. Three feature
families are extracted from multimodal recordings:
prosody from individual lapel microphone signals,
speaker activity from microphone array data and lexical
features from textual transcripts. A statistical
approach is then used to relate low-level features with
a set of abstract categories. In order to provide a
flexible and powerful framework, we have employed a
dynamic Bayesian network based model, characterized by
multiple stream processing and flexible state duration
modelling. Experimental results demonstrate the
strength of this system, providing a meeting action
error rate of 9\%.},
categories = {m4,multimodal,dbn,meetings,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.ps.gz},
year = 2005
}
@inproceedings{zwyssig2012effect,
author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
title = {On the effect of {SNR} and superdirective beamforming
in speaker diarisation in meetings},
booktitle = {2012 IEEE International Conference on Acoustics,
Speech and Signal Processing (ICASSP)},
pages = {4177--4180},
year = 2012
}
@inproceedings{wan-icassp03,
author = {V.~Wan and S.~Renals},
title = {{SVMSVM}: Support vector machine speaker verification
methodology},
booktitle = {Proc. IEEE ICASSP},
volume = {2},
pages = {221--224},
abstract = {Support vector machines with the Fisher and
score-space kernels are used for text independent
speaker verification to provide direct discrimination
between complete utterances. This is unlike approaches
such as discriminatively trained Gaussian mixture
models or other discriminative classifiers that
discriminate at the frame-level only. Using the
sequence-level discrimination approach we are able to
achieve error-rates that are significantly better than
the current state-of-the-art on the PolyVar database.},
categories = {verification,kernel,svm,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-svm.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-svm.ps.gz},
year = 2003
}
@inproceedings{dielmann-mmsp04,
author = {A. Dielmann and S. Renals},
title = {Multi-stream segmentation of meetings},
booktitle = {Proc. IEEE Workshop on Multimedia Signal Processing},
abstract = {This paper investigates the automatic segmentation of
meetings into a sequence of group actions or phases.
Our work is based on a corpus of multiparty meetings
collected in a meeting room instrumented with video
cameras, lapel microphones and a microphone array. We
have extracted a set of feature streams, in this case
extracted from the audio data, based on speaker turns,
prosody and a transcript of what was spoken. We have
related these signals to the higher level semantic
categories via a multistream statistical model based on
dynamic Bayesian networks (DBNs). We report on a set of
experiments in which different DBN architectures are
compared, together with the different feature streams.
The resultant system has an action error rate of 9\%.},
categories = {m4,multimodal,dbn,meetings,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.ps.gz},
year = 2004
}
@inproceedings{christensen-asru03,
author = {H. Christensen and Y. Gotoh and B. Kolluru and S.
Renals},
title = {Are extractive text summarisation techniques portable
to broadcast news?},
booktitle = {Proc. IEEE Automatic Speech Recognition and
Understanding Workshop},
abstract = {In this paper we report on a series of experiments
which compare the effect of individual features on both
text and speech summarisation, the effect of basing the
speech summaries on automatic speech recognition
transcripts with varying word error rates, and the
effect of summarisation approach and transcript source
on summary quality. We show that classical text
summarisation features (based on stylistic and content
information) are portable to broadcast news. However,
the quality of the speech transcripts as well as the
difference in information structure between broadcast
and newspaper news affect the usability of the
individual features.},
categories = {s3l,summarization,bnews,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-portable.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-portable.ps.gz},
year = 2003
}
@inproceedings{cabral2011a,
author = {Cabral, J.P. and Renals, S. and Yamagishi, J. and
Richmond, K.},
title = {{HMM}-based speech synthesiser using the {LF}-model of
the glottal source},
booktitle = {Proc. IEEE ICASSP},
pages = {4704--4707},
abstract = {A major factor which causes a deterioration in speech
quality in {HMM}-based speech synthesis is the use of a
simple delta pulse signal to generate the excitation of
voiced speech. This paper sets out a new approach to
using an acoustic glottal source model in HMM-based
synthesisers instead of the traditional pulse signal.
The goal is to improve speech quality and to better
model and transform voice characteristics. We have
found the new method decreases buzziness and also
improves prosodic modelling. A perceptual evaluation
has supported this finding by showing a 55.6\%
preference for the new system, as against the baseline.
This improvement, while not being as significant as we
had initially expected, does encourage us to work on
developing the proposed speech synthesiser further.},
categories = {HMM-based speech synthesiser;acoustic glottal source
model LF-model;delta pulse signal;perceptual
evaluation;prosodic modelling;speech quality;voiced
speech generation;hidden Markov models;speech
synthesis;},
doi = {10.1109/ICASSP.2011.5947405},
issn = {1520-6149},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
year = 2011
}
@incollection{robinson-yellowbook96,
author = {T.~Robinson and M.~Hochberg and S.~Renals},
title = {The use of recurrent networks in continuous speech
recognition},
booktitle = {Automatic Speech and Speaker Recognition -- Advanced
Topics},
publisher = {Kluwer Academic Publishers},
editor = {C.-H.~Lee and K.~K.~Paliwal and F.~K.~Soong},
pages = {233--258},
abstract = {This chapter describes a use of recurrent neural
networks (ie, feedback is incorporated in the
computation) as an acoustic model for continuous speech
recognition. The form of the recurrent neural network
is described, along with an appropriate parameter
estimation procedure. For each frame of acoustic data,
the recurrent network generates an estimate of the
posterior probability of the possible phones given the
observed acoustic signal. The posteriors are then
converted into scaled likelihoods and used as the
observation probabilities within a conventional
decoding paradigm (eg, Viterbi decoding). The
advantages of the using recurrent networks are that
they require a small number of parameters and provide a
fast decoding capability (relative to conventional
large vocabulary HMM systems).},
categories = {wernicke,sprach,wsj,recognition,am,hybrid,abbot,search,sheffield},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/rnn4csr96.ps.gz},
year = 1996
}
@inproceedings{renals-ieeann89,
author = {S.~Renals and R.~Rohwer},
title = {Neural networks for speech pattern classification},
booktitle = {IEE Conference Publication 313, 1st IEE Conference on
Artificial Neural Networks},
pages = {292--296},
address = {London},
categories = {},
year = 1989
}
@incollection{murray2007-mlmi,
author = {Murray, Gabriel and Renals, Steve},
title = {Term-weighting for summarization of multi-party spoken
dialogues},
booktitle = {Machine Learning for Multimodal Interaction IV },
publisher = {Springer},
editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
volume = {4892},
series = {Lecture Notes in Computer Science},
pages = {155--166},
abstract = {This paper explores the issue of term-weighting in the
genre of spontaneous, multi-party spoken dialogues,
with the intent of using such term-weights in the
creation of extractive meeting summaries. The field of
text information retrieval has yielded many
term-weighting techniques to import for our purposes;
this paper implements and compares several of these,
namely tf.idf, Residual IDF and Gain. We propose that
term-weighting for multi-party dialogues can exploit
patterns in word usage among participant speakers,
and introduce the su.idf metric as one attempt to do
so. Results for all metrics are reported on both manual
and automatic speech recognition (ASR) transcripts, and
on both the ICSI and AMI meeting corpora. },
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/48920155.pdf},
year = 2007
}
@article{wan-sap05,
author = {V. Wan and S. Renals},
title = {Speaker verification using sequence discriminant
support vector machines},
journal = {IEEE Trans. on Speech and Audio Processing},
volume = {13},
pages = {203--210},
abstract = {This paper presents a text-independent speaker
verification system using support vector machines
(SVMs) with score-space kernels. Score-space kernels,
generalize Fisher kernels, and are based on an
underlying generative model, such as a Gaussian mixture
model (GMM). This approach provides direct
discrimination between whole sequences, in contrast to
the frame-level approaches at the heart of most current
systems. The resultant SVMs have a very high
dimensionality, since it is related to the number of
parameters in the underlying generative model. To
ameliorate problems that can arise in the resultant
optimization, we introduce a technique called spherical
normalization that preconditions the Hessian matrix. We
have performed speaker verification experiments using
the PolyVar database. The SVM system presented here
reduces the relative error rates by 34\% compared to a
GMM likelihood ratio system.},
categories = {verification,kernel,svm,edinburgh,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap05-svm.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap05-svm.ps.gz},
year = 2005
}
@article{williams-csl99,
author = {G.~Williams and S.~Renals},
title = {Confidence measures from local posterior probability
estimates},
journal = {Computer Speech and Language},
volume = {13},
pages = {395--411},
abstract = {In this paper we introduce a set of related confidence
measures for large vocabulary continuous speech
recognition (LVCSR) based on local phone posterior
probability estimates output by an acceptor HMM
acoustic model. In addition to their computational
efficiency, these confidence measures are attractive as
they may be applied at the state-, phone-, word- or
utterance-levels, potentially enabling discrimination
between different causes of low confidence recognizer
output, such as unclear acoustics or mismatched
pronunciation models. We have evaluated these
confidence measures for utterance verification using a
number of different metrics. Experiments reveal several
trends in `profitability of rejection', as measured by
the unconditional error rate of a hypothesis test.
These trends suggest that crude pronunciation models
can mask the relatively subtle reductions in confidence
caused by out-of-vocabulary (OOV) words and
disfluencies, but not the gross model mismatches
elicited by non-speech sounds. The observation that a
purely acoustic confidence measure can provide improved
performance over a measure based upon both acoustic and
language model information for data drawn from the
Broadcast News corpus, but not for data drawn from the
North American Business News corpus suggests that the
quality of model fit offered by a trigram language
model is reduced for Broadcast News data. We also argue
that acoustic confidence measures may be used to inform
the search for improved pronunciation models.},
categories = {recognition,conf,hybrid,bnews,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/csl99-preprint.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/csl99-preprint.ps.gz},
year = 1999
}
@inproceedings{abdelhaleem-icassp04,
author = {Y. H. Abdel-Haleem and S. Renals and N. D. Lawrence},
title = {Acoustic space dimensionality selection and
combination using the maximum entropy principle},
booktitle = {Proc. IEEE ICASSP},
abstract = {In this paper we propose a discriminative approach to
acoustic space dimensionality selection based on
maximum entropy modelling. We form a set of constraints
by composing the acoustic space with the space of phone
classes, and use a continuous feature formulation of
maximum entropy modelling to select an optimal feature
set. The suggested approach has two steps: (1) the
selection of the best acoustic space that efficiently
and economically represents the acoustic data and its
variability; (2) the combination of selected acoustic
features in the maximum entropy framework to estimate
the posterior probabilities over the phonetic labels
given the acoustic input. Specific contributions of
this paper include a parameter estimation algorithm
(generalized improved iterative scaling) that enables
the use of negative features, the parameterization of
constraint functions using Gaussian mixture models, and
experimental results using the TIMIT database.},
categories = {ml,maxent,am,recognition,edinburgh,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-me.pdf},
year = 2004
}
@inproceedings{hsueh2006asm,
author = {Hsueh, P. and Moore, J. and Renals, S.},
title = {Automatic Segmentation of Multiparty Dialogue},
booktitle = {Proc. EACL06},
abstract = {In this paper, we investigate the problem of
automatically predicting segment boundaries in spoken
multiparty dialogue. We extend prior work in two ways.
We first apply approaches that have been proposed for
predicting top-level topic shifts to the problem of
identifying subtopic boundaries. We then explore the
impact on performance of using ASR output as opposed to
human transcription. Examination of the effect of
features shows that predicting top-level and predicting
subtopic boundaries are two distinct tasks: (1) for
predicting subtopic boundaries, the lexical
cohesion-based approach alone can achieve competitive
results, (2) for predicting top-level boundaries, the
machine learning approach that combines
lexical-cohesion and conversational features performs
best, and (3) conversational cues, such as cue phrases
and overlapping speech, are better indicators for the
top-level prediction task. We also find that the
transcription errors inevitable in ASR output have a
negative impact on models that combine lexical-cohesion
and conversational features, but do not change the
general preference of approach for the two tasks. },
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/21_1_hsuehmoorerenals.pdf},
year = 2006
}
@inproceedings{cuayahuitletal_interspeech07,
author = {Heriberto Cuayáhuitl and Steve Renals and Oliver
Lemon and Hiroshi Shimodaira},
title = {Hierarchical Dialogue Optimization Using Semi-Markov
Decision Processes},
booktitle = {Proc. of INTERSPEECH},
abstract = {This paper addresses the problem of dialogue
optimization on large search spaces. For such a
purpose, in this paper we propose to learn dialogue
strategies using multiple Semi-Markov Decision
Processes and hierarchical reinforcement learning. This
approach factorizes state variables and actions in
order to learn a hierarchy of policies. Our experiments
are based on a simulated flight booking dialogue system
and compare flat versus hierarchical reinforcement
learning. Experimental results show that the proposed
approach produced a dramatic search space reduction
(99.36\%), and converged four orders of magnitude
faster than flat reinforcement learning with a very
small loss in optimality (on average 0.3 system turns).
Results also report that the learnt policies
outperformed a hand-crafted one under three different
conditions of ASR confidence levels. This approach is
appealing to dialogue optimization due to faster
learning, reusable subsolutions, and scalability to
larger problems.},
categories = {Spoken dialogue systems, semi-Markov decision
processes, hierarchical reinforcement learning.},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/SMDPs-interspeech2007.pdf},
year = 2007
}
@inproceedings{rohwer-icassp88,
author = {R.~Rohwer and S.~Renals and M.~Terry},
title = {Unstable connectionist networks in speech recognition},
booktitle = {Proc IEEE ICASSP},
pages = {426--428},
address = {New York},
categories = {},
year = 1988
}
@article{renals-jstatphys90,
author = {S.~Renals and R.~Rohwer},
title = {A study of network dynamics},
journal = {J. Stat. Phys.},
volume = {58},
pages = {825--847},
categories = {},
year = 1990
}
@article{carreira-specom98,
author = {M.~Carreira-Perpiñán and S.~Renals},
title = {Dimensionality reduction of electropalatographic data
using latent variable models},
journal = {Speech Communication},
volume = {26},
pages = {259--282},
abstract = {We consider the problem of obtaining a reduced
dimension representation of electropalatographic (EPG)
data. An unsupervised learning approach based on latent
variable modelling is adopted, in which an underlying
lower dimension representation is inferred directly
from the data. Several latent variable models are
investigated, including factor analysis and the
generative topographic mapping (GTM). Experiments were
carried out using a subset of the EUR-ACCOR database,
and the results indicate that these automatic methods
capture important, adaptive structure in the EPG data.
Nonlinear latent variable modelling clearly outperforms
the investigated linear models in terms of
log-likelihood and reconstruction error and suggests a
substantially smaller intrinsic dimensionality for the
EPG data than that claimed by previous studies. A
two-dimensional representation is produced with
applications to speech therapy, language learning and
articulatory dynamics.},
categories = {ml,lv,artic,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/specom98.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/specom98.ps.gz},
year = 1998
}
@inproceedings{wrigley-eurospeech03,
author = {S.~Wrigley and G.~Brown and V.~Wan and S. Renals},
title = {Feature Selection for the Classification of Crosstalk
in Multi-Channel Audio},
booktitle = {Proc. Eurospeech},
pages = {469--472},
abstract = {An extension to the conventional speech / nonspeech
classification framework is presented for a scenario in
which a number of microphones record the activity of
speakers present at a meeting (one microphone per
speaker). Since each microphone can receive speech from
both the participant wearing the microphone (local
speech) and other participants (crosstalk), the
recorded audio can be broadly classified in four ways:
local speech, crosstalk plus local speech, crosstalk
alone and silence. We describe a classifier in which a
Gaussian mixture model (GMM) is used to model each
class. A large set of potential acoustic features are
considered, some of which have been employed in
previous speech / nonspeech classifiers. A combination
of two feature selection algorithms is used to identify
the optimal feature set for each class. Results from
the GMM classifier using the selected features are
superior to those of a previously published approach.},
categories = {m4,crosstalk,meetings,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/eurospeech03-xtalk.pdf},
year = 2003
}
@article{hifny2009,
author = {Hifny, Y. and Renals, S.},
title = {Speech Recognition Using Augmented Conditional Random
Fields},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {17},
number = {2},
pages = {354--365},
abstract = {Acoustic modeling based on hidden Markov models (HMMs)
is employed by state-of-the-art stochastic speech
recognition systems. Although HMMs are a natural choice
to warp the time axis and model the temporal phenomena
in the speech signal, their conditional independence
properties limit their ability to model spectral
phenomena well. In this paper, a new acoustic modeling
paradigm based on augmented conditional random fields
(ACRFs) is investigated and developed. This paradigm
addresses some limitations of HMMs while maintaining
many of the aspects which have made them successful. In
particular, the acoustic modeling problem is
reformulated in a data driven, sparse, augmented space
to increase discrimination. Acoustic context modeling
is explicitly integrated to handle the sequential
phenomena of the speech signal. We present an efficient
framework for estimating these models that ensures
scalability and generality. In the TIMIT phone
recognition task, a phone error rate of 23.0\% was
recorded on the full test set, a significant
improvement over comparable HMM-based systems.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/hifny2009.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4749447&arnumber=4749472&count=25&index=15},
year = 2009
}
@inproceedings{gotoh-asr2000,
author = {Y.~Gotoh and S.~Renals},
title = {Sentence Boundary Detection in Broadcast Speech
Transcripts},
booktitle = {ISCA ITRW: ASR2000},
pages = {228--235},
address = {Paris},
abstract = {This paper presents an approach to identifying
sentence boundaries in broadcast speech transcripts. We
describe finite state models that extract sentence
boundary information statistically from text and audio
sources. An n-gram language model is constructed from a
collection of British English news broadcasts and
scripts. An alternative model is estimated from pause
duration information in speech recogniser outputs
aligned with their programme script counterparts.
Experimental results show that the pause duration model
alone outperforms the language modelling approach and
that, by combining these two models, it can be improved
further and precision and recall scores of over 70\%
were attained for the task.},
categories = {stobs,ie,lm,prosody,bnews,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/asr2000.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/asr2000.ps.gz},
year = 2000
}
@inproceedings{Murray05b,
author = {G. Murray and S. Renals and J. Carletta and J. Moore},
title = {Evaluating Automatic Summaries of Meeting Recordings},
booktitle = {Proceedings of the 43rd Annual Meeting of the
Association for Computational Linguistics, Ann Arbor,
MI, USA},
abstract = {The research below explores schemes for evaluating
automatic summaries of business meetings, using the
ICSI Meeting Corpus. Both automatic and subjective
evaluations were carried out, with a central interest
being whether or not the two types of evaluations
correlate with each other. The evaluation metrics were
used to compare and contrast differing approaches to
automatic summarization, the deterioration of summary
quality on ASR output versus manual transcripts, and to
determine whether manual extracts are rated
significantly higher than automatic extracts. },
categories = {ami,summarization, speech summarization, prosody,
latent semantic analysis, summarization evaluation,
edinburgh},
month = jun,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/murray-renals-carletta-moore.pdf},
year = 2005
}
@inproceedings{williams-escapron98,
author = {G.~Williams and S.~Renals },
title = {Confidence measures for evaluating pronunciation
models},
booktitle = {ESCA Workshop on Modeling pronunciation variation for
automatic speech recognition},
pages = {151--155},
address = {Kerkrade, Netherlands},
abstract = {In this paper, we investigate the use of confidence
measures for the evaluation of pronunciation models and
the employment of these evaluations in an automatic
baseform learning process. The confidence measures and
pronunciation models are obtained from the Abbot hybrid
Hidden Markov Model/Artificial Neural Network Large
Vocabulary Continuous Speech Recognition system.
Experiments were carried out for a number of baseform
learning schemes using the ARPA North American Business
News and the Broadcast News corpora from which it was
found that a confidence measure based scheme provided
the largest reduction in Word Error Rate.},
categories = {recognition,conf,hybrid,abbot,wsj,bnews,pron,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/esca98.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/esca98.ps.gz},
year = 1998
}
@incollection{murray2008a,
author = {Murray, Gabriel and Renals, Steve},
title = {Meta Comments for Summarizing Meeting Speech},
booktitle = {Machine Learning for Multimodal Interaction (Proc.
MLMI '08)},
publisher = {Springer},
number = {5237},
series = {Lecture Notes in Computer Science},
pages = {236--247},
abstract = {This paper is about the extractive summarization of
meeting speech, using the ICSI and AMI corpora. In the
first set of experiments we use prosodic, lexical,
structural and speaker-related features to select the
most informative dialogue acts from each meeting, with
the hypothesis being that such a rich mixture of
features will yield the best results. In the second
part, we present an approach in which the
identification of ``meta-comments'' is used to create
more informative summaries that provide an increased
level of abstraction. We find that the inclusion of
these meta comments improves summarization performance
according to several evaluation metrics.},
doi = {10.1007/978-3-540-85853-9_22},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008a.pdf},
url = {http://dx.doi.org/10.1007/978-3-540-85853-9_22},
year = 2008
}
@incollection{al-hames2006-mlmi06,
author = {Marc Al-Hames and Thomas Hain and Jan Cernocky and
Sascha Schreiber and Mannes Poel and Ronald Mueller and
Sebastien Marcel and David {van Leeuwen} and Jean-Marc
Odobez and Sileye Ba and Hervé Bourlard and Fabien
Cardinaux and Daniel Gatica-Perez and Adam Janin and
Petr Motlicek and Stephan Reiter and Steve Renals and
Jeroen {van Rest} and Rutger Rienks and Gerhard Rigoll
and Kevin Smith and Andrew Thean and Pavel Zemcik},
title = {Audio-video processing in meetings: Seven questions
and current {AMI} answers},
booktitle = {Machine Learning for Multimodal Interaction (Proc.
MLMI '06)},
publisher = {Springer},
editor = {S. Renals and S. Bengio and J. G. Fiscus},
volume = {4299},
series = {Lecture Notes in Computer Science},
pages = {24--35},
year = 2006
}
@inproceedings{renals-nnsp91,
author = {S.~Renals and N.~Morgan and H.~Bourlard},
title = {Probability estimation by feed-forward networks in
continuous speech recognition},
booktitle = {IEEE Proc. Neural Networks for Signal Processing},
pages = {309--318},
address = {Princeton NJ},
categories = {},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1991/nnsp91.ps.gz},
year = 1991
}
@inproceedings{lu2012jud,
author = {Lu, L. and Ghoshal, A. and Renals, S.},
title = {Joint uncertainty decoding with unscented transform
for noise robust subspace {Gaussian} mixture model},
booktitle = {Proc. Sapa-Scale workshop},
abstract = {Common noise compensation techniques use vector Taylor
series (VTS) to approximate the mismatch function.
Recent work shows that the approximation accuracy may
be improved by sampling. One such sampling technique is
the unscented transform (UT), which draws samples
deterministically from clean speech and noise model to
derive the noise corrupted speech parameters. This
paper applies UT to noise compensation of the subspace
Gaussian mixture model (SGMM). Since UT requires
relatively smaller number of samples for accurate
estimation, it has significantly lower computational
cost compared to other random sampling techniques.
However, the number of surface Gaussians in an SGMM is
typically very large, making the direct application of
UT, for compensating individual Gaussian components,
computationally impractical. In this paper, we avoid
the computational burden by employing UT in the
framework of joint uncertainty decoding (JUD), which
groups all the Gaussian components into small number of
classes, sharing the compensation parameters by class.
We evaluate the JUD-UT technique for an SGMM system
using the Aurora 4 corpus. Experimental results
indicate that UT can lead to increased accuracy
compared to VTS approximation if the JUD phase factor
is untuned, and to similar accuracy if the phase factor
is tuned empirically.},
keywords = {noise compensation, SGMM, JUD, UT},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-sapa2012.pdf},
year = 2012
}
@inproceedings{renals-ijcnn92,
author = {S.~Renals and N.~Morgan and M.~Cohen and H.~Franco and
H.~Bourlard},
title = {Improving statistical speech recognition},
booktitle = {Proc. IJCNN},
volume = {2},
pages = {301--307},
address = {Baltimore MD},
categories = {},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1992/ijcnn92.ps.gz},
year = 1992
}
@article{turk:2429,
author = {Alice Turk and James Scobbie and Christian Geng and
Cedric Macmartin and Ellen Bard and Barry Campbell and
Catherine Dickie and Eddie Dubourg and Bill Hardcastle
and Phil Hoole and Evia Kanaida and Robin Lickley and
Satsuki Nakai and Marianne Pouplier and Simon King and
Steve Renals and Korin Richmond and Sonja Schaeffler
and Ronnie Wiegand and Kevin White and Alan Wrench},
title = {The {Edinburgh Speech Production Facility's}
articulatory corpus of spontaneous dialogue.},
journal = {The Journal of the Acoustical Society of America},
volume = {128},
number = {4},
pages = {2429--2429},
abstract = {The EPSRC-funded Edinburgh Speech Production Facility is
built around two synchronized Carstens AG500
electromagnetic articulographs (EMAs) in order to
capture articulatory/acoustic data from spontaneous
dialogue. An initial articulatory corpus was designed
with two aims. The first was to elicit a range of
speech styles/registers from speakers, and therefore
provide an alternative to fully scripted corpora. The
second was to extend the corpus beyond monologue, by
using tasks that promote natural discourse and
interaction. A subsidiary driver was to use dialects
from outwith North America: dialogues paired up a
Scottish English and a Southern British English
speaker. Tasks. Monologue: Story reading of ``Comma
Gets a Cure'' [Honorof et al. (2000)], lexical sets
[Wells (1982)], spontaneous story telling,
diadochokinetic tasks. Dialogue: Map tasks [Anderson et
al. (1991)], ``Spot the Difference'' picture tasks
[Bradlow et al. (2007)], story-recall. Shadowing of
the spontaneous story telling by the second
participant. Each dialogue session includes
approximately 30 min of speech, and there are
acoustics-only baseline materials. We will introduce
the corpus and highlight the role of articulatory
production data in helping provide a fuller
understanding of various spontaneous speech phenomena
by presenting examples of naturally occurring covert
speech errors, accent accommodation, turn taking
negotiation, and shadowing.},
doi = {10.1121/1.3508679},
publisher = {ASA},
year = 2010
}
@inproceedings{cabral07,
author = {J. Cabral and S. Renals and K. Richmond and J.
Yamagishi},
title = {Towards an Improved Modeling of the Glottal Source in
Statistical Parametric Speech Synthesis},
booktitle = {Proc. of the 6th ISCA Workshop on Speech Synthesis},
address = {Bonn, Germany},
abstract = {This paper proposes the use of the Liljencrants-Fant
model (LF-model) to represent the glottal source signal
in HMM-based speech synthesis systems. These systems
generally use a pulse train to model the periodicity of
the excitation signal of voiced speech. However, this
model produces a strong and uniform harmonic structure
throughout the spectrum of the excitation which makes
the synthetic speech sound buzzy. The use of a mixed
band excitation and phase manipulation reduces this
effect but it can result in degradation of the speech
quality if the noise component is not weighted
carefully. In turn, the LF-waveform has a decaying
spectrum at higher frequencies, which is more similar
to the real glottal source excitation signal. We
conducted a perceptual experiment to test the
hypothesis that the LF-model can perform as well as or
better than the pulse train in a HMM-based speech
synthesizer. In the synthesis, we used the mean values
of the LF-parameters, calculated by measurements of the
recorded speech. The result of this study is important
not only regarding the improvement in speech quality of
these type of systems, but also because the LF-model
can be used to model many characteristics of the
glottal source, such as voice quality, which are
important for voice transformation and generation of
expressive speech.},
categories = {LF-model, Statistical parametric speech synthesis,
HMM-based speech synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
year = 2007
}
@inproceedings{renals-icassp96,
author = {S.~Renals and M.~Hochberg},
title = {Efficient evaluation of the {LVCSR} search space using
the {NOWAY} decoder},
booktitle = {Proc IEEE ICASSP},
pages = {149--152},
address = {Atlanta},
abstract = {This work further develops and analyses the large
vocabulary continuous speech recognition search
strategy reported at ICASSP-95. In particular, the
posterior-based phone deactivation pruning approach has
been extended to include phone-dependent thresholds and
an improved estimate of the least upper bound on the
utterance log-probability has been developed. Analysis
of the pruning procedures and of the search's
interaction with the language model has also been
performed. Experiments were carried out using the ARPA
North American Business News task with a 20,000 word
vocabulary and a trigram language model. As a result of
these improvements and analyses, the computational cost
of the recognition process performed by the Noway
decoder has been substantially reduced.},
categories = {wernicke,sprach,recognition,wsj,search,sheffield},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/icassp96.ps.gz},
year = 1996
}
@inproceedings{koumpis-prosody01,
author = {K.~Koumpis and S.~Renals},
title = {The role of prosody in a voicemail summarization
system},
booktitle = {Proc. ISCA Workshop on Prosody in Speech Recognition
and Understanding},
address = {Red Bank, NJ, USA},
abstract = {When a speaker leaves a voicemail message there are
prosodic cues that emphasize the important points in
the message, in addition to lexical content. In this
paper we compare and visualize the relative
contribution of these two types of features within a
voicemail summarization system. We describe the
system's ability to generate summaries of two test
sets, having trained and validated using 700 messages
from the IBM Voicemail corpus. Results measuring the
quality of summary artifacts show that combined lexical
and prosodic features are at least as robust as
combined lexical features alone across all operating
conditions.},
categories = {voicemail,summarization,prosody,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-vm.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/pros01-vm.ps.gz},
year = 2001
}
@inproceedings{garau2008a,
author = {Garau, Giulia and Renals, Steve},
title = {Pitch adaptive features for {LVCSR}},
booktitle = {Proc. Interspeech '08},
abstract = {We have investigated the use of a pitch adaptive
spectral representation on large vocabulary speech
recognition, in conjunction with speaker normalisation
techniques. We have compared the effect of a smoothed
spectrogram to the pitch adaptive spectral analysis by
decoupling these two components of STRAIGHT.
Experiments performed on a large vocabulary meeting
speech recognition task highlight the importance of
combining a pitch adaptive spectral representation with
a conventional fixed window spectral analysis. We found
evidence that STRAIGHT pitch adaptive features are more
speaker independent than conventional MFCCs without
pitch adaptation, thus they also provide better
performances when combined using feature combination
techniques such as Heteroscedastic Linear Discriminant
Analysis.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/garau2008a.pdf},
year = 2008
}
@article{gotoh-nle99,
author = {Y.~Gotoh and S.~Renals},
title = {Topic-based mixture language modelling},
journal = {Journal of Natural Language Engineering},
volume = {5},
pages = {355--375},
abstract = {This paper describes an approach for constructing a
mixture of language models based on simple statistical
notions of semantics using probabilistic models
developed for information retrieval. The approach
encapsulates corpus-derived semantic information and is
able to model varying styles of text. Using such
information, the corpus texts are clustered in an
unsupervised manner and a mixture of topic-specific
language models is automatically created. The principal
contribution of this work is to characterise the
document space resulting from information retrieval
techniques and to demonstrate the approach for mixture
language modelling. A comparison is made between manual
and automatic clustering in order to elucidate how the
global content information is expressed in the space.
We also compare (in terms of association with manual
clustering and language modelling accuracy) alternative
term-weighting schemes and the effect of singular
valued decomposition dimension reduction (latent
semantic analysis). Test set perplexity results using
the British National Corpus indicate that the approach
can improve the potential of statistical language
modelling. Using an adaptive procedure, the
conventional model may be tuned to track text data with
a slight increase in computational cost.},
categories = {sprach,stobs,lm,bnc,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/jnle99-preprint.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/jnle99-preprint.ps.gz},
year = 1999
}
@inproceedings{murray-interspeech05,
author = {G. Murray and S. Renals and J. Carletta},
title = {Extractive Summarization of Meeting Recordings},
booktitle = {Proc. Interspeech},
abstract = {Several approaches to automatic speech summarization
are discussed below, using the ICSI Meetings corpus. We
contrast feature-based approaches using prosodic and
lexical features with maximal marginal relevance and
latent semantic analysis approaches to summarization.
While the latter two techniques are borrowed directly
from the field of text summarization, feature-based
approaches using prosodic information are able to
utilize characteristics unique to speech data. We also
investigate how the summarization results might
deteriorate when carried out on ASR output as opposed
to manual transcripts. All of the summaries are of an
extractive variety, and are compared using the software
ROUGE.},
categories = {ami,summarization,prosody, latent semantic
analysis,edinburgh},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/murray-eurospeech05.pdf},
year = 2005
}
@inproceedings{cuayahuitletal_asru05,
author = {Heriberto Cuay{\'a}huitl and Steve Renals and Oliver
Lemon and Hiroshi Shimodaira},
title = {Human-Computer Dialogue Simulation Using Hidden Markov
Models},
booktitle = {Proc. of IEEE Workshop on Automatic Speech Recognition
and Understanding (ASRU)},
abstract = {This paper presents a probabilistic method to simulate
task-oriented human-computer dialogues at the intention
level, that may be used to improve or to evaluate the
performance of spoken dialogue systems. Our method uses
a network of Hidden Markov Models (HMMs) to predict
system and user intentions, where a ``language model''
predicts sequences of goals and the component HMMs
predict sequences of intentions. We compare standard
HMMs, Input HMMs and Input-Output HMMs in an effort to
better predict sequences of intentions. In addition, we
propose a dialogue similarity measure to evaluate the
realism of the simulated dialogues. We performed
experiments using the DARPA Communicator corpora and
report results with three different metrics: dialogue
length, dialogue similarity and precision-recall.},
categories = {dialogue simulation, hidden markov models},
month = nov,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hcp-asru2005.pdf},
year = 2005
}
@inproceedings{kilgour2011,
author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
title = {The {Ambient Spotlight}: Personal meeting capture with
a microphone array},
booktitle = {Proc. HSCMA},
abstract = {We present the Ambient Spotlight system for personal
meeting capture based on a portable USB microphone
array and a laptop. The system combines distant speech
recognition and content linking with personal
productivity tools, and enables recognised meeting
recordings to be integrated with desktop search,
calendar, and email. },
doi = {10.1109/HSCMA.2011.5942389},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/ambientDemo.pdf},
year = 2011
}
@inproceedings{renals-icassp89,
author = {S.~Renals and R.~Rohwer},
title = {Learning phoneme recognition using neural networks},
booktitle = {Proc. IEEE ICASSP},
pages = {413--416},
address = {Glasgow},
categories = {},
year = 1989
}
@inproceedings{kershaw-arpa96,
author = {D.~Kershaw and T.~Robinson and S.~Renals},
title = {The 1995 {Abbot} hybrid {connectionist--HMM} large
vocabulary recognition system},
booktitle = {Proc. ARPA Spoken Language Technology Conference},
pages = {93--99},
categories = {wernicke,sprach,wsj,recognition,am,hybrid,abbot,search,eval,sheffield},
year = 1996
}
@inproceedings{hochberg-icassp95,
author = {M.~Hochberg and S.~Renals and T.~Robinson and G.~Cook},
title = {Recent improvements to the {Abbot} large vocabulary
{CSR} system},
booktitle = {Proc. IEEE ICASSP},
pages = {69--72},
address = {Detroit},
abstract = {ABBOT is the hybrid connectionist-hidden Markov model
(HMM) large-vocabulary continuous speech recognition
(CSR) system developed at Cambridge University. This
system uses a recurrent network to estimate the
acoustic observation probabilities within an HMM
framework. A major advantage of this approach is that
good performance is achieved using context-independent
acoustic models and requiring many fewer parameters
than comparable HMM systems. This paper presents
substantial performance improvements gained from new
approaches to connectionist model combination and
phone-duration modeling. Additional capability has also
been achieved by extending the decoder to handle larger
vocabulary tasks (20,000 words and greater) with a
trigram language model. This paper describes the recent
modifications to the system and experimental results
are reported for various test and development sets from
the November 1992, 1993, and 1994 ARPA evaluations of
spoken language systems.},
categories = {wernicke,recognition,wsj,am,hybrid,abbot,eval,search,sheffield,cambridge},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/icassp95-abbot.ps.gz},
year = 1995
}
@article{renals-elett88,
author = {S.~Renals},
title = {Radial basis functions network for speech pattern
classification},
journal = {Electronics Letters},
volume = {25},
pages = {437--439},
categories = {},
year = 1988
}
@article{koumpis2005-acmslp,
author = {Konstantinos Koumpis and Steve Renals},
title = {Automatic summarization of voicemail messages using
lexical and prosodic features},
journal = {ACM Transactions on Speech and Language Processing},
volume = 2,
number = 1,
pages = {1--24},
abstract = {This paper presents trainable methods for extracting
principal content words from voicemail messages. The
short text summaries generated are suitable for mobile
messaging applications. The system uses a set of
classifiers to identify the summary words, with each
word being identified by a vector of lexical and
prosodic features. We use an ROC-based algorithm,
Parcel, to select input features (and classifiers). We
have performed a series of objective and subjective
evaluations using unseen data from two different speech
recognition systems, as well as human transcriptions of
voicemail speech.},
categories = {voicemail,summarization,prosody,sheffield,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/tslp05.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/tslp05.ps.gz},
year = 2005
}
@inproceedings{huang2008-ptkl,
author = {Songfang Huang and Steve Renals},
title = {Using Participant Role in Multiparty Meetings as Prior
Knowledge for Nonparametric Topic Modeling},
booktitle = {Proc. ICML/UAI/COLT Workshop on Prior Knowledge for
Text and Language Processing},
pages = {21--24},
address = {Helsinki, Finland},
abstract = {In this paper we introduce our attempts to incorporate
the participant role information in multiparty meetings
for document modeling using the hierarchical Dirichlet
process. The perplexity and automatic speech
recognition results demonstrate that the participant
role information is a promising prior knowledge source
to be combined with language models for automatic
speech recognition and interaction modeling for
multiparty meetings.},
month = jul,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ptkl.pdf},
year = 2008
}
@inproceedings{lu2012noise,
author = {Lu, L. and Chin, K. K. and Ghoshal, A. and Renals, S.},
title = {Noise compensation for subspace {Gaussian} mixture
models},
booktitle = {Proc. INTERSPEECH},
abstract = {Joint uncertainty decoding (JUD) is an effective
model-based noise compensation technique for
conventional Gaussian mixture model (GMM) based speech
recognition systems. In this paper, we apply JUD to
subspace Gaussian mixture model (SGMM) based acoustic
models. The total number of Gaussians in the SGMM
acoustic model is usually much larger than for
conventional GMMs, which limits the application of
approaches which explicitly compensate each Gaussian,
such as vector Taylor series (VTS). However, by
clustering the Gaussian components into a number of
regression classes, JUD-based noise compensation can be
successfully applied to SGMM systems. We evaluate the
JUD/SGMM technique using the Aurora 4 corpus, and the
experimental results indicated that it is more accurate
than conventional GMM-based systems using either VTS or
JUD noise compensation.},
keywords = {acoustic modelling, noise compensation, SGMM, JUD},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-is2012.pdf},
year = 2012
}
@article{dielmann2007-tmm,
author = {Dielmann, Alfred and Renals, Steve},
title = {Automatic meeting segmentation using dynamic
{Bayesian} networks},
journal = {IEEE Transactions on Multimedia},
volume = {9},
number = {1},
pages = {25--36},
abstract = {Multiparty meetings are a ubiquitous feature of
organizations, and there are considerable economic
benefits that would arise from their automatic analysis
and structuring. In this paper, we are concerned with
the segmentation and structuring of meetings (recorded
using multiple cameras and microphones) into sequences
of group meeting actions such as monologue, discussion
and presentation. We outline four families of
multimodal features based on speaker turns, lexical
transcription, prosody, and visual motion that are
extracted from the raw audio and video recordings. We
relate these low-level features to more complex group
behaviors using a multistream modelling framework based
on multistream dynamic Bayesian networks (DBNs). This
results in an effective approach to the segmentation
problem, resulting in an action error rate of 12.2\%,
compared with 43\% using an approach based on hidden
Markov models. Moreover, the multistream DBN developed
here leaves scope for many further improvements and
extensions.},
doi = {10.1109/TMM.2006.886337},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2007/dielmann2007-tmm.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4032598&arnumber=4032608&count=23&index=3},
year = 2007
}
@inproceedings{robinson-icassp94,
author = {T.~Robinson and M.~Hochberg and S.~Renals},
title = {{IPA}: Improved phone modelling with recurrent neural
networks},
booktitle = {Proc. IEEE ICASSP},
pages = {37--40},
address = {Adelaide},
categories = {},
year = 1994
}
@inproceedings{renals-mmsp99,
author = {S.~Renals and D.~Abberley and D.~Kirby and T.~Robinson},
title = {The {THISL} System for Indexing and Retrieval of
Broadcast News},
booktitle = {Proc. IEEE Workshop on Multimedia Signal Processing},
pages = {77--82},
address = {Copenhagen},
abstract = {This paper describes the THISL news retrieval system
which maintains an archive of BBC radio and television
news recordings. The system uses the Abbot large
vocabulary continuous speech recognition system to
transcribe news broadcasts, and the thislIR text
retrieval system to index and access the transcripts.
Decoding and indexing is performed automatically, and
the archive is updated with three hours of new material
every day. A web-based interface to the retrieval
system has been devised to facilitate access to the
archive.},
categories = {thisl,bnews,trec,ir,recognition,sheffield},
http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/mmsp99-54/},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/mmsp99.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/mmsp99.ps.gz},
year = 1999
}
@article{koumpis2005-spmag,
author = {Koumpis, Konstantinos and Renals, Steve},
title = {Content-based access to spoken audio},
journal = {IEEE Signal Processing Magazine},
volume = 22,
number = 5,
pages = {61--69},
abstract = {"How analysis, retrieval and delivery phases make
spoken audio content more accessible"},
categories = {asr,ir,summarization,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/koumpis-spm05.pdf},
year = 2005
}
@inproceedings{kilgour2010a,
author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
title = {The {Ambient Spotlight}: Personal multimodal search
without query},
booktitle = {Proc. ICMI-MLMI},
abstract = {The Ambient Spotlight is a prototype system based on
personal meeting capture using a laptop and a portable
microphone array. The system automatically recognises
and structures the meeting content using automatic
speech recognition, topic segmentation and extractive
summarisation. The recognised speech in the meeting is
used to construct queries to automatically link meeting
segments to other relevant material, both multimodal
and textual. The interface to the system is constructed
around a standard calendar interface, and it is
integrated with the laptop's standard indexing, search
and retrieval.},
doi = {10.1145/1891903.1891919},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/ambientDemo-icmi.pdf},
url = {http://dx.doi.org/10.1145/1891903.1891919},
year = 2010
}
@inproceedings{AMIMLMI05,
author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
Karafiat and M. Lincoln and I. McCowan and D. Moore and
V. Wan and R. Ordelman and S. Renals},
title = {The Development of the {AMI} System for the
Transcription of Speech in Meetings},
booktitle = {2nd Joint Workshop on Multimodal Interaction and
Related Machine Learning Algorithms},
abstract = {The automatic processing of speech collected in
conference style meetings has attracted considerable
interest with several large scale projects devoted to
this area. This paper describes the development of a
baseline automatic speech transcription system for
meetings in the context of the AMI (Augmented
Multiparty Interaction) project. We present several
techniques important to processing of this data and
show the performance in terms of word error rates
(WERs). An important aspect of transcription of this
data is the necessary flexibility in terms of audio
pre-processing. Real world systems have to deal with
flexible input, for example by using microphone arrays
or randomly placed microphones in a room. Automatic
segmentation and microphone array processing techniques
are described and the effect on WERs is discussed. The
system and its components presented in this paper yield
competitive performance and form a baseline for future
research in this domain.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIsystemMLMI05.pdf},
year = 2005
}
@inproceedings{uriaIS2012,
author = {Benigno Uria and Iain Murray and Steve Renals and
Korin Richmond},
title = {Deep Architectures for Articulatory Inversion},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = { We implement two deep architectures for the
acoustic-articulatory inversion mapping problem: a deep
neural network and a deep trajectory mixture density
network. We find that in both cases, deep architectures
produce more accurate predictions than shallow
architectures and that this is due to the higher
expressive capability of a deep model and not a
consequence of adding more adjustable parameters. We
also find that a deep trajectory mixture density
network is able to obtain better inversion accuracies
than smoothing the results of a deep neural network.
Our best model obtained an average root mean square
error of 0.885 mm on the MNGU0 test dataset.},
categories = {Articulatory inversion, deep neural network, deep
belief network, deep regression network, pretraining},
keywords = {Articulatory inversion, deep neural network, deep
belief network, deep regression network, pretraining},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Uria_et_al_IS2012.pdf},
year = 2012
}
@inproceedings{renals-eurospeech89,
author = {S.~Renals and J.~Dalby},
title = {Analysis of a neural network model for speech
recognition},
booktitle = {Proc. Eurospeech},
volume = {1},
pages = {333--336},
address = {Paris},
categories = {},
year = 1989
}
@inproceedings{murray06b,
author = {G. Murray and S. Renals and M. Taboada},
title = {Prosodic Correlates of Rhetorical Relations},
booktitle = {Proceedings of HLT/NAACL ACTS Workshop, 2006, New York
City, USA},
abstract = {This paper investigates the usefulness of prosodic
features in classifying rhetorical relations between
utterances in meeting recordings. Five rhetorical
relations of \textit{contrast}, \textit{elaboration},
\textit{summary}, \textit{question} and \textit{cause}
are explored. Three training methods - supervised,
unsupervised, and combined - are compared, and
classification is carried out using support vector
machines. The results of this pilot study are
encouraging but mixed, with pairwise classification
achieving an average of 68\% accuracy in discerning
between relation pairs using only prosodic features,
but multi-class classification performing only slightly
better than chance.},
categories = {rhetorical structure theory, prosody, unsupervised
learning},
month = jun,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/dacts-hlt.pdf},
year = 2006
}
@article{2012E121001,
author = {Junichi Yamagishi and Christophe Veaux and Simon King
and Steve Renals},
title = {Speech synthesis technologies for individuals with
vocal disabilities: Voice banking and reconstruction},
journal = {Acoustical Science and Technology},
volume = {33},
number = {1},
pages = {1--5},
url = {http://www.jstage.jst.go.jp/browse/ast/33/1/_contents},
year = 2012
}
@inproceedings{abberley-esca99,
author = {D.~Abberley and D.~Kirby and S.~Renals and T.~Robinson},
title = {The {THISL} broadcast news retrieval system},
booktitle = {Proc. ESCA Workshop on Accessing Information In Spoken
Audio},
pages = {19--24},
address = {Cambridge},
abstract = {This paper describes the THISL spoken document
retrieval system for British and North American
Broadcast News. The system is based on the
\textsc{Abbot} large vocabulary speech recognizer,
using a recurrent network acoustic model, and a
probabilistic text retrieval system. We discuss the
development of a realtime British English Broadcast
News system, and its integration into a spoken document
retrieval system. Detailed evaluation is performed
using a similar North American Broadcast News system,
to take advantage of the TREC SDR evaluation
methodology. We report results on this evaluation, with
particular reference to the effect of query expansion
and of automatic segmentation algorithms.},
categories = {thisl,bnews,trec,ir,recognition,sheffield},
http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/esca99-thisl/},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-thisl.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/esca99-thisl.ps.gz},
year = 1999
}
@inproceedings{bourlard-icassp92,
author = {H.~Bourlard and N.~Morgan and C.~Wooters and S.~Renals},
title = {{CDNN}: A context-dependent neural network for
continuous speech recognition},
booktitle = {Proc. IEEE ICASSP},
pages = {349--352},
address = {San Francisco},
categories = {},
year = 1992
}
@article{dielmann2008,
author = {Dielmann, Alfred and Renals, Steve},
title = {Recognition of Dialogue Acts in Multiparty Meetings
using a Switching {DBN}},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {16},
number = {7},
pages = {1303--1314},
abstract = {This paper is concerned with the automatic recognition
of dialogue acts (DAs) in multiparty conversational
speech. We present a joint generative model for DA
recognition in which segmentation and classification of
DAs are carried out in parallel. Our approach to DA
recognition is based on a switching dynamic Bayesian
network (DBN) architecture. This generative approach
models a set of features, related to lexical content
and prosody, and incorporates a weighted interpolated
factored language model. The switching DBN coordinates
the recognition process by integrating the component
models. The factored language model, which is estimated
from multiple conversational data corpora, is used in
conjunction with additional task-specific language
models. In conjunction with this joint generative
model, we have also investigated the use of a
discriminative approach, based on conditional random
fields, to perform a reclassification of the segmented
DAs. We have carried out experiments on the AMI corpus
of multimodal meeting recordings, using both manually
transcribed speech, and the output of an automatic
speech recognizer, and using different configurations
of the generative model. Our results indicate that the
system performs well both on reference and fully
automatic transcriptions. A further significant
improvement in recognition accuracy is obtained by the
application of the discriminative reranking approach
based on conditional random fields.},
doi = {10.1109/TASL.2008.922463},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/dielmann2008.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4599391&arnumber=4497831&count=18&index=9},
year = 2008
}
@inproceedings{lu_asru_2011,
author = {Lu, L. and Ghoshal, A. and Renals, S.},
title = {Regularized Subspace {Gaussian} Mixture Models for
Cross-lingual Speech Recognition},
booktitle = {Proc. ASRU},
abstract = {We investigate cross-lingual acoustic modelling for
low resource languages using the subspace Gaussian
mixture model (SGMM). We assume the presence of
acoustic models trained on multiple source languages,
and use the global subspace parameters from those
models for improved modelling in a target language with
limited amounts of transcribed speech. Experiments on
the GlobalPhone corpus using Spanish, Portuguese, and
Swedish as source languages and German as target
language (with 1 hour and 5 hours of transcribed audio)
show that multilingually trained SGMM shared parameters
result in lower word error rates (WERs) than using
those from a single source language. We also show that
regularizing the estimation of the SGMM state vectors
by penalizing their $\ell_1$-norm helps to overcome
numerical instabilities and leads to lower WER.},
categories = {Subspace Gaussian Mixture Model, Cross-lingual, model
regularization},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-asru-2011.pdf},
year = 2011
}
@inproceedings{abberley-trec00,
author = {D.~Abberley and S.~Renals and D.~Ellis and T.~Robinson},
title = {The {THISL} {SDR} system at {TREC}--8},
booktitle = {Proc. Eighth Text Retrieval Conference (TREC--8)},
abstract = {This paper describes the participation of the THISL
group at the TREC-8 Spoken Document Retrieval (SDR)
track. The THISL SDR system consists of the realtime
version of the Abbot large vocabulary speech
recognition system and the thislIR text retrieval
system. The TREC-8 evaluation assessed SDR performance
on a corpus of 500 hours of broadcast news material
collected over a five month period. The main test
condition involved retrieval of stories defined by
manual segmentation of the corpus in which non-news
material, such as commercials, were excluded. An
optional test condition required retrieval of
the same stories from the unsegmented audio stream. The
THISL SDR system participated at both test conditions.
The results show that a system such as THISL can
produce respectable information retrieval performance
on a realistically-sized corpus of unsegmented audio
material.},
categories = {thisl,bnews,trec,ir,recognition,eval,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/trec8.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/trec8.ps.gz},
year = 2000
}
@inproceedings{hochberg-nnsp94,
author = {M.~Hochberg and G.~Cook and S.~Renals and T.~Robinson},
title = {Connectionist model combination for large vocabulary
speech recognition},
booktitle = {IEEE Proc. Neural Networks for Signal Processing},
volume = {4},
pages = {269--278},
categories = {},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/nnsp94.ps.gz},
year = 1994
}
@inproceedings{renals-darpa99,
author = {S.~Renals and Y.~Gotoh and R.~Gaizauskas and
M.~Stevenson},
title = {The {SPRACH/LaSIE} system for named entity
identification in broadcast news},
booktitle = {Proc. DARPA Broadcast News Workshop},
pages = {47--50},
abstract = {We have developed two conceptually different systems
that are able to identify named entities from spoken
audio. One (referred to as SPRACH-S) has a stochastic
finite state machine structure for use with an acoustic
model that identifies both words and named entities
from speech data. The other (referred to as SPRACH-R)
is a rule-based system which uses matching against
stored name lists, part-of-speech tagging, and light
phrasal parsing with specialised named entity grammars.
We provide an overview of the two approaches and
present results on the Hub-4E IE-NE evaluation task.},
categories = {sprach,stobs,ie,lm,bnews,sheffield},
http = {http://homepages.inf.ed.ac.uk/srenals/pubs/1999/darpa99-ne.html},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-ne.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/darpa99-ne.ps.gz},
year = 1999
}
@inproceedings{abberley-trec99,
author = {D.~Abberley and S.~Renals and G.~Cook and T.~Robinson},
title = {Retrieval of broadcast news documents with the {THISL}
system},
booktitle = {Proc. Seventh Text Retrieval Conference (TREC--7)},
pages = {181--190},
abstract = {This paper describes the THISL system that
participated in the TREC-7 evaluation, Spoken Document
Retrieval (SDR) Track, and presents the results
obtained, together with some analysis. The THISL system
is based on the {\sc Abbot} speech recognition system
and the thislIR text retrieval system. In this
evaluation we were concerned with investigating the
suitability for SDR of a recognizer running at less
than ten times realtime, the use of multiple
transcriptions and word graphs, the effect of simple
query expansion algorithms and the effect of varying
standard IR parameters.},
categories = {thisl,bnews,trec,ir,recognition,eval,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/trec7.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/trec7.ps.gz},
year = 1999
}
@book{renals-book03,
editor = {S.~Renals and G.~Grefenstette},
title = {Text and Speech Triggered Information Access},
publisher = {Springer-Verlag},
number = {2705},
series = {Lecture Notes in Computer Science},
abstract = {Edited collection of revised lectures from the
\href{http://www.ilsp.gr/testia/testia2000.html}
{ELSNET-2000 Summer School} on Text and Speech
Triggered Information Access. },
categories = {recognition,ir,ie,lm,multimodal,sheffield},
url = {http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=2705&issue=preprint},
year = 2003
}
@inproceedings{murray06c,
author = {G. Murray and S. Renals},
title = {Dialogue Act Compression Via Pitch Contour
Preservation},
booktitle = {Proceedings of the 9th International Conference on
Spoken Language Processing, Pittsburgh, USA},
abstract = {This paper explores the usefulness of prosody in
automatically compressing dialogue acts from meeting
speech. Specifically, this work attempts to compress
utterances by preserving the pitch contour of the
original whole utterance. Two methods of doing this are
described in detail and are evaluated
\textit{subjectively} using human annotators and
\textit{objectively} using edit distance with a
human-authored gold-standard. Both metrics show that
such a prosodic approach is much better than the random
baseline approach and significantly better than a
simple text compression method.},
categories = {automatic compression, prosody, summarization},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/inter2006.pdf},
year = 2006
}
@inproceedings{bourlard2008,
author = {Bourlard, Herve and Renals, Steve},
title = {Recognition and Understanding of Meetings: Overview of
the {European} {AMI} and {AMIDA} Projects},
booktitle = {Proc. LangTech 2008},
abstract = {The AMI and AMIDA projects are concerned with the
recognition and interpretation of multiparty
(face-to-face and remote) meetings. Within these
projects we have developed the following: (1) an
infrastructure for recording meetings using multiple
microphones and cameras; (2) a one hundred hour,
manually annotated meeting corpus; (3) a number of
techniques for indexing, and summarizing of meeting
videos using automatic speech recognition and computer
vision, and (4) an extensible framework for browsing,
and searching of meeting videos. We give an overview of
the various techniques developed in AMI (mainly
involving face-to-face meetings), their integration
into our meeting browser framework, and future plans
for AMIDA (Augmented Multiparty Interaction with
Distant Access), the follow-up project to AMI.
Technical and business information related to these two
projects can be found at www.amiproject.org,
respectively on the Scientific and Business portals. },
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bourlard2008.pdf},
year = 2008
}