The Centre for Speech Technology Research, The University of Edinburgh

Publications by Mike Lincoln

mlincol1.bib

@inproceedings{zwyssig2010,
  author = {Zwyssig, Erich and Lincoln, Mike and Renals, Steve},
  title = {A Digital Microphone Array for Distant Speech
                   Recognition},
  booktitle = {Proc. IEEE ICASSP--10},
  pages = {5106--5109},
  abstract = {In this paper, the design, implementation and testing
                   of a digital microphone array is presented. The array
                   uses digital MEMS microphones which integrate the
                   microphone, amplifier and analogue to digital converter
                   on a single chip in place of the analogue microphones
                   and external audio interfaces currently used. The
                   device has the potential to be smaller, cheaper and
                   more flexible than typical analogue arrays, however the
                   effect on speech recognition performance of using
                   digital microphones is as yet unknown. In order to
                   evaluate the effect, an analogue array and the new
                   digital array are used to simultaneously record test
                   data for a speech recognition experiment. Initial
                   results employing no adaptation show that performance
                   using the digital array is significantly worse (14\%
                   absolute WER) than the analogue device. Subsequent
                   experiments using MLLR and CMLLR channel adaptation
                   reduce this gap, and employing MLLR for both channel
                   and speaker adaptation reduces the difference between
                   the arrays to 4.5\% absolute WER.},
  doi = {10.1109/ICASSP.2010.5495040},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/zwyssig-icassp10.pdf},
  year = 2010
}
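Note: the channel adaptation mentioned in the abstract estimates an affine
feature transform. As a rough illustration only (not the paper's
implementation), the diagonal special case of a CMLLR-style transform,
x' = a*x + b per dimension, reduces to matching per-dimension means and
variances between channels. A minimal Python sketch with hypothetical data:

import numpy as np

def diagonal_cmllr(feats, target_mean, target_var):
    # Per-dimension affine transform x' = a*x + b. With a diagonal
    # transform and a single Gaussian target, the maximum-likelihood
    # estimate reduces to matching means and variances.
    a = np.sqrt(target_var / feats.var(axis=0))
    b = target_mean - a * feats.mean(axis=0)
    return a, b

# Toy demonstration: simulate a gain/offset mismatch between two
# channels recording the same features (all values hypothetical).
rng = np.random.default_rng(0)
analogue = rng.normal(0.0, 1.0, size=(1000, 13))
digital = 1.7 * analogue + 0.4                 # simulated channel mismatch
a, b = diagonal_cmllr(digital, analogue.mean(0), analogue.var(0))
adapted = a * digital + b
print(np.allclose(adapted.mean(0), analogue.mean(0)))  # True: means matched
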
@inproceedings{Ban00,
  author = {Bangham, J.A. and Cox, S.J. and Lincoln, M. and
                   Marshall, I. and Tutt, M. and Wells, M.},
  title = {Signing for the deaf using virtual humans},
  booktitle = {IEE Colloquium on Speech and Language Processing for
                   Disabled and Elderly},
  abstract = {Research at Televirtual (Norwich) and the University
                   of East Anglia, funded predominantly by the Independent
                   Television Commission and more recently by the UK Post
                   Office also, has investigated the feasibility of using
                   virtual signing as a communication medium for
                   presenting information to the Deaf. We describe and
                   demonstrate the underlying virtual signer technology,
                   and discuss the language processing techniques and
                   discourse models which have been investigated for
                   information communication in a transaction application
                   in Post Offices, and for presentation of more general
                   textual material in texts such as subtitles
                   accompanying television programmes.},
  categories = {visicast,sign language,translation,UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/iee2000-04PaperAFinal.pdf},
  year = 2000
}
@article{Wray04-LC04,
  author = {Wray, A. and Cox, S.J. and Lincoln, M. and Tryggvason,
                   J.},
  title = {A Formulaic Approach to Translation at the Post
                   Office: Reading the Signs},
  journal = {Language and Communication},
  volume = {24},
  number = {1},
  pages = {59--75},
  abstract = {TESSA is an interactive translation system designed to
                   support transactions between a post office clerk and a
                   deaf customer. The system translates the clerk's speech
                   into British Sign Language (BSL), displayed on a
                   screen, using a specially-developed avatar (virtual
                   human). TESSA is a context-constrained exemplification
                   of one of two basic approaches to machine translation,
                   neither of which can currently fulfil all of the
                   demands of successful automatic translation. Drawing on
                   recent research in theoretical psycholinguistics, we
                   show how TESSA is a convincing prototype model of one
                   aspect of real human language processing. Ways are
                   suggested of exploiting this parallel, potentially
                   offering new possibilities for the future design of
                   artificial language systems.},
  categories = {visicast,sign language,translation,UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/WrayCoxetal2004.pdf},
  year = 2004
}
@phdthesis{LIN99-phd,
  author = {Lincoln, M.},
  title = {Characterization of Speakers for Improved Automatic
                   Speech Recognition},
  school = {University of East Anglia},
  abstract = {Automatic speech recognition technology is becoming
                   increasingly widespread in many applications. For
                   dictation tasks, where a single talker is to use the
                   system for long periods of time, the high recognition
                   accuracies obtained are in part due to the user
                   performing a lengthy enrolment procedure to tune the
                   parameters of the recogniser to their particular voice
                   characteristics and speaking style. Interactive speech
                   systems, where the speaker is using the system for only
                   a short period of time (for example to obtain
                   information) do not have the luxury of long enrolments
                   and have to adapt rapidly to new speakers and speaking
                   styles. This thesis discusses the variations between
                   speakers and speaking styles which result in decreased
                   recognition performance when there is a mismatch
                   between the talker and the system's models. An
                   unsupervised method to rapidly identify and normalise
                   differences in vocal tract length is presented and
                   shown to give improvements in recognition accuracy for
                   little computational overhead. Two unsupervised methods
                   of identifying speakers with similar speaking styles
                   are also presented. The first, a data-driven technique,
                   is shown to accurately classify British and American
                   accented speech, and is also used to improve
                   recognition accuracy by clustering groups of similar
                   talkers. The second uses the phonotactic information
                   available within pronunciation dictionaries to model
                   British and American accented speech. This model is
                   then used to rapidly and accurately classify speakers.},
  categories = {adaptation, ASR, speaker characteristics, BT, UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/thesis.pdf},
  year = 1999
}
@inproceedings{NistevalAMI06,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and
                   M. Karafiat and M. Lincoln and J. Vepa and V. Wan},
  title = {The {AMI} Meeting Transcription System: Progress and
                   Performance},
  booktitle = {Proceedings of the Rich Transcription 2006 Spring
                   Meeting Recognition Evaluation},
  abstract = {We present the AMI 2006 system for the transcription
                   of speech in meetings. The system was jointly developed
                   by multiple sites on the basis of the 2005 system for
                   participation in the NIST RT'05 evaluations. The paper
                   describes major developments such as improvements in
                   automatic segmentation, cross-domain model adaptation,
                   inclusion of MLP based features, improvements in
                   decoding, language modelling and vocal tract length
                   normalisation, the use of a new decoder, and a new
                   system architecture. This is followed by a
                   comprehensive description of the final system and its
                   performance in the NIST RT'06s evaluations. In
                   comparison to the previous year word error rate results
                   on the individual headset microphone task were reduced
                   by 20\% relative.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT06S},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/AMIasr.nist06.pdf},
  year = 2006
}
@inproceedings{Lin97,
  author = {Lincoln, M. and Cox, S.J. and Ringland, S.},
  title = {A fast method of speaker normalisation using formant
                   estimation},
  booktitle = {5th European Conference on Speech Communication and
                   Technology},
  pages = {2095--2098},
  address = {Rhodes},
  abstract = {It has recently been shown that normalisation of vocal
                   tract length can significantly increase recognition
                   accuracy in speaker independent automatic speech
                   recognition systems. An inherent difficulty with this
                   technique is in automatically estimating the
                   normalisation parameter from a new speaker's speech and
                   previous techniques have typically relied on an
                   exhaustive search to estimate this parameter. In this
                   paper, we present a method of normalising utterances by
                   a linear warping of the mel filter bank channels in
                   which the normalisation parameter is estimated by
                   fitting formant estimates to a probabilistic model.
                   This method is fast, computationally inexpensive and
                   requires only a limited amount of data for estimation.
                   It generates normalisations which are close to those
                   which would be found by an exhaustive search. The
                   normalisation is applied to a phoneme recognition task
                   using the TIMIT database and results show a useful
                   improvement over an un-normalised speaker independent
                   system.},
  categories = {adaptation, vocal tract, speaker characteristics, BT,
                   UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/lincoln-espeech-97.pdf},
  year = 1997
}
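Note: the warping step described in the abstract can be sketched as below;
the paper's contribution is estimating the warp factor from formant fits,
which is not reproduced here. The factor alpha is supplied directly, and
all function names and parameter values are illustrative:

import numpy as np

def hz_to_mel(f):
    return 2595.0 * np.log10(1.0 + f / 700.0)

def mel_to_hz(m):
    return 700.0 * (10.0 ** (m / 2595.0) - 1.0)

def warped_filter_centres(alpha, n_filters=26, sample_rate=16000):
    # Place filter centres uniformly on the mel scale, then apply the
    # linear frequency warp f -> alpha * f used for vocal tract length
    # normalisation; alpha = 1.0 leaves the bank unchanged.
    edges = np.linspace(hz_to_mel(0.0), hz_to_mel(sample_rate / 2.0),
                        n_filters + 2)
    centres = mel_to_hz(edges[1:-1])
    return np.minimum(alpha * centres, sample_rate / 2.0)

print(warped_filter_centres(1.1)[:3])  # first few warped centres in Hz
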
@inproceedings{Cox02d,
  author = {Cox, S.J. and Lincoln, M. and Tryggvason, J. and
                   Nakisa, M. and Wells, M. and Tutt, M. and Abbott, S.},
  title = {{TESSA}, a system to aid communication with deaf
                   people},
  booktitle = {ASSETS 2002, Fifth International {ACM SIGCAPH}
                   Conference on Assistive Technologies},
  pages = {205--212},
  address = {Edinburgh, Scotland},
  abstract = {{TESSA} is an experimental system that aims to aid
                   transactions between a deaf person and a clerk in a
                   Post Office by translating the clerk's speech to sign
                   language. A speech recogniser recognises speech from
                   the clerk and the system then synthesizes the
                   appropriate sequence of signs in British Sign Language
                   (BSL) using a specially-developed avatar. By using a
                   phrase lookup approach to language translation, which
                   is appropriate for the highly constrained discourse in
                   a Post Office, we were able to build a working system
                   that we could evaluate. We summarise the results of
                   this evaluation (undertaken by deaf users and Post
                   Office clerks), and discuss how the findings from the
                   evaluation are being used in the development of an
                   improved system.},
  categories = {visicast,sign language,translation,UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Cox-Assets-2000.pdf},
  year = 2002
}
@inproceedings{NistevalAMI05,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and I. McCowan and D. Moore and
                   V. Wan and R. Ordelman and S. Renals},
  title = {The 2005 {AMI} System for the transcription of Speech
                   in Meetings},
  booktitle = {Proceedings of the Rich Transcription 2005 Spring
                   Meeting Recognition Evaluation},
  abstract = {In this paper we describe the 2005 AMI system for the
                   transcription of speech in meetings used in the 2005
                   NIST RT evaluations. The system was designed for
                   participation in the speech to text part of the
                   evaluations, in particular for transcription of speech
                   recorded with multiple distant microphones and
                   independent headset microphones. System performance was
                   tested on both conference room and lecture style
                   meetings. Although input sources are processed using
                   different frontends, the recognition process is based
                   on a unified system architecture. The system operates
                   in multiple passes and makes use of state of the art
                   technologies such as discriminative training, vocal
                   tract length normalisation, heteroscedastic linear
                   discriminant analysis, speaker adaptation with maximum
                   likelihood linear regression and minimum word error
                   rate decoding. In this paper we describe the system
                   performance on the official development and test sets
                   for the NIST RT05s evaluations. The system was jointly
                   developed in less than 10 months by a multi-site team
                   and was shown to achieve competitive performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT05S},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIasr.nist2005.pdf},
  year = 2005
}
@inproceedings{Lin98,
  author = {Lincoln, M. and Cox, S.J. and Ringland, S.},
  title = {A Comparison of Two Unsupervised Approaches to Accent
                   Identification},
  booktitle = {Int. Conf. on Spoken Language Processing},
  pages = {109--112},
  address = {Sydney},
  abstract = {The ability to automatically identify a speaker's
                   accent would be very useful for a speech recognition
                   system as it would enable the system to use both a
                   pronunciation dictionary and speech models specific to
                   the accent, techniques which have been shown to improve
                   accuracy. Here, we describe some experiments in
                   unsupervised accent classification. Two techniques have
                   been investigated to classify British- and
                   American-accented speech: an acoustic approach, in which
                   we analyse the pattern of usage of the distributions in
                   the recogniser by a speaker to decide on his most
                   probable accent, and a high-level approach in which we
                   use a phonotactic model for classification of the
                   accent. Results show that both techniques give
                   excellent performance on this task which is maintained
                   when testing is done on data from an independent
                   dataset.},
  categories = {accent identification, speaker characteristics, BT,
                   UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/lincoln-icslp-98.pdf},
  year = 1998
}
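Note: the phonotactic approach in the abstract can be illustrated with a
toy classifier: train one phone-bigram model per accent and label a test
phone string with the accent whose model scores it higher. A deliberately
minimal Python sketch; the phone sequences are invented, not dictionary
entries:

import math
from collections import Counter, defaultdict

def train_phone_bigram(sequences, smoothing=1.0):
    # Add-one smoothed phone-bigram model; returns a scorer giving the
    # log-likelihood of a new phone sequence under the model.
    counts = defaultdict(Counter)
    inventory = set()
    for seq in sequences:
        inventory.update(seq)
        for a, b in zip(seq, seq[1:]):
            counts[a][b] += 1
    v = max(len(inventory), 1)

    def loglik(seq):
        total = 0.0
        for a, b in zip(seq, seq[1:]):
            c = counts[a]
            total += math.log((c[b] + smoothing) /
                              (sum(c.values()) + smoothing * v))
        return total

    return loglik

british = train_phone_bigram([["t", "ow", "m", "aa", "t", "ow"]])
american = train_phone_bigram([["t", "ow", "m", "ey", "t", "ow"]])

test = ["t", "ow", "m", "ey", "t", "ow"]
print("American" if american(test) > british(test) else "British")
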
@inproceedings{zwyssig2012determining,
  author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
  title = {Determining the number of speakers in a meeting using
                   microphone array features},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2012
                   IEEE International Conference on},
  pages = {4765--4768},
  year = 2012
}
@inproceedings{zwyssig2012effect,
  author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
  title = {{On the effect of SNR and superdirective beamforming
                   in speaker diarisation in meetings}},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2012
                   IEEE International Conference on},
  pages = {4177--4180},
  year = 2012
}
@inproceedings{jyamagis:emime,
  author = {Junichi Yamagishi and Mike Lincoln and Simon King and
                   John Dines and Matthew Gibson and Jilei Tian and Yong
                   Guan},
  title = {Analysis of Unsupervised and Noise-Robust
                   Speaker-Adaptive {HMM}-Based Speech Synthesis Systems
                   toward a Unified {ASR} and {TTS} Framework},
  booktitle = {Proc. Interspeech 2009},
  address = {Edinburgh, U.K.},
  abstract = {For the 2009 Blizzard Challenge we have built an
                   unsupervised version of the HTS-2008 speaker-adaptive
                   HMM-based speech synthesis system for English, and a
                   noise robust version of the systems for Mandarin. They
                   are designed from a multidisciplinary application point
                   of view in that we attempt to integrate the components
                   of the TTS system with other technologies such as ASR.
                   All the average voice models are trained exclusively
                   from recognized, publicly available, ASR databases.
                   Multi-pass LVCSR and confidence scores calculated from
                   confusion networks are used for the unsupervised
                   systems, and noisy data recorded in cars or public
                   spaces is used for the noise robust system. We believe
                   the developed systems form solid benchmarks and provide
                   good connections to ASR fields. This paper describes
                   the development of the systems and reports the results
                   and analysis of their evaluation.},
  month = sep,
  year = 2009
}
@article{Cox-ijhci03,
  author = {Cox, S.J. and Lincoln, M. and Nakisa, M. and Wells, M.
                   and Tutt, M. and Abbott, S.},
  title = {The Development and Evaluation of a Speech to Sign
                   Translation System to Assist Transactions},
  journal = {Int. Journal of Human Computer Interaction},
  volume = {16},
  number = {2},
  pages = {141--161},
  abstract = {The design, development, and evaluation of an
                   experimental translation system that aims to aid
                   transactions between a deaf person and a clerk in a
                   post office (PO) is described. The system uses a speech
                   recognizer to recognize speech from a PO clerk and then
                   synthesizes recognized phrases in British Sign Language
                   (BSL) using a specially developed avatar. The main
                   objective in developing this prototype system was to
                   determine how useful it would be to a customer whose
                   first language was BSL, and to discover what areas of
                   the system required more research and development to
                   make it more effective. The system was evaluated by 6
                   prelingually profoundly deaf people and 3 PO clerks.
                   Deaf users and PO clerks were supportive of the system,
                   but the former group required a higher quality of
                   signing from the avatar and the latter a system that
                   was less constrained in the phrases it could recognize;
                   both these areas are being addressed in the next phase
                   of development.},
  categories = {visicast,sign language,translation,UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/ijhci.pdf},
  year = 2003
}
@inproceedings{AMIMLMI05,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and I. McCowan and D. Moore and
                   V. Wan and R. Ordelman and S. Renals},
  title = {The Development of the {AMI} System for the
                   Transcription of Speech in Meetings},
  booktitle = {2nd Joint Workshop on Multimodal Interaction and
                   Related Machine Learning Algorithms},
  abstract = {The automatic processing of speech collected in
                   conference style meetings has attracted considerable
                   interest with several large scale projects devoted to
                   this area. This paper describes the development of a
                   baseline automatic speech transcription system for
                   meetings in the context of the AMI (Augmented
                   Multiparty Interaction) project. We present several
                   techniques important to processing of this data and
                   show the performance in terms of word error rates
                   (WERs). An important aspect of transcription of this
                   data is the necessary flexibility in terms of audio
                   pre-processing. Real world systems have to deal with
                   flexible input, for example by using microphone arrays
                   or randomly placed microphones in a room. Automatic
                   segmentation and microphone array processing techniques
                   are described and the effect on WERs is discussed. The
                   system and its components presented in this paper yield
                   competitive performance and form a baseline for future
                   research in this domain.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIsystemMLMI05.pdf},
  year = 2005
}
@inproceedings{AMIsystemICASSP2007,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and J. Vepa and V. Wan},
  title = {{The {AMI} System for the Transcription of Speech in
                   Meetings}},
  booktitle = {Proc. {ICASSP}},
  abstract = {This paper describes the AMI transcription system for
                   speech in meetings developed in collaboration by five
                   research groups. The system includes generic techniques
                   such as discriminative and speaker adaptive training,
                   vocal tract length normalisation, heteroscedastic
                   linear discriminant analysis, maximum likelihood linear
                   regression, and phone posterior based features, as well
                   as techniques specifically designed for meeting data.
                   These include segmentation and cross-talk suppression,
                   beam-forming, domain adaptation, web-data collection,
                   and channel adaptive training. The system was improved
                   by more than 20\% relative in word error rate compared
                   to our previous system and was used in the NIST RT'06
                   evaluations where it was found to yield competitive
                   performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT06S},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ICASSP07.AMIasrsystem.pdf},
  year = 2007
}
@inproceedings{Lin03,
  author = {Lincoln, M. and Cox, S.J.},
  title = {A Comparison of Language Processing Techniques for a
                   Constrained Speech Translation System},
  booktitle = {IEEE Conference on Acoustics, Speech and Signal
                   Processing},
  address = {Hong Kong},
  abstract = {A system designed to allow Post Office counter clerks
                   to communicate with deaf customers by translating
                   speech into sign language is described. The system uses
                   approximately 370 pre-stored phrases which may be
                   signed to the customer using a specially designed
                   avatar. The clerk is unable to memorise this number of
                   phrases and therefore the system attempts to map from
                   their input speech to the semantically equivalent
                   pre-stored phrase. We describe a number of language
                   processing techniques developed to perform the mapping,
                   and give results obtained using alternative
                   formulations of the phrases from a number of speakers.
                   We then give results for recognised speech input and
                   show how mis-recognitions affect the mapping system.
                   Best performance is obtained using a mapping system
                   based on an entropy weighted, vector based distance
                   measure between the test phrase and each of the signed
                   phrases.},
  categories = {visicast,sign language,translation,UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp2003.pdf},
  year = 2003
}
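Note: the best-performing mapper in the abstract, an entropy-weighted,
vector-based distance, can be approximated as follows: words spread evenly
across the stored phrases get low weight, phrase-specific words get high
weight, and a recognised utterance is matched to the stored phrase with
the highest weighted cosine similarity. This is a generic log-entropy
weighting in the spirit of the description, not the paper's exact
formulation, and the phrases are invented:

import math
from collections import Counter

def entropy_weights(phrases):
    # Weight each word by 1 - H(w)/log(n), where H(w) is the entropy of
    # the word's distribution over the n stored phrases: evenly spread
    # words approach weight 0, phrase-specific words approach weight 1.
    tf = [Counter(p.split()) for p in phrases]
    n = len(phrases)
    weights = {}
    for w in set().union(*tf):
        counts = [c[w] for c in tf]
        total = sum(counts)
        h = -sum((c / total) * math.log(c / total) for c in counts if c)
        weights[w] = 1.0 - h / math.log(n) if n > 1 else 1.0
    return weights

def similarity(query, phrase, weights):
    # Weighted cosine similarity between bag-of-words vectors; unseen
    # words default to weight 1.0 (a design choice for this sketch).
    q, p = Counter(query.split()), Counter(phrase.split())
    num = sum(weights.get(w, 1.0) ** 2 * q[w] * p[w]
              for w in q.keys() & p.keys())
    qn = math.sqrt(sum((weights.get(w, 1.0) * v) ** 2 for w, v in q.items()))
    pn = math.sqrt(sum((weights.get(w, 1.0) * v) ** 2 for w, v in p.items()))
    return num / (qn * pn) if qn and pn else 0.0

stored = ["first class stamp please", "send this parcel to France",
          "renew my television licence"]
w = entropy_weights(stored)
query = "i want to send a parcel to France"
print(max(stored, key=lambda p: similarity(query, p, w)))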