The Centre for Speech Technology Research, The University of Edinburgh

Publications by Mark Sinclair

s0975530.bib

@inproceedings{sinclair_ICASSP13,
  author = {Sinclair, Mark and King, Simon},
  title = {Where are the challenges in speaker diarization?},
  abstract = {We present a study of the contributions to Diarization Error Rate made by the various components of a speaker diarization system. Following on from an earlier study by Huijbregts and Wooters, we extend into more areas and draw somewhat different conclusions. From a series of experiments combining real, oracle and ideal system components, we conclude that the primary cause of error in diarization is the training of speaker models on impure data, something that is in fact done in every current system. We close by suggesting ways to improve future systems, including a focus on training the speaker models from smaller quantities of pure data instead of all the data, as is currently done.},
  year = {2013},
  month = {May},
  address = {Vancouver, British Columbia, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/3512.pdf},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  categories = {speaker diarization, diarization error rate}
}
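
For reference, the Diarization Error Rate discussed in this abstract decomposes into missed speech, false-alarm speech and speaker-confusion time, all measured against the total scored speech duration. A minimal sketch of that arithmetic (the function and the toy durations are illustrative, not taken from the paper):

# Diarization Error Rate (DER) is the fraction of scored speech time
# attributed to missed speech, false-alarm speech, and speaker confusion.
def diarization_error_rate(missed, false_alarm, confusion, total_speech):
    """All arguments are durations in seconds over the scored regions."""
    return (missed + false_alarm + confusion) / total_speech

# Toy example: 10 minutes of scored speech.
der = diarization_error_rate(missed=12.0, false_alarm=9.0,
                             confusion=33.0, total_speech=600.0)
print(f"DER = {der:.1%}")  # DER = 9.0%
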
@inproceedings{jdriesen:iwslt_german,
  author = {Driesen, Joris and Bell, Peter and Sinclair, Mark and Renals, Steve},
  title = {Description of the {UEDIN} system for {German ASR}},
  booktitle = {Proc. IWSLT},
  year = {2013},
  month = {December},
  address = {Heidelberg, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/german_iwslt.pdf},
  abstract = {In this paper we describe the ASR system for German built at the University of Edinburgh (UEDIN) for the 2013 IWSLT evaluation campaign. The major challenge for ASR was finding suitable acoustic training data: due to the lack of expertly transcribed German speech, acoustic model training had to be performed on publicly available data crawled from the internet. For evaluation, the lack of a manual segmentation into utterances was handled in two different ways: by generating an automatic segmentation, and by treating entire input files as a single segment. The latter method proved superior on this task, yielding a WER of 28.16\% on the dev set and 36.21\% on the test set.}
}
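
The WER figures quoted here follow the standard definition: word-level edit distance (substitutions, deletions and insertions) divided by the number of reference words. A self-contained sketch, with a made-up example pair:

def word_error_rate(ref, hyp):
    """WER = (substitutions + deletions + insertions) / len(ref),
    computed by Levenshtein alignment over words."""
    r, h = ref.split(), hyp.split()
    # d[i][j]: edit distance between first i ref words and first j hyp words.
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            sub = d[i - 1][j - 1] + (r[i - 1] != h[j - 1])
            d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)
    return d[len(r)][len(h)] / len(r)

print(word_error_rate("der hund bellt laut", "der hund bellt"))  # 0.25
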
@inproceedings{bell13_iwslt,
  author = {Bell, Peter and McInnes, Fergus and Gangireddy, Siva Reddy and Sinclair, Mark and Birch, Alexandra and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/bell13_iwslt_system.pdf},
  booktitle = {Proc. International Workshop on Spoken Language Translation},
  title = {The {UEDIN} English {ASR} System for the {IWSLT} 2013 Evaluation},
  abstract = {This paper describes the University of Edinburgh (UEDIN) English ASR system for the IWSLT 2013 Evaluation. Notable features of the system include deep neural network acoustic models in both tandem and hybrid configuration, cross-domain adaptation with multi-level adaptive networks, and the use of a recurrent neural network language model. Improvements to our system since the 2012 evaluation, which include the use of a significantly improved n-gram language model, result in a 19\% relative WER reduction on the tst2012 set.},
  year = {2013}
}
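
A note on the "19% relative" figure: relative WER reduction is measured against the earlier system's error rate, not as an absolute difference in percentage points. Illustrative arithmetic only; the baseline numbers below are invented:

# Relative WER reduction: (old - new) / old.
old_wer, new_wer = 20.0, 16.2
relative_reduction = (old_wer - new_wer) / old_wer
print(f"{relative_reduction:.0%} relative")  # 19% relative
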
@inproceedings{sinclairbell_interspeech14,
  author = {Sinclair, Mark and Bell, Peter and Birch, Alexandra and McInnes, Fergus},
  title = {A semi-Markov model for speech segmentation with an utterance-break prior},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/interspeech2014.pdf},
  abstract = {Speech segmentation is the problem of finding the end points of a speech utterance for passing to an automatic speech recognition (ASR) system. The quality of this segmentation can have a large impact on the accuracy of the ASR system; in this paper we demonstrate that it can have an even larger impact on downstream natural language processing tasks – in this case, machine translation. We develop a novel semi-Markov model which allows the segmentation of audio streams into speech utterances which are optimised for the desired distribution of sentence lengths for the target domain. We compare this with existing state-of-the-art methods and show that it is able to achieve not only improved ASR performance, but also to yield significant benefits to a speech translation task.},
  categories = {speech activity detection, speech segmentation, machine translation, speech recognition}
}
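
The semi-Markov idea in this paper, scoring whole candidate segments and weighting them by a prior over utterance lengths, lends itself to a simple dynamic programme over boundary positions. The sketch below uses placeholder scoring functions and a toy prior; it is not the paper's model, only the general recipe:

def segment(n_frames, speech_score, length_log_prior, max_len):
    """Find the best segmentation of frames [0, n_frames) into utterances.

    speech_score(s, e): log-likelihood that frames [s, e) form one utterance
                        (a placeholder for the acoustic model).
    length_log_prior(d): log prior probability of an utterance of d frames
                         (the utterance-break prior).
    Returns the list of segment end boundaries.
    """
    NEG_INF = float("-inf")
    best = [NEG_INF] * (n_frames + 1)  # best[e]: best score for frames [0, e)
    back = [0] * (n_frames + 1)
    best[0] = 0.0
    for e in range(1, n_frames + 1):
        for s in range(max(0, e - max_len), e):
            score = best[s] + speech_score(s, e) + length_log_prior(e - s)
            if score > best[e]:
                best[e], back[e] = score, s
    # Trace back the boundaries.
    bounds, e = [], n_frames
    while e > 0:
        bounds.append(e)
        e = back[e]
    return sorted(bounds)

# Toy usage: uniform acoustics, quadratic log prior peaked at 50 frames.
print(segment(200,
              speech_score=lambda s, e: 0.0,
              length_log_prior=lambda d: -((d - 50) ** 2) / 200.0,
              max_len=120))  # -> [50, 100, 150, 200]
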
@inproceedings{bell14_iwslt,
  author = {Bell, Peter and Swietojanski, Pawel and Driesen, Joris and Sinclair, Mark and McInnes, Fergus and Renals, Steve},
  title = {The {UEDIN} {ASR} Systems for the {IWSLT} 2014 Evaluation},
  booktitle = {Proc. IWSLT},
  address = {South Lake Tahoe, USA},
  month = {December},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell14_iwslt.pdf},
  abstract = {This paper describes the University of Edinburgh (UEDIN) ASR systems for the 2014 IWSLT Evaluation. Notable features of the English system include deep neural network acoustic models in both tandem and hybrid configuration with the use of multi-level adaptive networks, LHUC adaptation and Maxout units. The German system includes lightly supervised training and a new method for dictionary generation. Our voice activity detection system now uses a semi-Markov model to incorporate a prior on utterance lengths. We obtain up to a 30\% relative reduction in WER on the tst2013 English test set.}
}
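
LHUC (Learning Hidden Unit Contributions), mentioned above, adapts a trained network to a speaker by learning one amplitude scaler per hidden unit while the original weights stay fixed; the published recipe squashes each scaler through 2*sigmoid. A minimal sketch of the re-parameterisation (shapes and values are illustrative):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lhuc_layer(h, r):
    """Scale each hidden unit's activation h_k by a speaker-dependent
    amplitude a_k = 2 * sigmoid(r_k) in (0, 2). During adaptation only
    the vector r is trained; the base network weights are frozen."""
    return 2.0 * sigmoid(r) * h

# Toy usage: 4 hidden units; r = 0 leaves the layer unchanged (a_k = 1).
h = np.array([0.3, -1.2, 0.8, 0.1])
r = np.zeros(4)          # speaker-dependent parameters, initialised neutral
print(lhuc_layer(h, r))  # identical to h before adaptation
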
@inproceedings{bell15_news_summarisation,
  author = {Bell, Peter and Lai, Catherine and Llewellyn, Clare and Birch, Alexandra and Sinclair, Mark},
  title = {A system for automatic broadcast news summarisation, geolocation and translation},
  booktitle = {Proc. Interspeech (demo session)},
  address = {Dresden, Germany},
  month = {September},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_news_summarisation.pdf},
  abstract = {An increasing amount of news content is produced in audio-video form every day. To effectively analyse and monitor this multilingual data stream, we require methods to extract and present audio content in accessible ways. In this paper, we describe an end-to-end system for processing and browsing audio news data. This fully automated system brings together our recent research on audio scene analysis, speech recognition, summarisation, named entity detection, geolocation, and machine translation. The graphical interface allows users to visualise the distribution of news content by entity names and story location. Browsing of news events is facilitated through extractive summaries and the ability to view transcripts in multiple languages.}
}
@inproceedings{bell17_transcription_correction,
  author = {Bell, Peter and Fainberg, Joachim and Lai, Catherine and Sinclair, Mark},
  title = {A system for real-time collaborative transcription correction},
  booktitle = {Proc. Interspeech (demo session)},
  month = {August},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/is2017demo_nh_1.pdf},
  abstract = {We present a system to enable efficient, collaborative human correction of ASR transcripts, designed to operate in real-time situations, for example, when post-editing live captions generated for news broadcasts. In the system, confusion networks derived from ASR lattices are used to highlight low-confidence words and present alternatives to the user for quick correction. The system uses a client-server architecture, whereby information about each manual edit is posted to the server. Such information can be used to dynamically update the one-best ASR output for all utterances currently in the editing pipeline. We propose to make updates in three different ways: by finding a new one-best path through an existing ASR lattice consistent with the correction received; by identifying further instances of out-of-vocabulary terms entered by the user; and by adapting the language model on the fly. Updates are received asynchronously by the client.},
  pages = {817--818},
  categories = {speech recognition, speech transcription, language modelling}
}
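
The highlighting step described in this abstract can be pictured directly from the confusion network: each bin holds posterior-weighted word candidates, and a bin whose best word falls below a confidence threshold is flagged together with its alternatives. A minimal sketch; the data layout and threshold are assumptions, not the system's actual format:

# A confusion network as a list of bins; each bin maps candidate words
# (or "" for a skip/epsilon arc) to posterior probabilities summing to ~1.
confusion_net = [
    {"the": 0.98, "a": 0.02},
    {"cat": 0.55, "cap": 0.30, "cut": 0.15},
    {"sat": 0.97, "": 0.03},
]

def flag_low_confidence(cn, threshold=0.7):
    """Return (best_word, alternatives, needs_review) for each bin."""
    out = []
    for bin_ in cn:
        best, conf = max(bin_.items(), key=lambda kv: kv[1])
        alternatives = sorted((w for w in bin_ if w != best),
                              key=lambda w: -bin_[w])
        out.append((best, alternatives, conf < threshold))
    return out

for word, alts, review in flag_low_confidence(confusion_net):
    marker = " <-- offer alternatives " + str(alts) if review else ""
    print(word + marker)
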