The Centre for Speech Technology Research, The University of Edinburgh

Publications by Pawel Swietojanski

s1136550.bib

@inproceedings{Swietojanski:ICASSP13,
  author = {Swietojanski, Pawel and Ghoshal, Arnab and Renals, Steve},
  doi = {10.1109/ICASSP.2013.6638967},
  title = {Revisiting Hybrid and {GMM-HMM} system combination techniques},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Swietojanski_ICASSP2013.pdf},
  abstract = {In this paper we investigate techniques to combine hybrid HMM-DNN (hidden Markov model -- deep neural network) and tandem HMM-GMM (hidden Markov model -- Gaussian mixture model) acoustic models using: (1) model averaging, and (2) lattice combination with Minimum Bayes Risk decoding. We have performed experiments on the ``TED Talks'' task following the protocol of the IWSLT-2012 evaluation. Our experimental results suggest that DNN-based and GMM-based acoustic models are complementary, with error rates being reduced by up to 8\% relative when the DNN and GMM systems are combined at model level in a multi-pass automatic speech recognition (ASR) system. Additionally, further gains were obtained by combining model-averaged lattices with those obtained from the baseline systems.},
  categories = {deep neural networks, tandem, hybrid, system combination, TED}
}
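
As a rough illustration of the model-averaging part of the combination described above: the per-frame acoustic scores of the DNN and GMM systems can be interpolated in the log domain before decoding. The numpy sketch below shows only that score-level interpolation; the 0.5 weight and the toy score matrices are illustrative assumptions, and lattice combination with Minimum Bayes Risk decoding is not shown.

import numpy as np

rng = np.random.default_rng(0)

def average_acoustic_scores(log_lik_dnn, log_lik_gmm, w=0.5):
    # Frame-level log-linear interpolation of the two systems' acoustic scores
    # (per frame, per HMM state); the result is passed to the decoder as usual.
    return w * log_lik_dnn + (1.0 - w) * log_lik_gmm

n_frames, n_states = 100, 2000
log_lik_dnn = rng.standard_normal((n_frames, n_states))   # stand-ins for real scores
log_lik_gmm = rng.standard_normal((n_frames, n_states))
combined = average_acoustic_scores(log_lik_dnn, log_lik_gmm)
print(combined.shape)                                      # (100, 2000)
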
@inproceedings{Ghoshal:ICASSP13,
  author = {Ghoshal, Arnab and Swietojanski, Pawel and Renals, Steve},
  doi = {10.1109/ICASSP.2013.6639084},
  title = {Multilingual training of deep neural networks},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Ghoshal_ICASSP2013.pdf},
  abstract = {We investigate multilingual modeling in the context of a deep neural network (DNN) -- hidden Markov model (HMM) hybrid, where the DNN outputs are used as the HMM state likelihoods. By viewing neural networks as a cascade of feature extractors followed by a logistic regression classifier, we hypothesise that the hidden layers, which act as feature extractors, will be transferable between languages. As a corollary, we propose that training the hidden layers on multiple languages makes them more suitable for such cross-lingual transfer. We experimentally confirm these hypotheses on the GlobalPhone corpus using seven languages from three different language families: Germanic, Romance, and Slavic. The experiments demonstrate substantial improvements over a monolingual DNN-HMM hybrid baseline, and hint at avenues of further exploration.},
  categories = {Speech recognition, deep learning, neural networks, multilingual modeling}
}
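
The central claim of the abstract above, that hidden layers act as language-independent feature extractors while only the softmax layer is language-specific, can be summarised in a few lines of code. The numpy sketch below is illustrative only: the layer sizes, the two-language setup ("DE", "PT") and the helper names are assumptions, not the configuration used in the paper.

import numpy as np

rng = np.random.default_rng(0)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Hidden layers shared across languages: trained on pooled multilingual data.
feat_dim, hid_dim = 40, 256
shared = [(rng.standard_normal((feat_dim, hid_dim)) * 0.01, np.zeros(hid_dim)),
          (rng.standard_normal((hid_dim, hid_dim)) * 0.01, np.zeros(hid_dim))]

# Language-specific softmax layers over each language's tied HMM states.
heads = {"DE": (rng.standard_normal((hid_dim, 2000)) * 0.01, np.zeros(2000)),
         "PT": (rng.standard_normal((hid_dim, 1500)) * 0.01, np.zeros(1500))}

def forward(x, lang):
    # Shared hidden layers act as the feature extractor; only the output
    # layer depends on the language.
    h = x
    for W, b in shared:
        h = sigmoid(h.dot(W) + b)
    W, b = heads[lang]
    z = h.dot(W) + b
    z -= z.max()                        # numerical stability
    p = np.exp(z)
    return p / p.sum()                  # softmax over that language's states

x = rng.standard_normal(feat_dim)
print(forward(x, "DE").shape, forward(x, "PT").shape)   # (2000,) (1500,)
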
@inproceedings{hasler2012,
  author = {Hasler, Eva and Bell, Peter and Ghoshal, Arnab and Haddow, Barry and Koehn, Philipp and McInnes, Fergus and Renals, Steve and Swietojanski, Pawel},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/paper_50.pdf},
  abstract = {This paper describes the University of Edinburgh (UEDIN) systems for the IWSLT 2012 Evaluation. We participated in the ASR (English), MT (English-French, German-English) and SLT (English-French) tracks.},
  year = {2012},
  booktitle = {Proc. International Workshop on Spoken Language Translation},
  title = {The {UEDIN} system for the {IWSLT} 2012 evaluation}
}
@inproceedings{bell13_mlan,
  author = {Bell, Peter and Swietojanski, Pawel and Renals, Steve},
  doi = {10.1109/ICASSP.2013.6639014},
  title = {Multi-level adaptive networks in tandem and hybrid {ASR} systems},
  abstract = {In this paper we investigate the use of Multi-level adaptive networks (MLAN) to incorporate out-of-domain data when training large vocabulary speech recognition systems. In a set of experiments on multi-genre broadcast data and on TED lecture recordings we present results using out-of-domain features in a hybrid DNN system and explore tandem systems using a variety of input acoustic features. Our experiments indicate that using the MLAN approach in both hybrid and tandem systems results in consistent reductions in word error rate of 5--10\% relative.},
  address = {Vancouver, Canada},
  month = {May},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/mlan_icassp2013.pdf},
  booktitle = {Proc. ICASSP}
}
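
Under the description above, MLAN can be read as a two-stage scheme: a network trained on out-of-domain data produces posterior features for the in-domain audio, these are concatenated with the standard acoustic features, and the in-domain system is then trained on the combined input. The numpy sketch below shows only that feature-stacking step; the feature dimensions, the sigmoid/softmax choices and the single hidden layer are illustrative assumptions rather than the systems used in the paper.

import numpy as np

rng = np.random.default_rng(0)

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Stage 1: a network trained on out-of-domain data (weights frozen here).
feat_dim, hid_dim, n_ood_targets = 40, 128, 100
W1_ood = rng.standard_normal((feat_dim, hid_dim)) * 0.05
W2_ood = rng.standard_normal((hid_dim, n_ood_targets)) * 0.05

def ood_posterior_features(x):
    return softmax(sigmoid(x.dot(W1_ood)).dot(W2_ood))

# Stage 2: stack the out-of-domain posteriors with the original acoustic
# features; the combined vector is the input to the in-domain model.
x = rng.standard_normal((16, feat_dim))          # in-domain acoustic features
mlan_input = np.concatenate([x, ood_posterior_features(x)], axis=-1)
print(mlan_input.shape)                          # (16, 140)
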
@inproceedings{swi2012_dnn,
  author = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
  doi = {10.1109/SLT.2012.6424230},
  title = {Unsupervised Cross-lingual knowledge transfer in {DNN-based LVCSR}},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  year = {2012},
  abstract = {We investigate the use of cross-lingual acoustic data to initialise deep neural network (DNN) acoustic models by means of unsupervised restricted Boltzmann machine (RBM) pretraining. DNNs for German are pretrained using one or all of German, Portuguese, Spanish and Swedish. The DNNs are used in a tandem configuration, where the network outputs are used as features for a hidden Markov model (HMM) whose emission densities are modeled by Gaussian mixture models (GMMs), as well as in a hybrid configuration, where the network outputs are used as the HMM state likelihoods. The experiments show that unsupervised pretraining is more crucial for the hybrid setups, particularly with limited amounts of transcribed training data. More importantly, unsupervised pretraining is shown to be language-independent.},
  month = {December},
  address = {Miami, Florida, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/ps_slt2012.pdf},
  pages = {246--251}
}
@inproceedings{bell12_mlan,
  author = {Bell, P. and Gales, M. and Lanchantin, P. and Liu, X. and Long, Y. and Renals, S. and Swietojanski, P. and Woodland, P.},
  doi = {10.1109/SLT.2012.6424244},
  title = {Transcription of multi-genre media archives using out-of-domain data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  year = {2012},
  abstract = {We describe our work on developing a speech recognition system for multi-genre media archives. The high diversity of the data makes this a challenging recognition task, which may benefit from systems trained on a combination of in-domain and out-of-domain data. Working with tandem HMMs, we present Multi-level Adaptive Networks (MLAN), a novel technique for incorporating information from out-of-domain posterior features using deep neural networks. We show that it provides a substantial reduction in WER over other systems, with relative WER reductions of 15\% over a PLP baseline, 9\% over in-domain tandem features and 8\% over the best out-of-domain tandem features.},
  month = {December},
  address = {Miami, Florida, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/mlan_slt2012.pdf},
  pages = {324--329}
}
@inproceedings{Swietojanski:ASRU13,
  author = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
  doi = {10.1109/ASRU.2013.6707744},
  title = {Hybrid acoustic models for distant and multichannel large vocabulary speech recognition},
  abstract = {We investigate the application of deep neural network (DNN)-hidden Markov model (HMM) hybrid acoustic models for far-field speech recognition of meetings recorded using microphone arrays. We show that the hybrid models achieve significantly better accuracy than conventional systems based on Gaussian mixture models (GMMs). We observe up to 8\% absolute word error rate (WER) reduction from a discriminatively trained GMM baseline when using a single distant microphone, and between 4--6\% absolute WER reduction when using beamforming on various combinations of array channels. By training the networks on audio from multiple channels, we find that the networks can recover a significant part of the accuracy difference between the single distant microphone and beamformed configurations. Finally, we show that the accuracy of a network recognising speech from a single distant microphone can approach that of a multi-microphone setup by training with data from other microphones.},
  month = {December},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Swietojanski_ASRU2013.pdf},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
  categories = {Distant Speech Recognition, Deep Neural Networks, Microphone Arrays, Beamforming, Meeting recognition}
}
@inproceedings{bell13_lecture_transcription,
  author = {Bell, Peter and Yamamoto, Hitoshi and Swietojanski, Pawel and Wu, Youzheng and McInnes, Fergus and Hori, Chiori and Renals, Steve},
  title = {A lecture transcription system combining neural network acoustic and language models},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lecture_transcription_is2013.pdf},
  abstract = {This paper presents a new system for automatic transcription of lectures. The system combines a number of novel features, including deep neural network acoustic models using multi-level adaptive networks to incorporate out-of-domain information, and factored recurrent neural network language models. We demonstrate that the system achieves large improvements on the TED lecture transcription task from the 2012 IWSLT evaluation -- our results are currently the best reported on this task, showing a relative WER reduction of more than 16\% compared to the closest competing system from the evaluation.}
}
@inproceedings{christensen13_disordered,
  author = {Christensen, H. and Aniol, M. and Bell, P. and Green, P. and Hain, T. and King, S. and Swietojanski, P.},
  title = {Combining in-domain and out-of-domain speech data for automatic recognition of disordered speech},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/christensen_is13_2_final.pdf},
  abstract = {Recently there has been increasing interest in ways of using out-of-domain (OOD) data to improve automatic speech recognition performance in domains where only limited data is available. This paper focuses on one such domain, namely that of disordered speech for which only very small databases exist, but where normal speech can be considered OOD. Standard approaches for handling small data domains use adaptation from OOD models into the target domain, but here we investigate an alternative approach with its focus on the feature extraction stage: OOD data is used to train feature-generating deep belief neural networks. Using AMI meeting and TED talk datasets, we investigate various tandem-based speaker independent systems as well as maximum a posteriori adapted speaker dependent systems. Results on the UAspeech isolated word task of disordered speech are very promising, with our overall best system (using a combination of AMI and TED data) giving a correctness of 62.5\%; an increase of 15\% over the previous best published results based on conventional model adaptation. We show that the relative benefit of using OOD data varies considerably from speaker to speaker and is only loosely correlated with the severity of a speaker's impairments.}
}
@inproceedings{lanchantin13_multigenre_transcription,
  author = {Lanchantin, P. and Bell, P. and Gales, M. and Hain, T. and Liu, X. and Long, Y. and Quinnell, J. and Renals, S. and Saz, O. and Seigel, M. and Swietojanski, P. and Woodland, P.},
  title = {Automatic Transcription of Multi-genre Media Archives},
  booktitle = {Proc. Workshop on Speech, Language and Audio in Multimedia},
  year = {2013},
  address = {Marseille, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lanchantin13_multigenre_transcription.pdf},
  abstract = {This paper describes some recent results of our collaborative work on developing a speech recognition system for the automatic transcription of media archives from the British Broadcasting Corporation (BBC). The material includes a high diversity of shows with their associated transcriptions, which are themselves highly diverse in terms of completeness, reliability and accuracy. First, we investigate how to improve lightly supervised acoustic training when time-stamp information is inaccurate or when speech deviates significantly from the transcription. To address the latter issue, word and segment level combination approaches are used between the lightly supervised transcripts and the original programme scripts, which yield improved transcriptions. Experimental results show that systems trained using these improved transcriptions consistently outperform those trained using only the original lightly supervised decoding hypotheses. Secondly, we show that the recognition task may benefit from systems trained on a combination of in-domain and out-of-domain data. Working with tandem HMMs, we present Multi-level Adaptive Networks, a novel technique for incorporating information from out-of-domain posterior features using deep neural networks. We show that it provides a substantial reduction in WER over other systems, including a PLP baseline, in-domain tandem features and the best out-of-domain tandem features.}
}
@inproceedings{Swietojanski:ICASSP14,
  author = {Swietojanski, P. and Li, J. and Huang, J.-T.},
  title = {Investigation of Maxout Networks for Speech Recognition},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Swietojanski_ICASSP14.pdf},
  abstract = {We explore the use of maxout neurons in various aspects of acoustic modelling for large vocabulary speech recognition systems, including low-resource scenarios and multilingual knowledge transfer. Through experiments on voice search and short message dictation datasets, we found that maxout networks are around three times faster to train and offer lower or comparable word error rates on several tasks, when compared to networks with logistic nonlinearities. We also present a detailed study of the internal behaviour of maxout units, suggesting the use of different nonlinearities in different layers.},
  categories = {deep neural networks, maxout networks, multi-task learning, low-resource speech recognition}
}
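
For readers unfamiliar with the unit studied in this paper: a maxout unit takes the maximum over a small group of linear projections, so the layer learns a piecewise-linear activation rather than using a fixed nonlinearity. The following numpy sketch of a single maxout layer is illustrative; the pool size and dimensions are arbitrary choices, not those of the paper.

import numpy as np

rng = np.random.default_rng(0)

def maxout_layer(x, W, b, pool_size):
    # W has shape (in_dim, out_dim * pool_size); each output unit takes the max
    # over `pool_size` linear pieces, i.e. a learned piecewise-linear activation.
    z = x.dot(W) + b                              # (batch, out_dim * pool_size)
    z = z.reshape(x.shape[0], -1, pool_size)
    return z.max(axis=-1)                         # (batch, out_dim)

in_dim, out_dim, k = 40, 128, 3
W = rng.standard_normal((in_dim, out_dim * k)) * 0.01
b = np.zeros(out_dim * k)
x = rng.standard_normal((8, in_dim))
print(maxout_layer(x, W, b, k).shape)             # (8, 128)
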
@article{Swietojanski:SPL14,
  author = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
  doi = {10.1109/LSP.2014.2325781},
  title = {Convolutional Neural Networks for Distant Speech Recognition},
  journal = {IEEE Signal Processing Letters},
  issn = {1070-9908},
  number = {9},
  month = {September},
  volume = {21},
  pages = {1120-1124},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Swietojanski_SPL14.pdf},
  abstract = {We investigate convolutional neural networks (CNNs) for large vocabulary distant speech recognition, trained using speech recorded from a single distant microphone (SDM) and multiple distant microphones (MDM). In the MDM case we explore a beamformed signal input representation compared with the direct use of multiple acoustic channels as a parallel input to the CNN. We have explored different weight sharing approaches, and propose a channel-wise convolution with two-way pooling. Our experiments, using the AMI meeting corpus, found that CNNs improve the word error rate (WER) by 6.5\% relative compared to conventional deep neural network (DNN) models and 15.7\% over a discriminatively trained Gaussian mixture model (GMM) baseline. For cross-channel CNN training, the WER improves by 3.5\% relative over the comparable DNN structure. Compared with the best beamformed GMM system, cross-channel convolution reduces the WER by 9.7\% relative, and matches the accuracy of a beamformed DNN.},
  categories = {distant speech recognition, deep neural networks, convolutional neural networks, meetings, AMI corpus}
}
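
One way to read the channel-wise convolution with two-way pooling described above: a shared filter bank is convolved with each microphone channel separately, and the resulting feature maps are max-pooled both across channels and along frequency, so the network attends to whichever channel responds most strongly. The 1-D numpy sketch below follows that reading; the filter sizes, pooling widths and exact pooling order are assumptions rather than the paper's precise architecture.

import numpy as np

rng = np.random.default_rng(0)

def conv1d_valid(x, w):
    # Valid 1-D correlation of a single channel with a single filter.
    n, k = len(x), len(w)
    return np.array([x[i:i + k].dot(w) for i in range(n - k + 1)])

def channelwise_conv_pool(channels, filters, freq_pool=3):
    # channels: (n_channels, n_bands); filters: (n_filters, filter_width).
    # The same filters are applied to every channel, then we max-pool
    # across channels and along the frequency axis ("two-way" pooling).
    maps = np.stack([[conv1d_valid(ch, f) for f in filters] for ch in channels])
    maps = maps.max(axis=0)                       # cross-channel max pooling
    n_f, n_pos = maps.shape
    n_pos -= n_pos % freq_pool                    # trim so positions divide evenly
    return maps[:, :n_pos].reshape(n_f, -1, freq_pool).max(axis=-1)

channels = rng.standard_normal((8, 40))           # e.g. 8 microphones, 40 filterbank bands
filters = rng.standard_normal((16, 5)) * 0.1
print(channelwise_conv_pool(channels, filters).shape)   # (16, 12)
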
@inproceedings{Renals:HSCMA14,
  author = {Renals, S. and Swietojanski, P.},
  title = {Neural Networks for Distant Speech Recognition},
  booktitle = {The 4th Joint Workshop on Hands-free Speech Communication and Microphone Arrays (HSCMA)},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/srenals-hscma2014.pdf},
  abstract = {Distant conversational speech recognition is challenging owing to the presence of multiple, overlapping talkers, additional non-speech acoustic sources, and the effects of reverberation. In this paper we review work on distant speech recognition, with an emphasis on approaches which combine multichannel signal processing with acoustic modelling, and investigate the use of hybrid neural network / hidden Markov model acoustic models for distant speech recognition of meetings recorded using microphone arrays. In particular we investigate the use of convolutional and fully-connected neural networks with different activation functions (sigmoid, rectified linear, and maxout). We performed experiments on the AMI and ICSI meeting corpora, with results indicating that neural network models are capable of significant improvements in accuracy compared with discriminatively trained Gaussian mixture models.},
  categories = {convolutional neural networks, distant speech recognition, rectifier unit, maxout networks, beamforming, meetings, AMI corpus, ICSI corpus}
}
@incollection{CPSwT2014,
  editor = {Zielinski, T. and Korohoda, P. and Rumian, R.},
  author = {Makowski, R. and Swietojanski, P. and Wielgat, R.},
  publisher = {Wydawnictwo Naukowe PWN - Polish Scientific Publishers PWN},
  isbn = {978-83-01-17445-3},
  title = {Automatyczne Rozpoznawanie Mowy},
  url = {http://teledsp.kt.agh.edu.pl},
  booktitle = {Cyfrowe Przetwarzanie Sygnalow w Telekomunikacji. Podstawy, multimedia, transmisja.},
  address = {Warszawa},
  year = {2014},
  abstract = {The book discusses methods for the analysis and processing of digital signals, progressing from the fundamentals of digital signal processing to the latest fourth-generation LTE technology.},
  categories = {telecommunications, signal processing, data compression, information transmission, teleinformatics, signal analysis}
}
@inproceedings{Swietojanski2014_lhuc,
  author = {Swietojanski, P. and Renals, S.},
  title = {Learning Hidden Unit Contributions for Unsupervised Speaker Adaptation of Neural Network Acoustic Models},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  year = {2014},
  month = {December},
  address = {Lake Tahoe, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/ps-slt14.pdf},
  abstract = {This paper proposes a simple yet effective model-based neural network speaker adaptation technique that learns speaker-specific hidden unit contributions given adaptation data, without requiring any form of speaker-adaptive training, or labelled adaptation data. An additional amplitude parameter is defined for each hidden unit; the amplitude parameters are tied for each speaker, and are learned using unsupervised adaptation. We conducted experiments on the TED talks data, as used in the International Workshop on Spoken Language Translation (IWSLT) evaluations. Our results indicate that the approach can reduce word error rates on standard IWSLT test sets by about 8--15\% relative compared to unadapted systems, with a further reduction of 4--6\% relative when combined with feature-space maximum likelihood linear regression (fMLLR). The approach can be employed in most existing feed-forward neural network architectures, and we report results using various hidden unit activation functions: sigmoid, maxout, and rectifying linear units (ReLU).},
  categories = {Speaker Adaptation, Deep Neural Networks, TED, IWSLT, LHUC}
}
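
The adaptation scheme in this paper amounts to giving every hidden unit a speaker-dependent amplitude and updating only those amplitudes on adaptation data, with the rest of the network frozen. The numpy sketch below shows the re-parametrisation and one gradient step on the amplitudes; the 2*sigmoid(r) constraint follows the usual LHUC formulation, while the toy dimensions, the learning rate and the squared-error surrogate (standing in for the actual cross-entropy objective) are illustrative assumptions.

import numpy as np

rng = np.random.default_rng(0)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Frozen speaker-independent layer.
W = rng.standard_normal((40, 64)) * 0.1
b = np.zeros(64)

# Speaker-dependent LHUC parameters r, one per hidden unit (amplitude 1 at r = 0).
r = np.zeros(64)

def hidden(x, r):
    # LHUC rescales each hidden unit by a(r) = 2*sigmoid(r), constrained to (0, 2).
    return (2.0 * sigmoid(r)) * sigmoid(x.dot(W) + b)

# One unsupervised adaptation step: only r is updated. A squared-error surrogate
# toward first-pass targets t stands in for the real objective.
x = rng.standard_normal((16, 40))
t = rng.standard_normal((16, 64))
s = sigmoid(x.dot(W) + b)
a = 2.0 * sigmoid(r)
grad_r = ((a * s - t) * s * a * (1.0 - sigmoid(r))).sum(axis=0)
r -= 0.1 * grad_r
print(hidden(x, r).shape)                         # (16, 64)
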
@inproceedings{SwietojanskiICASSP15,
  author = {Swietojanski, P. and Renals, S.},
  title = {Differentiable Pooling for Unsupervised Speaker Adaptation},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/Swietojanski_ICASSP2015.pdf},
  abstract = {This paper proposes a differentiable pooling mechanism to perform model-based neural network speaker adaptation. The proposed technique learns a speaker-dependent combination of activations within pools of hidden units, works well unsupervised, and does not require speaker-adaptive training. We have conducted a set of experiments on the TED talks data, as used in the IWSLT evaluations. Our results indicate that the approach can reduce word error rates (WERs) on standard IWSLT test sets by about 5--11\% relative compared to speaker-independent systems and was found complementary to the recently proposed learning hidden units contribution (LHUC) approach, reducing WER by 6--13\% relative. Both methods were also found to work well when adapting with small amounts of unsupervised data -- 10 seconds is able to decrease the WER by 5\% relative compared to the baseline speaker-independent system.},
  categories = {Differentiable pooling, Speaker Adaptation, Deep Neural Networks, TED, LHUC}
}
@inproceedings{Swietojanski2015,
  author = {Swietojanski, P. and Bell, P. and Renals, S.},
  title = {Structured Output Layer with Auxiliary Targets for Context-Dependent Acoustic Modelling},
  booktitle = {Proc. Interspeech},
  year = {2015},
  abstract = {In previous work we have introduced a multi-task training technique for neural network acoustic modelling, in which context-dependent and context-independent targets are jointly learned. In this paper, we extend the approach by structuring the output layer such that the context-dependent outputs are dependent on the context-independent outputs, thus using the context-independent predictions at run-time. We have also investigated the applicability of this idea to unsupervised speaker adaptation as an approach to overcome the data sparsity issues that come to the fore when estimating systems with a large number of context-dependent states from limited data. We have experimented with various amounts of training material (from 10 to 300 hours) and find that the proposed techniques are particularly well suited to data-constrained conditions, allowing better use of large context-dependent state-clustered trees. Experimental results are reported for large vocabulary speech recognition using the Switchboard and TED corpora.},
  month = {September},
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/Swietojanski_Interspeech2015.pdf},
  categories = {multitask learning, structured output layer, adaptation, deep neural networks}
}
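
As described above, the structured output layer makes the context-dependent (CD) predictions a function of both the last hidden layer and the context-independent (CI) predictions. A minimal numpy forward pass under that description is sketched below; feeding the CI posteriors back by simple concatenation, and all of the sizes, are assumptions made for illustration.

import numpy as np

rng = np.random.default_rng(0)

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

hid_dim, n_ci, n_cd = 256, 40, 2000               # e.g. monophones vs. tied triphone states

W_ci = rng.standard_normal((hid_dim, n_ci)) * 0.01
W_cd = rng.standard_normal((hid_dim + n_ci, n_cd)) * 0.01   # CD layer also sees the CI output

def structured_output(h):
    # CI posteriors are predicted first, then fed back as extra input to the CD
    # output layer; both outputs are trained, the CD one is used for decoding.
    p_ci = softmax(h.dot(W_ci))
    p_cd = softmax(np.concatenate([h, p_ci], axis=-1).dot(W_cd))
    return p_ci, p_cd

h = rng.standard_normal((4, hid_dim))
p_ci, p_cd = structured_output(h)
print(p_ci.shape, p_cd.shape)                     # (4, 40) (4, 2000)
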
@inproceedings{bell14_iwslt,
  author = {Bell, Peter and Swietojanski, Pawel and Driesen, Joris and Sinclair, Mark and McInnes, Fergus and Renals, Steve},
  title = {The {UEDIN} {ASR} Systems for the {IWSLT} 2014 Evaluation},
  booktitle = {Proc. IWSLT},
  address = {South Lake Tahoe, USA},
  month = {December},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell14_iwslt.pdf},
  abstract = {This paper describes the University of Edinburgh (UEDIN) ASR systems for the 2014 IWSLT Evaluation. Notable features of the English system include deep neural network acoustic models in both tandem and hybrid configuration with the use of multi-level adaptive networks, LHUC adaptation and Maxout units. The German system includes lightly supervised training and a new method for dictionary generation. Our voice activity detection system now uses a semi-Markov model to incorporate a prior on utterance lengths. There are improvements of up to 30\% relative WER on the tst2013 English test set.}
}
@inproceedings{wu2015adaptation,
  author = {Wu, Zhizheng and Swietojanski, Pawel and Veaux, Christophe and Renals, Steve and King, Simon},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/is2015_dnn_adaptation.pdf},
  booktitle = {Interspeech},
  year = {2015},
  title = {A study of speaker adaptation for {DNN}-based speech synthesis}
}
@inproceedings{Swietojanski_ICASSP2016,
  author = {Swietojanski, P. and Renals, S.},
  title = {SAT-LHUC: Speaker Adaptive Training for Learning Hidden Unit Contributions},
  booktitle = {Proc. IEEE ICASSP},
  address = {Shanghai, China},
  month = {March},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Swietojanski_ICASSP2016.pdf},
  abstract = {This paper extends learning hidden unit contributions (LHUC) unsupervised speaker adaptation with speaker adaptive training (SAT). Contrary to other SAT approaches, the proposed technique does not require speaker-dependent features, the generation of auxiliary generative models to estimate or extract speaker-dependent information, or any changes to the speaker-independent model structure. SAT-LHUC is directly integrated into the objective and jointly learns speaker-independent and speaker-dependent representations. We demonstrate that the SAT-LHUC technique can match feature-space regression transforms for matched narrow-band data and outperform them on wide-band data when the runtime distribution differs significantly from the training one. We have obtained 6.5\%, 10\% and 18.5\% relative word error rate reductions compared to speaker-independent models on Switchboard, AMI meetings and TED lectures, respectively. This corresponds to relative gains of 2\%, 4\% and 6\% compared with non-SAT LHUC adaptation. SAT-LHUC was also found to be complementary to SAT with feature-space maximum likelihood linear regression transforms.},
  pages = {5010--5014},
  categories = {SAT, Deep Neural Networks, LHUC}
}
@article{swietojanski2016lhuc,
  author = {Swietojanski, P. and Li, J. and Renals, S.},
  doi = {10.1109/TASLP.2016.2560534},
  title = {Learning Hidden Unit Contributions for Unsupervised Acoustic Model Adaptation},
  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  issn = {2329-9290},
  number = {8},
  month = {August},
  volume = {24},
  pages = {1450-1463},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/swietojanski2016lhuc.pdf},
  abstract = {This work presents a broad study on the adaptation of neural network acoustic models by means of learning hidden unit contributions (LHUC) -- a method that linearly re-combines hidden units in a speaker- or environment-dependent manner using small amounts of unsupervised adaptation data. We also extend LHUC to a speaker adaptive training (SAT) framework that leads to a more adaptable DNN acoustic model, working both in a speaker-dependent and a speaker-independent manner, without the requirements to maintain auxiliary speaker-dependent feature extractors or to introduce significant speaker-dependent changes to the DNN structure. Through a series of experiments on four different speech recognition benchmarks (TED talks, Switchboard, AMI meetings, and Aurora4) comprising 270 test speakers, we show that LHUC in both its test-only and SAT variants results in consistent word error rate reductions ranging from 5\% to 23\% relative depending on the task and the degree of mismatch between training and test data. In addition, we have investigated the effect of the amount of adaptation data per speaker, the quality of unsupervised adaptation targets, the complementarity to other adaptation techniques, one-shot adaptation, and an extension to adapting DNNs trained in a sequence discriminative manner.}
}
@article{swietojanski2016diffp,
  author = {Swietojanski, P. and Renals, S.},
  doi = {10.1109/TASLP.2016.2584700},
  title = {{Differentiable Pooling for Unsupervised Acoustic Model Adaptation}},
  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  issn = {2329-9290},
  number = {10},
  month = {October},
  volume = {24},
  pages = {1773-1784},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/swietojanski2016diffp.pdf},
  abstract = {We present a deep neural network (DNN) acoustic model that includes parametrised and differentiable pooling operators. Unsupervised acoustic model adaptation is cast as the problem of updating the decision boundaries implemented by each pooling operator. In particular, we experiment with two types of pooling parametrisations: learned $L_p$-norm pooling and weighted Gaussian pooling, in which the weights of both operators are treated as speaker-dependent. We perform investigations using three different large vocabulary speech recognition corpora: AMI meetings, TED talks and Switchboard conversational telephone speech. We demonstrate that differentiable pooling operators provide a robust and relatively low-dimensional way to adapt acoustic models, with relative word error rate reductions ranging from 5--20\% with respect to unadapted systems, which themselves are better than the baseline fully-connected DNN-based acoustic models. We also investigate how the proposed techniques work under various adaptation conditions including the quality of adaptation data and complementarity to other feature- and model-space adaptation methods, as well as providing an analysis of the characteristics of each of the proposed approaches.}
}
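
To make the pooling operator concrete: with learned Lp-norm pooling, each output unit is the p-norm over a small pool of hidden activations, and the per-pool exponents are the speaker-dependent parameters updated during adaptation. The toy numpy sketch below shows only the forward computation; treating just the exponents as speaker-dependent, the pool size, and the 1 + exp(.) parametrisation of p are simplifications, not the paper's exact setup.

import numpy as np

rng = np.random.default_rng(0)

def lp_pool(h, log_p, pool_size):
    # h: (batch, n_units) non-negative activations; p = 1 + exp(log_p) keeps each
    # pool's exponent above 1. Each output is the Lp-norm over its pool of units.
    p = 1.0 + np.exp(log_p)                       # speaker-dependent exponents
    h = h.reshape(h.shape[0], -1, pool_size)      # (batch, n_pools, pool_size)
    return np.power(h, p[None, :, None]).sum(axis=-1) ** (1.0 / p)

n_units, pool_size = 64, 2
W = rng.standard_normal((40, n_units)) * 0.1
x = rng.standard_normal((8, 40))
h = np.maximum(0.0, x.dot(W))                     # ReLU keeps activations non-negative
log_p = np.zeros(n_units // pool_size)            # p = 2 initially, adapted per speaker
print(lp_pool(h, log_p, pool_size).shape)         # (8, 32)
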
@phdthesis{swietojanski2016phdthesis,
  author = {Swietojanski, P.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/swietojanski_phdthesis.pdf},
  school = {University of Edinburgh},
  title = {Learning Representations for Speech Recognition using Artificial Neural Networks},
  abstract = {Learning representations is a central challenge in machine learning. For speech recognition, we are interested in learning robust representations that are stable across different acoustic environments, recording equipment and irrelevant inter- and intra-speaker variabilities. This thesis is concerned with representation learning for acoustic model adaptation to speakers and environments, construction of acoustic models in low-resource settings, and learning representations from multiple acoustic channels. The investigations are primarily focused on the hybrid approach to acoustic modelling based on hidden Markov models and artificial neural networks (ANN). The first contribution concerns acoustic model adaptation. This comprises two new adaptation transforms operating in the ANN parameter space. Both operate at the level of activation functions and treat a trained ANN acoustic model as a canonical set of fixed-basis functions, from which one can later derive variants tailored to the specific distribution present in adaptation data. The first technique, termed Learning Hidden Unit Contributions (LHUC), depends on learning distribution-dependent linear combination coefficients for hidden units. This technique is then extended to altering groups of hidden units with parametric and differentiable pooling operators. We found that the proposed adaptation techniques possess many desirable properties: they are relatively low-dimensional, do not over-fit and can work in both a supervised and an unsupervised manner. For LHUC we also present extensions to speaker adaptive training and environment factorisation. On average, depending on the characteristics of the test set, 5--25\% relative word error rate (WERR) reductions are obtained in an unsupervised two-pass adaptation setting. The second contribution concerns building acoustic models in low-resource data scenarios. In particular, we are concerned with insufficient amounts of transcribed acoustic material for estimating acoustic models in the target language -- thus assuming resources like lexicons or texts to estimate language models are available. First we propose an ANN with a structured output layer which models both context-dependent and context-independent speech units, with the context-independent predictions used at runtime to aid the prediction of context-dependent states. We also propose to perform multi-task adaptation with a structured output layer. We obtain consistent WERR reductions of up to 6.4\% in low-resource speaker-independent acoustic modelling. Adapting those models in a multi-task manner with LHUC decreases WERRs by an additional 13.6\%, compared to 12.7\% for non-multi-task LHUC. We then demonstrate that one can build better acoustic models with unsupervised multi- and cross-lingual initialisation and find that pre-training is largely language-independent. Up to 14.4\% WERR reductions are observed, depending on the amount of available transcribed acoustic data in the target language. The third contribution concerns building acoustic models from multi-channel acoustic data. For this purpose we investigate various ways of integrating and learning multi-channel representations. In particular, we investigate channel concatenation and the applicability of convolutional layers for this purpose. We propose a multi-channel convolutional layer with cross-channel pooling, which can be seen as a data-driven non-parametric auditory attention mechanism. We find that for unconstrained microphone arrays, our approach is able to match the performance of comparable models trained on beamform-enhanced signals.},
  year = {2016}
}
@inproceedings{sgangireddy_interspeech16,
  author = {Gangireddy, Siva Reddy and Swietojanski, Pawel and Bell, Peter and Renals, Steve},
  title = {{Unsupervised adaptation of Recurrent Neural Network Language Models}},
  booktitle = {Proc. Interspeech},
  year = {2016},
  month = {September},
  address = {San Francisco, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ispeech16.pdf},
  abstract = {Recurrent neural network language models (RNNLMs) have been shown to consistently improve Word Error Rates (WERs) of large vocabulary speech recognition systems employing n-gram LMs. In this paper we investigate supervised and unsupervised discriminative adaptation of RNNLMs in a broadcast transcription task to target domains defined by either genre or show. We have explored two approaches based on (1) scaling forward-propagated hidden activations (Learning Hidden Unit Contributions (LHUC) technique) and (2) direct fine-tuning of the parameters of the whole RNNLM. To investigate the effectiveness of the proposed methods we carry out experiments on multi-genre broadcast (MGB) data following the MGB-2015 challenge protocol. We observe small but significant improvements in WER compared to a strong unadapted RNNLM model.},
  categories = {RNNLM, LHUC, unsupervised adaptation, fine-tuning, MGB-Challenge}
}
@article{bell2017multitask,
  author = {Bell, Peter and Swietojanski, Pawel and Renals, Steve},
  publisher = {IEEE},
  title = {Multitask Learning of Context-Dependent Targets in Deep Neural Network Acoustic Models},
  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  number = {2},
  abstract = {This paper investigates the use of multitask learning to improve context-dependent deep neural network (DNN) acoustic models. The use of hybrid DNN systems with clustered triphone targets is now standard in automatic speech recognition. However, we suggest that using a single set of DNN targets in this manner may not be the most effective choice, since the targets are the result of a somewhat arbitrary clustering process that may not be optimal for discrimination. We propose to remedy this problem through the addition of secondary tasks predicting alternative context-dependent or context-independent targets. We present a comprehensive set of experiments on a lecture recognition task showing that DNNs trained through multitask learning in this manner give consistently improved performance compared to standard hybrid DNNs. The technique is evaluated across a range of data and output sizes. Improvements are seen when training uses the cross entropy criterion and also when sequence training is applied.},
  volume = {25},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/master_final_1.pdf},
  pages = {238--247}
}