The Centre for Speech Technology Research, The University of Edinburgh

Publications by Arnab Ghoshal

aghoshal.bib

@inproceedings{swi2012_dnn,
  author = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
  title = {Unsupervised Cross-Lingual Knowledge Transfer in {DNN}-Based {LVCSR}},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address = {Miami, Florida, USA},
  abstract = {We investigate the use of cross-lingual acoustic data to initialise deep neural network (DNN) acoustic models by means
of unsupervised restricted Boltzmann machine (RBM) pretraining.
DNNs for German are pretrained using one or all of German, Portuguese, Spanish and Swedish. The DNNs are used in a tandem configuration, where the network outputs are used as features for a hidden Markov model (HMM) whose
emission densities are modeled by Gaussian mixture models (GMMs), as well as in a hybrid configuration, where the network outputs are used as the HMM state likelihoods. The experiments show that unsupervised pretraining is more crucial
for the hybrid setups, particularly with limited amounts of transcribed training data. More importantly, unsupervised pretraining is shown to be language-independent.},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/ps_slt2012.pdf},
  year = 2012
}
@inproceedings{llu2012map,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Maximum a Posteriori Adaptation of Subspace {Gaussian}
                   Mixture Models for Cross-Lingual Speech Recognition},
  booktitle = {Proc. ICASSP},
  abstract = {This paper concerns cross-lingual acoustic modeling in
                   the case when there are limited target language
                   resources. We build on an approach in which a subspace
                   Gaussian mixture model (SGMM) is adapted to the target
                   language by reusing the globally shared parameters
                   estimated from out-of-language training data. In
                   current cross-lingual systems, these parameters are
                   fixed when training the target system, which can give
                   rise to a mismatch between the source and target
                   systems. We investigate a maximum a posteriori (MAP)
                   adaptation approach to alleviate the potential
                   mismatch. In particular, we focus on the adaptation of
                   phonetic subspace parameters using a matrix variate
                   Gaussian prior distribution. Experiments on the
                   GlobalPhone corpus using the MAP adaptation approach
                   results in word error rate reductions, compared with
                   the cross-lingual baseline systems and systems updated
                   using maximum likelihood, for training conditions with
                   1 hour and 5 hours of target language data.},
  keywords = {Subspace Gaussian Mixture Model, Maximum a Posteriori
                   Adaptation, Cross-lingual Speech Recognition},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-icassp-2012.pdf},
  year = 2012
}
@article{lu_spl_2011,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Regularized Subspace {Gaussian} Mixture Models for Speech
                   Recognition},
  journal = {IEEE Signal Processing Letters},
  volume = {18},
  number = {7},
  pages = {419--422},
  abstract = {Subspace Gaussian mixture models (SGMMs) provide a
                   compact representation of the Gaussian parameters in an
                   acoustic model, but may still suffer from over-fitting
                   with insufficient training data. In this letter, the
                   SGMM state parameters are estimated using a penalized
                   maximum-likelihood objective, based on $\ell_1$ and
                   $\ell_2$ regularization, as well as their combination,
                   referred to as the elastic net, for robust model
                   estimation. Experiments on the 5000-word Wall Street
                   Journal transcription task show word error rate
                   reduction and improved model robustness with
                   regularization.},
  categories = {Acoustic Modelling, Regularization, Sparsity, Subspace
                   Gaussian Mixture Model},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-spl-2011.pdf},
  year = 2011
}
@inproceedings{lu2012jud,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Joint Uncertainty Decoding with Unscented Transform
                   for Noise Robust Subspace {Gaussian} Mixture Model},
  booktitle = {Proc. Sapa-Scale workshop},
  abstract = {Common noise compensation techniques use vector Taylor
                   series (VTS) to approximate the mismatch function.
                   Recent work shows that the approximation accuracy may
                   be improved by sampling. One such sampling technique is
                   the unscented transform (UT), which draws samples
                   deterministically from clean speech and noise model to
                   derive the noise corrupted speech parameters. This
                   paper applies UT to noise compensation of the subspace
                   Gaussian mixture model (SGMM). Since UT requires
                   relatively smaller number of samples for accurate
                   estimation, it has significantly lower computational
                   cost compared to other random sampling techniques.
                   However, the number of surface Gaussians in an SGMM is
                   typically very large, making the direct application of
                   UT, for compensating individual Gaussian components,
                   computationally impractical. In this paper, we avoid
                   the computational burden by employing UT in the
                   framework of joint uncertainty decoding (JUD), which
                   groups all the Gaussian components into small number of
                   classes, sharing the compensation parameters by class.
                   We evaluate the JUD-UT technique for an SGMM system
                   using the Aurora 4 corpus. Experimental results
                   indicate that UT can lead to increased accuracy
                   compared to VTS approximation if the JUD phase factor
                   is untuned, and to similar accuracy if the phase factor
                   is tuned empirically.},
  keywords = {noise compensation, SGMM, JUD, UT},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-sapa2012.pdf},
  year = 2012
}
@inproceedings{lu2012noise,
  author = {Lu, L. and Chin, K. K. and Ghoshal, A. and Renals, S.},
  title = {Noise Compensation for Subspace {Gaussian} Mixture
                   Models},
  booktitle = {Proc. INTERSPEECH},
  abstract = {Joint uncertainty decoding (JUD) is an effective
                   model-based noise compensation technique for
                   conventional Gaussian mixture model (GMM) based speech
                   recognition systems. In this paper, we apply JUD to
                   subspace Gaussian mixture model (SGMM) based acoustic
                   models. The total number of Gaussians in the SGMM
                   acoustic model is usually much larger than for
                   conventional GMMs, which limits the application of
                   approaches which explicitly compensate each Gaussian,
                   such as vector Taylor series (VTS). However, by
                   clustering the Gaussian components into a number of
                   regression classes, JUD-based noise compensation can be
                   successfully applied to SGMM systems. We evaluate the
                   JUD/SGMM technique using the Aurora 4 corpus, and the
                   experimental results indicated that it is more accurate
                   than conventional GMM-based systems using either VTS or
                   JUD noise compensation.},
  keywords = {acoustic modelling, noise compensation, SGMM, JUD},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-is2012.pdf},
  year = 2012
}
@inproceedings{lu_asru_2011,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Regularized Subspace {Gaussian} Mixture Models for
                   Cross-Lingual Speech Recognition},
  booktitle = {Proc. ASRU},
  abstract = {We investigate cross-lingual acoustic modelling for
                   low resource languages using the subspace Gaussian
                   mixture model (SGMM). We assume the presence of
                   acoustic models trained on multiple source languages,
                   and use the global subspace parameters from those
                   models for improved modelling in a target language with
                   limited amounts of transcribed speech. Experiments on
                   the GlobalPhone corpus using Spanish, Portuguese, and
                   Swedish as source languages and German as target
                   language (with 1 hour and 5 hours of transcribed audio)
                   show that multilingually trained SGMM shared parameters
                   result in lower word error rates (WERs) than using
                   those from a single source language. We also show that
                   regularizing the estimation of the SGMM state vectors
                   by penalizing their $\ell_1$-norm help to overcome
                   numerical instabilities and lead to lower WER.},
  categories = {Subspace Gaussian Mixture Model, Cross-lingual, model
                   regularization},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-asru-2011.pdf},
  year = 2011
}