The Centre for Speech Technology Research, The University of Edinburgh

Publications by Zhang Le

@article{zhang-spl2008,
  author = {Le Zhang and Steve Renals},
  title = {Acoustic-Articulatory Modelling with the Trajectory
                   {HMM}},
  journal = {IEEE Signal Processing Letters},
  volume = 15,
  pages = {245--248},
  abstract = { In this letter, we introduce a hidden Markov model
                   (HMM)-based inversion system to recover articulatory
                   movements from speech acoustics. Trajectory HMMs are
                   used as generative models for modelling articulatory
                   data. Experiments on the MOCHA-TIMIT corpus indicate
                   that the jointly trained acoustic-articulatory models
                   are more accurate (lower RMS error) than the separately
                   trained ones, and that trajectory HMM training results
                   in greater accuracy compared with conventional maximum
                   likelihood HMM training. Moreover, the system has the
                   ability to synthesize articulatory movements directly
                   from a textual representation. },
  key = {articulatory inversion},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/zhang-spl.pdf},
  year = 2008
}
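
The trajectory-HMM construction used in this letter follows the standard
formulation due to Tokuda and colleagues (background, not a result of
the letter itself): the observation vector stacks static and dynamic
features as a linear transform o = W c of the static trajectory c, so a
fixed state sequence q induces a smoothed maximum-likelihood trajectory

    \bar{c}_q = (W^T \Sigma_q^{-1} W)^{-1} W^T \Sigma_q^{-1} \mu_q

where \mu_q and \Sigma_q stack the state means and (diagonal)
covariances along q, and W is the window matrix that appends the
regression coefficients.
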
@incollection{alhames-mlmi05,
  author = {M. Al-Hames and A. Dielmann and D. Gatica-Perez and S.
                   Reiter and S. Renals and G. Rigoll and D. Zhang},
  title = {Multimodal Integration for Meeting Group Action
                   Segmentation and Recognition},
  booktitle = {Proc. Multimodal Interaction and Related Machine
                   Learning Algorithms Workshop (MLMI--05)},
  publisher = {Springer},
  editor = {S. Renals and S. Bengio},
  pages = {52--63},
  abstract = {We address the problem of segmentation and recognition
                   of sequences of multimodal human interactions in
                   meetings. These interactions can be seen as a rough
                   structure of a meeting, and can be used either as input
                   for a meeting browser or as a first step towards a
                   higher semantic analysis of the meeting. A common
                   lexicon of multimodal group meeting actions, a shared
                   meeting data set, and a common evaluation procedure
                   enable us to compare the different approaches. We
                   compare three different multimodal feature sets and our
                   modelling infrastructures: a higher semantic feature
                   approach, multi-layer HMMs, a multi-stream DBN, as well
                   as a multi-stream mixed-state DBN for disturbed data.},
  categories = {m4,ami,multimodal,dbn,meetings,edinburgh,IDIAP,munich},
  year = 2006
}
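
For reference, multi-stream models of the kind compared above typically
combine per-stream observation likelihoods with exponential stream
weights (a standard formulation, not a detail taken from this paper):

    b_j(o_t) = \prod_s b_{j,s}(o_{s,t})^{\gamma_s}

where o_{s,t} is the feature vector of stream s at time t and \gamma_s
is a tunable stream weight.
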
@inproceedings{zhang-icslp2006,
  author = {Le Zhang and Steve Renals},
  title = {Phone Recognition Analysis for Trajectory {HMM}},
  booktitle = {Proc. Interspeech 2006},
  address = {Pittsburgh, USA},
  abstract = { The trajectory {HMM} has been shown to be useful for
                   model-based speech synthesis where a smoothed
                   trajectory is generated using temporal constraints
                   imposed by dynamic features. To evaluate the
                   performance of such a model on an ASR task, we present
                   a trajectory decoder based on tree search with delayed
                   path merging. Experiments on a speaker-dependent phone
                   recognition task using the MOCHA-TIMIT database show
                   that the MLE-trained trajectory model, while retaining
                   the attractive property of being a proper generative
                   model, tends to favour over-smoothed trajectories among
                   competing hypotheses, and does not perform better
                   than a conventional {HMM}. We use this to argue that
                   models giving a better fit on the training data may
                   suffer a reduction in discrimination by being too
                   faithful to that data. This partially explains why
                   alternative acoustic models that try to explicitly
                   model temporal constraints do not achieve significant
                   improvements in ASR. },
  key = {asr},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/zhang-icslp2006.pdf},
  year = 2006
}
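
Because the trajectory likelihood is not Markovian in the state
sequence, hypotheses cannot be merged on their current state alone, as
in conventional Viterbi decoding. The Python sketch below illustrates
the delayed path merging idea named in the abstract; the data structures
and the merge criterion (identical state history over the last few
frames) are illustrative assumptions, not the paper's exact algorithm.

    # Token-passing with delayed path merging (illustrative sketch).
    from dataclasses import dataclass

    @dataclass
    class Token:
        log_prob: float   # accumulated path score
        history: tuple    # state sequence decoded so far

    def advance(tokens, log_obs, log_trans, delay):
        # log_obs: state -> log observation score for this frame.
        # log_trans: state -> list of (next_state, log transition prob).
        # delay: merge tokens only if their last `delay` states agree;
        # delay=1 recovers conventional Viterbi merging.
        best = {}
        for tok in tokens:
            for nxt, log_a in log_trans[tok.history[-1]]:
                hist = tok.history + (nxt,)
                score = tok.log_prob + log_a + log_obs[nxt]
                key = hist[-delay:]
                if key not in best or score > best[key].log_prob:
                    best[key] = Token(score, hist)
        return list(best.values())
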
@phdthesis{zhang-thesis2009,
  author = {Le Zhang},
  title = {Modelling Speech Dynamics with Trajectory-{HMM}s},
  school = {School of Informatics, University of Edinburgh},
  abstract = { The conditional independence assumption imposed by
                   hidden Markov models (HMMs) makes it difficult to
                   model temporal correlation patterns in human speech.
                   Traditionally, this limitation is circumvented by
                   appending the first- and second-order regression
                   coefficients to the observation feature vectors.
                   Although this leads to improved performance in
                   recognition tasks, we argue that a straightforward use
                   of dynamic features in HMMs will result in an inferior
                   model, due to the incorrect handling of dynamic
                   constraints. In this thesis I will show that an HMM can
                   be transformed into a Trajectory-HMM capable of
                   generating smoothed output mean trajectories, by
                   performing a per-utterance normalisation. The resulting
                   model can be trained by either maximising model
                   log-likelihood or minimising mean generation errors on
                   the training data. To combat the exponential growth of
                   paths in searching, the idea of delayed path merging is
                   proposed and a new time-synchronous decoding algorithm
                   built on the concept of token-passing is designed for
                   use in the recognition task. The Trajectory-HMM brings
                   a new way of sharing knowledge between speech
                   recognition and synthesis components, by tackling both
                   problems in a coherent statistical framework. I
                   evaluated the Trajectory-HMM on two different speech
                   tasks using the speaker-dependent MOCHA-TIMIT database.
                   First, as a generative model to recover articulatory
                   features from the speech signal, the Trajectory-HMM
                   was used in a complementary way to conventional HMM
                   modelling techniques, within a joint
                   acoustic-articulatory framework. Experiments indicate
                   that the jointly trained acoustic-articulatory models
                   are more accurate (having a lower Root Mean Square
                   error) than the separately trained ones, and that
                   Trajectory-HMM training results in greater accuracy
                   compared with conventional Baum-Welch parameter
                   updating. In addition, the Root Mean Square (RMS)
                   training objective proves to be consistently better
                   than the Maximum Likelihood objective. However,
                   experiments on the phone recognition task show that
                   the MLE-trained Trajectory-HMM, while retaining the
                   attractive property of being a proper generative
                   model, tends to favour over-smoothed trajectories
                   among competing hypotheses, and does not perform
                   better than a conventional HMM. We use this to argue
                   that models giving a better fit on the training data
                   may suffer a reduction in discrimination by being too
                   faithful to that data. Finally, experiments using
                   triphone models show that increasing modelling detail
                   is an effective way to improve modelling performance
                   with little added complexity in training. },
  key = {speech recognition, speech synthesis, MOCHA,
                   trajectory HMM},
  month = jan,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/zhangle_thesis.pdf},
  year = 2009
}
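
The per-utterance normalisation mentioned in the thesis abstract can be
made concrete with a small numerical sketch. Assuming one-dimensional
static features plus a single delta stream and diagonal variances (a
simplification of the thesis setting), the smoothed mean trajectory for
a fixed state sequence solves the usual weighted least-squares system:

    import numpy as np

    def trajectory_mean(mu, var, delta_win=(-0.5, 0.0, 0.5)):
        # mu, var: length-2T arrays interleaving [static, delta] values
        # per frame, read off the Gaussians of a fixed state sequence.
        # Returns the length-T static trajectory c maximising the
        # trajectory-HMM likelihood: (W' S^-1 W) c = W' S^-1 mu.
        T = len(mu) // 2
        W = np.zeros((2 * T, T))
        for t in range(T):
            W[2 * t, t] = 1.0                  # static row picks out c_t
            for k, w in enumerate(delta_win, start=-1):
                if 0 <= t + k < T:             # delta row: windowed diff
                    W[2 * t + 1, t + k] += w
        S_inv = np.diag(1.0 / np.asarray(var, dtype=float))
        mu = np.asarray(mu, dtype=float)
        return np.linalg.solve(W.T @ S_inv @ W, W.T @ S_inv @ mu)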