The Centre for Speech Technology Research, The University of Edinburgh

Publications by Zhang Le

s0450736.bib

% Journal article: Zhang and Renals, IEEE Signal Processing Letters, vol. 15 (2008).
% NOTE(review): the key field carries a topic keyword rather than a BibTeX
% sort/label fallback; presumably a site generator reads it -- confirm before renaming.
@article{zhang-spl2008,
  author   = {Zhang, Le and Renals, Steve},
  title    = {Acoustic-Articulatory Modelling with the Trajectory {HMM}},
  journal  = {IEEE Signal Processing Letters},
  volume   = {15},
  pages    = {245--248},
  year     = {2008},
  key      = {articulatory inversion},
  pdf      = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/zhang-spl.pdf},
  abstract = {In this letter, we introduce an hidden Markov model (HMM)-based inversion system to recovery articulatory movements from speech acoustics. Trajectory HMMs are used as generative models for modelling articulatory data. Experiments on the MOCHA-TIMIT corpus indicate that the jointly trained acoustic-articulatory models are more accurate (lower RMS error) than the separately trained ones, and that trajectory HMM training results in greater accuracy compared with conventional maximum likelihood HMM training. Moreover, the system has the ability to synthesize articulatory movements directly from a textual representation.}
}
% Conference paper: Zhang and Renals, Interspeech 2006 (Pittsburgh, USA).
% The month is given as the predefined BibTeX macro so the bibliography style
% decides how it is spelled or abbreviated.
@inproceedings{zhang-icslp2006,
  author    = {Zhang, Le and Renals, Steve},
  title     = {Phone Recognition Analysis for Trajectory {HMM}},
  booktitle = {Proc. Interspeech 2006},
  address   = {Pittsburgh, USA},
  month     = sep,
  year      = {2006},
  key       = {asr},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/zhang-icslp2006.pdf},
  abstract  = {The trajectory {HMM} has been shown to be useful for model-based speech synthesis where a smoothed trajectory is generated using temporal constraints imposed by dynamic features. To evaluate the performance of such model on an ASR task, we present a trajectory decoder based on tree search with delayed path merging. Experiment on a speaker-dependent phone recognition task using the MOCHA-TIMIT database shows that the MLE-trained trajectory model, while retaining attractive properties of being a proper generative model, tends to favour over-smoothed trajectory among competing hypothesises, and does not perform better than a conventional {HMM}. We use this to build an argument that models giving better fit on training data may suffer a reduction of discrimination by being too faithful to training data. This partially explains why alternative acoustic models that try to explicitly model temporal constraints do not achieve significant improvements in ASR.}
}
% PhD thesis: Zhang, School of Informatics, University of Edinburgh (2009).
% The month is given as the predefined BibTeX macro so the bibliography style
% decides how it is spelled or abbreviated.
@phdthesis{zhang-thesis2009,
  author = {Zhang, Le},
  school = {School of Informatics, University of Edinburgh},
  title = {Modelling Speech Dynamics with Trajectory-{HMM}s},
  abstract = {The conditional independence assumption imposed by the hidden Markov models (HMMs) makes it difficult to model temporal correlation patterns in human speech. Traditionally, this limitation is circumvented by appending the first and second-order regression coefficients to the observation feature vectors. Although this leads to improved performance in recognition tasks, we argue that a straightforward use of dynamic features in HMMs will result in an inferior model, due to the incorrect handling of dynamic constraints. In this thesis I will show that an HMM can be transformed into a Trajectory-HMM capable of generating smoothed output mean trajectories, by performing a per-utterance normalisation. The resulting model can be trained by either maximising model log-likelihood or minimising mean generation errors on the training data. To combat the exponential growth of paths in searching, the idea of delayed path merging is proposed and a new time-synchronous decoding algorithm built on the concept of token-passing is designed for use in the recognition task. The Trajectory-HMM brings a new way of sharing knowledge between speech recognition and synthesis components, by tackling both problems in a coherent statistical framework. I evaluated the Trajectory-HMM on two different speech tasks using the speaker-dependent MOCHA-TIMIT database. First as a generative model to recover articulatory features from speech signal, where the Trajectory-HMM was used in a complementary way to the conventional HMM modelling techniques, within a joint Acoustic-Articulatory framework. Experiments indicate that the jointly trained acoustic-articulatory models are more accurate (having a lower Root Mean Square error) than the separately trained ones, and that Trajectory-HMM training results in greater accuracy compared with conventional Baum-Welch parameter updating. 
In addition, the Root Mean Square (RMS) training objective proves to be consistently better than the Maximum Likelihood objective. However, experiment of the phone recognition task shows that the MLE trained Trajectory-HMM, while retaining attractive properties of being a proper generative model, tends to favour over-smoothed trajectories among competing hypothesises, and does not perform better than a conventional HMM. We use this to build an argument that models giving a better fit on training data may suffer a reduction of discrimination by being too faithful to the training data. Finally, experiments on using triphone models show that increasing modelling detail is an effective way to leverage modelling performance with little added complexity in training.},
  month = jan,
  key = {speech recognition, speech synthesis, MOCHA, trajectory HMM},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/zhangle_thesis.pdf}
}
% Conference paper: Lu, Zhang, Cho and Renals, Proc. Interspeech (2015), on the
% RNN encoder-decoder for large vocabulary speech recognition.
@inproceedings{llu_is2015b,
  author        = {Lu, Liang and Zhang, Xingxing and Cho, KyungHyun and Renals, Steve},
  title         = {A Study of the Recurrent Neural Network Encoder-Decoder for Large Vocabulary Speech Recognition},
  booktitle     = {Proc. Interspeech},
  year          = {2015},
  pdf           = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/liang_is15a.pdf},
  categories    = {end-to-end speech recognition, deep neural networks, recurrent neural networks, encoder-decoder},
  abstract      = {Deep neural networks have advanced the state-of-the-art in automatic speech recognition, when combined with hidden Markov models (HMMs). Recently there has been interest in using systems based on recurrent neural networks (RNNs) to perform sequence modelling directly, without the requirement of an HMM superstructure. In this paper, we study the RNN encoder-decoder approach for large vocabulary end-to-end speech recognition, whereby an encoder transforms a sequence of acoustic vectors into a sequence of feature representations, from which a decoder recovers a sequence of words. We investigated this approach on the Switchboard corpus using a training set of around 300 hours of transcribed audio data. Without the use of an explicit language model or pronunciation lexicon, we achieved promising recognition accuracy, demonstrating that this approach warrants further investigation.},
  date-added    = {2015-08-19 10:14:21 +0100},
  date-modified = {2015-08-19 10:22:47 +0100}
}