The Centre for Speech Technology Research, The University of Edinburgh

Publications by Songfang Huang

s0562315.bib

@inproceedings{huang2008-is,
  author = {Huang, Songfang and Renals, Steve},
  title = {Unsupervised Language Model Adaptation Based on Topic and Role Information in Multiparty Meetings},
  booktitle = {Proc. Interspeech'08},
  year = {2008},
  abstract = {We continue our previous work on the modeling of topic and role information from multiparty meetings using a hierarchical Dirichlet process (HDP), in the context of language model adaptation. In this paper we focus on three problems: 1) an empirical analysis of the HDP as a nonparametric topic model; 2) the mismatch between the vocabularies of the baseline n-gram model and the HDP; and 3) an automatic speech recognition experiment to further verify the effectiveness of our adaptation framework. Experiments on a large meeting corpus of more than 70 hours of speech data show consistent and significant improvements in terms of word error rate for language model adaptation based on the topic and role information.},
  month = {September},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/interspeech08.pdf},
  pages = {833--836}
}
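
As a side note on the adaptation scheme this abstract describes: a common way to apply topic posteriors to an n-gram model is to interpolate the background model with a topic-induced unigram. Below is a minimal Python sketch under that assumption; the function names, the fixed interpolation weight, and the data layout are illustrative, not the paper's exact formulation.

def adapted_prob(word, history, ngram_prob, topic_post, topic_word_prob, lam=0.5):
    """P_adapt(w | h) = lam * P_ngram(w | h) + (1 - lam) * sum_k P(k | doc) * P(w | k)."""
    # Topic-induced unigram: mix per-topic word distributions, weighted by
    # the document-level topic posterior (e.g. inferred by the HDP).
    topic_unigram = sum(p_k * topic_word_prob[k].get(word, 0.0)
                        for k, p_k in topic_post.items())
    return lam * ngram_prob(word, history) + (1.0 - lam) * topic_unigram
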
@inproceedings{huang2009-icassp,
  author = {Huang, Songfang and Zhou, Bowen},
  title = {An {EM} Algorithm for {SCFG} in Formal Syntax-based Translation},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP'09)},
  year = {2009},
  abstract = {In this paper, we investigate the use of bilingual parsing on parallel corpora to better estimate the rule parameters in a formal syntax-based machine translation system, which are normally estimated using inaccurate heuristics. We use an Expectation-Maximization (EM) algorithm to re-estimate the parameters of synchronous context-free grammar (SCFG) rules according to the derivation knowledge from parallel corpora, based on the maximum likelihood principle rather than only on heuristic information. The proposed algorithm produces significantly better BLEU scores than a state-of-the-art formal syntax-based machine translation system on the IWSLT 2006 Chinese-to-English task.},
  month = {April},
  address = {Taipei, Taiwan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/icassp09.pdf},
  pages = {4813--4816}
}
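
To make the EM re-estimation above concrete, here is a toy Python sketch that treats each sentence pair's derivation forest as an explicit list of rule sequences, accumulates posterior-weighted rule counts (E-step), and renormalises per left-hand side (M-step). A real system would compute these expectations with inside-outside over a packed forest; representing rules as (lhs, source_rhs, target_rhs) tuples is an assumption.

import math
from collections import defaultdict

def em_scfg(derivations_per_pair, rule_probs, iterations=10):
    # derivations_per_pair: for each sentence pair, a list of candidate
    # derivations, each a list of rule tuples (lhs, source_rhs, target_rhs).
    for _ in range(iterations):
        expected = defaultdict(float)
        for derivations in derivations_per_pair:                    # E-step
            scores = [math.prod(rule_probs[r] for r in d) for d in derivations]
            z = sum(scores) or 1.0
            for deriv, score in zip(derivations, scores):
                for rule in deriv:
                    expected[rule] += score / z                     # posterior-weighted count
        lhs_totals = defaultdict(float)                             # M-step
        for rule, count in expected.items():
            lhs_totals[rule[0]] += count
        for rule, count in expected.items():
            rule_probs[rule] = count / lhs_totals[rule[0]]
    return rule_probs
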
@inproceedings{huang2009-is,
  author = {Huang, Songfang and Renals, Steve},
  title = {A Parallel Training Algorithm for Hierarchical {P}itman-{Y}or Process Language Models},
  booktitle = {Proc. Interspeech'09},
  year = {2009},
  abstract = {The hierarchical Pitman-Yor process language model (HPYLM) is a Bayesian language model based on a nonparametric prior, the Pitman-Yor process. It has been demonstrated, both theoretically and practically, that the HPYLM can provide better smoothing for language modeling, compared with state-of-the-art approaches such as interpolated Kneser-Ney and modified Kneser-Ney smoothing. However, estimation of Bayesian language models is expensive in terms of both computation time and memory; the inference is approximate and requires a number of iterations to converge. In this paper, we present a parallel training algorithm for the HPYLM, which enables the approach to be applied in the context of automatic speech recognition, using large training corpora with large vocabularies. We demonstrate the effectiveness of the proposed algorithm by estimating language models from corpora for meeting transcription containing over 200 million words, and observe significant reductions in perplexity and word error rate.},
  month = {September},
  address = {Brighton, UK},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/sh_interspeech09.pdf},
  pages = {2695--2698}
}
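
The abstract does not spell out the parallelisation, so the sketch below shows only the generic data-parallel pattern such training typically relies on: shard the corpus, collect n-gram statistics per shard in parallel, and merge before inference. This is an illustration of the pattern, not the paper's specific algorithm.

from collections import Counter
from multiprocessing import Pool

def count_trigrams(shard):
    # shard: a list of tokenised sentences.
    counts = Counter()
    for sentence in shard:
        tokens = ["<s>", "<s>"] + sentence + ["</s>"]
        for i in range(2, len(tokens)):
            counts[tuple(tokens[i - 2:i + 1])] += 1
    return counts

def parallel_counts(shards, workers=8):
    with Pool(workers) as pool:
        partials = pool.map(count_trigrams, shards)
    merged = Counter()
    for partial in partials:
        merged.update(partial)       # counts from shards simply add up
    return merged
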
@article{huang2010,
  author = {Huang, Songfang and Renals, Steve},
  doi = {10.1109/TASL.2010.2040782},
  title = {Hierarchical {Bayesian} Language Models for Conversational Speech Recognition},
  url = {http://dx.doi.org/10.1109/TASL.2010.2040782},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  number = {8},
  abstract = {Traditional n-gram language models are widely used in state-of-the-art large vocabulary speech recognition systems. These simple models suffer from some limitations, such as the overfitting of maximum-likelihood estimation and the lack of rich contextual knowledge sources. In this paper, we exploit a hierarchical Bayesian interpretation for language modeling, based on a nonparametric prior called the Pitman-Yor process. This offers a principled approach to language model smoothing, embedding the power-law distribution for natural language. Experiments on the recognition of conversational speech in multiparty meetings demonstrate that by using hierarchical Bayesian language models, we are able to achieve significant reductions in perplexity and word error rate.},
  month = {January},
  volume = {18},
  year = {2010},
  keywords = {AMI corpus, conversational speech recognition, hierarchical Bayesian model, language model (LM), meetings, smoothing},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-taslp10.pdf},
  pages = {1941--1954}
}
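
The predictive rule at the core of these models (Teh's hierarchical Pitman-Yor process) is compact enough to sketch. Given a seating arrangement from the sampler, the probability of a word interpolates a discounted count with the parent, shorter-context distribution. A minimal sketch, assuming the counts and table counts are supplied; the variable names are mine, not the paper's.

def pyp_predictive(word, context, counts, tables, discount, strength, parent_prob):
    # counts[context][word]: n-gram counts ("customers") for word at this context
    # tables[context][word]: tables serving word at this context (from sampling)
    # parent_prob: P(word | shorter context), the back-off distribution
    c_w = counts[context].get(word, 0)
    t_w = tables[context].get(word, 0)
    c_total = sum(counts[context].values())
    t_total = sum(tables[context].values())
    if c_total == 0:                      # unseen context: back off entirely
        return parent_prob
    direct = max(c_w - discount * t_w, 0.0) / (strength + c_total)
    backoff_mass = (strength + discount * t_total) / (strength + c_total)
    return direct + backoff_mass * parent_prob
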
@incollection{huang2007-mlmi,
  editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
  author = {Huang, Songfang and Renals, Steve},
  publisher = {Springer},
  title = {Modeling Prosodic Features in Language Models for Meetings},
  series = {Lecture Notes in Computer Science},
  booktitle = {Machine Learning for Multimodal Interaction IV},
  abstract = {Prosody has been actively studied as an important knowledge source for speech recognition and understanding. In this paper, we are concerned with the question of how to exploit prosody in language models to aid automatic speech recognition in the context of meetings. Using an automatic syllable detection algorithm, syllable-based prosodic features are extracted to form the prosodic representation for each word. Two modeling approaches are then investigated. One is based on a factored language model, which directly uses the prosodic representation and treats it as a `word'. Instead of this direct association, the second approach provides a richer probabilistic structure within a hierarchical Bayesian framework, introducing an intermediate latent variable to represent similar prosodic patterns shared by groups of words. Four-fold cross-validation experiments on the ICSI Meeting Corpus show that exploiting prosody for language modeling can significantly reduce perplexity, and also yields marginal reductions in word error rate.},
  volume = {4892},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/mlmi07.pdf},
  pages = {191--202}
}
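
The first, direct-association approach in this abstract can be pictured with a small sketch: attach a quantised prosodic tag to each word and back off from the (word, prosody) context to the word-only context. The fixed back-off weight and the count tables below are illustrative assumptions, not the factored-LM setup used in the paper.

def factored_prob(word, prev_word, prev_prosody, joint_counts, word_counts, alpha=0.4):
    # joint_counts[(prev_word, prev_prosody)][word] and word_counts[prev_word][word]
    # are bigram-style count tables built from the training data.
    joint_ctx = joint_counts.get((prev_word, prev_prosody), {})
    word_ctx = word_counts.get(prev_word, {})
    joint_total = sum(joint_ctx.values())
    word_total = sum(word_ctx.values()) or 1
    p_backoff = word_ctx.get(word, 0) / word_total
    if joint_total == 0:                       # prosodic context unseen: back off
        return p_backoff
    return (1 - alpha) * joint_ctx.get(word, 0) / joint_total + alpha * p_backoff
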
@inproceedings{huang2007-asru,
  author = {Huang, Songfang and Renals, Steve},
  title = {Hierarchical {Pitman-Yor} Language Models for {ASR} in Meetings},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU'07)},
  year = {2007},
  abstract = {In this paper we investigate the application of a novel technique for language modeling --- a hierarchical Bayesian language model (LM) based on the Pitman-Yor process --- to automatic speech recognition (ASR) for multiparty meetings. The hierarchical Pitman-Yor language model (HPYLM), which was originally proposed in the machine learning field, provides a Bayesian interpretation of language modeling. An approximation to the HPYLM recovers the exact formulation of the interpolated Kneser-Ney smoothing method in n-gram models. This paper focuses on the application and scalability of the HPYLM in a practical large vocabulary ASR system. Experimental results on the NIST RT06s evaluation meeting data verify that the HPYLM is a competitive and promising language modeling technique, which consistently performs better than interpolated Kneser-Ney and modified Kneser-Ney n-gram LMs in terms of both perplexity (PPL) and word error rate (WER).},
  month = {December},
  address = {Kyoto, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/asru07.pdf},
  pages = {124--129}
}
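
The approximation mentioned in this abstract is worth spelling out: if the strength parameter is set to zero and each word type is restricted to a single table, the Pitman-Yor predictive rule collapses to the familiar interpolated Kneser-Ney form (highest order shown; the continuation counts used at lower orders are elided). A sketch under those assumptions:

def interpolated_kneser_ney(word, context, counts, discount, parent_prob):
    c_w = counts[context].get(word, 0)
    c_total = sum(counts[context].values())
    if c_total == 0:
        return parent_prob
    # With one table per word type, the total table count equals the number
    # of distinct word types seen after this context.
    n_types = sum(1 for c in counts[context].values() if c > 0)
    direct = max(c_w - discount, 0.0) / c_total
    backoff_mass = discount * n_types / c_total
    return direct + backoff_mass * parent_prob
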
@inproceedings{huang2010a,
  author = {Huang, Songfang and Renals, Steve},
  doi = {10.1109/ICASSP.2010.5495007},
  title = {Power Law Discounting for N-Gram Language Models},
  url = {http://dx.doi.org/10.1109/ICASSP.2010.5495007},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP'10)},
  abstract = {We present an approximation to the Bayesian hierarchical Pitman-Yor process language model which maintains the power law distribution over word tokens, while not requiring a computationally expensive approximate inference process. This approximation, which we term power law discounting, has a similar computational complexity to interpolated and modified Kneser-Ney smoothing. We performed experiments on meeting transcription using the NIST RT06s evaluation data and the AMI corpus, with a vocabulary of 50,000 words and a language model training set of up to 211 million words. Our results indicate that power law discounting results in statistically significant reductions in perplexity and word error rate compared to both interpolated and modified Kneser-Ney smoothing, while producing similar results to the hierarchical Pitman-Yor process language model.},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-icassp10.pdf},
  pages = {5178--5181}
}
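
One reading of this abstract (my interpretation, not code from the paper) is that the sampled table counts of the HPYLM are replaced by a deterministic power of the raw counts, preserving the power-law behaviour while removing the need for Gibbs sampling. A hedged sketch along those lines, with the strength parameter taken as zero:

def power_law_discount_prob(word, context, counts, discount, parent_prob):
    c_w = counts[context].get(word, 0)
    c_total = sum(counts[context].values())
    if c_total == 0:
        return parent_prob
    # Assumed approximation: table count t_w = c_w ** discount, so frequent
    # words lose more mass in absolute terms but less relative to their count.
    tables = {w: c ** discount for w, c in counts[context].items()}
    direct = max(c_w - discount * tables.get(word, 0.0), 0.0) / c_total
    backoff_mass = discount * sum(tables.values()) / c_total
    return direct + backoff_mass * parent_prob
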
@incollection{huang2008-mlmi,
  editor = {Popescu-Belis, A. and Stiefelhagen, R.},
  author = {Huang, Songfang and Renals, Steve},
  publisher = {Springer},
  title = {Modeling Topic and Role Information in Meetings using the Hierarchical {D}irichlet Process},
  series = {Lecture Notes in Computer Science},
  booktitle = {Machine Learning for Multimodal Interaction V},
  abstract = {In this paper, we address the modeling of topic and role information in multiparty meetings, via a nonparametric Bayesian model called the hierarchical Dirichlet process. This model provides a powerful solution to topic modeling and a flexible framework for the incorporation of other cues such as speaker role information. We present our modeling framework for topic and role on the AMI Meeting Corpus, and illustrate the effectiveness of the approach in the context of adapting a baseline language model in a large-vocabulary automatic speech recognition system for multiparty meetings. The adapted LM produces significant improvements in terms of both perplexity and word error rate.},
  volume = {5237},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/mlmi08.pdf},
  pages = {214--225}
}
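
For readers who want to reproduce the topic-modeling step, gensim ships a hierarchical Dirichlet process implementation. A usage sketch, assuming one meeting (or one role's pooled utterances) per line in a plain-text file; the file name and preprocessing are placeholders.

from gensim.corpora import Dictionary
from gensim.models import HdpModel

# One document per line, whitespace-tokenised (assumed format).
docs = [line.split() for line in open("meeting_transcripts.txt")]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Nonparametric: the number of topics is inferred from the data.
hdp = HdpModel(corpus, id2word=dictionary)
for topic_id, words in hdp.show_topics(num_topics=5, formatted=False):
    print(topic_id, [w for w, _ in words])
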
@inproceedings{huang2008-ptkl,
  author = {Huang, Songfang and Renals, Steve},
  title = {Using Participant Role in Multiparty Meetings as Prior Knowledge for Nonparametric Topic Modeling},
  booktitle = {Proc. ICML/UAI/COLT Workshop on Prior Knowledge for Text and Language Processing},
  year = {2008},
  abstract = {In this paper we describe our attempts to incorporate participant role information in multiparty meetings into document modeling using the hierarchical Dirichlet process. The perplexity and automatic speech recognition results demonstrate that participant role information is a promising source of prior knowledge to combine with language models for automatic speech recognition and interaction modeling in multiparty meetings.},
  month = {July},
  address = {Helsinki, Finland},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ptkl.pdf},
  pages = {21--24}
}