Publications by Dong Wang
dwang2.bib
@article{Wang_JCST2012,
  author   = {Dong Wang and Javier Tejedor and Simon King and Joe Frankel},
  title    = {Term-dependent Confidence Normalization for Out-of-Vocabulary Spoken Term Detection},
  journal  = {Journal of Computer Science and Technology},
  volume   = {27},
  number   = {2},
  abstract = {Spoken Term Detection (STD) is a fundamental component of spoken information retrieval systems. A key task of an STD system is to determine reliable detections and reject false alarms based on certain confidence measures. The detection posterior probability, which is often computed from lattices, is a widely used confidence measure. However, a potential problem of this confidence measure is that the confidence scores of detections of all search terms are treated uniformly, regardless of how much they may differ in terms of phonetic or linguistic properties. This problem is particularly evident for out-of-vocabulary (OOV) terms which tend to exhibit high intra-term diversity. To address the discrepancy on confidence levels that the same confidence score may convey for different terms, a term-dependent decision strategy is desirable -- for example, the term-specific threshold (TST) approach. In this work, we propose a term-dependent normalisation technique which compensates for term diversity on confidence estimation. Particularly, we propose a linear bias compensation and a discriminative compensation to deal with the bias problem that is inherent in lattice-based confidence measuring from which the TST approach suffers. We tested the proposed technique on speech data from the multi-party meeting domain with two state-of-the-art STD systems based on phonemes and words respectively. The experimental results demonstrate that the confidence normalisation approach leads to a significant performance improvement in STD, particularly for OOV terms with phoneme-based systems.},
  doi      = {10.1007/s11390-012-1228-x},
  year     = 2012
}
@inproceedings{wang_acmsccs2010,
  author        = {Dong Wang and Simon King and Nick Evans and Raphael Troncy},
  title         = {Direct Posterior Confidence For Out-of-Vocabulary Spoken Term Detection},
  booktitle     = {Proc. {ACM} Multimedia 2010 Searching Spontaneous Conversational Speech Workshop},
  abstract      = {Spoken term detection (STD) is a fundamental task in spoken information retrieval. Compared to conventional speech transcription and keyword spotting, STD is an open-vocabulary task and is necessarily required to address out-of-vocabulary (OOV) terms. Approaches based on subword units, e.g. phonemes, are widely used to solve the OOV issue; however, performance on OOV terms is still significantly inferior to that for in-vocabulary (INV) terms. The performance degradation on OOV terms can be attributed to a multitude of factors. A particular factor we address in this paper is that the acoustic and language models used for speech transcribing are highly vulnerable to OOV terms, which leads to unreliable confidence measures and error-prone detections. A direct posterior confidence measure that is derived from discriminative models has been proposed for STD. In this paper, we utilize this technique to tackle the weakness of OOV terms in confidence estimation. Neither acoustic models nor language models being included in the computation, the new confidence avoids the weak modeling problem with OOV terms. Our experiments, set up on multi-party meeting speech which is highly spontaneous and conversational, demonstrate that the proposed technique improves STD performance on OOV terms significantly; when combined with conventional lattice-based confidence, a significant improvement in performance is obtained on both INVs and OOVs. Furthermore, the new confidence measure technique can be combined together with other advanced techniques for OOV treatment, such as stochastic pronunciation modeling and term-dependent confidence discrimination, which leads to an integrated solution for OOV STD with greatly improved performance.},
  categories    = {confidence estimation, spoken term detection, speech recognition},
  doi           = {10.1145/1878101.1878107},
  month         = oct,
  pdf           = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang_acmsccs2010.pdf},
  internal-note = {NOTE(review): same title and DOI as entry Wang_TOIS2012 -- apparent duplicate; keep one key or alias the other. Month also disagrees (oct here vs sep there) -- verify against the workshop programme.},
  year          = 2010
}
@inproceedings{Wang_TOIS2012,
  author        = {Wang, Dong and King, Simon and Evans, Nicholas W. D. and Troncy, Rapha{\"e}l},
  title         = {Direct posterior confidence for out-of-vocabulary spoken term detection},
  booktitle     = {{SSCS} 2010, {ACM} Workshop on Searching Spontaneous Conversational Speech, September 20-24, 2010, Firenze, Italy},
  address       = {Firenze, Italy},
  abstract      = {Spoken term detection (STD) is a fundamental task in spoken information retrieval. Compared to conventional speech transcription and keyword spotting, STD is an open-vocabulary task and is necessarily required to address out-of-vocabulary (OOV) terms. Approaches based on subword units, e.g. phonemes, are widely used to solve the OOV issue; however, performance on OOV terms is still significantly inferior to that for in-vocabulary (INV) terms. The performance degradation on OOV terms can be attributed to a multitude of factors. A particular factor we address in this paper is that the acoustic and language models used for speech transcribing are highly vulnerable to OOV terms, which leads to unreliable confidence measures and error-prone detections. A direct posterior confidence measure that is derived from discriminative models has been proposed for STD. In this paper, we utilize this technique to tackle the weakness of OOV terms in confidence estimation. Neither acoustic models nor language models being included in the computation, the new confidence avoids the weak modeling problem with OOV terms. Our experiments, set up on multi-party meeting speech which is highly spontaneous and conversational, demonstrate that the proposed technique improves STD performance on OOV terms significantly; when combined with conventional lattice-based confidence, a significant improvement in performance is obtained on both INVs and OOVs. Furthermore, the new confidence measure technique can be combined together with other advanced techniques for OOV treatment, such as stochastic pronunciation modeling and term-dependent confidence discrimination, which leads to an integrated solution for OOV STD with greatly improved performance.},
  doi           = {10.1145/1878101.1878107},
  month         = sep,
  internal-note = {NOTE(review): key says TOIS2012 but this is a 2010 workshop paper; also an apparent duplicate of wang_acmsccs2010 (same title and DOI). Consider removing one entry or using it as a key alias.},
  year          = 2010
}
@inproceedings{wang:frankel:tejedor:king:icassp2008,
  author    = {Dong Wang and Joe Frankel and Javier Tejedor and Simon King},
  title     = {A comparison of phone and grapheme-based spoken term detection},
  booktitle = {Proc. ICASSP},
  pages     = {4969--4972},
  abstract  = {We propose grapheme-based sub-word units for spoken term detection (STD). Compared to phones, graphemes have a number of potential advantages. For out-of-vocabulary search terms, phone-based approaches must generate a pronunciation using letter-to-sound rules. Using graphemes obviates this potentially error-prone hard decision, shifting pronunciation modelling into the statistical models describing the observation space. In addition, long-span grapheme language models can be trained directly from large text corpora. We present experiments on Spanish and English data, comparing phone and grapheme-based STD. For Spanish, where phone and grapheme-based systems give similar transcription word error rates (WERs), grapheme-based STD significantly outperforms a phone-based approach. The converse is found for English, where the phone-based system outperforms a grapheme approach. However, we present additional analysis which suggests that phone-based STD performance levels may be achieved by a grapheme-based approach despite lower transcription accuracy, and that the two approaches may usefully be combined. We propose a number of directions for future development of these ideas, and suggest that if grapheme-based STD can match phone-based performance, the inherent flexibility in dealing with out-of-vocabulary terms makes this a desirable approach.},
  doi       = {10.1109/ICASSP.2008.4518773},
  month     = mar # "--" # apr,
  year      = 2008
}
@article{5510125,
  author        = {Wang, D. and King, S. and Frankel, J.},
  title         = {Stochastic Pronunciation Modelling for Out-of-Vocabulary Spoken Term Detection},
  journal       = {IEEE Transactions on Audio, Speech, and Language Processing},
  volume        = {PP},
  number        = {99},
  abstract      = {Spoken term detection (STD) is the name given to the task of searching large amounts of audio for occurrences of spoken terms, which are typically single words or short phrases. One reason that STD is a hard task is that search terms tend to contain a disproportionate number of out-of-vocabulary (OOV) words. The most common approach to STD uses subword units. This, in conjunction with some method for predicting pronunciations of OOVs from their written form, enables the detection of OOV terms but performance is considerably worse than for in-vocabulary terms. This performance differential can be largely attributed to the special properties of OOVs. One such property is the high degree of uncertainty in the pronunciation of OOVs. We present a stochastic pronunciation model (SPM) which explicitly deals with this uncertainty. The key insight is to search for all possible pronunciations when detecting an OOV term, explicitly capturing the uncertainty in pronunciation. This requires a probabilistic model of pronunciation, able to estimate a distribution over all possible pronunciations. We use a joint-multigram model (JMM) for this and compare the JMM-based SPM with the conventional soft match approach. Experiments using speech from the meetings domain demonstrate that the SPM performs better than soft match in most operating regions, especially at low false alarm probabilities. Furthermore, SPM and soft match are found to be complementary: their combination provides further performance gains.},
  categories    = {confidence estimation, spoken term detection, speech recognition, OOVs},
  doi           = {10.1109/TASL.2010.2058800},
  issn          = {1558-7916},
  month         = jul,
  internal-note = {NOTE(review): volume={PP}, number={99} is IEEE early-access placeholder data; update to the final volume/number/pages once known (look up via the DOI).},
  year          = 2010
}
@inproceedings{dongwang_interspeech09_spm,
  author     = {Dong Wang and Simon King and Joe Frankel},
  title      = {Stochastic Pronunciation Modelling for Spoken Term Detection},
  booktitle  = {Proc. Interspeech},
  pages      = {2135--2138},
  address    = {Brighton, UK},
  abstract   = {A major challenge faced by a spoken term detection (STD) system is the detection of out-of-vocabulary (OOV) terms. Although a subword-based STD system is able to detect OOV terms, performance reduction is always observed compared to in-vocabulary terms. Current approaches to STD do not acknowledge the particular properties of OOV terms, such as pronunciation uncertainty. In this paper, we use a stochastic pronunciation model to deal with the uncertain pronunciations of OOV terms. By considering all possible term pronunciations, predicted by a joint-multigram model, we observe a significant performance improvement.},
  categories = {joint-multigram, pronunciation model, spoken term detection, speech recognition},
  month      = sep,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/spm.pdf},
  year       = 2009
}
@inproceedings{dongwang_icassp09,
  author     = {Dong Wang and Javier Tejedor and Joe Frankel and Simon King},
  title      = {Posterior-based confidence measures for spoken term detection},
  booktitle  = {Proc. ICASSP},
  address    = {Taiwan},
  abstract   = {Confidence measures play a key role in spoken term detection (STD) tasks. The confidence measure expresses the posterior probability of the search term appearing in the detection period, given the speech. Traditional approaches are based on the acoustic and language model scores for candidate detections found using automatic speech recognition, with Bayes' rule being used to compute the desired posterior probability. In this paper, we present a novel direct posterior-based confidence measure which, instead of resorting to the Bayesian formula, calculates posterior probabilities from a multi-layer perceptron (MLP) directly. Compared with traditional Bayesian-based methods, the direct-posterior approach is conceptually and mathematically simpler. Moreover, the MLP-based model does not require assumptions to be made about the acoustic features such as their statistical distribution and the independence of static and dynamic co-efficients. Our experimental results in both English and Spanish demonstrate that the proposed direct posterior-based confidence improves STD performance.},
  categories = {Spoken term detection, confidence measure, posterior probabilities, MLP},
  month      = apr,
  pages      = {4889--4892},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/posterior.pdf},
  year       = 2009
}
@inproceedings{dongwang_interspeech09_conf,
  author     = {Dong Wang and Simon King and Joe Frankel and Peter Bell},
  title      = {Term-Dependent Confidence for Out-of-Vocabulary Term Detection},
  booktitle  = {Proc. Interspeech},
  pages      = {2139--2142},
  address    = {Brighton, UK},
  abstract   = {Within a spoken term detection (STD) system, the decision maker plays an important role in retrieving reliable detections. Most of the state-of-the-art STD systems make decisions based on a confidence measure that is term-independent, which poses a serious problem for out-of-vocabulary (OOV) term detection. In this paper, we study a term-dependent confidence measure based on confidence normalisation and discriminative modelling, particularly focusing on its remarkable effectiveness for detecting OOV terms. Experimental results indicate that the term-dependent confidence provides much more significant improvement for OOV terms than terms in-vocabulary.},
  categories = {joint-multigram, pronunciation model, spoken term detection, speech recognition},
  month      = sep,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/conf.pdf},
  year       = 2009
}
@inproceedings{dong_ivan_joe_simon_interspeech08_marray,
  author     = {Dong Wang and Ivan Himawan and Joe Frankel and Simon King},
  title      = {A Posterior Approach for Microphone Array Based Speech Recognition},
  booktitle  = {Proc. Interspeech},
  pages      = {996--999},
  abstract   = {Automatic speech recognition (ASR) becomes rather difficult in meetings domains because of the adverse acoustic conditions, including more background noise, more echo and reverberation and frequent cross-talking. Microphone arrays have been demonstrated able to boost ASR performance dramatically in such noisy and reverberant environments, with various beamforming algorithms. However, almost all existing beamforming measures work in the acoustic domain, resorting to signal processing theories and geometric explanation. This limits their application, and induces significant performance degradation when the geometric property is unavailable or hard to estimate, or if heterogenous channels exist in the audio system. In this paper, we present a new posterior-based approach for array-based speech recognition. The main idea is, instead of enhancing speech signals, we try to enhance the posterior probabilities that frames belonging to recognition units, e.g., phones. These enhanced posteriors are then transferred to posterior probability based features and are modeled by HMMs, leading to a tandem ANN-HMM hybrid system presented by Hermansky et al.. Experimental results demonstrated the validity of this posterior approach. With the posterior accumulation or enhancement, significant improvement was achieved over the single channel baseline. Moreover, we can combine the acoustic enhancement and posterior enhancement together, leading to a hybrid acoustic-posterior beamforming approach, which works significantly better than just the acoustic beamforming, especially in the scenario with moving-speakers.},
  categories = {speech recognition, microphone array, beamforming, tandem approach},
  month      = sep,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/marray.a.pdf},
  year       = 2008
}
@inproceedings{wang_interspeech10,
  author     = {Dong Wang and Simon King and Nick Evans and Raphael Troncy},
  title      = {{CRF}-based Stochastic Pronunciation Modelling for Out-of-Vocabulary Spoken Term Detection},
  booktitle  = {Proc. Interspeech},
  address    = {Makuhari, Chiba, Japan},
  abstract   = {Out-of-vocabulary (OOV) terms present a significant challenge to spoken term detection (STD). This challenge, to a large extent, lies in the high degree of uncertainty in pronunciations of OOV terms. In previous work, we presented a stochastic pronunciation modeling (SPM) approach to compensate for this uncertainty. A shortcoming of our original work, however, is that the SPM was based on a joint-multigram model (JMM), which is suboptimal. In this paper, we propose to use conditional random fields (CRFs) for letter-to-sound conversion, which significantly improves quality of the predicted pronunciations. When applied to OOV STD, we achieve considerable performance improvement with both a 1-best system and an SPM-based system.},
  categories = {speech recognition, spoken term detection, conditional random field, joint multigram model},
  month      = sep,
  year       = 2010
}
@article{wang_ieeesigprocletters2011,
  author     = {Dong Wang and Simon King},
  title      = {Letter-to-Sound Pronunciation Prediction Using Conditional Random Fields},
  journal    = {IEEE Signal Processing Letters},
  volume     = {18},
  number     = {2},
  pages      = {122--125},
  abstract   = {Pronunciation prediction, or letter-to-sound (LTS) conversion, is an essential task for speech synthesis, open vocabulary spoken term detection and other applications dealing with novel words. Most current approaches (at least for English) employ data-driven methods to learn and represent pronunciation ``rules'' using statistical models such as decision trees, hidden Markov models (HMMs) or joint-multigram models (JMMs). The LTS task remains challenging, particularly for languages with a complex relationship between spelling and pronunciation such as English. In this paper, we propose to use a conditional random field (CRF) to perform LTS because it avoids having to model a distribution over observations and can perform global inference, suggesting that it may be more suitable for LTS than decision trees, HMMs or JMMs. One challenge in applying CRFs to LTS is that the phoneme and grapheme sequences of a word are generally of different lengths, which makes CRF training difficult. To solve this problem, we employed a joint-multigram model to generate aligned training exemplars. Experiments conducted with the AMI05 dictionary demonstrate that a CRF significantly outperforms other models, especially if n-best lists of predictions are generated.},
  categories = {letter-to-sound, conditional random field, joint multigram model, speech synthesis, spoken term detection},
  doi        = {10.1109/LSP.2010.2098440},
  month      = feb,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_ieeesigprocletters2011.pdf},
  year       = 2011
}
@inproceedings{wang_std_covariance_icassp2010,
  author     = {Wang, Dong and King, Simon and Frankel, Joe and Bell, Peter},
  title      = {Stochastic Pronunciation Modelling and Soft Match for Out-of-vocabulary Spoken Term Detection},
  booktitle  = {Proc. ICASSP},
  address    = {Dallas, Texas, USA},
  abstract   = {A major challenge faced by a spoken term detection (STD) system is the detection of out-of-vocabulary (OOV) terms. Although a subword-based STD system is able to detect OOV terms, performance reduction is always observed compared to in-vocabulary terms. One challenge that OOV terms bring to STD is the pronunciation uncertainty. A commonly used approach to address this problem is a soft matching procedure, and the other is the stochastic pronunciation modelling (SPM) proposed by the authors. In this paper we compare these two approaches, and combine them using a discriminative decision strategy. Experimental results demonstrated that SPM and soft match are highly complementary, and their combination gives significant performance improvement to OOV term detection.},
  categories = {confidence estimation, spoken term detection, speech recognition},
  month      = mar,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang10_icassp.pdf},
  year       = 2010
}
@inproceedings{joe_dong_simon_interspeech08_bottle,
  author     = {Joe Frankel and Dong Wang and Simon King},
  title      = {Growing bottleneck features for tandem {ASR}},
  booktitle  = {Proc. Interspeech},
  pages      = {1549},
  abstract   = { We present a method for training bottleneck MLPs for use in tandem ASR. Experiments on meetings data show that this approach leads to improved performance compared with training MLPs from a random initialization. },
  categories = {tandem ASR, bottleneck MLP},
  month      = sep,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bottlenet.a.pdf},
  year       = 2008
}
@inproceedings{dongwang_interspeech09_cmb,
  author     = {Javier Tejedor and Dong Wang and Simon King and Joe Frankel and Jose Colas},
  title      = {A Posterior Probability-based System Hybridisation and Combination for Spoken Term Detection},
  booktitle  = {Proc. Interspeech},
  pages      = {2131--2134},
  address    = {Brighton, UK},
  abstract   = {Spoken term detection (STD) is a fundamental task for multimedia information retrieval. To improve the detection performance, we have presented a direct posterior-based confidence measure generated from a neural network. In this paper, we propose a detection-independent confidence estimation based on the direct posterior confidence measure, in which the decision making is totally separated from the term detection. Based on this idea, we first present a hybrid system which conducts the term detection and confidence estimation based on different sub-word units, and then propose a combination method which merges detections from heterogeneous term detectors based on the direct posterior-based confidence. Experimental results demonstrated that the proposed methods improved system performance considerably for both English and Spanish.},
  categories = {joint-multigram, pronunciation model, spoken term detection, speech recognition},
  month      = sep,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cmb.pdf},
  year       = 2009
}
@article{tejedor:wang:frankel:king:colas:specom2008,
  author     = {Javier Tejedor and Dong Wang and Joe Frankel and Simon King and Jos{\'e} Col{\'a}s},
  title      = {A comparison of grapheme and phoneme-based units for {S}panish spoken term detection},
  journal    = {Speech Communication},
  volume     = {50},
  number     = {11-12},
  pages      = {980--991},
  abstract   = {The ever-increasing volume of audio data available online through the world wide web means that automatic methods for indexing and search are becoming essential. Hidden Markov model (HMM) keyword spotting and lattice search techniques are the two most common approaches used by such systems. In keyword spotting, models or templates are defined for each search term prior to accessing the speech and used to find matches. Lattice search (referred to as spoken term detection), uses a pre-indexing of speech data in terms of word or sub-word units, which can then quickly be searched for arbitrary terms without referring to the original audio. In both cases, the search term can be modelled in terms of sub-word units, typically phonemes. For in-vocabulary words (i.e. words that appear in the pronunciation dictionary), the letter-to-sound conversion systems are accepted to work well. However, for out-of-vocabulary (OOV) search terms, letter-to-sound conversion must be used to generate a pronunciation for the search term. This is usually a hard decision (i.e. not probabilistic and with no possibility of backtracking), and errors introduced at this step are difficult to recover from. We therefore propose the direct use of graphemes (i.e., letter-based sub-word units) for acoustic modelling. This is expected to work particularly well in languages such as Spanish, where despite the letter-to-sound mapping being very regular, the correspondence is not one-to-one, and there will be benefits from avoiding hard decisions at early stages of processing. In this article, we compare three approaches for Spanish keyword spotting or spoken term detection, and within each of these we compare acoustic modelling based on phone and grapheme units. Experiments were performed using the Spanish geographical-domain Albayzin corpus. Results achieved in the two approaches proposed for spoken term detection show us that trigrapheme units for acoustic modelling match or exceed the performance of phone-based acoustic models. In the method proposed for keyword spotting, the results achieved with each acoustic model are very similar.},
  categories = {Spoken term detection; Keyword spotting; Graphemes; Spanish},
  doi        = {10.1016/j.specom.2008.03.005},
  month      = nov # "--" # dec,
  year       = 2008
}
@inproceedings{tejedor_interspeech10,
  author     = {Javier Tejedor and Doroteo T. Toledano and Miguel Bautista and Simon King and Dong Wang and Jose Colas},
  title      = {Augmented set of features for confidence estimation in spoken term detection},
  booktitle  = {Proc. Interspeech},
  abstract   = {Discriminative confidence estimation along with confidence normalisation have been shown to construct robust decision maker modules in spoken term detection (STD) systems. Discriminative confidence estimation, making use of term-dependent features, has been shown to improve the widely used lattice-based confidence estimation in STD. In this work, we augment the set of these term-dependent features and show a significant improvement in the STD performance both in terms of ATWV and DET curves in experiments conducted on a Spanish geographical corpus. This work also proposes a multiple linear regression analysis to carry out the feature selection. Next, the most informative features derived from it are used within the discriminative confidence on the STD system.},
  categories = {confidence estimation, feature selection, spoken term detection, speech recognition},
  month      = sep,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/features.pdf},
  year       = 2010
}
@inproceedings{wang_icassp2011a,
  author     = {Dong Wang and Nicholas Evans and Raphael Troncy and Simon King},
  title      = {Handling overlaps in spoken term detection},
  booktitle  = {Proc. International Conference on Acoustics, Speech and Signal Processing},
  pages      = {5656--5659},
  abstract   = {Spoken term detection (STD) systems usually arrive at many overlapping detections which are often addressed with some pragmatic approaches, e.g. choosing the best detection to represent all the overlaps. In this paper we present a theoretical study based on a concept of acceptance space. In particular, we present two confidence estimation approaches based on Bayesian and evidence perspectives respectively. Analysis shows that both approaches possess respective advantages and shortcomings, and that their combination has the potential to provide an improved confidence estimation. Experiments conducted on meeting data confirm our analysis and show considerable performance improvement with the combined approach, in particular for out-of-vocabulary spoken term detection with stochastic pronunciation modeling.},
  categories = {spoken term detection, speech recognition},
  doi        = {10.1109/ICASSP.2011.5947643},
  month      = may,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_icassp2011a.pdf},
  year       = 2011
}