The Centre for Speech Technology Research, The University of Edinburgh

Publications by Dong Wang

dwang2.bib

@article{Wang_JCST2012,
  author = {Dong Wang and Javier Tejedor and Simon King and Joe
                   Frankel},
  title = {Term-dependent Confidence Normalization for
                   Out-of-Vocabulary Spoken Term Detection},
  journal = {Journal of Computer Science and Technology},
  volume = {27},
  number = {2},
  abstract = {Spoken Term Detection (STD) is a fundamental component
                   of spoken information retrieval systems. A key task of
                   an STD system is to determine reliable detections and
                   reject false alarms based on certain confidence
                   measures. The detection posterior probability, which is
                   often computed from lattices, is a widely used
                   confidence measure. However, a potential problem of
                   this confidence measure is that the confidence scores
                   of detections of all search terms are treated
                   uniformly, regardless of how much they may differ in
                   terms of phonetic or linguistic properties. This
                   problem is particularly evident for out-of-vocabulary
                   (OOV) terms which tend to exhibit high intra-term
                   diversity. To address the discrepancy on confidence
                   levels that the same confidence score may convey for
                   different terms, a term-dependent decision strategy is
                   desirable -- for example, the term-specific threshold
                   (TST) approach. In this work, we propose a
                   term-dependent normalisation technique which
                   compensates for term diversity on confidence
                   estimation. Particularly, we propose a linear bias
                   compensation and a discriminative compensation to deal
                   with the bias problem that is inherent in lattice-based
                   confidence measuring from which the TST approach
                   suffers. We tested the proposed technique on speech
                   data from the multi-party meeting domain with two
                   state-of-the-art STD systems based on phonemes and
                   words respectively. The experimental results
                   demonstrate that the confidence normalisation approach
                   leads to a significant performance improvement in STD,
                   particularly for OOV terms with phoneme-based systems.},
  doi = {10.1007/s11390-012-1228-x},
  year = 2012,
  internal-note = {NOTE(review): pages field missing -- confirm from the publisher record}
}
@inproceedings{wang_acmsccs2010,
  author = {Dong Wang and Simon King and Nick Evans and Raphael Troncy},
  title = {Direct Posterior Confidence For Out-of-Vocabulary Spoken
                   Term Detection},
  booktitle = {Proc. ACM Multimedia 2010 Searching Spontaneous
                   Conversational Speech Workshop},
  month = oct,
  year = 2010,
  doi = {10.1145/1878101.1878107},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang_acmsccs2010.pdf},
  categories = {confidence estimation, spoken term detection, speech
                   recognition},
  abstract = {Spoken term detection (STD) is a fundamental task in
                   spoken information retrieval. Compared to conventional
                   speech transcription and keyword spotting, STD is an
                   open-vocabulary task and is necessarily required to
                   address out-of-vocabulary (OOV) terms. Approaches based
                   on subword units, e.g. phonemes, are widely used to
                   solve the OOV issue; however, performance on OOV terms
                   is still significantly inferior to that for
                   in-vocabulary (INV) terms. The performance degradation
                   on OOV terms can be attributed to a multitude of
                   factors. A particular factor we address in this paper
                   is that the acoustic and language models used for
                   speech transcribing are highly vulnerable to OOV terms,
                   which leads to unreliable confidence measures and
                   error-prone detections. A direct posterior confidence
                   measure that is derived from discriminative models has
                   been proposed for STD. In this paper, we utilize this
                   technique to tackle the weakness of OOV terms in
                   confidence estimation. Neither acoustic models nor
                   language models being included in the computation, the
                   new confidence avoids the weak modeling problem with
                   OOV terms. Our experiments, set up on multi-party
                   meeting speech which is highly spontaneous and
                   conversational, demonstrate that the proposed technique
                   improves STD performance on OOV terms significantly;
                   when combined with conventional lattice-based
                   confidence, a significant improvement in performance is
                   obtained on both INVs and OOVs. Furthermore, the new
                   confidence measure technique can be combined together
                   with other advanced techniques for OOV treatment, such
                   as stochastic pronunciation modeling and term-dependent
                   confidence discrimination, which leads to an integrated
                   solution for OOV STD with greatly improved performance.}
}
@inproceedings{Wang_TOIS2012,
  author = {Wang, Dong and King, Simon and Evans, Nicholas W. D.
                   and Troncy, Rapha{\"e}l},
  title = {Direct posterior confidence for out-of-vocabulary
                   spoken term detection},
  booktitle = {{SSCS} 2010, {ACM} Workshop on Searching Spontaneous
                   Conversational Speech, September 20-24, 2010,
                   Firenze, Italy},
  address = {Firenze, Italy},
  abstract = {Spoken term detection (STD) is a fundamental task in
                   spoken information retrieval. Compared to conventional
                   speech transcription and keyword spotting, STD is an
                   open-vocabulary task and is necessarily required to
                   address out-of-vocabulary (OOV) terms. Approaches based
                   on subword units, e.g. phonemes, are widely used to
                   solve the OOV issue; however, performance on OOV terms
                   is still significantly inferior to that for
                   in-vocabulary (INV) terms. The performance degradation
                   on OOV terms can be attributed to a multitude of
                   factors. A particular factor we address in this paper
                   is that the acoustic and language models used for
                   speech transcribing are highly vulnerable to OOV terms,
                   which leads to unreliable confidence measures and
                   error-prone detections. A direct posterior confidence
                   measure that is derived from discriminative models has
                   been proposed for STD. In this paper, we utilize this
                   technique to tackle the weakness of OOV terms in
                   confidence estimation. Neither acoustic models nor
                   language models being included in the computation, the
                   new confidence avoids the weak modeling problem with
                   OOV terms. Our experiments, set up on multi-party
                   meeting speech which is highly spontaneous and
                   conversational, demonstrate that the proposed technique
                   improves STD performance on OOV terms significantly;
                   when combined with conventional lattice-based
                   confidence, a significant improvement in performance is
                   obtained on both INVs and OOVs. Furthermore, the new
                   confidence measure technique can be combined together
                   with other advanced techniques for OOV treatment, such
                   as stochastic pronunciation modeling and term-dependent
                   confidence discrimination, which leads to an integrated
                   solution for OOV STD with greatly improved performance.},
  doi = {10.1145/1878101.1878107},
  month = sep,
  year = 2010,
  internal-note = {NOTE(review): same DOI and paper as wang_acmsccs2010 --
                   duplicate entry, consider merging; also confirm month
                   (this entry says September, the duplicate says October)}
}
@inproceedings{wang:frankel:tejedor:king:icassp2008,
  author = {Dong Wang and Joe Frankel and Javier Tejedor and Simon
                   King},
  title = {A comparison of phone and grapheme-based spoken term
                   detection},
  booktitle = {Proc. ICASSP},
  pages = {4969--4972},
  abstract = {We propose grapheme-based sub-word units for spoken
                   term detection (STD). Compared to phones, graphemes
                   have a number of potential advantages. For
                   out-of-vocabulary search terms, phone- based approaches
                   must generate a pronunciation using letter-to-sound
                   rules. Using graphemes obviates this potentially
                   error-prone hard decision, shifting pronunciation
                   modelling into the statistical models describing the
                   observation space. In addition, long-span grapheme
                   language models can be trained directly from large text
                   corpora. We present experiments on Spanish and English
                   data, comparing phone and grapheme-based STD. For
                   Spanish, where phone and grapheme-based systems give
                   similar transcription word error rates (WERs),
                   grapheme-based STD significantly outperforms a phone-
                   based approach. The converse is found for English,
                   where the phone-based system outperforms a grapheme
                   approach. However, we present additional analysis which
                   suggests that phone-based STD performance levels may be
                   achieved by a grapheme-based approach despite lower
                   transcription accuracy, and that the two approaches may
                   usefully be combined. We propose a number of directions
                   for future development of these ideas, and suggest that
                   if grapheme-based STD can match phone-based
                   performance, the inherent flexibility in dealing with
                   out-of-vocabulary terms makes this a desirable
                   approach.},
  doi = {10.1109/ICASSP.2008.4518773},
  month = mar # "--" # apr,
  year = 2008
}
@article{5510125,
  author = {Wang, Dong and King, Simon and Frankel, Joe},
  title = {Stochastic Pronunciation Modelling for
                   Out-of-Vocabulary Spoken Term Detection},
  journal = {IEEE Transactions on Audio, Speech, and Language
                   Processing},
  volume = {PP},
  number = {99},
  abstract = {Spoken term detection (STD) is the name given to the
                   task of searching large amounts of audio for
                   occurrences of spoken terms, which are typically single
                   words or short phrases. One reason that STD is a hard
                   task is that search terms tend to contain a
                   disproportionate number of out-of-vocabulary (OOV)
                   words. The most common approach to STD uses subword
                   units. This, in conjunction with some method for
                   predicting pronunciations of OOVs from their written
                   form, enables the detection of OOV terms but
                   performance is considerably worse than for
                   in-vocabulary terms. This performance differential can
                   be largely attributed to the special properties of
                   OOVs. One such property is the high degree of
                   uncertainty in the pronunciation of OOVs. We present a
                   stochastic pronunciation model (SPM) which explicitly
                   deals with this uncertainty. The key insight is to
                   search for all possible pronunciations when detecting
                   an OOV term, explicitly capturing the uncertainty in
                   pronunciation. This requires a probabilistic model of
                   pronunciation, able to estimate a distribution over all
                   possible pronunciations. We use a joint-multigram model
                   (JMM) for this and compare the JMM-based SPM with the
                   conventional soft match approach. Experiments using
                   speech from the meetings domain demonstrate that the
                   SPM performs better than soft match in most operating
                   regions, especially at low false alarm probabilities.
                   Furthermore, SPM and soft match are found to be
                   complementary: their combination provides further
                   performance gains.},
  categories = {confidence estimation, spoken term detection, speech
                   recognition, OOVs},
  doi = {10.1109/TASL.2010.2058800},
  issn = {1558-7916},
  month = jul,
  year = 2010,
  internal-note = {NOTE(review): volume=PP, number=99 is the IEEE
                   early-access placeholder -- update volume/number/pages
                   once the final issue assignment is confirmed}
}
@inproceedings{dongwang_interspeech09_spm,
  author = {Dong Wang and Simon King and Joe Frankel},
  title = {Stochastic Pronunciation Modelling for Spoken Term
                   Detection},
  booktitle = {Proc. of Interspeech},
  pages = {2135--2138},
  address = {Brighton, UK},
  abstract = {A major challenge faced by a spoken term detection
                   (STD) system is the detection of out-of-vocabulary
                   (OOV) terms. Although a subword-based STD system is
                   able to detect OOV terms, performance reduction is
                   always observed compared to in-vocabulary terms.
                   Current approaches to STD do not acknowledge the
                   particular properties of OOV terms, such as
                   pronunciation uncertainty. In this paper, we use a
                   stochastic pronunciation model to deal with the
                   uncertain pronunciations of OOV terms. By considering
                   all possible term pronunciations, predicted by a
                   joint-multigram model, we observe a significant
                   performance improvement.},
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/spm.pdf},
  year = 2009
}
@inproceedings{dongwang_icassp09,
  author = {Dong Wang and Javier Tejedor and Joe Frankel and
                   Simon King},
  title = {Posterior-based confidence measures for spoken term
                   detection},
  booktitle = {Proc. ICASSP},
  address = {Taiwan},
  abstract = {Confidence measures play a key role in spoken term
                   detection (STD) tasks. The confidence measure expresses
                   the posterior probability of the search term appearing
                   in the detection period, given the speech. Traditional
                   approaches are based on the acoustic and language model
                   scores for candidate detections found using automatic
                   speech recognition, with Bayes' rule being used to
                   compute the desired posterior probability. In this
                   paper, we present a novel direct posterior-based
                   confidence measure which, instead of resorting to the
                   Bayesian formula, calculates posterior probabilities
                   from a multi-layer perceptron (MLP) directly. Compared
                   with traditional Bayesian-based methods, the
                   direct-posterior approach is conceptually and
                   mathematically simpler. Moreover, the MLP-based model
                   does not require assumptions to be made about the
                   acoustic features such as their statistical
                   distribution and the independence of static and dynamic
                   co-efficients. Our experimental results in both English
                   and Spanish demonstrate that the proposed direct
                   posterior-based confidence improves STD performance.},
  categories = {Spoken term detection, confidence measure, posterior
                   probabilities, MLP},
  month = apr,
  pages = {4889--4892},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/posterior.pdf},
  year = 2009
}
@inproceedings{dongwang_interspeech09_conf,
  author = {Dong Wang and Simon King and Joe Frankel and Peter
                   Bell},
  title = {Term-Dependent Confidence for Out-of-Vocabulary Term
                   Detection},
  booktitle = {Proc. Interspeech},
  pages = {2139--2142},
  address = {Brighton, UK},
  abstract = {Within a spoken term detection (STD) system, the
                   decision maker plays an important role in retrieving
                   reliable detections. Most of the state-of-the-art STD
                   systems make decisions based on a confidence measure
                   that is term-independent, which poses a serious problem
                   for out-of-vocabulary (OOV) term detection. In this
                   paper, we study a term-dependent confidence measure
                   based on confidence normalisation and discriminative
                   modelling, particularly focusing on its remarkable
                   effectiveness for detecting OOV terms. Experimental
                   results indicate that the term-dependent confidence
                   provides much more significant improvement for OOV
                   terms than terms in-vocabulary.},
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/conf.pdf},
  year = 2009
}
@inproceedings{dong_ivan_joe_simon_interspeech08_marray,
  author = {Dong Wang and Ivan Himawan and Joe Frankel and Simon
                   King},
  title = {A Posterior Approach for Microphone Array Based Speech
                   Recognition},
  booktitle = {Proc. Interspeech},
  pages = {996--999},
  abstract = {Automatic speech recognition (ASR) becomes rather
                   difficult in meetings domains because of the adverse
                   acoustic conditions, including more background noise,
                   more echo and reverberation and frequent cross-talking.
                   Microphone arrays have been demonstrated able to boost
                   ASR performance dramatically in such noisy and
                   reverberant environments, with various beamforming
                   algorithms. However, almost all existing beamforming
                   measures work in the acoustic domain, resorting to
                   signal processing theories and geometric explanation.
                   This limits their application, and induces significant
                   performance degradation when the geometric property is
                   unavailable or hard to estimate, or if heterogeneous
                   channels exist in the audio system. In this paper, we
                   present a new posterior-based approach for array-based
                   speech recognition. The main idea is, instead of
                   enhancing speech signals, we try to enhance the
                   posterior probabilities that frames belonging to
                   recognition units, e.g., phones. These enhanced
                   posteriors are then transferred to posterior
                   probability based features and are modeled by HMMs,
                   leading to a tandem ANN-HMM hybrid system presented by
                   Hermansky et al.. Experimental results demonstrated the
                   validity of this posterior approach. With the posterior
                   accumulation or enhancement, significant improvement
                   was achieved over the single channel baseline.
                   Moreover, we can combine the acoustic enhancement and
                   posterior enhancement together, leading to a hybrid
                   acoustic-posterior beamforming approach, which works
                   significantly better than just the acoustic
                   beamforming, especially in the scenario with
                   moving-speakers.},
  categories = {speech recognition, microphone array, beamforming,
                   tandem approach},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/marray.a.pdf},
  year = 2008
}
@inproceedings{wang_interspeech10,
  author = {Dong Wang and Simon King and Nick Evans and Raphael
                   Troncy},
  title = {{CRF}-based Stochastic Pronunciation Modelling for
                   Out-of-Vocabulary Spoken Term Detection},
  booktitle = {Proc. Interspeech},
  address = {Makuhari, Chiba, Japan},
  abstract = {Out-of-vocabulary (OOV) terms present a significant
                   challenge to spoken term detection (STD). This
                   challenge, to a large extent, lies in the high degree
                   of uncertainty in pronunciations of OOV terms. In
                   previous work, we presented a stochastic pronunciation
                   modeling (SPM) approach to compensate for this
                   uncertainty. A shortcoming of our original work,
                   however, is that the SPM was based on a joint-multigram
                   model (JMM), which is suboptimal. In this paper, we
                   propose to use conditional random fields (CRFs) for
                   letter-to-sound conversion, which significantly
                   improves quality of the predicted pronunciations. When
                   applied to OOV STD, we achieve considerable
                   performance improvement with both a 1-best system and
                   an SPM-based system.},
  categories = {speech recognition, spoken term detection, conditional
                   random field, joint multigram model},
  month = sep,
  year = 2010
}
@article{wang_ieeesigprocletters2011,
  author = {Dong Wang and Simon King},
  title = {Letter-to-Sound Pronunciation Prediction Using
                   Conditional Random Fields},
  journal = {IEEE Signal Processing Letters},
  volume = {18},
  number = {2},
  pages = {122--125},
  abstract = {Pronunciation prediction, or letter-to-sound (LTS)
                   conversion, is an essential task for speech synthesis,
                   open vocabulary spoken term detection and other
                   applications dealing with novel words. Most current
                   approaches (at least for English) employ data-driven
                   methods to learn and represent pronunciation ``rules''
                   using statistical models such as decision trees, hidden
                   Markov models (HMMs) or joint-multigram models (JMMs).
                   The LTS task remains challenging, particularly for
                   languages with a complex relationship between spelling
                   and pronunciation such as English. In this paper, we
                   propose to use a conditional random field (CRF) to
                   perform LTS because it avoids having to model a
                   distribution over observations and can perform global
                   inference, suggesting that it may be more suitable for
                   LTS than decision trees, HMMs or JMMs. One challenge in
                   applying CRFs to LTS is that the phoneme and grapheme
                   sequences of a word are generally of different lengths,
                   which makes CRF training difficult. To solve this
                   problem, we employed a joint-multigram model to
                   generate aligned training exemplars. Experiments
                   conducted with the AMI05 dictionary demonstrate that a
                   CRF significantly outperforms other models, especially
                   if n-best lists of predictions are generated.},
  categories = {letter-to-sound, conditional random field,
                   joint multigram model, speech synthesis, spoken term
                   detection},
  doi = {10.1109/LSP.2010.2098440},
  month = feb,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_ieeesigprocletters2011.pdf},
  year = 2011
}
@inproceedings{wang_std_covariance_icassp2010,
  author = {Wang, Dong and King, Simon and Frankel, Joe and Bell,
                   Peter},
  title = {Stochastic Pronunciation Modelling and Soft Match for
                   Out-of-vocabulary Spoken Term Detection},
  booktitle = {Proc. ICASSP},
  address = {Dallas, Texas, USA},
  abstract = {A major challenge faced by a spoken term detection
                   (STD) system is the detection of out-of-vocabulary
                   (OOV) terms. Although a subword-based STD system is
                   able to detect OOV terms, performance reduction is
                   always observed compared to in-vocabulary terms. One
                   challenge that OOV terms bring to STD is the
                   pronunciation uncertainty. A commonly used approach to
                   address this problem is a soft matching procedure, and
                   the other is the stochastic pronunciation modelling
                   (SPM) proposed by the authors. In this paper we compare
                   these two approaches, and combine them using a
                   discriminative decision strategy. Experimental results
                   demonstrated that SPM and soft match are highly
                   complementary, and their combination gives significant
                   performance improvement to OOV term detection.},
  categories = {confidence estimation, spoken term detection, speech
                   recognition},
  month = mar,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang10_icassp.pdf},
  year = 2010
}
@inproceedings{joe_dong_simon_interspeech08_bottle,
  author = {Joe Frankel and Dong Wang and Simon King},
  title = {Growing bottleneck features for tandem {ASR}},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = 2008,
  pages = {1549},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bottlenet.a.pdf},
  categories = {tandem ASR, bottleneck MLP},
  abstract = {We present a method for training bottleneck MLPs for
                   use in tandem ASR. Experiments on meetings data show
                   that this approach leads to improved performance
                   compared with training MLPs from a random
                   initialization.}
}
@inproceedings{dongwang_interspeech09_cmb,
  author = {Javier Tejedor and Dong Wang and Simon King and Joe
                   Frankel and Jose Colas},
  title = {A Posterior Probability-based System Hybridisation and
                   Combination for Spoken Term Detection},
  booktitle = {Proc. Interspeech},
  pages = {2131--2134},
  address = {Brighton, UK},
  abstract = {Spoken term detection (STD) is a fundamental task for
                   multimedia information retrieval. To improve the
                   detection performance, we have presented a direct
                   posterior-based confidence measure generated from a
                   neural network. In this paper, we propose a
                   detection-independent confidence estimation based on
                   the direct posterior confidence measure, in which the
                   decision making is totally separated from the term
                   detection. Based on this idea, we first present a
                   hybrid system which conducts the term detection and
                   confidence estimation based on different sub-word
                   units, and then propose a combination method which
                   merges detections from heterogeneous term detectors
                   based on the direct posterior-based confidence.
                   Experimental results demonstrated that the proposed
                   methods improved system performance considerably for
                   both English and Spanish.},
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cmb.pdf},
  year = 2009
}
@article{tejedor:wang:frankel:king:colas:specom2008,
  author = {Javier Tejedor and Dong Wang and Joe Frankel and Simon
                   King and Jos{\'e} Col{\'a}s},
  title = {A comparison of grapheme and phoneme-based units for
                   {S}panish spoken term detection},
  journal = {Speech Communication},
  volume = {50},
  number = {11-12},
  pages = {980--991},
  abstract = {The ever-increasing volume of audio data available
                   online through the world wide web means that automatic
                   methods for indexing and search are becoming essential.
                   Hidden Markov model (HMM) keyword spotting and lattice
                   search techniques are the two most common approaches
                   used by such systems. In keyword spotting, models or
                   templates are defined for each search term prior to
                   accessing the speech and used to find matches. Lattice
                   search (referred to as spoken term detection), uses a
                   pre-indexing of speech data in terms of word or
                   sub-word units, which can then quickly be searched for
                   arbitrary terms without referring to the original
                   audio. In both cases, the search term can be modelled
                   in terms of sub-word units, typically phonemes. For
                   in-vocabulary words (i.e. words that appear in the
                   pronunciation dictionary), the letter-to-sound
                   conversion systems are accepted to work well. However,
                   for out-of-vocabulary (OOV) search terms,
                   letter-to-sound conversion must be used to generate a
                   pronunciation for the search term. This is usually a
                   hard decision (i.e. not probabilistic and with no
                   possibility of backtracking), and errors introduced at
                   this step are difficult to recover from. We therefore
                   propose the direct use of graphemes (i.e., letter-based
                   sub-word units) for acoustic modelling. This is
                   expected to work particularly well in languages such as
                   Spanish, where despite the letter-to-sound mapping
                   being very regular, the correspondence is not
                   one-to-one, and there will be benefits from avoiding
                   hard decisions at early stages of processing. In this
                   article, we compare three approaches for Spanish
                   keyword spotting or spoken term detection, and within
                   each of these we compare acoustic modelling based on
                   phone and grapheme units. Experiments were performed
                   using the Spanish geographical-domain Albayzin corpus.
                   Results achieved in the two approaches proposed for
                   spoken term detection show us that trigrapheme units
                   for acoustic modelling match or exceed the performance
                   of phone-based acoustic models. In the method proposed
                   for keyword spotting, the results achieved with each
                   acoustic model are very similar.},
  categories = {spoken term detection, keyword spotting, graphemes,
                   Spanish},
  doi = {10.1016/j.specom.2008.03.005},
  month = nov # "--" # dec,
  year = 2008
}
@inproceedings{tejedor_interspeech10,
  author = {Javier Tejedor and Doroteo T. Toledano and Miguel
                   Bautista and Simon King and Dong Wang and Jose Colas},
  title = {Augmented set of features for confidence estimation in
                   spoken term detection},
  booktitle = {Proc. Interspeech},
  abstract = {Discriminative confidence estimation along with
                   confidence normalisation have been shown to construct
                   robust decision maker modules in spoken term detection
                   (STD) systems. Discriminative confidence estimation,
                   making use of term-dependent features, has been shown to
                   improve the widely used lattice-based confidence
                   estimation in STD. In this work, we augment the set of
                   these term-dependent features and show a significant
                   improvement in the STD performance both in terms of
                   ATWV and DET curves in experiments conducted on a
                   Spanish geographical corpus. This work also proposes a
                   multiple linear regression analysis to carry out the
                   feature selection. Next, the most informative features
                   derived from it are used within the discriminative
                   confidence on the STD system.},
  categories = {confidence estimation, feature selection, spoken term
                   detection, speech recognition},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/features.pdf},
  year = 2010
}
@inproceedings{wang_icassp2011a,
  author = {Dong Wang and Nicholas Evans and Raphael Troncy and
                   Simon King},
  title = {Handling overlaps in spoken term detection},
  booktitle = {Proc. International Conference on Acoustics, Speech
                   and Signal Processing},
  pages = {5656--5659},
  abstract = {Spoken term detection (STD) systems usually arrive at
                   many overlapping detections which are often addressed
                   with some pragmatic approaches, e.g. choosing the best
                   detection to represent all the overlaps. In this paper
                   we present a theoretical study based on a concept of
                   acceptance space. In particular, we present two
                   confidence estimation approaches based on Bayesian and
                   evidence perspectives respectively. Analysis shows that
                   both approaches possess respective advantages and
                   shortcomings, and that their combination has the
                   potential to provide an improved confidence estimation.
                   Experiments conducted on meeting data confirm our
                   analysis and show considerable performance improvement
                   with the combined approach, in particular for
                   out-of-vocabulary spoken term detection with stochastic
                   pronunciation modeling.},
  categories = {spoken term detection, speech recognition},
  doi = {10.1109/ICASSP.2011.5947643},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_icassp2011a.pdf},
  year = 2011
}