

Publications of EdSST fellows

@inproceedings{cabral_yrwst,
  author = {J. Cabral and S. Renals and K. Richmond and J. Yamagishi},
  title = {{HMM}-based Speech Synthesis with an Acoustic Glottal
                   Source Model},
  booktitle = {The First Young Researchers Workshop in Speech
                   Technology},
  abstract = {A major cause of degradation of speech quality in
                   HMM-based speech synthesis is the use of a simple delta
                   pulse signal to generate the excitation of voiced
                   speech. This paper describes a new approach to using an
                   acoustic glottal source model in HMM-based
                   synthesisers. The goal is to improve speech quality and
                   parametric flexibility to better model and transform
                   voice characteristics.},
  categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral
                   Separation},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf},
  year = 2009
}
@article{liker2008,
  author = {Marko Liker and Damir Horga and Ines Safaric},
  title = {Koartikulacijski pritisak i koartikulacijski otpor: ultrazvučno istraživanje (Coarticulatory pressure and coarticulatory resistance: an ultrasound study)},
  journal = {Govor/Speech},
  year = 2008,
  key = {liker2008},
  volume = 25,
  number = 2,
  pages = {171--188}
}
@inproceedings{tietze:09,
  author = {Martin I. Tietze and Andi Winterboer and Johanna D.
                   Moore},
  title = {The effect of linguistic devices in information
                   presentation messages on recall and comprehension},
  booktitle = {Proc. ENLG 2009},
  categories = {discourse cues, verbal information presentation,
                   recall, eye-tracking, Mechanical Turk},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/tietze.ENLG09.pdf},
  year = 2009
}
@inproceedings{kocjancic_issp08,
  author = {Kocjancic, Tanja},
  title = {Ultrasound investigation of tongue movements in
                   syllables with different onset structure},
  booktitle = {Proc. of the Eighth International Seminar on Speech
                   Production (ISSP)},
  abstract = {This study is an attempt to describe syllables with
                   different onset structure not only in terms of
                   durational changes but also, using ultrasound, in terms
                   of the distance the tongue travels over a syllable, and
                   to compare the ratio between the two parameters,
                   expressed as speed. Results indicate that both measures
                   increase with an increasing number of onset segments,
                   but not to the same degree for all targets; speed was
                   therefore not constant across them. Additionally, the
                   type of onset constituent greatly influenced all three
                   parameters, and there were large between-speaker
                   similarities in the case of durational changes.},
  categories = {tongue movements, ultrasound},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/Kocjancic_ISSP_2008.pdf},
  year = 2008
}
@inproceedings{kocjancic_exling08,
  author = {Kocjancic, Tanja},
  title = {Tongue movement and syllable onset complexity:
                   ultrasound study},
  booktitle = {Proc. of ISCA Experimental Linguistics ExLing 2008},
  abstract = {In this study ultrasound was used to investigate
                   tongue movements in syllables with different number and
                   type of onset consonants. Ultrasound recordings
                   provided information on the distance the tongue
                   travels over a target, and audio recordings of the time
                   needed. The speed of the tongue's travel was calculated
                   from the two measurements. Results of ten speakers have
                   shown that both duration and distance travelled
                   increase with an increased number of onset segments,
                   but that distance travelled is additionally influenced
                   by the type of segment, as is speed. Duration also
                   seemed to be the least speaker-dependent of the three
                   parameters.},
  categories = {tongue movements, ultrasound},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/Kocjancic_ISCA_ExLing_2008.pdf},
  year = 2008
}
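A minimal sketch of the speed measure used in the two Kocjancic entries above: tongue-path distance (from ultrasound) divided by duration (from audio). All numbers and condition labels below are invented for illustration; they are not data from the papers.

def movement_speed(distance_mm, duration_s):
    """Speed of tongue travel over a syllable, in mm/s."""
    if duration_s <= 0:
        raise ValueError("duration must be positive")
    return distance_mm / duration_s

# Hypothetical tokens: distance from ultrasound tracking, duration from audio.
tokens = {"CV": (18.5, 0.210), "CCV": (24.0, 0.265), "CCCV": (27.3, 0.300)}
for onset, (dist, dur) in tokens.items():
    print(onset, round(movement_speed(dist, dur), 1), "mm/s")
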
@inproceedings{ling:richmond:yamagishi:wang:2008a,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi and Wang, Ren-Hua },
  title = {Articulatory Control of {HMM}-based Parametric Speech
                   Synthesis Driven by Phonetic Knowledge},
  booktitle = {Proc. Interspeech},
  pages = {573--576},
  address = {Brisbane, Australia},
  abstract = {This paper presents a method to control the
                   characteristics of synthetic speech flexibly by
                   integrating articulatory features into a Hidden Markov
                   Model (HMM)-based parametric speech synthesis system.
                   In contrast to model adaptation and interpolation
                   approaches for speaking style control, this method is
                   driven by phonetic knowledge, and target speech samples
                   are not required. The joint distribution of parallel
                   acoustic and articulatory features considering
                   cross-stream feature dependency is estimated. At
                   synthesis time, acoustic and articulatory features are
                   generated simultaneously based on the
                   maximum-likelihood criterion. The synthetic speech can
                   be controlled flexibly by modifying the generated
                   articulatory features according to arbitrary phonetic
                   rules in the parameter generation process. Our
                   experiments show that the proposed method is effective
                   in both changing the overall character of synthesized
                   speech and in controlling the quality of a specific
                   vowel.},
  categories = {speech synthesis, HMM, articulatory features, phonetic
                   knowledge},
  key = {ling:richmond:yamagishi:wang:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080582.PDF},
  year = 2008
}
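The control mechanism described in the Ling et al. entry above hinges on a joint Gaussian over acoustic and articulatory features: once the articulatory values are fixed, or modified by a phonetic rule, acoustic parameters can be generated from the conditional distribution. A minimal numpy sketch of that conditioning step, with toy means and covariances standing in for a trained model (none of these values come from the paper):

import numpy as np

# Joint Gaussian over [acoustic; articulatory] features for one state.
# All means and covariances below are illustrative, not trained values.
mu_a = np.array([1.0, -0.5])          # acoustic mean (e.g. two cepstral dims)
mu_x = np.array([0.2])                # articulatory mean (e.g. tongue height)
Sigma_ax = np.array([[0.10],
                     [-0.08]])        # acoustic-articulatory cross-covariance
Sigma_xx = np.array([[0.15]])         # articulatory covariance

def acoustic_given_articulatory(x):
    """Conditional mean of acoustic features given articulatory features x."""
    gain = Sigma_ax @ np.linalg.inv(Sigma_xx)     # regression of acoustics on x
    return mu_a + (gain @ (x - mu_x))

print(acoustic_given_articulatory(np.array([0.2])))   # x at its mean
print(acoustic_given_articulatory(np.array([0.6])))   # x raised by a phonetic rule
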
@inproceedings{tietze:08:sci,
  author = {Martin Tietze and Vera Demberg and Johanna D. Moore},
  title = {Syntactic Complexity induces Explicit Grounding in the
                   {MapTask} corpus},
  booktitle = {Proc. Interspeech},
  abstract = {This paper provides evidence for theories of grounding
                   and dialogue management in human conversation. For each
                   utterance in a corpus of task-oriented dialogues, we
                   calculated integration costs, which are based on
                   syntactic sentence complexity. We compared the
                   integration costs and grounding behavior under two
                   conditions, namely face-to-face and a no-eye-contact
                   condition. The results show that integration costs were
                   significantly higher for explicitly grounded utterances
                   in the no-eye-contact condition, but not in the
                   face-to-face condition.},
  categories = {dialogue, syntactic complexity, grounding},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081130.pdf},
  year = 2008
}
@inproceedings{wang:frankel:tejedor:king:icassp2008,
  author = {Dong Wang and Joe Frankel and Javier Tejedor and Simon
                   King},
  title = {A comparison of phone and grapheme-based spoken term
                   detection},
  booktitle = {Proc. ICASSP},
  pages = {4969--4972 },
  abstract = {We propose grapheme-based sub-word units for spoken
                   term detection (STD). Compared to phones, graphemes
                   have a number of potential advantages. For
                   out-of-vocabulary search terms, phone-based approaches
                   must generate a pronunciation using letter-to-sound
                   rules. Using graphemes obviates this potentially
                   error-prone hard decision, shifting pronunciation
                   modelling into the statistical models describing the
                   observation space. In addition, long-span grapheme
                   language models can be trained directly from large text
                   corpora. We present experiments on Spanish and English
                   data, comparing phone and grapheme-based STD. For
                   Spanish, where phone and grapheme-based systems give
                   similar transcription word error rates (WERs),
                   grapheme-based STD significantly outperforms a
                   phone-based approach. The converse is found for English,
                   where the phone-based system outperforms a grapheme
                   approach. However, we present additional analysis which
                   suggests that phone-based STD performance levels may be
                   achieved by a grapheme-based approach despite lower
                   transcription accuracy, and that the two approaches may
                   usefully be combined. We propose a number of directions
                   for future development of these ideas, and suggest that
                   if grapheme-based STD can match phone-based
                   performance, the inherent flexibility in dealing with
                   out-of-vocabulary terms makes this a desirable
                   approach.},
  doi = {10.1109/ICASSP.2008.4518773},
  month = {March-April},
  year = 2008
}
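A practical attraction of grapheme units in the Wang et al. entry above is that an out-of-vocabulary search term needs no letter-to-sound step: its subword sequence is read straight off the spelling. A toy sketch of that decomposition (the function name and the choice of trigraphemes are illustrative, not the paper's implementation):

def grapheme_units(term, n=3):
    """Split a search term into overlapping n-grapheme units (trigraphemes by default)."""
    letters = [c for c in term.lower() if c.isalpha()]
    if len(letters) < n:
        return ["".join(letters)]
    return ["".join(letters[i:i + n]) for i in range(len(letters) - n + 1)]

# An out-of-vocabulary name is handled without any pronunciation lookup.
print(grapheme_units("Edinburgh"))   # ['edi', 'din', 'inb', 'nbu', 'bur', 'urg', 'rgh']
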
@inproceedings{robust-hts,
  author = {Junichi Yamagishi and Zhenhua Ling and Simon King},
  title = {Robustness of HMM-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2008},
  pages = {581--584},
  address = {Brisbane, Australia},
  abstract = {As speech synthesis techniques become more advanced,
                   we are able to consider building high-quality voices
                   from data collected outside the usual highly-controlled
                   recording studio environment. This presents new
                   challenges that are not present in conventional
                   text-to-speech synthesis: the available speech data are
                   not perfectly clean, the recording conditions are not
                   consistent, and/or the phonetic balance of the material
                   is not ideal. Although a clear picture of the
                   performance of various speech synthesis techniques
                   (e.g., concatenative, HMM-based or hybrid) under good
                   conditions is provided by the Blizzard Challenge, it is
                   not well understood how robust these algorithms are to
                   less favourable conditions. In this paper, we analyse
                   the performance of several speech synthesis methods
                   under such conditions. This is, as far as we know, a
                   new research topic: ``Robust speech synthesis.'' As a
                   consequence of our investigations, we propose a new
                   robust training method for HMM-based speech
                   synthesis, for use with speech data collected in
                   unfavourable conditions.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice,
                   unit selection},
  key = {robust-hts},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/RobustnessHTS.pdf},
  year = 2008
}
@article{ling2008,
  author = {Ling, Z. and Richmond, K. and Yamagishi, J. and Wang,
                   R.},
  title = {Integrating Articulatory Features into {HMM}-based
                   Parametric Speech Synthesis},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing },
  note = {Accepted for publication},
  abstract = {This paper presents an investigation of ways to
                   integrate articulatory features into Hidden Markov
                   Model (HMM)-based parametric speech synthesis,
                   primarily with the aim of improving the performance of
                   acoustic parameter generation. The joint distribution
                   of acoustic and articulatory features is estimated
                   during training and is then used for parameter
                   generation at synthesis time in conjunction with a
                   maximum-likelihood criterion. Different model
                   structures are explored to allow the articulatory
                   features to influence acoustic modeling: model
                   clustering, state synchrony and cross-stream feature
                   dependency. The results of objective evaluation show
                   that the accuracy of acoustic parameter prediction can
                   be improved when shared clustering and
                   asynchronous-state model structures are adopted for
                   combined acoustic and articulatory features. More
                   significantly, our experiments demonstrate that
                   modeling the dependency between these two feature
                   streams can make speech synthesis more flexible. The
                   characteristics of synthetic speech can be easily
                   controlled by modifying generated articulatory features
                   as part of the process of acoustic parameter
                   generation.},
  categories = {Speech synthesis, articulation, HMM-based synthesis},
  key = {ling2008},
  year = 2009
}
@inproceedings{qin:perpinan:richmond:wrench:renals:2008a,
  author = {Qin, C. and Carreira-Perpiñán, M. and Richmond, K.
                   and Wrench, A. and Renals, S.},
  title = {Predicting Tongue Shapes from a Few Landmark Locations},
  booktitle = {Proc. Interspeech},
  pages = {2306--2309},
  address = {Brisbane, Australia},
  abstract = {We present a method for predicting the midsagittal
                   tongue contour from the locations of a few landmarks
                   (metal pellets) on the tongue surface, as used in
                   articulatory databases such as MOCHA and the Wisconsin
                   XRDB. Our method learns a mapping using ground-truth
                   tongue contours derived from ultrasound data and
                   drastically improves over spline interpolation. We also
                   determine the optimal locations of the landmarks, and
                   the number of landmarks required to achieve a desired
                   prediction error: 3-4 landmarks are enough to achieve
                   0.3-0.2 mm error per point on the tongue.},
  categories = {ultrasound, tongue contour, articulation},
  key = {qin:perpinan:richmond:wrench:renals:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080929.PDF},
  year = 2008
}
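The prediction task in the Qin et al. entry above can be pictured, in its simplest form, as a regression from a few landmark coordinates to a densely sampled contour, trained on ground-truth contours; the paper's actual model is more sophisticated than this. A hedged numpy sketch on synthetic data, purely to show the shape of the computation:

import numpy as np

rng = np.random.default_rng(0)

# Synthetic "tongue contours": 20 points each, driven by two hidden factors.
n_train, n_points, landmark_idx = 200, 20, [3, 10, 16]
factors = rng.normal(size=(n_train, 2))
basis = rng.normal(size=(2, n_points))
contours = factors @ basis + 0.01 * rng.normal(size=(n_train, n_points))
landmarks = contours[:, landmark_idx]           # stand-ins for pellet positions

# Least-squares map from landmarks (plus bias) to the full contour.
X = np.hstack([landmarks, np.ones((n_train, 1))])
W, *_ = np.linalg.lstsq(X, contours, rcond=None)

# Predict a held-out contour from its landmarks and report the per-point error.
test = rng.normal(size=(1, 2)) @ basis
pred = np.hstack([test[:, landmark_idx], np.ones((1, 1))]) @ W
print("mean abs error per point:", np.abs(pred - test).mean())
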
@article{hts2007-junichi,
  author = {Junichi Yamagishi and Takashi Nose and Zhen-Hua Ling
                   and Heiga Zen and Tomoki Toda and Keiichi Tokuda and
                   Simon King and Steve Renals},
  title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech
                   Synthesis},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  note = {In review},
  abstract = {This paper describes a speaker-adaptive HMM-based
                   speech synthesis system. The new system, called
                   ``HTS-2007'', employs speaker adaptation (CSMAPLR+MAP),
                   feature-space adaptive training, mixed-gender modeling,
                   and full-covariance modeling using CSMAPLR transforms,
                   in addition to several other techniques that have
                   proved effective in our previous systems. Subjective
                   evaluation results show that the new system generates
                   significantly better quality synthetic speech than
                   speaker-dependent approaches with realistic amounts of
                   speech data, and that it bears comparison with
                   speaker-dependent approaches even when large amounts of
                   speech data are available. In addition, a comparison
                   study with several speech synthesis techniques shows
                   the new system is very robust: It is able to build
                   voices from less-than-ideal speech data and synthesize
                   good-quality speech even for out-of-domain sentences.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice},
  key = {hts2007-junichi},
  year = 2008
}
@inproceedings{leo_08-3,
  author = {J. Sebastian Andersson and Leonardo Badino and Oliver
                   S. Watts and Matthew P. Aylett},
  title = {The {CSTR/Cereproc B}lizzard Entry 2008: The
                   Inconvenient Data},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc.
                   Interspeech 2008)},
  address = {Brisbane, Australia},
  abstract = {In a commercial system, the data used for unit
                   selection is collected with a heavy emphasis on
                   homogeneous neutral data that has sufficient coverage
                   for the units that will be used in the system. In this
                   year's Blizzard entry, CSTR and CereProc present a
                   joint entry in which the emphasis has been to explore
                   techniques for dealing with data which is not
                   homogeneous (the English entry) or does not have
                   appropriate coverage for a diphone-based system (the
                   Mandarin entry, where tone/phone combinations were
                   treated as distinct phone categories). In addition, two
                   further problems were addressed: 1) making use of
                   non-homogeneous data to create a voice that can realise
                   both expressive and neutral speaking styles (the
                   English entry); 2) building a unit selection system
                   with no native understanding of the language, depending
                   instead on external native evaluation (the Mandarin
                   entry).},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/cstr-cereproc_Blizzard2008.pdf},
  year = 2008
}
@inproceedings{dong_ivan_joe_simon_interspeech08_marray,
  author = {Dong Wang and Ivan Himawan and Joe Frankel and Simon
                   King},
  title = {A Posterior Approach for Microphone Array Based Speech
                   Recognition},
  booktitle = {Proc. Interspeech},
  pages = {996--999},
  abstract = {Automatic speech recognition (ASR) becomes considerably
                   harder in the meetings domain because of adverse
                   acoustic conditions, including more background noise,
                   more echo and reverberation, and frequent cross-talk.
                   Microphone arrays, with various beamforming algorithms,
                   have been shown to boost ASR performance dramatically
                   in such noisy and reverberant environments. However,
                   almost all existing beamforming methods work in the
                   acoustic domain, relying on signal processing theory
                   and geometric reasoning. This limits their application,
                   and induces significant performance degradation when
                   the array geometry is unavailable or hard to estimate,
                   or when heterogeneous channels exist in the audio
                   system. In this paper, we present a new posterior-based
                   approach to array-based speech recognition. The main
                   idea is that, instead of enhancing the speech signals,
                   we enhance the posterior probabilities that frames
                   belong to recognition units, e.g., phones. These
                   enhanced posteriors are then converted to
                   posterior-probability-based features and modelled by
                   HMMs, giving the tandem ANN-HMM hybrid approach
                   presented by Hermansky et al. Experimental results
                   demonstrate the validity of this posterior approach:
                   with posterior accumulation or enhancement, significant
                   improvement was achieved over the single-channel
                   baseline. Moreover, acoustic enhancement and posterior
                   enhancement can be combined, leading to a hybrid
                   acoustic-posterior beamforming approach which works
                   significantly better than acoustic beamforming alone,
                   especially in scenarios with moving speakers.},
  categories = {speech recognition, microphone array, beamforming,
                   tandem approach},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/marray.a.pdf},
  year = 2008
}
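The central idea in the Wang et al. microphone-array entry above is to combine evidence across channels in the posterior domain rather than the signal domain. A minimal numpy sketch of per-frame posterior accumulation over channels; the array shapes and the simple arithmetic averaging rule are illustrative and need not match the paper's exact enhancement scheme:

import numpy as np

rng = np.random.default_rng(1)

# Toy per-channel phone posteriors: (channels, frames, phones), each row sums to 1.
channels, frames, phones = 4, 5, 10
logits = rng.normal(size=(channels, frames, phones))
posteriors = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)

# Accumulate across channels; the combined frame posteriors still sum to 1.
combined = posteriors.mean(axis=0)
print(combined.shape)          # (5, 10)
print(combined.sum(axis=-1))   # ones
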
@inproceedings{steiner:richmond:2008a,
  author = {Steiner, I. and Richmond, K.},
  title = {Generating gestural timing from {EMA} data using
                   articulatory resynthesis},
  booktitle = {Proc. 8th International Seminar on Speech Production},
  address = {Strasbourg, France},
  abstract = {As part of ongoing work to integrate an articulatory
                   synthesizer into a modular TTS platform, a method is
                   presented which allows gestural timings to be generated
                   automatically from EMA data. Further work is outlined
                   which will adapt the vocal tract model and phoneset to
                   English using new articulatory data, and use
                   statistical trajectory models. },
  categories = {articulatory synthesis, EMA, VocalTractLab },
  key = {steiner:richmond:2008a},
  month = dec,
  year = 2008
}
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
  author = {Cabral, J. and Renals, S. and Richmond, K. and
                   Yamagishi, J.},
  title = {Glottal Spectral Separation for Parametric Speech
                   Synthesis},
  booktitle = {Proc. Interspeech},
  pages = {1829--1832},
  address = {Brisbane, Australia},
  categories = {HMM speech synthesis, Glottal Spectral Separation,
                   LF-model},
  key = {cabral:renals:richmond:yamagishi:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
  year = 2008
}
@inproceedings{leo_07-2,
  author = {Matthew P. Aylett and J. Sebastian Andersson and
                   Leonardo Badino and Christopher J. Pidcock},
  title = {The {C}erevoice {B}lizzard Entry 2007: Are Small
                   Database Errors Worse than Compression Artifacts?},
  booktitle = {Proc. Blizzard Challenge Workshop 2007},
  address = {Bonn, Germany},
  abstract = {In commercial systems the memory footprint of unit
                   selection systems is often a key issue. This is
                   especially true for PDAs and other embedded devices. In
                   this year's Blizzard entry, CereProc gave itself the
                   criterion that the full-database system entered would
                   have a smaller memory footprint than either of the two
                   smaller database entries. This was accomplished by
                   applying Speex speech compression to the full database
                   entry. In turn, a set of small-database techniques used
                   to improve the quality of small-database systems in
                   last year's entry was extended. Finally, for all
                   systems, two quality control methods were applied to
                   the underlying database to improve the lexicon and
                   transcription match to the underlying data. Results
                   suggest that mild audio quality artifacts introduced by
                   lossy compression have almost as much impact on MOS
                   perceived quality as concatenation errors introduced by
                   sparse data in the smaller systems with bulked
                   diphones.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_004.pdf},
  year = 2007
}
@inproceedings{joe_dong_simon_interspeech08_bottle,
  author = {Joe Frankel and Dong Wang and Simon King},
  title = {Growing bottleneck features for tandem {ASR}},
  booktitle = {Proc. Interspeech},
  pages = {1549},
  abstract = { We present a method for training bottleneck MLPs for
                   use in tandem ASR. Experiments on meetings data show
                   that this approach leads to improved performance
                   compared with training MLPs from a random
                   initialization. },
  categories = {tandem ASR, bottleneck MLP},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bottlenet.a.pdf},
  month = sep,
  year = 2008
}
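A bottleneck MLP of the kind used for tandem features in the Frankel et al. entry above is a multi-layer network with one deliberately narrow hidden layer whose activations become the features passed on to the HMM system. A minimal numpy forward-pass sketch; the layer sizes and random weights are arbitrary, and the paper's growing procedure and training are not shown:

import numpy as np

rng = np.random.default_rng(2)

def layer(n_in, n_out):
    return rng.normal(scale=0.1, size=(n_in, n_out)), np.zeros(n_out)

# 39-dim acoustic input -> wide hidden -> 26-dim bottleneck -> wide hidden -> targets.
sizes = [39, 500, 26, 500, 45]
weights = [layer(a, b) for a, b in zip(sizes[:-1], sizes[1:])]

def forward(x, upto=None):
    """Run the MLP; if `upto` is given, stop after that layer and return its activations."""
    h = x
    for i, (W, b) in enumerate(weights):
        h = np.tanh(h @ W + b)
        if upto is not None and i == upto:
            return h
    return h

frame = rng.normal(size=39)
bottleneck_features = forward(frame, upto=1)   # activations of the 26-unit layer
print(bottleneck_features.shape)               # (26,)
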
@inproceedings{cabral07,
  author = {J. Cabral and S. Renals and K. Richmond and J.
                   Yamagishi},
  title = {Towards an Improved Modeling of the Glottal Source in
                   Statistical Parametric Speech Synthesis},
  booktitle = {Proc. of the 6th ISCA Workshop on Speech Synthesis},
  address = {Bonn, Germany},
  abstract = {This paper proposes the use of the Liljencrants-Fant
                   model (LF-model) to represent the glottal source signal
                   in HMM-based speech synthesis systems. These systems
                   generally use a pulse train to model the periodicity of
                   the excitation signal of voiced speech. However, this
                   model produces a strong and uniform harmonic structure
                   throughout the spectrum of the excitation which makes
                   the synthetic speech sound buzzy. The use of a mixed
                   band excitation and phase manipulation reduces this
                   effect but it can result in degradation of the speech
                   quality if the noise component is not weighted
                   carefully. In turn, the LF-waveform has a decaying
                   spectrum at higher frequencies, which is more similar
                   to the real glottal source excitation signal. We
                   conducted a perceptual experiment to test the
                   hypothesis that the LF-model can perform as well as or
                   better than the pulse train in an HMM-based speech
                   synthesizer. In the synthesis, we used the mean values
                   of the LF-parameters, calculated by measurements of the
                   recorded speech. The result of this study is important
                   not only for improving the speech quality of
                   this type of system, but also because the LF-model
                   can be used to model many characteristics of the
                   glottal source, such as voice quality, which are
                   important for voice transformation and generation of
                   expressive speech.},
  categories = {LF-model, Statistical parametric speech synthesis,
                   HMM-based speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
  year = 2007
}
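The Cabral et al. entries above replace the delta-pulse voiced excitation with the Liljencrants-Fant (LF) glottal source. Below is a heavily hedged numpy sketch of the usual two-segment shape of the LF flow-derivative waveform (an exponentially growing sinusoid up to the main excitation instant, followed by an exponential return phase); the timing and amplitude values are invented, and the continuity and area constraints that a real LF implementation solves for are skipped:

import numpy as np

fs = 16000                                     # sample rate (Hz)
T0 = 1.0 / 120                                 # pitch period for an illustrative 120 Hz voice
Tp, Te, Ta = 0.40 * T0, 0.55 * T0, 0.03 * T0   # made-up LF timing parameters
E0, alpha, Ee = 1.0, 600.0, 0.8                # made-up amplitude and growth values
eps = 1.0 / Ta                                 # crude return-phase constant (not a proper LF fit)

t = np.arange(0.0, T0, 1.0 / fs)
wg = np.pi / Tp                                # angular frequency of the open-phase sinusoid
open_phase = E0 * np.exp(alpha * t) * np.sin(wg * t)
return_phase = -(Ee / (eps * Ta)) * (np.exp(-eps * (t - Te)) - np.exp(-eps * (T0 - Te)))
e = np.where(t <= Te, open_phase, return_phase)
print(len(e), float(e.max()), float(e.min()))
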
@article{tejedor:wang:frankel:king:colas:specom2008,
  author = {Javier Tejedor and Dong Wang and Joe Frankel and Simon
                   King and José Colás},
  title = {A comparison of grapheme and phoneme-based units for
                   {S}panish spoken term detection},
  journal = {Speech Communication},
  volume = {50},
  number = {11-12},
  pages = {980--991},
  abstract = {The ever-increasing volume of audio data available
                   online through the world wide web means that automatic
                   methods for indexing and search are becoming essential.
                   Hidden Markov model (HMM) keyword spotting and lattice
                   search techniques are the two most common approaches
                   used by such systems. In keyword spotting, models or
                   templates are defined for each search term prior to
                   accessing the speech and used to find matches. Lattice
                   search (referred to as spoken term detection), uses a
                   pre-indexing of speech data in terms of word or
                   sub-word units, which can then quickly be searched for
                   arbitrary terms without referring to the original
                   audio. In both cases, the search term can be modelled
                   in terms of sub-word units, typically phonemes. For
                   in-vocabulary words (i.e. words that appear in the
                   pronunciation dictionary), letter-to-sound conversion
                   is generally accepted to work well. However,
                   for out-of-vocabulary (OOV) search terms,
                   letter-to-sound conversion must be used to generate a
                   pronunciation for the search term. This is usually a
                   hard decision (i.e. not probabilistic and with no
                   possibility of backtracking), and errors introduced at
                   this step are difficult to recover from. We therefore
                   propose the direct use of graphemes (i.e., letter-based
                   sub-word units) for acoustic modelling. This is
                   expected to work particularly well in languages such as
                   Spanish, where despite the letter-to-sound mapping
                   being very regular, the correspondence is not
                   one-to-one, and there will be benefits from avoiding
                   hard decisions at early stages of processing. In this
                   article, we compare three approaches for Spanish
                   keyword spotting or spoken term detection, and within
                   each of these we compare acoustic modelling based on
                   phone and grapheme units. Experiments were performed
                   using the Spanish geographical-domain Albayzin corpus.
                   Results achieved in the two approaches proposed for
                   spoken term detection show that trigrapheme units
                   for acoustic modelling match or exceed the performance
                   of phone-based acoustic models. In the method proposed
                   for keyword spotting, the results achieved with each
                   acoustic model are very similar.},
  categories = {Spoken term detection; Keyword spotting; Graphemes;
                   Spanish},
  doi = {10.1016/j.specom.2008.03.005},
  month = {November-December},
  year = 2008
}