2009.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2009-citations -ob /home/korin/projects/publications/new_output/transitdata/2009.bib -c 'year : "2009"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{Ehnes2009An-Automated-Me,
  author = {Ehnes, Jochen},
  title = {An Automated Meeting Assistant: A Tangible Mixed
                   Reality Interface for the {AMIDA} Automatic Content
                   Linking Device},
  booktitle = {ICEIS},
  pages = {952--962},
  abstract = {We describe our approach to support ongoing meetings
                    with an automated meeting assistant. The system, based
                    on the AMIDA Content Linking Device, aims to provide
                    documents used in previous meetings that are relevant
                    to the ongoing meeting, based on automatic speech recognition.
                   Once the content linking device finds documents linked
                   to a discussion about a similar subject in a previous
                   meeting, it assumes they may be relevant for the
                    current discussion as well. We believe that the way
                    these documents are offered to the meeting participants
                    is as important as the way they are found. We
                    developed a mixed-reality, projection-based user
                   interface that lets the documents appear on the table
                   tops in front of the meeting participants. They can
                   hand them over to others or bring them onto the shared
                   projection screen easily if they consider them
                    relevant. Yet irrelevant documents do not draw too much
                    attention away from the discussion. In this paper we
                   describe the concept and implementation of this user
                   interface and provide some preliminary results. },
  bibsource = {DBLP, http://dblp.uni-trier.de},
  categories = {Tangible User Interface, Mixed Reality, AMI, Content
                   Linking},
  crossref = {DBLP:conf/iceis/2009},
  doi = {10.1007/978-3-642-01347-8_79},
  keywords = {Tangible User Interface, Mixed Reality, AMI, Content
                   Linking},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/00240952.pdf},
  year = 2009
}
@article{murray2009,
  author = {Murray, Gabriel and Kleinbauer, Thomas and Poller,
                   Peter and Becker, Tilman and Renals, Steve and Kilgour,
                   Jonathan},
  title = {Extrinsic Summarization Evaluation: A Decision Audit
                   Task},
  journal = {ACM Transactions on Speech and Language Processing},
  volume = {6},
  number = {2},
  pages = {1--29},
  abstract = {In this work we describe a large-scale extrinsic
                   evaluation of automatic speech summarization
                   technologies for meeting speech. The particular task is
                   a decision audit, wherein a user must satisfy a complex
                   information need, navigating several meetings in order
                   to gain an understanding of how and why a given
                   decision was made. We compare the usefulness of
                   extractive and abstractive technologies in satisfying
                   this information need, and assess the impact of
                   automatic speech recognition (ASR) errors on user
                   performance. We employ several evaluation methods for
                   participant performance, including post-questionnaire
                   data, human subjective and objective judgments, and a
                   detailed analysis of participant browsing behavior. We
                   find that while ASR errors affect user satisfaction on
                   an information retrieval task, users can adapt their
                   browsing behavior to complete the task satisfactorily.
                   Results also indicate that users consider extractive
                   summaries to be intuitive and useful tools for browsing
                   multimodal meeting data. We discuss areas in which
                   automatic summarization techniques can be improved in
                   comparison with gold-standard meeting abstracts.},
  doi = {10.1145/1596517.1596518},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/murray-acm09.pdf},
  url = {http://doi.acm.org/10.1145/1596517.1596518},
  year = 2009
}
@inproceedings{anderssoncabral09,
  author = {J. Sebastian Andersson and Joao P. Cabral and Leonardo
                   Badino and Junichi Yamagishi and Robert A.J. Clark},
  title = {Glottal Source and Prosodic Prominence Modelling in
                   {HMM}-based Speech Synthesis for the {B}lizzard
                   {C}hallenge 2009},
  booktitle = {The Blizzard Challenge 2009},
  address = {Edinburgh, U.K.},
  abstract = {This paper describes the CSTR entry for the Blizzard
                   Challenge 2009. The work focused on modifying two parts
                   of the Nitech 2005 HTS speech synthesis system to
                   improve naturalness and contextual appropriateness. The
                   first part incorporated an implementation of the
                    Liljencrants-Fant (LF) glottal source model. The second
                   part focused on improving synthesis of prosodic
                   prominence including emphasis through context dependent
                    phonemes. Emphasis was assigned to the synthesised test
                    sentences based on a handful of theory-based rules. The
                    two parts (LF-model and prosodic prominence) were not
                    combined and were hence evaluated separately. The results
                    on naturalness for the LF-model showed that it is not yet
                    perceived to be as natural as the Benchmark HTS system for
                    neutral speech. The results for the prosodic prominence
                    modelling showed that it was perceived to be as contextually
                    appropriate as the Benchmark HTS system, despite a low
                   naturalness score. The Blizzard challenge evaluation
                   has provided valuable information on the status of our
                   work and continued work will begin with analysing why
                   our modifications resulted in reduced naturalness
                   compared to the Benchmark HTS system.},
  categories = {HMM, HTS, speech synthesis, LF-model, glottal source,
                   prosodic prominence, emphasis},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
  year = 2009
}
@phdthesis{cuayahuitl_thesis2009,
  author = {Heriberto Cuayáhuitl},
  title = {Hierarchical Reinforcement Learning for Spoken
                   Dialogue Systems},
  school = {School of Informatics, University of Edinburgh},
  abstract = {This thesis focuses on the problem of scalable
                   optimization of dialogue behaviour in speech-based
                   conversational systems using reinforcement learning.
                   Most previous investigations in dialogue strategy
                   learning have proposed flat reinforcement learning
                   methods, which are more suitable for small-scale spoken
                   dialogue systems. This research formulates the problem
                   in terms of Semi-Markov Decision Processes (SMDPs), and
                   proposes two hierarchical reinforcement learning
                   methods to optimize sub-dialogues rather than full
                   dialogues. The first method uses a hierarchy of SMDPs,
                   where every SMDP ignores irrelevant state variables and
                   actions in order to optimize a sub-dialogue. The second
                   method extends the first one by constraining every SMDP
                   in the hierarchy with prior expert knowledge. The
                   latter method proposes a learning algorithm called
                   'HAM+HSMQ-Learning', which combines two existing
                   algorithms in the literature of hierarchical
                   reinforcement learning. Whilst the first method
                   generates fully-learnt behaviour, the second one
                   generates semi-learnt behaviour. In addition, this
                   research proposes a heuristic dialogue simulation
                   environment for automatic dialogue strategy learning.
                   Experiments were performed on simulated and real
                   environments based on a travel planning spoken dialogue
                   system. Experimental results provided evidence to
                   support the following claims: First, both methods scale
                   well at the cost of near-optimal solutions, resulting
                   in slightly longer dialogues than the optimal
                   solutions. Second, dialogue strategies learnt with
                   coherent user behaviour and conservative recognition
                   error rates can outperform a reasonable hand-coded
                   strategy. Third, semi-learnt dialogue behaviours are a
                   better alternative (because of their higher overall
                   performance) than hand-coded or fully-learnt dialogue
                   behaviours. Last, hierarchical reinforcement learning
                   dialogue agents are feasible and promising for the
                   (semi) automatic design of adaptive behaviours in
                   larger-scale spoken dialogue systems. This research
                   makes the following contributions to spoken dialogue
                   systems which learn their dialogue behaviour. First,
                   the Semi-Markov Decision Process (SMDP) model was
                   proposed to learn spoken dialogue strategies in a
                   scalable way. Second, the concept of 'partially
                    specified dialogue strategies' was proposed for
                    simultaneously integrating hand-coded and learnt spoken
                   dialogue behaviours into a single learning framework.
                   Third, an evaluation with real users of hierarchical
                   reinforcement learning dialogue agents was essential to
                   validate their effectiveness in a realistic
                   environment.},
  keywords = {spoken dialogue systems, (semi-)automatic dialogue
                   strategy design, hierarchical control, prior expert
                   knowledge, Semi-Markov decision processes, hierarchical
                   reinforcement learning},
  month = {January},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/PhDThesis-HeribertoCuayahuitl-Final.pdf},
  year = 2009
}
@incollection{vipperla2009a,
  author = {Vipperla, Ravi Chander and Wolters, Maria and
                   Georgila, Kallirroi and Renals, Steve},
  title = {Speech Input from Older Users in Smart Environments:
                   Challenges and Perspectives},
  booktitle = {Proc. HCI International: Universal Access in
                   Human-Computer Interaction. Intelligent and Ubiquitous
                   Interaction Environments},
  publisher = {Springer},
  volume = {5615},
  series = {Lecture Notes in Computer Science},
  abstract = {Although older people are an important user group for
                   smart environments, there has been relatively little
                   work on adapting natural language interfaces to their
                   requirements. In this paper, we focus on a particularly
                   thorny problem: processing speech input from older
                   users. Our experiments on the MATCH corpus show clearly
                   that we need age-specific adaptation in order to
                   recognize older users' speech reliably. Language models
                   need to cover typical interaction patterns of older
                   people, and acoustic models need to accommodate older
                   voices. Further research is needed into intelligent
                   adaptation techniques that will allow existing large,
                   robust systems to be adapted with relatively small
                    amounts of in-domain, age-appropriate data. In
                   addition, older users need to be supported with
                   adequate strategies for handling speech recognition
                   errors.},
  doi = {10.1007/978-3-642-02710-9},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/vipperla2009a.pdf},
  url = {http://www.springerlink.com/content/27r01345r1683251/},
  year = 2009
}
@inproceedings{Ehnes2009A-Tangible-Mixed,
  author = {Ehnes, Jochen},
  title = {A Tangible Mixed Reality Interface for the {AMI}
                   Automated Meeting Assistant},
  booktitle = {Human Interface and the Management of Information},
  editor = {Smith, Michael J. and Salvendy, Gavriel},
  volume = {5617},
  series = {Lecture Notes in Computer Science},
  pages = {485--494},
  publisher = {Springer},
  abstract = {In this paper we describe our approach to support
                   ongoing meetings with an automated meeting assistant.
                   We propose an alternative user interface for the AMIDA
                   Content Linking Device. In order for the system to be
                    less distracting and more collaborative than the
                    original laptop-screen-based one, we developed a system
                   that projects documents onto the table tops right in
                   front of the meeting participants. This way they appear
                   as if they were printed on paper, lying in front of the
                   participants. We describe our setup as well as the user
                   interface we built to handle and share these documents.},
  categories = {Mixed Reality, AMI, Content Linking, User Interface},
  isbn = {978-3-642-02555-6},
  location = {Heidelberg},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/56170485.pdf},
  year = 2009
}
@inproceedings{zen:HTSoverview,
  author = {Heiga Zen and Keiichiro Oura and Takashi Nose and
                   Junichi Yamagishi and Shinji Sako and Tomoki Toda and
                   Takashi Masuko and Alan W. Black and Keiichi Tokuda},
  title = {Recent development of the {HMM}-based speech synthesis
                   system ({HTS})},
  booktitle = {Proc. 2009 Asia-Pacific Signal and Information
                   Processing Association (APSIPA)},
  address = {Sapporo, Japan},
  abstract = {A statistical parametric approach to speech synthesis
                   based on hidden Markov models (HMMs) has grown in
                   popularity over the last few years. In this approach,
                   spectrum, excitation, and duration of speech are
                   simultaneously modeled by context-dependent HMMs, and
                    speech waveforms are generated from the HMMs themselves.
                   Since December 2002, we have publicly released an
                   open-source software toolkit named “HMM-based speech
                   synthesis system (HTS)” to provide a research and
                   development toolkit for statistical parametric speech
                   synthesis. This paper describes recent developments of
                   HTS in detail, as well as future release plans.},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/zen_APSIPA2009.pdf},
  year = 2009
}
@article{cuayahuitl2009,
  author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon,
                   Oliver and Shimodaira, Hiroshi},
  title = {Evaluation of a hierarchical reinforcement learning
                   spoken dialogue system},
  journal = {Computer Speech and Language},
  volume = {24},
  number = {2},
  pages = {395--429},
  abstract = {We describe an evaluation of spoken dialogue
                   strategies designed using hierarchical reinforcement
                   learning agents. The dialogue strategies were learnt in
                   a simulated environment and tested in a laboratory
                   setting with 32 users. These dialogues were used to
                   evaluate three types of machine dialogue behaviour:
                   hand-coded, fully-learnt and semi-learnt. These
                   experiments also served to evaluate the realism of
                   simulated dialogues using two proposed metrics
                   contrasted with ‘Precision-Recall’. The learnt
                   dialogue behaviours used the Semi-Markov Decision
                   Process (SMDP) model, and we report the first
                   evaluation of this model in a realistic conversational
                   environment. Experimental results in the travel
                   planning domain provide evidence to support the
                   following claims: (a) hierarchical semi-learnt dialogue
                   agents are a better alternative (with higher overall
                   performance) than deterministic or fully-learnt
                   behaviour; (b) spoken dialogue strategies learnt with
                   highly coherent user behaviour and conservative
                   recognition error rates (keyword error rate of 20\%)
                   can outperform a reasonable hand-coded strategy; and
                   (c) hierarchical reinforcement learning dialogue agents
                   are feasible and promising for the (semi) automatic
                   design of optimized dialogue behaviours in larger-scale
                   systems.},
  doi = {10.1016/j.csl.2009.07.001},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cuayahuitl-csl09.pdf},
  year = 2009
}
@incollection{sarah:hts09,
  author = {Sarah Creer and Phil Green and Stuart Cunningham and
                   Junichi Yamagishi},
  title = {Building personalised synthesised voices for
                   individuals with dysarthria using the {HTS} toolkit},
  booktitle = {Computer Synthesized Speech Technologies: Tools for
                   Aiding Impairment},
  publisher = {IGI Global},
  editor = {John W. Mullennix and Steven E. Stern},
  edition = {1st},
  note = {in press},
  abstract = {When the speech of an individual becomes
                   unintelligible due to a neurological disorder, a
                   synthesized voice can replace that of the individual.
                   To fully replace all functions of human speech
                   communication: communication of information,
                   maintenance of social relationships and displaying
                   identity, the voice must be intelligible,
                   natural-sounding and retain the vocal identity of the
                   speaker. For speakers with dysarthria, achieving this
                   output with minimal data recordings and deteriorating
                    speech is difficult. An alternative is to use hidden
                    Markov models (HMMs), which require much less speech
                    data than concatenative methods, to
                    adapt a robust statistical model of speech towards the
                   speaker characteristics captured in the data recorded
                   by the individual. This chapter implements this
                   technique using the HTS toolkit to build personalized
                   synthetic voices for two individuals with dysarthria.
                   An evaluation of the voices by the participants
                   themselves suggests that this technique shows promise
                   for building and reconstructing personalized voices for
                   individuals with dysarthria once deterioration has
                   begun.},
  year = 2009
}
@inproceedings{Ayletetal09,
  author = {Matthew P. Aylett and Simon King and Junichi Yamagishi},
  title = {Speech Synthesis Without a Phone Inventory},
  booktitle = {Interspeech},
  pages = {2087--2090},
  abstract = { In speech synthesis the unit inventory is decided
                   using phonological and phonetic expertise. This process
                   is resource intensive and potentially sub-optimal. In
                   this paper we investigate how acoustic clustering,
                   together with lexicon constraints, can be used to build
                   a self-organised inventory. Six English speech
                   synthesis systems were built using two frameworks, unit
                   selection and parametric HTS for three inventory
                   conditions: 1) a traditional phone set, 2) a system
                   using orthographic units, and 3) a self-organised
                   inventory. A listening test showed a strong preference
                   for the classic system, and for the orthographic system
                   over the self-organised system. Results also varied by
                    letter-to-sound complexity and database coverage. This
                   suggests the self-organised approach failed to
                   generalise pronunciation as well as introducing noise
                   above and beyond that caused by orthographic sound
                   mismatch.},
  categories = {speech synthesis, unit selection, parametric
                   synthesis, phone inventory, orthographic synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/ma_interspeech09.pdf},
  address = {Brighton, UK},
  year = 2009
}
@inproceedings{dongwang_interspeech09_spm,
  author = {Dong Wang and Simon King and Joe Frankel},
  title = {Stochastic Pronunciation Modelling for Spoken Term
                   Detection},
  booktitle = {Proc. of Interspeech},
  pages = {2135--2138},
  address = {Brighton, UK},
  abstract = {A major challenge faced by a spoken term detection
                   (STD) system is the detection of out-of-vocabulary
                   (OOV) terms. Although a subword-based STD system is
                   able to detect OOV terms, performance reduction is
                   always observed compared to in-vocabulary terms.
                   Current approaches to STD do not acknowledge the
                   particular properties of OOV terms, such as
                   pronunciation uncertainty. In this paper, we use a
                   stochastic pronunciation model to deal with the
                   uncertain pronunciations of OOV terms. By considering
                   all possible term pronunciations, predicted by a
                   joint-multigram model, we observe a significant
                   performance improvement. },
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/spm.pdf},
  year = 2009
}
@inproceedings{johnson-aas:09,
  author = {Johnson, Christine and Campbell, Pauline and
                   DePlacido, Christine and Liddell, Amy and Wolters,
                   Maria},
  title = {Does Peripheral Hearing Loss Affect {RGDT} Thresholds
                    in Older Adults?},
  booktitle = {Proceedings of the {A}merican {A}uditory {S}ociety
                   {C}onference},
  categories = {speech synthesis, older users},
  month = mar,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/aas09.pdf},
  year = 2009
}
@inproceedings{wolters-is:09,
  author = {Wolters, Maria and Vipperla, Ravichander and Renals,
                   Steve},
  title = {Age Recognition for Spoken Dialogue Systems: Do We
                   Need It?},
  booktitle = {Proc. Interspeech},
  abstract = {When deciding whether to adapt relevant aspects of the
                   system to the particular needs of older users, spoken
                   dialogue systems often rely on automatic detection of
                   chronological age. In this paper, we show that vocal
                   ageing as measured by acoustic features is an
                   unreliable indicator of the need for adaptation. Simple
                   lexical features greatly improve the prediction of both
                    relevant aspects of cognition and interaction style.
                   Lexical features also boost age group prediction. We
                   suggest that adaptation should be based on observed
                   behaviour, not on chronological age, unless it is not
                   feasible to build classifiers for relevant adaptation
                   decisions.},
  categories = {age recognition, spoken dialogue systems},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/is09.pdf},
  year = 2009
}
@inproceedings{huang2009-icassp,
  author = {Songfang Huang and Bowen Zhou},
  title = {An {EM} Algorithm for {SCFG} in Formal Syntax-based
                   Translation},
  booktitle = {Proc. IEEE International Conference on Acoustic,
                   Speech, and Signal Processing (ICASSP'09)},
  pages = {4813--4816},
  address = {Taipei, Taiwan},
  abstract = {In this paper, we investigate the use of bilingual
                   parsing on parallel corpora to better estimate the rule
                   parameters in a formal syntax-based machine translation
                   system, which are normally estimated from the
                   inaccurate heuristics. We use an
                   Expectation-Maximization (EM) algorithm to re-estimate
                   the parameters of synchronous context-free grammar
                   (SCFG) rules according to the derivation knowledge from
                    parallel corpora based on the maximum likelihood principle,
                   rather than using only the heuristic information. The
                   proposed algorithm produces significantly better BLEU
                   scores than a state-of-the-art formal syntax-based
                   machine translation system on the IWSLT 2006 Chinese to
                   English task.},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/icassp09.pdf},
  year = 2009
}
@inproceedings{huang2009-is,
  author = {Songfang Huang and Steve Renals},
  title = {A Parallel Training Algorithm for Hierarchical
                   {P}itman-{Y}or Process Language Models},
  booktitle = {Proc. Interspeech'09},
  pages = {2695--2698},
  address = {Brighton, UK},
  abstract = {The Hierarchical Pitman-Yor Process Language Model
                   (HPYLM) is a Bayesian language model based on a
                   non-parametric prior, the Pitman-Yor Process. It has
                   been demonstrated, both theoretically and practically,
                   that the HPYLM can provide better smoothing for
                   language modeling, compared with state-of-the-art
                   approaches such as interpolated Kneser-Ney and modified
                   Kneser-Ney smoothing. However, estimation of Bayesian
                   language models is expensive in terms of both
                   computation time and memory; the inference is
                   approximate and requires a number of iterations to
                   converge. In this paper, we present a parallel training
                   algorithm for the HPYLM, which enables the approach to
                   be applied in the context of automatic speech
                   recognition, using large training corpora with large
                   vocabularies. We demonstrate the effectiveness of the
                   proposed algorithm by estimating language models from
                   corpora for meeting transcription containing over 200
                   million words, and observe significant reductions in
                   perplexity and word error rate.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/sh_interspeech09.pdf},
  year = 2009
}
@article{McGowanBerger2009,
  author = {Richard S. McGowan and Michael A. Berger},
  title = {Acoustic-articulatory mapping in vowels by locally
                   weighted regression},
  journal = {Journal of the Acoustical Society of America},
  volume = {126},
  number = {4},
  pages = {2011--2032},
  abstract = {A method for mapping between simultaneously measured
                   articulatory and acoustic data is proposed. The method
                   uses principal components analysis on the articulatory
                   and acoustic variables, and mapping between the domains
                   by locally weighted linear regression, or loess
                   [Cleveland, W. S. (1979) J. Am. Stat. Assoc. 74,
                   829--836]. The latter method permits local variation in
                   the slopes of the linear regression, assuming that the
                   function being approximated is smooth. The methodology
                   is applied to vowels of four speakers in the Wisconsin
                   X-ray Microbeam Speech Production Database, with
                   formant analysis. Results are examined in terms of (1)
                   examples of forward (articulation-to-acoustics)
                   mappings and inverse mappings, (2) distributions of
                   local slopes and constants, (3) examples of
                   correlations among slopes and constants, (4)
                   root-mean-square error, and (5) sensitivity of formant
                   frequencies to articulatory change. It is shown that
                   the results are qualitatively correct and that loess
                   performs better than global regression. The forward
                   mappings show different root-mean-square error
                   properties than the inverse mappings indicating that
                   this method is better suited for the forward mappings
                   than the inverse mappings, at least for the data chosen
                   for the current study. Some preliminary results on
                   sensitivity of the first two formant frequencies to the
                   two most important articulatory principal components
                   are presented.},
  categories = {Articulatory inversion, locally weighted regression,
                   X-ray microbeam, formant analysis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/aam.pdf},
  year = 2009
}
@inproceedings{tietze:09,
  author = {Martin I. Tietze and Andi Winterboer and Johanna D.
                   Moore},
  title = {The effect of linguistic devices in information
                   presentation messages on recall and comprehension},
  booktitle = {Proceedings ENLG09},
  categories = {discourse cues, verbal information presentation,
                   recall, eye-tracking, Mechanical Turk},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/tietze.ENLG09.pdf},
  year = 2009
}
@article{ling2008,
  author = {Ling, Z. and Richmond, K. and Yamagishi, J. and Wang,
                   R.},
  title = {Integrating Articulatory Features into {HMM}-based
                   Parametric Speech Synthesis},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing },
  volume = 17,
  number = 6,
  pages = {1171--1185},
  note = {\textbf{IEEE SPS 2010 Young Author Best Paper Award}},
  abstract = {This paper presents an investigation of ways to
                   integrate articulatory features into Hidden Markov
                   Model (HMM)-based parametric speech synthesis,
                   primarily with the aim of improving the performance of
                   acoustic parameter generation. The joint distribution
                   of acoustic and articulatory features is estimated
                   during training and is then used for parameter
                   generation at synthesis time in conjunction with a
                   maximum-likelihood criterion. Different model
                   structures are explored to allow the articulatory
                   features to influence acoustic modeling: model
                   clustering, state synchrony and cross-stream feature
                   dependency. The results of objective evaluation show
                   that the accuracy of acoustic parameter prediction can
                   be improved when shared clustering and
                   asynchronous-state model structures are adopted for
                   combined acoustic and articulatory features. More
                   significantly, our experiments demonstrate that
                   modeling the dependency between these two feature
                   streams can make speech synthesis more flexible. The
                   characteristics of synthetic speech can be easily
                   controlled by modifying generated articulatory features
                   as part of the process of acoustic parameter
                   generation.},
  categories = {Speech synthesis, articulation, HMM-based synthesis},
  doi = {10.1109/TASL.2009.2014796},
  key = {ling2008},
  month = aug,
  year = 2009
}
@inproceedings{child_synthesis_2009,
  author = {Oliver Watts and Junichi Yamagishi and Simon King and
                   Kay Berkling},
  title = {{HMM} Adaptation and Voice Conversion for the
                   Synthesis of Child Speech: A Comparison},
  booktitle = {Proc. Interspeech 2009},
  pages = {2627--2630},
  address = {Brighton, U.K.},
  abstract = {This study compares two different methodologies for
                   producing data-driven synthesis of child speech from
                   existing systems that have been trained on the speech
                   of adults. On one hand, an existing statistical
                   parametric synthesiser is transformed using model
                   adaptation techniques, informed by linguistic and
                   prosodic knowledge, to the speaker characteristics of a
                   child speaker. This is compared with the application of
                   voice conversion techniques to convert the output of an
                   existing waveform concatenation synthesiser with no
                   explicit linguistic or prosodic knowledge. In a
                   subjective evaluation of the similarity of synthetic
                   speech to natural speech from the target speaker, the
                   HMM-based systems evaluated are generally preferred,
                   although this is at least in part due to the higher
                   dimensional acoustic features supported by these
                   techniques.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/child_synthesis_2009.pdf},
  year = 2009
}
@inproceedings{Blizzard_summary_09,
  author = {Simon King and Vasilis Karaiskos},
  title = {The {B}lizzard {C}hallenge 2009},
  booktitle = {Proc. Blizzard Challenge Workshop},
  address = {Edinburgh, UK},
  abstract = {The Blizzard Challenge 2009 was the fifth annual
                   Blizzard Challenge. As in 2008, UK English and Mandarin
                   Chinese were the chosen languages for the 2009
                   Challenge. The English corpus was the same one used in
                    2008. The Mandarin corpus was provided by iFLYTEK. As
                   usual, participants with limited resources or limited
                   experience in these languages had the option of using
                   unaligned labels that were provided for both corpora
                   and for the test sentences. An accent-specific
                   pronunciation dictionary was also available for the
                   English speaker. This year, the tasks were organised in
                   the form of `hubs' and `spokes' where each hub task
                   involved building a general-purpose voice and each
                   spoke task involved building a voice for a specific
                   application. A set of test sentences was released to
                   participants, who were given a limited time in which to
                   synthesise them and submit the synthetic speech. An
                   online listening test was conducted to evaluate
                   naturalness, intelligibility, degree of similarity to
                   the original speaker and, for one of the spoke tasks,
                   "appropriateness."},
  categories = {Blizzard Challenge, speech synthesis, evaluation,
                   listening test},
  keywords = {Blizzard},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/summary_Blizzard2009.pdf},
  year = 2009
}
@inproceedings{dongwang_icassp09,
  author = {Dong Wang and Javier Tejedor and Joe Frankel and
                   Simon King},
  title = {Posterior-based confidence measures for spoken term
                   detection},
  booktitle = {Proc. of ICASSP09},
  address = {Taipei, Taiwan},
  abstract = {Confidence measures play a key role in spoken term
                   detection (STD) tasks. The confidence measure expresses
                   the posterior probability of the search term appearing
                   in the detection period, given the speech. Traditional
                   approaches are based on the acoustic and language model
                   scores for candidate detections found using automatic
                   speech recognition, with Bayes' rule being used to
                   compute the desired posterior probability. In this
                   paper, we present a novel direct posterior-based
                   confidence measure which, instead of resorting to the
                   Bayesian formula, calculates posterior probabilities
                   from a multi-layer perceptron (MLP) directly. Compared
                   with traditional Bayesian-based methods, the
                   direct-posterior approach is conceptually and
                   mathematically simpler. Moreover, the MLP-based model
                   does not require assumptions to be made about the
                   acoustic features such as their statistical
                   distribution and the independence of static and dynamic
                    coefficients. Our experimental results in both English
                   and Spanish demonstrate that the proposed direct
                   posterior-based confidence improves STD performance. },
  categories = {Spoken term detection, confidence measure, posterior
                   probabilities, MLP},
  month = {April},
  pages = {4889--4892},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/posterior.pdf},
  year = 2009
}
@article{wolters-taccess:09,
  author = {Maria Wolters and Kallirroi Georgila and Sarah
                   MacPherson and Johanna Moore},
  title = {Being Old Doesn't Mean Acting Old: Older Users'
                   Interaction with Spoken Dialogue Systems},
  journal = {ACM Transactions on Accessible Computing},
  volume = {2},
  number = {1},
  pages = {1--39},
  abstract = {Most studies on adapting voice interfaces to older
                   users work top-down by comparing the interaction
                   behavior of older and younger users. In contrast, we
                   present a bottom-up approach. A statistical cluster
                   analysis of 447 appointment scheduling dialogs between
                   50 older and younger users and 9 simulated spoken
                   dialog systems revealed two main user groups, a
                   “social” group and a “factual” group.
                   “Factual” users adapted quickly to the systems and
                   interacted efficiently with them. “Social” users,
                   on the other hand, were more likely to treat the system
                   like a human, and did not adapt their interaction
                   style. While almost all “social” users were older,
                   over a third of all older users belonged in the
                   “factual” group. Cognitive abilities and gender did
                   not predict group membership. We conclude that spoken
                   dialog systems should adapt to users based on observed
                   behavior, not on age. },
  categories = {spoken dialogue systems, older users, human-computer
                   interaction},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/citation.cfm},
  year = 2009
}
@inproceedings{dongwang_interspeech09_conf,
  author = {Dong Wang and Simon King and Joe Frankel and Peter
                   Bell},
  title = {Term-Dependent Confidence for Out-of-Vocabulary Term
                   Detection},
  booktitle = {Proc. Interspeech},
  pages = {2139--2142},
  address = {Brighton, UK},
  abstract = { Within a spoken term detection (STD) system, the
                   decision maker plays an important role in retrieving
                   reliable detections. Most of the state-of-the-art STD
                   systems make decisions based on a confidence measure
                   that is term-independent, which poses a serious problem
                   for out-of-vocabulary (OOV) term detection. In this
                   paper, we study a term-dependent confidence measure
                   based on confidence normalisation and discriminative
                   modelling, particularly focusing on its remarkable
                   effectiveness for detecting OOV terms. Experimental
                   results indicate that the term-dependent confidence
                    provides much more significant improvement for OOV
                    terms than for in-vocabulary terms.},
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/conf.pdf},
  year = 2009
}
@inproceedings{dziemianko_interspeech2009,
  author = {Michal Dziemianko and Gregor Hofer and Hiroshi
                   Shimodaira},
  title = {{HMM}-Based Automatic Eye-Blink Synthesis from Speech},
  booktitle = {Proc. Interspeech},
  pages = {1799--1802},
  address = {Brighton, UK},
  abstract = {In this paper we present a novel technique to
                   automatically synthesise eye blinking from a speech
                   signal. Animating the eyes of a talking head is
                   important as they are a major focus of attention during
                   interaction. The developed system predicts eye blinks
                   from the speech signal and generates animation
                    trajectories automatically employing a ``Trajectory
                    Hidden Markov Model''. The evaluation of the
                   recognition performance showed that the timing of
                   blinking can be predicted from speech with an F-score
                   value upwards of 52\%, which is well above chance.
                   Additionally, a preliminary perceptual evaluation was
                    conducted, which confirmed that adding eye blinking
                    significantly improves the perception of the character.
                    Finally, it showed that speech-synchronised
                    synthesised blinks outperform random blinking in
                    naturalness ratings.},
  categories = {animation, motion synthesis, time series analysis,
                   trajectory model},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/dziemianko_interspeech2009.pdf},
  year = 2009
}
@inproceedings{leo_09-1,
  author = {Leonardo Badino and J. Sebastian Andersson and Junichi
                   Yamagishi and Robert A.J. Clark},
  title = {Identification of Contrast and Its Emphatic
                   Realization in {HMM}-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2009},
  address = {Brighton, U.K.},
  abstract = {The work presented in this paper proposes to identify
                   contrast in the form of contrastive word pairs and
                   prosodically signal it with emphatic accents in a
                   Text-to-Speech (TTS) application using a
                   Hidden-Markov-Model (HMM) based speech synthesis
                   system. We first describe a novel method to
                   automatically detect contrastive word pairs using
                   textual features only and report its performance on a
                   corpus of spontaneous conversations in English.
                   Subsequently we describe the set of features selected
                    to train an HMM-based speech synthesis system,
                    attempting to properly control prosodic prominence
                   (including emphasis). Results from a large scale
                   perceptual test show that in the majority of cases
                    listeners judge emphatic contrastive word pairs to be as
                    acceptable as their non-emphatic counterparts, while
                   emphasis on non-contrastive pairs is almost never
                   acceptable.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090749.PDF},
  year = 2009
}
@article{yamagishi2009,
  author = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga
                   and Ling, Zhenhua and Toda, Tomoki and Tokuda, Keiichi
                   and King, Simon and Renals, Steve},
  title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech
                   Synthesis},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {17},
  number = {6},
  pages = {1208--1230},
  abstract = {This paper describes a speaker-adaptive HMM-based
                   speech synthesis system. The new system, called
                   ``HTS-2007,'' employs speaker adaptation (CSMAPLR+MAP),
                   feature-space adaptive training, mixed-gender modeling,
                   and full-covariance modeling using CSMAPLR transforms,
                   in addition to several other techniques that have
                   proved effective in our previous systems. Subjective
                   evaluation results show that the new system generates
                   significantly better quality synthetic speech than
                   speaker-dependent approaches with realistic amounts of
                   speech data, and that it bears comparison with
                   speaker-dependent approaches even when large amounts of
                   speech data are available. In addition, a comparison
                   study with several speech synthesis techniques shows
                   the new system is very robust: It is able to build
                   voices from less-than-ideal speech data and synthesize
                   good-quality speech even for out-of-domain sentences.},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5109758&arnumber=5153555&count=14&index=12},
  year = 2009
}
@inproceedings{cabral_yrwst,
  author = {J. Cabral and S. Renals and K. Richmond and J.
                   Yamagishi},
  title = {{HMM}-based Speech Synthesis with an Acoustic Glottal
                   Source Model},
  booktitle = {Proc. The First Young Researchers Workshop in Speech
                   Technology},
  abstract = {A major cause of degradation of speech quality in
                   HMM-based speech synthesis is the use of a simple delta
                   pulse signal to generate the excitation of voiced
                   speech. This paper describes a new approach to using an
                   acoustic glottal source model in HMM-based
                   synthesisers. The goal is to improve speech quality and
                   parametric flexibility to better model and transform
                   voice characteristics.},
  categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral
                   Separation},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf},
  year = 2009
}
@proceedings{DBLP:conf/iceis/2009,
  title = {Enterprise Information Systems, 11th International
                   Conference, ICEIS 2009, Milan, Italy, May 6-10, 2009.
                   Proceedings},
  editor = {Filipe, Joaquim and Cordeiro, José},
  volume = {24},
  series = {Lecture Notes in Business Information Processing},
  publisher = {Springer},
  bibsource = {DBLP, http://dblp.uni-trier.de},
  booktitle = {ICEIS},
  doi = {10.1007/978-3-642-01347-8},
  isbn = {978-3-642-01346-1},
  year = 2009
}
@inproceedings{jyamagis:emime,
  author = {Junichi Yamagishi and Mike Lincoln and Simon King and
                   John Dines and Matthew Gibson and Jilei Tian and Yong
                   Guan},
  title = {Analysis of Unsupervised and Noise-Robust
                   Speaker-Adaptive {HMM}-Based Speech Synthesis Systems
                   toward a Unified {ASR} and {TTS} Framework},
  booktitle = {Proc. Blizzard Challenge Workshop},
  address = {Edinburgh, U.K.},
  abstract = {For the 2009 Blizzard Challenge we have built an
                   unsupervised version of the HTS-2008 speaker-adaptive
                   HMM-based speech synthesis system for English, and a
                   noise robust version of the systems for Mandarin. They
                   are designed from a multidisciplinary application point
                   of view in that we attempt to integrate the components
                   of the TTS system with other technologies such as ASR.
                   All the average voice models are trained exclusively
                   from recognized, publicly available, ASR databases.
                   Multi-pass LVCSR and confidence scores calculated from
                    confusion networks are used for the unsupervised
                   systems, and noisy data recorded in cars or public
                   spaces is used for the noise robust system. We believe
                   the developed systems form solid benchmarks and provide
                   good connections to ASR fields. This paper describes
                   the development of the systems and reports the results
                   and analysis of their evaluation.},
  month = sep,
  year = 2009
}
@inproceedings{richmond2009b,
  author = {Richmond, K.},
  title = {Preliminary Inversion Mapping Results with a New {EMA}
                   Corpus},
  booktitle = {Proc. Interspeech},
  pages = {2835--2838},
  address = {Brighton, UK},
  abstract = {In this paper, we apply our inversion mapping method,
                   the trajectory mixture density network (TMDN), to a new
                   corpus of articulatory data, recorded with a Carstens
                   AG500 electromagnetic articulograph. This new data set,
                   mngu0, is relatively large and phonetically rich, among
                   other beneficial characteristics. We obtain good
                   results, with a root mean square (RMS) error of only
                   0.99mm. This compares very well with our previous
                   lowest result of 1.54mm RMS error for equivalent coils
                   of the MOCHA fsew0 EMA data. We interpret this as
                   showing the mngu0 data set is potentially more
                   consistent than the fsew0 data set, and is very useful
                   for research which calls for articulatory trajectory
                   data. It also supports our view that the TMDN is very
                   much suited to the inversion mapping problem.},
  keywords = {acoustic-articulatory inversion mapping, neural
                   network},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090544.pdf},
  year = 2009
}
@inproceedings{richmond2009a,
  author = {Richmond, K. and Clark, R. and Fitt, S.},
  title = {Robust {LTS} rules with the {Combilex} speech
                   technology lexicon},
  booktitle = {Proc. Interspeech},
  pages = {1295--1298},
  address = {Brighton, UK},
  abstract = {Combilex is a high quality pronunciation lexicon aimed
                   at speech technology applications that has recently
                   been released by CSTR. Combilex benefits from several
                   advanced features. This paper evaluates one of these:
                   the explicit alignment of phones to graphemes in a
                   word. This alignment can help to rapidly develop robust
                   and accurate letter-to-sound (LTS) rules, without
                   needing to rely on automatic alignment methods. To
                   evaluate this, we used Festival's LTS module, comparing
                   its standard automatic alignment with Combilex's
                   explicit alignment. Our results show using Combilex's
                   alignment improves LTS accuracy: 86.50\% words correct
                   as opposed to 84.49\%, with our most general form of
                   lexicon. In addition, building LTS models is greatly
                   accelerated, as the need to list allowed alignments is
                   removed. Finally, loose comparison with other studies
                   indicates Combilex is a superior quality lexicon in
                   terms of consistency and size.},
  keywords = {combilex, letter-to-sound rules, grapheme-to-phoneme
                   conversion},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090308.pdf},
  year = 2009
}
@inproceedings{Ehnes2009A-Tangible-Inte,
  author = {Ehnes, Jochen},
  title = {A Tangible Interface for the {AMI} Content Linking
                   Device -- The Automated Meeting Assistant},
  booktitle = {Proceedings of HSI 2009},
  editor = {Bello, Lucia Lo and Iannizzotto, Giancarlo},
  pages = {306--313},
  note = {Best Paper Award (Human Machine Interaction)},
  abstract = {In this paper we describe our approach to support
                    ongoing meetings with an automated meeting assistant.
                    The system, based on the AMIDA Content Linking Device,
                    aims to provide documents used in previous
                    meetings that are relevant to the ongoing meeting, based
                    on automatic speech recognition. Once the content linking device
                   finds documents linked to a discussion about a similar
                   subject in a previous meeting, it assumes they may be
                   relevant for the current discussion as well. We believe
                   that the way these documents are offered to the meeting
                    participants is as important as the way they are
                    found. We developed a projection-based mixed-reality
                   user interface that lets the documents appear on the
                   table tops in front of the meeting participants. They
                   can hand them over to others or bring them onto the
                   shared projection screen easily if they consider them
                    relevant for others as well. Yet irrelevant documents
                    do not draw too much attention away from the discussion. In
                   this paper we describe the concept and implementation
                   of this user interface and provide some preliminary
                   results.},
  categories = {Tangible Interface, AMI, Content Linking, Mixed
                   Reality},
  isbn = {978-1-4244-3960-7},
  keywords = {Tangible Interface, Mixed Reality, Projection System,
                   Content Linking, Automatic Meeting Assistant},
  lccn = {2009900916},
  location = {Catania, Italy},
  month = {May},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/tt4_001902.pdf},
  year = 2009
}
@phdthesis{zhang-thesis2009,
  author = {Le Zhang},
  title = {Modelling Speech Dynamics with Trajectory-{HMM}s},
  school = {School of Informatics, University of Edinburgh},
  abstract = { The conditional independence assumption imposed by
                   the hidden Markov models (HMMs) makes it difficult to
                   model temporal correlation patterns in human speech.
                   Traditionally, this limitation is circumvented by
                   appending the first and second-order regression
                   coefficients to the observation feature vectors.
                   Although this leads to improved performance in
                   recognition tasks, we argue that a straightforward use
                   of dynamic features in HMMs will result in an inferior
                   model, due to the incorrect handling of dynamic
                   constraints. In this thesis I will show that an HMM can
                   be transformed into a Trajectory-HMM capable of
                   generating smoothed output mean trajectories, by
                   performing a per-utterance normalisation. The resulting
                   model can be trained by either maximising model
                   log-likelihood or minimising mean generation errors on
                   the training data. To combat the exponential growth of
                   paths in searching, the idea of delayed path merging is
                   proposed and a new time-synchronous decoding algorithm
                   built on the concept of token-passing is designed for
                   use in the recognition task. The Trajectory-HMM brings
                   a new way of sharing knowledge between speech
                   recognition and synthesis components, by tackling both
                   problems in a coherent statistical framework. I
                   evaluated the Trajectory-HMM on two different speech
                   tasks using the speaker-dependent MOCHA-TIMIT database.
                   First, as a generative model to recover articulatory
                   features from the speech signal, where the Trajectory-HMM
                   was used in a complementary way to conventional HMM
                   modelling techniques, within a joint
                   Acoustic-Articulatory framework. Experiments indicate
                   that the jointly trained acoustic-articulatory models
                   are more accurate (having a lower Root Mean Square
                   error) than the separately trained ones, and that
                   Trajectory-HMM training results in greater accuracy
                   compared with conventional Baum-Welch parameter
                   updating. In addition, the Root Mean Square (RMS)
                   training objective proves to be consistently better
                   than the Maximum Likelihood objective. However,
                   experiments on the phone recognition task show that the
                   MLE-trained Trajectory-HMM, while retaining the attractive
                   properties of a proper generative model, tends to
                   favour over-smoothed trajectories among competing
                   hypotheses, and does not perform better than a
                   conventional HMM. We use this to build an argument that
                   models giving a better fit on training data may suffer
                   a reduction of discrimination by being too faithful to
                   the training data. Finally, experiments on using
                   triphone models show that increasing modelling detail
                   is an effective way to improve modelling performance
                   with little added complexity in training.},
  keywords = {speech recognition, speech synthesis, MOCHA,
                   trajectory HMM},
  month = jan,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/zhangle_thesis.pdf},
  year = 2009
}
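@comment{{Illustrative aside, not part of the thesis above: the abstract
  refers to the conventional practice of appending first- and second-order
  regression (delta) coefficients to the observation vectors. A minimal
  Python sketch of such a dynamic-feature computation follows; it uses
  simple central differences for clarity, whereas real front ends typically
  use a wider regression window.

  import numpy as np

  def add_dynamic_features(static):
      # static: (n_frames, dim) array of static features, e.g. MFCCs.
      # Returns (n_frames, 3 * dim): static plus first- and second-order deltas.
      padded = np.pad(static, ((1, 1), (0, 0)), mode="edge")
      delta = 0.5 * (padded[2:] - padded[:-2])       # first-order coefficients
      padded_d = np.pad(delta, ((1, 1), (0, 0)), mode="edge")
      delta2 = 0.5 * (padded_d[2:] - padded_d[:-2])  # second-order coefficients
      return np.concatenate([static, delta, delta2], axis=1)

  feats = add_dynamic_features(np.random.randn(300, 13))  # shape (300, 39)
}}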
@article{wolters-iwc:09,
  author = {Maria Wolters and Kallirroi Georgila and Robert Logie
                   and Sarah MacPherson and Johanna Moore and Matt Watson},
  title = {Reducing Working Memory Load in Spoken Dialogue
                   Systems},
  journal = {Interacting with Computers},
  volume = {21},
  number = {4},
  pages = {276-287},
  abstract = {We evaluated two strategies for alleviating working
                   memory load for users of voice interfaces: presenting
                   fewer options per turn and providing confirmations.
                   Forty-eight users booked appointments using nine
                   different dialogue systems, which varied in the number
                   of options presented and the confirmation strategy
                   used. Participants also performed four cognitive tests
                   and rated the usability of each dialogue system on a
                   standardised questionnaire. When systems presented more
                   options per turn and avoided explicit confirmation
                   subdialogues, both older and younger users booked
                   appointments more quickly without compromising task
                   success. Users with lower information processing speed
                   were less likely to remember all relevant aspects of
                   the appointment. Working memory span did not affect
                   appointment recall. Older users were slightly less
                   satisfied with the dialogue systems than younger users.
                   We conclude that the number of options is less
                   important than an accurate assessment of the actual
                   cognitive demands of the task at hand.},
  categories = {spoken dialogue; ageing; older adults; cognitive
                   aging; working memory},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/iwc09.pdf},
  year = 2009
}
@article{hifny2009,
  author = {Hifny, Y. and Renals, S.},
  title = {Speech Recognition Using Augmented Conditional Random
                   Fields},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {17},
  number = {2},
  pages = {354--365},
  abstract = {Acoustic modeling based on hidden Markov models (HMMs)
                   is employed by state-of-the-art stochastic speech
                   recognition systems. Although HMMs are a natural choice
                   to warp the time axis and model the temporal phenomena
                   in the speech signal, their conditional independence
                   properties limit their ability to model spectral
                   phenomena well. In this paper, a new acoustic modeling
                   paradigm based on augmented conditional random fields
                   (ACRFs) is investigated and developed. This paradigm
                   addresses some limitations of HMMs while maintaining
                   many of the aspects which have made them successful. In
                   particular, the acoustic modeling problem is
                   reformulated in a data driven, sparse, augmented space
                   to increase discrimination. Acoustic context modeling
                   is explicitly integrated to handle the sequential
                   phenomena of the speech signal. We present an efficient
                   framework for estimating these models that ensures
                   scalability and generality. In the TIMIT phone
                   recognition task, a phone error rate of 23.0\% was
                   recorded on the full test set, a significant
                   improvement over comparable HMM-based systems.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/hifny2009.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4749447&arnumber=4749472&count=25&index=15},
  year = 2009
}
@inproceedings{john:HTSGAP,
  author = {J. Dines and J. Yamagishi and S. King},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  booktitle = {Proc. Interspeech},
  pages = {1391--1394},
  address = {Brighton, U.K.},
  abstract = {The EMIME European project is conducting research in
                   the development of technologies for mobile,
                   personalised speech-to-speech translation systems. The
                   hidden Markov model is being used as the underlying
                   technology in both automatic speech recognition (ASR)
                   and text-to-speech synthesis (TTS) components; thus,
                   the investigation of unified statistical modelling
                   approaches has become an implicit goal of our research.
                   As one of the first steps towards this goal, we have
                   been investigating commonalities and differences
                   between HMM-based ASR and TTS. In this paper we present
                   results and analysis of a series of experiments that
                   have been conducted on English ASR and TTS systems,
                   measuring their performance with respect to phone set
                   and lexicon, acoustic feature type and dimensionality,
                   and HMM topology. Our results show that, although the
                   fundamental statistical model may be essentially the
                   same, optimal ASR and TTS performance often demands
                   diametrically opposed system designs. This represents a
                   major challenge to be addressed in the investigation of
                   such unified modelling approaches.},
  month = sep,
  year = 2009
}
@inproceedings{dongwang_interspeech09_cmb,
  author = {Javier Tejedor and Dong Wang and Simon King and Joe
                   Frankel and Jose Colas},
  title = {A Posterior Probability-based System Hybridisation and
                   Combination for Spoken Term Detection},
  booktitle = {Proc. Interspeech},
  pages = {2131--2134},
  address = {Brighton, UK},
  abstract = {Spoken term detection (STD) is a fundamental task for
                   multimedia information retrieval. To improve the
                   detection performance, we have presented a direct
                   posterior-based confidence measure generated from a
                   neural network. In this paper, we propose a
                   detection-independent confidence estimation method based
                   on the direct posterior confidence measure, in which
                   decision making is completely separated from term
                   detection. Based on this idea, we first present a
                   hybrid system which conducts the term detection and
                   confidence estimation based on different sub-word
                   units, and then propose a combination method which
                   merges detections from heterogeneous term detectors
                   based on the direct posterior-based confidence.
                   Experimental results demonstrated that the proposed
                   methods improved system performance considerably for
                   both English and Spanish. },
  categories = {joint-multigram, pronunciation model, spoken term
                   detection, speech recognition},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cmb.pdf},
  year = 2009
}
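@comment{{Illustrative aside, not the method of the paper above: detections
  of the same term produced by heterogeneous detectors can be merged when
  they overlap in time, with a single detection-independent decision taken
  on the combined posterior-based confidence. The field names, the overlap
  test and the simple averaging rule in this Python sketch are assumptions
  made only for illustration.

  def overlaps(a, b):
      # Same term and overlapping time spans.
      return a["term"] == b["term"] and a["start"] < b["end"] and b["start"] < a["end"]

  def combine(dets_a, dets_b, threshold=0.5):
      # Merge overlapping detections, averaging their confidences; keep
      # unmatched detections as they are, then apply one decision threshold.
      merged, used = [], set()
      for a in dets_a:
          match = next((b for b in dets_b if id(b) not in used and overlaps(a, b)), None)
          if match is not None:
              used.add(id(match))
              merged.append({**a, "conf": 0.5 * (a["conf"] + match["conf"])})
          else:
              merged.append(a)
      merged.extend(b for b in dets_b if id(b) not in used)
      return [d for d in merged if d["conf"] > threshold]

  hyp_a = [{"term": "edinburgh", "start": 1.20, "end": 1.85, "conf": 0.62}]
  hyp_b = [{"term": "edinburgh", "start": 1.22, "end": 1.90, "conf": 0.71}]
  print(combine(hyp_a, hyp_b))
}}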
@inproceedings{bell_king_full_covariance_asru2009,
  author = {Bell, Peter and King, Simon},
  title = {Diagonal Priors for Full Covariance Speech Recognition},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
                   and Understanding},
  address = {Merano, Italy},
  abstract = {We investigate the use of full covariance Gaussians
                   for large-vocabulary speech recognition. The large
                   number of parameters gives high modelling power, but
                   when training data is limited, the standard sample
                   covariance matrix is often poorly conditioned, and has
                   high variance. We explain how these problems may be
                   solved by the use of a diagonal covariance smoothing
                   prior, and relate this to the shrinkage estimator, for
                   which the optimal shrinkage parameter may itself be
                   estimated from the training data. We also compare the
                   use of generatively and discriminatively trained
                   priors. Results are presented on a large vocabulary
                   conversational telephone speech recognition task.},
  doi = {10.1109/ASRU.2009.5373344},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/shrinkage_asru2009.pdf},
  year = 2009
}
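@comment{{Illustrative aside, not the authors' implementation: smoothing a
  poorly conditioned sample covariance towards its diagonal can be written
  as a convex combination of the two, as in this Python sketch. Here the
  shrinkage weight alpha is passed in by hand, whereas the paper estimates
  an optimal value from the training data.

  import numpy as np

  def shrinkage_covariance(X, alpha):
      # X: (n_frames, dim) training frames for one Gaussian component.
      # alpha in [0, 1]: 0 keeps the full sample covariance, 1 keeps only
      # the diagonal smoothing target.
      S = np.cov(X, rowvar=False)            # full sample covariance (dim x dim)
      D = np.diag(np.diag(S))                # diagonal target
      return (1.0 - alpha) * S + alpha * D   # shrunk (smoothed) estimate

  # With only 50 frames, a 39x39 full covariance is poorly conditioned;
  # shrinkage keeps the estimate well-conditioned and lowers its variance.
  X = np.random.randn(50, 39)
  Sigma = shrinkage_covariance(X, alpha=0.3)
}}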
@inproceedings{steiner_is2009a,
  author = {Steiner, I. and Richmond, K.},
  title = {Towards Unsupervised Articulatory Resynthesis of
                   {G}erman Utterances using {EMA} data},
  booktitle = {Proc. Interspeech},
  pages = {2055--2058},
  address = {Brighton, UK},
  abstract = {As part of ongoing research towards integrating an
                   articulatory synthesizer into a text-to-speech (TTS)
                   framework, a corpus of German utterances recorded with
                   electromagnetic articulography (EMA) is resynthesized
                   to provide training data for statistical models. The
                   resynthesis is based on a measure of similarity between
                   the original and resynthesized EMA trajectories,
                   weighted by articulatory relevance. Preliminary results
                   are discussed and future work outlined.},
  keywords = {articulatory speech synthesis, copy synthesis,
                   electromagnetic articulography, EMA},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090558.pdf},
  year = 2009
}
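@comment{{Illustrative aside, not the measure used in the paper above: a
  distance between original and resynthesized EMA trajectories, weighted by
  articulatory relevance, can be sketched as a per-channel RMS error combined
  with channel weights. The channel layout and weight values in this Python
  sketch are hypothetical.

  import numpy as np

  def weighted_trajectory_distance(orig, resynth, weights):
      # orig, resynth: (n_frames, n_channels) EMA coil trajectories.
      # weights: per-channel articulatory relevance weights.
      err = (orig - resynth) ** 2               # squared error per frame and channel
      per_channel = np.sqrt(err.mean(axis=0))   # RMS error for each channel
      return float(np.dot(weights, per_channel) / weights.sum())

  # Hypothetical 6-channel layout (e.g. tongue tip, tongue body, lower lip; x/y each).
  weights = np.array([1.0, 1.0, 0.8, 0.8, 0.5, 0.5])
  orig = np.random.randn(200, 6)
  resynth = orig + 0.1 * np.random.randn(200, 6)
  score = weighted_trajectory_distance(orig, resynth, weights)
}}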
@inproceedings{jyamagis:1000sHTS,
  author = {J. Yamagishi and Bela Usabaev and Simon King and
                   Oliver Watts and John Dines and Jilei Tian and Rile Hu
                   and Yong Guan and Keiichiro Oura and Keiichi Tokuda and
                   Reima Karhila and Mikko Kurimo},
  title = {Thousands of voices for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {420--423},
  address = {Brighton, U.K.},
  abstract = {Our recent experiments with HMM-based speech synthesis
                   systems have demonstrated that speaker-adaptive
                   HMM-based speech synthesis (which uses an ‘average
                   voice model’ plus model adaptation) is robust to
                   non-ideal speech data that are recorded under various
                   conditions and with varying microphones, that are not
                   perfectly clean, and/or that lack phonetic balance.
                   This enables us to consider building high-quality voices
                   on ‘non-TTS’ corpora such as ASR corpora. Since ASR
                   corpora generally include a large number of speakers,
                   this leads to the possibility of producing an enormous
                   number of voices automatically. In this paper we show
                   thousands of voices for HMM-based speech synthesis that
                   we have made from several popular ASR corpora such as
                   the Wall Street Journal databases (WSJ0/WSJ1/WSJCAM0),
                   Resource Management, Globalphone and Speecon. We report
                   some perceptual evaluation results and outline the
                   outstanding issues.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/index.php},
  year = 2009
}
@inproceedings{NiekraszMoore09,
  author = {John Niekrasz and Johanna Moore},
  title = {Participant Subjectivity and Involvement as a Basis
                   for Discourse Segmentation},
  booktitle = {{Proceedings of the SIGDIAL 2009 Conference}},
  pages = {54--61},
  abstract = {We propose a framework for analyzing episodic
                   conversational activities in terms of expressed
                   relationships between the participants and utterance
                   content. We test the hypothesis that linguistic
                   features which express such properties, e.g. tense,
                   aspect, and person deixis, are a useful basis for
                   automatic intentional discourse segmentation. We
                   present a novel algorithm and test our hypothesis on a
                   set of intentionally segmented conversational
                   monologues. Our algorithm performs better than a simple
                   baseline and as well as or better than well-known
                   lexical-semantic segmentation methods.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/NiekraszMoore09.pdf},
  year = 2009
}