2010.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2010-citations -ob /home/korin/projects/publications/new_output/transitdata/2010.bib -c 'year : "2010"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@misc{turk2010,
  author = {Turk, Alice and Scobbie, James and Geng, Christian and
                   Campbell, Barry and Dickie, Catherine and Dubourg,
                   Eddie and Bard, Ellen Gurman and Hardcastle, William
                   and Hartinger, Mariam and King, Simon and Lickley,
                   Robin and Macmartin, Cedric and Nakai, Satsuki and
                   Renals, Steve and Richmond, Korin and Schaeffler, Sonja
                   and White, Kevin and Wiegand, Ronny and Wrench, Alan},
  title = {An {E}dinburgh speech production facility},
  howpublished = {Poster presented at the 12th Conference on Laboratory
                   Phonology, Albuquerque, New Mexico.},
  month = {July},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ESPF.pdf},
  year = 2010
}
@inproceedings{anderssonetal2010,
  author = {Sebastian Andersson and Kallirroi Georgila and David
                   Traum and Matthew Aylett and Robert Clark},
  title = {Prediction and Realisation of Conversational
                   Characteristics by Utilising Spontaneous Speech for
                   Unit Selection},
  booktitle = {Speech Prosody 2010},
  abstract = {Unit selection speech synthesis has reached high
                   levels of naturalness and intelligibility for neutral
                   read aloud speech. However, synthetic speech generated
                   using neutral read aloud data lacks all the attitude,
                   intention and spontaneity associated with everyday
                   conversations. Unit selection is heavily data dependent
                   and thus in order to simulate human conversational
                   speech, or create synthetic voices for believable
                   virtual characters, we need to utilise speech data with
                   examples of how people talk rather than how people
                   read. In this paper we included carefully selected
                   utterances from spontaneous conversational speech in a
                   unit selection voice. Using this voice and by
                   automatically predicting type and placement of lexical
                   fillers and filled pauses we can synthesise utterances
                   with conversational characteristics. A perceptual
                   listening test showed that it is possible to make
                   synthetic speech sound more conversational without
                   degrading naturalness.},
  categories = {speech synthesis, unit selection, conversation,
                   spontaneous speech, lexical fillers, filled pauses},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/100116.pdf},
  year = 2010
}
@inproceedings{zwyssig2010,
  author = {Zwyssig, Erich and Lincoln, Mike and Renals, Steve},
  title = {A Digital Microphone Array for Distant Speech
                   Recognition},
  booktitle = {Proc. IEEE ICASSP--10},
  pages = {5106--5109},
  abstract = {In this paper, the design, implementation and testing
                   of a digital microphone array is presented. The array
                   uses digital MEMS microphones which integrate the
                   microphone, amplifier and analogue to digital converter
                   on a single chip in place of the analogue microphones
                   and external audio interfaces currently used. The
                   device has the potential to be smaller, cheaper and
                   more flexible than typical analogue arrays; however, the
                   effect on speech recognition performance of using
                   digital microphones is as yet unknown. In order to
                   evaluate the effect, an analogue array and the new
                   digital array are used to simultaneously record test
                   data for a speech recognition experiment. Initial
                   results employing no adaptation show that performance
                   using the digital array is significantly worse (14\%
                   absolute WER) than the analogue device. Subsequent
                   experiments using MLLR and CMLLR channel adaptation
                   reduce this gap, and employing MLLR for both channel
                   and speaker adaptation reduces the difference between
                   the arrays to 4.5\% absolute WER.},
  doi = {10.1109/ICASSP.2010.5495040},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/zwyssig-icassp10.pdf},
  year = 2010
}
@inproceedings{wolters-aaate:10,
  author = {Maria Wolters and Marilyn McGee-Lennon},
  title = {Designing Usable and Acceptable Reminders for the Home},
  booktitle = {Proc. AAATE Workshop AT Technology Transfer,
                   Sheffield, UK},
  abstract = {Electronic reminders can play a key role in enabling
                   people to manage their care and remain independent in
                   their own homes for longer. The MultiMemoHome project
                   aims to develop reminder designs that are accessible
                   and usable for users with a range of abilities and
                   preferences. In an initial exploration of key design
                   parameters, we surveyed 378 adults from all age groups
                   online (N=206) and by post (N=172). The wide spread of
                   preferences that we found illustrates the importance of
                   adapting reminder solutions to individuals. We present
                   two reusable personas that emerged from the research
                   and discuss how questionnaires can be used for
                   technology transfer.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/Wolters_McGee-Lennon_AAATE_Final.pdf},
  year = 2010
}
@inproceedings{renals2010b,
  author = {Renals, Steve},
  title = {Recognition and Understanding of Meetings},
  booktitle = {Proc. NAACL/HLT},
  pages = {1--9},
  abstract = {This paper is about interpreting human communication
                   in meetings using audio, video and other signals.
                   Automatic meeting recognition and understanding is
                   extremely challenging, since communication in a meeting
                   is spontaneous and conversational, and involves
                   multiple speakers and multiple modalities. This leads
                   to a number of significant research problems in signal
                   processing, in speech recognition, and in discourse
                   interpretation, taking account of both individual and
                   group behaviours. Addressing these problems requires an
                   interdisciplinary effort. In this paper, I discuss the
                   capture and annotation of multimodal meeting recordings
                   - resulting in the AMI meeting corpus - and how we have
                   built on this to develop techniques and applications
                   for the recognition and interpretation of meetings.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/renals-naacl10.pdf},
  year = 2010
}
@inproceedings{kilgour2010,
  author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
  title = {The {Ambient Spotlight}: Queryless desktop search from
                   meeting speech},
  booktitle = {Proc. ACM Multimedia 2010 Workshop SSCS 2010},
  abstract = {It has recently become possible to record any small
                   meeting using a laptop equipped with a plug-and-play
                   USB microphone array. We show the potential for such
                   recordings in a personal aid that allows project
                   managers to record their meetings and, when reviewing
                   them afterwards through a standard calendar interface,
                   to find relevant documents on their computer. This
                   interface is intended to supplement or replace the
                   textual searches that managers typically perform. The
                   prototype, which relies on meeting speech recognition
                   and topic segmentation, formulates and runs desktop
                   search queries in order to present its results.},
  doi = {10.1145/1878101.1878112},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/AmbientSpot.pdf},
  year = 2010
}
@inproceedings{wang_acmsccs2010,
  author = {Dong Wang and Simon King and Nick Evans and Raphael
                   Troncy},
  title = {Direct Posterior Confidence For Out-of-Vocabulary
                   Spoken Term Detection},
  booktitle = {Proc. ACM Multimedia 2010 Searching Spontaneous
                   Conversational Speech Workshop},
  abstract = {Spoken term detection (STD) is a fundamental task in
                   spoken information retrieval. Compared to conventional
                   speech transcription and keyword spotting, STD is an
                   open-vocabulary task and is necessarily required to
                   address out-of-vocabulary (OOV) terms. Approaches based
                   on subword units, e.g. phonemes, are widely used to
                   solve the OOV issue; however, performance on OOV terms
                   is still significantly inferior to that for
                   in-vocabulary (INV) terms. The performance degradation
                   on OOV terms can be attributed to a multitude of
                   factors. A particular factor we address in this paper
                   is that the acoustic and language models used for
                   speech transcribing are highly vulnerable to OOV terms,
                   which leads to unreliable confidence measures and
                   error-prone detections. A direct posterior confidence
                   measure that is derived from discriminative models has
                   been proposed for STD. In this paper, we utilize this
                   technique to tackle the weakness of OOV terms in
                   confidence estimation. Neither acoustic models nor
                   language models being included in the computation, the
                   new confidence avoids the weak modeling problem with
                   OOV terms. Our experiments, set up on multi-party
                   meeting speech which is highly spontaneous and
                   conversational, demonstrate that the proposed technique
                   improves STD performance on OOV terms significantly;
                   when combined with conventional lattice-based
                   confidence, a significant improvement in performance is
                   obtained on both INVs and OOVs. Furthermore, the new
                   confidence measure technique can be combined together
                   with other advanced techniques for OOV treatment, such
                   as stochastic pronunciation modeling and term-dependent
                   confidence discrimination, which leads to an integrated
                   solution for OOV STD with greatly improved performance.},
  categories = {confidence estimation, spoken term detection, speech
                   recognition},
  doi = {10.1145/1878101.1878107},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang_acmsccs2010.pdf},
  year = 2010
}
@misc{Carnival_SIGGRAPH_2010,
  author = {Michael Berger and Gregor Hofer and Hiroshi Shimodaira},
  title = {Carnival: a modular framework for automated facial
                   animation},
  howpublished = {Poster at SIGGRAPH 2010},
  note = {Bronze award winner, ACM Student Research Competition},
  abstract = {We present a software framework for speech- or
                   text-driven animation--including a platform-independent
                   API and an application implementing it--which unifies
                   state-of-the-art speech technology and graphics
                   technology within a single system.},
  address = {Los Angeles, Calif., USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/carnival.pdf},
  year = 2010
}
@incollection{king:gold_and_morgan_chapter2009,
  author = {Simon King},
  title = {Speech Synthesis},
  booktitle = {Speech and Audio Signal Processing},
  publisher = {Wiley},
  editor = {Morgan and Ellis},
  abstract = {No abstract (this is a book chapter)},
  categories = {speech synthesis},
  year = 2010
}
@inproceedings{ling_interspeech2010,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {{HMM}-based Text-to-Articulatory-Movement Prediction
                   and Analysis of Critical Articulators},
  booktitle = {Proc. Interspeech},
  pages = {2194--2197},
  address = {Makuhari, Japan},
  abstract = {In this paper we present a method to predict the
                   movement of a speaker's mouth from text input using
                   hidden Markov models (HMM). We have used a corpus of
                   human articulatory movements, recorded by
                   electromagnetic articulography (EMA), to train HMMs. To
                   predict articulatory movements from text, a suitable
                   model sequence is selected and the maximum-likelihood
                   parameter generation (MLPG) algorithm is used to
                   generate output articulatory trajectories. In our
                   experiments, we find that fully context-dependent
                   models outperform monophone and quinphone models,
                   achieving an average root mean square (RMS) error of
                   1.945 mm when state durations are predicted from text,
                   and 0.872 mm when natural state durations are used.
                   Finally, we go on to analyze the prediction error for
                   different EMA dimensions and phone types. We find a
                   clear pattern emerges that the movements of so-called
                   critical articulators can be predicted more accurately
                   than the average performance.},
  keywords = {Hidden Markov model, articulatory features, parameter
                   generation, critical articulators},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100349.pdf},
  year = 2010
}
@article{roberto:specom2010,
  author = {R. Barra-Chicote and J. Yamagishi and S. King and J.
                   Manuel Montero and J. Macias-Guarasa},
  title = {Analysis of Statistical Parametric and Unit-Selection
                   Speech Synthesis Systems Applied to Emotional Speech},
  journal = {Speech Communication},
  volume = {52},
  number = {5},
  pages = {394--404},
  abstract = {We have applied two state-of-the-art speech synthesis
                   techniques (unit selection and HMM-based synthesis) to
                   the synthesis of emotional speech. A series of
                   carefully designed perceptual tests to evaluate speech
                   quality, emotion identification rates and emotional
                   strength were used for the six emotions which we
                   recorded -- happiness, sadness, anger, surprise, fear,
                   disgust. For the HMM-based method, we evaluated
                   spectral and source components separately and
                   identified which components contribute to which
                   emotion. Our analysis shows that, although the HMM
                   method produces significantly better neutral speech,
                   the two methods produce emotional speech of similar
                   quality, except for emotions having context-dependent
                   prosodic patterns. Whilst synthetic speech produced
                   using the unit selection method has better emotional
                   strength scores than the HMM-based method, the
                   HMM-based method has the ability to manipulate the
                   emotional strength. For emotions that are characterized
                   by both spectral and prosodic components, synthetic
                   speech using unit selection methods was more accurately
                   identified by listeners. For emotions mainly
                   characterized by prosodic components, HMM-based
                   synthetic speech was more accurately identified. This
                   finding differs from previous results regarding
                   listener judgements of speaker similarity for neutral
                   speech. We conclude that unit selection methods require
                   improvements to prosodic modeling and that HMM-based
                   methods require improvements to spectral modeling for
                   emotional speech. Certain emotions cannot be reproduced
                   well by either method.},
  doi = {10.1016/j.specom.2009.12.007},
  keywords = {Emotional speech synthesis; HMM-based synthesis; Unit
                   selection},
  month = may,
  year = 2010
}
@inproceedings{janska_clark:2010a,
  author = {Anna C. Janska and Robert A. J. Clark},
  title = {Native and Non-Native Speaker Judgements on the
                   Quality of Synthesized Speech},
  booktitle = {Proc. Interspeech},
  pages = {1121--1124},
  abstract = {The difference between native speakers' and non-native
                   speakers' naturalness judgements of synthetic speech
                   is investigated. Similarity/difference judgements are
                   analysed via a multidimensional scaling analysis and
                   compared to mean opinion scores. It is shown that
                   although the two groups generally behave in a similar
                   manner, the variance of non-native speaker judgements is
                   generally higher. While both groups of subjects can
                   clearly distinguish natural speech from the best
                   synthetic examples, the groups' responses to different
                   artefacts present in the synthetic speech can vary.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/janskaclark_interspeech2010.pdf},
  year = 2010
}
@techreport{wester_accent2010,
  author = {Wester, M.},
  title = {The {EMIME} {B}ilingual {D}atabase},
  institution = {The University of Edinburgh},
  number = {EDI-INF-RR-1388},
  abstract = {This paper describes the collection of a bilingual
                   database of Finnish/English and German/English data. In
                   addition, the accents of the talkers in the database
                   have been rated. English, German and Finnish listeners
                   assessed the English, German and Finnish talkers'
                   degree of foreign accent in English. Native English
                   listeners showed higher inter-listener agreement than
                   non-native listeners. Further analyses showed that
                   non-native listeners judged Finnish and German female
                   talkers to be significantly less accented than do
                   English listeners. German males are judged less
                   accented by Finnish listeners than they are by English
                   and German listeners and there is no difference between
                   listeners as to how they judge the accent of Finnish
                   males. Finally, all English talkers are judged more
                   accented by non-native listeners than they are by
                   native English listeners.},
  categories = {evaluation,cross-lingual, accent rating},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_accent_2010.pdf},
  year = 2010
}
@article{5510125,
  author = {Wang, D. and King, S. and Frankel, J.},
  title = {Stochastic Pronunciation Modelling for
                   Out-of-Vocabulary Spoken Term Detection},
  journal = {IEEE Transactions on Audio, Speech, and Language
                   Processing},
  volume = {PP},
  number = {99},
  abstract = {Spoken term detection (STD) is the name given to the
                   task of searching large amounts of audio for
                   occurrences of spoken terms, which are typically single
                   words or short phrases. One reason that STD is a hard
                   task is that search terms tend to contain a
                   disproportionate number of out-of-vocabulary (OOV)
                   words. The most common approach to STD uses subword
                   units. This, in conjunction with some method for
                   predicting pronunciations of OOVs from their written
                   form, enables the detection of OOV terms but
                   performance is considerably worse than for
                   in-vocabulary terms. This performance differential can
                   be largely attributed to the special properties of
                   OOVs. One such property is the high degree of
                   uncertainty in the pronunciation of OOVs. We present a
                   stochastic pronunciation model (SPM) which explicitly
                   deals with this uncertainty. The key insight is to
                   search for all possible pronunciations when detecting
                   an OOV term, explicitly capturing the uncertainty in
                   pronunciation. This requires a probabilistic model of
                   pronunciation, able to estimate a distribution over all
                   possible pronunciations. We use a joint-multigram model
                   (JMM) for this and compare the JMM-based SPM with the
                   conventional soft match approach. Experiments using
                   speech from the meetings domain demonstrate that the
                   SPM performs better than soft match in most operating
                   regions, especially at low false alarm probabilities.
                   Furthermore, SPM and soft match are found to be
                   complementary: their combination provides further
                   performance gains.},
  categories = {confidence estimation, spoken term detection, speech
                   recognition, OOVs},
  doi = {10.1109/TASL.2010.2058800},
  issn = {1558-7916},
  month = jul,
  year = 2010
}
@inproceedings{phillip:icassp2010,
  author = {P. L. De Leon and V. R. Apsingekar and M. Pucher and
                   J. Yamagishi},
  title = {Revisiting the security of speaker verification
                   systems against imposture using synthetic speech},
  booktitle = {{Proc. ICASSP 2010}},
  address = {Dallas, Texas, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/main_r2.pdf},
  year = 2010
}
@inproceedings{felps_interspeech2010,
  author = {Felps, Daniel and Geng, Christian and Berger, Michael
                   and Richmond, Korin and Gutierrez-Osuna, Ricardo},
  title = {Relying on critical articulators to estimate vocal
                   tract spectra in an articulatory-acoustic database},
  booktitle = {Proc. Interspeech},
  pages = {1990--1993},
  abstract = {We present a new phone-dependent feature weighting
                   scheme that can be used to map articulatory
                   configurations (e.g. EMA) onto vocal tract spectra
                   (e.g. MFCC) through table lookup. The approach consists
                   of assigning feature weights according to a feature's
                   ability to predict the acoustic distance between
                   frames. Since an articulator's predictive accuracy is
                   phone-dependent (e.g., lip location is a better
                   predictor for bilabial sounds than for palatal sounds),
                   a unique weight vector is found for each phone.
                   Inspection of the weights reveals a correspondence with
                   the expected critical articulators for many phones. The
                   proposed method reduces overall cepstral error by 6\%
                   when compared to a uniform weighting scheme. Vowels
                   show the greatest benefit, though improvements occur
                   for 80\% of the tested phones.},
  keywords = {speech production, speech synthesis},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100076.pdf},
  year = 2010
}
@article{wolters-uais:10,
  author = {Maria Wolters and Klaus-Peter Engelbrecht and Florian
                   G\"odde and Sebastian M\"oller and Anja Naumann and
                   Robert Schleicher},
  title = {Making it Easier for Older People to Talk to Smart
                   Homes: Using Help Prompts to Shape Users' Speech},
  journal = {Universal Access in the Information Society},
  volume = {9},
  number = {4},
  pages = {311--325},
  abstract = {It is well known that help prompts shape how users
                   talk to spoken dialogue systems. This study
                   investigated the effect of help prompt placement on
                   older users' interaction with a smart home interface.
                   In the dynamic help condition, help was only given in
                   response to system errors; in the inherent help
                   condition, it was also given at the start of each task.
                   Fifteen older and sixteen younger users interacted with
                   a smart home system using two different scenarios. Each
                   scenario consisted of several tasks. The linguistic
                   style users employed to communicate with the system
                   (interaction style) was measured using the ratio of
                   commands to the overall utterance length (keyword
                   ratio) and the percentage of content words in the
                   user's utterance that could be understood by the system
                   (shared vocabulary). While the timing of help prompts
                   did not affect the interaction style of younger users,
                   early task-specific help supported older users
                   in adapting their interaction style to the system's
                   capabilities. Well-placed help prompts can
                   significantly increase the usability of spoken dialogue
                   systems for older people.},
  categories = {spoken dialogue systems, usability, older adults,
                   smart homes, help prompts},
  doi = {10.1007/s10209-009-0184-x},
  year = 2010
}
@article{white_clark_moore:2010,
  author = {Michael White and Robert A. J. Clark and Johanna D.
                   Moore},
  title = {Generating Tailored, Comparative Descriptions with
                   Contextually Appropriate Intonation},
  journal = {Computational Linguistics},
  volume = {36},
  number = {2},
  pages = {159--201},
  abstract = {Generating responses that take user preferences into
                   account requires adaptation at all levels of the
                   generation process. This article describes a
                   multi-level approach to presenting user-tailored
                   information in spoken dialogues which brings together
                   for the first time multi-attribute decision models,
                   strategic content planning, surface realization that
                   incorporates prosody prediction, and unit selection
                   synthesis that takes the resulting prosodic structure
                   into account. The system selects the most important
                   options to mention and the attributes that are most
                   relevant to choosing between them, based on the user
                   model. Multiple options are selected when each offers a
                   compelling trade-off. To convey these trade-offs, the
                   system employs a novel presentation strategy which
                   straightforwardly lends itself to the determination of
                   information structure, as well as the contents of
                   referring expressions. During surface realization, the
                   prosodic structure is derived from the information
                   structure using Combinatory Categorial Grammar in a way
                   that allows phrase boundaries to be determined in a
                   flexible, data-driven fashion. This approach to
                   choosing pitch accents and edge tones is shown to yield
                   prosodic structures with significantly higher
                   acceptability than baseline prosody prediction models
                   in an expert evaluation. These prosodic structures are
                   then shown to enable perceptibly more natural synthesis
                   using a unit selection voice that aims to produce the
                   target tunes, in comparison to two baseline synthetic
                   voices. An expert evaluation and f0 analysis confirm
                   the superiority of the generator-driven intonation and
                   its contribution to listeners' ratings.},
  doi = {10.1162/coli.09-023-R1-08-002},
  year = 2010
}
@inproceedings{friedrich:COST2102,
  author = {Michael Pucher and Friedrich Neubarth and Volker Strom},
  title = {Optimizing Phonetic Encoding for {V}iennese Unit
                   Selection Speech Synthesis},
  booktitle = {COST 2102 Int. Training School 2009, LNCS},
  editor = {A. Esposito et al.},
  address = {Heidelberg},
  publisher = {Springer-Verlag},
  abstract = {While developing lexical resources for a particular
                   language variety (Viennese), we experimented with a set
                   of 5 different phonetic encodings, termed phone sets,
                   used for unit selection speech synthesis. We started
                   with a very rich phone set based on phonological
                   considerations and covering as much phonetic
                   variability as possible, which was then reduced to
                   smaller sets by applying transformation rules that map
                   or merge phone symbols. The optimal trade-off was found
                   measuring the phone error rates of automatically learnt
                   grapheme-to-phone rules and by a perceptual evaluation
                   of 27 representative synthesized sentences. Further, we
                   describe a method to semi-automatically enlarge the
                   lexical resources for the target language variety using
                   a lexicon base for Standard Austrian German.},
  categories = {speech synthesis, language varieties, phonetic
                   encoding, grapheme-to-phone, pronunciation lexicon.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/COST2102.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/COST2102.ps},
  year = 2010
}
@article{huang2010,
  author = {Huang, Songfang and Renals, Steve},
  title = {Hierarchical {Bayesian} Language Models for
                   Conversational Speech Recognition},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {18},
  number = {8},
  pages = {1941--1954},
  abstract = {Traditional n-gram language models are widely used in
                   state-of-the-art large vocabulary speech recognition
                   systems. This simple model suffers from some
                   limitations, such as overfitting of maximum-likelihood
                   estimation and the lack of rich contextual knowledge
                   sources. In this paper, we exploit a hierarchical
                   Bayesian interpretation for language modeling, based on
                   a nonparametric prior called the Pitman--Yor process.
                   This offers a principled approach to language model
                   smoothing, embedding the power-law distribution for
                   natural language. Experiments on the recognition of
                   conversational speech in multiparty meetings
                   demonstrate that by using hierarchical Bayesian
                   language models, we are able to achieve significant
                   reductions in perplexity and word error rate.},
  doi = {10.1109/TASL.2010.2040782},
  keywords = {AMI corpus, conversational speech recognition,
                   hierarchical Bayesian model, language model (LM),
                   meetings, smoothing},
  month = {January},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-taslp10.pdf},
  url = {http://dx.doi.org/10.1109/TASL.2010.2040782},
  year = 2010
}
@inproceedings{richmond_interspeech2010,
  author = {Richmond, Korin and Clark, Robert and Fitt, Sue},
  title = {On Generating {C}ombilex Pronunciations via
                   Morphological Analysis},
  booktitle = {Proc. Interspeech},
  pages = {1974--1977},
  address = {Makuhari, Japan},
  abstract = {Combilex is a high-quality lexicon that has been
                   developed specifically for speech technology purposes
                   and recently released by CSTR. Combilex benefits from
                   many advanced features. This paper explores one of
                   these: the ability to generate fully-specified
                   transcriptions for morphologically derived words
                   automatically. This functionality was originally
                   implemented to encode the pronunciations of derived
                   words in terms of their constituent morphemes, thus
                   accelerating lexicon development and ensuring a high
                   level of consistency. In this paper, we propose this
                   method of modelling pronunciations can be exploited
                   further by combining it with a morphological parser,
                   thus yielding a method to generate full transcriptions
                   for unknown derived words. Not only could this
                   accelerate adding new derived words to Combilex, but it
                   could also serve as an alternative to conventional
                   letter-to-sound rules. This paper presents preliminary
                   work indicating this is a promising direction.},
  keywords = {combilex lexicon, letter-to-sound rules,
                   grapheme-to-phoneme conversion, morphological
                   decomposition},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100683.pdf},
  year = 2010
}
@inproceedings{yong:ssw7,
  author = {Yong Guan and Jilei Tian and Yi-Jian Wu and Junichi
                   Yamagishi and Jani Nurminen},
  title = {A Unified and Automatic Approach Of {M}andarin {HTS}
                   System},
  booktitle = {{Proc. SSW7}},
  address = {Kyoto, Japan},
  keywords = {HTS, speech synthesis, Mandarin},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/HTS_Yong_ssw7.pdf},
  year = 2010
}
@inproceedings{wester:interspeech:10,
  author = {Mirjam Wester},
  title = {Cross-lingual talker discrimination},
  booktitle = {Proc. of Interspeech},
  address = {Makuhari, Japan},
  abstract = {This paper describes a talker discrimination
                   experiment in which native English listeners were
                   presented with two sentences spoken by bilingual
                   talkers (English/German and English/Finnish) and were
                   asked to judge whether they thought the sentences were
                   spoken by the same person or not. Equal amounts of
                   cross-lingual and matched-language trials were
                   presented. The experiments showed that listeners are
                   able to complete this task well, they can discriminate
                   between talkers significantly better than chance.
                   However, listeners are significantly less accurate on
                   cross-lingual talker trials than on matched-language
                   pairs. No significant differences were found on this
                   task between German and Finnish. Bias (B'') and
                   Sensitivity (A') values are presented to analyse the
                   listeners' behaviour in more detail. The results are
                   promising for the evaluation of EMIME, a project
                   covering speech-to-speech translation with speaker
                   adaptation.},
  categories = {evaluation},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_interspeech2010.pdf},
  year = 2010
}
@inproceedings{friedrich:lrec2010,
  author = {Michael Pucher and Friedrich Neubarth and Volker Strom
                   and Sylvia Moosmüller and Gregor Hofer and Christian
                   Kranzler and Gudrun Schuchmann and Dietmar Schabus},
  title = {Resources for speech synthesis of Viennese varieties},
  booktitle = {Proc.~Int.~Conf.~on Language Resources and Evaluation,
                   LREC'10},
  address = {Malta},
  publisher = {European Language Resources Association (ELRA)},
  abstract = {This paper describes our work on developing corpora of
                   three varieties of Viennese for unit selection speech
                   synthesis. The synthetic voices for Viennese varieties,
                   implemented with the open domain unit selection speech
                   synthesis engine Multisyn of Festival will also be
                   released within Festival. The paper especially focuses
                   on two questions: how we selected the appropriate
                   speakers and how we obtained the text sources needed
                   for the recording of these non-standard varieties.
                   Regarding the first one, it turned out that working
                   with a ‘prototypical’ professional speaker was much
                   more preferable than striving for authenticity. In
                   addition, we give a brief outline about the differences
                   between the Austrian standard and its dialectal
                   varieties and how we solved certain technical problems
                   that are related to these differences. In particular,
                   the specific set of phones applicable to each variety
                   had to be determined by applying various constraints.
                   Since such a set does not serve any descriptive
                   purposes but rather is influencing the quality of
                   speech synthesis, a careful design of such a (in most
                   cases reduced) set was an important task.},
  categories = {speech synthesis, language varieties, phonetic
                   encoding, grapheme-to-phone, pronunciation lexicon.},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/lrec2010_viennese.pdf},
  ps = {http://www.cstr.ed.ac.uk/downloads/publications/2010/lrec2010_viennese.ps},
  year = 2010
}
@inproceedings{janska_clark:2010b,
  author = {Anna C. Janska and Robert A. J. Clark},
  title = {Further exploration of the possibilities and pitfalls
                   of multidimensional scaling as a tool for the
                   evaluation of the quality of synthesized speech},
  booktitle = {The 7th ISCA Tutorial and Research Workshop on Speech
                   Synthesis},
  pages = {142--147},
  abstract = {Multidimensional scaling (MDS) has been suggested as a
                   useful tool for the evaluation of the quality of
                   synthesized speech. However, it has not yet been
                   extensively tested for its application in this
                   specific area of evaluation. In a series of
                   experiments based on data from the Blizzard Challenge
                   2008, the relation between Weighted Euclidean Distance
                   Scaling and Simple Euclidean Distance Scaling is
                   investigated to understand how aggregating data affects
                   the MDS configuration. These results are compared to
                   those collected as mean opinion scores (MOS). The ranks
                   correspond, and MOS can be predicted from an object's
                   position in the MDS-generated stimulus space. The big
                   advantage of MDS over MOS is its diagnostic value;
                   dimensions along which stimuli vary are not correlated,
                   as is the case in modular evaluation using MOS.
                   Finally, it will be attempted to generalize from the
                   MDS representations of the thoroughly tested subset to
                   the aggregated data of the larger-scale Blizzard
                   Challenge.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/janskaclark_ssw7.pdf},
  year = 2010
}
@inproceedings{cabral_ssw7,
  author = {Cabral, Jo{\~a}o and Renals, Steve and Richmond, Korin
                   and Yamagishi, Junichi},
  title = {Transforming Voice Source Parameters in a {HMM}-based
                   Speech Synthesiser with Glottal Post-Filtering},
  booktitle = {Proc. 7th ISCA Speech Synthesis Workshop (SSW7)},
  pages = {365--370},
  address = {NICT/ATR, Kyoto, Japan},
  abstract = {Control over voice quality, e.g. breathy and tense
                   voice, is important for speech synthesis applications.
                   For example, transformations can be used to modify
                   aspects of the voice related to the speaker's identity
                   and to improve expressiveness. However, it is hard to
                   modify voice characteristics of the synthetic speech
                   without degrading speech quality. State-of-the-art
                   statistical speech synthesisers, in particular, do not
                   typically allow control over parameters of the
                   glottal source, which are strongly correlated with
                   voice quality. Consequently, the control of voice
                   characteristics in these systems is limited. In
                   contrast, the HMM-based speech synthesiser proposed in
                   this paper uses an acoustic glottal source model. The
                   system passes the glottal signal through a whitening
                   filter to obtain the excitation of voiced sounds. This
                   technique, called glottal post-filtering, makes it
                   possible to transform voice characteristics of the
                   synthetic speech by modifying the source model
                   parameters. We evaluated the proposed synthesiser in a
                   perceptual experiment, in terms of speech naturalness,
                   intelligibility, and similarity to the original
                   speaker's voice. The results show that it performed as
                   well as an HMM-based synthesiser which generates the
                   speech signal with a commonly used high-quality speech
                   vocoder.},
  keywords = {HMM-based speech synthesis, voice quality, glottal
                   post-filter},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/cabral_ssw7.pdf},
  year = 2010
}
@inproceedings{vipperla2010a,
  author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
                   Joe},
  title = {Augmentation of adaptation data},
  booktitle = {Proc. Interspeech},
  pages = {530--533},
  address = {Makuhari, Japan},
  abstract = {Linear regression based speaker adaptation approaches
                   can improve Automatic Speech Recognition (ASR) accuracy
                   significantly for a target speaker. However, when the
                   available adaptation data is limited to a few seconds,
                   the accuracy of the speaker adapted models is often
                   worse compared with speaker independent models. In this
                   paper, we propose an approach to select a set of
                   reference speakers acoustically close to the target
                   speaker whose data can be used to augment the
                   adaptation data. To determine the acoustic similarity
                   of two speakers, we propose a distance metric based on
                   transforming sample points in the acoustic space with
                   the regression matrices of the two speakers. We show
                   the validity of this approach through a speaker
                   identification task. ASR results on SCOTUS and AMI
                   corpora with limited adaptation data of 10 to 15
                   seconds augmented by data from selected reference
                   speakers show a significant improvement in Word Error
                   Rate over speaker independent and speaker adapted
                   models.},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-is2010.pdf},
  year = 2010
}
@article{Ling2010834,
  author = {Zhen-Hua Ling and Korin Richmond and Junichi Yamagishi},
  title = {An Analysis of {HMM}-based prediction of articulatory
                   movements},
  journal = {Speech Communication},
  volume = {52},
  number = {10},
  pages = {834--846},
  abstract = { This paper presents an investigation into predicting
                   the movement of a speaker's mouth from text input using
                   hidden Markov models (HMM). A corpus of human
                   articulatory movements, recorded by electromagnetic
                   articulography (EMA), is used to train HMMs. To predict
                   articulatory movements for input text, a suitable model
                   sequence is selected and a maximum-likelihood parameter
                   generation (MLPG) algorithm is used to generate output
                   articulatory trajectories. Unified
                   acoustic-articulatory HMMs are introduced to integrate
                   acoustic features when an acoustic signal is also
                   provided with the input text. Several aspects of this
                   method are analyzed in this paper, including the
                   effectiveness of context-dependent modeling, the role
                   of supplementary acoustic input, and the
                   appropriateness of certain model structures for the
                   unified acoustic-articulatory models. When text is the
                   sole input, we find that fully context-dependent models
                   significantly outperform monophone and quinphone
                   models, achieving an average root mean square (RMS)
                   error of 1.945 mm and an average correlation
                   coefficient of 0.600. When both text and acoustic
                   features are given as input to the system, the
                   difference between the performance of quinphone models
                   and fully context-dependent models is no longer
                   significant. The best performance overall is achieved
                   using unified acoustic-articulatory quinphone HMMs with
                   separate clustering of acoustic and articulatory model
                   parameters, a synchronous-state sequence, and a
                   dependent-feature model structure, with an RMS error of
                   0.900 mm and a correlation coefficient of 0.855 on
                   average. Finally, we also apply the same quinphone HMMs
                   to the acoustic-articulatory, or inversion, mapping
                   problem, where only acoustic input is available. An
                   average root mean square (RMS) error of 1.076 mm and an
                   average correlation coefficient of 0.812 are achieved.
                   Taken together, our results demonstrate how text and
                   acoustic inputs both contribute to the prediction of
                   articulatory movements in the method used.},
  doi = {10.1016/j.specom.2010.06.006},
  issn = {0167-6393},
  keywords = {Hidden Markov model; Articulatory features; Parameter
                   generation},
  month = {October},
  year = 2010
}
@inproceedings{huang2010a,
  author = {Huang, Songfang and Renals, Steve},
  title = {Power Law Discounting for N-Gram Language Models},
  booktitle = {Proc. IEEE ICASSP--10},
  pages = {5178--5181},
  abstract = {We present an approximation to the Bayesian
                   hierarchical Pitman-Yor process language model which
                   maintains the power law distribution over word tokens,
                   while not requiring a computationally expensive
                   approximate inference process. This approximation,
                   which we term power law discounting, has a similar
                   computational complexity to interpolated and modified
                   Kneser-Ney smoothing. We performed experiments on
                   meeting transcription using the NIST RT06s evaluation
                   data and the AMI corpus, with a vocabulary of 50,000
                   words and a language model training set of up to 211
                   million words. Our results indicate that power law
                   discounting results in statistically significant
                   reductions in perplexity and word error rate compared
                   to both interpolated and modified Kneser-Ney smoothing,
                   while producing similar results to the hierarchical
                   Pitman-Yor process language model.},
  doi = {10.1109/ICASSP.2010.5495007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-icassp10.pdf},
  url = {http://dx.doi.org/10.1109/ICASSP.2010.5495007},
  year = 2010
}
@inproceedings{wolters2010,
  author = {Wolters, Maria K. and Isaac, Karl B. and Renals, Steve},
  title = {Evaluating speech synthesis intelligibility using
                   {Amazon Mechanical Turk}},
  booktitle = {Proc. 7th Speech Synthesis Workshop (SSW7)},
  pages = {136--141},
  abstract = {Microtask platforms such as Amazon Mechanical Turk
                   (AMT) are increasingly used to create speech and
                   language resources. AMT in particular allows
                   researchers to quickly recruit a large number of fairly
                   demographically diverse participants. In this study, we
                   investigated whether AMT can be used for comparing the
                   intelligibility of speech synthesis systems. We
                   conducted two experiments in the lab and via AMT, one
                   comparing US English diphone to US English
                   speaker-adaptive HTS synthesis and one comparing UK
                   English unit selection to UK English speaker-dependent
                   HTS synthesis. While AMT word error rates were worse
                   than lab error rates, AMT results were more sensitive
                   to relative differences between systems. This is mainly
                   due to the larger number of listeners. Boxplots and
                   multilevel modelling allowed us to identify listeners
                   who performed particularly badly, while thresholding
                   was sufficient to eliminate rogue workers. We conclude
                   that AMT is a viable platform for synthetic speech
                   intelligibility comparisons.},
  categories = {intelligibility, evaluation, semantically
                   unpredictable sentences, diphone, unit selection,
                   crowdsourcing, Mechanical Turk, HMM-based synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wolters-ssw2010.pdf},
  year = 2010
}
@inproceedings{phillip:odyssey2010,
  author = {P.L. De Leon and M. Pucher and J. Yamagishi},
  title = {Evaluation of the Vulnerability of Speaker
                   Verification to Synthetic Speech},
  booktitle = {{Proc. Odyssey (The speaker and language recognition
                   workshop) 2010}},
  address = {Brno, Czech Republic},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/main_v2.pdf},
  year = 2010
}
@incollection{renals2010,
  author = {Renals, Steve and King, Simon},
  title = {Automatic Speech Recognition},
  booktitle = {Handbook of Phonetic Sciences},
  publisher = {Wiley Blackwell},
  editor = {Hardcastle, William J. and Laver, John and Gibbon,
                   Fiona E.},
  chapter = {22},
  year = 2010
}
@article{vipperla2010,
  author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
                   Joe},
  title = {Ageing voices: The effect of changes in voice
                   parameters on {ASR} performance},
  journal = {EURASIP Journal on Audio, Speech, and Music Processing},
  abstract = {With ageing, human voices undergo several changes
                   which are typically characterized by increased
                   hoarseness and changes in articulation patterns. In
                   this study, we have examined the effect on Automatic
                   Speech Recognition (ASR) and found that the Word Error
                   Rates (WER) on older voices is about 9\% absolute
                   higher compared to those of adult voices. Subsequently,
                   we compared several voice source parameters including
                   fundamental frequency, jitter, shimmer, harmonicity and
                   cepstral peak prominence of adult and older males.
                   Several of these parameters show statistically
                   significant differences between the two groups.
                   However, artificially increasing jitter and shimmer
                   does not affect ASR accuracy significantly.
                   Artificially lowering the fundamental frequency
                   degrades the ASR performance marginally but this drop
                   in performance can be overcome to some extent using
                   Vocal Tract Length Normalisation (VTLN). Overall, we
                   observe that the changes in the voice source parameters
                   do not have a significant impact on ASR performance.
                   Comparison of the likelihood scores of all the phonemes
                   for the two age groups shows that there is a systematic
                   mismatch in the acoustic space of the two age groups.
                   Comparison of the phoneme recognition rates shows that
                   mid vowels, nasals and phonemes whose articulation
                   depends on the ability to create constrictions with
                   the tongue tip are more affected by ageing than other
                   phonemes.},
  doi = {10.1155/2010/525783},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-eurasip10.pdf},
  url = {http://dx.doi.org/10.1155/2010/525783},
  year = 2010
}
@inproceedings{oura:icassp:10,
  author = {Keiichiro Oura and Keiichi Tokuda and Junichi
                   Yamagishi and Mirjam Wester and Simon King},
  title = {Unsupervised Cross-lingual Speaker Adaptation for
                   {HMM}-based Speech Synthesis},
  booktitle = {Proc. of ICASSP},
  volume = {I},
  pages = {4954--4957},
  abstract = {In the EMIME project, we are developing a mobile
                   device that performs personalized speech-to-speech
                   translation such that a user's spoken input in one
                   language is used to produce spoken output in another
                   language, while continuing to sound like the user's
                   voice. We integrate two techniques, unsupervised
                   adaptation for HMM-based TTS using a word-based
                   large-vocabulary continuous speech recognizer and
                   cross-lingual speaker adaptation for HMM-based TTS,
                   into a single architecture. Thus, an unsupervised
                   cross-lingual speaker adaptation system can be
                   developed. Listening tests show very promising results,
                   demonstrating that adapted voices sound similar to the
                   target speaker and that differences between supervised
                   and unsupervised cross-lingual speaker adaptation are
                   small.},
  categories = {speaker adaptation, TTS},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/oura_icassp2010.pdf},
  year = 2010
}
@misc{Hofer_Berger:sigg2010,
  author = {Gregor Hofer and Korin Richmond and Michael Berger},
  title = {Lip Synchronization by Acoustic Inversion},
  howpublished = {Poster presented at SIGGRAPH 2010},
  address = {Los Angeles, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/lipsync-sig10.pdf},
  year = 2010
}
@incollection{renals2010a,
  author = {Renals, Steve and Hain, Thomas},
  title = {Speech Recognition},
  booktitle = {Handbook of Computational Linguistics and Natural
                   Language Processing},
  publisher = {Wiley Blackwell},
  editor = {Clark, Alex and Fox, Chris and Lappin, Shalom},
  year = 2010
}
@inproceedings{wang_interspeech10,
  author = {Dong Wang and Simon King and Nick Evans and Raphael
                   Troncy},
  title = {{CRF}-based Stochastic Pronunciation Modelling for
                   Out-of-Vocabulary Spoken Term Detection},
  booktitle = {Proc. Interspeech},
  address = {Makuhari, Chiba, Japan},
  abstract = {Out-of-vocabulary (OOV) terms present a significant
                   challenge to spoken term detection (STD). This
                   challenge, to a large extent, lies in the high degree
                   of uncertainty in pronunciations of OOV terms. In
                   previous work, we presented a stochastic pronunciation
                   modeling (SPM) approach to compensate for this
                   uncertainty. A shortcoming of our original work,
                   however, is that the SPM was based on a joint-multigram
                   model (JMM), which is suboptimal. In this paper, we
                   propose to use conditional random fields (CRFs) for
                   letter-to-sound conversion, which significantly
                   improves quality of the predicted pronunciations. When
                   applied to OOV STD, we achieve considerable
                   performance improvement with both a 1-best system and
                   an SPM-based system.},
  categories = {speech recognition, spoken term detection, conditional
                   random field, joint multigram model},
  month = sep,
  year = 2010
}
@inproceedings{strom10d,
  author = {Volker Strom and Simon King},
  title = {A classifier-based target cost for unit selection
                   speech synthesis trained on perceptual data},
  booktitle = {Proc. Interspeech},
  address = {Makuhari, Japan},
  abstract = {Our goal is to automatically learn a
                   PERCEPTUALLY-optimal target cost function for a unit
                   selection speech synthesiser. The approach we take here
                   is to train a classifier on human perceptual judgements
                   of synthetic speech. The output of the classifier is
                   used to make a simple three-way distinction rather than
                   to estimate a continuously-valued cost. In order to
                   collect the necessary perceptual data, we synthesised
                   145,137 short sentences with the usual target cost
                   switched off, so that the search was driven by the join
                   cost only. We then selected the 7200 sentences with the
                   best joins and asked 60 listeners to judge them,
                   providing their ratings for each syllable. From this,
                   we derived a rating for each demiphone. Using as input
                   the same context features employed in our conventional
                   target cost function, we trained a classifier on these
                   human perceptual ratings. We synthesised two sets of
                   test sentences with both our standard target cost and
                   the new target cost based on the classifier. A/B
                   preference tests showed that the classifier-based
                   target cost, which was learned completely automatically
                   from modest amounts of perceptual data, is almost as
                   good as our carefully- and expertly-tuned standard
                   target cost.},
  categories = {speech synthesis, unit selection, target cost},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/strom10d.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/strom10d.ps},
  year = 2010
}
@inproceedings{wang_std_covariance_icassp2010,
  author = {Wang, Dong and King, Simon and Frankel, Joe and Bell,
                   Peter},
  title = {Stochastic Pronunciation Modelling and Soft Match for
                   Out-of-vocabulary Spoken Term Detection},
  booktitle = {Proc. ICASSP},
  address = {Dallas, Texas, USA},
  abstract = {A major challenge faced by a spoken term detection
                   (STD) system is the detection of out-of-vocabulary
                   (OOV) terms. Although a subword-based STD system is
                   able to detect OOV terms, performance reduction is
                   always observed compared to in-vocabulary terms. One
                   challenge that OOV terms bring to STD is the
                   pronunciation uncertainty. A commonly used approach to
                   address this problem is a soft matching procedure, and
                   the other is the stochastic pronunciation modelling
                   (SPM) proposed by the authors. In this paper we compare
                   these two approaches, and combine them using a
                   discriminative decision strategy. Experimental results
                   demonstrated that SPM and soft match are highly
                   complementary, and their combination gives significant
                   performance improvement to OOV term detection.},
  keywords = {confidence estimation, spoken term detection, speech
                   recognition},
  month = mar,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang10_icassp.pdf},
  year = 2010
}
@inproceedings{georgila-sigdial:10,
  author = {Georgila, Kallirroi and Wolters, Maria and Moore,
                   Johanna D.},
  title = {Learning Dialogue Strategies from Older and Younger
                   Simulated Users},
  booktitle = {Proc. SIGDIAL},
  abstract = {Older adults are a challenging user group because
                   their behaviour can be highly variable. To the best of
                   our knowledge, this is the first study where dialogue
                   strategies are learned and evaluated with both
                   simulated younger users and simulated older users. The
                   simulated users were derived from a corpus of
                   interactions with a strict system-initiative spoken
                   dialogue system (SDS). Learning from simulated younger
                   users leads to a policy which is close to one of the
                   dialogue strategies of the underlying SDS, while the
                   simulated older users allow us to learn more flexible
                   dialogue strategies that accommodate mixed initiative.
                   We conclude that simulated users are a useful technique
                   for modelling the behaviour of new user groups.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/sigdial_final.pdf},
  year = 2010
}
@inproceedings{higher_level,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {The role of higher-level linguistic features in
                   {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {841--844},
  address = {Makuhari, Japan},
  abstract = {We analyse the contribution of higher-level elements
                   of the linguistic specification of a data-driven speech
                   synthesiser to the naturalness of the synthetic speech
                   which it generates. The system is trained using various
                   subsets of the full feature-set, in which features
                   relating to syntactic category, intonational phrase
                   boundary, pitch accent and boundary tones are
                   selectively removed. Utterances synthesised by the
                   different configurations of the system are then
                   compared in a subjective evaluation of their
                   naturalness. The work presented forms background
                   analysis for an ongoing set of experiments in
                   performing text-to-speech (TTS) conversion based on
                   shallow features: features that can be trivially
                   extracted from text. By building a range of systems,
                   each assuming the availability of a different level of
                   linguistic annotation, we obtain benchmarks for our
                   on-going work.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100565.pdf},
  year = 2010
}
@inproceedings{hofer_interspeech2010,
  author = {Hofer, Gregor and Richmond, Korin},
  title = {Comparison of {HMM} and {TMDN} Methods for Lip
                   Synchronisation},
  booktitle = {Proc. Interspeech},
  pages = {454--457},
  address = {Makuhari, Japan},
  abstract = {This paper presents a comparison between a hidden
                   Markov model (HMM) based method and a novel artificial
                   neural network (ANN) based method for lip
                   synchronisation. Both model types were trained on
                   motion tracking data, and a perceptual evaluation was
                   carried out comparing the output of the models, both to
                   each other and to the original tracked data. It was
                   found that the ANN-based method was judged
                   significantly better than the HMM-based method.
                   Furthermore, the original data was not judged
                   significantly better than the output of the ANN method.},
  keywords = {hidden Markov model (HMM), mixture density network,
                   lip synchronisation, inversion mapping},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100668.pdf},
  year = 2010
}
@inproceedings{Ehnes2010A-Precise-Contr,
  author = {Ehnes, Jochen},
  title = {A Precise Controllable Projection System for Projected
                   Virtual Characters and Its Calibration},
  booktitle = {IEEE International Symposium on Mixed and Augmented
                   Reality 2010 Science and Technology Proceedings},
  pages = {221--222},
  address = {Seoul, Korea},
  abstract = {In this paper we describe a system for projecting
                   virtual characters that are intended to live with us
                   in the same environment. In order to project the
                   characters' visual representations onto room surfaces,
                   we use a controllable projector.},
  categories = {steerable projector, virtual characters, projector
                   calibration},
  month = {October},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ehnes.pdf},
  year = 2010
}
@article{turk:2429,
  author = {Alice Turk and James Scobbie and Christian Geng and
                   Cedric Macmartin and Ellen Bard and Barry Campbell and
                   Catherine Dickie and Eddie Dubourg and Bill Hardcastle
                   and Phil Hoole and Evia Kanaida and Robin Lickley and
                   Satsuki Nakai and Marianne Pouplier and Simon King and
                   Steve Renals and Korin Richmond and Sonja Schaeffler
                   and Ronnie Wiegand and Kevin White and Alan Wrench},
  title = {The {Edinburgh Speech Production Facility's}
                   articulatory corpus of spontaneous dialogue.},
  journal = {The Journal of the Acoustical Society of America},
  volume = {128},
  number = {4},
  pages = {2429-2429},
  abstract = {The EPSRC-funded Edinburgh Speech Production Facility
                   is built around two synchronized Carstens AG500
                   electromagnetic articulographs (EMAs) in order to
                   capture articulatory/acoustic data from spontaneous
                   dialogue. An initial articulatory corpus was designed
                   with two aims. The first was to elicit a range of
                   speech styles/registers from speakers, and therefore
                   provide an alternative to fully scripted corpora. The
                   second was to extend the corpus beyond monologue, by
                   using tasks that promote natural discourse and
                   interaction. A subsidiary driver was to use dialects
                   from outwith North America: dialogues paired up a
                   Scottish English and a Southern British English
                   speaker. Tasks. Monologue: Story reading of ``Comma
                   Gets a Cure'' [Honorof et al. (2000)], lexical sets
                   [Wells (1982)], spontaneous story telling,
                   diadochokinetic tasks. Dialogue: Map tasks [Anderson et
                   al. (1991)], ``Spot the Difference'' picture tasks
                   [Bradlow et al. (2007)], story-recall. Shadowing of
                   the spontaneous story telling by the second
                   participant. Each dialogue session includes
                   approximately 30 min of speech, and there are
                   acoustics-only baseline materials. We will introduce
                   the corpus and highlight the role of articulatory
                   production data in helping provide a fuller
                   understanding of various spontaneous speech phenomena
                   by presenting examples of naturally occurring covert
                   speech errors, accent accommodation, turn taking
                   negotiation, and shadowing.},
  doi = {10.1121/1.3508679},
  publisher = {ASA},
  year = 2010
}
@inproceedings{kurimo:acl:10,
  author = {Mikko Kurimo and William Byrne and John Dines and
                   Philip N. Garner and Matthew Gibson and Yong Guan and
                   Teemu Hirsim\"{a}ki and Reima Karhila and Simon King
                   and Hui Liang and Keiichiro Oura and Lakshmi Saheer and
                   Matt Shannon and Sayaka Shiota and Jilei Tian and
                   Keiichi Tokuda and Mirjam Wester and Yi-Jian Wu and
                   Junichi Yamagishi},
  title = {Personalising speech-to-speech translation in the
                   {EMIME} project},
  booktitle = {Proc. of the ACL 2010 System Demonstrations},
  address = {Uppsala, Sweden},
  abstract = {In the EMIME project we have studied unsupervised
                   cross-lingual speaker adaptation. We have employed an
                   HMM statistical framework for both speech recognition
                   and synthesis which provides transformation mechanisms
                   to adapt the synthesized voice in TTS (text-to-speech)
                   using the recognized voice in ASR (automatic speech
                   recognition). An important application for this
                   research is personalised speech-to-speech translation
                   that will use the voice of the speaker in the input
                   language to utter the translated sentences in the
                   output language. In mobile environments this enhances
                   the users' interaction across language barriers by
                   making the output speech sound more like the original
                   speaker's way of speaking, even if she or he could not
                   speak the output language.},
  categories = {speaker adaptation},
  month = {July},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/kurimo_acl_2010.pdf},
  year = 2010
}
@article{michael09:dialectHTS,
  author = {Michael Pucher and Dietmar Schabus and Junichi
                   Yamagishi and Friedrich Neubarth and Volker Strom},
  title = {Modeling and Interpolation of {Austrian German and
                   Viennese} Dialect in {HMM}-based Speech Synthesis},
  journal = {Speech Communication},
  volume = {52},
  number = {2},
  pages = {164--179},
  abstract = {An HMM-based speech synthesis framework is applied to
                   both Standard Austrian German and a Viennese dialectal
                   variety and several training strategies for
                   multi-dialect modeling such as dialect clustering and
                   dialect-adaptive training are investigated. For
                   bridging the gap between processing on the level of
                   HMMs and on the linguistic level, we add phonological
                   transformations to the HMM interpolation and apply them
                   to dialect interpolation. The crucial steps are to
                   employ several formalized phonological rules between
                   Austrian German and Viennese dialect as constraints for
                   the HMM interpolation. We verify the effectiveness of
                   this strategy in a number of perceptual evaluations.
                   Since the HMM space used is not articulatory but
                   acoustic space, there are some variations in evaluation
                   results between the phonological rules. However, in
                   general we obtained good evaluation results which show
                   that listeners can perceive both continuous and
                   categorical changes of dialect varieties by using
                   phonological transformations employed as switching
                   rules in the HMM interpolation.},
  categories = {speech synthesis, hidden Markov model, dialect,
                   sociolect, Austrian German},
  doi = {10.1016/j.specom.2009.09.004},
  year = 2010
}
@article{georgila-lrec:10,
  author = {Georgila, Kallirroi and Wolters, Maria and Moore,
                   Johanna D. and Logie, Robert H.},
  title = {The {MATCH} Corpus: A Corpus of Older and Younger
                   Users' Interactions with Spoken Dialogue Systems.},
  journal = {Language Resources and Evaluation},
  volume = {44},
  number = {3},
  pages = {221--261},
  abstract = {We present the MATCH corpus, a unique data set of 447
                   dialogues in which 26 older and 24 younger adults
                   interact with nine different spoken dialogue systems.
                   The systems varied in the number of options presented
                   and the confirmation strategy used. The corpus also
                   contains information about the users' cognitive
                   abilities and detailed usability assessments of each
                   dialogue system. The corpus, which was collected using
                   a Wizard-of-Oz methodology, has been fully transcribed
                   and annotated with dialogue acts and ``Information
                   State Update'' (ISU) representations of dialogue
                   context. Dialogue act and ISU annotations were
                   performed semi-automatically. In addition to describing
                   the corpus collection and annotation, we present a
                   quantitative analysis of the interaction behaviour of
                   older and younger users and discuss further
                   applications of the corpus. We expect that the corpus
                   will provide a key resource for modelling older
                   people's interaction with spoken dialogue systems.},
  doi = {10.1007/s10579-010-9118-8},
  keywords = {Spoken dialogue corpora, Spoken dialogue systems,
                   Cognitive ageing, Annotation, Information states,
                   Speech acts, User simulations, Speech recognition},
  month = {March},
  year = 2010
}
@article{child_speech_journal_2010,
  author = {Watts, O. and Yamagishi, J. and King, S. and Berkling,
                   K.},
  title = {Synthesis of Child Speech with {HMM} Adaptation and
                   Voice Conversion},
  journal = {IEEE Transactions on Audio, Speech, and Language
                   Processing},
  volume = {18},
  number = {5},
  pages = {1005--1016},
  abstract = {The synthesis of child speech presents challenges both
                   in the collection of data and in the building of a
                   synthesizer from that data. We chose to build a
                   statistical parametric synthesizer using the hidden
                   Markov model (HMM)-based system HTS, as this technique
                   has previously been shown to perform well for limited
                   amounts of data, and for data collected under imperfect
                   conditions. Six different configurations of the
                   synthesizer were compared, using both speaker-dependent
                   and speaker-adaptive modeling techniques, and using
                   varying amounts of data. For comparison with HMM
                   adaptation, techniques from voice conversion were used
                   to transform existing synthesizers to the
                   characteristics of the target speaker. Speaker-adaptive
                   voices generally outperformed child speaker-dependent
                   voices in the evaluation. HMM adaptation outperformed
                   voice conversion style techniques when using the full
                   target speaker corpus; with fewer adaptation data,
                   however, no significant listener preference for either
                   HMM adaptation or voice conversion methods was found.},
  doi = {10.1109/TASL.2009.2035029},
  issn = {1558-7916},
  keywords = {HMM adaptation techniques;child speech
                   synthesis;hidden Markov model;speaker adaptive modeling
                   technique;speaker dependent technique;speaker-adaptive
                   voice;statistical parametric synthesizer;target speaker
                   corpus;voice conversion;hidden Markov models;speech
                   synthesis;},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/YamagishiJ_Synthesis\%20of\%20Child\%20Speech.pdf},
  year = 2010
}
@inproceedings{wolters-pqs:10,
  author = {Wolters, Maria K. and G\"odde, Florian and M\"oller,
                   Sebastian and Engelbrecht, Klaus-Peter},
  title = {Finding Patterns in User Quality Judgements},
  booktitle = {Proc. ISCA Workshop Perceptual Quality of Speech
                   Systems, Dresden, Germany},
  abstract = {User quality judgements can show a bewildering amount
                   of variation that is difficult to capture using
                   traditional quality prediction approaches. Using
                   clustering, an exploratory statistical analysis
                   technique, we reanalysed the data set of a Wizard-of-Oz
                   experiment where 25 users were asked to rate the
                   dialogue after each turn. The sparse data problem was
                   addressed by careful a priori parameter choices and
                   comparison of the results of different cluster
                   algorithms. We found two distinct classes of users,
                   positive and critical. Positive users were generally
                   happy with the dialogue system, and did not mind
                   errors. Critical users downgraded their opinion of the
                   system after errors, used a wider range of ratings, and
                   were less likely to rate the system positively overall.
                   These user groups could not be predicted by experience
                   with spoken dialogue systems, attitude to spoken
                   dialogue systems, affinity with technology, demographics,
                   or short-term memory capacity. We suggest that
                   evaluation research should focus on critical users and
                   discuss how these might be identified.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/Wolters_et_al_PQS.pdf},
  year = 2010
}
@inproceedings{junichi:interspeech2010,
  author = {Junichi Yamagishi and Oliver Watts and Simon King and
                   Bela Usabaev},
  title = {Roles of the Average Voice in Speaker-adaptive
                   {HMM}-based Speech Synthesis},
  booktitle = {{Proc. Interspeech}},
  pages = {418--421},
  address = {Makuhari, Japan},
  abstract = {In speaker-adaptive HMM-based speech synthesis, there
                   are typically a few speakers for which the output
                   synthetic speech sounds worse than that of other
                   speakers, despite having the same amount of adaptation
                   data from within the same corpus. This paper
                   investigates these fluctuations in quality and
                   concludes that as mel-cepstral distance from the
                   average voice becomes larger, the MOS naturalness
                   scores generally become worse. Although this negative
                   correlation is not that strong, it suggests a way to
                   improve the training and adaptation strategies. We also
                   draw comparisons between our findings and the work of
                   other researchers regarding ``vocal attractiveness.''},
  keywords = {speech synthesis, HMM, average voice, speaker
                   adaptation},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100361.pdf},
  year = 2010
}
@inproceedings{junichi:icassp2010,
  author = {J. Yamagishi and S. King},
  title = {Simple methods for improving speaker-similarity of
                   {HMM}-based speech synthesis},
  booktitle = {{Proc. ICASSP 2010}},
  address = {Dallas, Texas, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/JunichiICASSP10.pdf},
  year = 2010
}
@article{junichi:ieee2010,
  author = {J. Yamagishi and B. Usabaev and S. King and O. Watts
                   and J. Dines and J. Tian and R. Hu and Y. Guan and K.
                   Oura and K. Tokuda and R. Karhila and M. Kurimo},
  title = {Thousands of Voices for {HMM}-based Speech Synthesis
                   -- Analysis and Application of {TTS} Systems Built on
                   Various {ASR} Corpora},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = 18,
  number = 5,
  pages = {984--1004},
  abstract = {In conventional speech synthesis, large amounts of
                   phonetically balanced speech data recorded in highly
                   controlled recording studio environments are typically
                   required to build a voice. Although using such data is
                   a straightforward solution for high quality synthesis,
                   the number of voices available will always be limited,
                   because recording costs are high. On the other hand,
                   our recent experiments with HMM-based speech synthesis
                   systems have demonstrated that speaker-adaptive
                   HMM-based speech synthesis (which uses an ``average
                   voice model'' plus model adaptation) is robust to
                   non-ideal speech data that are recorded under various
                   conditions and with varying microphones, that are not
                   perfectly clean, and/or that lack phonetic balance.
                   This enables us to consider building high-quality
                   voices on ``non-TTS'' corpora such as ASR corpora.
                   Since ASR corpora generally include a large number of
                   speakers, this leads to the possibility of producing an
                   enormous number of voices automatically. In this paper,
                   we demonstrate the thousands of voices for HMM-based
                   speech synthesis that we have made from several popular
                   ASR corpora such as the Wall Street Journal (WSJ0,
                   WSJ1, and WSJCAM0), Resource Management, GlobalPhone,
                   and SPEECON databases. We also present the results of
                   associated analysis based on perceptual evaluation, and
                   discuss remaining issues.},
  doi = {10.1109/TASL.2010.2045237},
  keywords = {Automatic speech recognition (ASR), H Triple S (HTS),
                   SPEECON database, WSJ database, average voice, hidden
                   Markov model (HMM)-based speech synthesis, speaker
                   adaptation, speech synthesis, voice conversion},
  month = jul,
  year = 2010
}
@inproceedings{wester:ssw7:10,
  author = {Mirjam Wester and John Dines and Matthew Gibson and
                   Hui Liang and Yi-Jian Wu and Lakshmi Saheer and Simon
                   King and Keiichiro Oura and Philip N. Garner and
                   William Byrne and Yong Guan and Teemu Hirsim\"{a}ki and
                   Reima Karhila and Mikko Kurimo and Matt Shannon and
                   Sayaka Shiota and Jilei Tian and Keiichi Tokuda and
                   Junichi Yamagishi},
  title = {Speaker adaptation and the evaluation of speaker
                   similarity in the {EMIME} speech-to-speech translation
                   project},
  booktitle = {Proc. of 7th ISCA Speech Synthesis Workshop},
  address = {Kyoto, Japan},
  abstract = {This paper provides an overview of speaker adaptation
                   research carried out in the EMIME speech-to-speech
                   translation (S2ST) project. We focus on how speaker
                   adaptation transforms can be learned from speech in one
                   language and applied to the acoustic models of another
                   language. The adaptation is transferred across
                   languages and/or from recognition models to synthesis
                   models. The various approaches investigated can all be
                   viewed as a process in which a mapping is defined in
                   terms of either acoustic model states or linguistic
                   units. The mapping is used to transfer either speech
                   data or adaptation transforms between the two models.
                   Because the success of speaker adaptation in
                   text-to-speech synthesis is measured by judging speaker
                   similarity, we also discuss issues concerning
                   evaluation of speaker similarity in an S2ST scenario.},
  categories = {speaker adaptation, evaluation},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_ssw7_2010.pdf},
  year = 2010
}
@inproceedings{kilgour2010a,
  author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
  title = {The {Ambient Spotlight}: Personal multimodal search
                   without query},
  booktitle = {Proc. ICMI-MLMI},
  abstract = {The Ambient Spotlight is a prototype system based on
                   personal meeting capture using a laptop and a portable
                   microphone array. The system automatically recognises
                   and structures the meeting content using automatic
                   speech recognition, topic segmentation and extractive
                   summarisation. The recognised speech in the meeting is
                   used to construct queries to automatically link meeting
                   segments to other relevant material, both multimodal
                   and textual. The interface to the system is constructed
                   around a standard calendar interface, and it is
                   integrated with the laptop's standard indexing, search
                   and retrieval.},
  doi = {10.1145/1891903.1891919},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/ambientDemo-icmi.pdf},
  url = {http://dx.doi.org/10.1145/1891903.1891919},
  year = 2010
}
@inproceedings{michael:interspeech2010,
  author = {Michael Pucher and Dietmar Schabus and Junichi
                   Yamagishi},
  title = {Synthesis of fast speech with interpolation of adapted
                   {HSMMs} and its evaluation by blind and sighted
                   listeners},
  booktitle = {Proc. Interspeech},
  pages = {2186--2189},
  address = {Makuhari, Japan},
  abstract = {In this paper we evaluate a method for generating
                   synthetic speech at high speaking rates based on the
                   interpolation of hidden semi-Markov models (HSMMs)
                   trained on speech data recorded at normal and fast
                   speaking rates. The subjective evaluation was carried
                   out with both blind listeners, who are used to very
                   fast speaking rates, and sighted listeners. We show
                   that we can achieve a better intelligibility rate and
                   higher voice quality with this method compared to
                   standard HSMM-based duration modeling. We also evaluate
                   duration modeling with the interpolation of all the
                   acoustic features including not only duration but also
                   spectral and F0 models. An analysis of the mean squared
                   error (MSE) of standard HSMM-based duration modeling
                   for fast speech identifies problematic linguistic
                   contexts for duration modeling.},
  keywords = {speech synthesis, fast speech, hidden semi-Markov
                   model},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100294.pdf},
  year = 2010
}
@inproceedings{king_hmm_tutorial:india2010,
  author = {Simon King},
  title = {A tutorial on {HMM} speech synthesis (Invited paper)},
  booktitle = {Sadhana -- Academy Proceedings in Engineering
                   Sciences, Indian Institute of Sciences},
  abstract = {Statistical parametric speech synthesis, based on
                   HMM-like models, has become competitive with
                   established concatenative techniques over the last few
                   years. This paper offers a non-mathematical
                   introduction to this method of speech synthesis. It is
                   intended to be complementary to the wide range of
                   excellent technical publications already available.
                   Rather than offer a comprehensive literature review,
                   this paper instead gives a small number of carefully
                   chosen references which are good starting points for
                   further reading.},
  categories = {speech synthesis, HMM synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/king_hmm_tutorial.pdf},
  year = 2010
}
@inproceedings{anderssonetal2010_ssw7,
  author = {Sebastian Andersson and Junichi Yamagishi and Robert
                   Clark},
  title = {Utilising Spontaneous Conversational Speech in
                   {HMM}-Based Speech Synthesis},
  booktitle = {The 7th ISCA Tutorial and Research Workshop on Speech
                   Synthesis},
  abstract = {Spontaneous conversational speech has many
                   characteristics that are currently not well modelled in
                   unit selection and HMM-based speech synthesis. But in
                   order to build synthetic voices more suitable for
                   interaction we need data that exhibits more
                   conversational characteristics than the generally used
                   read aloud sentences. In this paper we show how
                   carefully selected utterances from a spontaneous
                   conversation were instrumental in building an HMM-based
                   synthetic voice with more natural-sounding
                   conversational characteristics than a voice based on
                   carefully read aloud sentences. We also investigated a
                   style blending technique as a solution to the inherent
                   problem of phonetic coverage in spontaneous speech
                   data. But the lack of an appropriate representation of
                   spontaneous speech phenomena probably contributed to
                   results showing that we could not yet compete with the
                   speech quality achieved for grammatical sentences.},
  categories = {HMM, speech synthesis, spontaneous speech,
                   conversation, lexical fillers, filled pauses},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7_paper.pdf},
  year = 2010
}
@inproceedings{tejedor_interspeech10,
  author = {Javier Tejedor and Doroteo T. Toledano and Miguel
                   Bautista and Simon King and Dong Wang and Jose Colas},
  title = {Augmented set of features for confidence estimation in
                   spoken term detection},
  booktitle = {Proc. Interspeech},
  abstract = {Discriminative confidence estimation, along with
                   confidence normalisation, has been shown to produce
                   robust decision maker modules in spoken term detection
                   (STD) systems. Discriminative confidence estimation,
                   making use of term-dependent features, has been shown to
                   improve the widely used lattice-based confidence
                   estimation in STD. In this work, we augment the set of
                   these term-dependent features and show a significant
                   improvement in the STD performance both in terms of
                   ATWV and DET curves in experiments conducted on a
                   Spanish geographical corpus. This work also proposes a
                   multiple linear regression analysis to carry out the
                   feature selection. Next, the most informative features
                   identified by this analysis are used for discriminative
                   confidence estimation in the STD system.},
  categories = {confidence estimation, feature selection, spoken term
                   detection, speech recognition},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/features.pdf},
  year = 2010
}
@inproceedings{letter_based_TTS,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {Letter-based speech synthesis},
  booktitle = {Proc. Speech Synthesis Workshop 2010},
  pages = {317--322},
  address = {Nara, Japan},
  abstract = {Initial attempts at performing text-to-speech
                   conversion based on standard orthographic units are
                   presented, forming part of a larger scheme of training
                   TTS systems on features that can be trivially extracted
                   from text. We evaluate the possibility of using the
                   technique of decision-tree-based context clustering
                   conventionally used in HMM-based systems for
                   parameter tying to handle letter-to-sound conversion. We
                   present the application of a method of compound-feature
                   discovery to corpus-based speech synthesis. Finally, an
                   evaluation of intelligibility of letter-based systems
                   and more conventional phoneme-based systems is
                   presented.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7.pdf},
  year = 2010
}