2011.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2011-citations -ob /home/korin/projects/publications/new_output/transitdata/2011.bib -c 'year : "2011"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{kilgour2011,
  abstract = {We present the Ambient Spotlight system for
personal meeting capture based on a portable USB microphone array and
a laptop.  The system combined distant speech recognition and content
linking with personal productivity tools, and enables recognised
meeting recordings to be integrated with desktop search, calendar, and
email.  },
  author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
  booktitle = {Proc. HSCMA},
  doi = {10.1109/HSCMA.2011.5942389},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/ambientDemo.pdf},
  title = {The {Ambient Spotlight}: Personal meeting capture with a microphone array},
  year = {2011}
}
@article{renals2011,
  abstract = {This paper is about the recognition and
	interpretation of multiparty meetings captured as audio, video
	and other signals.  This is a challenging task since the
	meetings consist of spontaneous and conversational
	interactions between a number of participants: it is a
	multimodal, multiparty, multistream problem.  We discuss the
	capture and annotation of the AMI meeting corpus, the
	development of a meeting speech recognition system, and
	systems for the automatic segmentation, summarisation and
	social processing of meetings, together with some example
	applications based on these systems.},
  author = {Renals, S},
  journal = {SADHANA - Academy Proceedings in Engineering Sciences},
  number = {5},
  pages = {917--932},
  doi = {10.1007/s12046-011-0051-3},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/renals-sadhana10.pdf},
  title = {Automatic analysis of multiparty meetings},
  volume = {36},
  year = {2011}
}
@inproceedings{dzikovskaSIGDIAL20112,
  author = {Dzikovska, Myroslava and Isard, Amy and Bell, Peter
                   and Moore, Johanna and Steinhauser, Natalie and
                   Campbell, Gwendolyn},
  title = {{Beetle II}: an adaptable tutorial dialogue system},
  booktitle = {Proceedings of the SIGDIAL 2011 Conference, demo
                   session},
  pages = {338--340},
  address = {Portland, Oregon},
  publisher = {Association for Computational Linguistics},
  abstract = {We present Beetle II, a tutorial dialogue system which
                   accepts unrestricted language input and supports
                   experimentation with different tutorial planning and
                   dialogue strategies. Our first system evaluation
                   compared two tutorial policies and demonstrated that
                   the system can be used to study the impact of different
                   approaches to tutoring. The system is also designed to
                   allow experimentation with a variety of natural
                   language techniques, and discourse and dialogue
                   strategies.},
  month = jun,
  url = {http://www.aclweb.org/anthology/W11-2041},
  year = 2011
}
@inproceedings{karhila_interspeech:11,
  author = {Reima Karhila and Mirjam Wester},
  title = {Rapid Adaptation of Foreign-accented {HMM}-based
                   Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Florence, Italy},
  abstract = {This paper presents findings of listeners’
                   perception of speaker identity in synthetic speech.
                   Specifically, we investigated what the effect is on the
                   perceived identity of a speaker when using differently
                   accented average voice models and limited amounts (five
                   and fifteen sentences) of a speaker’s data to create
                   the synthetic stimuli. A speaker discrimination task
                   was used to measure speaker identity. Native English
                   listeners were presented with natural and synthetic
                   speech stimuli in English and were asked to decide
                   whether they thought the sentences were spoken by the
                   same person or not. An accent rating task was also
                   carried out to measure the perceived accents of the
                   synthetic speech stimuli. The results show that
                   listeners, for the most part, perform as well at
                   speaker discrimination when the stimuli have been
                   created using five or fifteen adaptation sentences as
                   when using 105 sentences. Furthermore, the accent of
                   the average voice model does not affect listeners’
                   speaker discrimination performance even though the
                   accent rating task shows listeners are perceiving
                   different accents in the synthetic stimuli. Listeners
                   do not base their speaker similarity decisions on
                   perceived accent.},
  categories = {speech synthesis, rapid adaptation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/karhila_wester_interspeech_2011.pdf},
  year = 2011
}
@inproceedings{DBLP:conf/aied/DzikovskaIBMSCTCS11,
  author = {Myroslava Dzikovska and Amy Isard and Peter Bell and
                   Johanna D. Moore and Natalie B. Steinhauser and
                   Gwendolyn E. Campbell and Leanne S. Taylor and Simon
                   Caine and Charlie Scott},
  title = {Adaptive Intelligent Tutorial Dialogue in the {Beetle
                   II} System},
  booktitle = {Artificial Intelligence in Education - 15th
                   International Conference (AIED 2011), interactive event},
  volume = {6738},
  series = {Lecture Notes in Computer Science},
  pages = {621},
  address = {Auckland, New Zealand},
  publisher = {Springer},
  doi = {10.1007/978-3-642-21869-9_122},
  year = 2011
}
@inproceedings{wester_interspeech:11,
  author = {Mirjam Wester and Hui Liang},
  title = {Cross-Lingual Speaker Discrimination Using Natural and
                   Synthetic Speech},
  booktitle = {Proc. Interspeech},
  address = {Florence, Italy},
  abstract = {This paper describes speaker discrimination
                   experiments in which native English listeners were
                   presented with either natural speech stimuli in English
                   and Mandarin, synthetic speech stimuli in English and
                   Mandarin, or natural Mandarin speech and synthetic
                   English speech stimuli. In each experiment, listeners
                   were asked to decide whether they thought the sentences
                   were spoken by the same person or not. We found that
                   the results for Mandarin/English speaker discrimination
                   are very similar to results found in previous work on
                   German/English and Finnish/English speaker
                   discrimination. We conclude from this and previous work
                   that listeners are able to identify speakers across
                   languages and they are able to identify speakers across
                   speech types, but the combination of these two factors
                   leads to a speaker discrimination task which is too
                   difficult for listeners to perform successfully, given
                   the quality of across-language speaker adapted speech
                   synthesis at present.},
  categories = {speaker discrimination, speaker adaptation, HMM-based
                   speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_liang_interspeech_2011.pdf},
  year = 2011
}
@article{tuomo:ieee2011,
  author = {T. Raitio and A. Suni and J. Yamagishi and H. Pulakka
                   and J. Nurminen and M. Vainio and P. Alku},
  title = {{HMM}-Based Speech Synthesis Utilizing Glottal Inverse
                   Filtering},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = 19,
  number = 1,
  pages = {153--165},
  abstract = {This paper describes a hidden Markov model
                   (HMM)-based speech synthesizer that utilizes glottal
                   inverse filtering for generating natural sounding
                   synthetic speech. In the proposed method, speech is
                   first decomposed into the glottal source signal and the
                   model of the vocal tract filter through glottal inverse
                   filtering, and thus parametrized into excitation and
                   spectral features. The source and filter features are
                   modeled individually in the framework of HMM and
                   generated in the synthesis stage according to the text
                   input. The glottal excitation is synthesized through
                   interpolating and concatenating natural glottal flow
                   pulses, and the excitation signal is further modified
                   according to the spectrum of the desired voice source
                   characteristics. Speech is synthesized by filtering the
                   reconstructed source signal with the vocal tract
                   filter. Experiments show that the proposed system is
                   capable of generating natural sounding speech, and the
                   quality is clearly better compared to two HMM-based
                   speech synthesis systems based on widely used vocoder
                   techniques.},
  doi = {10.1109/TASL.2010.2045239},
  keywords = {Glottal inverse filtering, hidden Markov model (HMM),
                   speech synthesis},
  month = jan,
  year = 2011
}
@inproceedings{watts_yamagishi_king_2011,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {Unsupervised continuous-valued word features for
                   phrase-break prediction without a part-of-speech tagger},
  booktitle = {Proc. Interspeech},
  pages = {2157--2160},
  address = {Florence, Italy},
  abstract = {Part of speech (POS) tags are foremost among the
                   features conventionally used to predict intonational
                   phrase-breaks for text to speech (TTS) conversion. The
                   construction of such systems therefore presupposes the
                   availability of a POS tagger for the relevant language,
                   or of a corpus manually tagged with POS. However, such
                   tools and resources are not available in the majority
                   of the world’s languages, and manually labelling text
                   with POS tags is an expensive and time-consuming
                   process. We therefore propose the use of
                   continuous-valued features that summarise the
                   distributional characteristics of word types as
                   surrogates for POS features. Importantly, such features
                   are obtained in an unsupervised manner from an untagged
                   text corpus. We present results on the phrase-break
                   prediction task, where use of the features closes the
                   gap in performance between a baseline system (using
                   only basic punctuation-related features) and a topline
                   system (incorporating a state-of-the-art POS tagger).},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
  year = 2011
}
@inproceedings{Cassia_IS11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                   King, Simon},
  title = {Can Objective Measures Predict the Intelligibility of
                   Modified {HMM}-based Synthetic Speech in Noise?},
  booktitle = {Proc. Interspeech},
  abstract = {{Synthetic speech can be modified to improve
                   intelligibility in noise. In order to perform
                   modifications automatically, it would be useful to have
                   an objective measure that could predict the
                   intelligibility of modified synthetic speech for human
                   listeners. We analysed the impact on intelligibility
                   – and on how well objective measures predict it –
                   when we separately modify speaking rate, fundamental
                   frequency, line spectral pairs and spectral peaks.
                   Shifting LSPs can increase intelligibility for human
                   listeners; other modifications had weaker effects.
                   Among the objective measures we evaluated, the Dau
                   model and the Glimpse proportion were the best
                   predictors of human performance.}},
  categories = {HMM-based speech synthesis, objective measures of
                   intelligibility},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_IS11.pdf},
  year = 2011
}
@inproceedings{richmond2011a,
  author = {Richmond, Korin and Hoole, Phil and King, Simon},
  title = {Announcing the Electromagnetic Articulography (Day 1)
                   Subset of the mngu0 Articulatory Corpus},
  booktitle = {Proc. Interspeech},
  pages = {1505--1508},
  address = {Florence, Italy},
  abstract = {This paper serves as an initial announcement of the
                   availability of a corpus of articulatory data called
                   mngu0. This corpus will ultimately consist of a
                   collection of multiple sources of articulatory data
                   acquired from a single speaker: electromagnetic
                   articulography (EMA), audio, video, volumetric MRI
                   scans, and 3D scans of dental impressions. This data
                   will be provided free for research use. In this first
                   stage of the release, we are making available one
                   subset of EMA data, consisting of more than 1,300
                   phonetically diverse utterances recorded with a
                   Carstens AG500 electromagnetic articulograph.
                   Distribution of mngu0 will be managed by a dedicated
                   ``forum-style'' web site. This paper both outlines the
                   general goals motivating the distribution of the data
                   and the creation of the mngu0 web forum, and also
                   provides a description of the EMA data contained in
                   this initial release.},
  categories = {articulography, corpus, EMA},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110767.pdf},
  year = 2011
}
@inproceedings{mcinnes_cogsci2011,
  author = {Fergus R. McInnes and Sharon J. Goldwater},
  title = {Unsupervised Extraction of Recurring Words from
                   Infant-Directed Speech},
  booktitle = {Proceedings of CogSci 2011},
  address = {Boston, Massachusetts},
  abstract = {To date, most computational models of infant word
                   segmentation have worked from phonemic or phonetic
                   input, or have used toy datasets. In this paper, we
                   present an algorithm for word extraction that works
                   directly from naturalistic acoustic input:
                   infant-directed speech from the CHILDES corpus. The
                   algorithm identifies recurring acoustic patterns that
                   are candidates for identification as words or phrases,
                   and then clusters together the most similar patterns.
                   The recurring patterns are found in a single pass
                   through the corpus using an incremental method, where
                   only a small number of utterances are considered at
                   once. Despite this limitation, we show that the
                   algorithm is able to extract a number of recurring
                   words, including some that infants learn earliest, such
                   as ``Mommy'' and the child’s name. We also introduce a
                   novel information-theoretic evaluation measure.},
  categories = {language acquisition, word segmentation, speech
                   recognition, computational modelling},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/mcinnes_cogsci2011.pdf},
  year = 2011
}
@inproceedings{lei2011a,
  author = {Lei, Ming and Yamagishi, Junichi and Richmond, Korin
                   and Ling, Zhen-Hua and King, Simon and Dai, Li-Rong},
  title = {Formant-controlled {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {2777--2780},
  address = {Florence, Italy},
  abstract = {This paper proposes a novel framework that enables us
                   to manipulate and control formants in HMM-based speech
                   synthesis. In this framework, the dependency between
                   formants and spectral features is modelled by piecewise
                   linear transforms; formant parameters are effectively
                   mapped by these to the means of Gaussian distributions
                   over the spectral synthesis parameters. The spectral
                   envelope features generated under the influence of
                   formants in this way may then be passed to high-quality
                   vocoders to generate the speech waveform. This provides
                   two major advantages over conventional frameworks.
                   First, we can achieve spectral modification by changing
                   formants only in those parts where we want control,
                   whereas the user must specify all formants manually in
                   conventional formant synthesisers (e.g. Klatt). Second,
                   this can produce high-quality speech. Our results show
                   the proposed method can control vowels in the
                   synthesized speech by manipulating F1 and F2 without
                   any degradation in synthesis quality.},
  categories = {speech synthesis, hidden Markov model, formants,
                   controllability},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110769.pdf},
  year = 2011
}
@article{john:ieee2011,
  author = {J. Dines and J. Yamagishi and S. King},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  journal = {IEEE Journal of Selected Topics in Signal Processing},
  note = {(in press)},
  abstract = {The EMIME European project is conducting research in
                   the development of technologies for mobile,
                   personalised speech-to-speech translation systems. The
                   hidden Markov model (HMM) is being used as the
                   underlying technology in both automatic speech
                   recognition (ASR) and text-to-speech synthesis (TTS)
                   components, thus, the investigation of unified
                   statistical modelling approaches has become an implicit
                   goal of our research. As one of the first steps towards
                   this goal, we have been investigating commonalities and
                   differences between HMM-based ASR and TTS. In this
                   paper we present results and analysis of a series of
                   experiments that have been conducted on English ASR and
                   TTS systems measuring their performance with respect to
                   phone set and lexicon; acoustic feature type and
                   dimensionality; HMM topology; and speaker adaptation.
                   Our results show that, although the fundamental
                   statistical model may be essentially the same, optimal
                   ASR and TTS performance often demands diametrically
                   opposed system designs. This represents a major
                   challenge to be addressed in the investigation of such
                   unified modelling approaches.},
  doi = {10.1109/JSTSP.2010.2079315},
  keywords = {Acoustics, Adaptation model, Context modeling, Hidden
                   Markov models, Speech, Speech recognition, Training,
                   speech recognition, speech synthesis, unified models},
  year = 2011
}
@inproceedings{Cassia_ICASSP11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                   King, Simon},
  title = {Evaluation of objective measures for intelligibility
                   prediction of {HMM}-based synthetic speech in noise},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
                   IEEE International Conference on},
  pages = {5112--5115},
  abstract = {{In this paper we evaluate four objective measures of
                   speech with regards to intelligibility prediction of
                   synthesized speech in diverse noisy situations. We
                   evaluated three intelligibility measures, the Dau
                   measure, the glimpse proportion and the Speech
                   Intelligibility Index (SII) and a quality measure, the
                   Perceptual Evaluation of Speech Quality (PESQ). For the
                   generation of synthesized speech we used a state of the
                   art HMM-based speech synthesis system. The noisy
                   conditions comprised four additive noises. The measures
                   were compared with subjective intelligibility scores
                   obtained in listening tests. The results show the Dau
                   and the glimpse measures to be the best predictors of
                   intelligibility, with correlations of around 0.83 to
                   subjective scores. All measures gave less accurate
                   predictions of intelligibility for synthetic speech
                   than have previously been found for natural speech; in
                   particular the SII measure. In additional experiments,
                   we processed the synthesized speech by an ideal binary
                   mask before adding noise. The Glimpse measure gave the
                   most accurate intelligibility predictions in this
                   situation.}},
  categories = {HMM-based speech synthesis, objective measures of
                   intelligibility},
  doi = {10.1109/ICASSP.2011.5947507},
  issn = {1520-6149},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_ICASSP11.pdf},
  year = 2011
}
@inproceedings{wester_icassp:11,
  author = {Mirjam Wester and Reima Karhila},
  title = {Speaker Similarity Evaluation of Foreign-accented
                   Speech Synthesis using {HMM}-based Speaker Adaptation},
  booktitle = {Proc. ICASSP},
  pages = {5372--5375},
  address = {Prague, Czech Republic},
  abstract = {This paper describes a speaker discrimination
                   experiment in which native English listeners were
                   presented with natural and synthetic speech stimuli in
                   English and were asked to judge whether they thought
                   the sentences were spoken by the same person or not.
                   The natural speech consisted of recordings of Finnish
                   speakers speaking English. The synthetic stimuli were
                   created using adaptation data from the same Finnish
                   speakers. Two average voice models were compared: one
                   trained on Finnish-accented English and the other on
                   American-accented English. The experiments illustrate
                   that listeners perform well at speaker discrimination
                   when the stimuli are both natural or both synthetic,
                   but when the speech types are crossed performance drops
                   significantly. We also found that the type of accent in
                   the average voice model had no effect on the
                   listeners’ speaker discrimination performance.},
  categories = {Similarity Evaluation, Speaker Adaptation,
                   HMM-synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_icassp_2011.pdf},
  year = 2011
}
@inproceedings{Wolters2011,
  author = {Wolters, Maria Klara and Johnson, Christine and Isaac,
                   Karl B},
  title = {Can the Hearing Handicap Inventory for Adults Be Used
                   As a Screen for Perception Experiments?},
  booktitle = {Proc. ICPhS XVII},
  address = {Hong Kong},
  abstract = {When screening participants for speech perception
                   experiments, formal audiometric screens are often not
                   an option, especially when studies are conducted over
                   the Internet. We investigated whether a brief
                   standardized self-report questionnaire, the screening
                   version of the Hearing Handicap Inventory for Adults
                   (HHIA-S), could be used to approximate the results of
                   audiometric screening. Our results suggest that while
                   the HHIA-S is useful, it needs to be used with
                   extremely strict cut-off values that could exclude
                   around 25\% of people with no hearing impairment who
                   are interested in participating. Well constructed,
                   standardized single questions might be a more feasible
                   alternative, in particular for web experiments.},
  categories = {audiometry,hearing handicap inventory,screening},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Wolters_icphs.pdf},
  year = 2011
}
@article{lu_spl_2011,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Regularized Subspace Gaussian Mixture Models for Speech
                   Recognition},
  journal = {IEEE Signal Processing Letters},
  volume = {18},
  number = {7},
  pages = {419--422},
  abstract = {Subspace Gaussian mixture models (SGMMs) provide a
                   compact representation of the Gaussian parameters in an
                   acoustic model, but may still suffer from over-fitting
                   with insufficient training data. In this letter, the
                   SGMM state parameters are estimated using a penalized
                   maximum-likelihood objective, based on $\ell_1$ and
                   $\ell_2$ regularization, as well as their combination,
                   referred to as the elastic net, for robust model
                   estimation. Experiments on the 5000-word Wall Street
                   Journal transcription task show word error rate
                   reduction and improved model robustness with
                   regularization.},
  categories = {Acoustic Modelling, Regularization, Sparsity, Subspace
                   Gaussian Mixture Model},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-spl-2011.pdf},
  year = 2011
}
@incollection{Pipe_etal:2011,
  author = {A. G. Pipe and R. Vaidyanathan and C. Melhuish and P.
                   Bremner and P. Robinson and R. A. J. Clark and A. Lenz
                   and K. Eder and N. Hawes and Z. Ghahramani and M.
                   Fraser and M. Mermehdi and P. Healey and S. Skachek},
  title = {Affective Robotics: Human Motion and Behavioural
                   Inspiration for Cooperation between Humans and
                   Assistive Robots},
  booktitle = {Biomimetics: Nature-Based Innovation},
  publisher = {Taylor and Francis},
  editor = {Yoseph Bar-Cohen},
  chapter = {15},
  year = 2011
}
@inproceedings{wilson_hofer:iui2011,
  author = {Theresa Wilson and Gregor Hofer},
  title = {Using Linguistic and Vocal Expressiveness in Social
                   Role Recognition},
  booktitle = {Proc.~Int.~Conf.~on Intelligent User Interfaces,
                   IUI2011},
  address = {Palo Alto, USA},
  publisher = {ACM},
  abstract = {In this paper, we investigate two types of
                   expressiveness, linguistic and vocal, and whether they
                   are useful for recognising the social roles of
                   participants in meetings. Our experiments show that
                   combining expressiveness features with speech activity
                   does improve social role recognition over speech
                   activity features alone.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/WilsonHoferIUI2010sub.pdf},
  year = 2011
}
@article{wang_ieeesigprocletters2011,
  author = {Dong Wang and Simon King},
  title = {Letter-to-Sound Pronunciation Prediction Using
                   Conditional Random Fields},
  journal = {IEEE Signal Processing Letters},
  volume = {18},
  number = {2},
  pages = {122--125},
  abstract = {Pronunciation prediction, or letter-to-sound (LTS)
                   conversion, is an essential task for speech synthesis,
                   open vocabulary spoken term detection and other
                   applications dealing with novel words. Most current
                   approaches (at least for English) employ data-driven
                   methods to learn and represent pronunciation ``rules''
                   using statistical models such as decision trees, hidden
                   Markov models (HMMs) or joint-multigram models (JMMs).
                   The LTS task remains challenging, particularly for
                   languages with a complex relationship between spelling
                   and pronunciation such as English. In this paper, we
                   propose to use a conditional random field (CRF) to
                   perform LTS because it avoids having to model a
                   distribution over observations and can perform global
                   inference, suggesting that it may be more suitable for
                   LTS than decision trees, HMMs or JMMs. One challenge in
                   applying CRFs to LTS is that the phoneme and grapheme
                   sequences of a word are generally of different lengths,
                   which makes CRF training difficult. To solve this
                   problem, we employed a joint-multigram model to
                   generate aligned training exemplars. Experiments
                   conducted with the AMI05 dictionary demonstrate that a
                   CRF significantly outperforms other models, especially
                   if n-best lists of predictions are generated.},
  categories = {letter-to-sound, conditional random field,
                   joint multigram model, speech synthesis, spoken term
                   detection},
  doi = {10.1109/LSP.2010.2098440},
  month = feb,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_ieeesigprocletters2011.pdf},
  year = 2011
}
@inproceedings{cabral2011a,
  author = {Cabral, J. P. and Renals, S. and Yamagishi, J. and
                   Richmond, K.},
  title = {{HMM}-based speech synthesiser using the {LF}-model of
                   the glottal source},
  booktitle = {Proc. {IEEE} International Conference on Acoustics,
                   Speech and Signal Processing ({ICASSP})},
  pages = {4704--4707},
  abstract = {A major factor which causes a deterioration in speech
                   quality in {HMM}-based speech synthesis is the use of a
                   simple delta pulse signal to generate the excitation of
                   voiced speech. This paper sets out a new approach to
                   using an acoustic glottal source model in HMM-based
                   synthesisers instead of the traditional pulse signal.
                   The goal is to improve speech quality and to better
                   model and transform voice characteristics. We have
                   found the new method decreases buzziness and also
                   improves prosodic modelling. A perceptual evaluation
                   has supported this finding by showing a 55.6\%
                   preference for the new system, as against the baseline.
                   This improvement, while not being as significant as we
                   had initially expected, does encourage us to work on
                   developing the proposed speech synthesiser further.},
  categories = {HMM-based speech synthesiser;acoustic glottal source
                   model LF-model;delta pulse signal;perceptual
                   evaluation;prosodic modelling;speech quality;voiced
                   speech generation;hidden Markov models;speech
                   synthesis;},
  doi = {10.1109/ICASSP.2011.5947405},
  issn = {1520-6149},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
  year = 2011
}
@article{10.1109/MCG.2011.71,
  author = {Michael A. Berger and Gregor Hofer and Hiroshi
                   Shimodaira},
  title = {Carnival -- Combining Speech Technology and Computer
                   Animation},
  journal = {IEEE Computer Graphics and Applications},
  volume = {31},
  pages = {80--89},
  address = {Los Alamitos, CA, USA},
  doi = {10.1109/MCG.2011.71},
  issn = {0272-1716},
  publisher = {IEEE Computer Society},
  year = 2011
}
@inproceedings{watts_zhou_2011,
  title     = {Unsupervised features from text for speech synthesis in a
               speech-to-speech translation system},
  author    = {Oliver Watts and Bowen Zhou},
  booktitle = {Proc. Interspeech},
  address   = {Florence, Italy},
  month     = aug,
  year      = {2011},
  pages     = {2153--2156},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_zhou_2011.pdf},
  abstract  = {We explore the use of linguistic features for text to speech
               (TTS) conversion in the context of a speech-to-speech
               translation system that can be extracted from unannotated
               text in an unsupervised, language-independent fashion. The
               features are intended to act as surrogates for conventional
               part of speech (POS) features. Unlike POS features, the
               experimental features assume only the availability of tools
               and data that must already be in place for the construction
               of other components of the translation system, and can
               therefore be used for the TTS module without incurring
               additional TTS-specific costs. We here describe the use of
               the experimental features in a speech synthesiser, using six
               different configurations of the system to allow the
               comparison of the proposed features with conventional,
               knowledge-based POS features. We present results of
               objective and subjective evaluations of the usefulness of
               the new features.}
}
@inproceedings{ling2011a,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {Feature-space transform tying in unified
                   acoustic-articulatory modelling of articulatory control
                   of {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {117--120},
  address = {Florence, Italy},
  abstract = {In previous work, we have proposed a method to control
                   the characteristics of synthetic speech flexibly by
                   integrating articulatory features into hidden Markov
                   model (HMM) based parametric speech synthesis. A
                   unified acoustic-articulatory model was trained and a
                   piecewise linear transform was adopted to describe the
                   dependency between these two feature streams. The
                   transform matrices were trained for each HMM state and
                   were tied based on each state's context. In this paper,
                   an improved acoustic-articulatory modelling method is
                   proposed. A Gaussian mixture model (GMM) is introduced
                   to model the articulatory space and the cross-stream
                   transform matrices are trained for each Gaussian
                   mixture instead of context-dependently. This means the
                   dependency relationship can vary with the change of
                   articulatory features flexibly. Our results show this
                   method improves the effectiveness of control over vowel
                   quality by modifying articulatory trajectories without
                   degrading naturalness.},
  categories = {speech synthesis, articulatory features, hidden Markov
                   model, Gaussian mixture model},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110482.pdf},
  year = 2011
}
@techreport{wester_mandarin:11,
  author = {Mirjam Wester and Hui Liang},
  title = {The {EMIME} {Mandarin} {Bilingual} {Database}},
  institution = {The University of Edinburgh},
  number = {EDI-INF-RR-1396},
  abstract = {This paper describes the collection of a bilingual
                   database of Mandarin/English data. In addition, the
                   accents of the talkers in the database have been rated.
                   English and Mandarin listeners assessed the English and
                   Mandarin talkers' degree of foreign accent in English.},
  categories = {evaluation,cross-lingual, accent rating},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_mandarin_2011.pdf},
  year = 2011
}
@inproceedings{lu_asru_2011,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Regularized Subspace {Gaussian} Mixture Models for
                   Cross-lingual Speech Recognition},
  booktitle = {Proc. ASRU},
  abstract = {We investigate cross-lingual acoustic modelling for
                   low resource languages using the subspace Gaussian
                   mixture model (SGMM). We assume the presence of
                   acoustic models trained on multiple source languages,
                   and use the global subspace parameters from those
                   models for improved modelling in a target language with
                   limited amounts of transcribed speech. Experiments on
                   the GlobalPhone corpus using Spanish, Portuguese, and
                   Swedish as source languages and German as target
                   language (with 1 hour and 5 hours of transcribed audio)
                   show that multilingually trained SGMM shared parameters
                   result in lower word error rates (WERs) than using
                   those from a single source language. We also show that
                   regularizing the estimation of the SGMM state vectors
                   by penalizing their $\ell_1$-norm help to overcome
                   numerical instabilities and lead to lower WER.},
  categories = {Subspace Gaussian Mixture Model, Cross-lingual, model
                   regularization},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-asru-2011.pdf},
  year = 2011
}
@article{winterboer-csl:11,
  author = {Andi K. Winterboer and Martin I. Tietze and Maria K.
                   Wolters and Johanna D. Moore},
  title = {The user-model based summarize and refine approach
                   improves information presentation in spoken dialog
                   systems},
  journal = {Computer Speech and Language},
  volume = {25},
  number = {2},
  pages = {175--191},
  abstract = {A common task for spoken dialog systems (SDS) is to
                   help users select a suitable option (e.g., flight,
                   hotel, and restaurant) from the set of options
                   available. As the number of options increases, the
                   system must have strategies for generating summaries
                   that enable the user to browse the option space
                   efficiently and successfully. In the user-model based
                   summarize and refine approach (UMSR, Demberg and Moore,
                   2006), options are clustered to maximize utility with
                   respect to a user model, and linguistic devices such as
                   discourse cues and adverbials are used to highlight the
                   trade-offs among the presented items. In a Wizard-of-Oz
                   experiment, we show that the UMSR approach leads to
                   improvements in task success, efficiency, and user
                   satisfaction compared to an approach that clusters the
                   available options to maximize coverage of the domain
                   (Polifroni et al., 2003). In both a laboratory
                   experiment and a web-based experimental paradigm
                   employing the Amazon Mechanical Turk platform, we show
                   that the discourse cues in UMSR summaries help users
                   compare different options and choose between options,
                   even though they do not improve verbatim recall. This
                   effect was observed for both written and spoken
                   stimuli.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/CSL10.pdf},
  year = 2011
}
@inproceedings{wang_icassp2011a,
  author = {Dong Wang and Nicholas Evans and Raphael Troncy and
                   Simon King},
  title = {Handling overlaps in spoken term detection},
  booktitle = {Proc. International Conference on Acoustics, Speech
                   and Signal Processing},
  pages = {5656--5659},
  abstract = {Spoken term detection (STD) systems usually arrive at
                   many overlapping detections which are often addressed
                   with some pragmatic approaches, e.g. choosing the best
                   detection to represent all the overlaps. In this paper
                   we present a theoretical study based on a concept of
                   acceptance space. In particular, we present two
                   confidence estimation approaches based on Bayesian and
                   evidence perspectives respectively. Analysis shows that
                   both approaches possess respective advantages and
                   shortcomings, and that their combination has the
                   potential to provide an improved confidence estimation.
                   Experiments conducted on meeting data confirm our
                   analysis and show considerable performance improvement
                   with the combined approach, in particular for
                   out-of-vocabulary spoken term detection with stochastic
                   pronunciation modeling.},
  categories = {spoken term detection, speech recognition},
  doi = {10.1109/ICASSP.2011.5947643},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_icassp2011a.pdf},
  year = 2011
}