2011.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2011-citations -ob /home/korin/projects/publications/new_output/transitdata/2011.bib -c 'year : "2011"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{dzikovskaSIGDIAL20112,
  author = {Dzikovska, Myroslava and Isard, Amy and Bell, Peter
                   and Moore, Johanna and Steinhauser, Natalie and
                   Campbell, Gwendolyn},
  title = {{Beetle II}: an adaptable tutorial dialogue system},
  booktitle = {Proceedings of the SIGDIAL 2011 Conference, demo
                   session},
  pages = {338--340},
  address = {Portland, Oregon},
  publisher = {Association for Computational Linguistics},
  abstract = {We present Beetle II, a tutorial dialogue system which
                   accepts unrestricted language input and supports
                   experimentation with different tutorial planning and
                   dialogue strategies. Our first system evaluation
                   compared two tutorial policies and demonstrated that
                   the system can be used to study the impact of different
                   approaches to tutoring. The system is also designed to
                   allow experimentation with a variety of natural
                   language techniques, and discourse and dialogue
                   strategies.},
  month = jun,
  url = {http://www.aclweb.org/anthology/W11-2041},
  year = 2011
}
@inproceedings{karhila_interspeech:11,
  author = {Reima Karhila and Mirjam Wester},
  title = {Rapid Adaptation of Foreign-accented {HMM}-based
                   Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Florence, Italy},
  abstract = {This paper presents findings of listeners’
                   perception of speaker identity in synthetic speech.
                   Specifically, we investigated what the effect is on the
                   perceived identity of a speaker when using differently
                   accented average voice models and limited amounts (five
                   and fifteen sentences) of a speaker’s data to create
                   the synthetic stimuli. A speaker discrimination task
                   was used to measure speaker identity. Native English
                   listeners were presented with natural and synthetic
                   speech stimuli in English and were asked to decide
                   whether they thought the sentences were spoken by the
                   same person or not. An accent rating task was also
                   carried out to measure the perceived accents of the
                   synthetic speech stimuli. The results show that
                   listeners, for the most part, perform as well at
                   speaker discrimination when the stimuli have been
                   created using five or fifteen adaptation sentences as
                   when using 105 sentences. Furthermore, the accent of
                   the average voice model does not affect listeners’
                   speaker discrimination performance even though the
                   accent rating task shows listeners are perceiving
                   different accents in the synthetic stimuli. Listeners
                   do not base their speaker similarity decisions on
                   perceived accent.},
  categories = {speech synthesis, rapid adaptation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/karhila_wester_interspeech_2011.pdf},
  year = 2011
}
@inproceedings{DBLP:conf/aied/DzikovskaIBMSCTCS11,
  author = {Myroslava Dzikovska and Amy Isard and Peter Bell and
                   Johanna D. Moore and Natalie B. Steinhauser and
                   Gwendolyn E. Campbell and Leanne S. Taylor and Simon
                   Caine and Charlie Scott},
  title = {Adaptive Intelligent Tutorial Dialogue in the {Beetle
                   II} System},
  booktitle = {Artificial Intelligence in Education - 15th
                   International Conference (AIED 2011), interactive event},
  volume = {6738},
  series = {Lecture Notes in Computer Science},
  pages = {621},
  address = {Auckland, New Zealand},
  publisher = {Springer},
  doi = {10.1007/978-3-642-21869-9_122},
  year = 2011
}
@inproceedings{uria2011deep,
  author = {Uria, Benigno and Renals, Steve and Richmond, Korin},
  title = {A Deep Neural Network for Acoustic-Articulatory Speech
                   Inversion},
  booktitle = {Proc. NIPS 2011 Workshop on Deep Learning and
                   Unsupervised Feature Learning},
  address = {Sierra Nevada, Spain},
  abstract = {In this work, we implement a deep belief network for
                   the acoustic-articulatory inversion mapping problem. We
                   find that adding up to 3 hidden-layers improves
                   inversion accuracy. We also show that this improvement
                    is due to the higher expressive capability of a deep
                   model and not a consequence of adding more adjustable
                   parameters. Additionally, we show unsupervised
                    pretraining of the system improves its performance in
                   all cases, even for a 1 hidden-layer model. Our
                   implementation obtained an average root mean square
                   error of 0.95 mm on the MNGU0 test dataset, beating all
                   previously published results.},
  month = {December},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/articulatory_inversion.pdf},
  year = 2011
}
@inproceedings{5947571,
  author = {Andraszewicz, S. and Yamagishi, J. and King, S.},
  title = {Vocal attractiveness of statistical speech
                   synthesisers},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
                   IEEE International Conference on},
  pages = {5368--5371},
  abstract = {Our previous analysis of speaker-adaptive HMM-based
                   speech synthesis methods suggested that there are two
                   possible reasons why average voices can obtain higher
                   subjective scores than any individual adapted voice: 1)
                   model adaptation degrades speech quality proportionally
                   to the distance 'moved' by the transforms, and 2)
                   psychoacoustic effects relating to the attractiveness
                   of the voice. This paper is a follow-on from that
                   analysis and aims to separate these effects out. Our
                   latest perceptual experiments focus on attractiveness,
                   using average voices and speaker-dependent voices
                    without model transformation, and show that using
                   several speakers to create a voice improves smoothness
                   (measured by Harmonics-to-Noise Ratio), reduces
                    distance from the average voice in the log F0-F1
                   space of the final voice and hence makes it more
                   attractive at the segmental level. However, this is
                   weakened or overridden at supra-segmental or sentence
                   levels.},
  doi = {10.1109/ICASSP.2011.5947571},
  issn = {1520-6149},
  keywords = {speaker-adaptive HMM-based speech synthesis
                   methods;speaker-dependent voices;statistical speech
                   synthesisers;vocal attractiveness;hidden Markov
                   models;speaker recognition;speech synthesis;},
  month = may,
  year = 2011
}
@inproceedings{wester_interspeech:11,
  author = {Mirjam Wester and Hui Liang},
  title = {Cross-Lingual Speaker Discrimination Using Natural and
                   Synthetic Speech},
  booktitle = {Proc. Interspeech},
  address = {Florence, Italy},
  abstract = {This paper describes speaker discrimination
                   experiments in which native English listeners were
                   presented with either natural speech stimuli in English
                   and Mandarin, synthetic speech stimuli in English and
                   Mandarin, or natural Mandarin speech and synthetic
                   English speech stimuli. In each experiment, listeners
                   were asked to decide whether they thought the sentences
                   were spoken by the same person or not. We found that
                   the results for Mandarin/English speaker discrimination
                   are very similar to results found in previous work on
                   German/English and Finnish/English speaker
                   discrimination. We conclude from this and previous work
                   that listeners are able to identify speakers across
                   languages and they are able to identify speakers across
                   speech types, but the combination of these two factors
                   leads to a speaker discrimination task which is too
                   difficult for listeners to perform successfully, given
                   the quality of across-language speaker adapted speech
                   synthesis at present.},
  categories = {speaker discrimination, speaker adaptation, HMM-based
                   speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_liang_interspeech_2011.pdf},
  year = 2011
}
@article{tuomo:ieee2011,
  author = {T. Raitio and A. Suni and J. Yamagishi and H. Pulakka
                   and J. Nurminen and M. Vainio and P. Alku},
  title = {{HMM}-Based Speech Synthesis Utilizing Glottal Inverse
                   Filtering},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = 19,
  number = 1,
  pages = {153--165},
  abstract = {This paper describes a hidden Markov model
                   (HMM)-based speech synthesizer that utilizes glottal
                   inverse filtering for generating natural sounding
                   synthetic speech. In the proposed method, speech is
                   first decomposed into the glottal source signal and the
                   model of the vocal tract filter through glottal inverse
                   filtering, and thus parametrized into excitation and
                   spectral features. The source and filter features are
                   modeled individually in the framework of HMM and
                   generated in the synthesis stage according to the text
                   input. The glottal excitation is synthesized through
                   interpolating and concatenating natural glottal flow
                   pulses, and the excitation signal is further modified
                   according to the spectrum of the desired voice source
                   characteristics. Speech is synthesized by filtering the
                   reconstructed source signal with the vocal tract
                   filter. Experiments show that the proposed system is
                   capable of generating natural sounding speech, and the
                   quality is clearly better compared to two HMM-based
                   speech synthesis systems based on widely used vocoder
                   techniques.},
  doi = {10.1109/TASL.2010.2045239},
  keywords = {Glottal inverse filtering , hidden Markov model (HMM)
                   , speech synthesis},
  month = jan,
  year = 2011
}
@inproceedings{watts_yamagishi_king_2011,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {Unsupervised continuous-valued word features for
                   phrase-break prediction without a part-of-speech tagger},
  booktitle = {Proc. Interspeech},
  pages = {2157--2160},
  address = {Florence, Italy},
  abstract = {Part of speech (POS) tags are foremost among the
                   features conventionally used to predict intonational
                   phrase-breaks for text to speech (TTS) conversion. The
                   construction of such systems therefore presupposes the
                   availability of a POS tagger for the relevant language,
                   or of a corpus manually tagged with POS. However, such
                   tools and resources are not available in the majority
                   of the world’s languages, and manually labelling text
                   with POS tags is an expensive and time-consuming
                   process. We therefore propose the use of
                   continuous-valued features that summarise the
                   distributional characteristics of word types as
                   surrogates for POS features. Importantly, such features
                   are obtained in an unsupervised manner from an untagged
                   text corpus. We present results on the phrase-break
                   prediction task, where use of the features closes the
                   gap in performance between a baseline system (using
                   only basic punctuation-related features) and a topline
                   system (incorporating a state-of-the-art POS tagger).},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
  year = 2011
}
@inproceedings{Cassia_IS11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                   King, Simon},
  title = {Can Objective Measures Predict the Intelligibility of
                   Modified {HMM}-based Synthetic Speech in Noise?},
  booktitle = {Proc. Interspeech},
  abstract = {{Synthetic speech can be modified to improve
                   intelligibility in noise. In order to perform
                   modifications automatically, it would be useful to have
                   an objective measure that could predict the
                   intelligibility of modified synthetic speech for human
                   listeners. We analysed the impact on intelligibility
                   – and on how well objective measures predict it –
                   when we separately modify speaking rate, fundamental
                   frequency, line spectral pairs and spectral peaks.
                   Shifting LSPs can increase intelligibility for human
                   listeners; other modifications had weaker effects.
                   Among the objective measures we evaluated, the Dau
                   model and the Glimpse proportion were the best
                   predictors of human performance.}},
  categories = {HMM-based speech synthesis, objective measures of
                   intelligibility},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_IS11.pdf},
  year = 2011
}
@inproceedings{richmond2011a,
  author = {Richmond, Korin and Hoole, Phil and King, Simon},
  title = {Announcing the Electromagnetic Articulography (Day 1)
                   Subset of the mngu0 Articulatory Corpus},
  booktitle = {Proc. Interspeech},
  pages = {1505--1508},
  address = {Florence, Italy},
  abstract = {This paper serves as an initial announcement of the
                   availability of a corpus of articulatory data called
                   mngu0. This corpus will ultimately consist of a
                   collection of multiple sources of articulatory data
                   acquired from a single speaker: electromagnetic
                   articulography (EMA), audio, video, volumetric MRI
                   scans, and 3D scans of dental impressions. This data
                   will be provided free for research use. In this first
                   stage of the release, we are making available one
                   subset of EMA data, consisting of more than 1,300
                   phonetically diverse utterances recorded with a
                   Carstens AG500 electromagnetic articulograph.
                   Distribution of mngu0 will be managed by a dedicated
                   ``forum-style'' web site. This paper both outlines the
                   general goals motivating the distribution of the data
                   and the creation of the mngu0 web forum, and also
                   provides a description of the EMA data contained in
                   this initial release.},
  categories = {articulography, corpus, EMA},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110767.pdf},
  year = 2011
}
@inproceedings{mcinnes_cogsci2011,
  author = {Fergus R. McInnes and Sharon J. Goldwater},
  title = {Unsupervised Extraction of Recurring Words from
                   Infant-Directed Speech},
  booktitle = {Proceedings of CogSci 2011},
  address = {Boston, Massachusetts},
  abstract = {To date, most computational models of infant word
                   segmentation have worked from phonemic or phonetic
                   input, or have used toy datasets. In this paper, we
                   present an algorithm for word extraction that works
                   directly from naturalistic acoustic input:
                   infant-directed speech from the CHILDES corpus. The
                   algorithm identifies recurring acoustic patterns that
                   are candidates for identification as words or phrases,
                   and then clusters together the most similar patterns.
                   The recurring patterns are found in a single pass
                   through the corpus using an incremental method, where
                   only a small number of utterances are considered at
                   once. Despite this limitation, we show that the
                   algorithm is able to extract a number of recurring
                   words, including some that infants learn earliest, such
                   as "Mommy" and the child’s name. We also introduce a
                   novel information-theoretic evaluation measure.},
  categories = {language acquisition, word segmentation, speech
                   recognition, computational modelling},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/mcinnes_cogsci2011.pdf},
  year = 2011
}
@inproceedings{5947440,
  author = {De Leon, P.L. and Hernaez, I. and Saratxaga, I. and
                   Pucher, M. and Yamagishi, J.},
  title = {Detection of synthetic speech for the problem of
                   imposture},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
                   IEEE International Conference on},
  pages = {4844--4847},
  abstract = {In this paper, we present new results from our
                   research into the vulnerability of a speaker
                    verification (SV) system to synthetic speech. We use an
                   HMM-based speech synthesizer, which creates synthetic
                   speech for a targeted speaker through adaptation of a
                   background model and both GMM-UBM and support vector
                   machine (SVM) SV systems. Using 283 speakers from the
                   Wall-Street Journal (WSJ) corpus, our SV systems have a
                   0.35% EER. When the systems are tested with synthetic
                   speech generated from speaker models derived from the
                    WSJ corpus, over 91% of the matched claims are
                   accepted. We propose the use of relative phase shift
                   (RPS) in order to detect synthetic speech and develop a
                   GMM-based synthetic speech classifier (SSC). Using the
                   SSC, we are able to correctly classify human speech in
                   95% of tests and synthetic speech in 88% of tests thus
                   significantly reducing the vulnerability.},
  doi = {10.1109/ICASSP.2011.5947440},
  issn = {1520-6149},
  keywords = {EER;GMM-UBM;GMM-based synthetic speech
                   classifier;HMM-based speech synthesizer;RPS;SSC;SV
                   system;WSJ corpus;Wall-Street Journal corpus;relative
                   phase shift;speaker verification system;support vector
                   machine;hidden Markov models;speaker recognition;speech
                   synthesis;support vector machines;},
  month = may,
  year = 2011
}
@inproceedings{wilson_hofer:iui2011,
  author = {Theresa Wilson and Gregor Hofer},
  title = {Using Linguistic and Vocal Expressiveness in Social
                   Role Recognition},
  booktitle = {Proc.~Int.~Conf.~on Intelligent User Interfaces,
                   IUI2011},
  address = {Palo Alto, USA},
  publisher = {ACM},
  abstract = {In this paper, we investigate two types of
                   expressiveness, linguistic and vocal, and whether they
                    are useful for recognising the social roles of
                    participants in meetings. Our experiments show that
                   combining expressiveness features with speech activity
                   does improve social role recognition over speech
                   activity features alone.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/WilsonHoferIUI2010sub.pdf},
  year = 2011
}
@inproceedings{lei2011a,
  author = {Lei, Ming and Yamagishi, Junichi and Richmond, Korin
                   and Ling, Zhen-Hua and King, Simon and Dai, Li-Rong},
  title = {Formant-controlled {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {2777--2780},
  address = {Florence, Italy},
  abstract = {This paper proposes a novel framework that enables us
                   to manipulate and control formants in HMM-based speech
                   synthesis. In this framework, the dependency between
                   formants and spectral features is modelled by piecewise
                   linear transforms; formant parameters are effectively
                   mapped by these to the means of Gaussian distributions
                   over the spectral synthesis parameters. The spectral
                   envelope features generated under the influence of
                   formants in this way may then be passed to high-quality
                   vocoders to generate the speech waveform. This provides
                   two major advantages over conventional frameworks.
                   First, we can achieve spectral modification by changing
                   formants only in those parts where we want control,
                   whereas the user must specify all formants manually in
                   conventional formant synthesisers (e.g. Klatt). Second,
                   this can produce high-quality speech. Our results show
                   the proposed method can control vowels in the
                    synthesized speech by manipulating F1 and F2 without
                   any degradation in synthesis quality.},
  categories = {speech synthesis, hidden Markov model, formants,
                   controllability},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110769.pdf},
  year = 2011
}
@article{john:ieee2011,
  author = {J. Dines and J. Yamagishi and S. King},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  journal = {IEEE Journal of Selected Topics in Signal Processing},
  note = {(in press)},
  abstract = {The EMIME European project is conducting research in
                   the development of technologies for mobile,
                   personalised speech-to-speech translation systems. The
                   hidden Markov model (HMM) is being used as the
                   underlying technology in both automatic speech
                   recognition (ASR) and text-to-speech synthesis (TTS)
                   components, thus, the investigation of unified
                   statistical modelling approaches has become an implicit
                   goal of our research. As one of the first steps towards
                   this goal, we have been investigating commonalities and
                   differences between HMM-based ASR and TTS. In this
                   paper we present results and analysis of a series of
                   experiments that have been conducted on English ASR and
                   TTS systems measuring their performance with respect to
                   phone set and lexicon; acoustic feature type and
                   dimensionality; HMM topology; and speaker adaptation.
                   Our results show that, although the fundamental
                   statistical model may be essentially the same, optimal
                   ASR and TTS performance often demands diametrically
                   opposed system designs. This represents a major
                   challenge to be addressed in the investigation of such
                   unified modelling approaches.},
  doi = {10.1109/JSTSP.2010.2079315},
  keywords = {Acoustics, Adaptation model, Context modeling, Hidden
                   Markov models, Speech, Speech recognition, Training,
                   speech recognition, speech synthesis, unified models},
  year = 2011
}
@inproceedings{Cassia_ICASSP11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                   King, Simon},
  title = {Evaluation of objective measures for intelligibility
                   prediction of {HMM}-based synthetic speech in noise},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
                   IEEE International Conference on},
  pages = {5112--5115},
  abstract = {{In this paper we evaluate four objective measures of
                   speech with regards to intelligibility prediction of
                   synthesized speech in diverse noisy situations. We
                   evaluated three intelligibility measures, the Dau
                   measure, the glimpse proportion and the Speech
                   Intelligibility Index (SII) and a quality measure, the
                   Perceptual Evaluation of Speech Quality (PESQ). For the
                   generation of synthesized speech we used a state of the
                   art HMM-based speech synthesis system. The noisy
                   conditions comprised four additive noises. The measures
                   were compared with subjective intelligibility scores
                   obtained in listening tests. The results show the Dau
                   and the glimpse measures to be the best predictors of
                   intelligibility, with correlations of around 0.83 to
                   subjective scores. All measures gave less accurate
                   predictions of intelligibility for synthetic speech
                   than have previously been found for natural speech; in
                   particular the SII measure. In additional experiments,
                   we processed the synthesized speech by an ideal binary
                   mask before adding noise. The Glimpse measure gave the
                   most accurate intelligibility predictions in this
                   situation.}},
  categories = {HMM-based speech synthesis, objective measures of
                   intelligibility},
  doi = {10.1109/ICASSP.2011.5947507},
  issn = {1520-6149},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_ICASSP11.pdf},
  year = 2011
}
@inproceedings{wester_icassp:11,
  author = {Mirjam Wester and Reima Karhila},
  title = {Speaker Similarity Evaluation of Foreign-accented
                   Speech Synthesis using {HMM}-based Speaker Adaptation},
  booktitle = {Proc. ICASSP},
  pages = {5372--5375},
  address = {Prague, Czech Republic},
  abstract = {This paper describes a speaker discrimination
                   experiment in which native English listeners were
                   presented with natural and synthetic speech stimuli in
                   English and were asked to judge whether they thought
                   the sentences were spoken by the same person or not.
                   The natural speech consisted of recordings of Finnish
                   speakers speaking English. The synthetic stimuli were
                   created using adaptation data from the same Finnish
                   speakers. Two average voice models were compared: one
                   trained on Finnish-accented English and the other on
                   American-accented English. The experiments illustrate
                   that listeners perform well at speaker discrimination
                   when the stimuli are both natural or both synthetic,
                   but when the speech types are crossed performance drops
                   significantly. We also found that the type of accent in
                   the average voice model had no effect on the
                   listeners’ speaker discrimination performance.},
  categories = {Similarity Evaluation, Speaker Adaptation,
                   HMM-synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_icassp_2011.pdf},
  year = 2011
}
@inproceedings{Wolters2011,
  author = {Wolters, Maria Klara and Johnson, Christine and Isaac,
                    Karl B.},
  title = {Can the Hearing Handicap Inventory for Adults Be Used
                   As a Screen for Perception Experiments?},
  booktitle = {Proc. ICPhS XVII},
  address = {Hong Kong},
  abstract = {When screening participants for speech perception
                   experiments, formal audiometric screens are often not
                   an option, especially when studies are conducted over
                   the Internet. We investigated whether a brief
                   standardized self-report questionnaire, the screening
                   version of the Hearing Handicap Inventory for Adults
                   (HHIA-S), could be used to approximate the results of
                   audiometric screening. Our results suggest that while
                   the HHIA-S is useful, it needs to be used with
                   extremely strict cut-off values that could exclude
                   around 25\% of people with no hearing impairment who
                    are interested in participating. Well-constructed,
                   standardized single questions might be a more feasible
                   alternative, in particular for web experiments.},
  categories = {audiometry,hearing handicap inventory,screening},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Wolters_icphs.pdf},
  year = 2011
}
@article{Stan2011442,
  author = {Adriana Stan and Junichi Yamagishi and Simon King and
                   Matthew Aylett},
  title = {The {R}omanian speech synthesis ({RSS}) corpus:
                   Building a high quality {HMM}-based speech synthesis
                   system using a high sampling rate},
  journal = {Speech Communication},
  volume = {53},
  number = {3},
  pages = {442--450},
  abstract = {This paper first introduces a newly-recorded high
                   quality Romanian speech corpus designed for speech
                   synthesis, called ``RSS'', along with Romanian
                   front-end text processing modules and HMM-based
                   synthetic voices built from the corpus. All of these
                   are now freely available for academic use in order to
                   promote Romanian speech technology research. The RSS
                   corpus comprises 3500 training sentences and 500 test
                   sentences uttered by a female speaker and was recorded
                   using multiple microphones at 96 kHz sampling
                   frequency in a hemianechoic chamber. The details of the
                   new Romanian text processor we have developed are also
                   given. Using the database, we then revisit some basic
                   configuration choices of speech synthesis, such as
                   waveform sampling frequency and auditory frequency
                   warping scale, with the aim of improving speaker
                   similarity, which is an acknowledged weakness of
                   current HMM-based speech synthesisers. As we
                   demonstrate using perceptual tests, these configuration
                   choices can make substantial differences to the quality
                   of the synthetic speech. Contrary to common practice in
                   automatic speech recognition, higher waveform sampling
                   frequencies can offer enhanced feature extraction and
                   improved speaker similarity for HMM-based speech
                   synthesis.},
  doi = {10.1016/j.specom.2010.12.002},
  issn = {0167-6393},
  keywords = {Speech synthesis, HTS, Romanian, HMMs, Sampling
                   frequency, Auditory scale},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639310002074},
  year = 2011
}
@article{lu_spl_2011,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Regularized Subspace Gaussian Mixture Models for Speech
                   Recognition},
  journal = {IEEE Signal Processing Letters},
  volume = {18},
  number = {7},
  pages = {419--422},
  abstract = {Subspace Gaussian mixture models (SGMMs) provide a
                   compact representation of the Gaussian parameters in an
                   acoustic model, but may still suffer from over-fitting
                   with insufficient training data. In this letter, the
                   SGMM state parameters are estimated using a penalized
                   maximum-likelihood objective, based on $\ell_1$ and
                   $\ell_2$ regularization, as well as their combination,
                   referred to as the elastic net, for robust model
                   estimation. Experiments on the 5000-word Wall Street
                   Journal transcription task show word error rate
                   reduction and improved model robustness with
                   regularization.},
  categories = {Acoustic Modelling, Regularization, Sparsity, Subspace
                   Gaussian Mixture Model},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-spl-2011.pdf},
  year = 2011
}
@incollection{Pipe_etal:2011,
  author = {A. G. Pipe and R. Vaidyanathan and C. Melhuish and P.
                   Bremner and P. Robinson and R. A. J. Clark and A. Lenz
                   and K. Eder and N. Hawes and Z. Ghahramani and M.
                   Fraser and M. Mermehdi and P. Healey and S. Skachek},
  title = {Affective Robotics: Human Motion and Behavioural
                   Inspiration for Cooperation between Humans and
                   Assistive Robots},
  booktitle = {Biomimetics: Nature-Based Innovation},
  publisher = {Taylor and Francis},
  editor = {Yoseph Bar-Cohen},
  chapter = {15},
  year = 2011
}
@article{wang_ieeesigprocletters2011,
  author = {Dong Wang and Simon King},
  title = {Letter-to-Sound Pronunciation Prediction Using
                   Conditional Random Fields},
  journal = {IEEE Signal Processing Letters},
  volume = {18},
  number = {2},
  pages = {122--125},
  abstract = {Pronunciation prediction, or letter-to-sound (LTS)
                   conversion, is an essential task for speech synthesis,
                    open vocabulary spoken term detection and other
                   applications dealing with novel words. Most current
                   approaches (at least for English) employ data-driven
                   methods to learn and represent pronunciation ``rules''
                   using statistical models such as decision trees, hidden
                   Markov models (HMMs) or joint-multigram models (JMMs).
                   The LTS task remains challenging, particularly for
                   languages with a complex relationship between spelling
                   and pronunciation such as English. In this paper, we
                   propose to use a conditional random field (CRF) to
                   perform LTS because it avoids having to model a
                   distribution over observations and can perform global
                   inference, suggesting that it may be more suitable for
                   LTS than decision trees, HMMs or JMMs. One challenge in
                   applying CRFs to LTS is that the phoneme and grapheme
                   sequences of a word are generally of different lengths,
                   which makes CRF training difficult. To solve this
                   problem, we employed a joint-multigram model to
                   generate aligned training exemplars. Experiments
                   conducted with the AMI05 dictionary demonstrate that a
                   CRF significantly outperforms other models, especially
                   if n-best lists of predictions are generated.},
  categories = {letter-to-sound, conditional random field,
                   joint multigram model, speech synthesis, spoken term
                   detection},
  doi = {10.1109/LSP.2010.2098440},
  month = feb,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_ieeesigprocletters2011.pdf},
  year = 2011
}
@inproceedings{cabral2011a,
  author = {Cabral, J.P. and Renals, S. and Yamagishi, J. and
                   Richmond, K.},
  title = {{HMM}-based speech synthesiser using the {LF}-model of
                   the glottal source},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
                   IEEE International Conference on},
  pages = {4704--4707},
  abstract = {A major factor which causes a deterioration in speech
                   quality in {HMM}-based speech synthesis is the use of a
                   simple delta pulse signal to generate the excitation of
                   voiced speech. This paper sets out a new approach to
                   using an acoustic glottal source model in HMM-based
                   synthesisers instead of the traditional pulse signal.
                   The goal is to improve speech quality and to better
                   model and transform voice characteristics. We have
                   found the new method decreases buzziness and also
                   improves prosodic modelling. A perceptual evaluation
                   has supported this finding by showing a 55.6%
                   preference for the new system, as against the baseline.
                   This improvement, while not being as significant as we
                   had initially expected, does encourage us to work on
                   developing the proposed speech synthesiser further.},
  categories = {HMM-based speech synthesiser;acoustic glottal source
                   model LF-model;delta pulse signal;perceptual
                   evaluation;prosodic modelling;speech quality;voiced
                   speech generation;hidden Markov models;speech
                   synthesis;},
  doi = {10.1109/ICASSP.2011.5947405},
  issn = {1520-6149},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
  year = 2011
}
@inproceedings{watts_zhou_2011,
  author = {Oliver Watts and Bowen Zhou},
  title = {Unsupervised features from text for speech synthesis
                   in a speech-to-speech translation system},
  booktitle = {Proc. Interspeech},
  pages = {2153--2156},
  address = {Florence, Italy},
  abstract = {We explore the use of linguistic features for
                    text-to-speech (TTS) conversion in the context of a
                    speech-to-speech translation system, where the features
                    can be extracted from unannotated text in an
                    unsupervised, language-independent fashion. The
                    features are intended
                   to act as surrogates for conventional part of speech
                   (POS) features. Unlike POS features, the experimental
                   features assume only the availability of tools and data
                   that must already be in place for the construction of
                   other components of the translation system, and can
                   therefore be used for the TTS module without incurring
                   additional TTS-specific costs. We here describe the use
                   of the experimental features in a speech synthesiser,
                   using six different configurations of the system to
                   allow the comparison of the proposed features with
                   conventional, knowledge-based POS features. We present
                   results of objective and subjective evaluations of the
                   usefulness of the new features.},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_zhou_2011.pdf},
  year = 2011
}
@inproceedings{ling2011a,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {Feature-space transform tying in unified
                   acoustic-articulatory modelling of articulatory control
                   of {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {117--120},
  address = {Florence, Italy},
  abstract = {In previous work, we have proposed a method to control
                   the characteristics of synthetic speech flexibly by
                   integrating articulatory features into hidden Markov
                   model (HMM) based parametric speech synthesis. A
                   unified acoustic-articulatory model was trained and a
                   piecewise linear transform was adopted to describe the
                   dependency between these two feature streams. The
                   transform matrices were trained for each HMM state and
                   were tied based on each state's context. In this paper,
                   an improved acoustic-articulatory modelling method is
                   proposed. A Gaussian mixture model (GMM) is introduced
                   to model the articulatory space and the cross-stream
                   transform matrices are trained for each Gaussian
                   mixture instead of context-dependently. This means the
                   dependency relationship can vary with the change of
                   articulatory features flexibly. Our results show this
                   method improves the effectiveness of control over vowel
                    quality by modifying articulatory trajectories without
                   degrading naturalness.},
  categories = {speech synthesis, articulatory features, hidden Markov
                   model, Gaussian mixture model},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110482.pdf},
  year = 2011
}
@article{10.1109/MCG.2011.71,
  author = {Michael A. Berger and Gregor Hofer and Hiroshi
                   Shimodaira},
  title = {Carnival -- Combining Speech Technology and Computer
                   Animation},
  journal = {IEEE Computer Graphics and Applications},
  volume = {31},
  pages = {80--89},
  address = {Los Alamitos, CA, USA},
  doi = {10.1109/MCG.2011.71},
  issn = {0272-1716},
  publisher = {IEEE Computer Society},
  year = 2011
}
@inproceedings{kilgour2011,
  author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
  title = {The {Ambient Spotlight}: Personal meeting capture with
                   a microphone array},
  booktitle = {Proc. HSCMA},
  abstract = {We present the Ambient Spotlight system for personal
                   meeting capture based on a portable USB microphone
                    array and a laptop. The system combines distant speech
                   recognition and content linking with personal
                   productivity tools, and enables recognised meeting
                   recordings to be integrated with desktop search,
                    calendar, and email.},
  doi = {10.1109/HSCMA.2011.5942389},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/ambientDemo.pdf},
  year = 2011
}
@inproceedings{5947506,
  author = {Hashimoto, K. and Yamagishi, J. and Byrne, W. and
                   King, S. and Tokuda, K.},
  title = {An analysis of machine translation and speech
                   synthesis in speech-to-speech translation system},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
                   IEEE International Conference on},
  pages = {5108--5111},
  abstract = {This paper provides an analysis of the impacts of
                   machine translation and speech synthesis on
                   speech-to-speech translation systems. The
                   speech-to-speech translation system consists of three
                   components: speech recognition, machine translation and
                   speech synthesis. Many techniques for integration of
                   speech recognition and machine translation have been
                   proposed. However, speech synthesis has not yet been
                   considered. Therefore, in this paper, we focus on
                   machine translation and speech synthesis, and report a
                   subjective evaluation to analyze the impact of each
                   component. The results of these analyses show that the
                   naturalness and intelligibility of synthesized speech
                   are strongly affected by the fluency of the translated
                   sentences.},
  doi = {10.1109/ICASSP.2011.5947506},
  issn = {1520-6149},
  keywords = {machine translation;speech recognition;speech
                   synthesis;speech-to-speech translation system;speech
                   recognition;speech synthesis;},
  month = may,
  year = 2011
}
@article{renals2011,
  author = {Renals, S.},
  title = {Automatic analysis of multiparty meetings},
  journal = {SADHANA - Academy Proceedings in Engineering Sciences},
  volume = {36},
  number = {5},
  pages = {917--932},
  abstract = {This paper is about the recognition and interpretation
                   of multiparty meetings captured as audio, video and
                   other signals. This is a challenging task since the
                   meetings consist of spontaneous and conversational
                   interactions between a number of participants: it is a
                   multimodal, multiparty, multistream problem. We discuss
                   the capture and annotation of the AMI meeting corpus,
                   the development of a meeting speech recognition system,
                   and systems for the automatic segmentation,
                   summarisation and social processing of meetings,
                   together with some example applications based on these
                   systems.},
  doi = {10.1007/s12046-011-0051-3},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/renals-sadhana10.pdf},
  year = 2011
}
@techreport{wester_mandarin:11,
  author = {Mirjam Wester and Hui Liang},
  title = {The {EMIME} {M}andarin {B}ilingual {D}atabase},
  institution = {The University of Edinburgh},
  number = {EDI-INF-RR-1396},
  abstract = {This paper describes the collection of a bilingual
                   database of Mandarin/English data. In addition, the
                   accents of the talkers in the database have been rated.
                   English and Mandarin listeners assessed the English and
                   Mandarin talkers' degree of foreign accent in English.},
  categories = {evaluation,cross-lingual, accent rating},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_mandarin_2011.pdf},
  year = 2011
}
@article{winterboer-csl:11,
  author = {Andi K. Winterboer and Martin I. Tietze and Maria K.
                   Wolters and Johanna D. Moore},
  title = {The user-model based summarize and refine approach
                   improves information presentation in spoken dialog
                   systems},
  journal = {Computer Speech and Language},
  volume = {25},
  number = {2},
  pages = {175--191},
  abstract = {A common task for spoken dialog systems (SDS) is to
                   help users select a suitable option (e.g., flight,
                   hotel, and restaurant) from the set of options
                   available. As the number of options increases, the
                   system must have strategies for generating summaries
                   that enable the user to browse the option space
                   efficiently and successfully. In the user-model based
                   summarize and refine approach (UMSR, Demberg and Moore,
                   2006), options are clustered to maximize utility with
                   respect to a user model, and linguistic devices such as
                   discourse cues and adverbials are used to highlight the
                   trade-offs among the presented items. In a Wizard-of-Oz
                   experiment, we show that the UMSR approach leads to
                   improvements in task success, efficiency, and user
                   satisfaction compared to an approach that clusters the
                   available options to maximize coverage of the domain
                   (Polifroni et al., 2003). In both a laboratory
                   experiment and a web-based experimental paradigm
                   employing the Amazon Mechanical Turk platform, we show
                   that the discourse cues in UMSR summaries help users
                   compare different options and choose between options,
                   even though they do not improve verbatim recall. This
                   effect was observed for both written and spoken
                   stimuli.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/CSL10.pdf},
  year = 2011
}
@article{mayo:clark:king:10,
  author = {Mayo, C. and Clark, R. A. J. and King, S.},
  title = {Listeners' Weighting of Acoustic Cues to Synthetic
                   Speech Naturalness: A Multidimensional Scaling Analysis},
  journal = {Speech Communication},
  volume = {53},
  number = {3},
  pages = {311--326},
  abstract = {The quality of current commercial speech synthesis
                   systems is now so high that system improvements are
                   being made at subtle sub- and supra-segmental levels.
                   Human perceptual evaluation of such subtle improvements
                   requires a highly sophisticated level of perceptual
                   attention to specific acoustic characteristics or cues.
                   However, it is not well understood what acoustic cues
                   listeners attend to by default when asked to evaluate
                   synthetic speech. It may, therefore, be potentially
                   quite difficult to design an evaluation method that
                   allows listeners to concentrate on only one dimension
                   of the signal, while ignoring others that are
                   perceptually more important to them. The aim of the
                   current study was to determine which acoustic
                   characteristics of unit-selection synthetic speech are
                   most salient to listeners when evaluating the
                   naturalness of such speech. This study made use of
                   multidimensional scaling techniques to analyse
                   listeners' pairwise comparisons of synthetic speech
                   sentences. Results indicate that listeners place a
                   great deal of perceptual importance on the presence of
                   artifacts and discontinuities in the speech, somewhat
                   less importance on aspects of segmental quality, and
                   very little importance on stress/intonation
                   appropriateness. These relative differences in
                   importance will impact on listeners' ability to attend
                   to these different acoustic characteristics of
                   synthetic speech, and should therefore be taken into
                   account when designing appropriate methods of synthetic
                   speech evaluation.},
  doi = {10.1016/j.specom.2010.10.003},
  keywords = {Speech synthesis; Evaluation; Speech perception;
                   Acoustic cue weighting; Multidimensional scaling},
  year = 2011
}
@inproceedings{lu_asru_2011,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {Regularized Subspace {G}aussian Mixture Models for
                   Cross-lingual Speech Recognition},
  booktitle = {Proc. ASRU},
  abstract = {We investigate cross-lingual acoustic modelling for
                   low resource languages using the subspace Gaussian
                   mixture model (SGMM). We assume the presence of
                   acoustic models trained on multiple source languages,
                   and use the global subspace parameters from those
                   models for improved modelling in a target language with
                   limited amounts of transcribed speech. Experiments on
                   the GlobalPhone corpus using Spanish, Portuguese, and
                   Swedish as source languages and German as target
                   language (with 1 hour and 5 hours of transcribed audio)
                   show that multilingually trained SGMM shared parameters
                   result in lower word error rates (WERs) than using
                   those from a single source language. We also show that
                   regularizing the estimation of the SGMM state vectors
                    by penalizing their $\ell_1$-norm helps to overcome
                    numerical instabilities and leads to lower WER.},
  categories = {Subspace Gaussian Mixture Model, Cross-lingual, model
                   regularization},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-asru-2011.pdf},
  year = 2011
}
@inproceedings{wang_icassp2011a,
  author = {Dong Wang and Nicholas Evans and Raphael Troncy and
                   Simon King},
  title = {Handling overlaps in spoken term detection},
  booktitle = {Proc. International Conference on Acoustics, Speech
                   and Signal Processing},
  pages = {5656--5659},
  abstract = {Spoken term detection (STD) systems usually arrive at
                   many overlapping detections which are often addressed
                   with some pragmatic approaches, e.g. choosing the best
                   detection to represent all the overlaps. In this paper
                   we present a theoretical study based on a concept of
                   acceptance space. In particular, we present two
                   confidence estimation approaches based on Bayesian and
                   evidence perspectives respectively. Analysis shows that
                    both approaches possess respective advantages and
                   shortcomings, and that their combination has the
                   potential to provide an improved confidence estimation.
                   Experiments conducted on meeting data confirm our
                   analysis and show considerable performance improvement
                   with the combined approach, in particular for
                   out-of-vocabulary spoken term detection with stochastic
                   pronunciation modeling.},
  categories = {spoken term detection, speech recognition},
  doi = {10.1109/ICASSP.2011.5947643},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_icassp2011a.pdf},
  year = 2011
}