@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2011-citations -ob /home/korin/projects/publications/new_output/transitdata/2011.bib -c 'year : "2011"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{dzikovskaSIGDIAL20112,
author = {Dzikovska, Myroslava and Isard, Amy and Bell, Peter
and Moore, Johanna and Steinhauser, Natalie and
Campbell, Gwendolyn},
title = {{Beetle II}: an adaptable tutorial dialogue system},
booktitle = {Proceedings of the SIGDIAL 2011 Conference, demo
session},
pages = {338--340},
address = {Portland, Oregon},
publisher = {Association for Computational Linguistics},
abstract = {We present Beetle II, a tutorial dialogue system which
accepts unrestricted language input and supports
experimentation with different tutorial planning and
dialogue strategies. Our first system evaluation
compared two tutorial policies and demonstrated that
the system can be used to study the impact of different
approaches to tutoring. The system is also designed to
allow experimentation with a variety of natural
language techniques, and discourse and dialogue
strategies.},
month = jun,
url = {http://www.aclweb.org/anthology/W11-2041},
year = 2011
}
@inproceedings{karhila_interspeech:11,
author = {Reima Karhila and Mirjam Wester},
title = {Rapid Adaptation of Foreign-accented {HMM}-based
Speech Synthesis},
booktitle = {Proc. Interspeech},
address = {Florence, Italy},
abstract = {This paper presents findings of listeners’
perception of speaker identity in synthetic speech.
Specifically, we investigated what the effect is on the
perceived identity of a speaker when using differently
accented average voice models and limited amounts (five
and fifteen sentences) of a speaker’s data to create
the synthetic stimuli. A speaker discrimination task
was used to measure speaker identity. Native English
listeners were presented with natural and synthetic
speech stimuli in English and were asked to decide
whether they thought the sentences were spoken by the
same person or not. An accent rating task was also
carried out to measure the perceived accents of the
synthetic speech stimuli. The results show that
listeners, for the most part, perform as well at
speaker discrimination when the stimuli have been
created using five or fifteen adaptation sentences as
when using 105 sentences. Furthermore, the accent of
the average voice model does not affect listeners’
speaker discrimination performance even though the
accent rating task shows listeners are perceiving
different accents in the synthetic stimuli. Listeners
do not base their speaker similarity decisions on
perceived accent.},
categories = {speech synthesis, rapid adaptation},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/karhila_wester_interspeech_2011.pdf},
year = 2011
}
@inproceedings{DBLP:conf/aied/DzikovskaIBMSCTCS11,
author = {Myroslava Dzikovska and Amy Isard and Peter Bell and
Johanna D. Moore and Natalie B. Steinhauser and
Gwendolyn E. Campbell and Leanne S. Taylor and Simon
Caine and Charlie Scott},
title = {Adaptive Intelligent Tutorial Dialogue in the {Beetle
II} System},
booktitle = {Artificial Intelligence in Education - 15th
International Conference (AIED 2011), interactive event},
volume = {6738},
series = {Lecture Notes in Computer Science},
pages = {621},
address = {Auckland, New Zealand},
publisher = {Springer},
doi = {10.1007/978-3-642-21869-9_122},
year = 2011
}
@inproceedings{uria2011deep,
author = {Uria, Benigno and Renals, Steve and Richmond, Korin},
title = {A Deep Neural Network for Acoustic-Articulatory Speech
Inversion},
booktitle = {Proc. NIPS 2011 Workshop on Deep Learning and
Unsupervised Feature Learning},
address = {Sierra Nevada, Spain},
abstract = {In this work, we implement a deep belief network for
the acoustic-articulatory inversion mapping problem. We
find that adding up to 3 hidden layers improves
inversion accuracy. We also show that this improvement
is due to the higher expressive capability of a deep
model and not a consequence of adding more adjustable
parameters. Additionally, we show that unsupervised
pretraining of the system improves its performance in
all cases, even for a 1 hidden-layer model. Our
implementation obtained an average root mean square
error of 0.95 mm on the MNGU0 test dataset, beating all
previously published results.},
month = dec,
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/articulatory_inversion.pdf},
year = 2011
}
@inproceedings{5947571,
author = {Andraszewicz, S. and Yamagishi, J. and King, S.},
title = {Vocal attractiveness of statistical speech
synthesisers},
booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
IEEE International Conference on},
pages = {5368--5371},
abstract = {Our previous analysis of speaker-adaptive HMM-based
speech synthesis methods suggested that there are two
possible reasons why average voices can obtain higher
subjective scores than any individual adapted voice: 1)
model adaptation degrades speech quality proportionally
to the distance 'moved' by the transforms, and 2)
psychoacoustic effects relating to the attractiveness
of the voice. This paper is a follow-on from that
analysis and aims to separate these effects out. Our
latest perceptual experiments focus on attractiveness,
using average voices and speaker-dependent voices
without model transformation, and show that using
several speakers to create a voice improves smoothness
(measured by Harmonics-to-Noise Ratio), reduces
distance from the average voice in the log F0-F1
space of the final voice and hence makes it more
attractive at the segmental level. However, this is
weakened or overridden at supra-segmental or sentence
levels.},
doi = {10.1109/ICASSP.2011.5947571},
issn = {1520-6149},
keywords = {speaker-adaptive HMM-based speech synthesis
methods;speaker-dependent voices;statistical speech
synthesisers;vocal attractiveness;hidden Markov
models;speaker recognition;speech synthesis;},
month = may,
year = 2011
}
@inproceedings{wester_interspeech:11,
author = {Mirjam Wester and Hui Liang},
title = {Cross-Lingual Speaker Discrimination Using Natural and
Synthetic Speech},
booktitle = {Proc. Interspeech},
address = {Florence, Italy},
abstract = {This paper describes speaker discrimination
experiments in which native English listeners were
presented with either natural speech stimuli in English
and Mandarin, synthetic speech stimuli in English and
Mandarin, or natural Mandarin speech and synthetic
English speech stimuli. In each experiment, listeners
were asked to decide whether they thought the sentences
were spoken by the same person or not. We found that
the results for Mandarin/English speaker discrimination
are very similar to results found in previous work on
German/English and Finnish/English speaker
discrimination. We conclude from this and previous work
that listeners are able to identify speakers across
languages and they are able to identify speakers across
speech types, but the combination of these two factors
leads to a speaker discrimination task which is too
difficult for listeners to perform successfully, given
the quality of across-language speaker adapted speech
synthesis at present.},
categories = {speaker discrimination, speaker adaptation, HMM-based
speech synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_liang_interspeech_2011.pdf},
year = 2011
}
@article{tuomo:ieee2011,
author = {T. Raitio and A. Suni and J. Yamagishi and H. Pulakka
and J. Nurminen and M. Vainio and P. Alku},
title = {{HMM}-Based Speech Synthesis Utilizing Glottal Inverse
Filtering},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = 19,
number = 1,
pages = {153--165},
abstract = {This paper describes a hidden Markov model
(HMM)-based speech synthesizer that utilizes glottal
inverse filtering for generating natural sounding
synthetic speech. In the proposed method, speech is
first decomposed into the glottal source signal and the
model of the vocal tract filter through glottal inverse
filtering, and thus parametrized into excitation and
spectral features. The source and filter features are
modeled individually in the framework of HMM and
generated in the synthesis stage according to the text
input. The glottal excitation is synthesized through
interpolating and concatenating natural glottal flow
pulses, and the excitation signal is further modified
according to the spectrum of the desired voice source
characteristics. Speech is synthesized by filtering the
reconstructed source signal with the vocal tract
filter. Experiments show that the proposed system is
capable of generating natural sounding speech, and the
quality is clearly better compared to two HMM-based
speech synthesis systems based on widely used vocoder
techniques.},
doi = {10.1109/TASL.2010.2045239},
keywords = {Glottal inverse filtering , hidden Markov model (HMM)
, speech synthesis},
month = jan,
year = 2011
}
@inproceedings{watts_yamagishi_king_2011,
author = {Oliver Watts and Junichi Yamagishi and Simon King},
title = {Unsupervised continuous-valued word features for
phrase-break prediction without a part-of-speech tagger},
booktitle = {Proc. Interspeech},
pages = {2157--2160},
address = {Florence, Italy},
abstract = {Part of speech (POS) tags are foremost among the
features conventionally used to predict intonational
phrase-breaks for text to speech (TTS) conversion. The
construction of such systems therefore presupposes the
availability of a POS tagger for the relevant language,
or of a corpus manually tagged with POS. However, such
tools and resources are not available in the majority
of the world’s languages, and manually labelling text
with POS tags is an expensive and time-consuming
process. We therefore propose the use of
continuous-valued features that summarise the
distributional characteristics of word types as
surrogates for POS features. Importantly, such features
are obtained in an unsupervised manner from an untagged
text corpus. We present results on the phrase-break
prediction task, where use of the features closes the
gap in performance between a baseline system (using
only basic punctuation-related features) and a topline
system (incorporating a state-of-the-art POS tagger).},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
year = 2011
}
@inproceedings{Cassia_IS11,
author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
King, Simon},
title = {Can Objective Measures Predict the Intelligibility of
Modified {HMM}-based Synthetic Speech in Noise?},
booktitle = {Proc. Interspeech},
abstract = {{Synthetic speech can be modified to improve
intelligibility in noise. In order to perform
modifications automatically, it would be useful to have
an objective measure that could predict the
intelligibility of modified synthetic speech for human
listeners. We analysed the impact on intelligibility
– and on how well objective measures predict it –
when we separately modify speaking rate, fundamental
frequency, line spectral pairs and spectral peaks.
Shifting LSPs can increase intelligibility for human
listeners; other modifications had weaker effects.
Among the objective measures we evaluated, the Dau
model and the Glimpse proportion were the best
predictors of human performance.}},
categories = {HMM-based speech synthesis, objective measures of
intelligibility},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_IS11.pdf},
year = 2011
}
@inproceedings{richmond2011a,
author = {Richmond, Korin and Hoole, Phil and King, Simon},
title = {Announcing the Electromagnetic Articulography (Day 1)
Subset of the mngu0 Articulatory Corpus},
booktitle = {Proc. Interspeech},
pages = {1505--1508},
address = {Florence, Italy},
abstract = {This paper serves as an initial announcement of the
availability of a corpus of articulatory data called
mngu0. This corpus will ultimately consist of a
collection of multiple sources of articulatory data
acquired from a single speaker: electromagnetic
articulography (EMA), audio, video, volumetric MRI
scans, and 3D scans of dental impressions. This data
will be provided free for research use. In this first
stage of the release, we are making available one
subset of EMA data, consisting of more than 1,300
phonetically diverse utterances recorded with a
Carstens AG500 electromagnetic articulograph.
Distribution of mngu0 will be managed by a dedicated
``forum-style'' web site. This paper both outlines the
general goals motivating the distribution of the data
and the creation of the mngu0 web forum, and also
provides a description of the EMA data contained in
this initial release.},
categories = {articulography, corpus, EMA},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110767.pdf},
year = 2011
}
@inproceedings{mcinnes_cogsci2011,
author = {Fergus R. McInnes and Sharon J. Goldwater},
title = {Unsupervised Extraction of Recurring Words from
Infant-Directed Speech},
booktitle = {Proceedings of CogSci 2011},
address = {Boston, Massachusetts},
abstract = {To date, most computational models of infant word
segmentation have worked from phonemic or phonetic
input, or have used toy datasets. In this paper, we
present an algorithm for word extraction that works
directly from naturalistic acoustic input:
infant-directed speech from the CHILDES corpus. The
algorithm identifies recurring acoustic patterns that
are candidates for identification as words or phrases,
and then clusters together the most similar patterns.
The recurring patterns are found in a single pass
through the corpus using an incremental method, where
only a small number of utterances are considered at
once. Despite this limitation, we show that the
algorithm is able to extract a number of recurring
words, including some that infants learn earliest, such
as "Mommy" and the child’s name. We also introduce a
novel information-theoretic evaluation measure.},
categories = {language acquisition, word segmentation, speech
recognition, computational modelling},
month = jul,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/mcinnes_cogsci2011.pdf},
year = 2011
}
@inproceedings{5947440,
author = {De Leon, P.L. and Hernaez, I. and Saratxaga, I. and
Pucher, M. and Yamagishi, J.},
title = {Detection of synthetic speech for the problem of
imposture},
booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
IEEE International Conference on},
pages = {4844--4847},
abstract = {In this paper, we present new results from our
research into the vulnerability of a speaker
verification (SV) system to synthetic speech. We use a
HMM-based speech synthesizer, which creates synthetic
speech for a targeted speaker through adaptation of a
background model and both GMM-UBM and support vector
machine (SVM) SV systems. Using 283 speakers from the
Wall-Street Journal (WSJ) corpus, our SV systems have a
0.35% EER. When the systems are tested with synthetic
speech generated from speaker models derived from the
WSJ corpus, over 91% of the matched claims are
accepted. We propose the use of relative phase shift
(RPS) in order to detect synthetic speech and develop a
GMM-based synthetic speech classifier (SSC). Using the
SSC, we are able to correctly classify human speech in
95% of tests and synthetic speech in 88% of tests thus
significantly reducing the vulnerability.},
doi = {10.1109/ICASSP.2011.5947440},
issn = {1520-6149},
keywords = {EER;GMM-UBM;GMM-based synthetic speech
classifier;HMM-based speech synthesizer;RPS;SSC;SV
system;WSJ corpus;Wall-Street Journal corpus;relative
phase shift;speaker verification system;support vector
machine;hidden Markov models;speaker recognition;speech
synthesis;support vector machines;},
month = may,
year = 2011
}
@inproceedings{wilson_hofer:iui2011,
author = {Theresa Wilson and Gregor Hofer},
title = {Using Linguistic and Vocal Expressiveness in Social
Role Recognition},
booktitle = {Proc. International Conference on Intelligent User Interfaces (IUI 2011)},
address = {Palo Alto, USA},
publisher = {ACM},
abstract = {In this paper, we investigate two types of
expressiveness, linguistic and vocal, and whether they
are useful for recognising the social roles of
participants in meetings. Our experiments show that
combining expressiveness features with speech activity
does improve social role recognition over speech
activity features alone.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/WilsonHoferIUI2010sub.pdf},
year = 2011
}
@inproceedings{lei2011a,
author = {Lei, Ming and Yamagishi, Junichi and Richmond, Korin
and Ling, Zhen-Hua and King, Simon and Dai, Li-Rong},
title = {Formant-controlled {HMM}-based speech synthesis},
booktitle = {Proc. Interspeech},
pages = {2777--2780},
address = {Florence, Italy},
abstract = {This paper proposes a novel framework that enables us
to manipulate and control formants in HMM-based speech
synthesis. In this framework, the dependency between
formants and spectral features is modelled by piecewise
linear transforms; formant parameters are effectively
mapped by these to the means of Gaussian distributions
over the spectral synthesis parameters. The spectral
envelope features generated under the influence of
formants in this way may then be passed to high-quality
vocoders to generate the speech waveform. This provides
two major advantages over conventional frameworks.
First, we can achieve spectral modification by changing
formants only in those parts where we want control,
whereas the user must specify all formants manually in
conventional formant synthesisers (e.g. Klatt). Second,
this can produce high-quality speech. Our results show
the proposed method can control vowels in the
synthesized speech by manipulating F1 and F2 without
any degradation in synthesis quality.},
categories = {speech synthesis, hidden Markov model, formants,
controllability},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110769.pdf},
year = 2011
}
@article{john:ieee2011,
author = {J. Dines and J. Yamagishi and S. King},
title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
journal = {IEEE Journal of Selected Topics in Signal Processing},
note = {(in press)},
abstract = {The EMIME European project is conducting research in
the development of technologies for mobile,
personalised speech-to-speech translation systems. The
hidden Markov model (HMM) is being used as the
underlying technology in both automatic speech
recognition (ASR) and text-to-speech synthesis (TTS)
components, thus, the investigation of unified
statistical modelling approaches has become an implicit
goal of our research. As one of the first steps towards
this goal, we have been investigating commonalities and
differences between HMM-based ASR and TTS. In this
paper we present results and analysis of a series of
experiments that have been conducted on English ASR and
TTS systems measuring their performance with respect to
phone set and lexicon; acoustic feature type and
dimensionality; HMM topology; and speaker adaptation.
Our results show that, although the fundamental
statistical model may be essentially the same, optimal
ASR and TTS performance often demands diametrically
opposed system designs. This represents a major
challenge to be addressed in the investigation of such
unified modelling approaches.},
doi = {10.1109/JSTSP.2010.2079315},
keywords = {Acoustics, Adaptation model, Context modeling, Hidden
Markov models, Speech, Speech recognition, Training,
speech recognition, speech synthesis, unified models},
year = 2011
}
@inproceedings{Cassia_ICASSP11,
author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
King, Simon},
title = {Evaluation of objective measures for intelligibility
prediction of {HMM}-based synthetic speech in noise},
booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
IEEE International Conference on},
pages = {5112--5115},
abstract = {{In this paper we evaluate four objective measures of
speech with regards to intelligibility prediction of
synthesized speech in diverse noisy situations. We
evaluated three intelligibility measures, the Dau
measure, the glimpse proportion and the Speech
Intelligibility Index (SII) and a quality measure, the
Perceptual Evaluation of Speech Quality (PESQ). For the
generation of synthesized speech we used a state of the
art HMM-based speech synthesis system. The noisy
conditions comprised four additive noises. The measures
were compared with subjective intelligibility scores
obtained in listening tests. The results show the Dau
and the glimpse measures to be the best predictors of
intelligibility, with correlations of around 0.83 to
subjective scores. All measures gave less accurate
predictions of intelligibility for synthetic speech
than have previously been found for natural speech; in
particular the SII measure. In additional experiments,
we processed the synthesized speech by an ideal binary
mask before adding noise. The Glimpse measure gave the
most accurate intelligibility predictions in this
situation.}},
categories = {HMM-based speech synthesis, objective measures of
intelligibility},
doi = {10.1109/ICASSP.2011.5947507},
issn = {1520-6149},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_ICASSP11.pdf},
year = 2011
}
@inproceedings{wester_icassp:11,
author = {Mirjam Wester and Reima Karhila},
title = {Speaker Similarity Evaluation of Foreign-accented
Speech Synthesis using {HMM}-based Speaker Adaptation},
booktitle = {Proc. ICASSP},
pages = {5372--5375},
address = {Prague, Czech Republic},
abstract = {This paper describes a speaker discrimination
experiment in which native English listeners were
presented with natural and synthetic speech stimuli in
English and were asked to judge whether they thought
the sentences were spoken by the same person or not.
The natural speech consisted of recordings of Finnish
speakers speaking English. The synthetic stimuli were
created using adaptation data from the same Finnish
speakers. Two average voice models were compared: one
trained on Finnish-accented English and the other on
American-accented English. The experiments illustrate
that listeners perform well at speaker discrimination
when the stimuli are both natural or both synthetic,
but when the speech types are crossed performance drops
significantly. We also found that the type of accent in
the average voice model had no effect on the
listeners’ speaker discrimination performance.},
categories = {Similarity Evaluation, Speaker Adaptation,
HMM-synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_icassp_2011.pdf},
year = 2011
}
@inproceedings{Wolters2011,
author = {Wolters, Maria Klara and Johnson, Christine and Isaac,
Karl B},
title = {Can the Hearing Handicap Inventory for Adults Be Used
As a Screen for Perception Experiments?},
booktitle = {Proc. ICPhS XVII},
address = {Hong Kong},
abstract = {When screening participants for speech perception
experiments, formal audiometric screens are often not
an option, especially when studies are conducted over
the Internet. We investigated whether a brief
standardized self-report questionnaire, the screening
version of the Hearing Handicap Inventory for Adults
(HHIA-S), could be used to approximate the results of
audiometric screening. Our results suggest that while
the HHIA-S is useful, it needs to be used with
extremely strict cut-off values that could exclude
around 25\% of people with no hearing impairment who
are interested in participating. Well constructed,
standardized single questions might be a more feasible
alternative, in particular for web experiments.},
categories = {audiometry,hearing handicap inventory,screening},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Wolters_icphs.pdf},
year = 2011
}
@article{Stan2011442,
author = {Adriana Stan and Junichi Yamagishi and Simon King and
Matthew Aylett},
title = {The {R}omanian speech synthesis ({RSS}) corpus:
Building a high quality {HMM}-based speech synthesis
system using a high sampling rate},
journal = {Speech Communication},
volume = {53},
number = {3},
pages = {442--450},
abstract = {This paper first introduces a newly-recorded high
quality Romanian speech corpus designed for speech
synthesis, called ``RSS'', along with Romanian
front-end text processing modules and HMM-based
synthetic voices built from the corpus. All of these
are now freely available for academic use in order to
promote Romanian speech technology research. The RSS
corpus comprises 3500 training sentences and 500 test
sentences uttered by a female speaker and was recorded
using multiple microphones at 96 kHz sampling
frequency in a hemianechoic chamber. The details of the
new Romanian text processor we have developed are also
given. Using the database, we then revisit some basic
configuration choices of speech synthesis, such as
waveform sampling frequency and auditory frequency
warping scale, with the aim of improving speaker
similarity, which is an acknowledged weakness of
current HMM-based speech synthesisers. As we
demonstrate using perceptual tests, these configuration
choices can make substantial differences to the quality
of the synthetic speech. Contrary to common practice in
automatic speech recognition, higher waveform sampling
frequencies can offer enhanced feature extraction and
improved speaker similarity for HMM-based speech
synthesis.},
doi = {10.1016/j.specom.2010.12.002},
issn = {0167-6393},
keywords = {Speech synthesis, HTS, Romanian, HMMs, Sampling
frequency, Auditory scale},
url = {http://www.sciencedirect.com/science/article/pii/S0167639310002074},
year = 2011
}
@article{lu_spl_2011,
author = {Lu, L. and Ghoshal, A. and Renals, S.},
title = {Regularized Subspace {G}aussian Mixture Models for Speech
Recognition},
journal = {IEEE Signal Processing Letters},
volume = {18},
number = {7},
pages = {419--422},
abstract = {Subspace Gaussian mixture models (SGMMs) provide a
compact representation of the Gaussian parameters in an
acoustic model, but may still suffer from over-fitting
with insufficient training data. In this letter, the
SGMM state parameters are estimated using a penalized
maximum-likelihood objective, based on $\ell_1$ and
$\ell_2$ regularization, as well as their combination,
referred to as the elastic net, for robust model
estimation. Experiments on the 5000-word Wall Street
Journal transcription task show word error rate
reduction and improved model robustness with
regularization.},
categories = {Acoustic Modelling, Regularization, Sparsity, Subspace
Gaussian Mixture Model},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-spl-2011.pdf},
year = 2011
}
@incollection{Pipe_etal:2011,
author = {A. G. Pipe and R. Vaidyanathan and C. Melhuish and P.
Bremner and P. Robinson and R. A. J. Clark and A. Lenz
and K. Eder and N. Hawes and Z. Ghahramani and M.
Fraser and M. Mermehdi and P. Healey and S. Skachek},
title = {Affective Robotics: Human Motion and Behavioural
Inspiration for Cooperation between Humans and
Assistive Robots},
booktitle = {Biomimetics: Nature-Based Innovation},
publisher = {Taylor and Francis},
editor = {Yoseph Bar-Cohen},
chapter = {15},
year = 2011
}
@article{wang_ieeesigprocletters2011,
author = {Dong Wang and Simon King},
title = {Letter-to-Sound Pronunciation Prediction Using
Conditional Random Fields},
journal = {IEEE Signal Processing Letters},
volume = {18},
number = {2},
pages = {122--125},
abstract = {Pronunciation prediction, or letter-to-sound (LTS)
conversion, is an essential task for speech synthesis,
open vocabulary spoken term detection and other
applications dealing with novel words. Most current
approaches (at least for English) employ data-driven
methods to learn and represent pronunciation ``rules''
using statistical models such as decision trees, hidden
Markov models (HMMs) or joint-multigram models (JMMs).
The LTS task remains challenging, particularly for
languages with a complex relationship between spelling
and pronunciation such as English. In this paper, we
propose to use a conditional random field (CRF) to
perform LTS because it avoids having to model a
distribution over observations and can perform global
inference, suggesting that it may be more suitable for
LTS than decision trees, HMMs or JMMs. One challenge in
applying CRFs to LTS is that the phoneme and grapheme
sequences of a word are generally of different lengths,
which makes CRF training difficult. To solve this
problem, we employed a joint-multigram model to
generate aligned training exemplars. Experiments
conducted with the AMI05 dictionary demonstrate that a
CRF significantly outperforms other models, especially
if n-best lists of predictions are generated.},
categories = {letter-to-sound, conditional random field,
joint multigram model, speech synthesis, spoken term
detection},
doi = {10.1109/LSP.2010.2098440},
month = feb,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_ieeesigprocletters2011.pdf},
year = 2011
}
@inproceedings{cabral2011a,
author = {Cabral, J.P. and Renals, S. and Yamagishi, J. and
Richmond, K.},
title = {{HMM}-based speech synthesiser using the {LF}-model of
the glottal source},
booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
IEEE International Conference on},
pages = {4704--4707},
abstract = {A major factor which causes a deterioration in speech
quality in {HMM}-based speech synthesis is the use of a
simple delta pulse signal to generate the excitation of
voiced speech. This paper sets out a new approach to
using an acoustic glottal source model in HMM-based
synthesisers instead of the traditional pulse signal.
The goal is to improve speech quality and to better
model and transform voice characteristics. We have
found the new method decreases buzziness and also
improves prosodic modelling. A perceptual evaluation
has supported this finding by showing a 55.6%
preference for the new system, as against the baseline.
This improvement, while not being as significant as we
had initially expected, does encourage us to work on
developing the proposed speech synthesiser further.},
categories = {HMM-based speech synthesiser;acoustic glottal source
model LF-model;delta pulse signal;perceptual
evaluation;prosodic modelling;speech quality;voiced
speech generation;hidden Markov models;speech
synthesis;},
doi = {10.1109/ICASSP.2011.5947405},
issn = {1520-6149},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
year = 2011
}
@inproceedings{watts_zhou_2011,
author = {Oliver Watts and Bowen Zhou},
title = {Unsupervised features from text for speech synthesis
in a speech-to-speech translation system},
booktitle = {Proc. Interspeech},
pages = {2153--2156},
address = {Florence, Italy},
abstract = {We explore the use of linguistic features for text to
speech (TTS) conversion, in the context of a
speech-to-speech translation system, that can be
extracted from unannotated text in an unsupervised,
language-independent fashion. The features are intended
to act as surrogates for conventional part of speech
(POS) features. Unlike POS features, the experimental
features assume only the availability of tools and data
that must already be in place for the construction of
other components of the translation system, and can
therefore be used for the TTS module without incurring
additional TTS-specific costs. We here describe the use
of the experimental features in a speech synthesiser,
using six different configurations of the system to
allow the comparison of the proposed features with
conventional, knowledge-based POS features. We present
results of objective and subjective evaluations of the
usefulness of the new features.},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_zhou_2011.pdf},
year = 2011
}
@inproceedings{ling2011a,
author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
Junichi},
title = {Feature-space transform tying in unified
acoustic-articulatory modelling of articulatory control
of {HMM}-based speech synthesis},
booktitle = {Proc. Interspeech},
pages = {117--120},
address = {Florence, Italy},
abstract = {In previous work, we have proposed a method to control
the characteristics of synthetic speech flexibly by
integrating articulatory features into hidden Markov
model (HMM) based parametric speech synthesis. A
unified acoustic-articulatory model was trained and a
piecewise linear transform was adopted to describe the
dependency between these two feature streams. The
transform matrices were trained for each HMM state and
were tied based on each state's context. In this paper,
an improved acoustic-articulatory modelling method is
proposed. A Gaussian mixture model (GMM) is introduced
to model the articulatory space and the cross-stream
transform matrices are trained for each Gaussian
mixture instead of context-dependently. This means the
dependency relationship can vary with the change of
articulatory features flexibly. Our results show this
method improves the effectiveness of control over vowel
quality by modifying articulatory trajectories without
degrading naturalness.},
categories = {speech synthesis, articulatory features, hidden Markov
model, Gaussian mixture model},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110482.pdf},
year = 2011
}
@article{10.1109/MCG.2011.71,
author = {Michael A. Berger and Gregor Hofer and Hiroshi
Shimodaira},
title = {Carnival -- Combining Speech Technology and Computer
Animation},
journal = {IEEE Computer Graphics and Applications},
volume = {31},
pages = {80--89},
address = {Los Alamitos, CA, USA},
doi = {10.1109/MCG.2011.71},
issn = {0272-1716},
publisher = {IEEE Computer Society},
year = 2011
}
@inproceedings{kilgour2011,
author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
title = {The {Ambient Spotlight}: Personal meeting capture with
a microphone array},
booktitle = {Proc. HSCMA},
abstract = {We present the Ambient Spotlight system for personal
meeting capture based on a portable USB microphone
array and a laptop. The system combines distant speech
recognition and content linking with personal
productivity tools, and enables recognised meeting
recordings to be integrated with desktop search,
calendar, and email.},
doi = {10.1109/HSCMA.2011.5942389},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/ambientDemo.pdf},
year = 2011
}
@inproceedings{5947506,
author = {Hashimoto, K. and Yamagishi, J. and Byrne, W. and
King, S. and Tokuda, K.},
title = {An analysis of machine translation and speech
synthesis in speech-to-speech translation system},
booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011
IEEE International Conference on},
pages = {5108--5111},
abstract = {This paper provides an analysis of the impacts of
machine translation and speech synthesis on
speech-to-speech translation systems. The
speech-to-speech translation system consists of three
components: speech recognition, machine translation and
speech synthesis. Many techniques for integration of
speech recognition and machine translation have been
proposed. However, speech synthesis has not yet been
considered. Therefore, in this paper, we focus on
machine translation and speech synthesis, and report a
subjective evaluation to analyze the impact of each
component. The results of these analyses show that the
naturalness and intelligibility of synthesized speech
are strongly affected by the fluency of the translated
sentences.},
doi = {10.1109/ICASSP.2011.5947506},
issn = {1520-6149},
keywords = {machine translation;speech recognition;speech
synthesis;speech-to-speech translation system;speech
recognition;speech synthesis;},
month = may,
year = 2011
}
@article{renals2011,
author = {Renals, S},
title = {Automatic analysis of multiparty meetings},
journal = {SADHANA - Academy Proceedings in Engineering Sciences},
volume = {36},
number = {5},
pages = {917--932},
abstract = {This paper is about the recognition and interpretation
of multiparty meetings captured as audio, video and
other signals. This is a challenging task since the
meetings consist of spontaneous and conversational
interactions between a number of participants: it is a
multimodal, multiparty, multistream problem. We discuss
the capture and annotation of the AMI meeting corpus,
the development of a meeting speech recognition system,
and systems for the automatic segmentation,
summarisation and social processing of meetings,
together with some example applications based on these
systems.},
doi = {10.1007/s12046-011-0051-3},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2011/renals-sadhana10.pdf},
year = 2011
}
@techreport{wester_mandarin:11,
author = {Mirjam Wester and Hui Liang},
title = {The {EMIME} {M}andarin {B}ilingual {D}atabase},
institution = {The University of Edinburgh},
number = {EDI-INF-RR-1396},
abstract = {This paper describes the collection of a bilingual
database of Mandarin/English data. In addition, the
accents of the talkers in the database have been rated.
English and Mandarin listeners assessed the English and
Mandarin talkers' degree of foreign accent in English.},
categories = {evaluation,cross-lingual, accent rating},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_mandarin_2011.pdf},
year = 2011
}
@article{winterboer-csl:11,
author = {Andi K. Winterboer and Martin I. Tietze and Maria K.
Wolters and Johanna D. Moore},
title = {The user-model based summarize and refine approach
improves information presentation in spoken dialog
systems},
journal = {Computer Speech and Language},
volume = {25},
number = {2},
pages = {175--191},
abstract = {A common task for spoken dialog systems (SDS) is to
help users select a suitable option (e.g., flight,
hotel, and restaurant) from the set of options
available. As the number of options increases, the
system must have strategies for generating summaries
that enable the user to browse the option space
efficiently and successfully. In the user-model based
summarize and refine approach (UMSR, Demberg and Moore,
2006), options are clustered to maximize utility with
respect to a user model, and linguistic devices such as
discourse cues and adverbials are used to highlight the
trade-offs among the presented items. In a Wizard-of-Oz
experiment, we show that the UMSR approach leads to
improvements in task success, efficiency, and user
satisfaction compared to an approach that clusters the
available options to maximize coverage of the domain
(Polifroni et al., 2003). In both a laboratory
experiment and a web-based experimental paradigm
employing the Amazon Mechanical Turk platform, we show
that the discourse cues in UMSR summaries help users
compare different options and choose between options,
even though they do not improve verbatim recall. This
effect was observed for both written and spoken
stimuli.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/CSL10.pdf},
year = 2011
}
@article{mayo:clark:king:10,
author = {Mayo, C. and Clark, R. A. J. and King, S.},
title = {Listeners' Weighting of Acoustic Cues to Synthetic
Speech Naturalness: A Multidimensional Scaling Analysis},
journal = {Speech Communication},
volume = {53},
number = {3},
pages = {311--326},
abstract = {The quality of current commercial speech synthesis
systems is now so high that system improvements are
being made at subtle sub- and supra-segmental levels.
Human perceptual evaluation of such subtle improvements
requires a highly sophisticated level of perceptual
attention to specific acoustic characteristics or cues.
However, it is not well understood what acoustic cues
listeners attend to by default when asked to evaluate
synthetic speech. It may, therefore, be potentially
quite difficult to design an evaluation method that
allows listeners to concentrate on only one dimension
of the signal, while ignoring others that are
perceptually more important to them. The aim of the
current study was to determine which acoustic
characteristics of unit-selection synthetic speech are
most salient to listeners when evaluating the
naturalness of such speech. This study made use of
multidimensional scaling techniques to analyse
listeners' pairwise comparisons of synthetic speech
sentences. Results indicate that listeners place a
great deal of perceptual importance on the presence of
artifacts and discontinuities in the speech, somewhat
less importance on aspects of segmental quality, and
very little importance on stress/intonation
appropriateness. These relative differences in
importance will impact on listeners' ability to attend
to these different acoustic characteristics of
synthetic speech, and should therefore be taken into
account when designing appropriate methods of synthetic
speech evaluation.},
doi = {10.1016/j.specom.2010.10.003},
keywords = {Speech synthesis; Evaluation; Speech perception;
Acoustic cue weighting; Multidimensional scaling},
year = 2011
}
@inproceedings{lu_asru_2011,
author = {Lu, L. and Ghoshal, A. and Renals, S.},
title = {Regularized Subspace {G}aussian Mixture Models for
Cross-lingual Speech Recognition},
booktitle = {Proc. ASRU},
abstract = {We investigate cross-lingual acoustic modelling for
low resource languages using the subspace Gaussian
mixture model (SGMM). We assume the presence of
acoustic models trained on multiple source languages,
and use the global subspace parameters from those
models for improved modelling in a target language with
limited amounts of transcribed speech. Experiments on
the GlobalPhone corpus using Spanish, Portuguese, and
Swedish as source languages and German as target
language (with 1 hour and 5 hours of transcribed audio)
show that multilingually trained SGMM shared parameters
result in lower word error rates (WERs) than using
those from a single source language. We also show that
regularizing the estimation of the SGMM state vectors
by penalizing their $\ell_1$-norm helps to overcome
numerical instabilities and leads to lower WERs.},
categories = {Subspace Gaussian Mixture Model, Cross-lingual, model
regularization},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/lu-asru-2011.pdf},
year = 2011
}
@inproceedings{wang_icassp2011a,
author = {Dong Wang and Nicholas Evans and Raphael Troncy and
Simon King},
title = {Handling overlaps in spoken term detection},
booktitle = {Proc. International Conference on Acoustics, Speech
and Signal Processing},
pages = {5656--5659},
abstract = {Spoken term detection (STD) systems usually arrive at
many overlapping detections which are often addressed
with some pragmatic approaches, e.g. choosing the best
detection to represent all the overlaps. In this paper
we present a theoretical study based on a concept of
acceptance space. In particular, we present two
confidence estimation approaches based on Bayesian and
evidence perspectives respectively. Analysis shows that
both approaches possess respective advantages and
shortcomings, and that their combination has the
potential to provide an improved confidence estimation.
Experiments conducted on meeting data confirm our
analysis and show considerable performance improvement
with the combined approach, in particular for
out-of-vocabulary spoken term detection with stochastic
pronunciation modeling.},
categories = {spoken term detection, speech recognition},
doi = {10.1109/ICASSP.2011.5947643},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wang_icassp2011a.pdf},
year = 2011
}