2010.bib
@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2010-citations -ob /home/korin/projects/publications/new_output/transitdata/2010.bib -c 'year : "2010"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@misc{turk2010,
author = {Turk, Alice and Scobbie, James and Geng, Christian and
Campbell, Barry and Dickie, Catherine and Dubourg,
Eddie and Bard, Ellen Gurman and Hardcastle, William
and Hartinger, Mariam and King, Simon and Lickley,
Robin and Macmartin, Cedric and Nakai, Satsuki and
Renals, Steve and Richmond, Korin and Schaeffler, Sonja
and White, Kevin and Wiegand, Ronny and Wrench, Alan},
title = {An {E}dinburgh speech production facility},
howpublished = {Poster presented at the 12th Conference on Laboratory
Phonology, Albuquerque, New Mexico.},
month = {July},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ESPF.pdf},
year = 2010
}
@inproceedings{anderssonetal2010,
author = {Sebastian Andersson and Kallirroi Georgila and David
Traum and Matthew Aylett and Robert Clark},
title = {Prediction and Realisation of Conversational
Characteristics by Utilising Spontaneous Speech for
Unit Selection},
booktitle = {Speech Prosody 2010},
abstract = {Unit selection speech synthesis has reached high
levels of naturalness and intelligibility for neutral
read aloud speech. However, synthetic speech generated
using neutral read aloud data lacks all the attitude,
intention and spontaneity associated with everyday
conversations. Unit selection is heavily data dependent
and thus in order to simulate human conversational
speech, or create synthetic voices for believable
virtual characters, we need to utilise speech data with
examples of how people talk rather than how people
read. In this paper we included carefully selected
utterances from spontaneous conversational speech in a
unit selection voice. Using this voice and by
automatically predicting type and placement of lexical
fillers and filled pauses we can synthesise utterances
with conversational characteristics. A perceptual
listening test showed that it is possible to make
synthetic speech sound more conversational without
degrading naturalness.},
categories = {speech synthesis, unit selection, conversation,
spontaneous speech, lexical fillers, filled pauses},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/100116.pdf},
year = 2010
}
@inproceedings{zwyssig2010,
author = {Zwyssig, Erich and Lincoln, Mike and Renals, Steve},
title = {A Digital Microphone Array for Distant Speech
Recognition},
booktitle = {Proc. IEEE ICASSP--10},
pages = {5106--5109},
abstract = {In this paper, the design, implementation and testing
of a digital microphone array is presented. The array
uses digital MEMS microphones which integrate the
microphone, amplifier and analogue to digital converter
on a single chip in place of the analogue microphones
and external audio interfaces currently used. The
device has the potential to be smaller, cheaper and
more flexible than typical analogue arrays; however, the
effect on speech recognition performance of using
digital microphones is as yet unknown. In order to
evaluate the effect, an analogue array and the new
digital array are used to simultaneously record test
data for a speech recognition experiment. Initial
results employing no adaptation show that performance
using the digital array is significantly worse (14\%
absolute WER) than the analogue device. Subsequent
experiments using MLLR and CMLLR channel adaptation
reduce this gap, and employing MLLR for both channel
and speaker adaptation reduces the difference between
the arrays to 4.5\% absolute WER.},
doi = {10.1109/ICASSP.2010.5495040},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/zwyssig-icassp10.pdf},
year = 2010
}
@inproceedings{wolters-aaate:10,
author = {Maria Wolters and Marilyn McGee-Lennon},
title = {Designing Usable and Acceptable Reminders for the Home},
booktitle = {Proc. AAATE Workshop AT Technology Transfer,
Sheffield, UK},
abstract = {Electronic reminders can play a key role in enabling
people to manage their care and remain independent in
their own homes for longer. The MultiMemoHome project
aims to develop reminder designs that are accessible
and usable for users with a range of abilities and
preferences. In an initial exploration of key design
parameters, we surveyed 378 adults from all age groups
online (N=206) and by post (N=172). The wide spread of
preferences that we found illustrates the importance of
adapting reminder solutions to individuals. We present
two reusable personas that emerged from the research
and discuss how questionnaires can be used for
technology transfer.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/Wolters_McGee-Lennon_AAATE_Final.pdf},
year = 2010
}
@inproceedings{renals2010b,
author = {Renals, Steve},
title = {Recognition and Understanding of Meetings},
booktitle = {Proc. NAACL/HLT},
pages = {1--9},
abstract = {This paper is about interpreting human communication
in meetings using audio, video and other signals.
Automatic meeting recognition and understanding is
extremely challenging, since communication in a meeting
is spontaneous and conversational, and involves
multiple speakers and multiple modalities. This leads
to a number of significant research problems in signal
processing, in speech recognition, and in discourse
interpretation, taking account of both individual and
group behaviours. Addressing these problems requires an
interdisciplinary effort. In this paper, I discuss the
capture and annotation of multimodal meeting recordings
- resulting in the AMI meeting corpus - and how we have
built on this to develop techniques and applications
for the recognition and interpretation of meetings.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/renals-naacl10.pdf},
year = 2010
}
@inproceedings{kilgour2010,
author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
title = {The {Ambient Spotlight}: Queryless desktop search from
meeting speech},
booktitle = {Proc. ACM Multimedia 2010 Workshop SSCS 2010},
abstract = {It has recently become possible to record any small
meeting using a laptop equipped with a plug-and-play
USB microphone array. We show the potential for such
recordings in a personal aid that allows project
managers to record their meetings and, when reviewing
them afterwards through a standard calendar interface,
to find relevant documents on their computer. This
interface is intended to supplement or replace the
textual searches that managers typically perform. The
prototype, which relies on meeting speech recognition
and topic segmentation, formulates and runs desktop
search queries in order to present its results.},
doi = {10.1145/1878101.1878112},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/AmbientSpot.pdf},
year = 2010
}
@inproceedings{wang_acmsccs2010,
author = {Dong Wang and Simon King and Nick Evans and Raphael
Troncy},
title = {Direct Posterior Confidence For Out-of-Vocabulary
Spoken Term Detection},
booktitle = {Proc. ACM Multimedia 2010 Searching Spontaneous
Conversational Speech Workshop},
abstract = {Spoken term detection (STD) is a fundamental task in
spoken information retrieval. Compared to conventional
speech transcription and keyword spotting, STD is an
open-vocabulary task and is necessarily required to
address out-of-vocabulary (OOV) terms. Approaches based
on subword units, e.g. phonemes, are widely used to
solve the OOV issue; however, performance on OOV terms
is still significantly inferior to that for
in-vocabulary (INV) terms. The performance degradation
on OOV terms can be attributed to a multitude of
factors. A particular factor we address in this paper
is that the acoustic and language models used for
speech transcribing are highly vulnerable to OOV terms,
which leads to unreliable confidence measures and
error-prone detections. A direct posterior confidence
measure that is derived from discriminative models has
been proposed for STD. In this paper, we utilize this
technique to tackle the weakness of OOV terms in
confidence estimation. Since neither acoustic models nor
language models are included in the computation, the
new confidence measure avoids the weak modeling problem with
OOV terms. Our experiments, set up on multi-party
meeting speech which is highly spontaneous and
conversational, demonstrate that the proposed technique
improves STD performance on OOV terms significantly;
when combined with conventional lattice-based
confidence, a significant improvement in performance is
obtained on both INVs and OOVs. Furthermore, the new
confidence measure technique can be combined together
with other advanced techniques for OOV treatment, such
as stochastic pronunciation modeling and term-dependent
confidence discrimination, which leads to an integrated
solution for OOV STD with greatly improved performance.},
categories = {confidence estimation, spoken term detection, speech
recognition},
doi = {10.1145/1878101.1878107},
month = oct,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang_acmsccs2010.pdf},
year = 2010
}
@misc{Carnival_SIGGRAPH_2010,
author = {Michael Berger and Gregor Hofer and Hiroshi Shimodaira},
title = {Carnival: a modular framework for automated facial
animation},
howpublished = {Poster at SIGGRAPH 2010},
note = {Bronze award winner, ACM Student Research Competition},
abstract = {We present a software framework for speech- or
text-driven animation--including a platform-independent
API and an application implementing it--which unifies
state-of-the-art speech technology and graphics
technology within a single system.},
address = {Los Angeles, Calif., USA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/carnival.pdf},
year = 2010
}
@incollection{king:gold_and_morgan_chapter2009,
author = {Simon King},
title = {Speech Synthesis},
booktitle = {Speech and Audio Signal Processing},
publisher = {Wiley},
editor = {Morgan and Ellis},
abstract = {No abstract (this is a book chapter)},
categories = {speech synthesis},
year = 2010
}
@inproceedings{ling_interspeech2010,
author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
Junichi},
title = {{HMM}-based Text-to-Articulatory-Movement Prediction
and Analysis of Critical Articulators},
booktitle = {Proc. Interspeech},
pages = {2194--2197},
address = {Makuhari, Japan},
abstract = {In this paper we present a method to predict the
movement of a speaker's mouth from text input using
hidden Markov models (HMM). We have used a corpus of
human articulatory movements, recorded by
electromagnetic articulography (EMA), to train HMMs. To
predict articulatory movements from text, a suitable
model sequence is selected and the maximum-likelihood
parameter generation (MLPG) algorithm is used to
generate output articulatory trajectories. In our
experiments, we find that fully context-dependent
models outperform monophone and quinphone models,
achieving an average root mean square (RMS) error of
1.945 mm when state durations are predicted from text,
and 0.872 mm when natural state durations are used.
Finally, we go on to analyze the prediction error for
different EMA dimensions and phone types. We find a
clear pattern emerges that the movements of so-called
critical articulators can be predicted more accurately
than the average performance.},
keywords = {Hidden Markov model, articulatory features, parameter
generation, critical articulators},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100349.pdf},
year = 2010
}
@article{roberto:specom2010,
author = {R. Barra-Chicote and J. Yamagishi and S. King and J.
M. Montero and J. Macias-Guarasa},
title = {Analysis of Statistical Parametric and Unit-Selection
Speech Synthesis Systems Applied to Emotional Speech},
journal = {Speech Communication},
volume = {52},
number = {5},
pages = {394--404},
abstract = {We have applied two state-of-the-art speech synthesis
techniques (unit selection and HMM-based synthesis) to
the synthesis of emotional speech. A series of
carefully designed perceptual tests to evaluate speech
quality, emotion identification rates and emotional
strength was used for the six emotions which we
recorded -- happiness, sadness, anger, surprise, fear,
disgust. For the HMM-based method, we evaluated
spectral and source components separately and
identified which components contribute to which
emotion. Our analysis shows that, although the HMM
method produces significantly better neutral speech,
the two methods produce emotional speech of similar
quality, except for emotions having context-dependent
prosodic patterns. Whilst synthetic speech produced
using the unit selection method has better emotional
strength scores than the HMM-based method, the
HMM-based method has the ability to manipulate the
emotional strength. For emotions that are characterized
by both spectral and prosodic components, synthetic
speech using unit selection methods was more accurately
identified by listeners. For emotions mainly
characterized by prosodic components, HMM-based
synthetic speech was more accurately identified. This
finding differs from previous results regarding
listener judgements of speaker similarity for neutral
speech. We conclude that unit selection methods require
improvements to prosodic modeling and that HMM-based
methods require improvements to spectral modeling for
emotional speech. Certain emotions cannot be reproduced
well by either method.},
doi = {10.1016/j.specom.2009.12.007},
keywords = {Emotional speech synthesis; HMM-based synthesis; Unit
selection},
month = may,
year = 2010
}
@inproceedings{janska_clark:2010a,
author = {Anna C. Janska and Robert A. J. Clark},
title = {Native and Non-Native Speaker Judgements on the
Quality of Synthesized Speech},
booktitle = {Proc. Interspeech},
pages = {1121--1124},
abstract = {The difference between native speakers' and non-native
speakers' naturalness judgements of synthetic speech
is investigated. Similarity/difference judgements are
analysed via a multidimensional scaling analysis and
compared to mean opinion scores. It is shown that
although the two groups generally behave in a similar
manner, the variance of non-native speaker judgements is
generally higher. While both groups of subjects can
clearly distinguish natural speech from the best
synthetic examples, the groups' responses to different
artefacts present in the synthetic speech can vary.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/janskaclark_interspeech2010.pdf},
year = 2010
}
@techreport{wester_accent2010,
author = {Wester, M.},
title = {The {EMIME} {B}ilingual {D}atabase},
institution = {The University of Edinburgh},
number = {EDI-INF-RR-1388},
abstract = {This paper describes the collection of a bilingual
database of Finnish/English and German/English data. In
addition, the accents of the talkers in the database
have been rated. English, German and Finnish listeners
assessed the English, German and Finnish talkers'
degree of foreign accent in English. Native English
listeners showed higher inter-listener agreement than
non-native listeners. Further analyses showed that
non-native listeners judged Finnish and German female
talkers to be significantly less accented than do
English listeners. German males are judged less
accented by Finnish listeners than they are by English
and German listeners and there is no difference between
listeners as to how they judge the accent of Finnish
males. Finally, all English talkers are judged more
accented by non-native listeners than they are by
native English listeners.},
categories = {evaluation,cross-lingual, accent rating},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_accent_2010.pdf},
year = 2010
}
@article{5510125,
author = {Wang, D. and King, S. and Frankel, J.},
title = {Stochastic Pronunciation Modelling for
Out-of-Vocabulary Spoken Term Detection},
journal = {IEEE Transactions on Audio, Speech, and Language
Processing},
volume = {PP},
number = {99},
abstract = {Spoken term detection (STD) is the name given to the
task of searching large amounts of audio for
occurrences of spoken terms, which are typically single
words or short phrases. One reason that STD is a hard
task is that search terms tend to contain a
disproportionate number of out-of-vocabulary (OOV)
words. The most common approach to STD uses subword
units. This, in conjunction with some method for
predicting pronunciations of OOVs from their written
form, enables the detection of OOV terms but
performance is considerably worse than for
in-vocabulary terms. This performance differential can
be largely attributed to the special properties of
OOVs. One such property is the high degree of
uncertainty in the pronunciation of OOVs. We present a
stochastic pronunciation model (SPM) which explicitly
deals with this uncertainty. The key insight is to
search for all possible pronunciations when detecting
an OOV term, explicitly capturing the uncertainty in
pronunciation. This requires a probabilistic model of
pronunciation, able to estimate a distribution over all
possible pronunciations. We use a joint-multigram model
(JMM) for this and compare the JMM-based SPM with the
conventional soft match approach. Experiments using
speech from the meetings domain demonstrate that the
SPM performs better than soft match in most operating
regions, especially at low false alarm probabilities.
Furthermore, SPM and soft match are found to be
complementary: their combination provides further
performance gains.},
categories = {confidence estimation, spoken term detection, speech
recognition, OOVs},
doi = {10.1109/TASL.2010.2058800},
issn = {1558-7916},
month = jul,
year = 2010
}
@inproceedings{phillip:icassp2010,
author = {P. L. De Leon and V. R. Apsingekar and M. Pucher and
J. Yamagishi},
title = {Revisiting the security of speaker verification
systems against imposture using synthetic speech},
booktitle = {{Proc. ICASSP 2010}},
address = {Dallas, Texas, USA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/main_r2.pdf},
year = 2010
}
@inproceedings{felps_interspeech2010,
author = {Felps, Daniel and Geng, Christian and Berger, Michael
and Richmond, Korin and Gutierrez-Osuna, Ricardo},
title = {Relying on critical articulators to estimate vocal
tract spectra in an articulatory-acoustic database},
booktitle = {Proc. Interspeech},
pages = {1990--1993},
abstract = {We present a new phone-dependent feature weighting
scheme that can be used to map articulatory
configurations (e.g. EMA) onto vocal tract spectra
(e.g. MFCC) through table lookup. The approach consists
of assigning feature weights according to a feature's
ability to predict the acoustic distance between
frames. Since an articulator's predictive accuracy is
phone-dependent (e.g., lip location is a better
predictor for bilabial sounds than for palatal sounds),
a unique weight vector is found for each phone.
Inspection of the weights reveals a correspondence with
the expected critical articulators for many phones. The
proposed method reduces overall cepstral error by 6\%
when compared to a uniform weighting scheme. Vowels
show the greatest benefit, though improvements occur
for 80\% of the tested phones.},
keywords = {speech production, speech synthesis},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100076.pdf},
year = 2010
}
@article{wolters-uais:10,
author = {Maria Wolters and Klaus-Peter Engelbrecht and Florian
G\"odde and Sebastian M\"oller and Anja Naumann and
Robert Schleicher},
title = {Making it Easier for Older People to Talk to Smart
Homes: Using Help Prompts to Shape Users' Speech},
journal = {Universal Access in the Information Society},
volume = {9},
number = {4},
pages = {311-325},
abstract = {It is well known that help prompts shape how users
talk to spoken dialogue systems. This study
investigated the effect of help prompt placement on
older users' interaction with a smart home interface.
In the dynamic help condition, help was only given in
response to system errors; in the inherent help
condition, it was also given at the start of each task.
Fifteen older and sixteen younger users interacted with
a smart home system using two different scenarios. Each
scenario consisted of several tasks. The linguistic
style users employed to communicate with the system
(interaction style) was measured using the ratio of
commands to the overall utterance length (keyword
ratio) and the percentage of content words in the
user's utterance that could be understood by the system
(shared vocabulary). While the timing of help prompts
did not affect the interaction style of younger users,
early task-specific help supported older users
in adapting their interaction style to the system's
capabilities. Well-placed help prompts can
significantly increase the usability of spoken dialogue
systems for older people.},
categories = {spoken dialogue systems, usability, older adults,
smart homes, help prompts},
doi = {10.1007/s10209-009-0184-x},
year = 2010
}
@article{white_clark_moore:2010,
author = {Michael White and Robert A. J. Clark and Johanna D.
Moore},
title = {Generating Tailored, Comparative Descriptions with
Contextually Appropriate Intonation},
journal = {Computational Linguistics},
volume = {36},
number = {2},
pages = {159-201},
abstract = {Generating responses that take user preferences into
account requires adaptation at all levels of the
generation process. This article describes a
multi-level approach to presenting user-tailored
information in spoken dialogues which brings together
for the first time multi-attribute decision models,
strategic content planning, surface realization that
incorporates prosody prediction, and unit selection
synthesis that takes the resulting prosodic structure
into account. The system selects the most important
options to mention and the attributes that are most
relevant to choosing between them, based on the user
model. Multiple options are selected when each offers a
compelling trade-off. To convey these trade-offs, the
system employs a novel presentation strategy which
straightforwardly lends itself to the determination of
information structure, as well as the contents of
referring expressions. During surface realization, the
prosodic structure is derived from the information
structure using Combinatory Categorial Grammar in a way
that allows phrase boundaries to be determined in a
flexible, data-driven fashion. This approach to
choosing pitch accents and edge tones is shown to yield
prosodic structures with significantly higher
acceptability than baseline prosody prediction models
in an expert evaluation. These prosodic structures are
then shown to enable perceptibly more natural synthesis
using a unit selection voice that aims to produce the
target tunes, in comparison to two baseline synthetic
voices. An expert evaluation and f0 analysis confirm
the superiority of the generator-driven intonation and
its contribution to listeners' ratings.},
doi = {10.1162/coli.09-023-R1-08-002},
year = 2010
}
@inproceedings{friedrich:COST2102,
author = {Michael Pucher and Friedrich Neubarth and Volker Strom},
title = {Optimizing Phonetic Encoding for {V}iennese Unit
Selection Speech Synthesis},
booktitle = {COST 2102 Int. Training School 2009, LNCS},
editor = {A. Esposito et al.},
address = {Heidelberg},
publisher = {Springer-Verlag},
abstract = {While developing lexical resources for a particular
language variety (Viennese), we experimented with a set
of 5 different phonetic encodings, termed phone sets,
used for unit selection speech synthesis. We started
with a very rich phone set based on phonological
considerations and covering as much phonetic
variability as possible, which was then reduced to
smaller sets by applying transformation rules that map
or merge phone symbols. The optimal trade-off was found
by measuring the phone error rates of automatically learnt
grapheme-to-phone rules and by a perceptual evaluation
of 27 representative synthesized sentences. Further, we
describe a method to semi-automatically enlarge the
lexical resources for the target language variety using
a lexicon base for Standard Austrian German.},
categories = {speech synthesis, language varieties, phonetic
encoding, grapheme-to-phone, pronunciation lexicon.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/COST2102.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/COST2102.ps},
year = 2010
}
@article{huang2010,
author = {Huang, Songfang and Renals, Steve},
title = {Hierarchical {Bayesian} Language Models for
Conversational Speech Recognition},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {18},
number = {8},
pages = {1941--1954},
abstract = {Traditional n-gram language models are widely used in
state-of-the-art large vocabulary speech recognition
systems. This simple model suffers from some
limitations, such as overfitting of maximum-likelihood
estimation and the lack of rich contextual knowledge
sources. In this paper, we exploit a hierarchical
Bayesian interpretation for language modeling, based on
a nonparametric prior called the Pitman--Yor process.
This offers a principled approach to language model
smoothing, embedding the power-law distribution for
natural language. Experiments on the recognition of
conversational speech in multiparty meetings
demonstrate that by using hierarchical Bayesian
language models, we are able to achieve significant
reductions in perplexity and word error rate.},
doi = {10.1109/TASL.2010.2040782},
keywords = {AMI corpus , conversational speech recognition ,
hierarchical Bayesian model , language model (LM) ,
meetings , smoothing},
month = {January},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-taslp10.pdf},
url = {http://dx.doi.org/10.1109/TASL.2010.2040782},
year = 2010
}
@inproceedings{richmond_interspeech2010,
author = {Richmond, Korin and Clark, Robert and Fitt, Sue},
title = {On Generating {C}ombilex Pronunciations via
Morphological Analysis},
booktitle = {Proc. Interspeech},
pages = {1974--1977},
address = {Makuhari, Japan},
abstract = {Combilex is a high-quality lexicon that has been
developed specifically for speech technology purposes
and recently released by CSTR. Combilex benefits from
many advanced features. This paper explores one of
these: the ability to generate fully-specified
transcriptions for morphologically derived words
automatically. This functionality was originally
implemented to encode the pronunciations of derived
words in terms of their constituent morphemes, thus
accelerating lexicon development and ensuring a high
level of consistency. In this paper, we propose that this
method of modelling pronunciations can be exploited
further by combining it with a morphological parser,
thus yielding a method to generate full transcriptions
for unknown derived words. Not only could this
accelerate adding new derived words to Combilex, but it
could also serve as an alternative to conventional
letter-to-sound rules. This paper presents preliminary
work indicating this is a promising direction.},
keywords = {combilex lexicon, letter-to-sound rules,
grapheme-to-phoneme conversion, morphological
decomposition},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100683.pdf},
year = 2010
}
@inproceedings{yong:ssw7,
author = {Yong Guan and Jilei Tian and Yi-Jian Wu and Junichi
Yamagishi and Jani Nurminen},
title = {A Unified and Automatic Approach Of {M}andarin {HTS}
System},
booktitle = {{Proc. SSW7}},
address = {Kyoto, Japan},
keywords = {HTS, speech synthesis, mandarin},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/HTS_Yong_ssw7.pdf},
year = 2010
}
@inproceedings{wester:interspeech:10,
author = {Mirjam Wester},
title = {Cross-lingual talker discrimination},
booktitle = {Proc. of Interspeech},
address = {Makuhari, Japan},
abstract = {This paper describes a talker discrimination
experiment in which native English listeners were
presented with two sentences spoken by bilingual
talkers (English/German and English/Finnish) and were
asked to judge whether they thought the sentences were
spoken by the same person or not. Equal amounts of
cross-lingual and matched-language trials were
presented. The experiments showed that listeners are
able to complete this task well: they can discriminate
between talkers significantly better than chance.
However, listeners are significantly less accurate on
cross-lingual talker trials than on matched-language
pairs. No significant differences were found on this
task between German and Finnish. Bias (B'') and
Sensitivity (A') values are presented to analyse the
listeners' behaviour in more detail. The results are
promising for the evaluation of EMIME, a project
covering speech-to-speech translation with speaker
adaptation.},
categories = {evaluation},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_interspeech2010.pdf},
year = 2010
}
@inproceedings{friedrich:lrec2010,
author = {Michael Pucher and Friedrich Neubarth and Volker Strom
and Sylvia Moosmüller and Gregor Hofer and Christian
Kranzler and Gudrun Schuchmann and Dietmar Schabus},
title = {Resources for speech synthesis of Viennese varieties},
booktitle = {Proc.~Int.~Conf.~on Language Resources and Evaluation,
LREC'10},
address = {Malta},
publisher = {European Language Resources Association (ELRA)},
abstract = {This paper describes our work on developing corpora of
three varieties of Viennese for unit selection speech
synthesis. The synthetic voices for Viennese varieties,
implemented with the open domain unit selection speech
synthesis engine Multisyn of Festival will also be
released within Festival. The paper especially focuses
on two questions: how we selected the appropriate
speakers and how we obtained the text sources needed
for the recording of these non-standard varieties.
Regarding the first one, it turned out that working
with a ‘prototypical’ professional speaker was much
preferable to striving for authenticity. In
addition, we give a brief outline about the differences
between the Austrian standard and its dialectal
varieties and how we solved certain technical problems
that are related to these differences. In particular,
the specific set of phones applicable to each variety
had to be determined by applying various constraints.
Since such a set does not serve any descriptive
purposes but rather is influencing the quality of
speech synthesis, a careful design of such a (in most
cases reduced) set was an important task.},
categories = {speech synthesis, language varieties, phonetic
encoding, grapheme-to-phone, pronunciation lexicon.},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/lrec2010_viennese.pdf},
ps = {http://www.cstr.ed.ac.uk/downloads/publications/2010/lrec2010_viennese.ps},
year = 2010
}
@inproceedings{janska_clark:2010b,
author = {Anna C. Janska and Robert A. J. Clark},
title = {Further exploration of the possibilities and pitfalls
of multidimensional scaling as a tool for the
evaluation of the quality of synthesized speech},
booktitle = {The 7th ISCA Tutorial and Research Workshop on Speech
Synthesis},
pages = {142--147},
abstract = {Multidimensional scaling (MDS) has been suggested as a
useful tool for the evaluation of the quality of
synthesized speech. However, it has not yet been
extensively tested for its application in this
specific area of evaluation. In a series of experiments
based on data from the Blizzard Challenge 2008,
the relation between Weighted Euclidean Distance
Scaling and Simple Euclidean Distance Scaling is
investigated to understand how aggregating data affects
the MDS configuration. These results are compared to
those collected as mean opinion scores (MOS). The ranks
correspond, and MOS can be predicted from an object's
position in the MDS-generated stimulus space. The big
advantage of MDS over MOS is its diagnostic value;
dimensions along which stimuli vary are not correlated,
as is the case in modular evaluation using MOS.
Finally, we will attempt to generalize from the
MDS representations of the thoroughly tested subset to
the aggregated data of the larger-scale Blizzard
Challenge.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/janskaclark_ssw7.pdf},
year = 2010
}
@inproceedings{cabral_ssw7,
author = {Cabral, Jo{\~a}o and Renals, Steve and Richmond, Korin
and Yamagishi, Junichi},
title = {Transforming Voice Source Parameters in a {HMM}-based
Speech Synthesiser with Glottal Post-Filtering},
booktitle = {Proc. 7th ISCA Speech Synthesis Workshop (SSW7)},
pages = {365--370},
address = {NICT/ATR, Kyoto, Japan},
abstract = {Control over voice quality, e.g. breathy and tense
voice, is important for speech synthesis applications.
For example, transformations can be used to modify
aspects of the voice related to the speaker's identity
and to improve expressiveness. However, it is hard to
modify voice characteristics of the synthetic speech
without degrading speech quality. State-of-the-art
statistical speech synthesisers, in particular, do not
typically allow control over parameters of the
glottal source, which are strongly correlated with
voice quality. Consequently, the control of voice
characteristics in these systems is limited. In
contrast, the HMM-based speech synthesiser proposed in
this paper uses an acoustic glottal source model. The
system passes the glottal signal through a whitening
filter to obtain the excitation of voiced sounds. This
technique, called glottal post-filtering, makes it
possible to transform voice characteristics of the
synthetic speech by modifying the source model
parameters. We evaluated the proposed synthesiser in a
perceptual experiment, in terms of speech naturalness,
intelligibility, and similarity to the original
speaker's voice. The results show that it performed as
well as an HMM-based
synthesiser, which generates the speech signal with a
commonly used high-quality speech vocoder.},
keywords = {HMM-based speech synthesis, voice quality, glottal
post-filter},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/cabral_ssw7.pdf},
year = 2010
}
@inproceedings{vipperla2010a,
author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
Joe},
title = {Augmentation of adaptation data},
booktitle = {Proc. Interspeech},
pages = {530--533},
address = {Makuhari, Japan},
abstract = {Linear regression based speaker adaptation approaches
can improve Automatic Speech Recognition (ASR) accuracy
significantly for a target speaker. However, when the
available adaptation data is limited to a few seconds,
the accuracy of the speaker adapted models is often
worse compared with speaker independent models. In this
paper, we propose an approach to select a set of
reference speakers acoustically close to the target
speaker whose data can be used to augment the
adaptation data. To determine the acoustic similarity
of two speakers, we propose a distance metric based on
transforming sample points in the acoustic space with
the regression matrices of the two speakers. We show
the validity of this approach through a speaker
identification task. ASR results on SCOTUS and AMI
corpora with limited adaptation data of 10 to 15
seconds augmented by data from selected reference
speakers show a significant improvement in Word Error
Rate over speaker independent and speaker adapted
models.},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-is2010.pdf},
year = 2010
}
@article{Ling2010834,
author = {Zhen-Hua Ling and Korin Richmond and Junichi Yamagishi},
title = {An Analysis of {HMM}-based prediction of articulatory
movements},
journal = {Speech Communication},
volume = {52},
number = {10},
pages = {834--846},
abstract = { This paper presents an investigation into predicting
the movement of a speaker's mouth from text input using
hidden Markov models (HMM). A corpus of human
articulatory movements, recorded by electromagnetic
articulography (EMA), is used to train HMMs. To predict
articulatory movements for input text, a suitable model
sequence is selected and a maximum-likelihood parameter
generation (MLPG) algorithm is used to generate output
articulatory trajectories. Unified
acoustic-articulatory HMMs are introduced to integrate
acoustic features when an acoustic signal is also
provided with the input text. Several aspects of this
method are analyzed in this paper, including the
effectiveness of context-dependent modeling, the role
of supplementary acoustic input, and the
appropriateness of certain model structures for the
unified acoustic-articulatory models. When text is the
sole input, we find that fully context-dependent models
significantly outperform monophone and quinphone
models, achieving an average root mean square (RMS)
error of 1.945 mm and an average correlation
coefficient of 0.600. When both text and acoustic
features are given as input to the system, the
difference between the performance of quinphone models
and fully context-dependent models is no longer
significant. The best performance overall is achieved
using unified acoustic-articulatory quinphone HMMs with
separate clustering of acoustic and articulatory model
parameters, a synchronous-state sequence, and a
dependent-feature model structure, with an RMS error of
0.900 mm and a correlation coefficient of 0.855 on
average. Finally, we also apply the same quinphone HMMs
to the acoustic-articulatory, or inversion, mapping
problem, where only acoustic input is available. An
average root mean square (RMS) error of 1.076 mm and an
average correlation coefficient of 0.812 are achieved.
Taken together, our results demonstrate how text and
acoustic inputs both contribute to the prediction of
articulatory movements in the method used.},
doi = {10.1016/j.specom.2010.06.006},
issn = {0167-6393},
keywords = {Hidden Markov model; Articulatory features; Parameter
generation},
month = {October},
year = 2010
}
@inproceedings{huang2010a,
author = {Huang, Songfang and Renals, Steve},
title = {Power Law Discounting for N-Gram Language Models},
booktitle = {Proc. IEEE ICASSP--10},
pages = {5178--5181},
abstract = {We present an approximation to the Bayesian
hierarchical Pitman-Yor process language model which
maintains the power law distribution over word tokens,
while not requiring a computationally expensive
approximate inference process. This approximation,
which we term power law discounting, has a similar
computational complexity to interpolated and modified
Kneser-Ney smoothing. We performed experiments on
meeting transcription using the NIST RT06s evaluation
data and the AMI corpus, with a vocabulary of 50,000
words and a language model training set of up to 211
million words. Our results indicate that power law
discounting results in statistically significant
reductions in perplexity and word error rate compared
to both interpolated and modified Kneser-Ney smoothing,
while producing similar results to the hierarchical
Pitman-Yor process language model.},
doi = {10.1109/ICASSP.2010.5495007},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/huang-icassp10.pdf},
url = {http://dx.doi.org/10.1109/ICASSP.2010.5495007},
year = 2010
}
@inproceedings{wolters2010,
author = {Wolters, Maria K. and Isaac, Karl B. and Renals, Steve},
title = {Evaluating speech synthesis intelligibility using
{Amazon Mechanical Turk}},
booktitle = {Proc. 7th Speech Synthesis Workshop (SSW7)},
pages = {136--141},
abstract = {Microtask platforms such as Amazon Mechanical Turk
(AMT) are increasingly used to create speech and
language resources. AMT in particular allows
researchers to quickly recruit a large number of fairly
demographically diverse participants. In this study, we
investigated whether AMT can be used for comparing the
intelligibility of speech synthesis systems. We
conducted two experiments in the lab and via AMT, one
comparing US English diphone to US English
speaker-adaptive HTS synthesis and one comparing UK
English unit selection to UK English speaker-dependent
HTS synthesis. While AMT word error rates were worse
than lab error rates, AMT results were more sensitive
to relative differences between systems. This is mainly
due to the larger number of listeners. Boxplots and
multilevel modelling allowed us to identify listeners
who performed particularly badly, while thresholding
was sufficient to eliminate rogue workers. We conclude
that AMT is a viable platform for synthetic speech
intelligibility comparisons.},
categories = {intelligibility, evaluation, semantically
unpredictable sentences, diphone, unit selection,
crowdsourcing, Mechanical Turk, HMM-based synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wolters-ssw2010.pdf},
year = 2010
}
@inproceedings{phillip:odyssey2010,
author = {P.L. De Leon and M. Pucher and J. Yamagishi},
title = {Evaluation of the Vulnerability of Speaker
Verification to Synthetic Speech},
booktitle = {{Proc. Odyssey (The speaker and language recognition
workshop) 2010}},
address = {Brno, Czech Republic},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/main_v2.pdf},
year = 2010
}
@incollection{renals2010,
author = {Renals, Steve and King, Simon},
title = {Automatic Speech Recognition},
booktitle = {Handbook of Phonetic Sciences},
publisher = {Wiley Blackwell},
editor = {Hardcastle, William J. and Laver, John and Gibbon,
Fiona E.},
chapter = {22},
year = 2010
}
@article{vipperla2010,
author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
Joe},
title = {Ageing voices: The effect of changes in voice
parameters on {ASR} performance},
journal = {EURASIP Journal on Audio, Speech, and Music Processing},
abstract = {With ageing, human voices undergo several changes
which are typically characterized by increased
hoarseness and changes in articulation patterns. In
this study, we have examined the effect of these changes
on Automatic Speech Recognition (ASR) and found that
Word Error Rates (WER) on older voices are about 9\%
absolute higher than those for adult voices. Subsequently,
we compared several voice source parameters including
fundamental frequency, jitter, shimmer, harmonicity and
cepstral peak prominence of adult and older males.
Several of these parameters show statistically
significant difference for the two groups. However,
artificially increasing jitter and shimmer measures do
not affect the ASR accuracies significantly.
Artificially lowering the fundamental frequency
degrades the ASR performance marginally but this drop
in performance can be overcome to some extent using
Vocal Tract Length Normalisation (VTLN). Overall, we
observe that the changes in the voice source parameters
do not have a significant impact on ASR performance.
Comparison of the likelihood scores of all the phonemes
for the two age groups shows that there is a systematic
mismatch in the acoustic space of the two age groups.
Comparison of the phoneme recognition rates shows that
mid vowels, nasals and phonemes that depend on the
ability to create constrictions with tongue tip for
articulation are more affected by ageing than other
phonemes.},
doi = {10.1155/2010/525783},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-eurasip10.pdf},
url = {http://dx.doi.org/10.1155/2010/525783},
year = 2010
}
@inproceedings{oura:icassp:10,
author = {Keiichiro Oura and Keiichi Tokuda and Junichi
Yamagishi and Mirjam Wester and Simon King},
title = {Unsupervised Cross-lingual Speaker Adaptation for
{HMM}-based Speech Synthesis},
booktitle = {Proc. of ICASSP},
volume = {I},
pages = {4954-4957},
abstract = {In the EMIME project, we are developing a mobile
device that performs personalized speech-to-speech
translation such that a user's spoken input in one
language is used to produce spoken output in another
language, while continuing to sound like the user's
voice. We integrate two techniques, unsupervised
adaptation for HMM-based TTS using a word-based
large-vocabulary continuous speech recognizer and
cross-lingual speaker adaptation for HMM-based TTS,
into a single architecture. Thus, an unsupervised
cross-lingual speaker adaptation system can be
developed. Listening tests show very promising results,
demonstrating that adapted voices sound similar to the
target speaker and that differences between supervised
and unsupervised cross-lingual speaker adaptation are
small.},
categories = {speaker adaptation, TTS},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/oura_icassp2010.pdf},
year = 2010
}
@misc{Hofer_Berger:sigg2010,
author = {Gregor Hofer and Korin Richmond and Michael Berger},
title = {Lip Synchronization by Acoustic Inversion},
howpublished = {Poster at SIGGRAPH 2010},
address = {Los Angeles, USA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/lipsync-sig10.pdf},
year = 2010
}
@incollection{renals2010a,
author = {Renals, Steve and Hain, Thomas},
title = {Speech Recognition},
booktitle = {Handbook of Computational Linguistics and Natural
Language Processing},
publisher = {Wiley Blackwell},
editor = {Clark, Alex and Fox, Chris and Lappin, Shalom},
year = 2010
}
@inproceedings{wang_interspeech10,
author = {Dong Wang and Simon King and Nick Evans and Raphael
Troncy},
title = {{CRF}-based Stochastic Pronunciation Modelling for
Out-of-Vocabulary Spoken Term Detection},
booktitle = {Proc. Interspeech},
address = {Makuhari, Chiba, Japan},
abstract = {Out-of-vocabulary (OOV) terms present a significant
challenge to spoken term detection (STD). This
challenge, to a large extent, lies in the high degree
of uncertainty in pronunciations of OOV terms. In
previous work, we presented a stochastic pronunciation
modeling (SPM) approach to compensate for this
uncertainty. A shortcoming of our original work,
however, is that the SPM was based on a joint-multigram
model (JMM), which is suboptimal. In this paper, we
propose to use conditional random fields (CRFs) for
letter-to-sound conversion, which significantly
improves quality of the predicted pronunciations. When
applied to OOV STD, we achieve considerable
performance improvement with both a 1-best system and
an SPM-based system.},
categories = {speech recognition, spoken term detection, conditional
random field, joint multigram model},
month = sep,
year = 2010
}
@inproceedings{strom10d,
author = {Volker Strom and Simon King},
title = {A classifier-based target cost for unit selection
speech synthesis trained on perceptual data},
booktitle = {Proc.~Interspeech},
address = {Makuhari, Japan},
abstract = {Our goal is to automatically learn a
PERCEPTUALLY-optimal target cost function for a unit
selection speech synthesiser. The approach we take here
is to train a classifier on human perceptual judgements
of synthetic speech. The output of the classifier is
used to make a simple three-way distinction rather than
to estimate a continuously-valued cost. In order to
collect the necessary perceptual data, we synthesised
145,137 short sentences with the usual target cost
switched off, so that the search was driven by the join
cost only. We then selected the 7200 sentences with the
best joins and asked 60 listeners to judge them,
providing their ratings for each syllable. From this,
we derived a rating for each demiphone. Using as input
the same context features employed in our conventional
target cost function, we trained a classifier on these
human perceptual ratings. We synthesised two sets of
test sentences with both our standard target cost and
the new target cost based on the classifier. A/B
preference tests showed that the classifier-based
target cost, which was learned completely automatically
from modest amounts of perceptual data, is almost as
good as our carefully- and expertly-tuned standard
target cost.},
categories = {speech synthesis, unit selection, target cost},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/strom10d.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/strom10d.ps},
year = 2010
}
@inproceedings{wang_std_covariance_icassp2010,
author = {Wang, Dong and King, Simon and Frankel, Joe and Bell,
Peter},
title = {Stochastic Pronunciation Modelling and Soft Match for
Out-of-vocabulary Spoken Term Detection},
booktitle = {Proc. ICASSP},
address = {Dallas, Texas, USA},
abstract = {A major challenge faced by a spoken term detection
(STD) system is the detection of out-of-vocabulary
(OOV) terms. Although a subword-based STD system is
able to detect OOV terms, performance reduction is
always observed compared to in-vocabulary terms. One
challenge that OOV terms bring to STD is the
pronunciation uncertainty. A commonly used approach to
address this problem is a soft matching procedure, and
the other is the stochastic pronunciation modelling
(SPM) proposed by the authors. In this paper we compare
these two approaches, and combine them using a
discriminative decision strategy. Experimental results
demonstrated that SPM and soft match are highly
complementary, and their combination gives significant
performance improvement to OOV term detection.},
keywords = {confidence estimation, spoken term detection, speech
recognition},
month = mar,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wang10_icassp.pdf},
year = 2010
}
@inproceedings{georgila-sigdial:10,
author = {Georgila, Kallirroi and Wolters, Maria and Moore,
Johanna D.},
title = {Learning Dialogue Strategies from Older and Younger
Simulated Users},
booktitle = {Proc. SIGDIAL},
abstract = {Older adults are a challenging user group because
their behaviour can be highly variable. To the best of
our knowledge, this is the first study where dialogue
strategies are learned and evaluated with both
simulated younger users and simulated older users. The
simulated users were derived from a corpus of
interactions with a strict system-initiative spoken
dialogue system (SDS). Learning from simulated younger
users leads to a policy which is close to one of the
dialogue strategies of the underlying SDS, while the
simulated older users allow us to learn more flexible
dialogue strategies that accommodate mixed initiative.
We conclude that simulated users are a useful technique
for modelling the behaviour of new user groups.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/sigdial_final.pdf},
year = 2010
}
@inproceedings{higher_level,
author = {Oliver Watts and Junichi Yamagishi and Simon King},
title = {The role of higher-level linguistic features in
{HMM}-based speech synthesis},
booktitle = {Proc. Interspeech},
pages = {841-844},
address = {Makuhari, Japan},
abstract = {We analyse the contribution of higher-level elements
of the linguistic specification of a data-driven speech
synthesiser to the naturalness of the synthetic speech
which it generates. The system is trained using various
subsets of the full feature-set, in which features
relating to syntactic category, intonational phrase
boundary, pitch accent and boundary tones are
selectively removed. Utterances synthesised by the
different configurations of the system are then
compared in a subjective evaluation of their
naturalness. The work presented forms background
analysis for an ongoing set of experiments in
performing text-to-speech (TTS) conversion based on
shallow features: features that can be trivially
extracted from text. By building a range of systems,
each assuming the availability of a different level of
linguistic annotation, we obtain benchmarks for our
on-going work.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100565.pdf},
year = 2010
}
@inproceedings{hofer_interspeech2010,
author = {Hofer, Gregor and Richmond, Korin},
title = {Comparison of {HMM} and {TMDN} Methods for Lip
Synchronisation},
booktitle = {Proc. Interspeech},
pages = {454--457},
address = {Makuhari, Japan},
abstract = {This paper presents a comparison between a hidden
Markov model (HMM) based method and a novel artificial
neural network (ANN) based method for lip
synchronisation. Both model types were trained on
motion tracking data, and a perceptual evaluation was
carried out comparing the output of the models, both to
each other and to the original tracked data. It was
found that the ANN-based method was judged
significantly better than the HMM based method.
Furthermore, the original data was not judged
significantly better than the output of the ANN method.},
keywords = {hidden Markov model (HMM), mixture density network,
lip synchronisation, inversion mapping},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100668.pdf},
year = 2010
}
@inproceedings{Ehnes2010A-Precise-Contr,
author = {Ehnes, Jochen},
title = {A Precise Controllable Projection System for Projected
Virtual Characters and Its Calibration},
booktitle = {IEEE International Symposium on Mixed and Augmented
Reality 2010 Science and Technology Proceedings},
pages = {221--222},
address = {Seoul, Korea},
abstract = {In this paper we describe a system to project virtual
characters that shall live with us in the same
environment. In order to project the characters' visual
representations onto room surfaces, we use a
controllable projector.},
categories = {steerable projector, virtual characters, projector
calibration},
month = {October},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ehnes.pdf},
year = 2010
}
@article{turk:2429,
author = {Alice Turk and James Scobbie and Christian Geng and
Cedric Macmartin and Ellen Bard and Barry Campbell and
Catherine Dickie and Eddie Dubourg and Bill Hardcastle
and Phil Hoole and Evia Kanaida and Robin Lickley and
Satsuki Nakai and Marianne Pouplier and Simon King and
Steve Renals and Korin Richmond and Sonja Schaeffler
and Ronnie Wiegand and Kevin White and Alan Wrench},
title = {The {Edinburgh Speech Production Facility's}
articulatory corpus of spontaneous dialogue.},
journal = {The Journal of the Acoustical Society of America},
volume = {128},
number = {4},
pages = {2429-2429},
abstract = {The EPSRC-funded Edinburgh Speech Production Facility is
built around two synchronized Carstens AG500
electromagnetic articulographs (EMAs) in order to
capture articulatory/acoustic data from spontaneous
dialogue. An initial articulatory corpus was designed
with two aims. The first was to elicit a range of
speech styles/registers from speakers, and therefore
provide an alternative to fully scripted corpora. The
second was to extend the corpus beyond monologue, by
using tasks that promote natural discourse and
interaction. A subsidiary driver was to use dialects
from outwith North America: dialogues paired up a
Scottish English and a Southern British English
speaker. Tasks. Monologue: Story reading of ``Comma
Gets a Cure'' [Honorof et al. (2000)], lexical sets
[Wells (1982)], spontaneous story telling,
diadochokinetic tasks. Dialogue: Map tasks [Anderson et
al. (1991)], ``Spot the Difference'' picture tasks
[Bradlow et al. (2007)], story-recall. Shadowing of
the spontaneous story telling by the second
participant. Each dialogue session includes
approximately 30 min of speech, and there are
acoustics-only baseline materials. We will introduce
the corpus and highlight the role of articulatory
production data in helping provide a fuller
understanding of various spontaneous speech phenomena
by presenting examples of naturally occurring covert
speech errors, accent accommodation, turn taking
negotiation, and shadowing.},
doi = {10.1121/1.3508679},
publisher = {ASA},
year = 2010
}
@inproceedings{kurimo:acl:10,
author = {Mikko Kurimo and William Byrne and John Dines and
Philip N. Garner and Matthew Gibson and Yong Guan and
Teemu Hirsim\"{a}ki and Reima Karhila and Simon King
and Hui Liang and Keiichiro Oura and Lakshmi Saheer and
Matt Shannon and Sayaka Shiota and Jilei Tian and
Keiichi Tokuda and Mirjam Wester and Yi-Jian Wu and
Junichi Yamagishi},
title = {Personalising speech-to-speech translation in the
{EMIME} project},
booktitle = {Proc. of the ACL 2010 System Demonstrations},
address = {Uppsala, Sweden},
abstract = {In the EMIME project we have studied unsupervised
cross-lingual speaker adaptation. We have employed an
HMM statistical framework for both speech recognition
and synthesis which provides transformation mechanisms
to adapt the synthesized voice in TTS (text-to-speech)
using the recognized voice in ASR (automatic speech
recognition). An important application for this
research is personalised speech-to-speech translation
that will use the voice of the speaker in the input
language to utter the translated sentences in the
output language. In mobile environments this enhances
the users' interaction across language barriers by
making the output speech sound more like the original
speaker's way of speaking, even if she or he could not
speak the output language.},
categories = {speaker adaptation},
month = {July},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/kurimo_acl_2010.pdf},
year = 2010
}
@article{michael09:dialectHTS,
author = {Michael Pucher and Dietmar Schabus and Junichi
Yamagishi and Friedrich Neubarth and Volker Strom},
title = {Modeling and Interpolation of {Austrian German and
Viennese} Dialect in {HMM}-based Speech Synthesis},
journal = {Speech Communication},
volume = {52},
number = {2},
pages = {164--179},
abstract = {An HMM-based speech synthesis framework is applied to
both Standard Austrian German and a Viennese dialectal
variety and several training strategies for
multi-dialect modeling such as dialect clustering and
dialect-adaptive training are investigated. For
bridging the gap between processing on the level of
HMMs and on the linguistic level, we add phonological
transformations to the HMM interpolation and apply them
to dialect interpolation. The crucial steps are to
employ several formalized phonological rules between
Austrian German and Viennese dialect as constraints for
the HMM interpolation. We verify the effectiveness of
this strategy in a number of perceptual evaluations.
Since the HMM space used is not articulatory but
acoustic space, there are some variations in evaluation
results between the phonological rules. However, in
general we obtained good evaluation results which show
that listeners can perceive both continuous and
categorical changes of dialect varieties by using
phonological transformations employed as switching
rules in the HMM interpolation.},
categories = {speech synthesis, hidden Markov model, dialect,
sociolect, Austrian German},
doi = {10.1016/j.specom.2009.09.004},
year = 2010
}
@article{georgila-lrec:10,
author = {Georgila, Kallirroi and Wolters, Maria and Moore,
Johanna D. and Logie, Robert H.},
title = {The {MATCH} Corpus: A Corpus of Older and Younger
Users' Interactions with Spoken Dialogue Systems.},
journal = {Language Resources and Evaluation},
volume = {44},
number = {3},
pages = {221--261},
abstract = {We present the MATCH corpus, a unique data set of 447
dialogues in which 26 older and 24 younger adults
interact with nine different spoken dialogue systems.
The systems varied in the number of options presented
and the confirmation strategy used. The corpus also
contains information about the users' cognitive
abilities and detailed usability assessments of each
dialogue system. The corpus, which was collected using
a Wizard-of-Oz methodology, has been fully transcribed
and annotated with dialogue acts and ``Information
State Update'' (ISU) representations of dialogue
context. Dialogue act and ISU annotations were
performed semi-automatically. In addition to describing
the corpus collection and annotation, we present a
quantitative analysis of the interaction behaviour of
older and younger users and discuss further
applications of the corpus. We expect that the corpus
will provide a key resource for modelling older
people's interaction with spoken dialogue systems. },
doi = {10.1007/s10579-010-9118-8},
keywords = {Spoken dialogue corpora, Spoken dialogue systems,
Cognitive ageing, Annotation, Information states,
Speech acts, User simulations, Speech recognition},
month = {March},
year = 2010
}
@article{child_speech_journal_2010,
author = {Watts, O. and Yamagishi, J. and King, S. and Berkling,
K.},
title = {Synthesis of Child Speech with {HMM} Adaptation and
Voice Conversion},
journal = {Audio, Speech, and Language Processing, IEEE
Transactions on},
volume = {18},
number = {5},
pages = {1005--1016},
abstract = {The synthesis of child speech presents challenges both
in the collection of data and in the building of a
synthesizer from that data. We chose to build a
statistical parametric synthesizer using the hidden
Markov model (HMM)-based system HTS, as this technique
has previously been shown to perform well for limited
amounts of data, and for data collected under imperfect
conditions. Six different configurations of the
synthesizer were compared, using both speaker-dependent
and speaker-adaptive modeling techniques, and using
varying amounts of data. For comparison with HMM
adaptation, techniques from voice conversion were used
to transform existing synthesizers to the
characteristics of the target speaker. Speaker-adaptive
voices generally outperformed child speaker-dependent
voices in the evaluation. HMM adaptation outperformed
voice conversion style techniques when using the full
target speaker corpus; with fewer adaptation data,
however, no significant listener preference for either
HMM adaptation or voice conversion methods was found.},
doi = {10.1109/TASL.2009.2035029},
issn = {1558-7916},
keywords = {HMM adaptation techniques;child speech
synthesis;hidden Markov model;speaker adaptive modeling
technique;speaker dependent technique;speaker-adaptive
voice;statistical parametric synthesizer;target speaker
corpus;voice conversion;hidden Markov models;speech
synthesis;},
month = jul,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/YamagishiJ_Synthesis\%20of\%20Child\%20Speech.pdf},
year = 2010
}
@inproceedings{wolters-pqs:10,
author = {Wolters, Maria K. and G\"odde, Florian and M\"oller,
Sebastian and Engelbrecht, Klaus-Peter},
title = {Finding Patterns in User Quality Judgements},
booktitle = {Proc. ISCA Workshop Perceptual Quality of Speech
Systems, Dresden, Germany},
abstract = {User quality judgements can show a bewildering amount
of variation that is difficult to capture using
traditional quality prediction approaches. Using
clustering, an exploratory statistical analysis
technique, we reanalysed the data set of a Wizard-of-Oz
experiment where 25 users were asked to rate the
dialogue after each turn. The sparse data problem was
addressed by careful a priori parameter choices and
comparison of the results of different cluster
algorithms. We found two distinct classes of users,
positive and critical. Positive users were generally
happy with the dialogue system, and did not mind
errors. Critical users downgraded their opinion of the
system after errors, used a wider range of ratings, and
were less likely to rate the system positively overall.
These user groups could not be predicted by experience
with spoken dialogue systems, attitude to spoken
dialogue systems, affinity with technology, demographics,
or short-term memory capacity. We suggest that
evaluation research should focus on critical users and
discuss how these might be identified.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/Wolters_et_al_PQS.pdf},
year = 2010
}
@inproceedings{junichi:interspeech2010,
author = {Junichi Yamagishi and Oliver Watts and Simon King and
Bela Usabaev},
title = {Roles of the Average Voice in Speaker-adaptive
{HMM}-based Speech Synthesis},
booktitle = {{Proc. Interspeech}},
pages = {418--421},
address = {Makuhari, Japan},
abstract = {In speaker-adaptive HMM-based speech synthesis, there
are typically a few speakers for which the output
synthetic speech sounds worse than that of other
speakers, despite having the same amount of adaptation
data from within the same corpus. This paper
investigates these fluctuations in quality and
concludes that as mel-cepstral distance from the
average voice becomes larger, the MOS naturalness
scores generally become worse. Although this negative
correlation is not that strong, it suggests a way to
improve the training and adaptation strategies. We also
draw comparisons between our findings and the work of
other researchers regarding ``vocal attractiveness.''},
keywords = {speech synthesis, HMM, average voice, speaker
adaptation},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100361.pdf},
year = 2010
}
@inproceedings{junichi:icassp2010,
author = {J. Yamagishi and S. King},
title = {Simple methods for improving speaker-similarity of
{HMM}-based speech synthesis},
booktitle = {{Proc. ICASSP 2010}},
address = {Dallas, Texas, USA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/JunichiICASSP10.pdf},
year = 2010
}
@article{junichi:ieee2010,
author = {J. Yamagishi and B. Usabaev and S. King and O. Watts
and J. Dines and J. Tian and R. Hu and Y. Guan and K.
Oura and K. Tokuda and R. Karhila and M. Kurimo},
title = {Thousands of Voices for {HMM}-based Speech Synthesis
-- Analysis and Application of {TTS} Systems Built on
Various {ASR} Corpora},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {18},
number = {5},
pages = {984--1004},
abstract = {In conventional speech synthesis, large amounts of
phonetically balanced speech data recorded in highly
controlled recording studio environments are typically
required to build a voice. Although using such data is
a straightforward solution for high quality synthesis,
the number of voices available will always be limited,
because recording costs are high. On the other hand,
our recent experiments with HMM-based speech synthesis
systems have demonstrated that speaker-adaptive
HMM-based speech synthesis (which uses an ``average
voice model'' plus model adaptation) is robust to
non-ideal speech data that are recorded under various
conditions and with varying microphones, that are not
perfectly clean, and/or that lack phonetic balance.
This enables us to consider building high-quality
voices on ``non-TTS'' corpora such as ASR corpora.
Since ASR corpora generally include a large number of
speakers, this leads to the possibility of producing an
enormous number of voices automatically. In this paper,
we demonstrate the thousands of voices for HMM-based
speech synthesis that we have made from several popular
ASR corpora such as the Wall Street Journal (WSJ0,
WSJ1, and WSJCAM0), Resource Management, Globalphone,
and SPEECON databases. We also present the results of
associated analysis based on perceptual evaluation, and
discuss remaining issues.},
doi = {10.1109/TASL.2010.2045237},
keywords = {Automatic speech recognition (ASR), H Triple S (HTS),
SPEECON database, WSJ database, average voice, hidden
Markov model (HMM)-based speech synthesis, speaker
adaptation, speech synthesis, voice conversion},
month = jul,
year = 2010
}
@inproceedings{wester:ssw7:10,
author = {Mirjam Wester and John Dines and Matthew Gibson and
Hui Liang and Yi-Jian Wu and Lakshmi Saheer and Simon
King and Keiichiro Oura and Philip N. Garner and
William Byrne and Yong Guan and Teemu Hirsim\"{a}ki and
Reima Karhila and Mikko Kurimo and Matt Shannon and
Sayaka Shiota and Jilei Tian and Keiichi Tokuda and
Junichi Yamagishi},
title = {Speaker adaptation and the evaluation of speaker
similarity in the {EMIME} speech-to-speech translation
project},
booktitle = {Proc. of 7th ISCA Speech Synthesis Workshop},
address = {Kyoto, Japan},
abstract = {This paper provides an overview of speaker adaptation
research carried out in the EMIME speech-to-speech
translation (S2ST) project. We focus on how speaker
adaptation transforms can be learned from speech in one
language and applied to the acoustic models of another
language. The adaptation is transferred across
languages and/or from recognition models to synthesis
models. The various approaches investigated can all be
viewed as a process in which a mapping is defined in
terms of either acoustic model states or linguistic
units. The mapping is used to transfer either speech
data or adaptation transforms between the two models.
Because the success of speaker adaptation in
text-to-speech synthesis is measured by judging speaker
similarity, we also discuss issues concerning
evaluation of speaker similarity in an S2ST scenario.},
categories = {speaker adaptation, evaluation},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_ssw7_2010.pdf},
year = 2010
}
@inproceedings{kilgour2010a,
author = {Kilgour, Jonathan and Carletta, Jean and Renals, Steve},
title = {The {Ambient Spotlight}: Personal multimodal search
without query},
booktitle = {Proc. ICMI-MLMI},
abstract = {The Ambient Spotlight is a prototype system based on
personal meeting capture using a laptop and a portable
microphone array. The system automatically recognises
and structures the meeting content using automatic
speech recognition, topic segmentation and extractive
summarisation. The recognised speech in the meeting is
used to construct queries to automatically link meeting
segments to other relevant material, both multimodal
and textual. The interface to the system is constructed
around a standard calendar interface, and it is
integrated with the laptop's standard indexing, search
and retrieval.},
doi = {10.1145/1891903.1891919},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/ambientDemo-icmi.pdf},
url = {http://dx.doi.org/10.1145/1891903.1891919},
year = 2010
}
@inproceedings{michael:interspeech2010,
author = {Michael Pucher and Dietmar Schabus and Junichi
Yamagishi},
title = {Synthesis of fast speech with interpolation of adapted
{HSMMs} and its evaluation by blind and sighted
listeners},
booktitle = {Proc. Interspeech},
pages = {2186--2189},
address = {Makuhari, Japan},
abstract = {In this paper we evaluate a method for generating
synthetic speech at high speaking rates based on the
interpolation of hidden semi-Markov models (HSMMs)
trained on speech data recorded at normal and fast
speaking rates. The subjective evaluation was carried
out with both blind listeners, who are used to very
fast speaking rates, and sighted listeners. We show
that we can achieve a better intelligibility rate and
higher voice quality with this method compared to
standard HSMM-based duration modeling. We also evaluate
duration modeling with the interpolation of all the
acoustic features including not only duration but also
spectral and F0 models. An analysis of the mean squared
error (MSE) of standard HSMM-based duration modeling
for fast speech identifies problematic linguistic
contexts for duration modeling.},
keywords = {speech synthesis, fast speech, hidden semi-Markov
model},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100294.pdf},
year = 2010
}
@inproceedings{king_hmm_tutorial:india2010,
author = {Simon King},
title = {A tutorial on {HMM} speech synthesis (Invited paper)},
booktitle = {Sadhana -- Academy Proceedings in Engineering
Sciences, Indian Academy of Sciences},
abstract = {Statistical parametric speech synthesis, based on
HMM-like models, has become competitive with
established concatenative techniques over the last few
years. This paper offers a non-mathematical
introduction to this method of speech synthesis. It is
intended to be complementary to the wide range of
excellent technical publications already available.
Rather than offer a comprehensive literature review,
this paper instead gives a small number of carefully
chosen references which are good starting points for
further reading.},
categories = {speech synthesis, HMM synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/king_hmm_tutorial.pdf},
year = 2010
}
@inproceedings{anderssonetal2010_ssw7,
author = {Sebastian Andersson and Junichi Yamagishi and Robert
Clark},
title = {Utilising Spontaneous Conversational Speech in
{HMM}-Based Speech Synthesis},
booktitle = {The 7th ISCA Tutorial and Research Workshop on Speech
Synthesis},
abstract = {Spontaneous conversational speech has many
characteristics that are currently not well modelled in
unit selection and HMM-based speech synthesis. But in
order to build synthetic voices more suitable for
interaction we need data that exhibits more
conversational characteristics than the generally used
read aloud sentences. In this paper we will show how
carefully selected utterances from a spontaneous
conversation were instrumental for building an HMM-based
synthetic voice with more natural sounding
conversational characteristics than a voice based on
carefully read aloud sentences. We also investigated a
style blending technique as a solution to the inherent
problem of phonetic coverage in spontaneous speech
data. But the lack of an appropriate representation of
spontaneous speech phenomena probably contributed to
results showing that we could not yet compete with the
speech quality achieved for grammatical sentences.},
categories = {HMM, speech synthesis, spontaneous speech,
conversation, lexical fillers, filled pauses},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7_paper.pdf},
year = 2010
}
@inproceedings{tejedor_interspeech10,
author = {Javier Tejedor and Doroteo T. Toledano and Miguel
Bautista and Simon King and Dong Wang and Jose Colas},
title = {Augmented set of features for confidence estimation in
spoken term detection},
booktitle = {Proc. Interspeech},
abstract = {Discriminative confidence estimation along with
confidence normalisation have been shown to construct
robust decision maker modules in spoken term detection
(STD) systems. Discriminative confidence estimation,
making use of term-dependent features, has been shown to
improve the widely used lattice-based confidence
estimation in STD. In this work, we augment the set of
these term-dependent features and show a significant
improvement in the STD performance both in terms of
ATWV and DET curves in experiments conducted on a
Spanish geographical corpus. This work also proposes a
multiple linear regression analysis to carry out the
feature selection. Next, the most informative features
derived from it are used within the discriminative
confidence estimation of the STD system.},
categories = {confidence estimation, feature selection, spoken term
detection, speech recognition},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/features.pdf},
year = 2010
}
@inproceedings{letter_based_TTS,
author = {Oliver Watts and Junichi Yamagishi and Simon King},
title = {Letter-based speech synthesis},
booktitle = {Proc. Speech Synthesis Workshop 2010},
pages = {317--322},
address = {Nara, Japan},
abstract = {Initial attempts at performing text-to-speech
conversion based on standard orthographic units are
presented, forming part of a larger scheme of training
TTS systems on features that can be trivially extracted
from text. We evaluate the possibility of using the
technique of decision-tree-based context clustering
conventionally used in HMM-based systems for
parameter-tying to handle letter-to-sound conversion. We
present the application of a method of compound-feature
discovery to corpus-based speech synthesis. Finally, an
evaluation of intelligibility of letter-based systems
and more conventional phoneme-based systems is
presented.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7.pdf},
year = 2010
}