@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2009-citations -ob /home/korin/projects/publications/new_output/transitdata/2009.bib -c 'year : "2009"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{Ehnes2009An-Automated-Me,
author = {Ehnes, Jochen},
title = {An Automated Meeting Assistant: A Tangible Mixed
Reality Interface for the {AMIDA} Automatic Content
Linking Device},
booktitle = {ICEIS},
pages = {952--962},
abstract = {We describe our approach to support ongoing meetings
with an automated meeting assistant. The system, based
on the AMIDA Content Linking Device, aims to provide
relevant documents used in previous meetings for the
ongoing meeting based on automatic speech recognition.
Once the content linking device finds documents linked
to a discussion about a similar subject in a previous
meeting, it assumes they may be relevant for the
current discussion as well. We believe that the way
these documents are offered to the meeting participants
is as important as the way they are found. We
developed a mixed reality, projection-based user
interface that lets the documents appear on the table
tops in front of the meeting participants. They can
hand them over to others or bring them onto the shared
projection screen easily if they consider them
relevant. Yet, irrelevant documents do not draw too much
attention away from the discussion. In this paper we
describe the concept and implementation of this user
interface and provide some preliminary results.},
bibsource = {DBLP, http://dblp.uni-trier.de},
categories = {Tangible User Interface, Mixed Reality, AMI, Content
Linking},
crossref = {DBLP:conf/iceis/2009},
doi = {10.1007/978-3-642-01347-8_79},
keywords = {Tangible User Interface, Mixed Reality, AMI, Content
Linking},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/00240952.pdf},
year = 2009
}
@article{murray2009,
author = {Murray, Gabriel and Kleinbauer, Thomas and Poller,
Peter and Becker, Tilman and Renals, Steve and Kilgour,
Jonathan},
title = {Extrinsic Summarization Evaluation: A Decision Audit
Task},
journal = {ACM Transactions on Speech and Language Processing},
volume = {6},
number = {2},
pages = {1--29},
abstract = {In this work we describe a large-scale extrinsic
evaluation of automatic speech summarization
technologies for meeting speech. The particular task is
a decision audit, wherein a user must satisfy a complex
information need, navigating several meetings in order
to gain an understanding of how and why a given
decision was made. We compare the usefulness of
extractive and abstractive technologies in satisfying
this information need, and assess the impact of
automatic speech recognition (ASR) errors on user
performance. We employ several evaluation methods for
participant performance, including post-questionnaire
data, human subjective and objective judgments, and a
detailed analysis of participant browsing behavior. We
find that while ASR errors affect user satisfaction on
an information retrieval task, users can adapt their
browsing behavior to complete the task satisfactorily.
Results also indicate that users consider extractive
summaries to be intuitive and useful tools for browsing
multimodal meeting data. We discuss areas in which
automatic summarization techniques can be improved in
comparison with gold-standard meeting abstracts.},
doi = {10.1145/1596517.1596518},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/murray-acm09.pdf},
url = {http://doi.acm.org/10.1145/1596517.1596518},
year = 2009
}
@inproceedings{anderssoncabral09,
author = {J. Sebastian Andersson and João P. Cabral and Leonardo
Badino and Junichi Yamagishi and Robert A.J. Clark},
title = {Glottal Source and Prosodic Prominence Modelling in
{HMM}-based Speech Synthesis for the {B}lizzard
{C}hallenge 2009},
booktitle = {The Blizzard Challenge 2009},
address = {Edinburgh, U.K.},
abstract = {This paper describes the CSTR entry for the Blizzard
Challenge 2009. The work focused on modifying two parts
of the Nitech 2005 HTS speech synthesis system to
improve naturalness and contextual appropriateness. The
first part incorporated an implementation of the
Liljencrants-Fant (LF) glottal source model. The second
part focused on improving synthesis of prosodic
prominence, including emphasis, through context-dependent
phonemes. Emphasis was assigned to the synthesised test
sentences based on a handful of theory-based rules. The
two parts (LF-model and prosodic prominence) were not
combined and hence evaluated separately. The results on
naturalness for the LF-model showed that it is not yet
perceived as natural as the Benchmark HTS system for
neutral speech. The results for the prosodic prominence
modelling showed that it was perceived to be as contextually
appropriate as the Benchmark HTS system, despite a low
naturalness score. The Blizzard Challenge evaluation
has provided valuable information on the status of our
work and continued work will begin with analysing why
our modifications resulted in reduced naturalness
compared to the Benchmark HTS system.},
categories = {HMM, HTS, speech synthesis, LF-model, glottal source,
prosodic prominence, emphasis},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
year = 2009
}
@phdthesis{cuayahuitl_thesis2009,
author = {Heriberto Cuayáhuitl},
title = {Hierarchical Reinforcement Learning for Spoken
Dialogue Systems},
school = {School of Informatics, University of Edinburgh},
abstract = {This thesis focuses on the problem of scalable
optimization of dialogue behaviour in speech-based
conversational systems using reinforcement learning.
Most previous investigations in dialogue strategy
learning have proposed flat reinforcement learning
methods, which are more suitable for small-scale spoken
dialogue systems. This research formulates the problem
in terms of Semi-Markov Decision Processes (SMDPs), and
proposes two hierarchical reinforcement learning
methods to optimize sub-dialogues rather than full
dialogues. The first method uses a hierarchy of SMDPs,
where every SMDP ignores irrelevant state variables and
actions in order to optimize a sub-dialogue. The second
method extends the first one by constraining every SMDP
in the hierarchy with prior expert knowledge. The
latter method proposes a learning algorithm called
'HAM+HSMQ-Learning', which combines two existing
algorithms in the literature of hierarchical
reinforcement learning. Whilst the first method
generates fully-learnt behaviour, the second one
generates semi-learnt behaviour. In addition, this
research proposes a heuristic dialogue simulation
environment for automatic dialogue strategy learning.
Experiments were performed on simulated and real
environments based on a travel planning spoken dialogue
system. Experimental results provided evidence to
support the following claims: First, both methods scale
well at the cost of near-optimal solutions, resulting
in slightly longer dialogues than the optimal
solutions. Second, dialogue strategies learnt with
coherent user behaviour and conservative recognition
error rates can outperform a reasonable hand-coded
strategy. Third, semi-learnt dialogue behaviours are a
better alternative (because of their higher overall
performance) than hand-coded or fully-learnt dialogue
behaviours. Last, hierarchical reinforcement learning
dialogue agents are feasible and promising for the
(semi) automatic design of adaptive behaviours in
larger-scale spoken dialogue systems. This research
makes the following contributions to spoken dialogue
systems which learn their dialogue behaviour. First,
the Semi-Markov Decision Process (SMDP) model was
proposed to learn spoken dialogue strategies in a
scalable way. Second, the concept of 'partially
specified dialogue strategies' was proposed for
simultaneously integrating hand-coded and learnt spoken
dialogue behaviours into a single learning framework.
Third, an evaluation with real users of hierarchical
reinforcement learning dialogue agents was essential to
validate their effectiveness in a realistic
environment.},
keywords = {spoken dialogue systems, (semi-)automatic dialogue
strategy design, hierarchical control, prior expert
knowledge, Semi-Markov decision processes, hierarchical
reinforcement learning},
month = {January},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/PhDThesis-HeribertoCuayahuitl-Final.pdf},
year = 2009
}
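@comment{{For readers unfamiliar with the SMDP formulation used in the thesis above, the sketch below shows the core tabular Q-learning backup for a temporally extended action, where the discount is compounded over the steps a sub-dialogue took. This is a minimal illustration only, not the thesis's HAM+HSMQ-Learning algorithm; the toy state and action sizes are invented.

import numpy as np

def smdp_q_update(Q, s, a, r, s_next, tau, alpha=0.1, gamma=0.99):
    # One tabular SMDP Q-learning backup: gamma is compounded over the
    # tau primitive steps the temporally extended action ran, and r is
    # the discounted reward accumulated during those steps.
    target = r + (gamma ** tau) * np.max(Q[s_next])
    Q[s, a] += alpha * (target - Q[s, a])
    return Q

# Toy usage: 4 dialogue states, 2 sub-dialogue actions.
Q = np.zeros((4, 2))
Q = smdp_q_update(Q, s=0, a=1, r=-3.0, s_next=2, tau=5)
print(Q)
}}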
@incollection{vipperla2009a,
author = {Vipperla, Ravi Chander and Wolters, Maria and
Georgila, Kallirroi and Renals, Steve},
title = {Speech Input from Older Users in Smart Environments:
Challenges and Perspectives},
booktitle = {Proc. HCI International: Universal Access in
Human-Computer Interaction. Intelligent and Ubiquitous
Interaction Environments},
publisher = {Springer},
volume = {5615},
series = {Lecture Notes in Computer Science},
abstract = {Although older people are an important user group for
smart environments, there has been relatively little
work on adapting natural language interfaces to their
requirements. In this paper, we focus on a particularly
thorny problem: processing speech input from older
users. Our experiments on the MATCH corpus show clearly
that we need age-specific adaptation in order to
recognize older users' speech reliably. Language models
need to cover typical interaction patterns of older
people, and acoustic models need to accommodate older
voices. Further research is needed into intelligent
adaptation techniques that will allow existing large,
robust systems to be adapted with relatively small
amounts of in-domain, age-appropriate data. In
addition, older users need to be supported with
adequate strategies for handling speech recognition
errors.},
doi = {10.1007/978-3-642-02710-9},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/vipperla2009a.pdf},
url = {http://www.springerlink.com/content/27r01345r1683251/?p=ad2394d646814db59cf9868b0f74b11e&pi=13},
year = 2009
}
@inproceedings{Ehnes2009A-Tangible-Mixed,
author = {Ehnes, Jochen},
title = {A Tangible Mixed Reality Interface for the {AMI}
Automated Meeting Assistant},
booktitle = {Human Interface and the Management of Information},
editor = {Smith, Michael J. and Salvendy, Gavriel},
volume = {5617},
series = {Lecture Notes in Computer Science},
pages = {485--494},
publisher = {Springer},
abstract = {In this paper we describe our approach to support
ongoing meetings with an automated meeting assistant.
We propose an alternative user interface for the AMIDA
Content Linking Device. In order for the system to be
less distractive and more collaborative than the
original laptop screen based one, we developed a system
that projects documents onto the table tops right in
front of the meeting participants. This way they appear
as if they were printed on paper, lying in front of the
participants. We describe our setup as well as the user
interface we built to handle and share these documents.},
categories = {Mixed Reality, AMI, Content Linking, User Interface},
isbn = {978-3-642-02555-6},
location = {Heidelberg},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/56170485.pdf},
year = 2009
}
@inproceedings{zen:HTSoverview,
author = {Heiga Zen and Keiichiro Oura and Takashi Nose and
Junichi Yamagishi and Shinji Sako and Tomoki Toda and
Takashi Masuko and Alan W. Black and Keiichi Tokuda},
title = {Recent development of the {HMM}-based speech synthesis
system ({HTS})},
booktitle = {Proc. 2009 Asia-Pacific Signal and Information
Processing Association (APSIPA)},
address = {Sapporo, Japan},
abstract = {A statistical parametric approach to speech synthesis
based on hidden Markov models (HMMs) has grown in
popularity over the last few years. In this approach,
spectrum, excitation, and duration of speech are
simultaneously modeled by context-dependent HMMs, and
speech waveforms are generated from the HMMs themselves.
Since December 2002, we have publicly released an
open-source software toolkit named “HMM-based speech
synthesis system (HTS)” to provide a research and
development toolkit for statistical parametric speech
synthesis. This paper describes recent developments of
HTS in detail, as well as future release plans.},
month = oct,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/zen_APSIPA2009.pdf},
year = 2009
}
@article{cuayahuitl2009,
author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon,
Oliver and Shimodaira, Hiroshi},
title = {Evaluation of a hierarchical reinforcement learning
spoken dialogue system},
journal = {Computer Speech and Language},
volume = {24},
number = {2},
pages = {395--429},
abstract = {We describe an evaluation of spoken dialogue
strategies designed using hierarchical reinforcement
learning agents. The dialogue strategies were learnt in
a simulated environment and tested in a laboratory
setting with 32 users. These dialogues were used to
evaluate three types of machine dialogue behaviour:
hand-coded, fully-learnt and semi-learnt. These
experiments also served to evaluate the realism of
simulated dialogues using two proposed metrics
contrasted with ‘Precision-Recall’. The learnt
dialogue behaviours used the Semi-Markov Decision
Process (SMDP) model, and we report the first
evaluation of this model in a realistic conversational
environment. Experimental results in the travel
planning domain provide evidence to support the
following claims: (a) hierarchical semi-learnt dialogue
agents are a better alternative (with higher overall
performance) than deterministic or fully-learnt
behaviour; (b) spoken dialogue strategies learnt with
highly coherent user behaviour and conservative
recognition error rates (keyword error rate of 20\%)
can outperform a reasonable hand-coded strategy; and
(c) hierarchical reinforcement learning dialogue agents
are feasible and promising for the (semi) automatic
design of optimized dialogue behaviours in larger-scale
systems.},
doi = {10.1016/j.csl.2009.07.001},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cuayahuitl-csl09.pdf},
year = 2009
}
@incollection{sarah:hts09,
author = {Sarah Creer and Phil Green and Stuart Cunningham and
Junichi Yamagishi},
title = {Building personalised synthesised voices for
individuals with dysarthria using the {HTS} toolkit},
booktitle = {Computer Synthesized Speech Technologies: Tools for
Aiding Impairment},
publisher = {IGI Global},
editor = {John W. Mullennix and Steven E. Stern},
edition = {1st},
note = {in press},
abstract = {When the speech of an individual becomes
unintelligible due to a neurological disorder, a
synthesized voice can replace that of the individual.
To fully replace all functions of human speech
communication: communication of information,
maintenance of social relationships and displaying
identity, the voice must be intelligible,
natural-sounding and retain the vocal identity of the
speaker. For speakers with dysarthria, achieving this
output with minimal data recordings and deteriorating
speech is difficult. An alternative is to use hidden
Markov models (HMMs), which require much less speech
data than concatenative methods, to adapt a robust
statistical model of speech towards the
speaker characteristics captured in the data recorded
by the individual. This chapter implements this
technique using the HTS toolkit to build personalized
synthetic voices for two individuals with dysarthria.
An evaluation of the voices by the participants
themselves suggests that this technique shows promise
for building and reconstructing personalized voices for
individuals with dysarthria once deterioration has
begun.},
year = 2009
}
@inproceedings{Ayletetal09,
author = {Matthew P. Aylett and Simon King and Junichi Yamagishi},
title = {Speech Synthesis Without a Phone Inventory},
booktitle = {Interspeech},
pages = {2087--2090},
abstract = { In speech synthesis the unit inventory is decided
using phonological and phonetic expertise. This process
is resource intensive and potentially sub-optimal. In
this paper we investigate how acoustic clustering,
together with lexicon constraints, can be used to build
a self-organised inventory. Six English speech
synthesis systems were built using two frameworks, unit
selection and parametric HTS, for three inventory
conditions: 1) a traditional phone set, 2) a system
using orthographic units, and 3) a self-organised
inventory. A listening test showed a strong preference
for the classic system, and for the orthographic system
over the self-organised system. Results also varied by
letter-to-sound complexity and database coverage. This
suggests the self-organised approach failed to
generalise pronunciation, as well as introducing noise
above and beyond that caused by orthographic sound
mismatch.},
categories = {speech synthesis, unit selection, parametric
synthesis, phone inventory, orthographic synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/ma_interspeech09.pdf},
address = {Brighton, UK},
year = 2009
}
@inproceedings{dongwang_interspeech09_spm,
author = {Dong Wang and Simon King and Joe Frankel},
title = {Stochastic Pronunciation Modelling for Spoken Term
Detection},
booktitle = {Proc. of Interspeech},
pages = {2135--2138},
address = {Brighton, UK},
abstract = {A major challenge faced by a spoken term detection
(STD) system is the detection of out-of-vocabulary
(OOV) terms. Although a subword-based STD system is
able to detect OOV terms, performance reduction is
always observed compared to in-vocabulary terms.
Current approaches to STD do not acknowledge the
particular properties of OOV terms, such as
pronunciation uncertainty. In this paper, we use a
stochastic pronunciation model to deal with the
uncertain pronunciations of OOV terms. By considering
all possible term pronunciations, predicted by a
joint-multigram model, we observe a significant
performance improvement. },
categories = {joint-multigram, pronunciation model, spoken term
detection, speech recognition},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/spm.pdf},
year = 2009
}
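@comment{{The stochastic pronunciation modelling in dongwang_interspeech09_spm amounts to marginalising the detection score over candidate pronunciations predicted by a joint-multigram model. A minimal sketch of that marginalisation follows; the pronunciations and probabilities here are invented placeholders, and the real system scores pronunciations against lattices rather than with a lookup table.

def spm_score(pron_posteriors, detection_scores):
    # Marginalise detection confidence over candidate pronunciations:
    #   p(term in X) = sum_q p(q | term) * p(term in X | q)
    # pron_posteriors: dict pron -> p(pron | term), e.g. from a G2P model
    # detection_scores: dict pron -> detection score for that pronunciation
    return sum(p * detection_scores.get(pron, 0.0)
               for pron, p in pron_posteriors.items())

prons = {"n ih sh": 0.6, "n ay sh": 0.3, "n ih s": 0.1}
scores = {"n ih sh": 0.82, "n ay sh": 0.15, "n ih s": 0.05}
print(spm_score(prons, scores))
}}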
@inproceedings{johnson-aas:09,
author = {Johnson, Christine and Campbell, Pauline and
DePlacido, Christine and Liddell, Amy and Wolters,
Maria},
title = {Does Peripheral Hearing Loss Affect {RGDT} Thresholds
in Older Adults?},
booktitle = {Proceedings of the {A}merican {A}uditory {S}ociety
{C}onference},
abstract = {},
categories = {speech synthesis, older users},
month = mar,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/aas09.pdf},
year = 2009
}
@inproceedings{wolters-is:09,
author = {Wolters, Maria and Vipperla, Ravichander and Renals,
Steve},
title = {Age Recognition for Spoken Dialogue Systems: Do We
Need It?},
booktitle = {Proc. Interspeech},
abstract = {When deciding whether to adapt relevant aspects of the
system to the particular needs of older users, spoken
dialogue systems often rely on automatic detection of
chronological age. In this paper, we show that vocal
ageing as measured by acoustic features is an
unreliable indicator of the need for adaptation. Simple
lexical features greatly improve the prediction of both
relevant aspects of cognition and interaction style.
Lexical features also boost age group prediction. We
suggest that adaptation should be based on observed
behaviour, not on chronological age, unless it is not
feasible to build classifiers for relevant adaptation
decisions.},
categories = {age recognition, spoken dialogue systems},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/is09.pdf},
year = 2009
}
@inproceedings{huang2009-icassp,
author = {Songfang Huang and Bowen Zhou},
title = {An {EM} Algorithm for {SCFG} in Formal Syntax-based
Translation},
booktitle = {Proc. IEEE International Conference on Acoustics,
Speech, and Signal Processing (ICASSP'09)},
pages = {4813--4816},
address = {Taipei, Taiwan},
abstract = {In this paper, we investigate the use of bilingual
parsing on parallel corpora to better estimate the rule
parameters in a formal syntax-based machine translation
system, which are normally estimated from
inaccurate heuristics. We use an
Expectation-Maximization (EM) algorithm to re-estimate
the parameters of synchronous context-free grammar
(SCFG) rules according to the derivation knowledge from
parallel corpora based on the maximum likelihood principle,
rather than using only the heuristic information. The
proposed algorithm produces significantly better BLEU
scores than a state-of-the-art formal syntax-based
machine translation system on the IWSLT 2006 Chinese to
English task.},
month = apr,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/icassp09.pdf},
year = 2009
}
@inproceedings{huang2009-is,
author = {Songfang Huang and Steve Renals},
title = {A Parallel Training Algorithm for Hierarchical
{P}itman-{Y}or Process Language Models},
booktitle = {Proc. Interspeech'09},
pages = {2695--2698},
address = {Brighton, UK},
abstract = {The Hierarchical Pitman-Yor Process Language Model
(HPYLM) is a Bayesian language model based on a
non-parametric prior, the Pitman-Yor Process. It has
been demonstrated, both theoretically and practically,
that the HPYLM can provide better smoothing for
language modeling, compared with state-of-the-art
approaches such as interpolated Kneser-Ney and modified
Kneser-Ney smoothing. However, estimation of Bayesian
language models is expensive in terms of both
computation time and memory; the inference is
approximate and requires a number of iterations to
converge. In this paper, we present a parallel training
algorithm for the HPYLM, which enables the approach to
be applied in the context of automatic speech
recognition, using large training corpora with large
vocabularies. We demonstrate the effectiveness of the
proposed algorithm by estimating language models from
corpora for meeting transcription containing over 200
million words, and observe significant reductions in
perplexity and word error rate.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/sh_interspeech09.pdf},
year = 2009
}
@article{McGowanBerger2009,
author = {Richard S. McGowan and Michael A. Berger},
title = {Acoustic-articulatory mapping in vowels by locally
weighted regression},
journal = {Journal of the Acoustical Society of America},
volume = {126},
number = {4},
pages = {2011--2032},
abstract = {A method for mapping between simultaneously measured
articulatory and acoustic data is proposed. The method
uses principal components analysis on the articulatory
and acoustic variables, and mapping between the domains
by locally weighted linear regression, or loess
[Cleveland, W. S. (1979) J. Am. Stat. Assoc. 74,
829--836]. The latter method permits local variation in
the slopes of the linear regression, assuming that the
function being approximated is smooth. The methodology
is applied to vowels of four speakers in the Wisconsin
X-ray Microbeam Speech Production Database, with
formant analysis. Results are examined in terms of (1)
examples of forward (articulation-to-acoustics)
mappings and inverse mappings, (2) distributions of
local slopes and constants, (3) examples of
correlations among slopes and constants, (4)
root-mean-square error, and (5) sensitivity of formant
frequencies to articulatory change. It is shown that
the results are qualitatively correct and that loess
performs better than global regression. The forward
mappings show different root-mean-square error
properties than the inverse mappings indicating that
this method is better suited for the forward mappings
than the inverse mappings, at least for the data chosen
for the current study. Some preliminary results on
sensitivity of the first two formant frequencies to the
two most important articulatory principal components
are presented.},
categories = {Articulatory inversion, locally weighted regression,
X-ray microbeam, formant analysis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/aam.pdf},
year = 2009
}
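@comment{{The locally weighted regression at the heart of McGowanBerger2009 can be sketched compactly: for each query point, fit a weighted linear model over its nearest neighbours with a tricube kernel and read off the local prediction. This is a generic loess-style sketch on invented toy data, not the paper's exact procedure (which maps between principal-component scores of the articulatory and acoustic variables).

import numpy as np

def loess_predict(X, Y, x_query, frac=0.3):
    # Locally weighted linear regression (loess) at a single query point.
    n = X.shape[0]
    d = np.linalg.norm(X - x_query, axis=1)
    k = max(2, int(frac * n))
    idx = np.argsort(d)[:k]                    # nearest fraction of points
    h = d[idx].max() + 1e-12
    w = (1.0 - (d[idx] / h) ** 3) ** 3         # tricube weights
    A = np.hstack([np.ones((k, 1)), X[idx]])   # local linear design matrix
    sw = np.sqrt(w)[:, None]                   # weighted least squares
    beta = np.linalg.lstsq(A * sw, Y[idx] * sw, rcond=None)[0]
    return np.concatenate([[1.0], x_query]) @ beta

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))                       # stand-in articulatory scores
Y = np.sin(X[:, :1]) + 0.1 * rng.normal(size=(200, 1))  # stand-in acoustic target
print(loess_predict(X, Y, np.array([0.5, -0.2])))
}}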
@inproceedings{tietze:09,
author = {Martin I. Tietze and Andi Winterboer and Johanna D.
Moore},
title = {The effect of linguistic devices in information
presentation messages on recall and comprehension},
booktitle = {Proceedings ENLG09},
categories = {discourse cues, verbal information presentation,
recall, eye-tracking, Mechanical Turk},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/tietze.ENLG09.pdf},
year = 2009
}
@article{ling2008,
author = {Ling, Z. and Richmond, K. and Yamagishi, J. and Wang,
R.},
title = {Integrating Articulatory Features into {HMM}-based
Parametric Speech Synthesis},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {17},
number = {6},
pages = {1171--1185},
note = {\textbf{IEEE SPS 2010 Young Author Best Paper Award}},
abstract = {This paper presents an investigation of ways to
integrate articulatory features into Hidden Markov
Model (HMM)-based parametric speech synthesis,
primarily with the aim of improving the performance of
acoustic parameter generation. The joint distribution
of acoustic and articulatory features is estimated
during training and is then used for parameter
generation at synthesis time in conjunction with a
maximum-likelihood criterion. Different model
structures are explored to allow the articulatory
features to influence acoustic modeling: model
clustering, state synchrony and cross-stream feature
dependency. The results of objective evaluation show
that the accuracy of acoustic parameter prediction can
be improved when shared clustering and
asynchronous-state model structures are adopted for
combined acoustic and articulatory features. More
significantly, our experiments demonstrate that
modeling the dependency between these two feature
streams can make speech synthesis more flexible. The
characteristics of synthetic speech can be easily
controlled by modifying generated articulatory features
as part of the process of acoustic parameter
generation.},
categories = {Speech synthesis, articulation, HMM-based synthesis},
doi = {10.1109/TASL.2009.2014796},
key = {ling2008},
month = aug,
year = 2009
}
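@comment{{The key operation behind the controllability result in ling2008 is conditioning acoustic parameters on (possibly modified) articulatory features through a jointly modelled Gaussian. The sketch below shows only that static conditional-Gaussian step, with toy dimensions and invented numbers; the paper applies it per HMM state together with dynamic-feature constraints during parameter generation.

import numpy as np

def conditional_gaussian(mu, Sigma, n_ac, x_artic):
    # Partition a joint Gaussian over [acoustic; articulatory] and return
    # the mean and covariance of acoustic | articulatory = x_artic.
    mu_a, mu_r = mu[:n_ac], mu[n_ac:]
    S_aa = Sigma[:n_ac, :n_ac]
    S_ar = Sigma[:n_ac, n_ac:]
    S_rr = Sigma[n_ac:, n_ac:]
    K = S_ar @ np.linalg.inv(S_rr)
    return mu_a + K @ (x_artic - mu_r), S_aa - K @ S_ar.T

mu = np.array([0.0, 1.0, 0.5])            # [1 acoustic dim; 2 articulatory dims]
Sigma = np.array([[1.0, 0.4, 0.2],
                  [0.4, 1.0, 0.1],
                  [0.2, 0.1, 1.0]])
mean, cov = conditional_gaussian(mu, Sigma, 1, np.array([1.5, 0.0]))
print(mean, cov)
}}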
@inproceedings{child_synthesis_2009,
author = {Oliver Watts and Junichi Yamagishi and Simon King and
Kay Berkling},
title = {{HMM} Adaptation and Voice Conversion for the
Synthesis of Child Speech: A Comparison},
booktitle = {Proc. Interspeech 2009},
pages = {2627--2630},
address = {Brighton, U.K.},
abstract = {This study compares two different methodologies for
producing data-driven synthesis of child speech from
existing systems that have been trained on the speech
of adults. On one hand, an existing statistical
parametric synthesiser is transformed using model
adaptation techniques, informed by linguistic and
prosodic knowledge, to the speaker characteristics of a
child speaker. This is compared with the application of
voice conversion techniques to convert the output of an
existing waveform concatenation synthesiser with no
explicit linguistic or prosodic knowledge. In a
subjective evaluation of the similarity of synthetic
speech to natural speech from the target speaker, the
HMM-based systems evaluated are generally preferred,
although this is at least in part due to the higher
dimensional acoustic features supported by these
techniques.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/child_synthesis_2009.pdf},
year = 2009
}
@inproceedings{Blizzard_summary_09,
author = {Simon King and Vasilis Karaiskos},
title = {The {B}lizzard {C}hallenge 2009},
booktitle = {Proc. Blizzard Challenge Workshop},
address = {Edinburgh, UK},
abstract = {The Blizzard Challenge 2009 was the fifth annual
Blizzard Challenge. As in 2008, UK English and Mandarin
Chinese were the chosen languages for the 2009
Challenge. The English corpus was the same one used in
2008. The Mandarin corpus was provided by iFLYTEK. As
usual, participants with limited resources or limited
experience in these languages had the option of using
unaligned labels that were provided for both corpora
and for the test sentences. An accent-specific
pronunciation dictionary was also available for the
English speaker. This year, the tasks were organised in
the form of `hubs' and `spokes' where each hub task
involved building a general-purpose voice and each
spoke task involved building a voice for a specific
application. A set of test sentences was released to
participants, who were given a limited time in which to
synthesise them and submit the synthetic speech. An
online listening test was conducted to evaluate
naturalness, intelligibility, degree of similarity to
the original speaker and, for one of the spoke tasks,
"appropriateness."},
categories = {Blizzard Challenge, speech synthesis, evaluation,
listening test},
keywords = {Blizzard},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/summary_Blizzard2009.pdf},
year = 2009
}
@inproceedings{dongwang_icassp09,
author = {Dong Wang and Javier Tejedor and Joe Frankel and
Simon King},
title = {Posterior-based confidence measures for spoken term
detection},
booktitle = {Proc. of ICASSP09},
address = {Taipei, Taiwan},
abstract = {Confidence measures play a key role in spoken term
detection (STD) tasks. The confidence measure expresses
the posterior probability of the search term appearing
in the detection period, given the speech. Traditional
approaches are based on the acoustic and language model
scores for candidate detections found using automatic
speech recognition, with Bayes' rule being used to
compute the desired posterior probability. In this
paper, we present a novel direct posterior-based
confidence measure which, instead of resorting to the
Bayesian formula, calculates posterior probabilities
from a multi-layer perceptron (MLP) directly. Compared
with traditional Bayesian-based methods, the
direct-posterior approach is conceptually and
mathematically simpler. Moreover, the MLP-based model
does not require assumptions to be made about the
acoustic features such as their statistical
distribution and the independence of static and dynamic
co-efficients. Our experimental results in both English
and Spanish demonstrate that the proposed direct
posterior-based confidence improves STD performance. },
categories = {Spoken term detection, confidence measure, posterior
probabilities, MLP},
month = {April},
pages = {4889--4892},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/posterior.pdf},
year = 2009
}
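@comment{{The direct posterior-based confidence of dongwang_icassp09 replaces the Bayes-rule decomposition with a discriminative model that maps detection features straight to a posterior. A minimal numpy sketch of such an MLP scorer follows; the weights are random placeholders and the four input features are invented examples, whereas the paper trains the MLP on labelled detections.

import numpy as np

def mlp_confidence(x, W1, b1, W2, b2):
    # One hidden layer, sigmoid output: an estimate of
    # p(term present in detection period | features).
    h = np.tanh(W1 @ x + b1)
    return 1.0 / (1.0 + np.exp(-(W2 @ h + b2)))

rng = np.random.default_rng(1)
x = rng.normal(size=4)                # e.g. acoustic score, LM score, duration...
W1, b1 = rng.normal(size=(8, 4)), np.zeros(8)
W2, b2 = rng.normal(size=8), 0.0
print(float(mlp_confidence(x, W1, b1, W2, b2)))
}}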
@article{wolters-taccess:09,
author = {Maria Wolters and Kallirroi Georgila and Sarah
MacPherson and Johanna Moore},
title = {Being Old Doesn't Mean Acting Old: Older Users'
Interaction with Spoken Dialogue Systems},
journal = {ACM Transactions on Accessible Computing},
volume = {2},
number = {1},
pages = {1--39},
abstract = {Most studies on adapting voice interfaces to older
users work top-down by comparing the interaction
behavior of older and younger users. In contrast, we
present a bottom-up approach. A statistical cluster
analysis of 447 appointment scheduling dialogs between
50 older and younger users and 9 simulated spoken
dialog systems revealed two main user groups, a
“social” group and a “factual” group.
“Factual” users adapted quickly to the systems and
interacted efficiently with them. “Social” users,
on the other hand, were more likely to treat the system
like a human, and did not adapt their interaction
style. While almost all “social” users were older,
over a third of all older users belonged in the
“factual” group. Cognitive abilities and gender did
not predict group membership. We conclude that spoken
dialog systems should adapt to users based on observed
behavior, not on age. },
categories = {spoken dialogue systems, older users, human-computer
interaction},
year = 2009
}
@inproceedings{dongwang_interspeech09_conf,
author = {Dong Wang and Simon King and Joe Frankel and Peter
Bell},
title = {Term-Dependent Confidence for Out-of-Vocabulary Term
Detection},
booktitle = {Proc. Interspeech},
pages = {2139--2142},
address = {Brighton, UK},
abstract = { Within a spoken term detection (STD) system, the
decision maker plays an important role in retrieving
reliable detections. Most of the state-of-the-art STD
systems make decisions based on a confidence measure
that is term-independent, which poses a serious problem
for out-of-vocabulary (OOV) term detection. In this
paper, we study a term-dependent confidence measure
based on confidence normalisation and discriminative
modelling, particularly focusing on its remarkable
effectiveness for detecting OOV terms. Experimental
results indicate that the term-dependent confidence
provides a much more significant improvement for OOV
terms than for in-vocabulary terms.},
categories = {joint-multigram, pronunciation model, spoken term
detection, speech recognition},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/conf.pdf},
year = 2009
}
@inproceedings{dziemianko_interspeech2009,
author = {Michal Dziemianko and Gregor Hofer and Hiroshi
Shimodaira},
title = {{HMM}-Based Automatic Eye-Blink Synthesis from Speech},
booktitle = {Proc. Interspeech},
pages = {1799--1802},
address = {Brighton, UK},
abstract = {In this paper we present a novel technique to
automatically synthesise eye blinking from a speech
signal. Animating the eyes of a talking head is
important as they are a major focus of attention during
interaction. The developed system predicts eye blinks
from the speech signal and generates animation
trajectories automatically, employing a ``Trajectory
Hidden Markov Model''. The evaluation of the
recognition performance showed that the timing of
blinking can be predicted from speech with an F-score
value upwards of 52\%, which is well above chance.
Additionally, a preliminary perceptual evaluation was
conducted, which confirmed that adding eye blinking
significantly improves the perception of the character.
Finally, it showed that the speech-synchronised
synthesised blinks outperform random blinking in
naturalness ratings.},
categories = {animation, motion synthesis, time series analysis,
trajectory model},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/dziemianko_interspeech2009.pdf},
year = 2009
}
@inproceedings{leo_09-1,
author = {Leonardo Badino and J. Sebastian Andersson and Junichi
Yamagishi and Robert A.J. Clark},
title = {Identification of Contrast and Its Emphatic
Realization in {HMM}-based Speech Synthesis},
booktitle = {Proc. Interspeech 2009},
address = {Brighton, U.K.},
abstract = {The work presented in this paper proposes to identify
contrast in the form of contrastive word pairs and
prosodically signal it with emphatic accents in a
Text-to-Speech (TTS) application using a
Hidden-Markov-Model (HMM) based speech synthesis
system. We first describe a novel method to
automatically detect contrastive word pairs using
textual features only and report its performance on a
corpus of spontaneous conversations in English.
Subsequently we describe the set of features selected
to train an HMM-based speech synthesis system that
attempts to properly control prosodic prominence
(including emphasis). Results from a large-scale
perceptual test show that in the majority of cases
listeners judge emphatic contrastive word pairs to be as
acceptable as their non-emphatic counterparts, while
emphasis on non-contrastive pairs is almost never
acceptable.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090749.PDF},
year = 2009
}
@article{yamagishi2009,
author = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga
and Ling, Zhenhua and Toda, Tomoki and Tokuda, Keiichi
and King, Simon and Renals, Steve},
title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech
Synthesis},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {17},
number = {6},
pages = {1208--1230},
abstract = {This paper describes a speaker-adaptive HMM-based
speech synthesis system. The new system, called
``HTS-2007,'' employs speaker adaptation (CSMAPLR+MAP),
feature-space adaptive training, mixed-gender modeling,
and full-covariance modeling using CSMAPLR transforms,
in addition to several other techniques that have
proved effective in our previous systems. Subjective
evaluation results show that the new system generates
significantly better quality synthetic speech than
speaker-dependent approaches with realistic amounts of
speech data, and that it bears comparison with
speaker-dependent approaches even when large amounts of
speech data are available. In addition, a comparison
study with several speech synthesis techniques shows
the new system is very robust: It is able to build
voices from less-than-ideal speech data and synthesize
good-quality speech even for out-of-domain sentences.},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5109758&arnumber=5153555&count=14&index=12},
year = 2009
}
@inproceedings{cabral_yrwst,
author = {J. Cabral and S. Renals and K. Richmond and J.
Yamagishi},
title = {{HMM}-based Speech Synthesis with an Acoustic Glottal
Source Model},
booktitle = {Proc. The First Young Researchers Workshop in Speech
Technology},
abstract = {A major cause of degradation of speech quality in
HMM-based speech synthesis is the use of a simple delta
pulse signal to generate the excitation of voiced
speech. This paper describes a new approach to using an
acoustic glottal source model in HMM-based
synthesisers. The goal is to improve speech quality and
parametric flexibility to better model and transform
voice characteristics.},
categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral
Separation},
month = apr,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf},
year = 2009
}
@proceedings{DBLP:conf/iceis/2009,
title = {Enterprise Information Systems, 11th International
Conference, ICEIS 2009, Milan, Italy, May 6-10, 2009.
Proceedings},
editor = {Filipe, Joaquim and Cordeiro, José},
volume = {24},
series = {Lecture Notes in Business Information Processing},
publisher = {Springer},
bibsource = {DBLP, http://dblp.uni-trier.de},
booktitle = {ICEIS},
doi = {10.1007/978-3-642-01347-8},
isbn = {978-3-642-01346-1},
year = 2009
}
@inproceedings{jyamagis:emime,
author = {Junichi Yamagishi and Mike Lincoln and Simon King and
John Dines and Matthew Gibson and Jilei Tian and Yong
Guan},
title = {Analysis of Unsupervised and Noise-Robust
Speaker-Adaptive {HMM}-Based Speech Synthesis Systems
toward a Unified {ASR} and {TTS} Framework},
booktitle = {Proc. Blizzard Challenge Workshop},
address = {Edinburgh, U.K.},
abstract = {For the 2009 Blizzard Challenge we have built an
unsupervised version of the HTS-2008 speaker-adaptive
HMM-based speech synthesis system for English, and a
noise robust version of the systems for Mandarin. They
are designed from a multidisciplinary application point
of view in that we attempt to integrate the components
of the TTS system with other technologies such as ASR.
All the average voice models are trained exclusively
from recognized, publicly available, ASR databases.
Multi-pass LVCSR and confidence scores calculated from
confusion networks are used for the unsupervised
systems, and noisy data recorded in cars or public
spaces is used for the noise robust system. We believe
the developed systems form solid benchmarks and provide
good connections to ASR fields. This paper describes
the development of the systems and reports the results
and analysis of their evaluation.},
month = sep,
year = 2009
}
@inproceedings{richmond2009b,
author = {Richmond, K.},
title = {Preliminary Inversion Mapping Results with a New {EMA}
Corpus},
booktitle = {Proc. Interspeech},
pages = {2835--2838},
address = {Brighton, UK},
abstract = {In this paper, we apply our inversion mapping method,
the trajectory mixture density network (TMDN), to a new
corpus of articulatory data, recorded with a Carstens
AG500 electromagnetic articulograph. This new data set,
mngu0, is relatively large and phonetically rich, among
other beneficial characteristics. We obtain good
results, with a root mean square (RMS) error of only
0.99mm. This compares very well with our previous
lowest result of 1.54mm RMS error for equivalent coils
of the MOCHA fsew0 EMA data. We interpret this as
showing the mngu0 data set is potentially more
consistent than the fsew0 data set, and is very useful
for research which calls for articulatory trajectory
data. It also supports our view that the TMDN is very
much suited to the inversion mapping problem.},
keywords = {acoustic-articulatory inversion mapping, neural
network},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090544.pdf},
year = 2009
}
@inproceedings{richmond2009a,
author = {Richmond, K. and Clark, R. and Fitt, S.},
title = {Robust {LTS} rules with the {Combilex} speech
technology lexicon},
booktitle = {Proc. Interspeech},
pages = {1295--1298},
address = {Brighton, UK},
abstract = {Combilex is a high quality pronunciation lexicon aimed
at speech technology applications that has recently
been released by CSTR. Combilex benefits from several
advanced features. This paper evaluates one of these:
the explicit alignment of phones to graphemes in a
word. This alignment can help to rapidly develop robust
and accurate letter-to-sound (LTS) rules, without
needing to rely on automatic alignment methods. To
evaluate this, we used Festival's LTS module, comparing
its standard automatic alignment with Combilex's
explicit alignment. Our results show using Combilex's
alignment improves LTS accuracy: 86.50\% words correct
as opposed to 84.49\%, with our most general form of
lexicon. In addition, building LTS models is greatly
accelerated, as the need to list allowed alignments is
removed. Finally, loose comparison with other studies
indicates Combilex is a superior quality lexicon in
terms of consistency and size.},
keywords = {combilex, letter-to-sound rules, grapheme-to-phoneme
conversion},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090308.pdf},
year = 2009
}
@inproceedings{Ehnes2009A-Tangible-Inte,
author = {Ehnes, Jochen},
title = {A Tangible Interface for the {AMI} Content Linking
Device -- The Automated Meeting Assistant},
booktitle = {Proceedings of HSI 2009},
editor = {Bello, Lucia Lo and Iannizzotto, Giancarlo},
pages = {306-313},
note = {Best Paper Award (Human Machine Interaction)},
abstract = {In this paper we describe our approach to support
ongoing meetings with an automated meeting assistant.
The system based on the AMIDA Content Linking Device
aims at providing relevant documents used in previous
meetings for the ongoing meeting based on automatic
speech recognition. Once the content linking device
finds documents linked to a discussion about a similar
subject in a previous meeting, it assumes they may be
relevant for the current discussion as well. We believe
that the way these documents are offered to the meeting
participants is as important as the way they are
found. We developed a projection based mixed reality
user interface that lets the documents appear on the
table tops in front of the meeting participants. They
can hand them over to others or bring them onto the
shared projection screen easily if they consider them
relevant for others as well. Yet, irrelevant documents
do not draw too much attention away from the discussion. In
this paper we describe the concept and implementation
of this user interface and provide some preliminary
results.},
categories = {Tangible Interface, AMI, Content Linking, Mixed
Reality},
isbn = {978-1-4244-3960-7},
keywords = {Tangible Interface, Mixed Reality, Projection System,
Content Linking, Automatic Meeting Assistant},
lccn = {2009900916},
location = {Catania, Italy},
month = {May},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/tt4_001902.pdf},
year = 2009
}
@phdthesis{zhang-thesis2009,
author = {Le Zhang},
title = {Modelling Speech Dynamics with Trajectory-{HMM}s},
school = {School of Informatics, University of Edinburgh},
abstract = {The conditional independence assumption imposed by
hidden Markov models (HMMs) makes it difficult to
model temporal correlation patterns in human speech.
Traditionally, this limitation is circumvented by
appending the first and second-order regression
coefficients to the observation feature vectors.
Although this leads to improved performance in
recognition tasks, we argue that a straightforward use
of dynamic features in HMMs will result in an inferior
model, due to the incorrect handling of dynamic
constraints. In this thesis I will show that an HMM can
be transformed into a Trajectory-HMM capable of
generating smoothed output mean trajectories, by
performing a per-utterance normalisation. The resulting
model can be trained by either maximising model
log-likelihood or minimising mean generation errors on
the training data. To combat the exponential growth of
paths in searching, the idea of delayed path merging is
proposed and a new time-synchronous decoding algorithm
built on the concept of token-passing is designed for
use in the recognition task. The Trajectory-HMM brings
a new way of sharing knowledge between speech
recognition and synthesis components, by tackling both
problems in a coherent statistical framework. I
evaluated the Trajectory-HMM on two different speech
tasks using the speaker-dependent MOCHA-TIMIT database.
First as a generative model to recover articulatory
features from speech signal, where the Trajectory-HMM
was used in a complementary way to the conventional HMM
modelling techniques, within a joint
Acoustic-Articulatory framework. Experiments indicate
that the jointly trained acoustic-articulatory models
are more accurate (having a lower Root Mean Square
error) than the separately trained ones, and that
Trajectory-HMM training results in greater accuracy
compared with conventional Baum-Welch parameter
updating. In addition, the Root Mean Square (RMS)
training objective proves to be consistently better
than the Maximum Likelihood objective. However,
experiments on the phone recognition task show that the
MLE-trained Trajectory-HMM, while retaining attractive
properties of being a proper generative model, tends to
favour over-smoothed trajectories among competing
hypotheses, and does not perform better than a
conventional HMM. We use this to build an argument that
models giving a better fit on training data may suffer
a reduction of discrimination by being too faithful to
the training data. Finally, experiments on using
triphone models show that increasing modelling detail
is an effective way to leverage modelling performance
with little added complexity in training. },
keywords = {speech recognition, speech synthesis, MOCHA,
trajectory HMM},
month = {January},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/zhangle_thesis.pdf},
year = 2009
}
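@comment{{The smoothing step that turns an HMM into a Trajectory-HMM in zhang-thesis2009 is the maximum-likelihood trajectory solve: stack the static and delta windows into a matrix W, then solve (W' D W) c = W' D mu for the static trajectory c, with D the diagonal inverse-variance matrix. A minimal one-dimensional sketch with a (-0.5, 0, 0.5) delta window and invented toy means follows.

import numpy as np

def ml_trajectory(mu, var):
    # mu, var: interleaved [static, delta] means/variances per frame.
    T = len(mu) // 2
    W = np.zeros((2 * T, T))
    for t in range(T):
        W[2 * t, t] = 1.0                  # static window row
        if t > 0:
            W[2 * t + 1, t - 1] = -0.5     # delta window row
        if t < T - 1:
            W[2 * t + 1, t + 1] = 0.5
    D = np.diag(1.0 / var)
    return np.linalg.solve(W.T @ D @ W, W.T @ D @ mu)

# Static means step from 0 to 1 mid-utterance; delta means ask for smoothness.
mu = np.array([0, 0, 0, 0, 1, 0, 1, 0], dtype=float)
var = np.ones(8)
print(ml_trajectory(mu, var))              # a smoothed, not stepped, trajectory
}}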
@article{wolters-iwc:09,
author = {Maria Wolters and Kallirroi Georgila and Robert Logie
and Sarah MacPherson and Johanna Moore and Matt Watson},
title = {Reducing Working Memory Load in Spoken Dialogue
Systems},
journal = {Interacting with Computers},
volume = {21},
number = {4},
pages = {276--287},
abstract = {We evaluated two strategies for alleviating working
memory load for users of voice interfaces: presenting
fewer options per turn and providing confirmations.
Forty-eight users booked appointments using nine
different dialogue systems, which varied in the number
of options presented and the confirmation strategy
used. Participants also performed four cognitive tests
and rated the usability of each dialogue system on a
standardised questionnaire. When systems presented more
options per turn and avoided explicit confirmation
subdialogues, both older and younger users booked
appointments more quickly without compromising task
success. Users with lower information processing speed
were less likely to remember all relevant aspects of
the appointment. Working memory span did not affect
appointment recall. Older users were slightly less
satisfied with the dialogue systems than younger users.
We conclude that the number of options is less
important than an accurate assessment of the actual
cognitive demands of the task at hand.},
categories = {spoken dialogue; ageing; older adults; cognitive
aging; working memory},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/iwc09.pdf},
year = 2009
}
@article{hifny2009,
author = {Hifny, Y. and Renals, S.},
title = {Speech Recognition Using Augmented Conditional Random
Fields},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {17},
number = {2},
pages = {354--365},
abstract = {Acoustic modeling based on hidden Markov models (HMMs)
is employed by state-of-the-art stochastic speech
recognition systems. Although HMMs are a natural choice
to warp the time axis and model the temporal phenomena
in the speech signal, their conditional independence
properties limit their ability to model spectral
phenomena well. In this paper, a new acoustic modeling
paradigm based on augmented conditional random fields
(ACRFs) is investigated and developed. This paradigm
addresses some limitations of HMMs while maintaining
many of the aspects which have made them successful. In
particular, the acoustic modeling problem is
reformulated in a data driven, sparse, augmented space
to increase discrimination. Acoustic context modeling
is explicitly integrated to handle the sequential
phenomena of the speech signal. We present an efficient
framework for estimating these models that ensures
scalability and generality. In the TIMIT phone
recognition task, a phone error rate of 23.0\% was
recorded on the full test set, a significant
improvement over comparable HMM-based systems.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/hifny2009.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4749447&arnumber=4749472&count=25&index=15},
year = 2009
}
@inproceedings{john:HTSGAP,
author = {J. Dines and J. Yamagishi and S. King},
title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
booktitle = {Proc. Interspeech},
pages = {1391--1394},
address = {Brighton, U.K.},
abstract = {The EMIME European project is conducting research in
the development of technologies for mobile,
personalised speech-to-speech translation systems. The
hidden Markov model is being used as the underlying
technology in both automatic speech recognition (ASR)
and text-to-speech synthesis (TTS) components, thus,
the investigation of unified statistical modelling
approaches has become an implicit goal of our research.
As one of the first steps towards this goal, we have
been investigating commonalities and differences
between HMM-based ASR and TTS. In this paper we present
results and analysis of a series of experiments that
have been conducted on English ASR and TTS systems,
measuring their performance with respect to phone set
and lexicon, acoustic feature type and dimensionality
and HMM topology. Our results show that, although the
fundamental statistical model may be essentially the
same, optimal ASR and TTS performance often demands
diametrically opposed system designs. This represents a
major challenge to be addressed in the investigation of
such unified modelling approaches.},
month = sep,
year = 2009
}
@inproceedings{dongwang_interspeech09_cmb,
author = {Javier Tejedor and Dong Wang and Simon King and Joe
Frankel and Jose Colas},
title = {A Posterior Probability-based System Hybridisation and
Combination for Spoken Term Detection},
booktitle = {Proc. Interspeech},
pages = {2131--2134},
address = {Brighton, UK},
abstract = {Spoken term detection (STD) is a fundamental task for
multimedia information retrieval. To improve the
detection performance, we have presented a direct
posterior-based confidence measure generated from a
neural network. In this paper, we propose a
detection-independent confidence estimation based on
the direct posterior confidence measure, in which the
decision making is totally separated from the term
detection. Based on this idea, we first present a
hybrid system which conducts the term detection and
confidence estimation based on different sub-word
units, and then propose a combination method which
merges detections from heterogeneous term detectors
based on the direct posterior-based confidence.
Experimental results demonstrated that the proposed
methods improved system performance considerably for
both English and Spanish. },
categories = {joint-multigram, pronunciation model, spoken term
detection, speech recognition},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cmb.pdf},
year = 2009
}
@inproceedings{bell_king_full_covariance_asru2009,
author = {Bell, Peter and King, Simon},
title = {Diagonal Priors for Full Covariance Speech Recognition},
booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
and Understanding},
address = {Merano, Italy},
abstract = {We investigate the use of full covariance Gaussians
for large-vocabulary speech recognition. The large
number of parameters gives high modelling power, but
when training data is limited, the standard sample
covariance matrix is often poorly conditioned, and has
high variance. We explain how these problems may be
solved by the use of a diagonal covariance smoothing
prior, and relate this to the shrinkage estimator, for
which the optimal shrinkage parameter may itself be
estimated from the training data. We also compare the
use of generatively and discriminatively trained
priors. Results are presented on a large vocabulary
conversational telephone speech recognition task.},
doi = {10.1109/ASRU.2009.5373344},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/shrinkage_asru2009.pdf},
year = 2009
}
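@comment{{The diagonal smoothing prior in bell_king_full_covariance_asru2009 acts like a shrinkage estimator: interpolate the sample covariance with its own diagonal, which improves conditioning when training data is scarce. The sketch below shows only that convex combination on random toy data; the paper additionally estimates the optimal shrinkage weight from the training data itself, which is not reproduced here.

import numpy as np

def shrink_to_diagonal(S, lam):
    # Convex combination of the sample covariance and its diagonal target.
    return lam * np.diag(np.diag(S)) + (1.0 - lam) * S

rng = np.random.default_rng(2)
X = rng.normal(size=(30, 10))          # few samples relative to dimension
S = np.cov(X, rowvar=False)            # noisy, poorly conditioned estimate
for lam in (0.0, 0.5):
    print(lam, np.linalg.cond(shrink_to_diagonal(S, lam)))
}}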
@inproceedings{steiner_is2009a,
author = {Steiner, I. and Richmond, K.},
title = {Towards Unsupervised Articulatory Resynthesis of
{G}erman Utterances using {EMA} data},
booktitle = {Proc. Interspeech},
pages = {2055--2058},
address = {Brighton, UK},
abstract = {As part of ongoing research towards integrating an
articulatory synthesizer into a text-to-speech (TTS)
framework, a corpus of German utterances recorded with
electromagnetic articulography (EMA) is resynthesized
to provide training data for statistical models. The
resynthesis is based on a measure of similarity between
the original and resynthesized EMA trajectories,
weighted by articulatory relevance. Preliminary results
are discussed and future work outlined.},
keywords = {articulatory speech synthesis, copy synthesis,
electromagnetic articulography, EMA},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090558.pdf},
year = 2009
}
@inproceedings{jyamagis:1000sHTS,
author = {J. Yamagishi and Bela Usabaev and Simon King and
Oliver Watts and John Dines and Jilei Tian and Rile Hu
and Yong Guan and Keiichiro Oura and Keiichi Tokuda and
Reima Karhila and Mikko Kurimo},
title = {Thousands of voices for {HMM}-based speech synthesis},
booktitle = {Proc. Interspeech},
pages = {420--423},
address = {Brighton, U.K.},
abstract = {Our recent experiments with HMM-based speech synthesis
systems have demonstrated that speaker-adaptive
HMM-based speech synthesis (which uses an ‘average
voice model’ plus model adaptation) is robust to
non-ideal speech data that are recorded under various
conditions and with varying microphones, that are not
perfectly clean, and/or that lack phonetic balance.
This enables us to consider building high-quality voices
on ‘non-TTS’ corpora such as ASR corpora. Since ASR
corpora generally include a large number of speakers,
this leads to the possibility of producing an enormous
number of voices automatically. In this paper we show
thousands of voices for HMM-based speech synthesis that
we have made from several popular ASR corpora such as
the Wall Street Journal databases (WSJ0/WSJ1/WSJCAM0),
Resource Management, Globalphone and Speecon. We report
some perceptual evaluation results and outline the
outstanding issues.},
month = sep,
year = 2009
}
@inproceedings{NiekraszMoore09,
author = {John Niekrasz and Johanna Moore},
title = {Participant Subjectivity and Involvement as a Basis
for Discourse Segmentation},
booktitle = {{Proceedings of the SIGDIAL 2009 Conference}},
pages = {54--61},
abstract = {We propose a framework for analyzing episodic
conversational activities in terms of expressed
relationships between the participants and utterance
content. We test the hypothesis that linguistic
features which express such properties, e.g. tense,
aspect, and person deixis, are a useful basis for
automatic intentional discourse segmentation. We
present a novel algorithm and test our hypothesis on a
set of intentionally segmented conversational
monologues. Our algorithm performs better than a simple
baseline and as well as or better than well-known
lexical-semantic segmentation methods.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/NiekraszMoore09.pdf},
year = 2009
}