2008.bib
@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2008-citations -ob /home/korin/projects/publications/new_output/transitdata/2008.bib -c 'year : "2008"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@article{analysis-hts-adaptation-junichi,
author = {Junichi Yamagishi and Takao Kobayashi and Yuji Nakano
and Katsumi Ogata and Juri Isogai},
title = {Analysis of Speaker Adaptation Algorithms for
{HMM}-based Speech Synthesis and a Constrained {SMAPLR}
Adaptation Algorithm},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
note = {In press},
abstract = {In this paper we analyze the effects of several
factors and configuration choices encountered during
training and model construction when we want to obtain
better and more stable adaptation in HMM-based speech
synthesis. We then propose a new adaptation algorithm
called constrained structural maximum a posteriori
linear regression (CSMAPLR) whose derivation is based
on the knowledge obtained in this analysis and on the
results of comparing several conventional adaptation
algorithms. Here we investigate six major aspects of
the speaker adaptation: initial models, transform
functions, estimation criteria, and sensitivity of
several linear regression adaptation
algorithms. Analyzing the effect of the initial model,
we compare speaker-dependent models, gender-independent
models, and the simultaneous use of the
gender-dependent models with the single use of the
gender-dependent models. Analyzing the effect of the
transform functions, we compare the transform function
for only mean vectors with that for mean vectors and
covariance matrices. Analyzing the effect of the
estimation criteria, we compare the ML criterion with a
robust estimation criterion called structural MAP. We
evaluate the sensitivity of several thresholds for the
piecewise linear regression algorithms and take up
methods combining MAP adaptation with the linear
regression algorithms. We incorporate these adaptation
algorithms into our speech synthesis system and present
several subjective and objective evaluation results
showing the utility and effectiveness of these
algorithms in speaker adaptation for HMM-based speech
synthesis.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
speaker adaptation, voice conversion, average voice},
key = {analysis-hts-adaptation-junichi},
year = 2008
}
@inproceedings{renals2008,
author = {Renals, Steve and Hain, Thomas and Bourlard, Hervé},
title = {Interpretation of Multiparty Meetings: The {AMI} and
{AMIDA} Projects},
booktitle = {IEEE Workshop on Hands-Free Speech Communication and
Microphone Arrays, 2008. HSCMA 2008},
pages = {115--118},
abstract = {The AMI and AMIDA projects are collaborative EU
projects concerned with the automatic recognition and
interpretation of multiparty meetings. This paper
provides an overview of the advances we have made in
these projects with a particular focus on the
multimodal recording infrastructure, the publicly
available AMI corpus of annotated meeting recordings,
and the speech recognition framework that we have
developed for this domain.},
doi = {10.1109/HSCMA.2008.4538700},
keywords = {AMI corpus; Meetings; evaluation; speech recognition},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/renals2008.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4538666&arnumber=4538700&count=68&index=33},
year = 2008
}
@inproceedings{vipperla08,
author = {Ravichander Vipperla and Steve Renals and Joe Frankel},
title = {Longitudinal study of {ASR} performance on ageing
voices},
booktitle = {Proc.~Interspeech},
address = {Brisbane},
abstract = {This paper presents the results of a longitudinal
study of ASR performance on ageing voices. Experiments
were conducted on the audio recordings of the
proceedings of the Supreme Court Of The United States
(SCOTUS). Results show that the Automatic Speech
Recognition (ASR) Word Error Rates (WERs) for elderly
voices are significantly higher than those of adult
voices. The word error rate increases gradually as the
age of the elderly speakers increases. Use of maximum
likelihood linear regression (MLLR) based speaker
adaptation on ageing voices improves the WER, though the
performance is still considerably lower compared to
adult voices. Speaker adaptation, however, reduces the
increase in WER with age during old age.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/vipperla_is08.pdf},
year = 2008
}
@article{zhang-spl2008,
author = {Le Zhang and Steve Renals},
title = {Acoustic-Articulatory Modelling with the Trajectory
{HMM}},
journal = {IEEE Signal Processing Letters},
volume = 15,
pages = {245--248},
abstract = { In this letter, we introduce a hidden Markov model
(HMM)-based inversion system to recover articulatory
movements from speech acoustics. Trajectory HMMs are
used as generative models for modelling articulatory
data. Experiments on the MOCHA-TIMIT corpus indicate
that the jointly trained acoustic-articulatory models
are more accurate (lower RMS error) than the separately
trained ones, and that trajectory HMM training results
in greater accuracy compared with conventional maximum
likelihood HMM training. Moreover, the system has the
ability to synthesize articulatory movements directly
from a textual representation. },
key = {articulatory inversion},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/zhang-spl.pdf},
year = 2008
}
@article{treeboosting-junichi,
author = {Junichi Yamagishi and Hisashi Kawai and Takao
Kobayashi},
title = {Phone Duration Modeling Using Gradient Tree Boosting},
journal = {Speech Communication},
volume = 50,
number = 5,
pages = {405--415},
abstract = { In text-to-speech synthesis systems, phone duration
influences the quality and naturalness of synthetic
speech. In this study, we incorporate an ensemble
learning technique called gradient tree boosting into
phone duration modeling as an alternative to the
conventional approach using regression trees, and
objectively evaluate the prediction accuracy of
Japanese, Mandarin, and English phone duration. The
gradient tree boosting algorithm is a meta-algorithm over
regression trees: it iteratively builds regression
trees from the residuals and outputs a weighted sum of
the regression trees. Our evaluation results show that,
compared to regression trees or other related
techniques, the gradient tree boosting algorithm can
substantially and robustly improve the predictive
accuracy of phone duration regardless of language,
speaker, or domain.},
categories = {Text-to-speech synthesis, Phone duration modeling,
Gradient tree boosting},
doi = {10.1016/j.specom.2007.12.003},
key = {treeboosting-junichi},
month = may,
year = 2008
}
@inproceedings{ling:richmond:yamagishi:wang:2008a,
author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
Junichi and Wang, Ren-Hua },
title = {Articulatory Control of {HMM}-based Parametric Speech
Synthesis Driven by Phonetic Knowledge},
booktitle = {Proc. Interspeech},
pages = {573--576},
address = {Brisbane, Australia},
abstract = {This paper presents a method to control the
characteristics of synthetic speech flexibly by
integrating articulatory features into a Hidden Markov
Model (HMM)-based parametric speech synthesis system.
In contrast to model adaptation and interpolation
approaches for speaking style control, this method is
driven by phonetic knowledge, and target speech samples
are not required. The joint distribution of parallel
acoustic and articulatory features considering
cross-stream feature dependency is estimated. At
synthesis time, acoustic and articulatory features are
generated simultaneously based on the
maximum-likelihood criterion. The synthetic speech can
be controlled flexibly by modifying the generated
articulatory features according to arbitrary phonetic
rules in the parameter generation process. Our
experiments show that the proposed method is effective
in both changing the overall character of synthesized
speech and in controlling the quality of a specific
vowel.},
categories = {speech synthesis, HMM, articulatory features, phonetic
knowledge},
key = {ling:richmond:yamagishi:wang:2008a},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080582.PDF},
year = 2008
}
@inproceedings{goedde:08,
author = {Florian G\"odde and Sebastian M\"oller and Klaus-Peter
Engelbrecht and Christine K\"uhnel and Robert
Schleicher and Anja Naumann and Maria Wolters},
title = {Study of a Speech-based Smart Home System with Older
Users},
booktitle = {International Workshop on Intelligent User Interfaces
for Ambient Assisted Living},
pages = {17--22},
year = 2008
}
@inproceedings{cereproc-hts,
author = {Matthew P. Aylett and Junichi Yamagishi},
title = {Combining Statistical Parametric Speech Synthesis and
Unit-Selection for Automatic Voice Cloning},
booktitle = {Proc. LangTech 2008},
address = {Brisbane, Australia},
abstract = {The ability to use the recorded audio of a subject's
voice to produce an open-domain synthesis system has
generated much interest both in academic research and
in commercial speech technology. The ability to produce
synthetic versions of a subject's voice has potential
commercial applications, such as virtual celebrity
actors, or potential clinical applications, such as
offering a synthetic replacement voice in the case of a
laryngectomy. Recent developments in HMM-based speech
synthesis have shown it is possible to produce
synthetic voices from quite small amounts of speech
data. However, mimicking the depth and variation of a
speaker's prosody as well as synthesising natural
voice quality is still a challenging research problem.
In contrast, unit-selection systems have shown it is
possible to strongly retain the character of the voice
but only with sufficient original source material.
Often this runs into hours and may require significant
manual checking and labelling. In this paper we will
present two state-of-the-art systems: an HMM-based
system, HTS-2007, developed by CSTR and Nagoya Institute
of Technology, and a commercial unit-selection system,
CereVoice, developed by CereProc. Both systems have
been used to mimic the voice of George W. Bush (43rd
president of the United States) using freely available
audio from the web. In addition we will present a
hybrid system which combines both technologies. We
demonstrate examples of synthetic voices created from
10, 40 and 210 minutes of randomly selected speech. We
will then discuss the underlying problems associated
with voice cloning using found audio, and the
scalability of our solution.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
speaker adaptation, voice conversion, average voice},
key = {cereproc-hts},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/03_AYLETT.pdf},
year = 2008
}
@inproceedings{tietze:08:sci,
author = {Martin Tietze and Vera Demberg and Johanna D. Moore},
title = {Syntactic Complexity induces Explicit Grounding in the
{MapTask} corpus},
booktitle = {Proc. Interspeech},
abstract = {This paper provides evidence for theories of grounding
and dialogue management in human conversation. For each
utterance in a corpus of task-oriented dialogues, we
calculated integration costs, which are based on
syntactic sentence complexity. We compared the
integration costs and grounding behavior under two
conditions, namely face-to-face and a no-eye-contact
condition. The results show that integration costs were
significantly higher for explicitly grounded utterances
in the no-eye-contact condition, but not in the
face-to-face condition.},
categories = {dialogue, syntactic complexity, grounding},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081130.pdf},
year = 2008
}
@inproceedings{bell_king_shrinkage_is2008,
author = {Bell, Peter and King, Simon},
title = {A Shrinkage Estimator for Speech Recognition with Full
Covariance {HMM}s},
booktitle = {Proc. Interspeech},
address = {Brisbane, Australia},
note = {Shortlisted for best student paper award.},
abstract = {We consider the problem of parameter estimation in
full-covariance Gaussian mixture systems for automatic
speech recognition. Due to the high dimensionality of
the acoustic feature vector, the standard sample
covariance matrix has a high variance and is often
poorly-conditioned when the amount of training data is
limited. We explain how the use of a shrinkage
estimator can solve these problems, and derive a
formula for the optimal shrinkage intensity. We present
results of experiments on a phone recognition task,
showing that the estimator gives a performance
improvement over a standard full-covariance system.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/shrinkage_is2008.pdf},
year = 2008
}
@incollection{murray2008c,
author = {Murray, Gabriel and Kleinbauer, Thomas and Poller,
Peter and Renals, Steve and Kilgour, Jonathan},
title = {Extrinsic Summarization Evaluation: A Decision Audit
Task},
booktitle = {Machine Learning for Multimodal Interaction (Proc.
MLMI '08)},
publisher = {Springer},
number = {5237},
series = {Lecture Notes in Computer Science},
pages = {349--361},
abstract = {In this work we describe a large-scale extrinsic
evaluation of automatic speech summarization
technologies for meeting speech. The particular task is
a decision audit, wherein a user must satisfy a complex
information need, navigating several meetings in order
to gain an understanding of how and why a given
decision was made. We compare the usefulness of
extractive and abstractive technologies in satisfying
this information need, and assess the impact of
automatic speech recognition (ASR) errors on user
performance. We employ several evaluation methods for
participant performance, including post-questionnaire
data, human subjective and objective judgments, and an
analysis of participant browsing behaviour.},
doi = {10.1007/978-3-540-85853-9_32},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008c.pdf},
year = 2008
}
@inproceedings{wang:frankel:tejedor:king:icassp2008,
author = {Dong Wang and Joe Frankel and Javier Tejedor and Simon
King},
title = {A comparison of phone and grapheme-based spoken term
detection},
booktitle = {Proc. ICASSP},
pages = {4969--4972},
abstract = {We propose grapheme-based sub-word units for spoken
term detection (STD). Compared to phones, graphemes
have a number of potential advantages. For
out-of-vocabulary search terms, phone-based approaches
must generate a pronunciation using letter-to-sound
rules. Using graphemes obviates this potentially
error-prone hard decision, shifting pronunciation
modelling into the statistical models describing the
observation space. In addition, long-span grapheme
language models can be trained directly from large text
corpora. We present experiments on Spanish and English
data, comparing phone and grapheme-based STD. For
Spanish, where phone and grapheme-based systems give
similar transcription word error rates (WERs),
grapheme-based STD significantly outperforms a
phone-based approach. The converse is found for English,
where the phone-based system outperforms a grapheme
approach. However, we present additional analysis which
suggests that phone-based STD performance levels may be
achieved by a grapheme-based approach despite lower
transcription accuracy, and that the two approaches may
usefully be combined. We propose a number of directions
for future development of these ideas, and suggest that
if grapheme-based STD can match phone-based
performance, the inherent flexibility in dealing with
out-of-vocabulary terms makes this a desirable
approach.},
doi = {10.1109/ICASSP.2008.4518773},
month = {March-April},
year = 2008
}
@inproceedings{huang2008-is,
author = {Songfang Huang and Steve Renals},
title = {Unsupervised Language Model Adaptation Based on Topic
and Role Information in Multiparty Meetings},
booktitle = {Proc. Interspeech'08},
pages = {833--836},
address = {Brisbane, Australia},
abstract = {We continue our previous work on the modeling of topic
and role information from multiparty meetings using a
hierarchical Dirichlet process (HDP), in the context of
language model adaptation. In this paper we focus on
three problems: 1) an empirical analysis of the HDP as
a nonparametric topic model; 2) the mismatch problem of
vocabularies of the baseline n-gram model and the HDP;
and 3) an automatic speech recognition experiment to
further verify the effectiveness of our adaptation
framework. Experiments on a large meeting corpus of
more than 70 hours of speech data show consistent and
significant improvements in terms of word error rate
for language model adaptation based on the topic and
role information.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/interspeech08.pdf},
year = 2008
}
@inproceedings{kocjancic_issp08,
author = {Kocjancic, Tanja},
title = {Ultrasound investigation of tongue movements in
syllables with different onset structure},
booktitle = {Proc. of the Eighth International Seminar on Speech
Production (ISSP)},
abstract = {This study attempts to describe syllables with
different onset structure not only in terms of
durational changes but also in terms of the distance
the tongue travels over a syllable, measured using
ultrasound, and to compare the ratio between the two
parameters, expressed as speed. Results indicate that
both measures increase with an increasing number of
onset segments, but not to the same degree for all
targets; speed was therefore not constant across
targets. Additionally, the type of onset constituent
greatly influenced all three parameters, and there were
large between-speaker similarities in the case of
durational changes.},
categories = {tongue movements, ultrasound},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/Kocjancic_ISSP_2008.pdf},
year = 2008
}
@inproceedings{gibbonmayo:08,
author = {Gibbon, F. and Mayo, C.},
title = {Adults' perception of conflicting acoustic cues
associated with EPG-defined undifferentiated gestures},
booktitle = {4th International EPG Symposium, Edinburgh, UK.},
categories = {speech perception, cue weighting, undifferentiated
gestures, electropalatography},
year = 2008
}
@article{goubanova:king:specom2008,
author = {Olga Goubanova and Simon King},
title = {Bayesian networks for phone duration prediction},
journal = {Speech Communication},
volume = {50},
number = {4},
pages = {301--311},
abstract = {In a text-to-speech system, the duration of each phone
may be predicted by a duration model. This model is
usually trained using a database of phones with known
durations; each phone (and the context it appears in)
is characterised by a feature vector that is composed
of a set of linguistic factor values. We describe the
use of a graphical model -- a Bayesian network -- for
predicting the duration of a phone, given the values
for these factors. The network has one discrete
variable for each of the linguistic factors and a
single continuous variable for the phone's duration.
Dependencies between variables (or the lack of them)
are represented in the BN structure by arcs (or missing
arcs) between pairs of nodes. During training, both the
topology of the network and its parameters are learned
from labelled data. We compare the results of the BN
model with results for sums of products and CART models
on the same data. In terms of the root mean square
error, the BN model performs much better than both CART
and SoP models. In terms of correlation coefficient,
the BN model performs better than the SoP model, and as
well as the CART model. A BN model has certain
advantages over CART and SoP models. Training SoP
models requires a high degree of expertise. CART models
do not deal with interactions between factors in any
explicit way. As we demonstrate, a BN model can also
make accurate predictions of a phone's duration, even
when the values for some of the linguistic factors are
unknown.},
categories = {Text-to-speech; Bayesian networks; Duration modelling;
Sums of products; Classification and regression trees},
doi = {10.1016/j.specom.2007.10.002},
month = {April},
year = 2008
}
@inproceedings{Aylett+King08,
author = {Matthew P. Aylett and Simon King},
title = {Single Speaker Segmentation and Inventory Selection
Using Dynamic Time Warping Self Organization and Joint
Multigram Mapping},
booktitle = {SSW06},
pages = {258--263},
abstract = {In speech synthesis the inventory of units is decided
by inspection and on the basis of phonological and
phonetic expertise. The ephone (or emergent phone)
project at CSTR is investigating how self organisation
techniques can be applied to build an inventory based
on collected acoustic data together with the
constraints of a synthesis lexicon. In this paper we
will describe a prototype inventory creation method
using dynamic time warping (DTW) for acoustic
clustering and a joint multigram approach for relating
a series of symbols that represent the speech to these
emerged units. We initially examined two symbol sets:
1) a baseline of standard phones; 2) orthographic
symbols. The success of the approach is evaluated by
comparing word boundaries generated by the emergent
phones against those created using state-of-the-art HMM
segmentation. Initial results suggest the DTW
segmentation can match word boundaries with a root mean
square error (RMSE) of 35 ms. Mapping units
onto phones resulted in a higher RMSE of 103 ms. This
error was increased when multiple multigram types were
added and when the default unit clustering was altered
from 40 (our baseline) to 10. Results for orthographic
matching had a higher RMSE of 125 ms. To conclude, we
discuss future work that we believe can reduce this
error rate to a level sufficient for the techniques to
be applied to a unit selection synthesis system. },
categories = {speech synthesis, unit selection, parametric
synthesis, phone inventory, orthographic synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ssw06.pdf},
place = {Bonn},
year = 2008
}
@incollection{murray2008b,
author = {Murray, Gabriel and Renals, Steve},
title = {Detecting Action Items in Meetings},
booktitle = {Machine Learning for Multimodal Interaction (Proc.
MLMI '08)},
publisher = {Springer},
number = {5237},
series = {Lecture Notes in Computer Science},
pages = {208--213},
abstract = {We present a method for detecting action items in
spontaneous meeting speech. Using a supervised approach
incorporating prosodic, lexical and structural
features, we can classify such items with a high degree
of accuracy. We also examine how well various feature
subclasses can perform this task on their own.},
doi = {10.1007/978-3-540-85853-9_19},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008b.pdf},
url = {http://dx.doi.org/10.1007/978-3-540-85853-9_19},
year = 2008
}
@inproceedings{robust-hts,
author = {Junichi Yamagishi and Zhenhua Ling and Simon King},
title = {Robustness of HMM-based Speech Synthesis},
booktitle = {Proc. Interspeech 2008},
pages = {581--584},
address = {Brisbane, Australia},
abstract = {As speech synthesis techniques become more advanced,
we are able to consider building high-quality voices
from data collected outside the usual highly-controlled
recording studio environment. This presents new
challenges that are not present in conventional
text-to-speech synthesis: the available speech data are
not perfectly clean, the recording conditions are not
consistent, and/or the phonetic balance of the material
is not ideal. Although a clear picture of the
performance of various speech synthesis techniques
(e.g., concatenative, HMM-based or hybrid) under good
conditions is provided by the Blizzard Challenge, it is
not well understood how robust these algorithms are to
less favourable conditions. In this paper, we analyse
the performance of several speech synthesis methods
under such conditions. This is, as far as we know, a
new research topic: ``Robust speech synthesis.'' As a
consequence of our investigations, we propose a new
robust training method for HMM-based speech
synthesis for use with speech data collected in
unfavourable conditions.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
speaker adaptation, voice conversion, average voice,
unit selection},
key = {robust-hts},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/RobustnessHTS.pdf},
year = 2008
}
@inproceedings{kocjancic_exling08,
author = {Kocjancic, Tanja},
title = {Tongue movement and syllable onset complexity:
ultrasound study},
booktitle = {Proc. of ISCA Experimental Linguistics ExLing 2008},
abstract = {In this study ultrasound was used to investigate
tongue movements in syllables with a different number and
type of onset consonants. Ultrasound recordings
provided information on the distance the tongue
travels over a target, and audio recordings provided the
time needed. The speed of the tongue's travel was
calculated from the two measurements. Results from ten
speakers show that both duration and distance travelled
increase with an increased number of onset segments,
but that distance travelled is additionally influenced
by the type of the segment, as is speed. Duration also
seemed to be the least speaker-dependent of the three
parameters.},
categories = {tongue movements, ultrasound},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/Kocjancic_ISCA_ExLing_2008.pdf},
year = 2008
}
@inproceedings{georgila:08,
author = {Kallirroi Georgila and Maria Wolters and Vasilis
Karaiskos and Melissa Kronenthal and Robert Logie and
Neil Mayo and Johanna Moore and Matt Watson},
title = {A Fully Annotated Corpus for Studying the Effect of
Cognitive Ageing on Users' Interactions with Spoken
Dialogue Systems},
booktitle = {Proceedings of the 6th International Conference on
Language Resources and Evaluation},
year = 2008
}
@article{garau2008,
author = {Garau, Giulia and Renals, Steve},
title = {Combining Spectral Representations for Large
Vocabulary Continuous Speech Recognition},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {16},
number = {3},
pages = {508--518},
abstract = {In this paper we investigate the combination of
complementary acoustic feature streams in large
vocabulary continuous speech recognition (LVCSR). We
have explored the use of acoustic features obtained
using a pitch-synchronous analysis, STRAIGHT, in
combination with conventional features such as mel
frequency cepstral coefficients. Pitch-synchronous
acoustic features are of particular interest when used
with vocal tract length normalisation (VTLN) which is
known to be affected by the fundamental frequency. We
have combined these spectral representations directly
at the acoustic feature level using heteroscedastic
linear discriminant analysis (HLDA) and at the system
level using ROVER. We evaluated this approach on three
LVCSR tasks: dictated newspaper text (WSJCAM0),
conversational telephone speech (CTS), and multiparty
meeting transcription. The CTS and meeting
transcription experiments were both evaluated using
standard NIST test sets and evaluation protocols. Our
results indicate that combining conventional and
pitch-synchronous acoustic feature sets using HLDA
results in a consistent, significant decrease in word
error rate across all three tasks. Combining at the
system level using ROVER resulted in a further
significant decrease in word error rate.},
doi = {10.1109/TASL.2008.916519},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/garau-taslp08.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4443886},
year = 2008
}
@inproceedings{qin:perpinan:richmond:wrench:renals:2008a,
author = {Qin, C. and Carreira-Perpiñán, M. and Richmond, K.
and Wrench, A. and Renals, S.},
title = {Predicting Tongue Shapes from a Few Landmark Locations},
booktitle = {Proc. Interspeech},
pages = {2306--2309},
address = {Brisbane, Australia},
abstract = {We present a method for predicting the midsagittal
tongue contour from the locations of a few landmarks
(metal pellets) on the tongue surface, as used in
articulatory databases such as MOCHA and the Wisconsin
XRDB. Our method learns a mapping using ground-truth
tongue contours derived from ultrasound data and
drastically improves over spline interpolation. We also
determine the optimal locations of the landmarks, and
the number of landmarks required to achieve a desired
prediction error: 3-4 landmarks are enough to achieve
0.3-0.2 mm error per point on the tongue.},
categories = {ultrasound, tongue contour, articulation},
key = {qin:perpinan:richmond:wrench:renals:2008a},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080929.PDF},
year = 2008
}
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
author = {Cabral, J. and Renals, S. and Richmond, K. and
Yamagishi, J.},
title = {Glottal Spectral Separation for Parametric Speech
Synthesis},
booktitle = {Proc. Interspeech},
pages = {1829--1832},
address = {Brisbane, Australia},
categories = {HMM speech synthesis, Glottal Spectral Separation,
LF-model},
key = {cabral:renals:richmond:yamagishi:2008a},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
year = 2008
}
@inproceedings{leo_08-3,
author = {J. Sebastian Andersson and Leonardo Badino and Oliver
S. Watts and Matthew P. Aylett},
title = {The {CSTR/Cereproc B}lizzard Entry 2008: The
Inconvenient Data},
booktitle = {Proc. Blizzard Challenge Workshop (in Proc.
Interspeech 2008)},
address = {Brisbane, Australia},
abstract = {In a commercial system, the data used for unit selection
is collected with a heavy emphasis on homogeneous
neutral data that has sufficient coverage for the units
that will be used in the system. In this year's Blizzard
entry, CSTR and CereProc present a joint entry where the
emphasis has been to explore techniques for dealing with
data which is not homogeneous (the English entry) and
which did not have appropriate coverage for a
diphone-based system (the Mandarin entry, where
tone/phone combinations were treated as distinct phone
categories). In addition, two further problems were
addressed: 1) making use of non-homogeneous data for
creating a voice that can realise both expressive and
neutral speaking styles (the English entry); 2) building
a unit selection system with no native understanding of
the language but depending instead on external native
evaluation (the Mandarin entry).},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/cstr-cereproc_Blizzard2008.pdf},
year = 2008
}
@inproceedings{hts-child-oliver,
author = {Oliver Watts and Junichi Yamagishi and Kay Berkling
and Simon King},
title = {{HMM}-based synthesis of child speech},
booktitle = {Proc. of The 1st Workshop on Child, Computer and
Interaction (ICMI'08 post-conference workshop)},
address = {Crete, Greece},
abstract = {The synthesis of child speech presents challenges both
in the collection of data and in the building of a
synthesiser from that data. Because only limited data
can be collected, and the domain of that data is
constrained, it is difficult to obtain the type of
phonetically-balanced corpus usually used in speech
synthesis. As a consequence, building a synthesiser
from this data is difficult. Concatenative synthesisers
are not robust to corpora with many missing units (as
is likely when the corpus content is not carefully
designed), so we chose to build a statistical
parametric synthesiser using the HMM-based system HTS.
This technique has previously been shown to perform
well for limited amounts of data, and for data
collected under imperfect conditions. We compared 6
different configurations of the synthesiser, using both
speaker-dependent and speaker-adaptive modelling
techniques, and using varying amounts of data. The
output from these systems was evaluated alongside
natural and vocoded speech, in a Blizzard-style
listening test.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
speaker adaptation, voice conversion, average voice,
child speech},
key = {hts-child-oliver},
month = oct,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/child-hts-oliver.pdf},
year = 2008
}
@inproceedings{strom08,
author = {Volker Strom and Simon King},
title = {Investigating {F}estival's target cost function using
perceptual experiments},
booktitle = {Proc.~Interspeech},
address = {Brisbane},
abstract = {We describe an investigation of the target cost used
in the Festival unit selection speech synthesis system.
Our ultimate goal is to automatically learn a
perceptually optimal target cost function. In this
study, we investigated the behaviour of the target cost
for one segment type. The target cost is based on
counting the mismatches in several context features. A
carrier sentence (``My name is Roger'') was synthesised
using all 147,820 possible combinations of the diphones
/n_ei/ and /ei_m/. 92 representative versions were
selected and presented to listeners as 460 pairwise
comparisons. The listeners' preference votes were used
to analyse the behaviour of the target cost, with
respect to the values of its component linguistic
context features.},
categories = {speech synthesis, unit selection, target costs},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080514.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080514.ps},
year = 2008
}
@inproceedings{dong_ivan_joe_simon_interspeech08_marray,
author = {Dong Wang and Ivan Himawan and Joe Frankel and Simon
King},
title = {A Posterior Approach for Microphone Array Based Speech
Recognition},
booktitle = {Proc. Interspeech},
pages = {996--999},
abstract = {Automatic speech recognition (ASR) becomes rather
difficult in the meetings domain because of the adverse
acoustic conditions, including more background noise,
more echo and reverberation, and frequent cross-talk.
Microphone arrays, using various beamforming algorithms,
have been demonstrated to boost ASR performance
dramatically in such noisy and reverberant environments.
However, almost all existing beamforming measures work
in the acoustic domain, resorting to signal processing
theory and geometric explanation. This limits their
application, and induces significant performance
degradation when the geometric properties are
unavailable or hard to estimate, or if heterogeneous
channels exist in the audio system. In this paper, we
present a new posterior-based approach for array-based
speech recognition. The main idea is that, instead of
enhancing speech signals, we try to enhance the
posterior probabilities of frames belonging to
recognition units, e.g., phones. These enhanced
posteriors are then converted to posterior-probability
based features and are modeled by HMMs, leading to a
tandem ANN-HMM hybrid system as presented by Hermansky
et al. Experimental results demonstrated the validity of
this posterior approach. With posterior accumulation or
enhancement, significant improvement was achieved over
the single-channel baseline. Moreover, we can combine
acoustic enhancement and posterior enhancement together,
leading to a hybrid acoustic-posterior beamforming
approach, which works significantly better than acoustic
beamforming alone, especially in the scenario with
moving speakers.},
categories = {speech recognition, microphone array, beamforming,
tandem approach},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/marray.a.pdf},
year = 2008
}
@article{christensen2008,
author = {Christensen, Heidi and Gotoh, Yoshihiko and Renals,
Steve},
title = {A Cascaded Broadcast News Highlighter},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {16},
pages = {151--161},
abstract = {This paper presents a fully automatic news skimming
system which takes a broadcast news audio stream and
provides the user with the segmented, structured and
highlighted transcript. This constitutes a system with
three different, cascading stages: converting the audio
stream to text using an automatic speech recogniser,
segmenting into utterances and stories and finally
determining which utterance should be highlighted using
a saliency score. Each stage must operate on the
erroneous output from the previous stage in the system;
an effect which is naturally amplified as the data
progresses through the processing stages. We present a
large corpus of transcribed broadcast news data
enabling us to investigate to which degree information
worth highlighting survives this cascading of
processes. Both extrinsic and intrinsic experimental
results indicate that mistakes in the story boundary
detection have a strong impact on the quality of
highlights, whereas erroneous utterance boundaries
cause only minor problems. Further, the difference in
transcription quality does not affect the overall
performance greatly.},
doi = {10.1109/TASL.2007.910746},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/christensen-tasl08.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4407525&arnumber=4383075&count=28&index=16},
year = 2008
}
@incollection{huang2008-mlmi,
author = {Songfang Huang and Steve Renals},
title = {Modeling Topic and Role Information in Meetings using
the Hierarchical {D}irichlet Process},
booktitle = {Machine Learning for Multimodal Interaction V},
publisher = {Springer},
editor = {Popescu-Belis, A. and Stiefelhagen, R.},
volume = {5237},
series = {Lecture Notes in Computer Science},
pages = {214--225},
abstract = {In this paper, we address the modeling of topic and
role information in multiparty meetings, via a
nonparametric Bayesian model called the hierarchical
Dirichlet process. This model provides a powerful
solution to topic modeling and a flexible framework for
the incorporation of other cues such as speaker role
information. We present our modeling framework for
topic and role on the AMI Meeting Corpus, and
illustrate the effectiveness of the approach in the
context of adapting a baseline language model in a
large-vocabulary automatic speech recognition system
for multiparty meetings. The adapted LM produces
significant improvements in terms of both perplexity
and word error rate.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/mlmi08.pdf},
year = 2008
}
@inproceedings{wolters-itg:08,
author = {Wolters, Maria and Campbell, Pauline and DePlacido,
Christine and Liddell, Amy and Owens, David},
title = {Adapting {S}peech {S}ynthesis {S}ystems to {U}sers
with {A}ge-{R}elated {H}earing {L}oss},
booktitle = {Beitr{\"a}ge der 8. {ITG} {F}achtagung
{S}prachkommunikation},
abstract = {This paper summarises the main results of a pilot
study into the effect of auditory ageing on the
intelligibility of synthetic speech. 32 older and 12
younger users had to answer simple questions about a
series of meeting reminders and medication reminders.
They also underwent an extensive battery of
audiological and cognitive assessments. Older users
only had more difficulty understanding the synthetic
voice than younger people if they had elevated
pure-tone thresholds and if they were asked about
unfamiliar medication names. We suggest that these
problems can be remedied by better prompt design. User
interviews show that the synthetic voice used was quite
natural. Problems mentioned by users fit the results of
a previous error analysis. },
categories = {speech synthesis, older users},
month = sep,
url = {http://homepages.inf.ed.ac.uk/mwolters/itg08.pdf},
year = 2008
}
@inproceedings{steiner:richmond:2008a,
author = {Steiner, I. and Richmond, K.},
title = {Generating gestural timing from {EMA} data using
articulatory resynthesis},
booktitle = {Proc. 8th International Seminar on Speech Production},
address = {Strasbourg, France},
abstract = {As part of ongoing work to integrate an articulatory
synthesizer into a modular TTS platform, a method is
presented which allows gestural timings to be generated
automatically from EMA data. Further work is outlined
which will adapt the vocal tract model and phoneset to
English using new articulatory data, and use
statistical trajectory models. },
categories = {articulatory synthesis, EMA, VocalTractLab },
key = {steiner:richmond:2008a},
month = dec,
year = 2008
}
@inproceedings{leo_08-2,
author = {Leonardo Badino and Robert A.J. Clark and Volker Strom},
title = {Including Pitch Accent Optionality in Unit Selection
Text-to-Speech Synthesis},
booktitle = {Proc.~Interspeech},
address = {Brisbane},
abstract = {A significant variability in pitch accent placement is
found when comparing the patterns of prosodic
prominence realized by different English speakers
reading the same sentences. In this paper we describe a
simple approach to incorporate this variability to
synthesize prosodic prominence in unit selection
text-to-speech synthesis. The main motivation of our
approach is that by taking into account the variability
of accent placements we enlarge the set of prosodically
acceptable speech units, thus increasing the chances of
selecting a good quality sequence of units, both in
prosodic and segmental terms. Results on a large scale
perceptual test show the benefits of our approach and
indicate directions for further improvements.},
categories = {speech synthesis, unit selection, prosodic prominence,
pitch accents},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.ps},
year = 2008
}
@inproceedings{joe_dong_simon_interspeech08_bottle,
author = {Joe Frankel and Dong Wang and Simon King},
title = {Growing bottleneck features for tandem {ASR}},
booktitle = {Proc. Interspeech},
pages = {1549},
abstract = { We present a method for training bottleneck MLPs for
use in tandem ASR. Experiments on meetings data show
that this approach leads to improved performance
compared with training MLPs from a random
initialization. },
categories = {tandem ASR, bottleneck MLP},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bottlenet.a.pdf},
year = 2008
}
@inproceedings{tts_barra08,
author = {R. Barra-Chicote and J. Yamagishi and J.M. Montero and
S. King and S. Lutfi and J. Macias-Guarasa},
title = {Generacion de una voz sintetica en {C}astellano basada
en {HSMM} para la {E}valuacion {A}lbayzin 2008:
conversion texto a voz},
booktitle = {V Jornadas en Tecnologia del Habla},
pages = {115--118},
note = {(in Spanish)},
month = nov,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/tts-jth08.pdf},
year = 2008
}
@inproceedings{lips08-gregpr,
author = {Gregor Hofer and Junichi Yamagishi and Hiroshi
Shimodaira},
title = {Speech-driven Lip Motion Generation with a Trajectory
{HMM}},
booktitle = {Proc. Interspeech 2008},
pages = {2314--2317},
address = {Brisbane, Australia},
abstract = {Automatic speech animation remains a challenging
problem that can be described as finding the optimal
sequence of animation parameter configurations given
some speech. In this paper we present a novel technique
to automatically synthesise lip motion trajectories
from a speech signal. The developed system predicts lip
motion units from the speech signal and generates
animation trajectories automatically employing a
Trajectory Hidden Markov Model. Using the MLE
criterion, its parameter generation algorithm produces
the optimal smooth motion trajectories that are used to
drive control points on the lips directly.
Additionally, experiments were carried out to find a
suitable model unit that produces the most accurate
results. Finally a perceptual evaluation was conducted,
that showed that the developed motion units perform
better than phonemes.},
categories = {visual speech synthesis, trajectory HMM, HTS},
key = {lips08-gregpr},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/GregorLipsChallenge08.pdf},
year = 2008
}
@incollection{murray2008a,
author = {Murray, Gabriel and Renals, Steve},
title = {Meta Comments for Summarizing Meeting Speech},
booktitle = {Machine Learning for Multimodal Interaction (Proc.
MLMI '08)},
publisher = {Springer},
number = {5237},
series = {Lecture Notes in Computer Science},
pages = {236--247},
abstract = {This paper is about the extractive summarization of
meeting speech, using the ICSI and AMI corpora. In the
first set of experiments we use prosodic, lexical,
structural and speaker-related features to select the
most informative dialogue acts from each meeting, with
the hypothesis being that such a rich mixture of
features will yield the best results. In the second
part, we present an approach in which the
identification of ``meta-comments'' is used to create
more informative summaries that provide an increased
level of abstraction. We find that the inclusion of
these meta comments improves summarization performance
according to several evaluation metrics.},
doi = {10.1007/978-3-540-85853-9_22},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008a.pdf},
url = {http://dx.doi.org/10.1007/978-3-540-85853-9_22},
year = 2008
}
@inproceedings{moeller:08,
author = {Sebastian M\"oller and Florian G\"odde and Maria
Wolters},
title = {A Corpus Analysis of Spoken Smart-Home Interactions
with Older Users},
booktitle = {Proceedings of the 6th International Conference on
Language Resources and Evaluation},
year = 2008
}
@inproceedings{king:tokuda:zen:yamagishi:interspeech2008,
author = {Simon King and Keiichi Tokuda and Heiga Zen and
Junichi Yamagishi},
title = {Unsupervised adaptation for HMM-based speech synthesis},
booktitle = {Proc. Interspeech},
pages = {1869--1872},
address = {Brisbane, Australia},
abstract = {It is now possible to synthesise speech using HMMs
with a comparable quality to unit-selection techniques.
Generating speech from a model has many potential
advantages over concatenating waveforms. The most
exciting is model adaptation. It has been shown that
supervised speaker adaptation can yield high-quality
synthetic voices with an order of magnitude less data
than required to train a speaker-dependent model or to
build a basic unit-selection system. Such supervised
methods require labelled adaptation data for the target
speaker. In this paper, we introduce a method capable
of unsupervised adaptation, using only speech from the
target speaker without any labelling.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
trajectory HMMs, speaker adaptation, MLLR},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080299.PDF},
year = 2008
}
@inproceedings{toth:frankel:goztolya:king:interspeech2008,
author = {Laszlo Toth and Joe Frankel and Gabor Gosztolya and
Simon King},
title = {Cross-lingual Portability of MLP-Based Tandem Features
-- A Case Study for English and Hungarian},
booktitle = {Proc. Interspeech},
pages = {2695--2698},
address = {Brisbane, Australia},
abstract = {One promising approach for building ASR systems for
less-resourced languages is cross-lingual adaptation.
Tandem ASR is particularly well suited to such
adaptation, as it includes two cascaded modelling
steps: feature extraction using multi-layer perceptrons
(MLPs), followed by modelling using a standard HMM. The
language-specific tuning can be performed by adjusting
the HMM only, leaving the MLP untouched. Here we
examine the portability of feature extractor MLPs
between an Indo-European (English) and a Finno-Ugric
(Hungarian) language. We present experiments which use
both conventional phone-posterior and articulatory
feature (AF) detector MLPs, both trained on a much
larger quantity of (English) data than the monolingual
(Hungarian) system. We find that the cross-lingual
configurations achieve similar performance to the
monolingual system, and that, interestingly, the AF
detectors lead to slightly worse performance, despite
the expectation that they should be more
language-independent than phone-based MLPs. However,
the cross-lingual system outperforms all other
configurations when the English phone MLP is adapted on
the Hungarian data. },
keywords = {tandem, ASR},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080729.PDF},
year = 2008
}
@inproceedings{garau2008a,
author = {Garau, Giulia and Renals, Steve},
title = {Pitch adaptive features for {LVCSR}},
booktitle = {Proc. Interspeech '08},
abstract = {We have investigated the use of a pitch adaptive
spectral representation on large vocabulary speech
recognition, in conjunction with speaker normalisation
techniques. We have compared the effect of a smoothed
spectrogram to the pitch adaptive spectral analysis by
decoupling these two components of STRAIGHT.
Experiments performed on a large vocabulary meeting
speech recognition task highlight the importance of
combining a pitch adaptive spectral representation with
a conventional fixed window spectral analysis. We found
evidence that STRAIGHT pitch adaptive features are more
speaker independent than conventional MFCCs without
pitch adaptation, thus they also provide better
performance when combined using feature combination
techniques such as Heteroscedastic Linear Discriminant
Analysis.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/garau2008a.pdf},
year = 2008
}
@article{tejedor:wang:frankel:king:colas:specom2008,
author = {Javier Tejedor and Dong Wang and Joe Frankel and Simon
King and José Colás},
title = {A comparison of grapheme and phoneme-based units for
{S}panish spoken term detection},
journal = {Speech Communication},
volume = {50},
number = {11-12},
pages = {980--991},
abstract = {The ever-increasing volume of audio data available
online through the world wide web means that automatic
methods for indexing and search are becoming essential.
Hidden Markov model (HMM) keyword spotting and lattice
search techniques are the two most common approaches
used by such systems. In keyword spotting, models or
templates are defined for each search term prior to
accessing the speech and used to find matches. Lattice
search (referred to as spoken term detection), uses a
pre-indexing of speech data in terms of word or
sub-word units, which can then quickly be searched for
arbitrary terms without referring to the original
audio. In both cases, the search term can be modelled
in terms of sub-word units, typically phonemes. For
in-vocabulary words (i.e. words that appear in the
pronunciation dictionary), the letter-to-sound
conversion systems are accepted to work well. However,
for out-of-vocabulary (OOV) search terms,
letter-to-sound conversion must be used to generate a
pronunciation for the search term. This is usually a
hard decision (i.e. not probabilistic and with no
possibility of backtracking), and errors introduced at
this step are difficult to recover from. We therefore
propose the direct use of graphemes (i.e., letter-based
sub-word units) for acoustic modelling. This is
expected to work particularly well in languages such as
Spanish, where despite the letter-to-sound mapping
being very regular, the correspondence is not
one-to-one, and there will be benefits from avoiding
hard decisions at early stages of processing. In this
article, we compare three approaches for Spanish
keyword spotting or spoken term detection, and within
each of these we compare acoustic modelling based on
phone and grapheme units. Experiments were performed
using the Spanish geographical-domain Albayzin corpus.
Results achieved in the two approaches proposed for
spoken term detection show us that trigrapheme units
for acoustic modelling match or exceed the performance
of phone-based acoustic models. In the method proposed
for keyword spotting, the results achieved with each
acoustic model are very similar.},
categories = {Spoken term detection; Keyword spotting; Graphemes;
Spanish},
doi = {10.1016/j.specom.2008.03.005},
month = {November-December},
year = 2008
}
@inproceedings{morgan:08,
author = {Maggie Morgan and Marilyn R. McGee-Lennon and Nick
Hine and John Arnott and Chris Martin and Julia S.
Clark and Maria Wolters},
title = {Requirements Gathering with Diverse User Groups and
Stakeholders},
booktitle = {Proc. 26th Conference on Computer-Human Interaction,
Florence},
year = 2008
}
@inproceedings{hts2007-icassp,
author = {Junichi Yamagishi and Takashi Nose and Heiga Zen and
Tomoki Toda and Keiichi Tokuda},
title = {Performance Evaluation of the Speaker-Independent
{HMM}-based Speech Synthesis System "{HTS}-2007" for
the {Blizzard Challenge 2007}},
booktitle = {Proc. ICASSP 2008},
pages = {3957--3960},
address = {Las Vegas, U.S.A},
abstract = {This paper describes a speaker-independent/adaptive
HMM-based speech synthesis system developed for the
Blizzard Challenge 2007. The new system, named
HTS-2007, employs speaker adaptation
(CSMAPLR+MAP), feature-space adaptive training,
mixed-gender modeling, and full-covariance modeling
using CSMAPLR transforms, in addition to several other
techniques that have proved effective in our previous
systems. Subjective evaluation results show that the
new system generates significantly better quality
synthetic speech than that of speaker-dependent
approaches with realistic amounts of speech data, and
that it bears comparison with speaker-dependent
approaches even when large amounts of speech data are
available.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
speaker adaptation, voice conversion, average voice},
doi = {10.1109/ICASSP.2008.4518520},
key = {hts2007-icassp},
month = apr,
year = 2008
}
@inproceedings{karaiskos:king:clark:mayo:blizzard2008,
author = {Vasilis Karaiskos and Simon King and Robert A. J.
Clark and Catherine Mayo},
title = {The Blizzard Challenge 2008},
booktitle = {Proc. Blizzard Challenge Workshop},
address = {Brisbane, Australia},
abstract = {The Blizzard Challenge 2008 was the fourth annual
Blizzard Challenge. This year, participants were asked
to build two voices from a UK English corpus and one
voice from a Mandarin Chinese corpus. This is the
first time that a language other than English has been
included and also the first time that a large UK
English corpus has been available. In addition, the
English corpus contained somewhat more expressive
speech than that found in corpora used in previous
Blizzard Challenges. To assist participants with
limited resources or limited experience in
UK-accented English or Mandarin, unaligned labels
were provided for both corpora and for the test
sentences. Participants could use the provided labels
or create their own. An accent-specific pronunciation
dictionary was also available for the English speaker.
A set of test sentences was released to participants,
who were given a limited time in which to synthesise
them and submit the synthetic speech. An online
listening test was conducted, to evaluate
naturalness, intelligibility and degree of similarity
to the original speaker.},
keywords = {Blizzard},
month = {September},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/summary_Blizzard2008.pdf},
year = 2008
}
@inproceedings{huang2008-ptkl,
author = {Songfang Huang and Steve Renals},
title = {Using Participant Role in Multiparty Meetings as Prior
Knowledge for Nonparametric Topic Modeling},
booktitle = {Proc. ICML/UAI/COLT Workshop on Prior Knowledge for
Text and Language Processing},
pages = {21--24},
address = {Helsinki, Finland},
abstract = {In this paper we introduce our attempts to incorporate
the participant role information in multiparty meetings
for document modeling using the hierarchical Dirichlet
process. The perplexity and automatic speech
recognition results demonstrate that the participant
role information is a promising prior knowledge source
to be combined with language models for automatic
speech recognition and interaction modeling for
multiparty meetings.},
month = jul,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ptkl.pdf},
year = 2008
}
@inproceedings{hts2008,
author = {Junichi Yamagishi and Heiga Zen and Yi-Jian Wu and
Tomoki Toda and Keiichi Tokuda},
title = {The {HTS}-2008 System: Yet Another Evaluation of the
Speaker-Adaptive {HMM}-based Speech Synthesis System in
The {2008 Blizzard Challenge}},
booktitle = {Proc. Blizzard Challenge 2008},
address = {Brisbane, Australia},
abstract = {For the 2008 Blizzard Challenge, we used the same
speaker-adaptive approach to HMM-based speech synthesis
that was used in the HTS entry to the 2007 challenge,
but an improved system was built in which the
multi-accented English average voice model was trained
on 41 hours of speech data with high-order mel-cepstral
analysis using an efficient forward-backward algorithm
for the HSMM. The listener evaluation scores for the
synthetic speech generated from this system were much
better than in 2007: the system had the equal best
naturalness on the small English data set and the equal
best intelligibility on both small and large data sets
for English, and had the equal best naturalness on the
Mandarin data. In fact, the English system was found to
be as intelligible as human speech.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
speaker adaptation, voice conversion, average voice,
Blizzard Challenge},
key = {hts2008},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/HTS2008.pdf},
year = 2008
}
@inproceedings{bell_king_lineSearch_is2008,
author = {Bell, Peter and King, Simon},
title = {Covariance Updates for Discriminative Training by
Constrained Line Search},
booktitle = {Proc. Interspeech},
address = {Brisbane, Australia},
abstract = {We investigate the recent Constrained Line Search
algorithm for discriminative training of HMMs and
propose an alternative formula for variance update. We
compare the method to standard techniques on a phone
recognition task.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/lineSearch_is2008.pdf},
year = 2008
}
@inproceedings{leo_08-1,
author = {Leonardo Badino and Robert A.J. Clark},
title = {Automatic labeling of contrastive word pairs from
spontaneous spoken English},
booktitle = {2008 IEEE/ACL Workshop on Spoken Language
Technology},
address = {Goa, India},
abstract = {This paper addresses the problem of automatically
labeling contrast in spontaneous spoken speech, where
contrast here is meant as a relation that ties two
words that explicitly contrast with each other.
Detection of contrast is certainly relevant in the
analysis of discourse and information structure and
also, because of the prosodic correlates of contrast,
could play an important role in speech applications,
such as text-to-speech synthesis, that need an accurate
and discourse context related modeling of prosody. With
this prospect we investigate the feasibility of
automatic contrast labeling by training and evaluating
on the Switchboard corpus a novel contrast tagger,
based on Support Vector Machines (SVM), that combines
lexical features, syntactic dependencies and WordNet
semantic relations.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/0000101.pdf},
year = 2008
}
@article{dielmann2008,
author = {Dielmann, Alfred and Renals, Steve},
title = {Recognition of Dialogue Acts in Multiparty Meetings
using a Switching {DBN}},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
volume = {16},
number = {7},
pages = {1303--1314},
abstract = {This paper is concerned with the automatic recognition
of dialogue acts (DAs) in multiparty conversational
speech. We present a joint generative model for DA
recognition in which segmentation and classification of
DAs are carried out in parallel. Our approach to DA
recognition is based on a switching dynamic Bayesian
network (DBN) architecture. This generative approach
models a set of features, related to lexical content
and prosody, and incorporates a weighted interpolated
factored language model. The switching DBN coordinates
the recognition process by integrating the component
models. The factored language model, which is estimated
from multiple conversational data corpora, is used in
conjunction with additional task-specific language
models. In conjunction with this joint generative
model, we have also investigated the use of a
discriminative approach, based on conditional random
fields, to perform a reclassification of the segmented
DAs. We have carried out experiments on the AMI corpus
of multimodal meeting recordings, using both manually
transcribed speech, and the output of an automatic
speech recognizer, and using different configurations
of the generative model. Our results indicate that the
system performs well both on reference and fully
automatic transcriptions. A further significant
improvement in recognition accuracy is obtained by the
application of the discriminative reranking approach
based on conditional random fields.},
doi = {10.1109/TASL.2008.922463},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/dielmann2008.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4599391&arnumber=4497831&count=18&index=9},
year = 2008
}
@inproceedings{bourlard2008,
author = {Bourlard, Hervé and Renals, Steve},
title = {Recognition and Understanding of Meetings: Overview of
the {European} {AMI} and {AMIDA} Projects},
booktitle = {Proc. LangTech 2008},
abstract = {The AMI and AMIDA projects are concerned with the
recognition and interpretation of multiparty
(face-to-face and remote) meetings. Within these
projects we have developed the following: (1) an
infrastructure for recording meetings using multiple
microphones and cameras; (2) a one hundred hour,
manually annotated meeting corpus; (3) a number of
techniques for indexing and summarizing meeting
videos using automatic speech recognition and computer
vision; and (4) an extensible framework for browsing
and searching meeting videos. We give an overview of
the various techniques developed in AMI (mainly
involving face-to-face meetings), their integration
into our meeting browser framework, and future plans
for AMIDA (Augmented Multiparty Interaction with
Distant Access), the follow-up project to AMI.
Technical and business information related to these two
projects can be found at www.amiproject.org, on the
Scientific and Business portals respectively.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bourlard2008.pdf},
year = 2008
}