2007.bib
@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2007-citations -ob /home/korin/projects/publications/new_output/transitdata/2007.bib -c 'year : "2007"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{frankel07:AF_MLP,
author = {Frankel, J. and Magimai-Doss, M. and King, S. and
Livescu, K. and Çetin, Ö.},
title = {Articulatory Feature Classifiers Trained on 2000 hours
of Telephone Speech},
booktitle = {Proc. Interspeech},
address = {Antwerp, Belgium},
abstract = {This paper is intended to advertise the public
availability of the articulatory feature (AF)
classification multi-layer perceptrons (MLPs) which
were used in the Johns Hopkins 2006 summer workshop. We
describe the design choices, data preparation, AF label
generation, and the training of MLPs for feature
classification on close to 2000 hours of telephone
speech. In addition, we present some analysis of the
MLPs in terms of classification accuracy and confusions
along with a brief summary of the results obtained
during the workshop using the MLPs. We invite
interested parties to make use of these MLPs.},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/frankel_AF-MLP.pdf},
year = 2007
}
@inproceedings{wolters-icphs:07,
author = {Maria Wolters and Pauline Campbell and Christine
DePlacido and Amy Liddell and David Owens},
title = {The Effect of Hearing Loss on the Intelligibility of
Synthetic Speech},
booktitle = {Proc. Intl. Conf. Phon. Sci.},
abstract = {Many factors affect the intelligibility of synthetic
speech. One aspect that has been severely neglected in
past work is hearing loss. In this study, we
investigate whether pure-tone audiometry thresholds
across a wide range of frequencies (0.25--20kHz) are
correlated with participants’ performance on a simple
task that involves accurately recalling and processing
reminders. Participants’ scores correlate not only with
thresholds in the frequency ranges commonly associated
with speech, but also with extended high-frequency
thresholds.},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/WoltersetalICPhS2007.pdf},
year = 2007
}
@inproceedings{jyamagis07:avss2006,
author = {Junichi Yamagishi and Takao Kobayashi and Steve Renals
and Simon King and Heiga Zen and Tomoki Toda and
Keiichi Tokuda },
title = {Improved Average-Voice-based Speech Synthesis Using
Gender-Mixed Modeling and a Parameter Generation
Algorithm Considering {GV}},
booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
abstract = {For constructing a speech synthesis system which can
achieve diverse voices, we have been developing a
speaker-independent approach to HMM-based speech
synthesis in which statistical average voice models are
adapted to a target speaker using a small amount of
speech data. In this paper, we incorporate a
high-quality speech vocoding method, STRAIGHT, and a
parameter generation algorithm with global variance
into the system to improve the quality of synthetic
speech. Furthermore, we introduce a feature-space
speaker adaptive training algorithm and a gender-mixed
modeling technique for conducting further normalization
of the average voice model. We build an English
text-to-speech system using these techniques and show
the performance of the system.},
categories = {HMM, speech synthesis, speaker adaptation, HTS},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6-yamagishi.pdf},
year = 2007
}
@inproceedings{leo_07-1,
author = {Leonardo Badino and Robert A.J. Clark},
title = {Issues of Optionality in Pitch Accent Placement},
booktitle = {Proc. 6th ISCA Speech Synthesis Workshop},
address = {Bonn, Germany},
abstract = {When comparing the prosodic realization of different
English speakers reading the same text, a significant
disagreement is usually found amongst the pitch accent
patterns of the speakers. Assuming that such
disagreement is due to a partial optionality of pitch
accent placement, it has been recently proposed to
evaluate pitch accent predictors by comparing them with
multi-speaker reference data. In this paper we address
the issue of pitch accent optionality at different
levels. First we propose a simple mathematical
definition of intra-speaker optionality which allows us
to introduce a function for evaluating pitch accent
predictors which we show to be more accurate and robust
than those used in previous work. Subsequently we
compare a pitch accent predictor trained on
single-speaker data with a predictor trained on
multi-speaker data in order to highlight the large
overlap between intra-speaker and inter-speaker
optionality. Finally, we show our successful results in
predicting intra-speaker optionality and we suggest how
this achievement could be exploited to improve the
performance of a unit selection text-to-speech (TTS)
synthesis system.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6_252.pdf},
year = 2007
}
@article{beaver:07,
author = {David Beaver and Brady Zack Clark and Edward Flemming
and T. Florian Jaeger and Maria Wolters},
title = {When Semantics meets Phonetics: {A}coustical studies
of second occurrence focus},
journal = {Language},
volume = 83,
number = 2,
pages = {245--276},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/BeaverLanguage2007.pdf},
year = 2007
}
@inproceedings{wolters-ssw:07,
author = {Maria Wolters and Pauline Campbell and Christine
DePlacido and Amy Liddell and David Owens},
title = {Making Synthetic Speech Accessible to Older People},
booktitle = {Proc. Sixth ISCA Workshop on Speech Synthesis, Bonn,
Germany },
abstract = {In this paper, we report on an experiment that tested
users’ ability to understand the content of spoken
auditory reminders. Users heard meeting reminders and
medication reminders spoken in both a natural and a
synthetic voice. Our results show that older users can
understand synthetic speech as well as younger users
provided that the prompt texts are well-designed, using
familiar words and contextual cues. As soon as
unfamiliar and complex words are introduced, users’
hearing affects how well they can understand the
synthetic voice, even if their hearing would pass
common screening tests for speech synthesis
experiments. Although hearing thresholds correlate best
with users’ performance, central auditory processing
may also influence performance, especially when complex
errors are made.},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/WoltersetalSSW2007.pdf},
year = 2007
}
@inproceedings{hirai07:5ms2007,
author = {Toshio Hirai and Junichi Yamagishi and Seiichi Tenpaku
},
title = {Utilization of an {HMM}-Based Feature Generation
Module in 5 ms Segment Concatenative Speech Synthesis},
booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
abstract = {If a concatenative speech synthesis system uses a
larger number of short speech segments, its potential
to generate natural speech increases because greater
variation is available at the concatenation points.
Recently, a synthesis approach was proposed in which
very short (5 ms) segments are used. In this paper, we
describe the implementation of an HMM-based feature
generation module in a very short segment
concatenative synthesis system, which has the advantage
of modularity, together with a synthesis experiment.},
categories = {speech synthesis, HTS, hybrid algorithm},
month = aug,
year = 2007
}
@incollection{dielmann-mlmi06,
author = {A. Dielmann and S. Renals},
title = {Automatic Dialogue Act Recognition using a Dynamic
{Bayesian} Network},
booktitle = {Proc. Multimodal Interaction and Related Machine
Learning Algorithms Workshop (MLMI--06)},
publisher = {Springer},
editor = {S. Renals and S. Bengio and J. Fiscus},
pages = {178--189},
abstract = {We propose a joint segmentation and classification
approach for the dialogue act recognition task on
natural multi-party meetings ({ICSI} Meeting Corpus).
Five broad DA categories are automatically recognised
using a generative Dynamic {Bayesian} Network based
infrastructure. Prosodic features and a switching
graphical model are used to estimate DA boundaries, in
conjunction with a factored language model which is
used to relate words and DA categories. This easily
generalizable and extensible system promotes a rational
approach to the joint DA segmentation and recognition
task, and is capable of good recognition performance.},
categories = {ami,dialogue act,dbn,factored language
model,meetings,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-mlmi06.pdf},
year = 2007
}
@article{frankel07:factoring,
author = {Frankel, J. and King, S.},
title = {Factoring {G}aussian Precision Matrices for Linear
Dynamic Models},
journal = {Pattern Recognition Letters},
volume = {28},
number = {16},
pages = {2264--2272},
abstract = {The linear dynamic model (LDM), also known as the
Kalman filter model, has been the subject of research
in the engineering, control, and more recently, machine
learning and speech technology communities. The
Gaussian noise processes are usually assumed to have
diagonal, or occasionally full, covariance matrices. A
number of recent papers have considered modelling the
precision rather than covariance matrix of a Gaussian
distribution, and this work applies such ideas to the
LDM. A Gaussian precision matrix $P$ can be factored
into the form $P = U^{T}SU$, where $U$ is a transform
and $S$ a diagonal matrix. By varying the form of $U$, the
covariance can be specified as being diagonal or full,
or used to model a given set of spatial dependencies.
Furthermore, the transform and scaling components can
be shared between models, allowing richer distributions
with only marginally more parameters than required to
specify diagonal covariances. The method described in
this paper allows the construction of models with an
appropriate number of parameters for the amount of
available training data. We provide illustrative
experimental results on synthetic and real speech data
in which models with factored precision matrices and
automatically-selected numbers of parameters are as
good as or better than models with diagonal covariances
on small data sets and as good as models with full
covariance matrices on larger data sets.},
categories = {LDM},
doi = {10.1016/j.patrec.2007.07.008},
month = {December},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_LDM_covar.pdf},
year = 2007
}
@inproceedings{cetin07:crosslingual,
author = {Çetin, Ö. and Magimai-Doss, M. and Kantor, A. and
King, S. and Bartels, C. and Frankel, J. and Livescu,
K.},
title = {Monolingual and crosslingual comparison of tandem
features derived from articulatory and phone {MLP}s},
booktitle = {Proc. ASRU},
address = {Kyoto},
organization = {IEEE},
abstract = {In recent years, the features derived from posteriors
of a multilayer perceptron (MLP), known as tandem
features, have proven to be very effective for
automatic speech recognition. Most tandem features to
date have relied on MLPs trained for phone
classification. We recently showed on a relatively
small data set that MLPs trained for articulatory
feature classification can be equally effective. In
this paper, we provide a similar comparison using MLPs
trained on a much larger data set -- 2000 hours of
English conversational telephone speech. We also
explore how portable phone-based and articulatory
feature-based tandem features are in an entirely
different language -- Mandarin -- without any retraining. We find
that while phone-based features perform slightly better
in the matched-language condition, they perform
significantly better in the cross-language condition.
Yet, in the cross-language condition, neither approach
is as effective as the tandem features extracted from
an MLP trained on a relatively small amount of
in-domain data. Beyond feature concatenation, we also
explore novel observation modelling schemes that allow
for greater flexibility in combining the tandem and
standard features at hidden Markov model (HMM) outputs.},
month = {December},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Cetin_etal_ASRU2007.pdf},
year = 2007
}
@inproceedings{clark:podsiadlo:mayo:king:blizzard2007,
author = {Robert A. J. Clark and Monika Podsiadlo and Mark
Fraser and Catherine Mayo and Simon King },
title = {Statistical Analysis of the {B}lizzard {C}hallenge
2007 Listening Test Results },
booktitle = {Proc. Blizzard 2007 (in Proc. Sixth {ISCA} Workshop on
Speech Synthesis)},
address = {Bonn, Germany},
abstract = {Blizzard 2007 is the third Blizzard Challenge, in
which participants build voices from a common dataset.
A large listening test is conducted which allows
comparison of systems in terms of naturalness and
intelligibility. New sections were added to the
listening test for 2007 to test the perceived
similarity of the speaker's identity between natural
and synthetic speech. In this paper, we present the
results of the listening test and the subsequent
statistical analysis. },
categories = {blizzard,listening test},
keywords = {Blizzard},
month = {August},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_003.pdf},
year = 2007
}
@inproceedings{wolters-interspeech:07,
author = {Maria Wolters and Pauline Campbell and Christine
DePlacido and Amy Liddell and David Owens},
title = {The Role of Outer Hair Cell Function in the Perception
of Synthetic versus Natural Speech},
booktitle = {Proc. Interspeech},
abstract = {Hearing loss as assessed by pure-tone audiometry (PTA)
is significantly correlated with the intelligibility of
synthetic speech. However, PTA is a subjective
audiological measure that assesses the entire auditory
pathway and does not discriminate between the different
afferent and efferent contributions. In this paper, we
focus on one particular aspect of hearing that has been
shown to correlate with hearing loss: outer hair cell
(OHC) function. One role of OHCs is to increase
sensitivity and frequency selectivity. This function of
OHCs can be assessed quickly and objectively through
otoacoustic emissions (OAE) testing, which is little
known outside the field of audiology. We find that OHC
function affects the perception of human speech, but
not that of synthetic speech. This has important
implications not just for audiological and
electrophysiological research, but also for adapting
speech synthesis to ageing ears.},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/WoltersetalInterspeech2007.pdf},
year = 2007
}
@article{frankel07:AF_DBN,
author = {Frankel, J. and Wester, M. and King, S.},
title = {Articulatory feature recognition using dynamic
{B}ayesian networks},
journal = {Computer Speech \& Language},
volume = {21},
number = {4},
pages = {620--640},
abstract = {We describe a dynamic Bayesian network for
articulatory feature recognition. The model is intended
to be a component of a speech recognizer that avoids
the problems of conventional ``beads-on-a-string''
phoneme-based models. We demonstrate that the model
gives superior recognition of articulatory features
from the speech signal compared with a state-of-the-art
neural network system. We also introduce a training
algorithm that offers two major advances: it does not
require time-aligned feature labels and it allows the
model to learn a set of asynchronous feature changes in
a data-driven manner.},
month = oct,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_etal_CSL2007.pdf},
year = 2007
}
@article{JunichiHTS06,
author = {Junichi Yamagishi and Takao Kobayashi},
title = {Average-Voice-based Speech Synthesis using {HSMM}-based
Speaker Adaptation and Adaptive Training},
journal = {IEICE Trans. Information and Systems},
volume = {E90-D},
number = 2,
pages = {533--543},
abstract = {In speaker adaptation for speech synthesis, it is
desirable to convert both voice characteristics and
prosodic features such as F0 and phone duration. For
simultaneous adaptation of spectrum, F0 and phone
duration within the HMM framework, we need to transform
not only the state output distributions corresponding
to spectrum and F0 but also the duration distributions
corresponding to phone duration. However, it is not
straightforward to adapt the state duration because the
original HMM does not have explicit duration
distributions. Therefore, we utilize the framework of
the hidden semi-Markov model (HSMM), which is an HMM
having explicit state duration distributions, and we
apply an HSMM-based model adaptation algorithm to
simultaneously transform both the state output and
state duration distributions. Furthermore, we propose
an HSMM-based adaptive training algorithm to
simultaneously normalize the state output and state
duration distributions of the average voice model. We
incorporate these techniques into our HSMM-based speech
synthesis system, and show their effectiveness from the
results of subjective and objective evaluation tests.},
month = feb,
year = 2007
}
@inproceedings{Cetin07:tandem,
author = {Çetin, Ö. and Kantor, A. and King, S. and Bartels,
C. and Magimai-Doss, M. and Frankel, J. and Livescu, K.},
title = {An articulatory feature-based tandem approach and
factored observation modeling},
booktitle = {Proc. ICASSP},
address = {Honolulu},
abstract = {The so-called tandem approach, where the posteriors of
a multilayer perceptron (MLP) classifier are used as
features in an automatic speech recognition (ASR)
system has proven to be a very effective method. Most
tandem approaches to date have relied on MLPs trained
for phone classification, appending the posterior
features to the standard features of a hidden
Markov model (HMM) system. In this paper, we develop an
alternative tandem approach based on MLPs trained for
articulatory feature (AF) classification. We also
develop a factored observation model for characterizing
the posterior and standard features at the HMM outputs,
allowing for separate hidden mixture and state-tying
structures for each factor. In experiments on a subset
of Switchboard, we show that the AF-based tandem
approach is as effective as the phone-based approach,
and that the factored observation model significantly
outperforms the simple feature concatenation approach
while using fewer parameters.},
month = {April},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Cetin_icassp07_tandem.pdf},
year = 2007
}
@inproceedings{fraser:king:blizzard2007,
author = {Mark Fraser and Simon King},
title = {The {B}lizzard {C}hallenge 2007},
booktitle = {Proc. Blizzard 2007 (in Proc. Sixth ISCA Workshop on
Speech Synthesis)},
address = {Bonn, Germany},
abstract = {In Blizzard 2007, the third Blizzard Challenge,
participants were asked to build voices from a dataset,
a defined subset and, following certain constraints, a
subset of their choice. A set of test sentences was
then released to be synthesised. An online evaluation
of the submitted synthesised sentences focused on
naturalness and intelligibility, and added new
sections for degree of similarity to the original speaker,
and similarity in terms of naturalness of pairs of
sentences from different systems. We summarise this
year's Blizzard Challenge and look ahead to possible
designs for Blizzard 2008 in the light of participant
and listener feedback. },
categories = {blizzard, listening test},
keywords = {Blizzard},
month = {August},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_001.pdf},
year = 2007
}
@article{nose07:mrhsmm,
author = {Takashi Nose and Junichi Yamagishi and Takao Kobayashi},
title = {A Style Control Technique for {HMM}-based Expressive
Speech Synthesis},
journal = {IEICE Trans. Information and Systems},
volume = {E90-D},
number = 9,
pages = {1406--1413},
abstract = {This paper describes a technique for controlling the
degree of expressivity of a desired emotional
expression and/or speaking style of synthesized speech
in an HMM-based speech synthesis framework. With this
technique, multiple emotional expressions and speaking
styles of speech are modeled in a single model by using
a multiple-regression hidden semi-Markov model
(MRHSMM). A set of control parameters, called the style
vector, is defined, and each speech synthesis unit is
modeled by using the MRHSMM, in which mean parameters
of the state output and duration distributions are
expressed by multiple-regression of the style vector.
In the synthesis stage, the mean parameters of the
synthesis units are modified by transforming an
arbitrarily given style vector that corresponds to a
point in a low-dimensional space, called style space,
each of whose coordinates represents a certain specific
speaking style or emotion of speech. The results of
subjective evaluation tests show that style and its
intensity can be controlled by changing the style
vector.},
categories = {HMM-based speech synthesis, speaking style, emotional
expression, style interpolation, hidden semi-Markov
model (HSMM)},
month = sep,
url = {http://search.ieice.org/bin/summary.php?id=e90-d_9_1406&category=D&lang=E&year=2007&abst=},
year = 2007
}
@incollection{huang2007-mlmi,
author = {Huang, Songfang and Renals, Steve},
title = {Modeling Prosodic Features in Language Models for
Meetings},
booktitle = {Machine Learning for Multimodal Interaction IV},
publisher = {Springer},
editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
volume = {4892},
series = {Lecture Notes in Computer Science},
pages = {191--202},
abstract = {Prosody has been actively studied as an important
knowledge source for speech recognition and
understanding. In this paper, we are concerned with the
question of exploiting prosody for language models to
aid automatic speech recognition in the context of
meetings. Using an automatic syllable detection
algorithm, the syllable-based prosodic features are
extracted to form the prosodic representation for each
word. Two modeling approaches are then investigated.
One is based on a factored language model, which
directly uses the prosodic representation and treats it
as a `word'. Instead of direct association, the second
approach provides a richer probabilistic structure
within a hierarchical Bayesian framework by introducing
an intermediate latent variable to represent similar
prosodic patterns shared by groups of words. Four-fold
cross-validation experiments on the ICSI Meeting Corpus
show that exploiting prosody for language modeling can
significantly reduce the perplexity, and also yield
marginal reductions in word error rate.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/mlmi07.pdf},
year = 2007
}
@inproceedings{livescu07:JHU_summary,
author = {Livescu, K. and Çetin, Ö. and Hasegawa-Johnson, M.
and King, S. and Bartels, C. and Borges, N. and Kantor,
A. and Lal, P. and Yung, L. and Bezman, A. and
Dawson-Haggerty, S. and Woods, B. and Frankel, J. and
Magimai-Doss, M. and Saenko, K.},
title = {Articulatory feature-based methods for acoustic and
audio-visual speech recognition: {S}ummary from the
2006 {JHU} {S}ummer {W}orkshop},
booktitle = {Proc. ICASSP},
address = {Honolulu},
abstract = {We report on investigations, conducted at the 2006
Johns Hopkins Workshop, into the use of articulatory
features (AFs) for observation and pronunciation models
in speech recognition. In the area of observation
modeling, we use the outputs of AF classifiers both
directly, in an extension of hybrid HMM/neural network
models, and as part of the observation vector, an
extension of the tandem approach. In the area of
pronunciation modeling, we investigate a model having
multiple streams of AF states with soft synchrony
constraints, for both audio-only and audio-visual
recognition. The models are implemented as dynamic
Bayesian networks, and tested on tasks from the
Small-Vocabulary Switchboard (SVitchboard) corpus and
the CUAVE audio-visual digits corpus. Finally, we
analyze AF classification and forced alignment using a
newly collected set of feature-level manual
transcriptions.},
month = {April},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/livescu_icassp07_sum.pdf},
year = 2007
}
@inproceedings{avss-icassp07,
author = {J. Yamagishi and T. Kobayashi and M. Tachibana and K.
Ogata and Y. Nakano},
title = {Model adaptation approach to speech synthesis with
diverse voices and styles},
booktitle = {Proc. ICASSP},
pages = {1233--1236},
abstract = {In human-computer interaction and dialogue systems, it
is often desirable for text-to-speech synthesis to be
able to generate natural sounding speech with an
arbitrary speaker's voice and with varying speaking
styles and/or emotional expressions. We have developed
an average-voice-based speech synthesis method using
statistical average voice models and model adaptation
techniques for this purpose. In this paper, we describe
an overview of the speech synthesis system and show the
current performance with several experimental results.},
year = 2007
}
@inproceedings{jaimes2007,
author = {Jaimes, Alejandro and Bourlard, Hervé and Renals,
Steve and Carletta, Jean},
title = {Recording, Indexing, Summarizing, and Accessing
Meeting Videos: An Overview of the {AMI} Project},
booktitle = {Proc. IEEE ICIAPW},
pages = {59--64},
abstract = {In this paper we give an overview of the AMI project.
AMI developed the following: (1) an infrastructure for
recording meetings using multiple microphones and
cameras; (2) a one hundred hour, manually annotated
meeting corpus; (3) a number of techniques for
indexing and summarizing meeting videos using
automatic speech recognition and computer vision; and
(4) an extensible framework for browsing and searching
meeting videos. We give an overview of the various
techniques developed in AMI, their integration into our
meeting browser framework, and future plans for AMIDA
(Augmented Multiparty Interaction with Distant Access),
the follow-up project to AMI.},
doi = {10.1109/ICIAPW.2007.36},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/jaimes2007.pdf},
url = {http://ieeexplore.ieee.org/search/srchabstract.jsp?arnumber=4427477&isnumber=4427459&punumber=4427458&k2dockey=4427477@ieeecnfs&query=%28+%28%28renals%29%3Cin%3Eau+%29+%29+%3Cand%3E+%28pyr+%3E%3D+2006+%3Cand%3E+pyr+%3C%3D+2008%29&pos=6&access=no},
year = 2007
}
@inproceedings{zen07:hts-2,
author = {Heiga Zen and Takashi Nose and Junichi Yamagishi and
Shinji Sako and Takashi Masuko and Alan Black and
Keiichi Tokuda},
title = {The {HMM}-based speech synthesis system ({HTS})
version 2.0},
booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
abstract = {A statistical parametric speech synthesis system based
on hidden Markov models (HMMs) has grown in popularity
over the last few years. This system simultaneously
models spectrum, excitation, and duration of speech
using context-dependent HMMs and generates speech
waveforms from the HMMs themselves. Since December
2002, we have publicly released an open-source software
toolkit named HMM-based speech synthesis system (HTS)
to provide a research and development platform for the
speech synthesis community. In December 2006, HTS
version 2.0 was released. This version includes a
number of new features which are useful for both speech
synthesis researchers and developers. This paper
describes HTS version 2.0 in detail, as well as future
release plans.},
categories = {HMM, speech synthesis, HTS},
month = aug,
year = 2007
}
@inproceedings{renals2007,
author = {Renals, Steve and Hain, Thomas and Bourlard, Hervé},
title = {Recognition and interpretation of meetings: The {AMI}
and {AMIDA} projects},
booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
and Understanding (ASRU '07)},
abstract = {The AMI and AMIDA projects are concerned with the
recognition and interpretation of multiparty meetings.
Within these projects we have: developed an
infrastructure for recording meetings using multiple
microphones and cameras; released a 100 hour annotated
corpus of meetings; developed techniques for the
recognition and interpretation of meetings based
primarily on speech recognition and computer vision;
and developed an evaluation framework at both component
and system levels. In this paper we present an overview
of these projects, with an emphasis on speech
recognition and content extraction. },
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ami-asru2007.pdf},
year = 2007
}
@inproceedings{dielmann-icassp07,
author = {A. Dielmann and S. Renals},
title = {{DBN} based joint Dialogue Act recognition of
multiparty meetings},
booktitle = {Proc. IEEE ICASSP},
volume = 4,
pages = {133--136},
abstract = {Joint Dialogue Act segmentation and classification of
the new {AMI} meeting corpus has been performed through
an integrated framework based on a switching dynamic
{Bayesian} network and a set of continuous features and
language models. The recognition process is based on a
dictionary of 15 {DA} classes tailored for group
decision-making. Experimental results show that a novel
interpolated Factored Language Model results in a low
error rate on the automatic segmentation task, and thus
good recognition results can be achieved on {AMI}
multiparty conversational speech.},
categories = {ami,dialogue act,dbn,factored language
model,meetings,edinburgh},
month = {April},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-icassp07.pdf},
year = 2007
}
@inproceedings{tachibana07:styleclassify07,
author = {Makoto Tachibana and Keigo Kawashima and Junichi
Yamagishi and Takao Kobayashi},
title = {Performance Evaluation of {HMM}-Based Style
Classification with a Small Amount of Training Data},
booktitle = {Proc. Interspeech 2007},
abstract = {This paper describes a classification technique for
emotional expressions and speaking styles of speech
using only a small amount of training data of a target
speaker. We model spectral and fundamental frequency
(F0) features simultaneously using multi-space
probability distribution HMM (MSD-HMM), and adapt a
speaker-independent neutral style model to a certain
target speaker’s style model with a small amount of
data using MSD-MLLR, an extension of MLLR to the MSD-HMM.
We perform classification experiments for professional
narrators’ speech and non-professional speakers'
speech and evaluate the performance of the proposed
technique by comparing it with other commonly used
classifiers. We show that the proposed technique gives
better results than the other classifiers when using a
few sentences of the target speaker’s style data.},
categories = {emotion, speaking style, classification},
month = aug,
year = 2007
}
@inproceedings{huang2007-asru,
author = {Huang, Songfang and Renals, Steve},
title = {Hierarchical {Pitman-Yor} Language Models for {ASR} in
Meetings},
booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
and Understanding (ASRU'07)},
pages = {124--129},
address = {Kyoto, Japan},
abstract = {In this paper we investigate the application of a
novel technique for language modeling --- a
hierarchical Bayesian language model (LM) based on the
Pitman-Yor process --- on automatic speech recognition
(ASR) for multiparty meetings. The hierarchical
Pitman-Yor language model (HPYLM), which was originally
proposed in the machine learning field, provides a
Bayesian interpretation to language modeling. An
approximation to the HPYLM recovers the exact
formulation of the interpolated Kneser-Ney smoothing
method in n-gram models. This paper focuses on the
application and scalability of HPYLM on a practical
large vocabulary ASR system. Experimental results on
NIST RT06s evaluation meeting data verify that HPYLM is
a competitive and promising language modeling
technique, which consistently performs better than
interpolated Kneser-Ney and modified Kneser-Ney n-gram
LMs in terms of both perplexity (PPL) and word error
rate (WER).},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/asru07.pdf},
year = 2007
}
@inproceedings{owens-efas:07,
author = {David Owens and Pauline Campbell and Amy Liddell and
Christine DePlacido and Maria Wolters},
title = {Random Gap Detection Threshold: A Useful Measure of
Auditory Ageing?},
booktitle = {Proc. Europ. Cong. Fed. Audiol., Heidelberg, Germany},
month = jun,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Owensetal2007EFAS.pdf},
year = 2007
}
@misc{Hofer_Shimodaira:sigg:2007,
author = {Gregor Hofer and Hiroshi Shimodaira and Junichi
Yamagishi},
title = {Speech-driven Head Motion Synthesis based on a
Trajectory Model},
howpublished = {Poster at SIGGRAPH 2007},
address = {San Diego, USA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/siggraph07.pdf},
year = 2007
}
@inproceedings{strom:etal:interspeech2007,
author = {Volker Strom and Ani Nenkova and Robert Clark and
Yolanda Vazquez-Alvarez and Jason Brenier and Simon
King and Dan Jurafsky},
title = {Modelling Prominence and Emphasis Improves
Unit-Selection Synthesis},
booktitle = {Proc. Interspeech 2007},
address = {Antwerp, Belgium},
abstract = {We describe the results of large scale perception
experiments showing improvements in synthesising two
distinct kinds of prominence: standard pitch-accent and
strong emphatic accents. Previously prominence
assignment has been mainly evaluated by computing
accuracy on a prominence-labelled test set. By contrast
we integrated an automatic pitch-accent classifier into
the unit selection target cost and showed that
listeners preferred these synthesised sentences. We
also describe an improved recording script for
collecting emphatic accents, and show that generating
emphatic accents leads to further improvements in the
fiction genre over incorporating pitch accent only.
Finally, we show differences in the effects of
prominence between child-directed speech and news and
fiction genres.},
categories = {speech synthesis},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/p540.pdf},
year = 2007
}
@inproceedings{murray2007-interspeech,
author = {Murray, Gabriel and Renals, Steve},
title = {Towards online speech summarization},
booktitle = {Proc. Interspeech '07},
abstract = {The majority of speech summarization research has
focused on extracting the most informative dialogue
acts from recorded, archived data. However, a
potential use case for speech summarization in the
meetings domain is to facilitate a meeting in progress
by providing the participants -- whether they are
attending in-person or remotely -- with an indication
of the most important parts of the discussion so far.
This requires being able to determine whether a
dialogue act is extract-worthy before the global
meeting context is available. This paper introduces a
novel method for weighting dialogue acts using only
very limited local context, and shows that high
summary precision is possible even when information
about the meeting as a whole is lacking. A new
evaluation framework consisting of weighted precision,
recall and f-score is detailed, and the novel online
summarization method is shown to significantly increase
recall and f-score compared with a method using no
contextual information.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/IS070966.PDF},
year = 2007
}
@inproceedings{nenkova:07,
author = {Nenkova, Ani and Jason Brenier and Anubha Kothari and
Sasha Calhoun and Laura Whitton and David Beaver and
Dan Jurafsky},
title = {To Memorize or to Predict: Prominence labeling in
Conversational Speech},
booktitle = {NAACL Human Language Technology Conference},
address = {Rochester, NY},
abstract = {The immense prosodic variation of natural
conversational speech makes it challenging to predict
which words are prosodically prominent in this genre.
In this paper, we examine a new feature, accent ratio,
which captures how likely it is that a word will be
realized as prominent or not. We compare this feature
with traditional accent-prediction features (based on
part of speech and N-grams) as well as with several
linguistically motivated and manually labeled
information structure features, such as whether a word
is given, new, or contrastive. Our results show that
the linguistic features do not lead to significant
improvements, while accent ratio alone can yield
prediction performance almost as good as the
combination of any other subset of features. Moreover,
this feature is useful even across genres; an
accent-ratio classifier trained only on conversational
speech predicts prominence with high accuracy in
broadcast news. Our results suggest that carefully
chosen lexicalized features can outperform less
fine-grained features.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/nenkovaetalhlt07.pdf},
year = 2007
}
@inproceedings{leo_07-2,
author = {Matthew P. Aylett and J. Sebastian Andersson and
Leonardo Badino and Christopher J. Pidcock},
title = {The {C}erevoice {B}lizzard Entry 2007: Are Small
Database Errors Worse than Compression Artifacts?},
booktitle = {Proc. Blizzard Challenge Workshop 2007},
address = {Bonn, Germany},
abstract = {In commercial systems the memory footprint of unit
selection systems is often a key issue. This is
especially true for PDAs and other embedded devices. In
this year's Blizzard entry CereProc gave itself the
criterion that the full database system entered would
have a smaller memory footprint than either of the two
smaller database entries. This was accomplished by
applying Speex speech compression to the full database
entry. In turn, a set of small database techniques used
to improve the quality of small database systems in
last year's entry was extended. Finally, for all
systems, two quality control methods were applied to
the underlying database to improve the lexicon and
transcription match to the underlying data. Results
suggest that mild audio quality artifacts introduced by
lossy compression have almost as much impact on MOS
perceived quality as concatenation errors introduced by
sparse data in the smaller systems with bulked
diphones.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_004.pdf},
year = 2007
}
@article{frankel07:ldm,
author = {Frankel, J. and King, S.},
title = {Speech Recognition using Linear Dynamic Models},
journal = {IEEE {T}ransactions on {S}peech and {A}udio
{P}rocessing},
volume = 15,
number = 1,
pages = {246--256},
abstract = {The majority of automatic speech recognition (ASR)
systems rely on hidden Markov models, in which Gaussian
mixtures model the output distributions associated with
sub-phone states. This approach, whilst successful,
models consecutive feature vectors (augmented to
include derivative information) as statistically
independent. Furthermore, spatial correlations present
in speech parameters are frequently ignored through the
use of diagonal covariance matrices. This paper
continues the work of Digalakis and others who proposed
instead a first-order linear state-space model which
has the capacity to model underlying dynamics, and
furthermore gives a model of spatial correlations. This
paper examines the assumptions made in applying such a
model and shows that the addition of a hidden dynamic
state leads to increases in accuracy over otherwise
equivalent static models. We also propose a
time-asynchronous decoding strategy suited to
recognition with segment models. We describe
implementation of decoding for linear dynamic models
and present TIMIT phone recognition results.},
categories = {am,asr,ldm,timit,search,edinburgh},
month = {January},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_King_IEEE2007.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_King_IEEE2007.ps},
year = 2007
}
@inproceedings{livescu07:manual,
author = {Livescu, K. and Bezman, A. and Borges, N. and Yung, L.
and Çetin, Ö. and Frankel, J. and King, S. and
Magimai-Doss, M. and Chi, X. and Lavoie, L.},
title = {Manual transcription of conversational speech at the
articulatory feature level},
booktitle = {Proc. ICASSP},
address = {Honolulu},
abstract = {We present an approach for the manual labeling of
speech at the articulatory feature level, and a new set
of labeled conversational speech collected using this
approach. A detailed transcription, including
overlapping or reduced gestures, is useful for studying
the great pronunciation variability in conversational
speech. It also facilitates the testing of feature
classifiers, such as those used in articulatory
approaches to automatic speech recognition. We describe
an effort to transcribe a small set of utterances drawn
from the Switchboard database using eight articulatory
tiers. Two transcribers have labeled these utterances
in a multi-pass strategy, allowing for correction of
errors. We describe the data collection methods and
analyze the data to determine how quickly and reliably
this type of transcription can be done. Finally, we
demonstrate one use of the new data set by testing a
set of multilayer perceptron feature classifiers against
both the manual labels and forced alignments.},
month = {April},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/livescu_icassp07_trans.pdf},
year = 2007
}
@incollection{murray2007-mlmi,
author = {Murray, Gabriel and Renals, Steve},
title = {Term-weighting for summarization of multi-party spoken
dialogues},
booktitle = {Machine Learning for Multimodal Interaction IV },
publisher = {Springer},
editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
volume = {4892},
series = {Lecture Notes in Computer Science},
pages = {155--166},
abstract = {This paper explores the issue of term-weighting in the
genre of spontaneous, multi-party spoken dialogues,
with the intent of using such term-weights in the
creation of extractive meeting summaries. The field of
text information retrieval has yielded many
term-weighting techniques to import for our purposes;
this paper implements and compares several of these,
namely tf.idf, Residual IDF and Gain. We propose that
term-weighting for multi-party dialogues can exploit
patterns in word usage among participant speakers,
and introduce the su.idf metric as one attempt to do
so. Results for all metrics are reported on both manual
and automatic speech recognition (ASR) transcripts, and
on both the ICSI and AMI meeting corpora. },
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/48920155.pdf},
year = 2007
}
@inproceedings{liddell-efas:07,
author = {Amy Liddell and David Owens and Pauline Campbell and
Christine DePlacido and Maria Wolters},
title = {Can Extended High Frequency Hearing Thresholds be Used
to Detect Auditory Processing Difficulties in an Ageing
Population?},
booktitle = {Proc. Europ. Cong. Fed. Audiol., Heidelberg, Germany},
month = jun,
year = 2007
}
@inproceedings{Hofer_Shimodaira:proc:2007,
author = {Gregor Hofer and Hiroshi Shimodaira},
title = {Automatic Head Motion Prediction from Speech Data},
booktitle = {Proc. Interspeech 2007},
address = {Antwerp, Belgium},
abstract = {In this paper we present a novel approach to generate
a sequence of head motion units given some speech. The
modelling approach is based on the notion that head
motion can be divided into a number of short
homogeneous units that can each be modelled
individually. The system is based on Hidden Markov
Models (HMM), which are trained on motion units and act
as a sequence generator. They can be evaluated by an
accuracy measure. A database of motion capture data was
collected and manually annotated for head motion and is
used to train the models. It was found that the model
is good at distinguishing high activity regions from
regions with less activity with accuracies around 75
percent. Furthermore, the model is able to distinguish
different head motion patterns based on speech features
somewhat reliably, with accuracies reaching almost 70
percent.},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/interspeech07.pdf},
year = 2007
}
@misc{Hofer_Shimodaira:sca:2007,
author = {Gregor Hofer and Hiroshi Shimodaira and Junichi
Yamagishi},
title = {Lip motion synthesis using a context dependent
trajectory hidden {M}arkov model},
howpublished = {Poster at SCA 2007},
address = {San Diego, USA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sca07.pdf},
year = 2007
}
@inproceedings{richmond2007_nolisp,
author = {Richmond, K.},
title = {Trajectory Mixture Density Networks With Multiple
Mixtures for Acoustic-Articulatory Inversion},
booktitle = {Advances in Nonlinear Speech Processing, International
Conference on Non-Linear Speech Processing, NOLISP 2007},
editor = {Chetouani, M. and Hussain, A. and Gas, B. and Milgram,
M. and Zarader, J.-L.},
volume = 4885,
series = {Lecture Notes in Computer Science},
pages = {263--272},
publisher = {Springer-Verlag Berlin Heidelberg},
abstract = {We have previously proposed a trajectory model which
is based on a mixture density network (MDN) trained
with target variables augmented with dynamic features,
together with an algorithm for estimating maximum
likelihood trajectories which respects the constraints
between those features. In this paper, we have extended
that model to allow diagonal covariance matrices and
multiple mixture components in the trajectory MDN
output probability density functions. We have evaluated
this extended model on an inversion mapping task and
found the trajectory model works well, outperforming
smoothing of equivalent trajectories using low-pass
filtering. Increasing the number of mixture components
in the TMDN improves results further.},
categories = {ANN, TMDN, acoustic-articulatory inversion, MOCHA},
doi = {10.1007/978-3-540-77347-4_23},
key = {richmond2007_nolisp},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/richmond_nolisp2007.pdf},
year = 2007
}
@inproceedings{cuayahuitletal_interspeech07,
author = {Heriberto Cuayáhuitl and Steve Renals and Oliver
Lemon and Hiroshi Shimodaira},
title = {Hierarchical Dialogue Optimization Using Semi-Markov
Decision Processes},
booktitle = {Proc. Interspeech},
abstract = {This paper addresses the problem of dialogue
optimization on large search spaces. For such a
purpose, in this paper we propose to learn dialogue
strategies using multiple Semi-Markov Decision
Processes and hierarchical reinforcement learning. This
approach factorizes state variables and actions in
order to learn a hierarchy of policies. Our experiments
are based on a simulated flight booking dialogue system
and compare flat versus hierarchical reinforcement
learning. Experimental results show that the proposed
approach produced a dramatic search space reduction
(99.36\%), and converged four orders of magnitude
faster than flat reinforcement learning with a very
small loss in optimality (on average 0.3 system turns).
Results also show that the learnt policies
outperformed a hand-crafted one under three different
conditions of ASR confidence levels. This approach is
appealing to dialogue optimization due to faster
learning, reusable subsolutions, and scalability to
larger problems.},
categories = {Spoken dialogue systems, semi-Markov decision
processes, hierarchical reinforcement learning.},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/SMDPs-interspeech2007.pdf},
year = 2007
}
@inproceedings{mcgeelennon-icad:07,
author = {Marilyn McGee-Lennon and Maria Wolters and Tony
McBryan},
title = {Auditory Reminders in the Home},
booktitle = {Proc. Intl. Conf. Auditory Display (ICAD), Montreal,
Canada},
month = jun,
year = 2007
}
@inproceedings{cabral07,
author = {J. Cabral and S. Renals and K. Richmond and J.
Yamagishi},
title = {Towards an Improved Modeling of the Glottal Source in
Statistical Parametric Speech Synthesis},
booktitle = {Proc. of the 6th ISCA Workshop on Speech Synthesis},
address = {Bonn, Germany},
abstract = {This paper proposes the use of the Liljencrants-Fant
model (LF-model) to represent the glottal source signal
in HMM-based speech synthesis systems. These systems
generally use a pulse train to model the periodicity of
the excitation signal of voiced speech. However, this
model produces a strong and uniform harmonic structure
throughout the spectrum of the excitation which makes
the synthetic speech sound buzzy. The use of a mixed
band excitation and phase manipulation reduces this
effect but it can result in degradation of the speech
quality if the noise component is not weighted
carefully. In turn, the LF-waveform has a decaying
spectrum at higher frequencies, which is more similar
to the real glottal source excitation signal. We
conducted a perceptual experiment to test the
hypothesis that the LF-model can perform as well as or
better than the pulse train in an HMM-based speech
synthesizer. In the synthesis, we used the mean values
of the LF-parameters, calculated by measurements of the
recorded speech. The result of this study is important
not only regarding the improvement in speech quality of
these types of systems, but also because the LF-model
can be used to model many characteristics of the
glottal source, such as voice quality, which are
important for voice transformation and generation of
expressive speech.},
categories = {LF-model, Statistical parametric speech synthesis,
HMM-based speech synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
year = 2007
}
@inproceedings{calhounIS:07,
author = {Calhoun, Sasha},
title = {Predicting Focus through Prominence Structure},
booktitle = {Proc. Interspeech},
address = {Antwerp, Belgium},
abstract = {Focus is central to our control of information flow in
dialogue. Spoken language understanding systems
therefore need to be able to detect focus
automatically. It is well known that prominence is a
key marker of focus in English, however, the
relationship is not straight-forward. We present focus
prediction models built using the NXT Switchboard
corpus. We claim that a focus is more likely if a word
is more prominent than expected given its syntactic,
semantic and discourse properties. Crucially, the
perception of prominence arises not only from acoustic
cues, but also the position in prosodic structure. Our
focus prediction results, along with a study showing
the acoustic properties of focal accents vary by
structural position, support our claims. As a largely
novel task, these results are an important first step
in detecting focus for spoken language applications.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/calhounIS07.pdf},
year = 2007
}
@inproceedings{richmond2007a,
author = {Richmond, K.},
title = {A Multitask Learning Perspective on
Acoustic-Articulatory Inversion},
booktitle = {Proc. Interspeech},
address = {Antwerp, Belgium},
abstract = {This paper proposes the idea that by viewing an
inversion mapping MLP from a Multitask Learning
perspective, we may be able to relax two constraints
which are inherent in using electromagnetic
articulography as a source of articulatory information
for speech technology purposes. As a first step to
evaluating this idea, we perform an inversion mapping
experiment in an attempt to ascertain whether the
hidden layer of a ``multitask'' MLP can act
beneficially as a hidden representation that is shared
between inversion mapping subtasks for multiple
articulatory targets. Our results in the case of the
tongue dorsum x-coordinate indicate this is indeed the
case and show good promise. Results for the tongue
dorsum y-coordinate however are not so clear-cut, and
will require further investigation.},
categories = {acoustic-articulatory inversion, MLP, multitask
learning},
key = {richmond2007a},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/is2007_paper.pdf},
year = 2007
}
@article{clarkrichmondking_specom2007,
author = {Robert A. J. Clark and Korin Richmond and Simon King},
title = {Multisyn: Open-domain unit selection for the
{F}estival speech synthesis system},
journal = {Speech Communication},
volume = 49,
number = 4,
pages = {317--330},
abstract = {We present the implementation and evaluation of an
open-domain unit selection speech synthesis engine
designed to be flexible enough to encourage further
unit selection research and allow rapid voice
development by users with minimal speech synthesis
knowledge and experience. We address the issues of
automatically processing speech data into a usable
voice using automatic segmentation techniques and how
the knowledge obtained at labelling time can be
exploited at synthesis time. We describe target cost
and join cost implementation for such a system and
describe the outcome of building voices with a number
of different sized datasets. We show that, in a
competitive evaluation, voices built using this
technology compare favourably to other systems.},
categories = {speech synthesis, festival, multisyn, unitselection},
doi = {10.1016/j.specom.2007.01.014},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/clarkrichmondking_specom2007.pdf},
year = 2007
}
@inproceedings{bell_king_is2007,
author = {Bell, Peter and King, Simon},
title = {Sparse Gaussian Graphical Models for Speech
Recognition},
booktitle = {Proc. Interspeech 2007},
address = {Antwerp, Belgium},
abstract = {We address the problem of learning the structure of
Gaussian graphical models for use in automatic speech
recognition, a means of controlling the form of the
inverse covariance matrices of such systems. With
particular focus on data sparsity issues, we implement
a method for imposing graphical model structure on a
Gaussian mixture system, using a convex optimisation
technique to maximise a penalised likelihood
expression. The results of initial experiments on a
phone recognition task show a performance improvement
over an equivalent full-covariance system.},
categories = {speech recognition, acoustic models, graphical models,
precision matrix models},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sparseGM_is2007.pdf},
year = 2007
}
@article{dielmann2007-tmm,
author = {Dielmann, Alfred and Renals, Steve},
title = {Automatic meeting segmentation using dynamic
{Bayesian} networks},
journal = {IEEE Transactions on Multimedia},
volume = {9},
number = {1},
pages = {25--36},
abstract = {Multiparty meetings are a ubiquitous feature of
organizations, and there are considerable economic
benefits that would arise from their automatic analysis
and structuring. In this paper, we are concerned with
the segmentation and structuring of meetings (recorded
using multiple cameras and microphones) into sequences
of group meeting actions such as monologue, discussion
and presentation. We outline four families of
multimodal features based on speaker turns, lexical
transcription, prosody, and visual motion that are
extracted from the raw audio and video recordings. We
relate these low-level features to more complex group
behaviors using a multistream modelling framework based
on multistream dynamic Bayesian networks (DBNs). This
results in an effective approach to the segmentation
problem, achieving an action error rate of 12.2\%,
compared with 43\% using an approach based on hidden
Markov models. Moreover, the multistream DBN developed
here leaves scope for many further improvements and
extensions.},
doi = {10.1109/TMM.2006.886337},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2007/dielmann2007-tmm.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4032598&arnumber=4032608&count=23&index=3},
year = 2007
}
@inproceedings{jyamagis07:hts2007,
author = {Junichi Yamagishi and Heiga Zen and Tomoki Toda and
Keiichi Tokuda},
title = {Speaker-Independent {HMM}-based Speech Synthesis
System -- {HTS-2007} System for the {Blizzard Challenge
2007}},
booktitle = {Proc. Blizzard Challenge 2007},
abstract = {This paper describes an HMM-based speech synthesis
system developed by the HTS working group for the
Blizzard Challenge 2007. To further explore the
potential of HMM-based speech synthesis, we incorporate
new features into our conventional system which underpin
a speaker-independent approach: speaker adaptation
techniques; adaptive training for HSMMs; and full
covariance modeling using the CSMAPLR transforms.},
categories = {HMM, speech synthesis, speaker adaptation, HTS,
Blizzard Challenge},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007-HTS.pdf},
year = 2007
}
@inproceedings{AMIsystemICASSP2007,
author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
Karafiat and M. Lincoln and J. Vepa and V. Wan},
title = {{The {AMI} System for the Transcription of Speech in
Meetings}},
booktitle = {Proc. {ICASSP}},
abstract = {This paper describes the AMI transcription system for
speech in meetings developed in collaboration by five
research groups. The system includes generic techniques
such as discriminative and speaker adaptive training,
vocal tract length normalisation, heteroscedastic
linear discriminant analysis, maximum likelihood linear
regression, and phone posterior based features, as well
as techniques specifically designed for meeting data.
These include segmentation and cross-talk suppression,
beam-forming, domain adaptation, web-data collection,
and channel adaptive training. The system was improved
by more than 20\% relative in word error rate compared
to our previous system and was used in the NIST RT'06
evaluations where it was found to yield competitive
performance.},
categories = {LVCSR, NIST Meeting Transcription Evaluation RT06S},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ICASSP07.AMIasrsystem.pdf},
year = 2007
}
@inproceedings{richmond2007b,
author = {Richmond, K. and Strom, V. and Clark, R. and
Yamagishi, J. and Fitt, S.},
title = {Festival Multisyn Voices for the 2007 Blizzard
Challenge},
booktitle = {Proc. Blizzard Challenge Workshop (in Proc. SSW6)},
address = {Bonn, Germany},
abstract = {This paper describes selected aspects of the Festival
Multisyn entry to the Blizzard Challenge 2007. We
provide an overview of the process of building the
three required voices from the speech data provided.
This paper focuses on new features of Multisyn which
are currently under development and which have been
employed in the system used for this Blizzard
Challenge. These differences are the application of a
more flexible phonetic lattice representation during
forced alignment labelling and the use of a pitch
accent target cost component. Finally, we also examine
aspects of the speech data provided for this year's
Blizzard Challenge and raise certain issues for
discussion concerning the aim of comparing voices made
with differing subsets of the data provided.},
categories = {tts, blizzard, multisyn, unit selection},
key = {richmond2007b},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007paper.pdf},
year = 2007
}
@inproceedings{penner-icphs:07,
author = {Heike Penner and Nicholas Miller and Maria Wolters},
title = {Motor Speech Disorders in Three {P}arkinsonian
Syndromes: A Comparative Study},
booktitle = {Proc. Intl. Conf. Phon. Sci.},
year = 2007
}
@article{king07:JASA2007,
author = {King, S. and Frankel, J. and Livescu, K. and
McDermott, E. and Richmond, K. and Wester, M.},
title = {Speech production knowledge in automatic speech
recognition},
journal = {Journal of the Acoustical Society of America},
volume = 121,
number = 2,
pages = {723--742},
abstract = {Although much is known about how speech is produced,
and research into speech production has resulted in
measured articulatory data, feature systems of
different kinds and numerous models, speech production
knowledge is almost totally ignored in current
mainstream approaches to automatic speech recognition.
Representations of speech production allow simple
explanations for many phenomena observed in speech
which cannot be easily analyzed from either acoustic
signal or phonetic transcription alone. In this
article, we provide a survey of a growing body of work
in which such representations are used to improve
automatic speech recognition.},
month = feb,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/King_et_al_review.pdf},
year = 2007
}