2003.bib
@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2003-citations -ob /home/korin/projects/publications/new_output/transitdata/2003.bib -c 'year : "2003"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@incollection{gotoh-lm03,
author = {Y.~Gotoh and S.~Renals},
title = {Language Modelling},
booktitle = {Text and Speech Triggered Information Access},
editor = {S.~Renals and G.~Grefenstette},
pages = {78--105},
abstract = {This is a preprint of a tutorial on statistical
language modelling, based on Yoshi Gotoh's course at
the \href{http://www.ilsp.gr/testia/testia2000.html}
{ELSNET-2000 Summer School} on Text and Speech
Triggered Information Access. },
categories = {ie,lm,bnews,sheffield},
crossref = {renals-book03},
year = 2003
}
@inproceedings{Sturm-03,
author = {J. Sturm and J. M. Kessens and M. Wester and F. de Wet
and E. Sanders and H. Strik },
title = {Automatic Transcription of Football Commentaries in
the {MUMIS} Project},
booktitle = {Proc. Eurospeech '03},
pages = {-},
abstract = {This paper describes experiments carried out to
automatically transcribe football commentaries in
Dutch, English and German for multimedia indexing. Our
results show that the high levels of stadium noise in
the material create a task that is extremely difficult
for conventional ASR. The baseline WERs vary from 83\%
to 94\% for the three languages investigated. Employing
state-of-the-art noise robustness techniques leads to
relative reductions of 9--10\% WER. Application-specific
words such as players' names are recognized correctly in
about 50\% of cases. Although this result is
substantially better than the overall result, it is
inadequate. Much better results can be obtained if the
football commentaries are recorded separately from the
stadium noise. This would make the automatic
transcriptions more useful for multimedia indexing.},
categories = {asr, MUMIS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/mumis_final.pdf},
year = 2003
}
@article{Ho2003Applied,
author = {Tu Bao Ho and Trong Dung Nguyen and Hiroshi Shimodaira
and Masayuki Kimura},
title = {{A Knowledge Discovery System with Support for Model
Selection and Visualization}},
journal = {Applied Intelligence},
volume = {19},
pages = {125--141},
categories = {KDD},
year = 2003
}
@inproceedings{Goubanova:2003,
author = {Goubanova, O.},
title = {{B}ayesian Modelling of Vowel Segment Duration for
Text-to-Speech Synthesis Using Distinctive Features},
booktitle = {Proc. ICPhS 2003},
volume = 3,
pages = {2349},
address = {Barcelona, Spain},
abstract = {We report the results of applying the Bayesian Belief
Network (BN) approach to predicting vowel duration. A
Bayesian inference of the vowel duration is performed
on a hybrid Bayesian network consisting of discrete and
continuous nodes, with the nodes in the network
representing the linguistic factors that affect segment
duration. New to the present research, we model the
segment identity factor as a set of distinctive features. The
features chosen were height, frontness, length, and
roundness. We also experimented with a word class
feature that implicitly represents word frequency
information. We contrasted the results of the belief
network model with those of the sums of products (SoP)
model and classification and regression tree (CART)
model. We trained and tested all three models on the
same data. In terms of the RMS error and correlation
coefficient, our BN model performs no worse than the SoP
model, and it significantly outperforms the CART model.},
categories = {Bayesian, text-to-speech synthesis, duration modelling},
ps = {http://www.cstr.ed.ac.uk/downloads/publications/2003/OGoubanova_icphs2k3.ps},
year = 2003
}
@inproceedings{calhoun:03,
author = {Calhoun, Sasha},
title = {The Nature of Theme and Rheme Accents},
booktitle = {One-Day Meeting for Young Speech Researchers},
address = {University College, London},
abstract = {It has increasingly been recognised that appropriate
intonation is essential to create believable voices for
speech synthesis. This is particularly true in
dialogue, where the link between intonation and meaning
is especially important. Here we report two
experiments, a production study and a perception study, which
test an aspect of Steedman's (2000) theory relating
information and intonation structure with a view to
specifying intonation in a speech synthesis system. He
claims that themes and rhemes, the basic building
blocks of information structure, are marked by
distinctive pitch accents in English, which he
identifies with L+H* and H* in the ToBI system
respectively. After reviewing problems with the
identification of these ToBI accents, we show that
speakers do produce and listeners do distinguish
different pitch accents in these discourse contexts,
but that the ToBI labels may not be helpful to
characterise the distinction. The exact phonetic nature
of theme and rheme accents remains unclear, but the
alignment of the start of the rise, pitch height and
the fall after the pitch peak all appear to be factors.
Speakers also appear to be more sensitive to the
distinction at the end of an utterance than
utterance-medially.},
categories = {prosody, information structure, pitch accents,
production and perception experiments},
month = apr,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/calhounPGC03.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/calhounPGC03.ps},
year = 2003
}
@inproceedings{mayoturk:03,
author = {Mayo, C. and Turk, A.},
title = {Is the development of cue weighting strategies in
children's speech perception context-dependent?},
booktitle = {Proc. XVth International Congress of Phonetic
Sciences},
address = {Barcelona},
categories = {speech perception, development, cue weighting},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icphs-0677.pdf},
year = 2003
}
@inproceedings{gillett:king:eurospeech2003b,
author = {Ben Gillett and Simon King},
title = {Transforming {F0} Contours},
booktitle = {Proc. {E}urospeech},
address = {Geneva},
abstract = {Voice transformation is the process of transforming
the characteristics of speech uttered by a source
speaker, such that a listener would believe the speech
was uttered by a target speaker. Training F0 contour
generation models for speech synthesis requires a large
corpus of speech. If it were possible to adapt the F0
contour of one speaker to sound like that of another
speaker, using a small, easily obtainable parameter
set, this would be extremely valuable. We present a new
method for the transformation of F0 contours from one
speaker to another based on a small linguistically
motivated parameter set. The system performs a
piecewise linear mapping using these parameters. A
perceptual experiment clearly demonstrates that the
presented system is at least as good as an existing
technique for all speaker pairs, and that in many cases
it is much better and almost as good as using the
target F0 contour.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Gillett_King_eurospeech2003a.pdf},
year = 2003
}
@inproceedings{Wester-03,
author = {M. Wester},
title = {Syllable classification using articulatory-acoustic
features},
booktitle = {Proc. of Eurospeech '03},
pages = {-},
address = {Geneva},
abstract = {This paper investigates the use of
articulatory-acoustic features for the classification
of syllables in TIMIT. The main motivation for this
study is to circumvent the ``beads-on-a-string''
problem, i.e. the assumption that words can be
described as a simple concatenation of phones.
Posterior probabilities for articulatory-acoustic
features are obtained from artificial neural nets and
are used to classify speech within the scope of
syllables instead of phones. This gives the opportunity
to account for asynchronous feature changes, exploiting
the strengths of the articulatory-acoustic features,
instead of losing that potential by reverting to phones.},
categories = {aaf, syllable, TIMIT, Edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/wester.2003.1.pdf},
year = 2003
}
@article{Matsuda2003IEICE06,
author = {Shigeki Matsuda and Mitsuru Nakai and Hiroshi
Shimodaira and Shigeki Sagayama},
title = {{Speech Recognition Using Asynchronous Transition
{HMM}}},
journal = {IEICE Trans. D-II},
volume = {J86-D-II},
number = {6},
pages = {741--754},
note = {(in Japanese)},
abstract = {We propose the asynchronous-transition HMM (AT-HMM),
which is based on asynchronous transition structures
among the individual features of acoustic feature
vector sequences. A conventional HMM represents vector
sequences using a chain of states, each of which has a
multi-dimensional output distribution, and therefore
assumes that the individual features change
synchronously. However, this assumption seems
over-simplified for modeling the temporal behavior of
acoustic features, since the cepstrum and its
time-derivative cannot synchronize with each other. In
a speaker-dependent continuous phoneme recognition
task, the AT-HMMs reduced errors by 10\% to 40\%. In a
speaker-independent task, the performance of the
AT-HMMs was comparable to that of conventional HMMs.},
categories = {asr, jaist},
month = jun,
year = 2003
}
@inproceedings{koumpis-msdr03,
author = {K.~Koumpis and S.~Renals},
title = {Evaluation of extractive voicemail summarization},
booktitle = {Proc. ISCA Workshop on Multilingual Spoken Document
Retrieval},
pages = {19--24},
abstract = {This paper is about the evaluation of a system that
generates short text summaries of voicemail messages,
suitable for transmission as text messages. Our
approach to summarization is based on a
speech-recognized transcript of the voicemail message,
from which a set of summary words is extracted. The
system uses a classifier to identify the summary words,
with each word being identified by a vector of lexical
and prosodic features. The features are selected using
Parcel, an ROC-based algorithm. Our evaluations of the
system, using a slot error rate metric, have compared
manual and automatic summarization, and manual and
automatic recognition (using two different
recognizers). We also report on two subjective
evaluations using mean opinion score of summaries, and
a set of comprehension tests. The main result from
these experiments was that the perceived difference in
quality of summarization was affected more by errors
resulting from automatic transcription than by the
automatic summarization process.},
categories = {voicemail,summarization,prosody,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/msdr03.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/msdr03.ps.gz},
year = 2003
}
@inproceedings{shig031,
author = {Yoshinori Shiga and Simon King},
title = {Estimating the Spectral Envelope of Voiced Speech
Using Multi-frame Analysis},
booktitle = {Proc. {E}urospeech-2003},
volume = 3,
pages = {1737--1740},
address = {Geneva, Switzerland},
abstract = {This paper proposes a novel approach for estimating
the spectral envelope of voiced speech independently of
its harmonic structure. Because of the
quasi-periodicity of voiced speech, its spectrum
exhibits harmonic structure and has energy only at
frequencies corresponding to integral multiples of F0.
It is hence impossible to identify the transfer
characteristics between adjacent harmonics. In
order to resolve this problem, Multi-frame Analysis
(MFA) is introduced. The MFA estimates a spectral
envelope using many portions of speech which are
vocalised using the same vocal-tract shape. Since each
of the portions usually has a different F0 and ensuing
different harmonic structure, a number of harmonics can
be obtained at various frequencies to form a spectral
envelope. The method thereby gives a closer
approximation to the vocal-tract transfer function.},
categories = {artic, lbg, clustering, mocha, harmonic, envelope,
edinburgh},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.ps},
year = 2003
}
@inproceedings{Shimodaira2003ICDAR,
author = {Hiroshi Shimodaira and Takashi Sudo and Mitsuru Nakai
and Shigeki Sagayama},
title = {{On-line Overlaid-Handwriting Recognition Based on
Substroke {HMM}s}},
booktitle = {ICDAR'03},
pages = {1043--1047},
abstract = {This paper proposes a novel handwriting recognition
interface for wearable computing where users write
characters continuously without pauses in a single
small writing box. Since characters are written on the
same writing area, they are overlaid with each other.
Therefore the task is regarded as a special case of the
continuous character recognition problem. In contrast
to the conventional continuous character recognition
problem, location information of strokes does not help
very much in the proposed framework. To tackle the
problem, substroke based hidden Markov models (HMMs)
and a stochastic bigram language model are employed.
Preliminary experiments were carried out on a dataset
of 578 handwriting sequences with a character bigram
consisting of 1,016 Japanese educational Kanji and 71
Hiragana characters. The proposed method demonstrated
promising performance, with 69.2\% of handwriting
sequences being correctly recognized when different
stroke orders were permitted; the rate improved to
88.0\% when characters were written with a fixed
stroke order.},
categories = {HWR, jaist},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Shimodaira2003ICDAR.pdf},
year = 2003
}
@inproceedings{renals-icassp03,
author = {S.~Renals and D.~Ellis},
title = {Audio information access from meeting rooms},
booktitle = {Proc. IEEE ICASSP},
volume = {4},
pages = {744--747},
abstract = {We investigate approaches to accessing information
from the streams of audio data that result from
multi-channel recordings of meetings. The methods
investigated use word-level transcriptions, and
information derived from models of speaker activity and
speaker turn patterns. Our experiments include spoken
document retrieval for meetings, automatic structuring
of meetings based on self-similarity matrices of
speaker turn patterns and a simple model of speaker
activity. Meeting recordings are rich in both lexical
and non-lexical information; our results illustrate
some novel kinds of analysis made possible by a
transcribed corpus of natural meetings.},
categories = {m4,multimodal,ir,meetings,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-mtg.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-mtg.ps.gz},
year = 2003
}
@article{Wester-CSL-03,
author = {M. Wester},
title = {Pronunciation modeling for {ASR} -- knowledge-based
and data-derived methods},
journal = {Computer Speech and Language},
volume = {17},
pages = {69--85},
abstract = {This article focuses on modeling pronunciation
variation in two different ways: data-derived and
knowledge-based. The knowledge-based approach consists
of using phonological rules to generate variants. The
data-derived approach consists of performing phone
recognition, followed by smoothing using decision trees
(D-trees) to alleviate some of the errors in the phone
recognition. Using phonological rules led to a small
improvement in WER; a data-derived approach in which
the phone recognition was smoothed using D-trees prior
to lexicon generation led to larger improvements
compared to the baseline. The lexicon was employed in
two different recognition systems: a hybrid HMM/ANN
system and an HMM-based system, to ascertain whether
pronunciation variation was truly being modeled. This
proved to be the case as no significant differences
were found between the results obtained with the two
systems. Furthermore, we found that 10\% of variants
generated by the phonological rules were also found
using phone recognition, and this increased to 28\%
when the phone recognition output was smoothed by using
D-trees. This indicates that the D-trees generalize
beyond what has been seen in the training material,
whereas when the phone recognition approach is employed
directly, unseen pronunciations cannot be predicted. In
addition, we propose a metric to measure confusability
in the lexicon. Using this confusion metric to prune
variants results in roughly the same improvement as
using the D-tree method.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/CSL-pronvar.pdf},
year = 2003
}
@inproceedings{kolluru-asru03,
author = {B. Kolluru and H. Christensen and Y. Gotoh and S.
Renals},
title = {Exploring the style-technique interaction in
extractive summarization of broadcast news},
booktitle = {Proc. IEEE Automatic Speech Recognition and
Understanding Workshop},
abstract = {In this paper we seek to explore the interaction
between the style of a broadcast news story and its
summarization technique. We report the performance of
three different summarization techniques on broadcast
news stories, which are split into planned speech and
spontaneous speech. The initial results indicate that
some summarization techniques work better for the
documents with spontaneous speech than for those with
planned speech. Even for human beings, some documents
are inherently difficult to summarize. We observe a
correlation between the degree of difficulty in
summarizing and the performance of the three automatic
summarizers. Given the high frequency of named entities
in broadcast news, and the even greater number of
references to these named entities, we also gauge the
effect of named entity and coreference resolution in a
news story on the performance of these summarizers.},
categories = {s3l,summarization,bnews,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-style.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-style.ps.gz},
year = 2003
}
@inproceedings{Keeni2003ICEIS,
author = {Kanad Keeni and Kunio Goto and Hiroshi Shimodaira},
title = {{On fast learning of Multi-layer Feed-forward Neural
Networks Using Back Propagation}},
booktitle = {International Conference on Enterprise and Information
Systems (ICEIS2003)},
pages = {266--271},
abstract = {This study discusses the subject of training data
selection for neural networks using back propagation.
We make only one assumption: that there is no overlap
between training data belonging to different classes;
in other words, the training data are
linearly/semi-linearly separable. The training data are
analyzed, and the data that affect the learning process
are selected based on the idea of critical points. The
proposed method is applied to a classification problem
where the task is to recognize the characters A, C and
B, D. The experimental results show that in batch mode
the proposed method takes almost 1/7 of the real time
and 1/10 of the user training time required by the
conventional method, while in online mode it takes 1/3
of the training epochs, 1/9 of the real time, 1/20 of
the user time and 1/3 of the system time required by
the conventional method. The classification rates on
training and testing data are the same as with the
conventional method.},
month = apr,
year = 2003
}
@inproceedings{koumpis-eurospeech03,
author = {K.~Koumpis and S.~Renals},
title = {Multi-class Extractive Voicemail Summarization},
booktitle = {Proc. Eurospeech},
pages = {2785--2788},
abstract = {This paper is about a system that extracts principal
content words from speech-recognized transcripts of
voicemail messages and classifies them into proper
names, telephone numbers, dates/times and `other'. The
short text summaries generated are suitable for mobile
messaging applications. The system uses a set of
classifiers to identify the summary words, with each
word being identified by a vector of lexical and
prosodic features. The features are selected using
Parcel, an ROC-based algorithm. We visually compare the
role of a large number of individual features and
discuss effective ways to combine them. We finally
evaluate their performance on manual and automatic
transcriptions derived from two different speech
recognition systems.},
categories = {voicemail,summarization,prosody,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/eurospeech03-voicemail.pdf},
year = 2003
}
@article{richmond2003,
author = {Richmond, K. and King, S. and Taylor, P.},
title = {Modelling the Uncertainty in Recovering Articulation
from Acoustics},
journal = {Computer Speech and Language},
volume = 17,
pages = {153--172},
abstract = {This paper presents an experimental comparison of the
performance of the multilayer perceptron (MLP) with
that of the mixture density network (MDN) for an
acoustic-to-articulatory mapping task. A corpus of
acoustic-articulatory data recorded by electromagnetic
articulography (EMA) for a single speaker was used as
training and test data for this purpose. In theory, the
MDN is able to provide a richer, more flexible
description of the target variables in response to a
given input vector than the least-squares trained MLP.
Our results show that the mean likelihoods of the
target articulatory parameters for an unseen test set
were indeed consistently higher with the MDN than with
the MLP. The increase ranged from approximately 3\% to
22\%, depending on the articulatory channel in
question. On the basis of these results, we argue that
using a more flexible description of the target domain,
such as that offered by the MDN, can prove beneficial
when modelling the acoustic-to-articulatory mapping.},
categories = {artic, ann, mlp, mdn, inversion, mocha, edinburgh},
key = {richmond2003},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/richmond2003.pdf},
year = 2003
}
@inproceedings{vanbael:king:icphs2003,
author = {Christophe Van Bael and Simon King},
title = {An Accent-Independent Lexicon for Automatic Speech
Recognition},
booktitle = {Proc. ICPhS},
pages = {1165--1168},
abstract = {Recent work at the Centre for Speech Technology
Research (CSTR) at the University of Edinburgh has
developed an accent-independent lexicon for speech
synthesis (the Unisyn project). The main purpose of
this lexicon is to avoid the problems and cost of
writing a new lexicon for every new accent needed for
synthesis. Only recently, a first attempt has been made
to use the Keyword Lexicon for automatic speech
recognition.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/VanBael_King_icphs2003.pdf},
year = 2003
}
@inproceedings{horlock:king:eurospeech2003a,
author = {James Horlock and Simon King},
title = {Named Entity Extraction from Word Lattices},
booktitle = {Proc. Eurospeech},
address = {Geneva},
abstract = {We present a method for named entity extraction from
word lattices produced by a speech recogniser. Previous
work by others on named entity extraction from speech
has used either a manual transcript or 1-best
recogniser output. We describe how a single Viterbi
search can recover both the named entity sequence and
the corresponding word sequence from a word lattice,
and further that it is possible to trade off an
increase in word error rate for improved named entity
extraction.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Horlock_King_eurospeech2003a.pdf},
year = 2003
}
@inproceedings{wan-icassp03,
author = {V.~Wan and S.~Renals},
title = {{SVMSVM}: Support vector machine speaker verification
methodology},
booktitle = {Proc. IEEE ICASSP},
volume = {2},
pages = {221--224},
abstract = {Support vector machines with the Fisher and
score-space kernels are used for text-independent
speaker verification to provide direct discrimination
between complete utterances. This is unlike approaches
such as discriminatively trained Gaussian mixture
models or other discriminative classifiers that
discriminate at the frame-level only. Using the
sequence-level discrimination approach we are able to
achieve error-rates that are significantly better than
the current state-of-the-art on the PolyVar database.},
categories = {verification,kernel,svm,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-svm.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-svm.ps.gz},
year = 2003
}
@inproceedings{christensen-asru03,
author = {H. Christensen and Y. Gotoh and B. Kolluru and S.
Renals},
title = {Are extractive text summarisation techniques portable
to broadcast news?},
booktitle = {Proc. IEEE Automatic Speech Recognition and
Understanding Workshop},
abstract = {In this paper we report on a series of experiments
which compare the effect of individual features on both
text and speech summarisation, the effect of basing the
speech summaries on automatic speech recognition
transcripts with varying word error rates, and the
effect of summarisation approach and transcript source
on summary quality. We show that classical text
summarisation features (based on stylistic and content
information) are portable to broadcast news. However,
the quality of the speech transcripts as well as the
difference in information structure between broadcast
and newspaper news affect the usability of the
individual features.},
categories = {s3l,summarization,bnews,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-portable.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-portable.ps.gz},
year = 2003
}
@incollection{Kawamoto2003Book,
author = {Shin-ichi Kawamoto and Hiroshi Shimodaira and Shigeki
Sagayama and others},
title = {{Galatea: Open-Source Software for Developing
Anthropomorphic Spoken Dialog Agents}},
booktitle = {Life-Like Characters: Tools, Affective Functions, and
Applications},
editor = {Helmut Prendinger and others},
publisher = {Springer},
pages = {187--212},
abstract = {Galatea is a software toolkit to develop a human-like
spoken dialog agent. In order to easily integrate the
modules of different characteristics including speech
recognizer, speech synthesizer, facial-image
synthesizer and dialog controller, each module is
modeled as a virtual machine having a simple common
interface and connected to each other through a broker
(communication manager). Galatea employs model-based
speech and facial-image synthesizers whose model
parameters are adapted easily to those for an existing
person if his/her training data is given. The software
toolkit that runs on both UNIX/Linux and Windows
operating systems will be publicly available in the
middle of 2003. },
categories = {lifelike-agent, jaist},
month = nov,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Kawamoto2003Book.pdf},
year = 2003
}
@inproceedings{vepa-king_euro03,
author = {Vepa, J. and King, S.},
title = {Kalman-filter based Join Cost for Unit-selection
Speech Synthesis},
booktitle = {Proc. {E}urospeech},
address = {Geneva, Switzerland},
abstract = {We introduce a new method for computing join cost in
unit-selection speech synthesis which uses a linear
dynamical model (also known as a Kalman filter) to
model line spectral frequency trajectories. The model
uses an underlying subspace in which it makes smooth,
continuous trajectories. This subspace can be seen as
an analogy for underlying articulator movement. Once
trained, the model can be used to measure how well
concatenated speech segments join together. The
objective join cost is based on the error between model
predictions and actual observations. We report
correlations between this measure and mean listener
scores obtained from a perceptual listening experiment.
Our experiments use a state-of-the-art unit-selection
text-to-speech system: `rVoice' from Rhetorical Systems
Ltd.},
categories = {join cost, Kalman filter, LDM, rVoice, edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/vepa_eurospeech03.pdf},
year = 2003
}
@phdthesis{clark_phd03,
author = {Robert A. J. Clark},
title = {Generating Synthetic Pitch Contours Using Prosodic
Structure},
school = {The University of Edinburgh},
categories = {speech synthesis, prosody, intonation, festival},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_phd03.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_phd03.ps.gz},
year = 2003
}
@article{mayoscobbiehewlettwaters:03,
author = {Mayo, C. and Scobbie, J. and Hewlett, N. and Waters,
D.},
title = {The influence of phonemic awareness development on
acoustic cue weighting in children's speech perception},
journal = {Journal of Speech, Language and Hearing Research},
volume = 46,
pages = {1184--1196},
categories = {speech perception, development, cue weighting,
phonemic awareness, literacy},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/JSLHR1184-Mayo.pdf},
year = 2003
}
@inproceedings{horlock:king:eurospeech2003b,
author = {James Horlock and Simon King},
title = {Discriminative Methods for Improving Named Entity
Extraction on Speech Data},
booktitle = {Proc. Eurospeech},
address = {Geneva},
abstract = {In this paper we present a method of discriminatively
training language models for spoken language
understanding; we show improvements in named entity
F-scores on speech data using these improved language
models. A comparison between theoretical probabilities
associated with manual markup and the actual
probabilities of output markup is used to identify
probabilities requiring adjustment. We present results
which support our hypothesis that improvements in
F-scores are possible by using either previously used
training data or held out development data to improve
discrimination amongst a set of N-gram language models.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Horlock_King_eurospeech2003b.pdf},
year = 2003
}
@article{king:jphon2003,
author = {Simon King},
title = {Dependence and independence in automatic speech
recognition and synthesis},
journal = {Journal of Phonetics},
volume = 31,
number = {3-4},
pages = {407--411},
abstract = {A short review paper},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/King_jphon2003.pdf},
year = 2003
}
@inproceedings{wrigley-eurospeech03,
author = {S.~Wrigley and G.~Brown and V.~Wan and S. Renals},
title = {Feature Selection for the Classification of Crosstalk
in Multi-Channel Audio},
booktitle = {Proc. Eurospeech},
pages = {469--472},
abstract = {An extension to the conventional speech / nonspeech
classification framework is presented for a scenario in
which a number of microphones record the activity of
speakers present at a meeting (one microphone per
speaker). Since each microphone can receive speech from
both the participant wearing the microphone (local
speech) and other participants (crosstalk), the
recorded audio can be broadly classified in four ways:
local speech, crosstalk plus local speech, crosstalk
alone and silence. We describe a classifier in which a
Gaussian mixture model (GMM) is used to model each
class. A large set of potential acoustic features is
considered, some of which have been employed in
previous speech / nonspeech classifiers. A combination
of two feature selection algorithms is used to identify
the optimal feature set for each class. Results from
the GMM classifier using the selected features are
superior to those of a previously published approach.},
categories = {m4,crosstalk,meetings,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/eurospeech03-xtalk.pdf},
year = 2003
}
@inproceedings{Nakai2003ICDAR,
author = {Mitsuru Nakai and Hiroshi Shimodaira and Shigeki
Sagayama},
title = {{Generation of Hierarchical Dictionary for
Stroke-order Free Kanji Handwriting Recognition Based
on Substroke {HMM}}},
booktitle = {Proc. of ICDAR2003},
pages = {514--518},
abstract = {This paper describes a method of generating a
hierarchical structured Kanji dictionary for
stroke-number and stroke-order free handwriting
recognition based on sub-stroke HMM. In stroke-based
methods, a large number of stroke-order variations can
be easily expressed by just adding different stroke
sequences to the dictionary, and it is not necessary to
train new reference patterns. The hierarchical
structured dictionary has the advantage that thousands
of stroke-order variations of Kanji characters can be
produced using a small number of stroke-order rules
defining Kanji parts. Moreover, the recognition speed
is fast since common sequences are shared in a
substroke network, even if the total number of
stroke-order combinations becomes enormous in practice.
In experiments, 300 different stroke-order rules of
Kanji parts were statistically chosen by using 60
writers' handwriting of 1,016 educational Kanji
characters. By adding these new stroke-order rules to
the dictionary, about 9,000 different stroke-order
variations were generated for 2,965 JIS 1st level Kanji
characters. As a result, we successfully improved the
recognition accuracy from 82.6\% to 90.2\% for
stroke-order free handwriting.},
categories = {HWR, jaist},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Nakai2003ICDAR.pdf},
year = 2003
}
@inproceedings{clark_icphs03,
author = {Robert A. J. Clark},
title = {Modelling Pitch Accents for Concept-to-Speech
Synthesis},
booktitle = {Proc. XVth International Congress of Phonetic Sciences},
volume = 2,
pages = {1141--1144},
categories = {speech synthesis, prosody, intonation, festival},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_icphs03.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_icphs03.ps},
year = 2003
}
@inproceedings{Keeni2003ICONIP,
author = {Kanad Keeni and Kunio Goto and Hiroshi Shimodaira},
title = {{Automatic Filtering of Network Intrusion Detection
System Alarms Using Multi-layer Feed-forward Neural
Networks}},
booktitle = {International Conference on Neural Information
Processing (ICONIP2003)},
categories = {ann},
month = jun,
year = 2003
}
@inproceedings{gillett:king:eurospeech2003a,
author = {Ben Gillett and Simon King},
title = {Transforming Voice Quality},
booktitle = {Proc. {E}urospeech},
address = {Geneva},
abstract = {Voice transformation is the process of transforming
the characteristics of speech uttered by a source
speaker, such that a listener would believe the speech
was uttered by a target speaker. In this paper we
address the problem of transforming voice quality. We
do not attempt to transform prosody. Our system has two
main parts corresponding to the two components of the
source-filter model of speech production. The first
component transforms the spectral envelope as
represented by a linear prediction model. The
transformation is achieved using a Gaussian mixture
model, which is trained on aligned speech from source
and target speakers. The second part of the system
predicts the spectral detail from the transformed
linear prediction coefficients. A novel approach is
proposed, which is based on a classifier and residual
codebooks. On the basis of a number of performance
metrics it outperforms existing systems.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Gillett_King_eurospeech2003b.pdf},
year = 2003
}
@article{Cox-ijhci03,
author = {Cox, S.J. and Lincoln, M. and Nakisa, M. and Wells, M.
and Tutt, M. and Abbott, S.},
title = {The Development and Evaluation of a Speech to Sign
Translation System to Assist Transactions},
journal = {Int. Journal of Human Computer Interaction},
volume = {16},
number = {2},
pages = {141--161},
abstract = {The design, development, and evaluation of an
experimental translation system that aims to aid
transactions between a deaf person and a clerk in a
post office (PO) is described. The system uses a speech
recognizer to recognize speech from a PO clerk and then
synthesizes recognized phrases in British Sign Language
(BSL) using a specially developed avatar. The main
objective in developing this prototype system was to
determine how useful it would be to a customer whose
first language was BSL, and to discover what areas of
the system required more research and development to
make it more effective. The system was evaluated by 6
prelingually profoundly deaf people and 3 PO clerks.
Deaf users and PO clerks were supportive of the system,
but the former group required a higher quality of
signing from the avatar and the latter a system that
was less constrained in the phrases it could recognize;
both these areas are being addressed in the next phase
of development.},
categories = {visicast,sign language,translation,UEA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/ijhci.pdf},
year = 2003
}
@phdthesis{frankel03:thesis,
author = {Frankel, J.},
title = {Linear dynamic models for automatic speech recognition},
school = {The Centre for Speech Technology Research,
University of Edinburgh},
abstract = {The majority of automatic speech recognition (ASR)
systems rely on hidden Markov models (HMM), in which
the output distribution associated with each state is
modelled by a mixture of diagonal covariance Gaussians.
Dynamic information is typically included by appending
time-derivatives to feature vectors. This approach,
whilst successful, makes the false assumption of
framewise independence of the augmented feature vectors
and ignores the spatial correlations in the
parametrised speech signal. This dissertation seeks to
address these shortcomings by exploring acoustic
modelling for ASR with an application of a form of
state-space model, the linear dynamic model (LDM).
Rather than modelling individual frames of data, LDMs
characterize entire segments of speech. An
auto-regressive state evolution through a continuous
space gives a Markovian model of the underlying
dynamics, and spatial correlations between feature
dimensions are absorbed into the structure of the
observation process. LDMs have been applied to speech
recognition before; however, a smoothed Gauss-Markov
form was used which ignored the potential for subspace
modelling. The continuous dynamical state means that
information is passed along the length of each segment.
Furthermore, if the state is allowed to be continuous
across segment boundaries, long range dependencies are
built into the system and the assumption of
independence of successive segments is loosened. The
state provides an explicit model of temporal
correlation which sets this approach apart from
frame-based and some segment-based models where the
ordering of the data is unimportant. The benefits of
such a model are examined both within and between
segments. LDMs are well suited to modelling smoothly
varying, continuous, yet noisy trajectories such as
those found in measured articulatory data. Using
speaker-dependent data from the MOCHA corpus, the
performance of systems which model acoustic,
articulatory, and combined acoustic-articulatory
features are compared. As well as measured articulatory
parameters, experiments use the output of neural
networks trained to perform an articulatory inversion
mapping. The speaker-independent TIMIT corpus provides
the basis for larger scale acoustic-only experiments.
Classification tasks provide an ideal means to compare
modelling choices without the confounding influence of
recognition search errors, and are used to explore
issues such as choice of state dimension, front-end
acoustic parametrization and parameter initialization.
Recognition for segment models is typically more
computationally expensive than for frame-based models.
Unlike frame-level models, it is not always possible to
share likelihood calculations for observation sequences
which occur within hypothesized segments that have
different start and end times. Furthermore, the Viterbi
criterion is not necessarily applicable at the frame
level. This work introduces a novel approach to
decoding for segment models in the form of a stack
decoder with $A^*$ search. Such a scheme allows
flexibility in the choice of acoustic and language
models since the Viterbi criterion is not integral to
the search, and hypothesis generation is independent of
the particular language model. Furthermore, the
time-asynchronous ordering of the search means that
only likely paths are extended, and so a minimum number
of models are evaluated. The decoder is used to give
full recognition results for feature-sets derived from
the MOCHA and TIMIT corpora. Conventional train/test
divisions and choice of language model are used so that
results can be directly compared to those in other
studies. The decoder is also used to implement Viterbi
training, in which model parameters are alternately
updated and then used to re-align the training data.},
categories = {am,artic,asr,ldm,mocha,timit,search,edinburgh},
month = apr,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Frankel_thesis2003.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Frankel_thesis2003.ps},
year = 2003
}
@inproceedings{shig032,
author = {Yoshinori Shiga and Simon King},
title = {Estimation of voice source and vocal tract
characteristics based on multi-frame analysis},
booktitle = {Proc. Eurospeech},
volume = 3,
pages = {1749--1752},
address = {Geneva, Switzerland},
abstract = {This paper presents a new approach for estimating
voice source and vocal tract filter characteristics of
voiced speech. When it is required to know the transfer
function of a system in signal processing, the input
and output of the system are experimentally observed
and used to calculate the function. However, in the
case of source-filter separation we deal with in this
paper, only the output (speech) is observed and the
characteristics of the system (vocal tract) and the
input (voice source) must simultaneously be estimated.
Hence the estimation becomes extremely difficult, and
the problem is usually solved approximately using
oversimplified models. We demonstrate that these characteristics are
separable under the assumption that they are
independently controlled by different factors. The
separation is realised using an iterative approximation
along with the Multi-frame Analysis method, which we
have proposed to find spectral envelopes of voiced
speech with minimum interference of the harmonic
structure.},
categories = {artic, lbg, clustering, mocha, source-filter,
edinburgh},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.ps},
year = 2003
}
@inproceedings{Tokuno2003HCII,
author = {Junko Tokuno and Naoto Akira and Mitsuru Nakai and
Hiroshi Shimodaira and Shigeki Sagayama},
title = {{Blind-handwriting Interface for Wearable Computing}},
booktitle = {Proc. of Human-Computer Interaction (HCI)
International 2003, Volume 2},
pages = {303--307},
abstract = {This paper proposes a novel input interface that we
call ``blind handwriting'' for wearable computing.
Blind handwriting, a term analogous to the ``blind
typing'' of a keyboard, is a particular writing style
in which the user does not see the pen or the finger
movement. Without visual feedback, written characters
are distorted, as in the case when the user is
blindfolded, and therefore existing on-line handwriting
recognition systems fail to recognize them correctly.
The sub-stroke based hidden Markov model approach is
employed to tackle this problem. When a pen or touch
pad is used as the input device, the proposed interface
demonstrates a recognition rate of 83\% on a test set
of 61 people, each of whom wrote 1,016 Japanese Kanji
characters.},
categories = {HWR, jaist},
month = jun,
year = 2003
}
@inproceedings{Lin03,
author = {Lincoln, M. and Cox, S.J.},
title = {A Comparison of Language Processing Techniques for a
Constrained Speech Translation System},
booktitle = {IEEE Conference on Acoustics, Speech and Signal
Processing},
address = {Hong Kong},
abstract = {A system designed to allow Post Office counter clerks
to communicate with deaf customers by translating
speech into sign language is described. The system uses
approximately 370 pre-stored phrases which may be
signed to the customer using a specially designed
avatar. The clerk is unable to memorise this number of
phrases and therefore the system attempts to map from
their input speech to the semantically equivalent
pre-stored phrase. We describe a number of language
processing techniques developed to perform the mapping,
and give results obtained using alternative
formulations of the phrases from a number of speakers.
We then give results for recognised speech input and
show how mis-recognitions affect the mapping system.
Best performance is obtained using a mapping system
based on an entropy-weighted, vector-based distance
measure between the test phrase and each of the signed
phrases.},
categories = {visicast,sign language,translation,UEA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp2003.pdf},
year = 2003
}
@book{renals-book03,
editor = {S.~Renals and G.~Grefenstette},
title = {Text and Speech Triggered Information Access},
publisher = {Springer-Verlag},
number = {2705},
series = {Lecture Notes in Computer Science},
abstract = {Edited collection of revised lectures from the
\href{http://www.ilsp.gr/testia/testia2000.html}
{ELSNET-2000 Summer School} on Text and Speech
Triggered Information Access. },
categories = {recognition,ir,ie,lm,multimodal,sheffield},
url = {http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=2705&issue=preprint},
year = 2003
}