2002.bib
@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2002-citations -ob /home/korin/projects/publications/new_output/transitdata/2002.bib -c 'year : "2002"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@article{Kawamoto2002IPSJ07,
author = {Shin-ichi Kawamoto and Hiroshi Shimodaira and others},
  title = {{Design of Software Toolkit for Anthropomorphic Spoken
Dialog Agent Software with Customization-oriented
Features}},
journal = {Information Processing Society of Japan (IPSJ) Journal},
volume = {43},
number = {7},
pages = {2249--2263},
note = {(in Japanese)},
month = jul,
year = 2002
}
@inproceedings{VTTS,
author = {Graf, H. P. and Cosatto, E. and Strom, V. and Huang,
F. J.},
title = {Visual Prosody: Facial Movements Accompanying Speech},
  booktitle = {Proc. Fifth Int. Conf. Automatic Face and Gesture
Recognition},
  pages = {397--401},
abstract = {As we articulate speech, we usually move the head and
exhibit various facial expressions. This visual aspect
               of speech aids understanding and helps to communicate
additional information, such as the speaker's mood. In
this paper we analyze quantitatively head and facial
movements that accompany speech and investigate how
they relate to the text's prosodic structure. We
recorded several hours of speech and measured the
locations of the speaker's main facial features as well
as their head poses. The text was evaluated with a
prosody prediction tool, identifying phrase boundaries
and pitch accents. Characteristic for most speakers are
simple motion patterns that are repeatedly applied in
synchrony with the main prosodic events. Direction and
strength of head movements vary widely from one speaker
to another, yet their timing is typically well
synchronized with the spoken text. Understanding
quantitatively the correlations between head movements
and spoken text is important for synthesizing
photo-realistic talking heads. Talking heads appear
much more engaging when they exhibit realistic motion
patterns.},
categories = {VTTS},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.vtts.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.vtts.ps},
year = 2002
}
@phdthesis{richmond2002,
author = {Richmond, K.},
title = {Estimating Articulatory Parameters from the Acoustic
Speech Signal},
  school = {The Centre for Speech Technology Research,
               University of Edinburgh},
abstract = {A successful method for inferring articulation from
the acoustic speech signal would find many
applications: low bit-rate speech coding, visual
representation of speech, and the possibility of
improved automatic speech recognition to name but a
few. It is unsurprising, therefore, that researchers
have been investigating the acoustic-to-articulatory
inversion mapping for several decades now. A great
variety of approaches and models have been applied to
the problem. Unfortunately, the overwhelming majority
of these attempts have faced difficulties in
satisfactorily assessing performance in terms of
genuine human articulation. However, technologies such
as electromagnetic articulography (EMA) mean that
measurement of human articulation during speech has
become increasingly accessible. Crucially, a large
corpus of acoustic-articulatory data during
phonetically-diverse, continuous speech has recently
been recorded at Queen Margaret College, Edinburgh. One
of the primary motivations of this thesis is to exploit
the availability of this remarkable resource. Among the
data-driven models which have been employed in previous
studies, the feedforward multilayer perceptron (MLP) in
particular has been used several times with promising
results. Researchers have cited advantages in terms of
memory requirement and execution speed as a significant
factor motivating their use. Furthermore, the MLP is
well known as a universal function approximator; an MLP
of suitable form can in theory represent any arbitrary
mapping function. Therefore, using an MLP in
conjunction with the relatively large quantities of
acoustic-articulatory data arguably represents a
promising and useful first research step for the
current thesis, and a significant part of this thesis
is occupied with doing this. Having demonstrated an MLP
which performs well enough to provide a reasonable
baseline, we go on to critically evaluate the
suitability of the MLP for the inversion mapping. The
aim is to find ways to improve modelling accuracy
further. Considering what model of the target
articulatory domain is provided in the MLP is key in
this respect. It has been shown that the outputs of an
MLP trained with the sum-of-squares error function
approximate the mean of the target data points
conditioned on the input vector. In many situations,
this is an appropriate and sufficient solution. In
other cases, however, this conditional mean is an
inconveniently limiting model of data in the target
domain, particularly for ill-posed problems where the
mapping may be multi-valued. Substantial evidence
exists which shows that multiple articulatory
configurations are able to produce the same acoustic
signal. This means that a system intended to map from a
point in acoustic space can be faced with multiple
candidate articulatory configurations. Therefore,
despite the impressive ability of the MLP to model
mapping functions, it may prove inadequate in certain
respects for performing the acoustic-to-articulatory
inversion mapping. Mixture density networks (MDN)
provide a principled method to model arbitrary
probability density functions over the target domain,
conditioned on the input vector. In theory, therefore,
the MDN offers a superior model of the target domain
compared to the MLP. We hypothesise that this advantage
will prove beneficial in the case of the
acoustic-to-articulatory inversion mapping.
Accordingly, this thesis aims to test this hypothesis
and directly compare the performance of MDN with MLP on
exactly the same acoustic-to-articulatory inversion
task.},
categories = {artic, ann, mlp, mdn, inversion, mocha, edinburgh},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/phd_final_bound.ps},
year = 2002
}
@inproceedings{salomon:king:osborne:icslp2002,
author = {Jesper Salomon and Simon King and Miles Osborne},
title = {Framewise phone classification using support vector
machines},
  booktitle = {Proceedings of the International Conference on Spoken
Language Processing},
address = {Denver},
abstract = {We describe the use of Support Vector Machines for
phonetic classification on the TIMIT corpus. Unlike
previous work, in which entire phonemes are classified,
our system operates in a \textit{framewise} manner and
is intended for use as the front-end of a hybrid system
similar to ABBOT. We therefore avoid the problems of
classifying variable-length vectors. Our frame-level
phone classification accuracy on the complete TIMIT
test set is competitive with other results from the
literature. In addition, we address the serious problem
of \textit{scaling} Support Vector Machines by using
the Kernel Fisher Discriminant.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Salomon_King_Osborne_icslp2002.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Salomon_King_Osborne_icslp2002.ps},
year = 2002
}
@inproceedings{vepa-king-taylor_icslp02,
author = {Vepa, J. and King, S. and Taylor, P.},
title = {Objective Distance Measures for Spectral
Discontinuities in Concatenative Speech Synthesis},
booktitle = {Proc. {ICSLP}},
address = {Denver, USA},
abstract = {In unit selection based concatenative speech systems,
`join cost', which measures how well two units can be
joined together, is one of the main criteria for
selecting appropriate units from the inventory. The
ideal join cost will measure `perceived' discontinuity,
based on easily measurable spectral properties of the
units being joined, in order to ensure smooth and
natural-sounding synthetic speech. In this paper we
report a perceptual experiment conducted to measure the
correlation between `subjective' human perception and
various `objective' spectrally-based measures proposed
in the literature. Our experiments used a
state-of-the-art unit-selection text-to-speech system:
`rVoice' from Rhetorical Systems Ltd.},
categories = {join cost, distance measures, MCA, rVoice, edinburgh},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/vepa_icslp02.pdf},
year = 2002
}
@inproceedings{Tokuno2002IWFHR,
author = {Junko Tokuno and Nobuhito Inami and Shigeki Matsuda
and Mitsuru Nakai and Hiroshi Shimodaira and Shigeki
Sagayama},
title = {{Context-Dependent Substroke Model for {HMM}-based
On-line Handwriting Recognition}},
booktitle = {Proc. of IWFHR-8},
pages = {78--83},
  abstract = {This paper describes an effective modeling technique
               for the on-line recognition of cursive Kanji and
               Hiragana handwriting. Our conventional recognition
               system based on substroke HMMs (hidden Markov models)
               employs straight-type substrokes as primary models and
               has achieved a high recognition rate for carefully
               written Kanji. On the other hand, the recognition rate
               for cursive handwriting is comparatively low, since it
               consists mainly of curve strokes. Therefore, we propose a
technique of using multiple models for each substroke
by considering the substroke context, which is a
preceding substroke and a following substroke. In order
to construct these context-dependent models
efficiently, we use the SSS (Successive State
Splitting) algorithm developed in speech recognition.
               In our experiments, the recognition rate improved
from 88\% to 92\% for cursive Kanji handwritings and
from 90\% to 98\% for Hiragana handwritings.},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Tokuno2002IWFHR.pdf},
year = 2002
}
@inproceedings{Wester-icslp-02,
author = {M. Wester and J.M. Kessens and H. Strik},
title = {Goal-directed {ASR} in a multimedia indexing and
searching environment ({MUMIS})},
booktitle = {Proc. of ICSLP},
  pages = {1993--1996},
address = {Denver},
abstract = {This paper describes the contribution of automatic
speech recognition (ASR) within the framework of MUMIS
(Multimedia Indexing and Searching Environment). The
domain is football commentaries. The initial results of
carrying out ASR on Dutch and English football
commentaries are presented. We found that overall word
error rates are high, but application specific words
are recognized reasonably well. The difficulty of the
ASR task is greatly increased by the high levels of
noise present in the material.},
categories = {asr, MUMIS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/wester.2002.2.pdf},
year = 2002
}
@article{Rokui2002IPSJ07,
author = {Jun Rokui and Mitsuru Nakai and Hiroshi Shimodaira and
Shigeki Sagayama},
title = {{Speaker Normalization Using Linear Transformation of
Vocal Tract Length Based on Maximum Likelihood
Estimation}},
  journal = {Information Processing Society of Japan (IPSJ) Journal},
volume = {43},
number = {7},
pages = {2030--2037},
note = {(in Japanese)},
categories = {asr, jaist},
month = jul,
year = 2002
}
@inproceedings{Goubanova:2002,
author = {Goubanova, O.},
title = {Forms of Introduction in Map Task Dialogues: Case of
{L2} {Russian} Speakers},
booktitle = {Proc. ICSLP 2002},
address = {Denver, USA},
year = 2002
}
@article{Otsuki2002IPSJ,
author = {Tomoshi Otsuki and Naoki Saitou and Mitsuru Nakai and
Hiroshi Shimodaira and Shigeki Sagayama},
title = {{Musical Rhythm Recognition Using Hidden Markov Model}},
  journal = {Information Processing Society of Japan (IPSJ) Journal},
volume = {43},
number = {2},
note = {(in Japanese)},
month = feb,
year = 2002
}
@article{robinson-specom02,
author = {A.~J.~Robinson and G.~D.~Cook and D.~P.~W.~Ellis and
E.~Fosler-Lussier and S.~J.~Renals and
D.~A.~G.~Williams},
title = {Connectionist Speech Recognition of Broadcast News},
journal = {Speech Communication},
volume = {37},
pages = {27--45},
abstract = {This paper describes connectionist techniques for
recognition of Broadcast News. The fundamental
difference between connectionist systems and more
conventional mixture-of-Gaussian systems is that
connectionist models directly estimate posterior
probabilities as opposed to likelihoods. Access to
posterior probabilities has enabled us to develop a
number of novel approaches to confidence estimation,
pronunciation modelling and search. In addition we have
investigated a new feature extraction technique based
on the modulation-filtered spectrogram, and methods for
combining multiple information sources. We have
incorporated all of these techniques into a system for
the transcription of Broadcast News, and we present
results on the 1998 DARPA Hub-4E Broadcast News
evaluation data.},
categories = {sprach,bnews,recognition,am,hybrid,abbot,lm,search,pron,eval,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/specom02-preprint.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/specom02-preprint.ps.gz},
year = 2002
}
@article{Wright-Hastie_2002,
author = {Helen Wright-Hastie and Massimo Poesio and Stephen
Isard},
title = {Automatically predicting dialogue structure using
prosodic features},
journal = {Speech Communication},
volume = 36,
number = {1-2},
  pages = {63--79},
categories = {dialogue, prosody, recognition},
year = 2002
}
@inproceedings{Cox02d,
  author = {Cox, S. J. and Lincoln, M. and Tryggvason, J. and
               Nakisa, M. and Wells, M. and Tutt, M. and Abbott, S.},
title = {{TESSA}, a system to aid communication with deaf
people},
booktitle = {ASSETS 2002, Fifth International {ACM SIGCAPH}
Conference on Assistive Technologies},
  pages = {205--212},
address = {Edinburgh, Scotland},
abstract = {{TESSA} is an experimental system that aims to aid
transactions between a deaf person and a clerk in a
               Post Office by translating the clerk's speech to sign
language. A speech recogniser recognises speech from
the clerk and the system then synthesizes the
appropriate sequence of signs in British Sign language
               (BSL) using a specially-developed avatar. By using a
phrase lookup approach to language translation, which
is appropriate for the highly constrained discourse in
a Post Office, we were able to build a working system
that we could evaluate. We summarise the results of
               this evaluation (undertaken by deaf users and Post
               Office clerks), and discuss how the findings from the
               evaluation are being used in the development of an
               improved system.},
categories = {visicast,sign language,translation,UEA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Cox-Assets-2000.pdf},
year = 2002
}
@inproceedings{pietquin-icassp02,
author = {O.~Pietquin and S.~Renals},
title = {{ASR} system modeling for automatic evaluation and
optimization of dialogue systems},
  booktitle = {Proc. IEEE ICASSP},
pages = {46--49},
abstract = {Though the field of spoken dialogue systems has
developed quickly in the last decade, rapid design of
               dialogue strategies remains difficult. Several approaches
to the problem of automatic strategy learning have been
proposed and the use of Reinforcement Learning
introduced by Levin and Pieraccini is becoming part of
the state of the art in this area. However, the quality
of the strategy learned by the system depends on the
definition of the optimization criterion and on the
accuracy of the environment model. In this paper, we
               propose to bring a model of an ASR system into the
               simulated environment in order to enhance the learned
               strategy. To do so, we introduce recognition error
rates and confidence levels produced by ASR systems in
the optimization criterion.},
categories = {dialog,rl,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/icassp02-rl.pdf},
year = 2002
}
@inproceedings{Matsushita2002HIS03,
  author = {Yoshinori Matsushita and Shin-ichi Kawamoto and
Mitsuru Nakai and Hiroshi Shimodaira and Shigeki
Sagayama},
title = {{A Head-Behavior Synchronization Model with Utterance
for Anthropomorphic Spoken-Dialog Agent}},
booktitle = {Technical Report of IEICE, HIS2001},
note = {(in Japanese)},
  abstract = {A novel method is proposed for synthesizing the head
               motion of an anthropomorphic spoken-dialog agent in
               synchrony with its utterance. Although much effort has
               been devoted to synchronizing lip motion with the
               utterance, very little research exists on such
               head-motion control. A neural network is employed to
               learn the relationship between the acoustic features
               of the utterance and the head motion, which are
               measured by a motion-capture system. The proposed
               method makes it possible to automatically synthesize
               facial animation that moves synchronously with any
               given utterance. A subjective evaluation of the
               method's performance is also reported.},
categories = {lifelike-agent, jaist},
month = mar,
year = 2002
}
@inproceedings{wan-icassp02,
author = {V.~Wan and S.~Renals},
title = {Evaluation of Kernel Methods for Speaker Verification
and Identification},
  booktitle = {Proc. IEEE ICASSP},
pages = {669--672},
abstract = {Support vector machines are evaluated on speaker
verification and speaker identification tasks. We
compare the polynomial kernel, the Fisher kernel, a
likelihood ratio kernel and the pair hidden Markov
model kernel with baseline systems based on a
discriminative polynomial classifier and generative
Gaussian mixture model classifiers. Simulations were
carried out on the YOHO database and some promising
results were obtained.},
categories = {verification,kernel,svm,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/icassp02-svm.pdf},
year = 2002
}
@mastersthesis{calhoun:02,
author = {Calhoun, Sasha},
title = {Using Prosody in {ASR}: the Segmentation of Broadcast
Radio News},
school = {University of Edinburgh},
abstract = {This study explores how prosodic information can be
used in Automatic Speech Recognition (ASR). A system
was built which automatically identifies topic
boundaries in a corpus of broadcast radio news. We
evaluate the effectiveness of different types of
features, including textual, durational, F0, Tilt and
ToBI features in that system. These features were
suggested by a review of the literature on how topic
structure is indicated by humans and recognised by both
humans and machines from both a linguistic and natural
language processing standpoint. In particular, we
               investigate whether acoustic cues to prosodic
information can be used directly to indicate topic
structure, or whether it is better to derive discourse
structure from intonational events, such as ToBI
events, in a manner suggested by Steedman's (2000)
theory, among others. It was found that the global
properties of an utterance (mean and maximum F0) and
textual features (based on Hearst's (1997) lexical
scores and cue phrases) were effective in recognising
topic boundaries on their own whereas all other
features investigated were not. Performance using Tilt
and ToBI features was disappointing, although this
could have been because of inaccuracies in estimating
               these parameters. We suggest that different
acoustic cues to prosody are more effective in
recognising discourse information at certain levels of
discourse structure than others. The identification of
higher level structure is informed by the properties of
lower level structure. Although the findings of this
study were not conclusive on this issue, we propose
that prosody in ASR and synthesis should be represented
in terms of the intonational events relevant to each
level of discourse structure. Further, at the level of
topic structure, a taxonomy of events is needed to
describe the global F0 properties of each utterance
               that makes up that structure.},
categories = {prosody, automatic topic segmentation, broadcast news,
prosodic cues, textual cues},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/sasha_calhoun.pdf},
year = 2002
}
@inproceedings{Shimodaira2002ICASSP,
author = {Hiroshi Shimodaira and Nobuyoshi Sakai and Mitsuru
Nakai and Shigeki Sagayama},
title = {{Jacobian Joint Adaptation to Noise, Channel and Vocal
Tract Length}},
booktitle = {Proc. of ICASSP2002},
pages = {197--200},
  abstract = {This paper proposes a new Jacobian approach that
               linearly decomposes the composite of additive noise,
               multiplicative noise (channel transfer function) and
               the speaker's vocal tract length, and adapts the
               acoustic model parameters to these factors
               simultaneously. Because these factors degrade the
               observed features for speech recognition
               non-linearly, existing approaches fail to adapt the
               acoustic models adequately. Approximating the
               non-linear operation with a linear model makes it
               possible to estimate the factors by least squares and
               to adapt the acoustic model parameters from a small
               amount of speech data. Speech recognition experiments
               on the ATR isolated-word database demonstrate a
               significant reduction in error rates, which supports
               the effectiveness of the proposed scheme.},
categories = {asr, jaist},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Shimodaira2002ICASSP.pdf},
year = 2002
}
@inproceedings{strom02,
author = {V. Strom},
title = {From Text to Speech Without {ToBI}},
booktitle = {Proc. ICSLP},
address = {Denver},
abstract = {A new method for predicting prosodic parameters, i.e.
phone durations and F0 targets, from preprocessed text
is presented. The prosody model comprises a set of
CARTs, which are learned from a large database of
labeled speech. This database need not be annotated
with Tone and Break Indices (ToBI labels). Instead, a
simpler symbolic prosodic description is created by a
               bootstrapping method. The method has been applied to
one Spanish and two German speakers. For the German
voices, two listening tests showed a significant
preference for the new method over a more traditional
approach of prosody prediction, based on hand-crafted
rules.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.icslp02.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.icslp02.ps},
year = 2002
}
@mastersthesis{Couper-02,
author = {Couper, Fiona},
title = {Switching linear dynamical models for automatic speech
recognition},
school = {University of Edinburgh},
abstract = {The field of speech recognition research has been
dominated by the Hidden Markov Model (HMM) in recent
years. The HMM has known weaknesses, such as the strong
``independence assumption'' which presumes observations
to be uncorrelated. New types of statistical modelling
are now being investigated to overcome the weaknesses
of HMMs. One such model is the Linear Dynamical Model
(LDM), whose properties are more appropriate to speech.
Modelling phone segments with LDMs gives fairly good
classification and recognition scores, and this report
explores possible extensions to a system using such
models. Training only one model per phone cannot fully
model variation that exists in speech, and perhaps
training more than one model for some segments will
improve accuracy scores. This is investigated here, and
four methods for building two models instead of one for
any phone are presented. Three of the methods produce
significantly increased classification accuracy scores,
compared to a set of single models.},
categories = {asr},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2002/couper_msc.pdf},
year = 2002
}
@phdthesis{Wester-02,
author = {Mirjam Wester},
title = {Pronunciation Variation Modeling for {D}utch Automatic
Speech Recognition},
school = {University of Nijmegen},
abstract = {This thesis consists of an introductory review to
pronunciation variation modeling, followed by four
papers in which the PhD research is described.},
categories = {asr, pm, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/thesis.pdf},
year = 2002
}
@inproceedings{vepa-king-taylor_ieee02,
author = {Vepa, J. and King, S. and Taylor, P.},
title = {New Objective Distance Measures for Spectral
Discontinuities in Concatenative Speech Synthesis},
  booktitle = {Proc. {IEEE} 2002 Workshop on Speech Synthesis},
address = {Santa Monica, USA},
abstract = {The quality of unit selection based concatenative
speech synthesis mainly depends on how well two
successive units can be joined together to minimise the
audible discontinuities. The objective measure of
discontinuity used when selecting units is known as the
`join cost'. The ideal join cost will measure
`perceived' discontinuity, based on easily measurable
spectral properties of the units being joined, in order
to ensure smooth and natural-sounding synthetic speech.
In this paper we describe a perceptual experiment
conducted to measure the correlation between
`subjective' human perception and various `objective'
spectrally-based measures proposed in the literature.
               We also report new objective distance measures derived
               from various distance metrics based on these spectral
               features, which correlate well with human perception
               of concatenation discontinuities. Our
               experiments used a state-of-the-art unit-selection
text-to-speech system: `rVoice' from Rhetorical Systems
Ltd.},
categories = {join cost, weighted distances, MCA, rVoice, edinburgh},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/vepa_tts02.pdf},
year = 2002
}
@inproceedings{Nakai2002ICPR,
author = {Mitsuru Nakai and Takashi Sudo and Hiroshi Shimodaira
and Shigeki Sagayama},
title = {{Pen Pressure Features for Writer-Independent On-Line
Handwriting Recognition Based on Substroke {HMM}}},
booktitle = {Proc. of ICPR2002, III},
pages = {220--223},
categories = {hwr, jaist},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Nakai2002ICPR.pdf},
year = 2002
}
@inproceedings{mayoturkwatson:02,
author = {Mayo, C. and Turk, A. and Watson, J.},
title = {Development of cue weighting strategies in children's
speech perception},
booktitle = {Proceedings of TIPS: Temporal Integration in the
Perception of Speech, Aix-en-Provence},
categories = {speech perception, development, cue weighting},
year = 2002
}
@inproceedings{Keeni2002AIA,
author = {Kanad Keeni and Hiroshi Shimodaira},
title = {{On Selection of Training Data for Fast Learning of
Neural Networks Using Back Propagation}},
booktitle = {IASTED International Conference on Artificial
               Intelligence and Applications (AIA2002)},
pages = {474--478},
month = sep,
year = 2002
}
@inproceedings{Kawamoto2002PRICAI,
author = {Shin-ichi Kawamoto and Hiroshi Shimodaira and Tsuneo
Nitta and Takuya Nishimoto and Satoshi Nakamura and
Katsunobu Itou and Shigeo Morishima and Tatsuo
Yotsukura and Atsuhiko Kai and Akinobu Lee and Yoichi
Yamashita and Takao Kobayashi and Keiichi Tokuda and
Keikichi Hirose and Nobuaki Minematsu and Atsushi
Yamada and Yasuharu Den and Takehito Utsuro and Shigeki
Sagayama},
title = {{Open-source software for developing anthropomorphic
spoken dialog agent}},
booktitle = {Proc. of PRICAI-02, International Workshop on Lifelike
Animated Agents},
pages = {64--69},
categories = {lifelike-agent, jaist},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Kawamoto2002PRICAI.pdf},
year = 2002
}
@inproceedings{strom02a,
author = {Juergen Schroeter and Alistair Conkie and Ann Syrdal
and Mark Beutnagel and Matthias Jilka and Volker Strom
and Yeon-Jun Kim and Hong-Goo Kang and David Kapilow},
  title = {A perspective on the next challenges for {TTS}},
  booktitle = {IEEE 2002 Workshop on Speech Synthesis},
  pages = {11--13},
address = {Santa Monica, CA},
abstract = {The quality of speech synthesis has come a long way
since Homer Dudley's ``Vocoder'' in 1939. In fact, with
               the widespread use of unit-selection synthesizers, the
naturalness of the synthesized speech is now high
enough to pass the Turing test for short utterances,
such as prompts. Therefore, it seems valid to ask the
question ``what are the next challenges for TTS
               Research?'' This paper tries to identify unresolved
issues, the solution of which would greatly enhance the
state of the art in TTS.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/strom02a.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/strom02a.ps},
year = 2002
}
@inproceedings{Takeda2002MMSP,
author = {Haruto Takeda and Naoki Saito and Tomoshi Otsuki and
Mitsuru Nakai and Hiroshi Shimodaira and Shigeki
Sagayama},
  title = {{Hidden Markov Model for Automatic Transcription of
MIDI Signals}},
booktitle = {2002 International Workshop on Multimedia Signal
Processing},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Takeda2002MMSP12.pdf},
year = 2002
}