2002.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2002-citations -ob /home/korin/projects/publications/new_output/transitdata/2002.bib -c 'year : "2002"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@article{Kawamoto2002IPSJ07,
  author = {Shin-ichi Kawamoto and Hiroshi Shimodaira and others},
  title = {{Design of Software Toolkit for Anthropomorphic Spoken
                   Dialog Agent Software with Customization-oriented
                   Features}},
  journal = {Information Processing Society of Japan (IPSJ) Journal},
  volume = {43},
  number = {7},
  pages = {2249--2263},
  note = {(in Japanese)},
  month = jul,
  year = 2002
}
@inproceedings{VTTS,
  author = {Graf, H. P. and Cosatto, E. and Strom, V. and Huang,
                   F. J.},
  title = {Visual Prosody: Facial Movements Accompanying Speech},
  booktitle = {Proc. Fifth Int. Conf. Automatic Face and Gesture
                   Recognition},
  pages = {397--401},
  abstract = {As we articulate speech, we usually move the head and
                   exhibit various facial expressions. This visual aspect
                   of speech aids understanding and helps to communicate
                   additional information, such as the speaker's mood. In
                   this paper we analyze quantitatively head and facial
                   movements that accompany speech and investigate how
                   they relate to the text's prosodic structure. We
                   recorded several hours of speech and measured the
                   locations of the speaker's main facial features as well
                   as their head poses. The text was evaluated with a
                   prosody prediction tool, identifying phrase boundaries
                   and pitch accents. Characteristic of most speakers are
                   simple motion patterns that are repeatedly applied in
                   synchrony with the main prosodic events. Direction and
                   strength of head movements vary widely from one speaker
                   to another, yet their timing is typically well
                   synchronized with the spoken text. Understanding
                   quantitatively the correlations between head movements
                   and spoken text is important for synthesizing
                   photo-realistic talking heads. Talking heads appear
                   much more engaging when they exhibit realistic motion
                   patterns.},
  categories = {VTTS},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.vtts.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.vtts.ps},
  year = 2002
}
@phdthesis{richmond2002,
  author = {Richmond, K.},
  title = {Estimating Articulatory Parameters from the Acoustic
                   Speech Signal},
  school = {The Centre for Speech Technology Research, University
                   of Edinburgh},
  abstract = {A successful method for inferring articulation from
                   the acoustic speech signal would find many
                   applications: low bit-rate speech coding, visual
                   representation of speech, and the possibility of
                   improved automatic speech recognition to name but a
                   few. It is unsurprising, therefore, that researchers
                   have been investigating the acoustic-to-articulatory
                   inversion mapping for several decades now. A great
                   variety of approaches and models have been applied to
                   the problem. Unfortunately, the overwhelming majority
                   of these attempts have faced difficulties in
                   satisfactorily assessing performance in terms of
                   genuine human articulation. However, technologies such
                   as electromagnetic articulography (EMA) mean that
                   measurement of human articulation during speech has
                   become increasingly accessible. Crucially, a large
                   corpus of acoustic-articulatory data during
                   phonetically-diverse, continuous speech has recently
                   been recorded at Queen Margaret College, Edinburgh. One
                   of the primary motivations of this thesis is to exploit
                   the availability of this remarkable resource. Among the
                   data-driven models which have been employed in previous
                   studies, the feedforward multilayer perceptron (MLP) in
                   particular has been used several times with promising
                   results. Researchers have cited advantages in terms of
                   memory requirement and execution speed as a significant
                   factor motivating their use. Furthermore, the MLP is
                   well known as a universal function approximator; an MLP
                   of suitable form can in theory represent any arbitrary
                   mapping function. Therefore, using an MLP in
                   conjunction with the relatively large quantities of
                   acoustic-articulatory data arguably represents a
                   promising and useful first research step for the
                   current thesis, and a significant part of this thesis
                   is occupied with doing this. Having demonstrated an MLP
                   which performs well enough to provide a reasonable
                   baseline, we go on to critically evaluate the
                   suitability of the MLP for the inversion mapping. The
                   aim is to find ways to improve modelling accuracy
                   further. A key consideration in this respect is what
                   model of the target articulatory domain the MLP
                   provides. It has been shown that the outputs of an
                   MLP trained with the sum-of-squares error function
                   approximate the mean of the target data points
                   conditioned on the input vector. In many situations,
                   this is an appropriate and sufficient solution. In
                   other cases, however, this conditional mean is an
                   inconveniently limiting model of data in the target
                   domain, particularly for ill-posed problems where the
                   mapping may be multi-valued. Substantial evidence
                   exists which shows that multiple articulatory
                   configurations are able to produce the same acoustic
                   signal. This means that a system intended to map from a
                   point in acoustic space can be faced with multiple
                   candidate articulatory configurations. Therefore,
                   despite the impressive ability of the MLP to model
                   mapping functions, it may prove inadequate in certain
                   respects for performing the acoustic-to-articulatory
                   inversion mapping. Mixture density networks (MDN)
                   provide a principled method to model arbitrary
                   probability density functions over the target domain,
                   conditioned on the input vector. In theory, therefore,
                   the MDN offers a superior model of the target domain
                   compared to the MLP. We hypothesise that this advantage
                   will prove beneficial in the case of the
                   acoustic-to-articulatory inversion mapping.
                   Accordingly, this thesis aims to test this hypothesis
                   and directly compare the performance of MDN with MLP on
                   exactly the same acoustic-to-articulatory inversion
                   task.},
  categories = {artic, ann, mlp, mdn, inversion, mocha, edinburgh},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/phd_final_bound.ps},
  year = 2002
}
@inproceedings{salomon:king:osborne:icslp2002,
  author = {Jesper Salomon and Simon King and Miles Osborne},
  title = {Framewise phone classification using support vector
                   machines},
  booktitle = {Proceedings International Conference on Spoken
                   Language Processing},
  address = {Denver},
  abstract = {We describe the use of Support Vector Machines for
                   phonetic classification on the TIMIT corpus. Unlike
                   previous work, in which entire phonemes are classified,
                   our system operates in a \textit{framewise} manner and
                   is intended for use as the front-end of a hybrid system
                   similar to ABBOT. We therefore avoid the problems of
                   classifying variable-length vectors. Our frame-level
                   phone classification accuracy on the complete TIMIT
                   test set is competitive with other results from the
                   literature. In addition, we address the serious problem
                   of \textit{scaling} Support Vector Machines by using
                   the Kernel Fisher Discriminant.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Salomon_King_Osborne_icslp2002.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Salomon_King_Osborne_icslp2002.ps},
  year = 2002
}
@inproceedings{vepa-king-taylor_icslp02,
  author = {Vepa, J. and King, S. and Taylor, P.},
  title = {Objective Distance Measures for Spectral
                   Discontinuities in Concatenative Speech Synthesis},
  booktitle = {Proc. {ICSLP}},
  address = {Denver, USA},
  abstract = {In unit selection based concatenative speech systems,
                   `join cost', which measures how well two units can be
                   joined together, is one of the main criteria for
                   selecting appropriate units from the inventory. The
                   ideal join cost will measure `perceived' discontinuity,
                   based on easily measurable spectral properties of the
                   units being joined, in order to ensure smooth and
                   natural-sounding synthetic speech. In this paper we
                   report a perceptual experiment conducted to measure the
                   correlation between `subjective' human perception and
                   various `objective' spectrally-based measures proposed
                   in the literature. Our experiments used a
                   state-of-the-art unit-selection text-to-speech system:
                   `rVoice' from Rhetorical Systems Ltd.},
  categories = {join cost, distance measures, MCA, rVoice, edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/vepa_icslp02.pdf},
  year = 2002
}
@inproceedings{Tokuno2002IWFHR,
  author = {Junko Tokuno and Nobuhito Inami and Shigeki Matsuda
                   and Mitsuru Nakai and Hiroshi Shimodaira and Shigeki
                   Sagayama},
  title = {{Context-Dependent Substroke Model for {HMM}-based
                   On-line Handwriting Recognition}},
  booktitle = {Proc. of IWFHR-8},
  pages = {78--83},
  abstract = {This paper describes an effective modeling technique
                   for the on-line recognition of cursive Kanji and
                   Hiragana handwriting. Our conventional recognition
                   system based on substroke HMMs (hidden Markov models)
                   employs straight-type substrokes as primary models and
                   has achieved a high recognition rate on carefully
                   written Kanji. On the other hand, the recognition rate
                   for cursive handwriting is comparatively low, since it
                   consists mainly of curved strokes. We therefore
                   propose a technique of using multiple models for each
                   substroke by taking the substroke context, i.e. the
                   preceding and the following substroke, into account.
                   In order to construct these context-dependent models
                   efficiently, we use the SSS (Successive State
                   Splitting) algorithm developed in speech recognition.
                   In our experiments, the recognition rate improved from
                   88\% to 92\% for cursive Kanji handwriting and from
                   90\% to 98\% for Hiragana handwriting.},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Tokuno2002IWFHR.pdf},
  year = 2002
}
@inproceedings{Wester-icslp-02,
  author = {M. Wester and J.M. Kessens and H. Strik},
  title = {Goal-directed {ASR} in a multimedia indexing and
                   searching environment ({MUMIS})},
  booktitle = {Proc. of ICSLP},
  pages = {1993--1996},
  address = {Denver},
  abstract = {This paper describes the contribution of automatic
                   speech recognition (ASR) within the framework of MUMIS
                   (Multimedia Indexing and Searching Environment). The
                   domain is football commentaries. The initial results of
                   carrying out ASR on Dutch and English football
                   commentaries are presented. We found that overall word
                   error rates are high, but application specific words
                   are recognized reasonably well. The difficulty of the
                   ASR task is greatly increased by the high levels of
                   noise present in the material.},
  categories = {asr, MUMIS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/wester.2002.2.pdf},
  year = 2002
}
@article{Rokui2002IPSJ07,
  author = {Jun Rokui and Mitsuru Nakai and Hiroshi Shimodaira and
                   Shigeki Sagayama},
  title = {{Speaker Normalization Using Linear Transformation of
                   Vocal Tract Length Based on Maximum Likelihood
                   Estimation}},
  journal = {Information Processing Society of Japan (IPSJ) Journal},
  volume = {43},
  number = {7},
  pages = {2030--2037},
  note = {(in Japanese)},
  categories = {asr, jaist},
  month = jul,
  year = 2002
}
@inproceedings{Goubanova:2002,
  author = {Goubanova, O.},
  title = {Forms of Introduction in Map Task Dialogues: Case of
                   {L2} {Russian} Speakers},
  booktitle = {Proc. ICSLP 2002},
  address = {Denver, USA},
  year = 2002
}
@article{Otsuki2002IPSJ,
  author = {Tomoshi Otsuki and Naoki Saitou and Mitsuru Nakai and
                   Hiroshi Shimodaira and Shigeki Sagayama},
  title = {{Musical Rhythm Recognition Using Hidden Markov Model}},
  journal = {Information Processing Society of Japan (IPSJ) Journal},
  volume = {43},
  number = {2},
  note = {(in Japanese)},
  month = feb,
  year = 2002
}
@article{robinson-specom02,
  author = {A.~J.~Robinson and G.~D.~Cook and D.~P.~W.~Ellis and
                   E.~Fosler-Lussier and S.~J.~Renals and
                   D.~A.~G.~Williams},
  title = {Connectionist Speech Recognition of Broadcast News},
  journal = {Speech Communication},
  volume = {37},
  pages = {27--45},
  abstract = {This paper describes connectionist techniques for
                   recognition of Broadcast News. The fundamental
                   difference between connectionist systems and more
                   conventional mixture-of-Gaussian systems is that
                   connectionist models directly estimate posterior
                   probabilities as opposed to likelihoods. Access to
                   posterior probabilities has enabled us to develop a
                   number of novel approaches to confidence estimation,
                   pronunciation modelling and search. In addition we have
                   investigated a new feature extraction technique based
                   on the modulation-filtered spectrogram, and methods for
                   combining multiple information sources. We have
                   incorporated all of these techniques into a system for
                   the transcription of Broadcast News, and we present
                   results on the 1998 DARPA Hub-4E Broadcast News
                   evaluation data.},
  categories = {sprach,bnews,recognition,am,hybrid,abbot,lm,search,pron,eval,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/specom02-preprint.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/specom02-preprint.ps.gz},
  year = 2002
}
@article{Wright-Hastie_2002,
  author = {Helen Wright-Hastie and Massimo Poesio and Stephen
                   Isard},
  title = {Automatically predicting dialogue structure using
                   prosodic features},
  journal = {Speech Communication},
  volume = 36,
  number = {1--2},
  pages = {63--79},
  categories = {dialogue, prosody, recognition},
  year = 2002
}
@inproceedings{Cox02d,
  author = {Cox, S.J. and Lincoln, M. and Tryggvason, J. and
                   Nakisa, M. and Wells, M. and Tutt, M. and Abbott, S.},
  title = {{TESSA}, a system to aid communication with deaf
                   people},
  booktitle = {ASSETS 2002, Fifth International {ACM SIGCAPH}
                   Conference on Assistive Technologies},
  pages = {205--212},
  address = {Edinburgh, Scotland},
  abstract = {{TESSA} is an experimental system that aims to aid
                   transactions between a deaf person and a clerk in a
                   Post Office by translating the clerk's speech into
                   sign language. A speech recogniser recognises speech
                   from the clerk and the system then synthesizes the
                   appropriate sequence of signs in British Sign Language
                   (BSL) using a specially developed avatar. By using a
                   phrase-lookup approach to language translation, which
                   is appropriate for the highly constrained discourse in
                   a Post Office, we were able to build a working system
                   that we could evaluate. We summarise the results of
                   this evaluation (undertaken by deaf users and Post
                   Office clerks), and discuss how the findings from the
                   evaluation are being used in the development of an
                   improved system.},
  categories = {visicast,sign language,translation,UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Cox-Assets-2000.pdf},
  year = 2002
}
@inproceedings{pietquin-icassp02,
  author = {O.~Pietquin and S.~Renals},
  title = {{ASR} system modeling for automatic evaluation and
                   optimization of dialogue systems},
  booktitle = {Proc IEEE ICASSP},
  pages = {46--49},
  abstract = {Though the field of spoken dialogue systems has
                   developed quickly in the last decade, rapid design of
                   dialogue strategies remains difficult. Several approaches
                   to the problem of automatic strategy learning have been
                   proposed and the use of Reinforcement Learning
                   introduced by Levin and Pieraccini is becoming part of
                   the state of the art in this area. However, the quality
                   of the strategy learned by the system depends on the
                   definition of the optimization criterion and on the
                   accuracy of the environment model. In this paper, we
                   propose to bring a model of an ASR system into the
                   simulated environment in order to enhance the learned
                   strategy. To do so, we introduce recognition error
                   rates and confidence levels produced by ASR systems in
                   the optimization criterion.},
  categories = {dialog,rl,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/icassp02-rl.pdf},
  year = 2002
}
@inproceedings{Matsushita2002HIS03,
  author = {Yoshinori Matsushita and Shin-ichi Kawamoto and
                   Mitsuru Nakai and Hiroshi Shimodaira and Shigeki
                   Sagayama},
  title = {{A Head-Behavior Synchronization Model with Utterance
                   for Anthropomorphic Spoken-Dialog Agent}},
  booktitle = {Technical Report of IEICE, HIS2001},
  note = {(in Japanese)},
  abstract = {A novel method is proposed for synthesizing the head
                   motion of an anthropomorphic spoken dialog agent in
                   synchrony with its utterance. Although much effort has
                   been devoted to synchronizing lip motion with
                   utterance, very little research exists on such
                   head-motion control. A neural network is employed to
                   learn the relationship between the acoustic features
                   of the utterance and the head motion, both measured
                   with a motion-capturing system. The proposed method
                   makes it possible to automatically generate facial
                   animation that moves in synchrony with any given
                   utterance. A subjective evaluation of the performance
                   of the method is also reported.},
  categories = {lifelike-agent, jaist},
  month = mar,
  year = 2002
}
@inproceedings{wan-icassp02,
  author = {V.~Wan and S.~Renals},
  title = {Evaluation of Kernel Methods for Speaker Verification
                   and Identification},
  booktitle = {Proc IEEE ICASSP},
  pages = {669--672},
  abstract = {Support vector machines are evaluated on speaker
                   verification and speaker identification tasks. We
                   compare the polynomial kernel, the Fisher kernel, a
                   likelihood ratio kernel and the pair hidden Markov
                   model kernel with baseline systems based on a
                   discriminative polynomial classifier and generative
                   Gaussian mixture model classifiers. Simulations were
                   carried out on the YOHO database and some promising
                   results were obtained.},
  categories = {verification,kernel,svm,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/icassp02-svm.pdf},
  year = 2002
}
@mastersthesis{calhoun:02,
  author = {Calhoun, Sasha},
  title = {Using Prosody in {ASR}: the Segmentation of Broadcast
                   Radio News},
  school = {University of Edinburgh},
  abstract = {This study explores how prosodic information can be
                   used in Automatic Speech Recognition (ASR). A system
                   was built which automatically identifies topic
                   boundaries in a corpus of broadcast radio news. We
                   evaluate the effectiveness of different types of
                   features, including textual, durational, F0, Tilt and
                   ToBI features in that system. These features were
                   suggested by a review of the literature on how topic
                   structure is indicated by humans and recognised by both
                   humans and machines from both a linguistic and natural
                   language processing standpoint. In particular, we
                   investigate whether acoustic cues to prosodic
                   information can be used directly to indicate topic
                   structure, or whether it is better to derive discourse
                   structure from intonational events, such as ToBI
                   events, in a manner suggested by Steedman's (2000)
                   theory, among others. It was found that the global
                   properties of an utterance (mean and maximum F0) and
                   textual features (based on Hearst's (1997) lexical
                   scores and cue phrases) were effective in recognising
                   topic boundaries on their own whereas all other
                   features investigated were not. Performance using Tilt
                   and ToBI features was disappointing, although this
                   could have been because of inaccuracies in estimating
                   these parameters. We suggest that different
                   acoustic cues to prosody are more effective in
                   recognising discourse information at certain levels of
                   discourse structure than others. The identification of
                   higher level structure is informed by the properties of
                   lower level structure. Although the findings of this
                   study were not conclusive on this issue, we propose
                   that prosody in ASR and synthesis should be represented
                   in terms of the intonational events relevant to each
                   level of discourse structure. Further, at the level of
                   topic structure, a taxonomy of events is needed to
                   describe the global F0 properties of each utterance
                   that makes up that structure.},
  categories = {prosody, automatic topic segmentation, broadcast news,
                   prosodic cues, textual cues},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/sasha_calhoun.pdf},
  year = 2002
}
@inproceedings{Shimodaira2002ICASSP,
  author = {Hiroshi Shimodaira and Nobuyoshi Sakai and Mitsuru
                   Nakai and Shigeki Sagayama},
  title = {{Jacobian Joint Adaptation to Noise, Channel and Vocal
                   Tract Length}},
  booktitle = {Proc. of ICASSP2002},
  pages = {197--200},
  abstract = {This paper proposes a new Jacobian approach that
                   linearly decomposes the composite of additive noise,
                   multiplicative noise (the channel transfer function)
                   and the speaker's vocal tract length, and adapts the
                   acoustic model parameters simultaneously to these
                   factors. Because these factors degrade the observed
                   features for speech recognition non-linearly, existing
                   approaches fail to adapt the acoustic models
                   adequately. Approximating the non-linear operation by
                   a linear model makes it possible to employ
                   least-squares estimation of the factors and to adapt
                   the acoustic model parameters from a small amount of
                   speech data. Speech recognition experiments on the ATR
                   isolated-word database demonstrate a significant
                   reduction in error rates, which supports the
                   effectiveness of the proposed scheme.},
  categories = {asr, jaist},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Shimodaira2002ICASSP.pdf},
  year = 2002
}
@inproceedings{strom02,
  author = {V. Strom},
  title = {From Text to Speech Without {ToBI}},
  booktitle = {Proc. ICSLP},
  address = {Denver},
  abstract = {A new method for predicting prosodic parameters, i.e.
                   phone durations and F0 targets, from preprocessed text
                   is presented. The prosody model comprises a set of
                   CARTs, which are learned from a large database of
                   labeled speech. This database need not be annotated
                   with Tone and Break Indices (ToBI labels). Instead, a
                   simpler symbolic prosodic description is created by a
                   bootstrapping method. The method has been applied to
                   one Spanish and two German speakers. For the German
                   voices, two listening tests showed a significant
                   preference for the new method over a more traditional
                   approach of prosody prediction, based on hand-crafted
                   rules.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.icslp02.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.icslp02.ps},
  year = 2002
}
@mastersthesis{Couper-02,
  author = {Couper, Fiona},
  title = {Switching linear dynamical models for automatic speech
                   recognition},
  school = {University of Edinburgh},
  abstract = {The field of speech recognition research has been
                   dominated by the Hidden Markov Model (HMM) in recent
                   years. The HMM has known weaknesses, such as the strong
                   ``independence assumption'' which presumes observations
                   to be uncorrelated. New types of statistical modelling
                   are now being investigated to overcome the weaknesses
                   of HMMs. One such model is the Linear Dynamical Model
                   (LDM), whose properties are more appropriate to speech.
                   Modelling phone segments with LDMs gives fairly good
                   classification and recognition scores, and this report
                   explores possible extensions to a system using such
                   models. Training only one model per phone cannot fully
                   model variation that exists in speech, and perhaps
                   training more than one model for some segments will
                   improve accuracy scores. This is investigated here, and
                   four methods for building two models instead of one for
                   any phone are presented. Three of the methods produce
                   significantly increased classification accuracy scores,
                   compared to a set of single models.},
  categories = {asr},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2002/couper_msc.pdf},
  year = 2002
}
@phdthesis{Wester-02,
  author = {Mirjam Wester},
  title = {Pronunciation Variation Modeling for {D}utch Automatic
                   Speech Recognition},
  school = {University of Nijmegen},
  abstract = {This thesis consists of an introductory review to
                   pronunciation variation modeling, followed by four
                   papers in which the PhD research is described.},
  categories = {asr, pm, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/thesis.pdf},
  year = 2002
}
@inproceedings{vepa-king-taylor_ieee02,
  author = {Vepa, J. and King, S. and Taylor, P.},
  title = {New Objective Distance Measures for Spectral
                   Discontinuities in Concatenative Speech Synthesis},
  booktitle = {Proc. {IEEE} 2002 Workshop on Speech Synthesis},
  address = {Santa Monica, USA},
  abstract = {The quality of unit selection based concatenative
                   speech synthesis mainly depends on how well two
                   successive units can be joined together to minimise the
                   audible discontinuities. The objective measure of
                   discontinuity used when selecting units is known as the
                   `join cost'. The ideal join cost will measure
                   `perceived' discontinuity, based on easily measurable
                   spectral properties of the units being joined, in order
                   to ensure smooth and natural-sounding synthetic speech.
                   In this paper we describe a perceptual experiment
                   conducted to measure the correlation between
                   `subjective' human perception and various `objective'
                   spectrally-based measures proposed in the literature.
                   We also report new objective distance measures,
                   derived from various distance metrics based on these
                   spectral features, which correlate well with human
                   perception of concatenation discontinuities. Our
                   experiments used a state-of-the-art unit-selection
                   text-to-speech system: `rVoice' from Rhetorical Systems
                   Ltd.},
  categories = {join cost, weighted distances, MCA, rVoice, edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/vepa_tts02.pdf},
  year = 2002
}
@inproceedings{Nakai2002ICPR,
  author = {Mitsuru Nakai and Takashi Sudo and Hiroshi Shimodaira
                   and Shigeki Sagayama},
  title = {{Pen Pressure Features for Writer-Independent On-Line
                   Handwriting Recognition Based on Substroke {HMM}}},
  booktitle = {Proc. of ICPR2002, vol. III},
  pages = {220--223},
  categories = {hwr, jaist},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Nakai2002ICPR.pdf},
  year = 2002
}
@inproceedings{mayoturkwatson:02,
  author = {Mayo, C. and Turk, A. and Watson, J.},
  title = {Development of cue weighting strategies in children's
                   speech perception},
  booktitle = {Proceedings of TIPS: Temporal Integration in the
                   Perception of Speech, Aix-en-Provence},
  categories = {speech perception, development, cue weighting},
  year = 2002
}
@inproceedings{Keeni2002AIA,
  author = {Kanad Keeni and Hiroshi Shimodaira},
  title = {{On Selection of Training Data for Fast Learning of
                   Neural Networks Using Back Propagation}},
  booktitle = {IASTED International Conference on Artificial
                   Intelligence and Applications (AIA 2002)},
  pages = {474--478},
  month = sep,
  year = 2002
}
@inproceedings{Kawamoto2002PRICAI,
  author = {Shin-ichi Kawamoto and Hiroshi Shimodaira and Tsuneo
                   Nitta and Takuya Nishimoto and Satoshi Nakamura and
                   Katsunobu Itou and Shigeo Morishima and Tatsuo
                   Yotsukura and Atsuhiko Kai and Akinobu Lee and Yoichi
                   Yamashita and Takao Kobayashi and Keiichi Tokuda and
                   Keikichi Hirose and Nobuaki Minematsu and Atsushi
                   Yamada and Yasuharu Den and Takehito Utsuro and Shigeki
                   Sagayama},
  title = {{Open-source software for developing anthropomorphic
                   spoken dialog agent}},
  booktitle = {Proc. of PRICAI-02, International Workshop on Lifelike
                   Animated Agents},
  pages = {64--69},
  categories = {lifelike-agent, jaist},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Kawamoto2002PRICAI.pdf},
  year = 2002
}
@inproceedings{strom02a,
  author = {Juergen Schroeter and Alistair Conkie and Ann Syrdal
                   and Mark Beutnagel and Matthias Jilka and Volker Strom
                   and Yeon-Jun Kim and Hong-Goo Kang and David Kapilow},
  title = {A perspective on the next challenges for {TTS}},
  booktitle = {IEEE 2002 Workshop on Speech Synthesis},
  pages = {11--13},
  address = {Santa Monica, CA},
  abstract = {The quality of speech synthesis has come a long way
                   since Homer Dudley's ``Vocoder'' in 1939. In fact, with
                   the widespread use of unit-selection synthesizers, the
                   naturalness of the synthesized speech is now high
                   enough to pass the Turing test for short utterances,
                   such as prompts. Therefore, it seems valid to ask the
                   question ``what are the next challenges for TTS
                   Research?'' This paper tries to identify unresolved
                   issues, the solution of which would greatly enhance the
                   state of the art in TTS.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/strom02a.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/strom02a.ps},
  year = 2002
}
@inproceedings{Takeda2002MMSP,
  author = {Haruto Takeda and Naoki Saito and Tomoshi Otsuki and
                   Mitsuru Nakai and Hiroshi Shimodaira and Shigeki
                   Sagayama},
  title = {{Hidden Markov Model for Automatic Transcription of
                   MIDI Signals}},
  booktitle = {2002 International Workshop on Multimedia Signal
                   Processing},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Takeda2002MMSP12.pdf},
  year = 2002
}