2003.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2003-citations -ob /home/korin/projects/publications/new_output/transitdata/2003.bib -c 'year : "2003"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@incollection{gotoh-lm03,
  author = {Y.~Gotoh and S.~Renals},
  title = {Language Modelling},
  booktitle = {Text and Speech Triggered Information Access},
  editor = {S.~Renals and G.~Grefenstette},
  pages = {78--105},
  abstract = {This is a preprint of a tutorial on statistical
                   language modelling, based on Yoshi Gotoh's course at
                   the \href{http://www.ilsp.gr/testia/testia2000.html}
                   {ELSNET-2000 Summer School} on Text and Speech
                   Triggered Information Access.},
  categories = {ie,lm,bnews,sheffield},
  crossref = {renals-book03},
  year = 2003
}
@inproceedings{Sturm-03,
  author = {J. Sturm and J. M. Kessens and M. Wester and F. de Wet
                   and E. Sanders and H. Strik },
  title = {Automatic Transcription of Football Commentaries in
                   the {MUMIS} Project},
  booktitle = {Proc. Eurospeech '03},
  pages = {-},
  abstract = {This paper describes experiments carried out to
                   automatically transcribe football commentaries in
                   Dutch, English and German for multimedia indexing. Our
                   results show that the high levels of stadium noise in
                   the material create a task that is extremely difficult
                   for conventional ASR. The baseline WERs vary from 83\%
                   to 94\% for the three languages investigated. Employing
                   state-of-the-art noise robustness techniques leads to
                   relative reductions of 9--10\% WER. Application-specific
                   words such as players' names are recognized correctly in
                   about 50\% of cases. Although this result is
                   substantially better than the overall result, it is
                   inadequate. Much better results can be obtained if the
                   football commentaries are recorded separately from the
                   stadium noise. This would make the automatic
                   transcriptions more useful for multimedia indexing.},
  categories = {asr, MUMIS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/mumis_final.pdf},
  year = 2003
}
@article{Ho2003Applied,
  author = {Tu Bao Ho and Trong Dung Nguyen and Hiroshi Shimodaira
                   and Masayuki Kimura},
  title = {{A Knowledge Discovery System with Support for Model
                   Selection and Visualization}},
  journal = {Applied Intelligence},
  volume = {19},
  pages = {125--141},
  categories = {KDD},
  year = 2003
}
@inproceedings{Goubanova:2003,
  author = {Goubanova, O.},
  title = {{B}ayesian Modelling of Vowel Segment Duration for
                   Text-to-Speech Synthesis Using Distinctive Features},
  booktitle = {Proc. ICPhS 2003},
  volume = 3,
  pages = {2349},
  address = {Barcelona, Spain},
  abstract = {We report the results of applying the Bayesian Belief
                   Network (BN) approach to predicting vowel duration. A
                   Bayesian inference of the vowel duration is performed
                   on a hybrid Bayesian network consisting of discrete and
                   continuous nodes, with the nodes in the network
                   representing the linguistic factors that affect segment
                   duration. New to the present research, we model the
                   segment identity factor as a set of distinctive features. The
                   features chosen were height, frontness, length, and
                   roundness. We also experimented with a word class
                   feature that implicitly represents word frequency
                   information. We contrasted the results of the belief
                   network model with those of the sums of products (SoP)
                   model and classification and regression tree (CART)
                   model. We trained and tested all three models on the
                   same data. In terms of the RMS error and correlation
                   coefficient, our BN model performs no worse than the SoP
                   model, and it significantly outperforms the CART model.},
  categories = {Bayesian, text-to-speech synthesis, duration modelling},
  ps = {http://www.cstr.ed.ac.uk/downloads/publications/2003/OGoubanova_icphs2k3.ps},
  year = 2003
}
@inproceedings{calhoun:03,
  author = {Calhoun, Sasha},
  title = {The Nature of Theme and Rheme Accents},
  booktitle = {One-Day Meeting for Young Speech Researchers},
  address = {University College, London},
  abstract = {It has increasingly been recognised that appropriate
                   intonation is essential to create believable voices for
                   speech synthesis. This is particularly true in
                   dialogue, where the link between intonation and meaning
                   is especially important. Here we report two
                   experiments, a production and a perception study, which
                   test an aspect of Steedman's (2000) theory relating
                   information and intonation structure with a view to
                   specifying intonation in a speech synthesis system. He
                   claims that themes and rhemes, the basic building
                   blocks of information structure, are marked by
                   distinctive pitch accents in English, which he
                   identifies with L+H* and H* in the ToBI system
                   respectively. After reviewing problems with the
                   identification of these ToBI accents, we show that
                   speakers do produce and listeners do distinguish
                   different pitch accents in these discourse contexts,
                   but that the ToBI labels may not be helpful to
                   characterise the distinction. The exact phonetic nature
                   of theme and rheme accents remains unclear, but the
                   alignment of the start of the rise, pitch height and
                   the fall after the pitch peak all appear to be factors.
                   Speakers also appear to be more sensitive to the
                   distinction at the end of an utterance than
                   utterance-medially.},
  categories = {prosody, information structure, pitch accents,
                   production and perception experiments},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/calhounPGC03.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/calhounPGC03.ps},
  year = 2003
}
@inproceedings{mayoturk:03,
  author = {Mayo, C. and Turk, A.},
  title = {Is the development of cue weighting strategies in
                   children's speech perception context-dependent?},
  booktitle = {Proc. XVth International Congress of Phonetic Sciences},
  address = {Barcelona, Spain},
  categories = {speech perception, development, cue weighting},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icphs-0677.pdf},
  year = 2003
}
@inproceedings{gillett:king:eurospeech2003b,
  author = {Ben Gillett and Simon King},
  title = {Transforming {F0} Contours},
  booktitle = {Proc. {E}urospeech},
  address = {Geneva},
  abstract = {Voice transformation is the process of transforming
                   the characteristics of speech uttered by a source
                   speaker, such that a listener would believe the speech
                   was uttered by a target speaker. Training F0 contour
                   generation models for speech synthesis requires a large
                   corpus of speech. If it were possible to adapt the F0
                   contour of one speaker to sound like that of another
                   speaker, using a small, easily obtainable parameter
                   set, this would be extremely valuable. We present a new
                   method for the transformation of F0 contours from one
                   speaker to another based on a small linguistically
                   motivated parameter set. The system performs a
                   piecewise linear mapping using these parameters. A
                   perceptual experiment clearly demonstrates that the
                   presented system is at least as good as an existing
                   technique for all speaker pairs, and that in many cases
                   it is much better and almost as good as using the
                   target F0 contour.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Gillett_King_eurospeech2003a.pdf},
  year = 2003
}
@inproceedings{Wester-03,
  author = {M. Wester},
  title = {Syllable classification using articulatory-acoustic
                   features},
  booktitle = {Proc. Eurospeech '03},
  pages = {-},
  address = {Geneva},
  abstract = {This paper investigates the use of
                   articulatory-acoustic features for the classification
                   of syllables in TIMIT. The main motivation for this
                   study is to circumvent the ``beads-on-a-string''
                   problem, i.e. the assumption that words can be
                   described as a simple concatenation of phones.
                   Posterior probabilities for articulatory-acoustic
                   features are obtained from artificial neural nets and
                   are used to classify speech within the scope of
                   syllables instead of phones. This gives the opportunity
                   to account for asynchronous feature changes, exploiting
                   the strengths of the articulatory-acoustic features
                   instead of losing their potential by reverting to phones.},
  categories = {aaf, syllable, TIMIT, Edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/wester.2003.1.pdf},
  year = 2003
}
@article{Matsuda2003IEICE06,
  author = {Shigeki Matsuda and Mitsuru Nakai and Hiroshi
                   Shimodaira and Shigeki Sagayama},
  title = {{Speech Recognition Using Asynchronous Transition
                   {HMM}}},
  journal = {IEICE Trans. D-II},
  volume = {J86-D-II},
  number = {6},
  pages = {741--754},
  note = {(in Japanese)},
  abstract = {We propose the asynchronous-transition HMM (AT-HMM), which
                   is based on asynchronous transition structures among
                   individual features of acoustic feature vector
                   sequences. A conventional HMM represents vector sequences
                   using a chain of states, each of which has a
                   multi-dimensional output distribution. The conventional
                   HMM therefore assumes that individual features
                   change synchronously. However, this assumption seems
                   over-simplified for modeling the temporal behavior of
                   acoustic features, since cepstrum and its
                   time-derivative cannot synchronize with each other. In a
                   speaker-dependent continuous phoneme recognition task,
                   the AT-HMMs reduced errors by 10\% to 40\%. In a
                   speaker-independent task, the performance of the
                   AT-HMMs was comparable to that of conventional HMMs.},
  categories = {asr, jaist},
  month = jun,
  year = 2003
}
@inproceedings{koumpis-msdr03,
  author = {K.~Koumpis and S.~Renals},
  title = {Evaluation of extractive voicemail summarization},
  booktitle = {Proc. ISCA Workshop on Multilingual Spoken Document
                   Retrieval},
  pages = {19--24},
  abstract = {This paper is about the evaluation of a system that
                   generates short text summaries of voicemail messages,
                   suitable for transmission as text messages. Our
                   approach to summarization is based on a
                   speech-recognized transcript of the voicemail message,
                   from which a set of summary words is extracted. The
                   system uses a classifier to identify the summary words,
                   with each word being identified by a vector of lexical
                   and prosodic features. The features are selected using
                   Parcel, an ROC-based algorithm. Our evaluations of the
                   system, using a slot error rate metric, have compared
                   manual and automatic summarization, and manual and
                   automatic recognition (using two different
                   recognizers). We also report on two subjective
                   evaluations using mean opinion score of summaries, and
                   a set of comprehension tests. The main results from
                   these experiments were that the perceived difference in
                   quality of summarization was affected more by errors
                   resulting from automatic transcription than by the
                   automatic summarization process.},
  categories = {voicemail,summarization,prosody,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/msdr03.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/msdr03.ps.gz},
  year = 2003
}
@inproceedings{shig031,
  author = {Yoshinori Shiga and Simon King},
  title = {Estimating the Spectral Envelope of Voiced Speech
                   Using Multi-frame Analysis},
  booktitle = {Proc. {E}urospeech-2003},
  volume = 3,
  pages = {1737--1740},
  address = {Geneva, Switzerland},
  abstract = {This paper proposes a novel approach for estimating
                   the spectral envelope of voiced speech independently of
                   its harmonic structure. Because of the
                   quasi-periodicity of voiced speech, its spectrum
                   exhibits harmonic structure and only has energy at
                   frequencies corresponding to integer multiples of F0.
                   It is hence impossible to identify the transfer
                   characteristics between adjacent harmonics. In
                   order to resolve this problem, Multi-frame Analysis
                   (MFA) is introduced. The MFA estimates a spectral
                   envelope using many portions of speech which are
                   vocalised using the same vocal-tract shape. Since each
                   of the portions usually has a different F0 and ensuing
                   different harmonic structure, a number of harmonics can
                   be obtained at various frequencies to form a spectral
                   envelope. The method thereby gives a closer
                   approximation to the vocal-tract transfer function.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope,
                   edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.ps},
  year = 2003
}
@inproceedings{Shimodaira2003ICDAR,
  author = {Hiroshi Shimodaira and Takashi Sudo and Mitsuru Nakai
                   and Shigeki Sagayama},
  title = {{On-line Overlaid-Handwriting Recognition Based on
                   Substroke {HMM}s}},
  booktitle = {Proc. ICDAR 2003},
  pages = {1043--1047},
  abstract = {This paper proposes a novel handwriting recognition
                   interface for wearable computing where users write
                   characters continuously without pauses on a small
                   single writing box. Since characters are written on the
                   same writing area, they are overlaid with each other.
                   Therefore the task is regarded as a special case of the
                   continuous character recognition problem. In contrast
                   to the conventional continuous character recognition
                   problem, location information of strokes does not help
                   very much in the proposed framework. To tackle the
                   problem, substroke based hidden Markov models (HMMs)
                   and a stochastic bigram language model are employed.
                   Preliminary experiments were carried out on a dataset
                   of 578 handwriting sequences with a character bigram
                   consisting of 1,016 Japanese educational Kanji and 71
                   Hiragana characters. The proposed method demonstrated
                   promising performance with 69.2\% of handwriting
                   sequences being correctly recognized when different
                   stroke order was permitted, and the rate improved
                   to 88.0\% when characters were written with fixed
                   stroke order.},
  categories = {HWR, jaist},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Shimodaira2003ICDAR.pdf},
  year = 2003
}
@inproceedings{renals-icassp03,
  author = {S.~Renals and D.~Ellis},
  title = {Audio information access from meeting rooms},
  booktitle = {Proc. IEEE ICASSP},
  volume = {4},
  pages = {744--747},
  abstract = {We investigate approaches to accessing information
                   from the streams of audio data that result from
                   multi-channel recordings of meetings. The methods
                   investigated use word-level transcriptions, and
                   information derived from models of speaker activity and
                   speaker turn patterns. Our experiments include spoken
                   document retrieval for meetings, automatic structuring
                   of meetings based on self-similarity matrices of
                   speaker turn patterns and a simple model of speaker
                   activity. Meeting recordings are rich in both lexical
                   and non-lexical information; our results illustrate
                   some novel kinds of analysis made possible by a
                   transcribed corpus of natural meetings.},
  categories = {m4,multimodal,ir,meetings,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-mtg.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-mtg.ps.gz},
  year = 2003
}
@article{Wester-CSL-03,
  author = {M. Wester},
  title = {Pronunciation modeling for {ASR} -- knowledge-based
                   and data-derived methods},
  journal = {Computer Speech and Language},
  volume = {17},
  pages = {69--85},
  abstract = {This article focuses on modeling pronunciation
                   variation in two different ways: data-derived and
                   knowledge-based. The knowledge-based approach consists
                   of using phonological rules to generate variants. The
                   data-derived approach consists of performing phone
                   recognition, followed by smoothing using decision trees
                   (D-trees) to alleviate some of the errors in the phone
                   recognition. Using phonological rules led to a small
                   improvement in WER; a data-derived approach in which
                   the phone recognition was smoothed using D-trees prior
                   to lexicon generation led to larger improvements
                   compared to the baseline. The lexicon was employed in
                   two different recognition systems: a hybrid HMM/ANN
                   system and an HMM-based system, to ascertain whether
                   pronunciation variation was truly being modeled. This
                   proved to be the case as no significant differences
                   were found between the results obtained with the two
                   systems. Furthermore, we found that 10\% of variants
                   generated by the phonological rules were also found
                   using phone recognition, and this increased to 28\%
                   when the phone recognition output was smoothed by using
                   D-trees. This indicates that the D-trees generalize
                   beyond what has been seen in the training material,
                   whereas when the phone recognition approach is employed
                   directly, unseen pronunciations cannot be predicted. In
                   addition, we propose a metric to measure confusability
                   in the lexicon. Using this confusion metric to prune
                   variants results in roughly the same improvement as
                   using the D-tree method.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/CSL-pronvar.pdf},
  year = 2003
}
@inproceedings{kolluru-asru03,
  author = {B. Kolluru and H. Christensen and Y. Gotoh and S.
                   Renals},
  title = {Exploring the style-technique interaction in
                   extractive summarization of broadcast news},
  booktitle = {Proc. IEEE Automatic Speech Recognition and
                   Understanding Workshop},
  abstract = {In this paper we seek to explore the interaction
                   between the style of a broadcast news story and its
                   summarization technique. We report the performance of
                   three different summarization techniques on broadcast
                   news stories, which are split into planned speech and
                   spontaneous speech. The initial results indicate that
                   some summarization techniques work better for the
                   documents with spontaneous speech than for those with
                   planned speech. Even for human beings, some documents
                   are inherently difficult to summarize. We observe a
                   correlation between the degree of difficulty in summarizing
                   and the performance of the three automatic summarizers.
                   Given the high frequency of named entities in broadcast
                   news and an even greater number of references to these
                   named entities, we also gauge the effect of named
                   entity and coreference resolution in a news story on
                   the performance of these summarizers.},
  categories = {s3l,summarization,bnews,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-style.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-style.ps.gz},
  year = 2003
}
@inproceedings{Keeni2003ICEIS,
  author = {Kanad Keeni and Kunio Goto and Hiroshi Shimodaira},
  title = {{On fast learning of Multi-layer Feed-forward Neural
                   Networks Using Back Propagation}},
  booktitle = {International Conference on Enterprise Information
                   Systems (ICEIS 2003)},
  pages = {266--271},
  abstract = {This study discusses the subject of training data
                   selection for neural networks using back propagation.
                   We make only one assumption: that there is no overlap
                   between training data belonging to different classes;
                   in other words, the training data is
                   linearly/semi-linearly separable. The training data is
                   analyzed, and the data that affect the learning process
                   are selected based on the idea of critical points. The
                   proposed method is applied to a classification problem
                   where the task is to recognize the characters A, C and
                   B, D. The experimental results show that in batch mode
                   the proposed method takes almost 1/7 of the real time
                   and 1/10 of the user training time required by the
                   conventional method. In online mode it takes 1/3 of the
                   training epochs, and 1/9 of the real, 1/20 of the user
                   and 1/3 of the system time required by the conventional
                   method. The classification rates on training and
                   testing data are the same as with the conventional
                   method.},
  month = apr,
  year = 2003
}
@inproceedings{koumpis-eurospeech03,
  author = {K.~Koumpis and S.~Renals},
  title = {Multi-class Extractive Voicemail Summarization},
  booktitle = {Proc. Eurospeech},
  pages = {2785--2788},
  abstract = {This paper is about a system that extracts principal
                   content words from speech-recognized transcripts of
                   voicemail messages and classifies them into proper
                   names, telephone numbers, dates/times and `other'. The
                   short text summaries generated are suitable for mobile
                   messaging applications. The system uses a set of
                   classifiers to identify the summary words, with each
                   word being identified by a vector of lexical and
                   prosodic features. The features are selected using
                   Parcel, an ROC-based algorithm. We visually compare the
                   role of a large number of individual features and
                   discuss effective ways to combine them. We finally
                   evaluate their performance on manual and automatic
                   transcriptions derived from two different speech
                   recognition systems.},
  categories = {voicemail,summarization,prosody,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/eurospeech03-voicemail.pdf},
  year = 2003
}
@article{richmond2003,
  author = {Richmond, K. and King, S. and Taylor, P.},
  title = {Modelling the Uncertainty in Recovering Articulation
                   from Acoustics},
  journal = {Computer Speech and Language},
  volume = 17,
  pages = {153--172},
  abstract = {This paper presents an experimental comparison of the
                   performance of the multilayer perceptron (MLP) with
                   that of the mixture density network (MDN) for an
                   acoustic-to-articulatory mapping task. A corpus of
                   acoustic-articulatory data recorded by electromagnetic
                   articulography (EMA) for a single speaker was used as
                   training and test data for this purpose. In theory, the
                   MDN is able to provide a richer, more flexible
                   description of the target variables in response to a
                   given input vector than the least-squares trained MLP.
                   Our results show that the mean likelihoods of the
                   target articulatory parameters for an unseen test set
                   were indeed consistently higher with the MDN than with
                   the MLP. The increase ranged from approximately 3\% to
                   22\%, depending on the articulatory channel in
                   question. On the basis of these results, we argue that
                   using a more flexible description of the target domain,
                   such as that offered by the MDN, can prove beneficial
                   when modelling the acoustic-to-articulatory mapping.},
  categories = {artic, ann, mlp, mdn, inversion, mocha, edinburgh},
  key = {richmond2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/richmond2003.pdf},
  year = 2003
}
@inproceedings{vanbael:king:icphs2003,
  author = {Christophe Van Bael and Simon King},
  title = {An Accent-Independent Lexicon for Automatic Speech
                   Recognition},
  booktitle = {Proc. ICPhS},
  pages = {1165--1168},
  abstract = {Recent work at the Centre for Speech Technology
                   Research (CSTR) at the University of Edinburgh has
                   developed an accent-independent lexicon for speech
                   synthesis (the Unisyn project). The main purpose of this
                   lexicon is to avoid the problems and cost of writing a
                   new lexicon for every new accent needed for synthesis.
                   Only recently, a first attempt has been made to use the
                   Keyword Lexicon for automatic speech recognition.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/VanBael_King_icphs2003.pdf},
  year = 2003
}
@inproceedings{horlock:king:eurospeech2003a,
  author = {James Horlock and Simon King},
  title = {Named Entity Extraction from Word Lattices},
  booktitle = {Proc. Eurospeech},
  address = {Geneva},
  abstract = {We present a method for named entity extraction from
                   word lattices produced by a speech recogniser. Previous
                   work by others on named entity extraction from speech
                   has used either a manual transcript or 1-best
                   recogniser output. We describe how a single Viterbi
                   search can recover both the named entity sequence and
                   the corresponding word sequence from a word lattice,
                   and further that it is possible to trade off an
                   increase in word error rate for improved named entity
                   extraction.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Horlock_King_eurospeech2003a.pdf},
  year = 2003
}
@inproceedings{wan-icassp03,
  author = {V.~Wan and S.~Renals},
  title = {{SVMSVM}: Support vector machine speaker verification
                   methodology},
  booktitle = {Proc. IEEE ICASSP},
  volume = {2},
  pages = {221--224},
  abstract = {Support vector machines with the Fisher and
                   score-space kernels are used for text independent
                   speaker verification to provide direct discrimination
                   between complete utterances. This is unlike approaches
                   such as discriminatively trained Gaussian mixture
                   models or other discriminative classifiers that
                   discriminate at the frame-level only. Using the
                   sequence-level discrimination approach we are able to
                   achieve error-rates that are significantly better than
                   the current state-of-the-art on the PolyVar database.},
  categories = {verification,kernel,svm,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-svm.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp03-svm.ps.gz},
  year = 2003
}
@inproceedings{christensen-asru03,
  author = {H. Christensen and Y. Gotoh and B. Kolluru and S.
                   Renals},
  title = {Are extractive text summarisation techniques portable
                   to broadcast news?},
  booktitle = {Proc. IEEE Automatic Speech Recognition and
                   Understanding Workshop},
  abstract = {In this paper we report on a series of experiments
                   which compare the effect of individual features on both
                   text and speech summarisation, the effect of basing the
                   speech summaries on automatic speech recognition
                   transcripts with varying word error rates, and the
                   effect of summarisation approach and transcript source
                   on summary quality. We show that classical text
                   summarisation features (based on stylistic and content
                   information) are portable to broadcast news. However,
                   the quality of the speech transcripts as well as the
                   difference in information structure between broadcast
                   and newspaper news affect the usability of the
                   individual features.},
  categories = {s3l,summarization,bnews,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-portable.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/asru03-portable.ps.gz},
  year = 2003
}
@incollection{Kawamoto2003Book,
  author = {Shin-ichi Kawamoto and Hiroshi Shimodaira and Shigeki
                   Sagayama and others},
  title = {{Galatea: Open-Source Software for Developing
                   Anthropomorphic Spoken Dialog Agents}},
  booktitle = {Life-Like Characters: Tools, Affective Functions, and
                   Applications},
  editor = {Helmut Prendinger and others},
  publisher = {Springer},
  pages = {187--212},
  abstract = {Galatea is a software toolkit to develop a human-like
                   spoken dialog agent. In order to easily integrate the
                   modules of different characteristics including speech
                   recognizer, speech synthesizer, facial-image
                   synthesizer and dialog controller, each module is
                   modeled as a virtual machine with a simple common
                   interface and is connected to the others through a broker
                   (communication manager). Galatea employs model-based
                   speech and facial-image synthesizers whose model
                   parameters are adapted easily to those for an existing
                   person if his/her training data is given. The software
                   toolkit that runs on both UNIX/Linux and Windows
                   operating systems will be publicly available in the
                   middle of 2003.},
  categories = {lifelike-agent, jaist},
  month = nov,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Kawamoto2003Book.pdf},
  year = 2003
}
@inproceedings{vepa-king_euro03,
  author = {Vepa, J. and King, S.},
  title = {Kalman-filter based Join Cost for Unit-selection
                   Speech Synthesis},
  booktitle = {Proc. {E}urospeech},
  address = {Geneva, Switzerland},
  abstract = {We introduce a new method for computing join cost in
                   unit-selection speech synthesis which uses a linear
                   dynamical model (also known as a Kalman filter) to
                   model line spectral frequency trajectories. The model
                   uses an underlying subspace in which it makes smooth,
                   continuous trajectories. This subspace can be seen as
                   an analogy for underlying articulator movement. Once
                   trained, the model can be used to measure how well
                   concatenated speech segments join together. The
                   objective join cost is based on the error between model
                   predictions and actual observations. We report
                   correlations between this measure and mean listener
                   scores obtained from a perceptual listening experiment.
                   Our experiments use a state-of-the-art unit-selection
                   text-to-speech system: `rVoice' from Rhetorical Systems
                   Ltd.},
  categories = {join cost, Kalman filter, LDM, rVoice, edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/vepa_eurospeech03.pdf},
  year = 2003
}
@phdthesis{clark_phd03,
  author = {Robert A. J. Clark},
  title = {Generating Synthetic Pitch Contours Using Prosodic
                   Structure},
  school = {The University of Edinburgh},
  categories = {speech synthesis, prosody, intonation, festival},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_phd03.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_phd03.ps.gz},
  year = 2003
}
@article{mayoscobbiehewlettwaters:03,
  author = {Mayo, C. and Scobbie, J. and Hewlett, N. and Waters,
                   D.},
  title = {The influence of phonemic awareness development on
                   acoustic cue weighting in children's speech perception},
  journal = {Journal of Speech, Language and Hearing Research},
  volume = 46,
  pages = {1184--1196},
  categories = {speech perception, development, cue weighting,
                   phonemic awareness, literacy},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/JSLHR1184-Mayo.pdf},
  year = 2003
}
@inproceedings{horlock:king:eurospeech2003b,
  author = {James Horlock and Simon King},
  title = {Discriminative Methods for Improving Named Entity
                   Extraction on Speech Data},
  booktitle = {Proc. Eurospeech},
  address = {Geneva},
  abstract = {In this paper we present a method of discriminatively
                   training language models for spoken language
                   understanding; we show improvements in named entity
                   F-scores on speech data using these improved language
                   models. A comparison between theoretical probabilities
                   associated with manual markup and the actual
                   probabilities of output markup is used to identify
                   probabilities requiring adjustment. We present results
                   which support our hypothesis that improvements in
                   F-scores are possible by using either previously used
                   training data or held out development data to improve
                   discrimination amongst a set of N-gram language models.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Horlock_King_eurospeech2003b.pdf},
  year = 2003
}
@article{king:jphon2003,
  author = {Simon King},
  title = {Dependence and independence in automatic speech
                   recognition and synthesis},
  journal = {Journal of Phonetics},
  volume = 31,
  number = {3-4},
  pages = {407--411},
  abstract = {A short review paper.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/King_jphon2003.pdf},
  year = 2003
}
@inproceedings{wrigley-eurospeech03,
  author = {S.~Wrigley and G.~Brown and V.~Wan and S. Renals},
  title = {Feature Selection for the Classification of Crosstalk
                   in Multi-Channel Audio},
  booktitle = {Proc. Eurospeech},
  pages = {469--472},
  abstract = {An extension to the conventional speech / nonspeech
                   classification framework is presented for a scenario in
                   which a number of microphones record the activity of
                   speakers present at a meeting (one microphone per
                   speaker). Since each microphone can receive speech from
                   both the participant wearing the microphone (local
                   speech) and other participants (crosstalk), the
                   recorded audio can be broadly classified in four ways:
                   local speech, crosstalk plus local speech, crosstalk
                   alone and silence. We describe a classifier in which a
                   Gaussian mixture model (GMM) is used to model each
                   class. A large set of potential acoustic features is
                   considered, some of which have been employed in
                   previous speech / nonspeech classifiers. A combination
                   of two feature selection algorithms is used to identify
                   the optimal feature set for each class. Results from
                   the GMM classifier using the selected features are
                   superior to those of a previously published approach.},
  categories = {m4,crosstalk,meetings,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/eurospeech03-xtalk.pdf},
  year = 2003
}
@inproceedings{Nakai2003ICDAR,
  author = {Mitsuru Nakai and Hiroshi Shimodaira and Shigeki
                   Sagayama},
  title = {{Generation of Hierarchical Dictionary for
                   Stroke-order Free Kanji Handwriting Recognition Based
                   on Substroke {HMM}}},
  booktitle = {Proc. ICDAR 2003},
  pages = {514--518},
  abstract = {This paper describes a method of generating a
                   hierarchically structured Kanji dictionary for
                   stroke-number and stroke-order free handwriting
                   recognition based on substroke HMMs. In stroke-based
                   methods, a large number of stroke-order variations can
                   be easily expressed by just adding different stroke
                   sequences to the dictionary, and it is not necessary to
                   train new reference patterns. The hierarchically
                   structured dictionary has the advantage that thousands
                   of stroke-order variations of Kanji characters can be
                   produced using a small number of stroke-order rules
                   defining Kanji parts. Moreover, recognition is fast
                   since common sequences are shared in a substroke
                   network, even if the total number of stroke-order
                   combinations becomes enormous. In experiments, 300
                   different stroke-order rules of Kanji parts were
                   statistically chosen by using 60 writers' handwritings
                   of 1,016 educational Kanji characters. By adding these
                   new stroke-order rules to the dictionary, about 9,000
                   variations of different stroke orders were generated
                   for 2,965 JIS 1st-level Kanji characters. As a result,
                   we successfully improved the recognition accuracy from
                   82.6\% to 90.2\% for stroke-order free handwritings.},
  categories = {HWR, jaist},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Nakai2003ICDAR.pdf},
  year = 2003
}
@inproceedings{clark_icphs03,
  author = {Robert A. J. Clark},
  title = {Modelling Pitch Accents for Concept-to-Speech
                   Synthesis},
  booktitle = {Proc. XVth International Congress of Phonetic Sciences},
  volume = 2,
  pages = {1141--1144},
  categories = {speech synthesis, prosody, intonation, festival},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_icphs03.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/clark_icphs03.ps},
  year = 2003
}
@inproceedings{Keeni2003ICONIP,
  author = {Kanad Keeni and Kunio Goto and Hiroshi Shimodaira},
  title = {{Automatic Filtering of Network Intrusion Detection
                   System Alarms Using Multi-layer Feed-forward Neural
                   Networks}},
  booktitle = {International Conference on Neural Information
                   Processing (ICONIP2003)},
  categories = {ann},
  month = jun,
  year = 2003
}
@inproceedings{gillett:king:eurospeech2003a,
  author = {Ben Gillett and Simon King},
  title = {Transforming Voice Quality},
  booktitle = {Proc. {E}urospeech},
  address = {Geneva},
  abstract = {Voice transformation is the process of transforming
                   the characteristics of speech uttered by a source
                   speaker, such that a listener would believe the speech
                   was uttered by a target speaker. In this paper we
                   address the problem of transforming voice quality. We
                   do not attempt to transform prosody. Our system has two
                   main parts corresponding to the two components of the
                   source-filter model of speech production. The first
                   component transforms the spectral envelope as
                   represented by a linear prediction model. The
                   transformation is achieved using a Gaussian mixture
                   model, which is trained on aligned speech from source
                   and target speakers. The second part of the system
                   predicts the spectral detail from the transformed
                   linear prediction coefficients. A novel approach is
                   proposed, which is based on a classifier and residual
                   codebooks. On the basis of a number of performance
                   metrics it outperforms existing systems.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Gillett_King_eurospeech2003b.pdf},
  year = 2003
}
@article{Cox-ijhci03,
  author = {Cox, S.J. and Lincoln, M. and Nakisa, M. and Wells, M.
                   and Tutt, M. and Abbott, S.},
  title = {The Development and Evaluation of a Speech to Sign
                   Translation System to Assist Transactions},
  journal = {Int. Journal of Human Computer Interaction},
  volume = {16},
  number = {2},
  pages = {141--161},
  abstract = {The design, development, and evaluation of an
                   experimental translation system that aims to aid
                   transactions between a deaf person and a clerk in a
                   post office (PO) is described. The system uses a speech
                   recognizer to recognize speech from a PO clerk and then
                   synthesizes recognized phrases in British Sign Language
                   (BSL) using a specially developed avatar. The main
                   objective in developing this prototype system was to
                   determine how useful it would be to a customer whose
                   first language was BSL, and to discover what areas of
                   the system required more research and development to
                   make it more effective. The system was evaluated by 6
                   prelingually profoundly deaf people and 3 PO clerks.
                   Deaf users and PO clerks were supportive of the system,
                   but the former group required a higher quality of
                   signing from the avatar and the latter a system that
                   was less constrained in the phrases it could recognize;
                   both these areas are being addressed in the next phase
                   of development.},
  categories = {visicast,sign language,translation,UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/ijhci.pdf},
  year = 2003
}
@phdthesis{frankel03:thesis,
  author = {Frankel, J.},
  title = {Linear dynamic models for automatic speech recognition},
  school = {The Centre for Speech Technology Research, Edinburgh
                   University},
  abstract = {The majority of automatic speech recognition (ASR)
                   systems rely on hidden Markov models (HMM), in which
                   the output distribution associated with each state is
                   modelled by a mixture of diagonal covariance Gaussians.
                   Dynamic information is typically included by appending
                   time-derivatives to feature vectors. This approach,
                   whilst successful, makes the false assumption of
                   framewise independence of the augmented feature vectors
                   and ignores the spatial correlations in the
                   parametrised speech signal. This dissertation seeks to
                   address these shortcomings by exploring acoustic
                   modelling for ASR with an application of a form of
                   state-space model, the linear dynamic model (LDM).
                   Rather than modelling individual frames of data, LDMs
                   characterize entire segments of speech. An
                   auto-regressive state evolution through a continuous
                   space gives a Markovian model of the underlying
                   dynamics, and spatial correlations between feature
                   dimensions are absorbed into the structure of the
                   observation process. LDMs have been applied to speech
                   recognition before; however, a smoothed Gauss-Markov
                   form was used which ignored the potential for subspace
                   modelling. The continuous dynamical state means that
                   information is passed along the length of each segment.
                   Furthermore, if the state is allowed to be continuous
                   across segment boundaries, long range dependencies are
                   built into the system and the assumption of
                   independence of successive segments is loosened. The
                   state provides an explicit model of temporal
                   correlation which sets this approach apart from
                   frame-based and some segment-based models where the
                   ordering of the data is unimportant. The benefits of
                   such a model are examined both within and between
                   segments. LDMs are well suited to modelling smoothly
                   varying, continuous, yet noisy trajectories such as
                   found in measured articulatory data. Using
                   speaker-dependent data from the MOCHA corpus, the
                   performance of systems which model acoustic,
                   articulatory, and combined acoustic-articulatory
                   features are compared. As well as measured articulatory
                   parameters, experiments use the output of neural
                   networks trained to perform an articulatory inversion
                   mapping. The speaker-independent TIMIT corpus provides
                   the basis for larger scale acoustic-only experiments.
                   Classification tasks provide an ideal means to compare
                   modelling choices without the confounding influence of
                   recognition search errors, and are used to explore
                   issues such as choice of state dimension, front-end
                   acoustic parametrization and parameter initialization.
                   Recognition for segment models is typically more
                   computationally expensive than for frame-based models.
                   Unlike frame-level models, it is not always possible to
                   share likelihood calculations for observation sequences
                   which occur within hypothesized segments that have
                   different start and end times. Furthermore, the Viterbi
                   criterion is not necessarily applicable at the frame
                   level. This work introduces a novel approach to
                   decoding for segment models in the form of a stack
                   decoder with $A^*$ search. Such a scheme allows
                   flexibility in the choice of acoustic and language
                   models since the Viterbi criterion is not integral to
                   the search, and hypothesis generation is independent of
                   the particular language model. Furthermore, the
                   time-asynchronous ordering of the search means that
                   only likely paths are extended, and so a minimum number
                   of models are evaluated. The decoder is used to give
                   full recognition results for feature-sets derived from
                   the MOCHA and TIMIT corpora. Conventional train/test
                   divisions and choice of language model are used so that
                   results can be directly compared to those in other
                   studies. The decoder is also used to implement Viterbi
                   training, in which model parameters are alternately
                   updated and then used to re-align the training data.},
  categories = {am,artic,asr,ldm,mocha,timit,search,edinburgh},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Frankel_thesis2003.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Frankel_thesis2003.ps},
  year = 2003
}
@inproceedings{shig032,
  author = {Yoshinori Shiga and Simon King},
  title = {Estimation of voice source and vocal tract
                   characteristics based on multi-frame analysis},
  booktitle = {Proc. Eurospeech},
  volume = 3,
  pages = {1749--1752},
  address = {Geneva, Switzerland},
  abstract = {This paper presents a new approach for estimating
                   voice source and vocal tract filter characteristics of
                   voiced speech. When it is required to know the transfer
                   function of a system in signal processing, the input
                   and output of the system are experimentally observed
                   and used to calculate the function. However, in the
                   case of source-filter separation we deal with in this
                   paper, only the output (speech) is observed and the
                   characteristics of the system (vocal tract) and the
                   input (voice source) must simultaneously be estimated.
                   Hence the estimation becomes extremely difficult, and it
                   is usually solved approximately using oversimplified
                   models. We demonstrate that these characteristics are
                   separable under the assumption that they are
                   independently controlled by different factors. The
                   separation is realised using an iterative approximation
                   along with the Multi-frame Analysis method, which we
                   have proposed to find spectral envelopes of voiced
                   speech with minimum interference of the harmonic
                   structure.},
  categories = {artic, lbg, clustering, mocha, source-filter,
                   edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.ps},
  year = 2003
}
@inproceedings{Tokuno2003HCII,
  author = {Junko Tokuno and Naoto Akira and Mitsuru Nakai and
                   Hiroshi Shimodaira and Shigeki Sagayama},
  title = {{Blind-handwriting Interface for Wearable Computing}},
  booktitle = {Proc. Human-Computer Interaction (HCI)
                   International 2003, Volume 2},
  pages = {303--307},
  abstract = {This paper proposes a novel input interface that we
                   call "blind handwriting" for wearable computing. The
                   blind handwriting, which is a word similar to "blind
                   typing" of keyboard, is a particular writing style
                   where the user does not see the pen or the finger
                   movement. Without visual feedback, written characters
                   are distorted, as in the case when the user is
                   blindfolded, and therefore existing on-line handwriting
                   recognition systems fail to recognize them correctly.
                   The sub-stroke based hidden Markov model approach is
                   employed to tackle this problem. When the pen or touch
                   pad is used as an input device, the proposed interface
                   demonstrates a recognition rate of 83\% on a test set
                   of 61 people, where each person wrote 1,016 Japanese
                   Kanji characters.},
  categories = {HWR, jaist},
  month = jun,
  year = 2003
}
@inproceedings{Lin03,
  author = {Lincoln, M. and Cox, S.J.},
  title = {A Comparison of Language Processing Techniques for a
                   Constrained Speech Translation System},
  booktitle = {Proc. IEEE ICASSP},
  address = {Hong Kong},
  abstract = {A system designed to allow Post Office counter clerks
                   to communicate with deaf customers by translating
                   speech into sign language is described. The system uses
                   approximately 370 pre-stored phrases which may be
                   signed to the customer using a specially designed
                   avatar. The clerk is unable to memorise this number of
                   phrases and therefore the system attempts to map from
                   their input speech to the semantically equivalent
                   pre-stored phrase. We describe a number of language
                   processing techniques developed to perform the mapping,
                   and give results obtained using alternative
                   formulations of the phrases from a number of speakers.
                   We then give results for recognised speech input and
                   show how mis-recognitions affect the mapping system.
                   Best performance is obtained using a mapping system
                   based on an entropy weighted, vector based distance
                   measure between the test phrase and each of the signed
                   phrases.},
  categories = {visicast,sign language,translation,UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icassp2003.pdf},
  year = 2003
}
@book{renals-book03,
  editor = {S.~Renals and G.~Grefenstette},
  title = {Text and Speech Triggered Information Access},
  publisher = {Springer-Verlag},
  number = {2705},
  series = {Lecture Notes in Computer Science},
  abstract = {Edited collection of revised lectures from the
                   \href{http://www.ilsp.gr/testia/testia2000.html}
                   {ELSNET-2000 Summer School} on Text and Speech
                   Triggered Information Access.},
  categories = {recognition,ir,ie,lm,multimodal,sheffield},
  url = {http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=2705&issue=preprint},
  year = 2003
}