2005.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2005-citations -ob /home/korin/projects/publications/new_output/transitdata/2005.bib -c 'year : "2005"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{oliverclark_interspeech05,
  author = {Dominika Oliver and Robert A. J. Clark},
  title = {Modelling pitch accent types for {P}olish speech
                   synthesis},
  booktitle = {Proc. Interspeech 2005},
  categories = {speech synthesis, prosody, intonation, festival,
                   Polish},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/oliverclark_interspeech05.pdf},
  year = 2005
}
@inproceedings{christensen-icassp05,
  author = {H. Christensen and B. Kolluru and Y. Gotoh and S.
                   Renals},
  title = {Maximum entropy segmentation of broadcast news},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {This paper presents an automatic system for
                   structuring and preparing a news broadcast for
                   applications such as speech summarization, browsing,
                   archiving and information retrieval. This process
                   comprises transcribing the audio using an automatic
                   speech recognizer and subsequently segmenting the text
                   into utterances and topics. A maximum entropy approach
                   is used to build statistical models for both utterance
                   and topic segmentation. The experimental work addresses
                   the effect on performance of the topic boundary
                   detector of three factors: the information sources
                   used, the quality of the ASR transcripts, and the
                   quality of the utterance boundary detector. The results
                   show that the topic segmentation is not affected
                   severely by transcript errors, whereas errors in the
                   utterance segmentation are more devastating. },
  categories = {s3l,summarization,bnews,edinburgh,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/christensen-icassp05.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/christensen-icassp05.ps.gz},
  year = 2005
}
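@comment{{A minimal sketch of the maximum entropy approach described in
the abstract above, using multinomial logistic regression (which
maximises the same conditional log-likelihood). The features named in
the comments are hypothetical stand-ins, not the paper's feature set:

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
# Toy data: one feature vector per candidate boundary position, e.g.
# pause duration, a cue-word indicator and the lexical similarity of
# adjacent windows; label 1 = topic boundary, 0 = no boundary.
X = rng.normal(size=(200, 3))
y = (X[:, 0] + 0.5 * X[:, 1] > 0.8).astype(int)

model = LogisticRegression(max_iter=1000).fit(X, y)
p_boundary = model.predict_proba(X)[:, 1]   # posterior of a boundary
}}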
@inproceedings{garau-interspeech05,
  author = {G. Garau and S. Renals and T. Hain},
  title = {Applying Vocal Tract Length Normalization to Meeting
                   Recordings},
  booktitle = {Proc. Interspeech},
  abstract = {Vocal Tract Length Normalisation (VTLN) is a commonly
                   used technique to normalise for inter-speaker
                   variability. It is based on the speaker-specific
                   warping of the frequency axis, parameterised by a
                   scalar warp factor. This factor is typically estimated
                   using maximum likelihood. We discuss how VTLN may be
                   applied to multiparty conversations, reporting a
                   substantial decrease in word error rate in experiments
                   using the ICSI meetings corpus. We investigate the
                   behaviour of the VTLN warping factor and show that a
                   stable estimate is not obtained. Instead it appears to
                   be influenced by the context of the meeting, in
                   particular the current conversational partner. These
                   results are consistent with predictions made by the
                   psycholinguistic interactive alignment account of
                   dialogue, when applied at the acoustic and phonological
                   levels.},
  categories = {ami,asr,edinburgh,vtln,speaker
                   adaptation,lvcsr,meetings},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/giuliagarau_eurospeech05.pdf},
  year = 2005
}
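@comment{{A sketch of maximum-likelihood warp factor estimation as used
in VTLN: warp each speaker's spectral features by a grid of scalar
factors and keep the factor with the highest likelihood under an
acoustic model. A small GMM stands in for the full recogniser, and the
linear resampling below is purely illustrative:

import numpy as np
from sklearn.mixture import GaussianMixture

def warp_spectrum(frame, alpha):
    # Linear warping of the frequency axis by scalar factor alpha.
    n = len(frame)
    src = np.clip(np.arange(n) * alpha, 0, n - 1)
    return np.interp(src, np.arange(n), frame)

rng = np.random.default_rng(1)
gmm = GaussianMixture(n_components=4, random_state=0)
gmm.fit(rng.normal(size=(500, 20)))          # toy spectral frames

speaker = rng.normal(size=(100, 20))
grid = np.arange(0.88, 1.13, 0.02)           # typical VTLN search range
scores = [gmm.score(np.stack([warp_spectrum(f, a) for f in speaker]))
          for a in grid]
best_alpha = grid[int(np.argmax(scores))]    # ML warp factor
}}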
@inproceedings{Gutkin:Gay:qr05,
  author = {Alexander Gutkin and David R. Gay},
  title = {{S}tructural {R}epresentation and {M}atching of
                   {A}rticulatory {S}peech {S}tructures based on the
                   {E}volving {T}ransformation {S}ystem ({ETS})
                   {F}ormalism},
  booktitle = {Proc. 19th International Workshop on Qualitative
                   Reasoning (QR-05)},
  editor = {Michael Hofbaur and Bernhard Rinner and Franz Wotawa},
  pages = {89--96},
  address = {Graz, Austria},
  abstract = { A formal structural representation of speech
                   consistent with the principles of combinatorial
                   structure theory is presented in this paper. The
                   representation is developed within the Evolving
                   Transformation System (ETS) formalism and encapsulates
                   speech processes at the articulatory level. We show how
                   the class structure of several consonantal phonemes of
                   English can be expressed with the help of articulatory
                   gestures---the atomic combinatorial units of speech. As
                   a preliminary step towards the design of a speech
                   recognition architecture based on the structural
                   approaches to physiology and articulatory phonology, we
                   present an algorithm for the structural detection of
                   phonemic class elements inside gestural ETS structures
                   derived from continuous speech. Experiments designed to
                   verify the adequacy of the hypothesised gestural class
                   structure conducted on the MOCHA articulatory corpus
                   are then described. Our experimental results support
                   the hypothesis that the articulatory representation
                   captures sufficient information for the accurate
                   structural identification of the phonemic classes in
                   question. },
  categories = {structural,recognition,ets,artic,mocha,edinburgh,unb},
  isbn = {3-9502019-0-4},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_gay_qr05.pdf},
  year = 2005
}
@inproceedings{hain-interspeech05,
  author = {T. Hain and J. Dines and G. Garau and M. Karafiat and
                   D. Moore and V. Wan and R. Ordelman and S. Renals},
  title = {Transcription of Conference Room Meetings: an
                   Investigation},
  booktitle = {Proc. Interspeech},
  abstract = {The automatic processing of speech collected in
                   conference style meetings has attracted considerable
                   interest with several large scale projects devoted to
                   this area. In this paper we explore the use of various
                   meeting corpora for the purpose of automatic speech
                   recognition. In particular we investigate the
                   similarity of these resources and how to efficiently
                   use them in the construction of a meeting transcription
                   system. The analysis shows distinctive features for
                   each resource. However, the benefit from pooling data,
                   and hence the similarity, seems sufficient to speak of
                   a generic conference meeting domain. In this context
                   this paper also presents work on development for the
                   AMI meeting transcription system, a joint effort by
                   seven sites working on the AMI (augmented multi-party
                   interaction) project.},
  categories = {ami,asr,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hain-eurospeech05.pdf},
  year = 2005
}
@inproceedings{Shimodaira:mlmi05,
  author = {Hiroshi Shimodaira and Keisuke Uematsu and Shin'ichi
                   Kawamoto and Gregor Hofer and Mitsuru Nakai},
  title = {{Analysis and Synthesis of Head Motion for Lifelike
                   Conversational Agents}},
  booktitle = {Proc. MLMI2005},
  categories = {lifelike agents},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mlmi2005.pdf},
  year = 2005
}
@inproceedings{calhoun:05,
  author = {Calhoun, Sasha},
  title = {It's the Difference That Matters: An Argument for
                   Contextually-Grounded Acoustic Intonational Phonology},
  booktitle = {Linguistic Society of America Annual Meeting},
  address = {Oakland, California},
  abstract = {Standardly, the link between intonation and discourse
                   meaning is described in terms of perceptual intonation
                   categories, e.g. ToBI. We argue that this approach
                   needs to be refined to explicitly recognise: firstly,
                   that perception is affected by multiple acoustic cues,
                   including duration and intensity, as well as F0; and
                   secondly that the interpretation of these cues is
                   directly linked to the phonetic and discourse context.
                   Investigating the marking of topic status in a small
                   game task corpus, we found that although topic status
                   is not consistently marked by ToBI pitch accent, it is
                   by the F0 mean, intensity and duration of the topic
                   word. Using regression analysis, we found that when
                   factoring out the F0 mean and intensity of key parts of
                   the preceding discourse, intensity and duration become
                   stronger predictors of topic status than F0. },
  categories = {intonation theory and methodology, information
                   structure, pitch accents, corpus study},
  month = jan,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/calhounlsa05.pdf},
  year = 2005
}
@inproceedings{Hachey05,
  author = {B. Hachey and G. Murray and D. Reitter},
  title = {The {E}mbra System at {DUC} 2005: Query-oriented
                   Multi-document Summarization with a Very Large Latent
                   Semantic Space},
  booktitle = {Proceedings of the Document Understanding Conference
                   (DUC) 2005, Vancouver, BC, Canada},
  abstract = {Our summarization system submitted to DUC 2005, Embra
                   (or Edinburgh), is novel in that it relies on building
                   a very large semantic space for the purposes of
                   determining relevance and redundancy in an MMR-style
                   framework. We address specificity by detecting the
                   presence or absence of Named Entities in our extract
                   candidates, and we implemented a sentence-ordering
                   algorithm to maximize sentence cohesion in our final
                   summaries.},
  categories = {summarization, latent semantic analysis},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/duc2005.pdf},
  year = 2005
}
@phdthesis{gray2005,
  author = {Gray, Calum},
  title = {Acoustic Pulse Reflectometry for Measurement of the
                   Vocal Tract with Application in Voice Synthesis},
  school = {University of Edinburgh},
  abstract = {The measurement of human airway dimensions has been a
                   frequent objective in the fields of respiratory
                   medicine and speech research, but has proven difficult
                   to achieve non-invasively due to the airway's function
                   in breathing, swallowing and speaking. Acoustic pulse
                   reflectometry (APR) has been employed in clinical
                   studies of the vocal tract for several years, normally
                   for the purpose of airway measurement. The focus of
                   this work is to utilise APR in capturing vocal tract
                   profiles during the phonation of vowel sounds, for the
                   purposes of sound synthesis. By making an equivalent
                   tube model of the vocal tract, the propagation of an
                   acoustic wave can be readily calculated using
                   techniques such as waveguide modelling, which will in
                   turn allow us to synthesise sound and form the basis of
                   a physical model of the voice. The attractions of this
                   technique for vocal tract measurement are many: it is
                   non-invasive, safe, repeatable and inexpensive. In this
                   thesis, the basic theory describing wave propagation in
                   tubes of varying cross-section is outlined, together
                   with a review of how the time domain technique of APR
                   can be used to measure the input impulse response of a
                   tubular object, such as the vocal tract, from which the
                   bore profile can be calculated using the layer peeling
                   algorithm. Experimental measurements of the human vocal
                   tract during the phonation (imitation) of five
                   non-nasalised vowels [a, e, i, o, u] are presented,
                   using recent enhancements to the APR technique (MLS
                   excitation signals and virtual DC tube method) for a
                   single subject, together with optimisation of the APR
                   technique for vocal tract measurement and its
                   application in a group study using adults and children.
                   To validate the results obtained using the APR
                   technique, a comparative study with an accepted "gold
                   standard" imaging technique (Magnetic Resonance Imaging
                   - MRI) is presented, using the same subject, a voice
                   professional, in both studies. The results from this
                   study show reasonable overall agreement between the APR
                   and MRI data, with the limited resolution of the
                   acoustic technique tending to broaden features and
                   underestimate cross-sectional areas, particularly in
                   the region of the pharynx and glottis. Protocols and
                   supplementary documentation required by scientific,
                   clinical and ethical review bodies for the use of human
                   volunteers in research trials are provided. From this
                   study a data corpus of vocal tract measurements is
                   gathered, using the techniques of APR and MRI, in adult
                   males, adult females and children. In conclusion,
                   limitations of the APR technique for vocal tract
                   measurement are discussed and potential improvements
                   are proposed.},
  key = {gray2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/ThesisCalumGray.pdf},
  year = 2005
}
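@comment{{A minimal sketch of the classical layer-peeling (dynamic
deconvolution) recursion that underlies bore reconstruction from a
reflection impulse response, for a lossless layered tube; the thesis's
actual APR processing adds loss correction, MLS excitation and
calibration steps not shown here:

import numpy as np

def layer_peel(refl, n_layers, area0=1.0):
    # refl: measured reflection impulse response, one sample per
    # two-way travel time across a layer (Goupillaud medium).
    d = np.zeros(len(refl)); d[0] = 1.0       # down-going wave
    u = np.asarray(refl, dtype=float).copy()  # up-going wave
    areas = [area0]
    for _ in range(n_layers):
        k = u[0] / d[0]                       # interface reflection coeff.
        areas.append(areas[-1] * (1.0 - k) / (1.0 + k))
        d, u = (d - k * u) / (1 - k * k), (u - k * d) / (1 - k * k)
        u = np.roll(u, -1); u[-1] = 0.0       # advance up-going wave
    return np.array(areas)

# Single interface of strength 0.2: the area steps down once, then holds.
print(layer_peel(np.array([0.2, 0.0, 0.0, 0.0]), 3))
}}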
@inproceedings{clarkrichmondking_interspeech05,
  author = {Robert A.J. Clark and Korin Richmond and Simon King},
  title = {Multisyn voices from {ARCTIC} data for the {B}lizzard
                   challenge},
  booktitle = {Proc. Interspeech 2005},
  abstract = {This paper describes the process of building unit
                   selection voices for the Festival Multisyn engine using
                   four ARCTIC datasets, as part of the Blizzard
                   evaluation challenge. The build process is almost
                   entirely automatic, with very little need for human
                   intervention. We discuss the difference in the
                   evaluation results for each voice and evaluate the
                   suitability of the ARCTIC datasets for building this
                   type of voice.},
  categories = {speech synthesis, festival, evaluation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/clarkrichmondking_interspeech05.pdf},
  year = 2005
}
@article{Nakai2005IEICE01,
  author = {Mitsuru Nakai and Shigeki Sagayama and Hiroshi
                   Shimodaira},
  title = {{On-line Handwriting Recognition Based on Sub-stroke
                   {HMM}}},
  journal = {Trans. IEICE D-II},
  volume = {J88-D2},
  number = {8},
  note = {(in press) (in Japanese)},
  abstract = { This paper describes context-dependent sub-stroke
                   HMMs for on-line handwritten character recognition. As
                   there are so many characters in Japanese, modeling each
                   character by an HMM leads to an infeasible
                   character-recognition system requiring a huge amount of
                   memory and enormous computation time. The sub-stroke
                   HMM approach has overcome these problems by minimizing
                   the modeling unit. However, one drawback of this
                   approach is that the recognition accuracy deteriorates
                   for scribbled characters. In this paper, we show that
                   the context-dependent sub-stroke modeling which depends
                   on how the sub-stroke connects to the adjacent
                   sub-strokes is effective in achieving robust recognition
                   of low-quality characters.},
  categories = {online handwritten character recognition},
  month = aug,
  year = 2005
}
@inproceedings{calhoun:05-a,
  author = {Calhoun, Sasha and Malvina Nissim and Mark Steedman
                   and Jason Brenier},
  title = {A Framework for Annotating Information Structure in
                   Discourse},
  booktitle = {Frontiers in Corpus Annotation II: Pie in the Sky,
                   ACL2005 Conference Workshop},
  address = {Ann Arbor, Michigan},
  abstract = {We present a framework for the integrated analysis of
                   the textual and prosodic characteristics of information
                   structure in the {\em Switchboard} corpus of
                   conversational English. Information structure describes
                   the availability, organisation and salience of entities
                   in a discourse model. We present standards for the
                   annotation of {\em information status} (old, mediated
                   and new), and give guidelines for annotating {\em
                   information structure}, i.e. {\em theme/rheme} and {\em
                   background/kontrast}. We show that information
                   structure in English can only be analysed concurrently
                   with prosodic prominence and phrasing. Along with
                   existing annotations which we have integrated using NXT
                   technology, the corpus will be unique in the field of
                   conversational speech in terms of size and richness of
                   annotation, vital for many NLP applications.},
  categories = {prosody, information structure, annotation, discourse
                   semantics},
  month = jun,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/pieinsky05.pdf},
  year = 2005
}
@inproceedings{NistevalAMI05,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and I. McCowan and D. Moore and
                   V. Wan and R. Ordelman and S. Renals},
  title = {The 2005 {AMI} System for the transcription of Speech
                   in Meetings},
  booktitle = {Proceedings of the Rich Transcription 2005 Spring
                   Meeting Recognition Evaluation},
  abstract = {In this paper we describe the 2005 AMI system for the
                   transcription of speech in meetings used in the 2005
                   NIST RT evaluations. The system was designed for
                   participation in the speech to text part of the
                   evaluations, in particular for transcription of speech
                   recorded with multiple distant microphones and
                   independent headset microphones. System performance was
                   tested on both conference room and lecture style
                   meetings. Although input sources are processed using
                   different frontends, the recognition process is based
                   on a unified system architecture. The system operates
                   in multiple passes and makes use of state of the art
                   technologies such as discriminative training, vocal
                   tract length normalisation, heteroscedastic linear
                   discriminant analysis, speaker adaptation with maximum
                   likelihood linear regression and minimum word error
                   rate decoding. In this paper we describe the system
                   performance on the official development and test sets
                   for the NIST RT05s evaluations. The system was jointly
                   developed in less than 10 months by a multi-site team
                   and was shown to achieve competitive performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT05S},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIasr.nist2005.pdf},
  year = 2005
}
@inproceedings{Gutkin:King:pris05,
  author = {Alexander Gutkin and Simon King},
  title = {{I}nductive {S}tring {T}emplate-{B}ased {L}earning of
                   {S}poken {L}anguage},
  booktitle = {Proc. 5th International Workshop on Pattern
                   Recognition in Information Systems (PRIS-2005), In
                   conjunction with the 7th International Conference on
                   Enterprise Information Systems (ICEIS-2005)},
  editor = {Hugo Gamboa and Ana Fred},
  pages = {43--51},
  address = {Miami, USA},
  publisher = {INSTICC Press},
  abstract = { This paper deals with the formulation of an alternative
                   structural approach to the speech recognition problem.
                   In this approach, we require both the representation
                   and the learning algorithms defined on it to be
                   linguistically meaningful, which allows the speech
                   recognition system to discover the nature of the
                   linguistic classes of speech patterns corresponding to
                   the speech waveforms. We briefly discuss the current
                   formalisms and propose an alternative --- a
                   phonologically inspired string-based inductive speech
                   representation, defined within an analytical framework
                   specifically designed to address the issues of class
                   and object representation. We also present the results
                   of the phoneme classification experiments conducted on
                   the TIMIT corpus of continuous speech. },
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh},
  isbn = {972-8865-28-7},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_pris05.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_pris05.ps.gz},
  year = 2005
}
@article{mayoturk-jasa05,
  author = {Mayo, C. and Turk, A.},
  title = {The influence of spectral distinctiveness on acoustic
                   cue weighting in children's and adults' speech
                   perception},
  journal = {Journal of the Acoustical Society of America},
  volume = {118},
  pages = {1730--1741},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mayo-turk-2005-7.pdf},
  year = 2005
}
@inproceedings{king_bartels_bilmes_isp05,
  author = {Simon King and Chris Bartels and Jeff Bilmes},
  title = {SVitchboard 1: Small Vocabulary Tasks from Switchboard
                   1 },
  booktitle = {Proc. Interspeech 2005},
  address = {Lisbon, Portugal},
  abstract = {We present a conversational telephone speech data set
                   designed to support research on novel acoustic models.
                   Small vocabulary tasks from 10 words up to 500 words
                   are defined using subsets of the Switchboard-1 corpus;
                   each task has a completely closed vocabulary (an OOV
                   rate of 0\%). We justify the need for these tasks,
                   describe the algorithm for selecting them from a large
                   corpus, give a statistical analysis of the data and
                   present baseline whole-word hidden Markov model
                   recognition results. The goal of the paper is to define
                   a common data set and to encourage other researchers to
                   use it.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/king_bartels_bilmes_svitchboard.pdf},
  year = 2005
}
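@comment{{A sketch of one way to derive a closed-vocabulary subset, in
the spirit of the task design described above: grow a word list from
the most frequent words and keep only utterances fully covered by it,
giving a 0% OOV rate. The paper's actual selection algorithm is more
involved; this greedy version is illustrative only:

from collections import Counter

def closed_vocab_subset(utterances, vocab_size):
    counts = Counter(w for u in utterances for w in u)
    vocab = set(w for w, _ in counts.most_common(vocab_size))
    return [u for u in utterances if all(w in vocab for w in u)]

utts = [["yeah"], ["i", "know"], ["yeah", "right"], ["so", "anyway"]]
print(closed_vocab_subset(utts, vocab_size=3))
}}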
@article{wrigley-sap05,
  author = {S. J. Wrigley and G. J. Brown and V. Wan and S. Renals},
  title = {Speech and crosstalk detection in multi-channel audio},
  journal = {IEEE Trans. on Speech and Audio Processing},
  volume = {13},
  pages = {84--91},
  abstract = {The analysis of scenarios in which a number of
                   microphones record the activity of speakers, such as in
                   a roundtable meeting, presents a number of
                   computational challenges. For example, if each
                   participant wears a microphone, it can receive speech
                   from both the microphone's wearer (local speech) and
                   from other participants (crosstalk). The recorded audio
                   can be broadly classified in four ways: local speech,
                   crosstalk plus local speech, crosstalk alone and
                   silence. We describe two experiments related to the
                   automatic classification of audio into these four
                   classes. The first experiment attempted to optimise a
                   set of acoustic features for use with a Gaussian
                   mixture model (GMM) classifier. A large set of
                   potential acoustic features were considered, some of
                   which have been employed in previous studies. The
                   best-performing features were found to be kurtosis,
                   fundamentalness and cross-correlation metrics. The
                   second experiment used these features to train an
                   ergodic hidden Markov model classifier. Tests performed
                   on a large corpus of recorded meetings show
                   classification accuracies of up to 96\%, and automatic
                   speech recognition performance close to that obtained
                   using ground truth segmentation.},
  categories = {m4,meetings,edinburgh,asr,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap04-xtalk.pdf},
  year = 2005
}
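@comment{{A sketch of the four-way channel classification set-up from
the abstract above: one GMM per class, scored frame by frame. Kurtosis
is one of the features the paper found best; the training signals and
the single-feature front end here are toy stand-ins:

import numpy as np
from scipy.stats import kurtosis
from sklearn.mixture import GaussianMixture

def frame_features(signal, frame_len=400):
    n = len(signal) // frame_len
    frames = signal[: n * frame_len].reshape(n, frame_len)
    return kurtosis(frames, axis=1).reshape(-1, 1)

classes = ["local", "local+crosstalk", "crosstalk", "silence"]
rng = np.random.default_rng(2)
models = [GaussianMixture(n_components=2)
          .fit(frame_features(rng.normal(size=40000))) for _ in classes]

test = frame_features(rng.normal(size=8000))
scores = np.stack([m.score_samples(test) for m in models])
labels = [classes[i] for i in scores.argmax(axis=0)]
}}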
@article{goldman2005,
  author = {Jerry Goldman and Steve Renals and Steven Bird and
                   Franciska {de Jong} and Marcello Federico and Carl
                   Fleischhauer and Mark Kornbluh and Lori Lamel and Doug
                   Oard and Clare Stewart and Richard Wright},
  title = {Accessing the spoken word},
  journal = {International Journal of Digital Libraries},
  volume = 5,
  number = 4,
  pages = {287--298},
  abstract = {Spoken word audio collections cover many domains,
                   including radio and television broadcasts, oral
                   narratives, governmental proceedings, lectures, and
                   telephone conversations. The collection, access and
                   preservation of such data is stimulated by political,
                   economic, cultural and educational needs. This paper
                   outlines the major issues in the field, reviews the
                   current state of technology, examines the rapidly
                   changing policy issues relating to privacy and
                   copyright, and presents issues relating to the
                   collection and preservation of spoken audio content.},
  categories = {swag,asr,ir,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/swag-ijdl05.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/swag-ijdl05.ps.gz},
  year = 2005
}
@inproceedings{hifny-interspeech05,
  author = {Y. Hifny and S. Renals and N. Lawrence},
  title = {A Hybrid {MaxEnt/HMM} based {ASR} System},
  booktitle = {Proc. Interspeech},
  abstract = {The aim of this work is to develop a practical
                   framework, which extends the classical Hidden Markov
                   Models (HMM) for continuous speech recognition based on
                   the Maximum Entropy (MaxEnt) principle. The MaxEnt
                   models can estimate the posterior probabilities
                   directly as with Hybrid NN/HMM connectionist speech
                   recognition systems. In particular, a new acoustic
                   modelling approach based on discriminative MaxEnt models is
                   formulated and is being developed to replace the
                   generative Gaussian Mixture Models (GMM) commonly used
                   to model acoustic variability. Initial experimental
                   results using the TIMIT phone task are reported.},
  categories = {ml,asr,edinburgh,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hifny-eurospeech05.pdf},
  year = 2005
}
@incollection{dielmann-mlmi04,
  author = {A. Dielmann and S. Renals},
  title = {Multistream dynamic {Bayesian} network for meeting
                   segmentation},
  booktitle = {Proc. Multimodal Interaction and Related Machine
                   Learning Algorithms Workshop (MLMI--04)},
  publisher = {Springer},
  editor = {S. Bengio and H. Bourlard},
  pages = {76--86},
  abstract = {This paper investigates the automatic analysis and
                   segmentation of meetings. A meeting is analysed in
                   terms of individual behaviours and group interactions,
                   in order to decompose each meeting in a sequence of
                   relevant phases, named meeting actions. Three feature
                   families are extracted from multimodal recordings:
                   prosody from individual lapel microphone signals,
                   speaker activity from microphone array data and lexical
                   features from textual transcripts. A statistical
                   approach is then used to relate low-level features with
                   a set of abstract categories. In order to provide a
                   flexible and powerful framework, we have employed a
                   dynamic Bayesian network based model, characterized by
                   multiple stream processing and flexible state duration
                   modelling. Experimental results demonstrate the
                   strength of this system, providing a meeting action
                   error rate of 9\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.ps.gz},
  year = 2005
}
@inproceedings{Gutkin:King:icassp05,
  author = {Alexander Gutkin and Simon King},
  title = {{D}etection of {S}ymbolic {G}estural {E}vents in
                   {A}rticulatory {D}ata for {U}se in {S}tructural
                   {R}epresentations of {C}ontinuous {S}peech},
  booktitle = {Proc. IEEE International Conference on Acoustics,
                   Speech, and Signal Processing (ICASSP-05)},
  volume = {I},
  pages = {885--888},
  address = {Philadelphia, PA, USA},
  publisher = {IEEE Signal Processing Society Press},
  abstract = { One of the crucial issues which often needs to be
                   addressed in structural approaches to speech
                   representation is the choice of fundamental symbolic
                   units of representation. In this paper, a
                   physiologically inspired methodology for defining these
                   symbolic atomic units in terms of primitive
                   articulatory events is proposed. It is shown how the
                   atomic articulatory events (gestures) can be detected
                   directly in the articulatory data. An algorithm for
                   evaluating the reliability of the articulatory events
                   is described and promising results of the experiments
                   conducted on the MOCHA articulatory database are presented.
                   },
  categories = {structural,recognition,artic,mocha,edinburgh},
  isbn = {0-7803-8875-5},
  month = mar,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_icassp2005.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_icassp2005.ps.gz},
  year = 2005
}
@inproceedings{mayoturk-psp05,
  author = {Mayo, C. and Turk, A.},
  title = {No Available Theories Currently Explain All
                   Adult-Child Cue Weighting Differences},
  booktitle = {Proc. ISCA Workshop on Plasticity in Speech Perception},
  address = {London, UK},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mayoday2.pdf},
  year = 2005
}
@article{wan-sap05,
  author = {V. Wan and S. Renals},
  title = {Speaker verification using sequence discriminant
                   support vector machines},
  journal = {IEEE Trans. on Speech and Audio Processing},
  volume = {13},
  pages = {203--210},
  abstract = {This paper presents a text-independent speaker
                   verification system using support vector machines
                   (SVMs) with score-space kernels. Score-space kernels
                   generalize Fisher kernels and are based on an
                   underlying generative model, such as a Gaussian mixture
                   model (GMM). This approach provides direct
                   discrimination between whole sequences, in contrast to
                   the frame-level approaches at the heart of most current
                   systems. The resultant SVMs have a very high
                   dimensionality, which is related to the number of
                   parameters in the underlying generative model. To
                   ameliorate problems that can arise in the resultant
                   optimization, we introduce a technique called spherical
                   normalization that preconditions the Hessian matrix. We
                   have performed speaker verification experiments using
                   the PolyVar database. The SVM system presented here
                   reduces the relative error rates by 34\% compared to a
                   GMM likelihood ratio system.},
  categories = {verification,kernel,svm,edinburgh,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap05-svm.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap05-svm.ps.gz},
  year = 2005
}
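@comment{{A sketch of the score-space idea: map each variable-length
sequence to the gradient of a GMM log-likelihood with respect to the
component means (a Fisher score), then train a linear SVM on these
fixed-length vectors. The paper's score-space generalisation and
spherical normalisation are omitted; all data here are synthetic:

import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC

def fisher_vector(gmm, X):
    post = gmm.predict_proba(X)                    # (T, K) responsibilities
    diff = X[:, None, :] - gmm.means_[None, :, :]  # (T, K, D)
    grad = (post[:, :, None] * diff / gmm.covariances_[None, :, :]).sum(0)
    return grad.ravel() / len(X)

rng = np.random.default_rng(3)
ubm = GaussianMixture(n_components=4, covariance_type="diag")
ubm.fit(rng.normal(size=(1000, 5)))                # background model

seqs = [rng.normal(loc=m, size=(50, 5)) for m in (0.5, 0.0)
        for _ in range(20)]
y = [1] * 20 + [0] * 20                            # target vs. impostor
Phi = np.stack([fisher_vector(ubm, s) for s in seqs])
svm = SVC(kernel="linear").fit(Phi, y)
}}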
@inproceedings{Murray05b,
  author = {G. Murray and S. Renals and J. Carletta and J. Moore},
  title = {Evaluating Automatic Summaries of Meeting Recordings},
  booktitle = {Proceedings of the 43rd Annual Meeting of the
                   Association for Computational Linguistics, Ann Arbor,
                   MI, USA},
  abstract = {The research below explores schemes for evaluating
                   automatic summaries of business meetings, using the
                   ICSI Meeting Corpus. Both automatic and subjective
                   evaluations were carried out, with a central interest
                   being whether or not the two types of evaluations
                   correlate with each other. The evaluation metrics were
                   used to compare and contrast differing approaches to
                   automatic summarization, the deterioration of summary
                   quality on ASR output versus manual transcripts, and to
                   determine whether manual extracts are rated
                   significantly higher than automatic extracts. },
  categories = {ami,summarization, speech summarization, prosody,
                   latent semantic analysis, summarization evaluation,
                   edinburgh},
  month = jun,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/murray-renals-carletta-moore.pdf},
  year = 2005
}
@inproceedings{murray-interspeech05,
  author = {G. Murray and S. Renals and J. Carletta},
  title = {Extractive Summarization of Meeting Recordings},
  booktitle = {Proc. Interspeech},
  abstract = {Several approaches to automatic speech summarization
                   are discussed below, using the ICSI Meetings corpus. We
                   contrast feature-based approaches using prosodic and
                   lexical features with maximal marginal relevance and
                   latent semantic analysis approaches to summarization.
                   While the latter two techniques are borrowed directly
                   from the field of text summarization, feature-based
                   approaches using prosodic information are able to
                   utilize characteristics unique to speech data. We also
                   investigate how the summarization results might
                   deteriorate when carried out on ASR output as opposed
                   to manual transcripts. All of the summaries are of an
                   extractive variety, and are compared using the software
                   ROUGE.},
  categories = {ami,summarization,prosody, latent semantic
                   analysis,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/murray-eurospeech05.pdf},
  year = 2005
}
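@comment{{A sketch of maximal marginal relevance (MMR) extraction as
used by the summarisers above: greedily pick sentences that are
relevant to a query but not redundant with the summary so far. Plain
TF-IDF vectors stand in for the latent semantic space of the paper:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def mmr_summary(sentences, query, k=3, lam=0.7):
    vec = TfidfVectorizer().fit(sentences + [query])
    S, q = vec.transform(sentences), vec.transform([query])
    rel = cosine_similarity(S, q).ravel()
    chosen = []
    while len(chosen) < min(k, len(sentences)):
        best, best_score = None, -np.inf
        for i in range(len(sentences)):
            if i in chosen:
                continue
            red = cosine_similarity(S[i], S[chosen]).max() if chosen else 0.0
            score = lam * rel[i] - (1 - lam) * red
            if score > best_score:
                best, best_score = i, score
        chosen.append(best)
    return [sentences[i] for i in chosen]

sents = ["the deadline moved to june", "we agreed the deadline moves",
         "lunch was great", "action: update the project plan"]
print(mmr_summary(sents, "project deadline decisions", k=2))
}}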
@inproceedings{cuayahuitletal_asru05,
  author = {Heriberto Cuayáhuitl and Steve Renals and Oliver
                   Lemon and Hiroshi Shimodaira},
  title = {Human-Computer Dialogue Simulation Using Hidden Markov
                   Models},
  booktitle = {Proc. of IEEE Workshop on Automatic Speech Recognition
                   and Understanding (ASRU)},
  abstract = {This paper presents a probabilistic method to simulate
                   task-oriented human-computer dialogues at the intention
                   level, that may be used to improve or to evaluate the
                   performance of spoken dialogue systems. Our method uses
                   a network of Hidden Markov Models (HMMs) to predict
                   system and user intentions, where a ``language model''
                   predicts sequences of goals and the component HMMs
                   predict sequences of intentions. We compare standard
                   HMMs, Input HMMs and Input-Output HMMs in an effort to
                   better predict sequences of intentions. In addition, we
                   propose a dialogue similarity measure to evaluate the
                   realism of the simulated dialogues. We performed
                   experiments using the DARPA Communicator corpora and
                   report results with three different metrics: dialogue
                   length, dialogue similarity and precision-recall.},
  categories = {dialogue simulation, hidden markov models},
  month = nov,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hcp-asru2005.pdf},
  year = 2005
}
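@comment{{A sketch of intention-level dialogue simulation: sampling
intention sequences from a Markov model over dialogue acts. The paper's
system adds a goal-level language model and Input/Input-Output HMM
variants; the state set and transition table below are invented:

import numpy as np

rng = np.random.default_rng(4)
acts = ["greet", "request_info", "provide_info", "confirm", "close"]
A = np.array([[0.0, 0.8, 0.0, 0.1, 0.1],    # row-stochastic transitions
              [0.0, 0.1, 0.7, 0.2, 0.0],
              [0.0, 0.2, 0.1, 0.5, 0.2],
              [0.0, 0.1, 0.3, 0.1, 0.5],
              [0.0, 0.0, 0.0, 0.0, 1.0]])

def simulate(max_turns=10):
    state, seq = 0, ["greet"]
    for _ in range(max_turns):
        state = rng.choice(len(acts), p=A[state])
        seq.append(acts[state])
        if acts[state] == "close":
            break
    return seq

print(simulate())
}}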
@inproceedings{mayoclarkking-isp05,
  author = {Mayo, C. and Clark, R. A. J. and King, S.},
  title = {Multidimensional Scaling of Listener Responses to
                   Synthetic Speech},
  booktitle = {Proc. Interspeech 2005},
  address = {Lisbon, Portugal},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/ie-speech-2005.pdf},
  year = 2005
}
@phdthesis{shiga05,
  author = {Shiga, Yoshinori},
  title = {Precise Estimation of Vocal Tract and Voice Source
                   Characteristics},
  school = {The Centre for Speech Technology Research, Edinburgh
                   University},
  abstract = {This thesis addresses the problem of quality
                   degradation in speech produced by parameter-based
                   speech synthesis, within the framework of an
                   articulatory-acoustic forward mapping. I first
                   investigate current problems in speech
                   parameterisation, and point out the fact that
                   conventional parameterisation inaccurately extracts the
                   vocal tract response due to interference from the
                   harmonic structure of voiced speech. To overcome this
                   problem, I introduce a method for estimating filter
                   responses more precisely from periodic signals. The
                   method achieves such estimation in the frequency domain
                   by approximating all the harmonics observed in several
                   frames based on a least squares criterion. It is shown
                   that the proposed method is capable of estimating the
                   response more accurately than widely-used
                   frame-by-frame parameterisation, for simulations using
                   synthetic speech and for an articulatory-acoustic
                   mapping using actual speech. I also deal with the
                   source-filter separation problem and independent
                   control of the voice source characteristic during
                   speech synthesis. I propose a statistical approach to
                   separating out the vocal-tract filter response from the
                   voice source characteristic using a large articulatory
                   database. The approach realises such separation for
                   voiced speech using an iterative approximation
                   procedure under the assumption that the speech
                   production process is a linear system composed of a
                   voice source and a vocal-tract filter, and that each of
                   the components is controlled independently by different
                   sets of factors. Experimental results show that
                   controlling the source characteristic greatly improves
                   the accuracy of the articulatory-acoustic mapping, and
                   that the spectral variation of the source
                   characteristic is evidently influenced by the
                   fundamental frequency or the power of speech. The
                   thesis provides more accurate acoustical approximation
                   of the vocal tract response, which will be beneficial
                   in a wide range of speech technologies, and lays the
                   groundwork in speech science for a new type of
                   corpus-based statistical solution to the source-filter
                   separation problem.},
  categories = {mfa, multiframe, forward, mapping, source-filter,
                   artic, mocha, edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/phd_thesis_shiga.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/phd_thesis_shiga.ps.gz},
  year = 2005
}
@inproceedings{frankel05:hybrid,
  author = {Frankel, J. and King, S.},
  title = {A Hybrid {ANN/DBN} Approach to Articulatory Feature
                   Recognition},
  booktitle = {Proc. Eurospeech},
  address = {Lisbon},
  abstract = {Artificial neural networks (ANN) have proven to be
                   well suited to the task of articulatory feature (AF)
                   recognition. Previous studies have taken a cascaded
                   approach where separate ANNs are trained for each
                   feature group, making the assumption that features are
                   statistically independent. We address this by using
                   ANNs to provide virtual evidence to a dynamic Bayesian
                   network (DBN). This gives a hybrid ANN/DBN model and
                   allows modelling of inter-feature dependencies. We
                   demonstrate significant increases in AF recognition
                   accuracy from modelling dependencies between features,
                   and present the results of embedded training
                   experiments in which a set of asynchronous feature
                   changes are learned. Furthermore, we report on the
                   application of a Viterbi training scheme in which we
                   alternate between realigning the AF training labels and
                   retraining the ANNs.},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/Frankel_King_INTER2005.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/Frankel_King_INTER2005.ps},
  year = 2005
}
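@comment{{A sketch of the virtual-evidence idea used in the hybrid
model above: per-frame ANN posteriors, scaled by class priors, replace
emission likelihoods in a standard forward recursion. A plain HMM
forward pass stands in for the factored DBN, and all numbers are toy:

import numpy as np

def forward(scaled_lik, A, pi):
    # scaled_lik: (T, S) posteriors/priors; A: (S, S); pi: (S,).
    alpha = pi * scaled_lik[0]
    alpha /= alpha.sum()
    for t in range(1, len(scaled_lik)):
        alpha = scaled_lik[t] * (alpha @ A)
        alpha /= alpha.sum()                # per-frame normalisation
    return alpha

rng = np.random.default_rng(7)
T, S = 20, 3
post = rng.dirichlet(np.ones(S), size=T)    # "ANN" posteriors per frame
priors = post.mean(axis=0)
A = np.full((S, S), 0.1) + 0.7 * np.eye(S)  # sticky feature transitions
print(forward(post / priors, A, np.ones(S) / S))
}}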
@article{koumpis2005-acmslp,
  author = {Konstantinos Koumpis and Steve Renals},
  title = {Automatic summarization of voicemail messages using
                   lexical and prosodic features},
  journal = {ACM Transactions on Speech and Language Processing},
  volume = 2,
  number = 1,
  pages = {1--24},
  abstract = {This paper presents trainable methods for extracting
                   principal content words from voicemail messages. The
                   short text summaries generated are suitable for mobile
                   messaging applications. The system uses a set of
                   classifiers to identify the summary words, with each
                   word being identified by a vector of lexical and
                   prosodic features. We use an ROC-based algorithm,
                   Parcel, to select input features (and classifiers). We
                   have performed a series of objective and subjective
                   evaluations using unseen data from two different speech
                   recognition systems, as well as human transcriptions of
                   voicemail speech.},
  categories = {voicemail,summarization,prosody,sheffield,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/tslp05.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/tslp05.ps.gz},
  year = 2005
}
@article{Tokuno2005IEICE01,
  author = {Junko Tokuno and Nobuhito Inami and Mitsuru Nakai and
                   Hiroshi Shimodaira and Shigeki Sagayama},
  title = {{Context-dependent Sub-stroke Model for {HMM}-based
                   On-line Handwriting Recognition}},
  journal = {Trans. IEICE D-II},
  volume = {J88-D2},
  number = {8},
  note = {(in press), (in Japanese)},
  abstract = { A new method is proposed for on-line Kanji
                   handwriting recognition. The method employs sub-stroke
                   HMMs as minimum units to constitute Kanji characters
                   and utilizes the direction of pen motion. The present
                   approach has the following advantages over the
                   conventional methods that employ character HMMs. 1)
                   Much smaller memory requirement for dictionary and
                   models. 2) Fast recognition by employing efficient
                   sub-stroke network search. 3) Capability of recognizing
                   characters not included in the training data if defined
                   as a sequence of sub-strokes in the dictionary. In
                   experiments, we have achieved a correct recognition
                   rate of above 96\% using the JAIST-IIPL database that
                   includes 1,016 educational Kanji characters. },
  categories = {online handwritten character recognition},
  month = aug,
  year = 2005
}
@inproceedings{goubanova_king_isp05,
  author = {Olga Goubanova and Simon King},
  title = {Predicting Consonant Duration with {B}ayesian Belief
                   Networks},
  booktitle = {Proc. Interspeech 2005},
  address = {Lisbon, Portugal},
  abstract = {Consonant duration is influenced by a number of
                   linguistic factors such as the consonant's identity,
                   within-word position, stress level of the previous and
                   following vowels, phrasal position of the word
                   containing the target consonant, its syllabic position,
                   identity of the previous and following segments. In our
                   work, consonant duration is predicted from a Bayesian
                   belief network (BN) consisting of discrete nodes for
                   the linguistic factors and a single continuous node for
                   the consonant's duration. Interactions between factors
                   are represented as conditional dependency arcs in this
                   graphical model. Given the parameters of the belief
                   network, the duration of each consonant in the test set
                   is then predicted as the value with the maximum
                   probability. We compare the results of the belief
                   network model with those of sums-of-products (SoP) and
                   classification and regression tree (CART) models using
                   the same data. In terms of RMS error, our BN model
                   performs better than both CART and SoP models. In terms
                   of the correlation coefficient, our BN model performs
                   better than the SoP model, and no worse than the CART model. In
                   addition, the Bayesian model reliably predicts
                   consonant duration in cases of missing or hidden
                   linguistic factors.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/goubanova_king_isp2005.pdf},
  year = 2005
}
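@comment{{A sketch of the conditional-Gaussian structure described
above: discrete linguistic factors index a Gaussian over the continuous
duration node, prediction takes the maximum-probability value (the
mean), and a hidden factor is handled by marginalising over its prior.
Factor names and all parameters below are invented:

factors = [("stop", "stressed"), ("stop", "unstressed"),
           ("fricative", "stressed"), ("fricative", "unstressed")]
means_ms = dict(zip(factors, [85.0, 70.0, 110.0, 95.0]))
prior_stress = dict(stressed=0.4, unstressed=0.6)

def predict(identity, stress=None):
    if stress is not None:
        return means_ms[(identity, stress)]
    # Stress hidden: expected duration under the stress prior.
    return sum(prior_stress[s] * means_ms[(identity, s)]
               for s in prior_stress)

print(predict("stop", "stressed"), predict("fricative"))
}}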
@article{koumpis2005-spmag,
  author = {Koumpis, Konstantinos and Renals, Steve},
  title = {Content-based access to spoken audio},
  journal = {IEEE Signal Processing Magazine},
  volume = 22,
  number = 5,
  pages = {61--69},
  abstract = {"How analysis, retrieval and delivery phases make
                   spoken audio content more accessible"},
  categories = {asr,ir,summarization,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/koumpis-spm05.pdf},
  year = 2005
}
@phdthesis{Gutkin:phd:05,
  author = {Alexander Gutkin},
  title = {{T}owards {F}ormal {S}tructural {R}epresentation of
                   {S}poken {L}anguage: {A}n {E}volving {T}ransformation
                   {S}ystem ({ETS}) {A}pproach},
  school = {School of Informatics, University of Edinburgh},
  address = {UK},
  note = {Internal version},
  categories = {structural,representation,recognition,edinburgh,unb,ets},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_phd_full.pdf},
  year = 2005
}
@inproceedings{AMIMLMI05,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and I. McCowan and D. Moore and
                   V. Wan and R. Ordelman and S. Renals},
  title = {The Development of the {AMI} System for the
                   Transcription of Speech in Meetings},
  booktitle = {2nd Joint Workshop on Multimodal Interaction and
                   Related Machine Learning Algorithms},
  abstract = {The automatic processing of speech collected in
                   conference style meetings has attracted considerable
                   interest with several large scale projects devoted to
                   this area. This paper describes the development of a
                   baseline automatic speech transcription system for
                   meetings in the context of the AMI (Augmented
                   Multiparty Interaction) project. We present several
                   techniques important to processing of this data and
                   show the performance in terms of word error rates
                   (WERs). An important aspect of transcription of this
                   data is the necessary flexibility in terms of audio
                   pre-processing. Real world systems have to deal with
                   flexible input, for example by using microphone arrays
                   or randomly placed microphones in a room. Automatic
                   segmentation and microphone array processing techniques
                   are described and the effect on WERs is discussed. The
                   system and its components presented in this paper yield
                   competitive performance and form a baseline for future
                   research in this domain.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIsystemMLMI05.pdf},
  year = 2005
}
@inproceedings{faria-eurospeech05,
  author = {A.~Faria and D.~Gelbart},
  title = {Efficient Pitch-based Estimation of {VTLN} Warp
                   Factors},
  booktitle = {Proc. Eurospeech},
  abstract = { To reduce inter-speaker variability, vocal tract
                   length normalization (VTLN) is commonly used to
                   transform acoustic features for automatic speech
                   recognition (ASR). The warp factors used in this
                   process are usually derived by maximum likelihood (ML)
                   estimation, involving an exhaustive search over
                   possible values. We describe an alternative approach:
                   exploit the correlation between a speaker's average
                   pitch and vocal tract length, and model the probability
                   distribution of warp factors conditioned on pitch
                   observations. This can be used directly for warp factor
                   estimation, or as a smoothing prior in combination with
                   ML estimates. Pitch-based warp factor estimation for
                   VTLN is effective and requires relatively little memory
                   and computation. Such an approach is well-suited for
                   environments with constrained resources, or where pitch
                   is already being computed for other purposes. },
  categories = {vocal tract length normalization,speaker adaptation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/pbvtln-latest.pdf},
  year = 2005
}
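@comment{{A sketch of the approach described above: regress warp factor
on mean log-F0 over development speakers, then combine the regression
prediction (as a Gaussian prior) with a noisy ML estimate by precision
weighting. All data and variances here are invented:

import numpy as np

rng = np.random.default_rng(5)
logf0 = rng.normal(4.8, 0.3, size=100)          # dev speakers' mean log-F0
alpha = 1.35 - 0.07 * logf0 + rng.normal(0, 0.01, size=100)
slope, intercept = np.polyfit(logf0, alpha, 1)  # pitch-to-warp regression

def map_warp(speaker_logf0, ml_alpha, prior_var=4e-4, ml_var=9e-4):
    prior_mean = intercept + slope * speaker_logf0
    w = (1 / prior_var) / (1 / prior_var + 1 / ml_var)
    return w * prior_mean + (1 - w) * ml_alpha  # precision-weighted blend

print(map_warp(4.5, 1.06))
}}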
@inproceedings{Gutkin:Gay:ijcai05,
  author = {Alexander Gutkin and David R. Gay},
  title = {Structural Representation and Matching of Articulatory
                   Speech Structures based on the Evolving Transformation
                   System ({ETS}) Formalism},
  booktitle = {Proc. Nineteenth International Joint Conference on
                   Artificial Intelligence (IJCAI-05)},
  address = {Edinburgh, UK},
  categories = {structural,recognition,ets,artic,mocha,edinburgh,unb},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_gay_ijcai05.pdf},
  year = 2005
}
@inproceedings{hofer-eurosp05,
  author = {G. Hofer and K. Richmond and R. Clark},
  title = {Informed Blending of Databases for Emotional Speech
                   Synthesis},
  booktitle = {Proc. Interspeech},
  abstract = {The goal of this project was to build a unit selection
                   voice that could portray emotions with varying
                   intensities. A suitable definition of an emotion was
                   developed along with a descriptive framework that
                   supported the work carried out. A single speaker was
                   recorded portraying happy and angry speaking styles.
                   Additionally a neutral database was also recorded. A
                   target cost function was implemented that chose units
                   according to emotion mark-up in the database. The
                   Dictionary of Affect supported the emotional target
                   cost function by providing an emotion rating for words
                   in the target utterance. If a word was particularly
                   'emotional', units from that emotion were favoured. In
                   addition, intensity could be varied, which resulted in
                   a bias toward selecting more emotional units. A
                   perceptual evaluation was carried out and subjects were
                   able to reliably recognise emotions with varying
                   numbers of emotional units present in the target
                   utterance.},
  categories = {speech synthesis,emotion,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.ps},
  year = 2005
}
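@comment{{A sketch of an emotion-aware target cost in the spirit of the
abstract above: an emotion rating for the target word (e.g. from a
dictionary-of-affect lookup) and a user-set intensity determine how
strongly units from the matching emotional database are favoured. The
weighting scheme here is invented for illustration:

def emotion_target_cost(unit_emotion, word_rating, intensity, base_cost):
    # word_rating in [0, 1]: how emotional the target word is;
    # intensity in [0, 1]: user-controlled strength of the emotion.
    want_emotional = word_rating * intensity
    if unit_emotion == "neutral":
        penalty = want_emotional        # emotional word, neutral unit
    else:
        penalty = 1.0 - want_emotional  # unemotional word, emotional unit
    return base_cost + penalty

# At high intensity, an emotional unit wins for an emotional word:
print(emotion_target_cost("happy", 0.9, 0.8, 1.0))
print(emotion_target_cost("neutral", 0.9, 0.8, 1.0))
}}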
@article{onnis2005,
  author = {Onnis, L. and Monaghan, P. and Richmond, K. and
                   Chater, N.},
  title = {Phonology impacts segmentation in speech processing.},
  journal = {Journal of Memory and Language},
  volume = {53},
  number = {2},
  pages = {225--237},
  abstract = {Peña, Bonatti, Nespor and Mehler (2002) investigated an
                   artificial language where the structure of words was
                   determined by nonadjacent dependencies between
                   syllables. They found that segmentation of continuous
                   speech could proceed on the basis of these
                   dependencies. However, Peña et al.'s artificial
                   language contained a confound in terms of phonology, in
                   that the dependent syllables began with plosives and
                   the intervening syllables began with continuants. We
                   consider three hypotheses concerning the role of
                   phonology in speech segmentation in this task: (1)
                   participants may recruit probabilistic phonotactic
                   information from their native language to the
                   artificial language learning task; (2) phonetic
                   properties of the stimuli, such as the gaps that
                   precede unvoiced plosives, can influence segmentation;
                   and (3) grouping by phonological similarity between
                   dependent syllables contributes to learning the
                   dependency. In a series of experiments controlling the
                   phonological and statistical structure of the language,
                   we found that segmentation performance is influenced by
                   the three factors to different degrees. Learning of
                   non-adjacent dependencies did not occur when (3) was
                   eliminated. We suggest that phonological processing
                   provides a fundamental contribution to distributional
                   analysis.},
  categories = {artificial language learning, statistical learning,
                   segmentation, phonology, festival},
  key = {onnis2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/jml.pdf},
  year = 2005
}
@article{chang05,
  author = {S. Chang and M. Wester and S. Greenberg},
  title = {An elitist approach to automatic articulatory-acoustic
                   feature classification for phonetic characterization of
                   spoken language},
  journal = {Speech Communication},
  volume = {47},
  pages = {290--311},
  abstract = {A novel framework for automatic articulatory-acoustic
                   feature extraction has been developed for enhancing the
                   accuracy of place- and manner-of-articulation
                   classification in spoken language. The "elitist"
                   approach provides a principled means of selecting
                   frames for which multi-layer perceptron, neural-network
                   classifiers are highly confident. Using this method it
                   is possible to achieve a frame-level accuracy of 93\%
                   on "elitist" frames for manner classification on a
                   corpus of American English sentences passed through a
                   telephone network (NTIMIT). Place-of-articulation
                   information is extracted for each manner class
                   independently, resulting in an appreciable gain in
                   place-feature classification relative to performance
                   for a manner-independent system. A comparable
                   enhancement in classification performance for the
                   elitist approach is evidenced when applied to a Dutch
                   corpus of quasi-spontaneous telephone interactions
                   (VIOS). The elitist framework provides a potential
                   means of automatically annotating a corpus at the
                   phonetic level \emph{without recourse to a word-level
                   transcript} and could thus be of utility for developing
                   training materials for automatic speech recognition and
                   speech synthesis applications, as well as aid the
                   empirical study of spoken language. \copyright 2005
                   Elsevier B.V. All rights reserved.},
  categories = {aaf, VIOS, NTIMIT, Berkeley},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2005/elitist-final-specom.pdf},
  year = 2005
}
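@comment{{A sketch of the "elitist" selection rule described above:
keep only frames where the classifier's top posterior exceeds a
confidence threshold and assign feature labels just for those frames.
Posteriors are simulated here in place of MLP outputs:

import numpy as np

rng = np.random.default_rng(6)
manner = ["vowel", "stop", "fricative", "nasal", "approximant"]
post = rng.dirichlet(np.ones(len(manner)) * 0.3, size=1000)

threshold = 0.9
elite = post.max(axis=1) >= threshold          # confident frames only
labels = post[elite].argmax(axis=1)
print("elite frames:", elite.sum(), "of", len(post))
}}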