2005.bib
@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2005-citations -ob /home/korin/projects/publications/new_output/transitdata/2005.bib -c 'year : "2005"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{oliverclark_interspeech05,
author = {Dominika Oliver and Robert A. J. Clark},
title = {Modelling pitch accent types for {P}olish speech
synthesis},
booktitle = {Proc. Interspeech 2005},
categories = {speech synthesis, prosody, intonation, festival,
Polish},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/oliverclark_interspeech05.pdf},
year = 2005
}
@inproceedings{christensen-icassp05,
author = {H. Christensen and B. Kolluru and Y. Gotoh and S.
Renals},
title = {Maximum entropy segmentation of broadcast news},
booktitle = {Proc. IEEE ICASSP},
abstract = {This paper presents an automatic system for
structuring and preparing a news broadcast for
applications such as speech summarization, browsing,
archiving and information retrieval. This process
comprises transcribing the audio using an automatic
speech recognizer and subsequently segmenting the text
into utterances and topics. A maximum entropy approach
is used to build statistical models for both utterance
and topic segmentation. The experimental work addresses
the effect on performance of the topic boundary
detector of three factors: the information sources
used, the quality of the ASR transcripts, and the
quality of the utterance boundary detector. The results
show that the topic segmentation is not affected
severely by transcript errors, whereas errors in the
utterance segmentation are more devastating.},
categories = {s3l,summarization,bnews,edinburgh,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/christensen-icassp05.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/christensen-icassp05.ps.gz},
year = 2005
}
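A note for readers unfamiliar with the approach named in the abstract above: a maximum entropy classifier over binary/real-valued features is equivalent to (multinomial) logistic regression. A minimal Python sketch of a boundary detector in that style; the feature names are invented placeholders, not the paper's actual feature set.

import numpy as np
from sklearn.linear_model import LogisticRegression

# Each row holds features at one candidate boundary between words, e.g.
# [pause duration (s), cue-word flag, speaker-change flag] -- placeholders.
X_train = np.array([[0.05, 0, 0],
                    [1.20, 1, 1],
                    [0.10, 0, 0],
                    [0.90, 0, 1]])
y_train = np.array([0, 1, 0, 1])          # 1 = topic boundary

# An L2-regularised MaxEnt model: p(boundary | x) = softmax(Wx + b).
model = LogisticRegression(C=1.0).fit(X_train, y_train)
print(model.predict_proba(np.array([[0.80, 1, 0]])))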
@inproceedings{garau-interspeech05,
author = {G. Garau and S. Renals and T. Hain},
title = {Applying Vocal Tract Length Normalization to Meeting
Recordings},
booktitle = {Proc. Interspeech},
abstract = {Vocal Tract Length Normalisation (VTLN) is a commonly
used technique to normalise for inter-speaker
variability. It is based on the speaker-specific
warping of the frequency axis, parameterised by a
scalar warp factor. This factor is typically estimated
using maximum likelihood. We discuss how VTLN may be
applied to multiparty conversations, reporting a
substantial decrease in word error rate in experiments
using the ICSI meetings corpus. We investigate the
behaviour of the VTLN warping factor and show that a
stable estimate is not obtained. Instead it appears to
be influenced by the context of the meeting, in
particular the current conversational partner. These
results are consistent with predictions made by the
psycholinguistic interactive alignment account of
dialogue, when applied at the acoustic and phonological
levels.},
categories = {ami,asr,edinburgh,vtln,speaker
adaptation,lvcsr,meetings},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/giuliagarau_eurospeech05.pdf},
year = 2005
}
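For readers unfamiliar with the maximum-likelihood warp estimation the abstract refers to, a schematic grid search in Python. Here warp_features and model_log_likelihood are stand-ins for toolkit internals, and the "warping" shown is not a real frequency warp.

import numpy as np

def warp_features(features, alpha):
    # Placeholder: real VTLN applies a (piecewise-)linear warp to the
    # frequency axis before cepstral analysis.
    return features * alpha

def model_log_likelihood(features):
    # Placeholder for the acoustic model's log-likelihood of the features.
    return -np.sum((features - 1.0) ** 2)

def estimate_warp(features, grid=np.arange(0.80, 1.21, 0.02)):
    # Exhaustive search over a typical 0.8-1.2 range of warp factors.
    scores = [model_log_likelihood(warp_features(features, a)) for a in grid]
    return grid[int(np.argmax(scores))]

rng = np.random.default_rng(0)
print(estimate_warp(rng.normal(1.1, 0.1, size=(500, 13))))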
@inproceedings{Gutkin:Gay:qr05,
author = {Alexander Gutkin and David R. Gay},
title = {{S}tructural {R}epresentation and {M}atching of
{A}rticulatory {S}peech {S}tructures based on the
{E}volving {T}ransformation {S}ystem ({ETS})
{F}ormalism},
booktitle = {Proc. 19th International Workshop on Qualitative
Reasoning (QR-05)},
editor = {Michael Hofbaur and Bernhard Rinner and Franz Wotawa},
pages = {89--96},
address = {Graz, Austria},
abstract = { A formal structural representation of speech
consistent with the principles of combinatorial
structure theory is presented in this paper. The
representation is developed within the Evolving
Transformation System (ETS) formalism and encapsulates
speech processes at the articulatory level. We show how
the class structure of several consonantal phonemes of
English can be expressed with the help of articulatory
gestures---the atomic combinatorial units of speech. As
a preliminary step towards the design of a speech
recognition architecture based on the structural
approaches to physiology and articulatory phonology, we
present an algorithm for the structural detection of
phonemic class elements inside gestural ETS structures
derived from continuous speech. Experiments designed to
verify the adequacy of the hypothesised gestural class
structure conducted on the MOCHA articulatory corpus
are then described. Our experimental results support
the hypothesis that the articulatory representation
captures sufficient information for the accurate
structural identification of the phonemic classes in
question. },
categories = {structural,recognition,ets,artic,mocha,edinburgh,unb},
isbn = {3-9502019-0-4},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_gay_qr05.pdf},
year = 2005
}
@inproceedings{hain-interspeech05,
author = {T. Hain and J. Dines and G. Garau and M. Karafiat and
D. Moore and V. Wan and R. Ordelman and S. Renals},
title = {Transcription of Conference Room Meetings: an
Investigation},
booktitle = {Proc. Interspeech},
abstract = {The automatic processing of speech collected in
conference style meetings has attracted considerable
interest with several large scale projects devoted to
this area. In this paper we explore the use of various
meeting corpora for the purpose of automatic speech
recognition. In particular we investigate the
similarity of these resources and how to efficiently
use them in the construction of a meeting transcription
system. The analysis shows distinctive features for
each resource. However, the benefit of pooling data,
and hence the similarity, seems sufficient to speak of a
generic conference meeting domain. In this context
this paper also presents work on development for the
AMI meeting transcription system, a joint effort by
seven sites working on the AMI (augmented multi-party
interaction) project.},
categories = {ami,asr,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hain-eurospeech05.pdf},
year = 2005
}
@inproceedings{Shimodaira:mlmi05,
author = {Hiroshi Shimodaira and Keisuke Uematsu and Shin'ichi
Kawamoto and Gregor Hofer and Mitsuru Nakai},
title = {{Analysis and Synthesis of Head Motion for Lifelike
Conversational Agents}},
booktitle = {Proc. MLMI2005},
categories = {lifelike agents},
month = jul,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mlmi2005.pdf},
year = 2005
}
@inproceedings{calhoun:05,
author = {Calhoun, Sasha},
title = {It's the Difference That Matters: An Argument for
Contextually-Grounded Acoustic Intonational Phonology},
booktitle = {Linguistic Society of America Annual Meeting},
address = {Oakland, California},
abstract = {Standardly, the link between intonation and discourse
meaning is described in terms of perceptual intonation
categories, e.g. ToBI. We argue that this approach
needs to be refined to explicitly recognise: firstly,
that perception is affected by multiple acoustic cues,
including duration and intensity, as well as F0; and
secondly that the interpretation of these cues is
directly linked to the phonetic and discourse context.
Investigating the marking of topic status in a small
game task corpus, we found that although topic status
is not consistently marked by ToBI pitch accent, it is
by the F0 mean, intensity and duration of the topic
word. Using regression analysis, we found that when
factoring out the F0 mean and intensity of key parts of
the preceding discourse, intensity and duration become
stronger predictors of topic status than F0. },
categories = {intonation theory and methodology, information
structure, pitch accents, corpus study},
month = jan,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/calhounlsa05.pdf},
year = 2005
}
@inproceedings{Hachey05,
author = {B. Hachey and G. Murray and D. Reitter},
title = {The {E}mbra System at {DUC} 2005: Query-oriented
Multi-document Summarization with a Very Large Latent
Semantic Space},
booktitle = {Proceedings of the Document Understanding Conference
(DUC) 2005, Vancouver, BC, Canada},
abstract = {Our summarization system submitted to DUC 2005, Embra
(or Edinburgh), is novel in that it relies on building
a very large semantic space for the purposes of
determining relevance and redundancy in an MMR-style
framework. We address specificity by detecting the
presence or absence of Named Entities in our extract
candidates, and we implemented a sentence-ordering
algorithm to maximize sentence cohesion in our final
summaries.},
categories = {summarization, latent semantic analysis},
month = oct,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/duc2005.pdf},
year = 2005
}
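Since the abstract above leans on an MMR-style framework, a minimal sketch of Maximal Marginal Relevance selection over sentence vectors (e.g. rows of a latent semantic space) may help; the vectors and the lambda value below are illustrative only.

import numpy as np

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

def mmr_select(sent_vecs, query_vec, k=3, lam=0.7):
    # Greedily trade relevance to the query against redundancy with
    # already-selected sentences.
    selected, remaining = [], list(range(len(sent_vecs)))
    while remaining and len(selected) < k:
        def score(i):
            relevance = cosine(sent_vecs[i], query_vec)
            redundancy = max((cosine(sent_vecs[i], sent_vecs[j])
                              for j in selected), default=0.0)
            return lam * relevance - (1 - lam) * redundancy
        best = max(remaining, key=score)
        selected.append(best)
        remaining.remove(best)
    return selected

rng = np.random.default_rng(1)
print(mmr_select(rng.normal(size=(10, 50)), rng.normal(size=50)))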
@phdthesis{gray2005,
author = {Gray, Calum},
title = {Acoustic Pulse Reflectometry for Measurement of the
Vocal Tract with Application in Voice Synthesis},
school = {University of Edinburgh},
abstract = {The measurement of human airway dimensions has been a
frequent objective in the fields of respiratory
medicine and speech research, but has proven difficult
to achieve non-invasively due to the airway's function
in breathing, swallowing and speaking. Acoustic pulse
reflectometry (APR) has been employed in clinical
studies of the vocal tract for several years, normally
in the function of airway measurement. The focus of
this work is to utilise APR in capturing vocal tract
profiles during the phonation of vowel sounds, for the
purposes of sound synthesis. By making an equivalent
tube model of the vocal tract, the propagation of an
acoustic wave can be readily calculated using
techniques such as waveguide modelling, which will in
turn allow us to synthesise sound and form the basis of
a physical model of the voice. The attractions of this
technique for vocal tract measurement are many: it is
non-invasive, safe, repeatable and inexpensive. In this
thesis, the basic theory describing wave propagation in
tubes of varying cross-section is outlined, together
with a review of how the time domain technique of APR
can be used to measure the input impulse response of a
tubular object, such as the vocal tract, from which the
bore profile can be calculated using the layer peeling
algorithm. Experimental measurements of the human vocal
tract during the phonation (imitation) of five
non-nasalised vowels [a, e, i, o, u] are presented,
using recent enhancements to the APR technique (MLS
excitation signals and virtual DC tube method) for a
single subject, together with optimisation of the APR
technique for vocal tract measurement and its
application in a group study using adults and children.
To validate the results obtained using the APR
technique, a comparative study with an accepted "gold
standard" imaging technique (Magnetic Resonance Imaging
- MRI) is presented, using the same subject, a voice
professional, in both studies. The results from this
study show reasonable overall agreement between the APR
and MRI data, with the limited resolution of the
acoustic technique tending to broaden features and
underestimate cross sectional areas, particularly in
the region of the pharynx and glottis. Protocols and
supplementary documentation required by scientific,
clinical and ethical review bodies for the use of human
volunteers in research trials are provided. From this
study a data corpus of vocal tract measurements is
gathered, using the techniques of APR and MRI, in adult
males, adult females and children. In conclusion,
limitations of the APR technique for vocal tract
measurement are discussed and potential improvements
are proposed.},
key = {gray2005},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/ThesisCalumGray.pdf},
year = 2005
}
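The layer-peeling algorithm mentioned in the abstract can be sketched for the idealised lossless case (unit-impulse excitation, one sample round-trip per tube segment); real APR processing must additionally compensate for losses and the source characteristics.

import numpy as np

def layer_peel(impulse_response, n_layers, area0=1.0):
    # Recover segment areas from the reflection impulse response using
    # inverse Kelly-Lochbaum scattering at each interface.
    F = np.zeros(len(impulse_response)); F[0] = 1.0  # incident pulse
    B = np.asarray(impulse_response, dtype=float).copy()
    areas = [area0]
    for _ in range(n_layers):
        r = B[0] / F[0]                 # first-arrival reflection coeff.
        areas.append(areas[-1] * (1 - r) / (1 + r))
        B_next = (B - r * F) / (1 - r)  # waves just past the interface
        F_next = (1 + r) * F - r * B_next
        F, B = F_next[:-1], B_next[1:]  # advance one round-trip sample
    return np.array(areas)

ir = np.zeros(8); ir[0] = 0.2           # single constriction at the entry
print(layer_peel(ir, 4))                # area shrinks by (1-r)/(1+r)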
@inproceedings{clarkrichmondking_interspeech05,
author = {Robert A.J. Clark and Korin Richmond and Simon King},
title = {Multisyn voices from {ARCTIC} data for the {B}lizzard
challenge},
booktitle = {Proc. Interspeech 2005},
abstract = {This paper describes the process of building unit
selection voices for the Festival Multisyn engine using
four ARCTIC datasets, as part of the Blizzard
evaluation challenge. The build process is almost
entirely automatic, with very little need for human
intervention. We discuss the difference in the
evaluation results for each voice and evaluate the
suitability of the ARCTIC datasets for building this
type of voice.},
categories = {speech synthesis, festival, evaluation},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/clarkrichmondking_interspeech05.pdf},
year = 2005
}
@article{Nakai2005IEICE01,
author = {Mitsuru Nakai and Shigeki Sagayama and Hiroshi
Shimodaira},
title = {{On-line Handwriting Recognition Based on Sub-stroke
{HMM}}},
journal = {Trans. IEICE D-II},
volume = {J88-D2},
number = {8},
note = {(in press) (in Japanese)},
abstract = { This paper describes context-dependent sub-stroke
HMMs for on-line handwritten character recognition. As
there are so many characters in Japanese, modeling each
character by an HMM leads to an infeasible
character-recognition system requiring huge amount of
memory and enormous computation time. The sub-stroke
HMM approach overcomes these problems by minimizing the
modeling unit. However, one drawback of this
approach is that the recognition accuracy deteriorates
for scribbled characters. In this paper, we show that
the context-dependent sub-stroke modeling which depends
on how the sub-stroke connects to the adjacent
sub-strokes is effective in achieving robust recognition
of low quality characters. },
categories = {online handwritten character recognition},
month = aug,
year = 2005
}
@inproceedings{calhoun:05-a,
author = {Sasha Calhoun and Malvina Nissim and Mark Steedman
and Jason Brenier},
title = {A Framework for Annotating Information Structure in
Discourse},
booktitle = {Frontiers in Corpus Annotation II: Pie in the Sky,
ACL2005 Conference Workshop},
address = {Ann Arbor, Michigan},
abstract = {We present a framework for the integrated analysis of
the textual and prosodic characteristics of information
structure in the {\em Switchboard} corpus of
conversational English. Information structure describes
the availability, organisation and salience of entities
in a discourse model. We present standards for the
annotation of {\em information status} (old, mediated
and new), and give guidelines for annotating {\em
information structure}, i.e. {\em theme/rheme} and {\em
background/kontrast}. We show that information
structure in English can only be analysed concurrently
with prosodic prominence and phrasing. Along with
existing annotations which we have integrated using NXT
technology, the corpus will be unique in the field of
conversational speech in terms of size and richness of
annotation, vital for many NLP applications.},
categories = {prosody, information structure, annotation, discourse
semantics},
month = jun,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/pieinsky05.pdf},
year = 2005
}
@inproceedings{NistevalAMI05,
author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
Karafiat and M. Lincoln and I. McCowan and D. Moore and
V. Wan and R. Ordelman and S. Renals},
title = {The 2005 {AMI} System for the transcription of Speech
in Meetings},
booktitle = {Proceedings of the Rich Transcription 2005 Spring
Meeting Recognition Evaluation},
abstract = {In this paper we describe the 2005 AMI system for the
transcription of speech in meetings used in the 2005
NIST RT evaluations. The system was designed for
participation in the speech to text part of the
evaluations, in particular for transcription of speech
recorded with multiple distant microphones and
independent headset microphones. System performance was
tested on both conference room and lecture style
meetings. Although input sources are processed using
different frontends, the recognition process is based
on a unified system architecture. The system operates
in multiple passes and makes use of state-of-the-art
technologies such as discriminative training, vocal
tract length normalisation, heteroscedastic linear
discriminant analysis, speaker adaptation with maximum
likelihood linear regression and minimum word error
rate decoding. In this paper we describe the system
performance on the official development and test sets
for the NIST RT05s evaluations. The system was jointly
developed in less than 10 months by a multi-site team
and was shown to achieve competitive performance.},
categories = {LVCSR, NIST Meeting Transcription Evaluation RT05S},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIasr.nist2005.pdf},
year = 2005
}
@inproceedings{Gutkin:King:pris05,
author = {Alexander Gutkin and Simon King},
title = {{I}nductive {S}tring {T}emplate-{B}ased {L}earning of
{S}poken {L}anguage},
booktitle = {Proc. 5th International Workshop on Pattern
Recognition in Information Systems (PRIS-2005), In
conjunction with the 7th International Conference on
Enterprise Information Systems (ICEIS-2005)},
editor = {Hugo Gamboa and Ana Fred},
pages = {43--51},
address = {Miami, USA},
publisher = {INSTICC Press},
abstract = { This paper deals with the formulation of an
alternative structural approach to the speech recognition problem.
In this approach, we require both the representation
and the learning algorithms defined on it to be
linguistically meaningful, which allows the speech
recognition system to discover the nature of the
linguistic classes of speech patterns corresponding to
the speech waveforms. We briefly discuss the current
formalisms and propose an alternative --- a
phonologically inspired string-based inductive speech
representation, defined within an analytical framework
specifically designed to address the issues of class
and object representation. We also present the results
of the phoneme classification experiments conducted on
the TIMIT corpus of continuous speech. },
categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh},
isbn = {972-8865-28-7},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_pris05.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_pris05.ps.gz},
year = 2005
}
@article{mayoturk-jasa05,
author = {Mayo, C. and Turk, A.},
title = {The influence of spectral distinctiveness on acoustic
cue weighting in children's and adults' speech
perception},
journal = {Journal of the Acoustical Society of America},
volume = {118},
pages = {1730--1741},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mayo-turk-2005-7.pdf},
year = 2005
}
@inproceedings{king_bartels_bilmes_isp05,
author = {Simon King and Chris Bartels and Jeff Bilmes},
title = {SVitchboard 1: Small Vocabulary Tasks from Switchboard
1},
booktitle = {Proc. Interspeech 2005},
address = {Lisbon, Portugal},
abstract = {We present a conversational telephone speech data set
designed to support research on novel acoustic models.
Small vocabulary tasks from 10 words up to 500 words
are defined using subsets of the Switchboard-1 corpus;
each task has a completely closed vocabulary (an OOV
rate of 0\%). We justify the need for these tasks,
describe the algorithm for selecting them from a large
corpus, give a statistical analysis of the data and
present baseline whole-word hidden Markov model
recognition results. The goal of the paper is to define
a common data set and to encourage other researchers to
use it.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/king_bartels_bilmes_svitchboard.pdf},
year = 2005
}
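As a simplified illustration of the closed-vocabulary idea (not the paper's actual selection algorithm): pick a candidate vocabulary, then keep only utterances made entirely of in-vocabulary words, which gives a 0% OOV rate by construction.

from collections import Counter

utterances = [["yeah", "right"],
              ["i", "think", "so"],
              ["uh", "yeah"],
              ["that", "is", "interesting"]]

# Candidate vocabulary: the k most frequent word types.
counts = Counter(w for u in utterances for w in u)
vocab = {w for w, _ in counts.most_common(4)}

# The task: every utterance whose word set lies inside the vocabulary.
task = [u for u in utterances if set(u) <= vocab]
print(sorted(vocab), task)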
@article{wrigley-sap05,
author = {S. J. Wrigley and G. J. Brown and V. Wan and S. Renals},
title = {Speech and crosstalk detection in multi-channel audio},
journal = {IEEE Trans. on Speech and Audio Processing},
volume = {13},
pages = {84--91},
abstract = {The analysis of scenarios in which a number of
microphones record the activity of speakers, such as in
a roundtable meeting, presents a number of
computational challenges. For example, if each
participant wears a microphone, it can receive speech
from both the microphone's wearer (local speech) and
from other participants (crosstalk). The recorded audio
can be broadly classified in four ways: local speech,
crosstalk plus local speech, crosstalk alone and
silence. We describe two experiments related to the
automatic classification of audio into these four
classes. The first experiment attempted to optimise a
set of acoustic features for use with a Gaussian
mixture model (GMM) classifier. A large set of
potential acoustic features were considered, some of
which have been employed in previous studies. The
best-performing features were found to be kurtosis,
fundamentalness and cross-correlation metrics. The
second experiment used these features to train an
ergodic hidden Markov model classifier. Tests performed
on a large corpus of recorded meetings show
classification accuracies of up to 96\%, and automatic
speech recognition performance close to that obtained
using ground truth segmentation.},
categories = {m4,meetings,edinburgh,asr,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap04-xtalk.pdf},
year = 2005
}
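One of the best-performing features reported above, kurtosis, is cheap to compute per channel and frame. A sketch with invented frame settings; local speech is typically heavier-tailed than distant crosstalk or background.

import numpy as np
from scipy.stats import kurtosis

def frame_kurtosis(x, frame_len=400, hop=160):
    # Excess kurtosis of each analysis frame of one audio channel.
    frames = [x[i:i + frame_len]
              for i in range(0, len(x) - frame_len + 1, hop)]
    return np.array([kurtosis(f) for f in frames])

rng = np.random.default_rng(0)
speechlike = rng.laplace(size=16000)  # heavy-tailed stand-in for speech
noiselike = rng.normal(size=16000)    # Gaussian stand-in for background
print(frame_kurtosis(speechlike).mean(), frame_kurtosis(noiselike).mean())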
@article{goldman2005,
author = {Jerry Goldman and Steve Renals and Steven Bird and
Franciska {de Jong} and Marcello Federico and Carl
Fleischhauer and Mark Kornbluh and Lori Lamel and Doug
Oard and Clare Stewart and Richard Wright},
title = {Accessing the spoken word},
journal = {International Journal of Digital Libraries},
volume = 5,
number = 4,
pages = {287--298},
abstract = {Spoken word audio collections cover many domains,
including radio and television broadcasts, oral
narratives, governmental proceedings, lectures, and
telephone conversations. The collection, access and
preservation of such data is stimulated by political,
economic, cultural and educational needs. This paper
outlines the major issues in the field, reviews the
current state of technology, examines the rapidly
changing policy issues relating to privacy and
copyright, and presents issues relating to the
collection and preservation of spoken audio content.},
categories = {swag,asr,ir,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/swag-ijdl05.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/swag-ijdl05.ps.gz},
year = 2005
}
@inproceedings{hifny-interspeech05,
author = {Y. Hifny and S. Renals and N. Lawrence},
title = {A Hybrid {MaxEnt/HMM} based {ASR} System},
booktitle = {Proc. Interspeech},
abstract = {The aim of this work is to develop a practical
framework, which extends the classical Hidden Markov
Models (HMM) for continuous speech recognition based on
the Maximum Entropy (MaxEnt) principle. The MaxEnt
models can estimate the posterior probabilities
directly as with Hybrid NN/HMM connectionist speech
recognition systems. In particular, a new acoustic
modelling approach based on discriminative MaxEnt models is
formulated and is being developed to replace the
generative Gaussian Mixture Models (GMM) commonly used
to model acoustic variability. Initial experimental
results using the TIMIT phone task are reported.},
categories = {ml,asr,edinburgh,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hifny-eurospeech05.pdf},
year = 2005
}
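The hybrid construction the abstract refers to rests on one identity: a discriminative model yields state posteriors p(q|x), and dividing by the state priors gives scaled likelihoods p(x|q)/p(x) that can replace GMM likelihoods in decoding. A numerical sketch with made-up values:

import numpy as np

posteriors = np.array([0.7, 0.2, 0.1])  # p(q | x) from the MaxEnt model
priors = np.array([0.5, 0.3, 0.2])      # relative state frequencies
print(np.log(posteriors) - np.log(priors))  # scaled log-likelihoods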
@incollection{dielmann-mlmi04,
author = {A. Dielmann and S. Renals},
title = {Multistream dynamic {Bayesian} network for meeting
segmentation},
booktitle = {Proc. Multimodal Interaction and Related Machine
Learning Algorithms Workshop (MLMI--04)},
publisher = {Springer},
editor = {S. Bengio and H. Bourlard},
pages = {76--86},
abstract = {This paper investigates the automatic analysis and
segmentation of meetings. A meeting is analysed in
terms of individual behaviours and group interactions,
in order to decompose each meeting in a sequence of
relevant phases, named meeting actions. Three feature
families are extracted from multimodal recordings:
prosody from individual lapel microphone signals,
speaker activity from microphone array data and lexical
features from textual transcripts. A statistical
approach is then used to relate low-level features with
a set of abstract categories. In order to provide a
flexible and powerful framework, we have employed a
dynamic Bayesian network based model, characterized by
multiple stream processing and flexible state duration
modelling. Experimental results demonstrate the
strength of this system, providing a meeting action
error rate of 9\%.},
categories = {m4,multimodal,dbn,meetings,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.ps.gz},
year = 2005
}
@inproceedings{Gutkin:King:icassp05,
author = {Alexander Gutkin and Simon King},
title = {{D}etection of {S}ymbolic {G}estural {E}vents in
{A}rticulatory {D}ata for {U}se in {S}tructural
{R}epresentations of {C}ontinuous {S}peech},
booktitle = {Proc. IEEE International Conference on Acoustics,
Speech, and Signal Processing (ICASSP-05)},
volume = {I},
pages = {885--888},
address = {Philadelphia, PA, USA},
publisher = {IEEE Signal Processing Society Press},
abstract = { One of the crucial issues which often needs to be
addressed in structural approaches to speech
representation is the choice of fundamental symbolic
units of representation. In this paper, a
physiologically inspired methodology for defining these
symbolic atomic units in terms of primitive
articulatory events is proposed. It is shown how the
atomic articulatory events (gestures) can be detected
directly in the articulatory data. An algorithm for
evaluating the reliability of the articulatory events
is described and promising results of the experiments
conducted on the MOCHA articulatory database are presented.
},
categories = {structural,recognition,artic,mocha,edinburgh},
isbn = {0-7803-8875-5},
month = mar,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_icassp2005.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_king_icassp2005.ps.gz},
year = 2005
}
@inproceedings{mayoturk-psp05,
author = {Mayo, C. and Turk, A.},
title = {No Available Theories Currently Explain All
Adult-Child Cue Weighting Differences},
booktitle = {Proc. ISCA Workshop on Plasticity in Speech Perception},
address = {London, UK},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mayoday2.pdf},
year = 2005
}
@article{wan-sap05,
author = {V. Wan and S. Renals},
title = {Speaker verification using sequence discriminant
support vector machines},
journal = {IEEE Trans. on Speech and Audio Processing},
volume = {13},
pages = {203--210},
abstract = {This paper presents a text-independent speaker
verification system using support vector machines
(SVMs) with score-space kernels. Score-space kernels,
generalize Fisher kernels, and are based on an
underlying generative model, such as a Gaussian mixture
model (GMM). This approach provides direct
discrimination between whole sequences, in contrast to
the frame-level approaches at the heart of most current
systems. The resultant SVMs have a very high
dimensionality, which is related to the number of
parameters in the underlying generative model. To
ameliorate problems that can arise in the resultant
optimization, we introduce a technique called spherical
normalization that preconditions the Hessian matrix. We
have performed speaker verification experiments using
the PolyVar database. The SVM system presented here
reduces the relative error rates by 34\% compared to a
GMM likelihood ratio system.},
categories = {verification,kernel,svm,edinburgh,sheffield},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap05-svm.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/sap05-svm.ps.gz},
year = 2005
}
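For readers unfamiliar with score-space/Fisher kernels: each variable-length sequence is mapped to one fixed-length vector of derivatives of the GMM log-likelihood, here shown with respect to the component means only; the data and model below are toy stand-ins.

import numpy as np

def fisher_score_means(X, weights, means, variances):
    # X: (T, D) frames; weights: (K,); means, variances: (K, D).
    diff = X[:, None, :] - means[None, :, :]              # (T, K, D)
    log_comp = (np.log(weights)[None, :]
                - 0.5 * np.sum(np.log(2 * np.pi * variances), axis=1)[None, :]
                - 0.5 * np.sum(diff ** 2 / variances[None, :, :], axis=2))
    log_comp -= log_comp.max(axis=1, keepdims=True)
    gamma = np.exp(log_comp)
    gamma /= gamma.sum(axis=1, keepdims=True)             # responsibilities
    # d log p(X) / d mu_k = sum_t gamma_tk (x_t - mu_k) / var_k
    return np.sum(gamma[:, :, None] * diff / variances[None, :, :],
                  axis=0).ravel()

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
score = fisher_score_means(X, np.array([0.5, 0.5]),
                           rng.normal(size=(2, 3)), np.ones((2, 3)))
print(score.shape)  # (6,) regardless of sequence length T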
@inproceedings{Murray05b,
author = {G. Murray and S. Renals and J. Carletta and J. Moore},
title = {Evaluating Automatic Summaries of Meeting Recordings},
booktitle = {Proceedings of the 43rd Annual Meeting of the
Association for Computational Linguistics, Ann Arbor,
MI, USA},
abstract = {The research below explores schemes for evaluating
automatic summaries of business meetings, using the
ICSI Meeting Corpus. Both automatic and subjective
evaluations were carried out, with a central interest
being whether or not the two types of evaluations
correlate with each other. The evaluation metrics were
used to compare and contrast differing approaches to
automatic summarization, the deterioration of summary
quality on ASR output versus manual transcripts, and to
determine whether manual extracts are rated
significantly higher than automatic extracts. },
categories = {ami,summarization, speech summarization, prosody,
latent semantic analysis, summarization evaluation,
edinburgh},
month = jun,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/murray-renals-carletta-moore.pdf},
year = 2005
}
@inproceedings{murray-interspeech05,
author = {G. Murray and S. Renals and J. Carletta},
title = {Extractive Summarization of Meeting Recordings},
booktitle = {Proc. Interspeech},
abstract = {Several approaches to automatic speech summarization
are discussed below, using the ICSI Meetings corpus. We
contrast feature-based approaches using prosodic and
lexical features with maximal marginal relevance and
latent semantic analysis approaches to summarization.
While the latter two techniques are borrowed directly
from the field of text summarization, feature-based
approaches using prosodic information are able to
utilize characteristics unique to speech data. We also
investigate how the summarization results might
deteriorate when carried out on ASR output as opposed
to manual transcripts. All of the summaries are of an
extractive variety, and are compared using the software
ROUGE.},
categories = {ami,summarization,prosody, latent semantic
analysis,edinburgh},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/murray-eurospeech05.pdf},
year = 2005
}
@inproceedings{cuayahuitletal_asru05,
author = {Heriberto Cuayáhuitl and Steve Renals and Oliver
Lemon and Hiroshi Shimodaira},
title = {Human-Computer Dialogue Simulation Using Hidden Markov
Models},
booktitle = {Proc. of IEEE Workshop on Automatic Speech Recognition
and Understanding (ASRU)},
abstract = {This paper presents a probabilistic method to simulate
task-oriented human-computer dialogues at the intention
level, which may be used to improve or to evaluate the
performance of spoken dialogue systems. Our method uses
a network of Hidden Markov Models (HMMs) to predict
system and user intentions, where a ``language model''
predicts sequences of goals and the component HMMs
predict sequences of intentions. We compare standard
HMMs, Input HMMs and Input-Output HMMs in an effort to
better predict sequences of intentions. In addition, we
propose a dialogue similarity measure to evaluate the
realism of the simulated dialogues. We performed
experiments using the DARPA Communicator corpora and
report results with three different metrics: dialogue
length, dialogue similarity and precision-recall.},
categories = {dialogue simulation, hidden markov models},
month = nov,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hcp-asru2005.pdf},
year = 2005
}
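A stripped-down version of the simulation idea: intention sequences generated from a Markov model. All probabilities here are invented, and the full system described above adds goal-level structure and input/output dependencies.

import numpy as np

states = ["greet", "request_info", "confirm", "close"]
trans = np.array([[0.0, 0.8, 0.1, 0.1],
                  [0.0, 0.3, 0.5, 0.2],
                  [0.0, 0.3, 0.2, 0.5],
                  [0.0, 0.0, 0.0, 1.0]])

rng = np.random.default_rng(3)
s = 0                                   # start in "greet"
dialogue = [states[s]]
while states[s] != "close" and len(dialogue) < 20:
    s = rng.choice(len(states), p=trans[s])
    dialogue.append(states[s])
print(dialogue)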
@inproceedings{mayoclarkking-isp05,
author = {Mayo, C. and Clark, R. A. J. and King, S.},
title = {Multidimensional Scaling of Listener Responses to
Synthetic Speech},
booktitle = {Proc. Interspeech 2005},
address = {Lisbon, Portugal},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/ie-speech-2005.pdf},
year = 2005
}
@phdthesis{shiga05,
author = {Shiga, Yoshinori},
title = {Precise Estimation of Vocal Tract and Voice Source
Characteristics},
school = {The Centre for Speech Technology Research,
University of Edinburgh},
abstract = {This thesis addresses the problem of quality
degradation in speech produced by parameter-based
speech synthesis, within the framework of an
articulatory-acoustic forward mapping. I first
investigate current problems in speech
parameterisation, and point out the fact that
conventional parameterisation inaccurately extracts the
vocal tract response due to interference from the
harmonic structure of voiced speech. To overcome this
problem, I introduce a method for estimating filter
responses more precisely from periodic signals. The
method achieves such estimation in the frequency domain
by approximating all the harmonics observed in several
frames based on a least squares criterion. It is shown
that the proposed method is capable of estimating the
response more accurately than widely-used
frame-by-frame parameterisation, for simulations using
synthetic speech and for an articulatory-acoustic
mapping using actual speech. I also deal with the
source-filter separation problem and independent
control of the voice source characteristic during
speech synthesis. I propose a statistical approach to
separating out the vocal-tract filter response from the
voice source characteristic using a large articulatory
database. The approach realises such separation for
voiced speech using an iterative approximation
procedure under the assumption that the speech
production process is a linear system composed of a
voice source and a vocal-tract filter, and that each of
the components is controlled independently by different
sets of factors. Experimental results show that
controlling the source characteristic greatly improves
the accuracy of the articulatory-acoustic mapping, and
that the spectral variation of the source
characteristic is evidently influenced by the
fundamental frequency or the power of speech. The
thesis provides more accurate acoustical approximation
of the vocal tract response, which will be beneficial
in a wide range of speech technologies, and lays the
groundwork in speech science for a new type of
corpus-based statistical solution to the source-filter
separation problem.},
categories = {mfa, multiframe, forward, mapping, source-filter,
artic, mocha, edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/phd_thesis_shiga.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/phd_thesis_shiga.ps.gz},
year = 2005
}
@inproceedings{frankel05:hybrid,
author = {Frankel, J. and King, S.},
title = {A Hybrid {ANN/DBN} Approach to Articulatory Feature
Recognition},
booktitle = {Proc. Eurospeech},
address = {Lisbon},
abstract = {Artificial neural networks (ANN) have proven to be
well suited to the task of articulatory feature (AF)
recognition. Previous studies have taken a cascaded
approach where separate ANNs are trained for each
feature group, making the assumption that features are
statistically independent. We address this by using
ANNs to provide virtual evidence to a dynamic Bayesian
network (DBN). This gives a hybrid ANN/DBN model and
allows modelling of inter-feature dependencies. We
demonstrate significant increases in AF recognition
accuracy from modelling dependencies between features,
and present the results of embedded training
experiments in which a set of asynchronous feature
changes are learned. Furthermore, we report on the
application of a Viterbi training scheme in which we
alternate between realigning the AF training labels and
retraining the ANNs.},
categories = {am,artic,asr,dbn,oginumbers,edinburgh},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/Frankel_King_INTER2005.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/Frankel_King_INTER2005.ps},
year = 2005
}
@article{koumpis2005-acmslp,
author = {Konstantinos Koumpis and Steve Renals},
title = {Automatic summarization of voicemail messages using
lexical and prosodic features},
journal = {ACM Transactions on Speech and Language Processing},
volume = 2,
number = 1,
pages = {1--24},
abstract = {This paper presents trainable methods for extracting
principal content words from voicemail messages. The
short text summaries generated are suitable for mobile
messaging applications. The system uses a set of
classifiers to identify the summary words, with each
word being identified by a vector of lexical and
prosodic features. We use an ROC-based algorithm,
Parcel, to select input features (and classifiers). We
have performed a series of objective and subjective
evaluations using unseen data from two different speech
recognition systems, as well as human transcriptions of
voicemail speech.},
categories = {voicemail,summarization,prosody,sheffield,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/tslp05.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/tslp05.ps.gz},
year = 2005
}
@article{Tokuno2005IEICE01,
author = {Junko Tokuno and Nobuhito Inami and Mitsuru Nakai and
Hiroshi Shimodaira and Shigeki Sagayama},
title = {{Context-dependent Sub-stroke Model for {HMM}-based
On-line Handwriting Recognition}},
journal = {Trans. IEICE D-II},
volume = {J88-D2},
number = {8},
note = {(in press) (in Japanese)},
abstract = { A new method is proposed for on-line Kanji
handwriting recognition. The method employs sub-stroke
HMMs as minimum units to constitute Kanji characters
and utilizes the direction of pen motion. The present
approach has the following advantages over the
conventional methods that employ character HMMs. 1)
Much smaller memory requirement for dictionary and
models. 2) Fast recognition by employing efficient
sub-stroke network search. 3) Capability of recognizing
characters not included in the training data if defined
as a sequence of sub-strokes in the dictionary. In
experiments, we have achieved a correct recognition
rate of above 96\% by using JAIST-IIPL database that
includes 1,016 educational Kanji characters. },
categories = {online handwritten character recognition},
month = aug,
year = 2005
}
@inproceedings{goubanova_king_isp05,
author = {Olga Goubanova and Simon King},
title = {Predicting Consonant Duration with {B}ayesian Belief
Networks},
booktitle = {Proc. Interspeech 2005},
address = {Lisbon, Portugal},
abstract = {Consonant duration is influenced by a number of
linguistic factors such as the consonant's identity,
within-word position, stress level of the previous and
following vowels, phrasal position of the word
containing the target consonant, its syllabic position,
and the identity of the previous and following segments.
In our work, consonant duration is predicted from a
Bayesian belief network (BN) consisting of discrete
nodes for the linguistic factors and a single continuous
node for the consonant's duration. Interactions between factors
are represented as conditional dependency arcs in this
graphical model. Given the parameters of the belief
network, the duration of each consonant in the test set
is then predicted as the value with the maximum
probability. We compare the results of the belief
network model with those of sums-of-products (SoP) and
classification and regression tree (CART) models using
the same data. In terms of RMS error, our BN model
performs better than both CART and SoP models. In terms
of the correlation coefficient, our BN model performs
better than the SoP model, and no worse than the CART model. In
addition, the Bayesian model reliably predicts
consonant duration in cases of missing or hidden
linguistic factors.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/goubanova_king_isp2005.pdf},
year = 2005
}
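The continuous duration node described above is, in effect, a conditional Gaussian: one mean and variance per configuration of its discrete parents, with the prediction being the maximum-probability value (the mean). A toy sketch with invented factors and data:

import numpy as np
from collections import defaultdict

# (consonant identity, following-vowel stress) -> durations in ms (toy).
data = [(("s", 1), 95), (("s", 1), 105), (("s", 0), 70),
        (("t", 1), 60), (("t", 0), 45), (("t", 0), 50)]

groups = defaultdict(list)
for config, dur in data:
    groups[config].append(dur)

# Conditional distribution: a Gaussian per parent configuration.
cpd = {cfg: (np.mean(d), np.std(d)) for cfg, d in groups.items()}
print(cpd[("t", 0)][0])  # predicted duration of /t/ before an unstressed vowel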
@article{koumpis2005-spmag,
author = {Koumpis, Konstantinos and Renals, Steve},
title = {Content-based access to spoken audio},
journal = {IEEE Signal Processing Magazine},
volume = 22,
number = 5,
pages = {61--69},
abstract = {"How analysis, retrieval and delivery phases make
spoken audio content more accessible"},
categories = {asr,ir,summarization,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/koumpis-spm05.pdf},
year = 2005
}
@phdthesis{Gutkin:phd:05,
author = {Alexander Gutkin},
title = {{T}owards {F}ormal {S}tructural {R}epresentation of
{S}poken {L}anguage: {A}n {E}volving {T}ransformation
{S}ystem ({ETS}) {A}pproach},
school = {School of Informatics, University of Edinburgh},
address = {UK},
note = {Internal version},
categories = {structural,representation,recognition,edinburgh,unb,ets},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_phd_full.pdf},
year = 2005
}
@inproceedings{AMIMLMI05,
author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
Karafiat and M. Lincoln and I. McCowan and D. Moore and
V. Wan and R. Ordelman and S. Renals},
title = {The Development of the {AMI} System for the
Transcription of Speech in Meetings},
booktitle = {2nd Joint Workshop on Multimodal Interaction and
Related Machine Learning Algorithms},
abstract = {The automatic processing of speech collected in
conference style meetings has attracted considerable
interest with several large scale projects devoted to
this area. This paper describes the development of a
baseline automatic speech transcription system for
meetings in the context of the AMI (Augmented
Multiparty Interaction) project. We present several
techniques important to processing of this data and
show the performance in terms of word error rates
(WERs). An important aspect of transcription of this
data is the necessary flexibility in terms of audio
pre-processing. Real world systems have to deal with
flexible input, for example by using microphone arrays
or randomly placed microphones in a room. Automatic
segmentation and microphone array processing techniques
are described and the effect on WERs is discussed. The
system and its components presented in this paper yield
competitive performance and form a baseline for future
research in this domain.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIsystemMLMI05.pdf},
year = 2005
}
@inproceedings{faria-eurospeech05,
author = {A.~Faria and D.~Gelbart},
title = {Efficient Pitch-based Estimation of {VTLN} Warp
Factors},
booktitle = {Proc. Eurospeech},
abstract = { To reduce inter-speaker variability, vocal tract
length normalization (VTLN) is commonly used to
transform acoustic features for automatic speech
recognition (ASR). The warp factors used in this
process are usually derived by maximum likelihood (ML)
estimation, involving an exhaustive search over
possible values. We describe an alternative approach:
exploit the correlation between a speaker's average
pitch and vocal tract length, and model the probability
distribution of warp factors conditioned on pitch
observations. This can be used directly for warp factor
estimation, or as a smoothing prior in combination with
ML estimates. Pitch-based warp factor estimation for
VTLN is effective and requires relatively little memory
and computation. Such an approach is well-suited for
environments with constrained resources, or where pitch
is already being computed for other purposes. },
categories = {vocal tract length normalization,speaker adaptation},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/pbvtln-latest.pdf},
year = 2005
}
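The combination described above, a pitch-conditioned prior smoothing the ML estimate, amounts to a MAP search over the warp grid. All densities below are invented stand-ins for models trained on real data.

import numpy as np

grid = np.arange(0.80, 1.21, 0.02)

def log_prior_given_pitch(alpha, mean_f0):
    # Hypothetical regression: higher average pitch tends to go with a
    # shorter vocal tract, hence a warp factor above 1.
    mu = 0.9 + 0.001 * mean_f0
    return -0.5 * ((alpha - mu) / 0.05) ** 2

def ml_log_likelihood(alpha):
    # Stand-in for the exhaustive-search ML score curve.
    return -0.5 * ((alpha - 1.06) / 0.10) ** 2

posterior = ml_log_likelihood(grid) + log_prior_given_pitch(grid, 180.0)
print(grid[int(np.argmax(posterior))])  # MAP warp factor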
@inproceedings{Gutkin:Gay:ijcai05,
author = {Alexander Gutkin and David R. Gay},
title = {Structural Representation and Matching of Articulatory
Speech Structures based on the Evolving Transformation
System ({ETS}) Formalism},
booktitle = {Proc. Nineteenth International Joint Conference on
Artificial Intelligence (IJCAI-05)},
address = {Edinburgh, UK},
categories = {structural,recognition,ets,artic,mocha,edinburgh,unb},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/gutkin_gay_ijcai05.pdf},
year = 2005
}
@inproceedings{hofer-eurosp05,
author = {G. Hofer and K. Richmond and R. Clark},
title = {Informed Blending of Databases for Emotional Speech
Synthesis},
booktitle = {Proc. Interspeech},
abstract = {The goal of this project was to build a unit selection
voice that could portray emotions with varying
intensities. A suitable definition of an emotion was
developed along with a descriptive framework that
supported the work carried out. A single speaker was
recorded portraying happy and angry speaking styles.
Additionally a neutral database was also recorded. A
target cost function was implemented that chose units
according to emotion mark-up in the database. The
Dictionary of Affect supported the emotional target
cost function by providing an emotion rating for words
in the target utterance. If a word was particularly
'emotional', units from that emotion were favoured. In
addition, intensity could be varied, which resulted in a
bias to select a greater number of emotional units. A
perceptual evaluation was carried out, and subjects were
able to reliably recognise emotions with varying
numbers of emotional units present in the target
utterance.},
categories = {speech synthesis,emotion,edinburgh},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.ps},
year = 2005
}
@article{onnis2005,
author = {Onnis, L. and Monaghan, P. and Richmond, K. and
Chater, N.},
title = {Phonology impacts segmentation in speech processing.},
journal = {Journal of Memory and Language},
volume = {53},
number = {2},
pages = {225--237},
abstract = {Peña, Bonatti, Nespor and Mehler (2002) investigated an
artificial language where the structure of words was
determined by nonadjacent dependencies between
syllables. They found that segmentation of continuous
speech could proceed on the basis of these
dependencies. However, Peña et al.'s artificial
language contained a confound in terms of phonology, in
that the dependent syllables began with plosives and
the intervening syllables began with continuants. We
consider three hypotheses concerning the role of
phonology in speech segmentation in this task: (1)
participants may recruit probabilistic phonotactic
information from their native language to the
artificial language learning task; (2) phonetic
properties of the stimuli, such as the gaps that
precede unvoiced plosives, can influence segmentation;
and (3) grouping by phonological similarity between
dependent syllables contributes to learning the
dependency. In a series of experiments controlling the
phonological and statistical structure of the language,
we found that segmentation performance is influenced by
the three factors in different degrees. Learning of
non-adjacent dependencies did not occur when (3) is
eliminated. We suggest that phonological processing
provides a fundamental contribution to distributional
analysis.},
categories = {artificial language learning, statistical learning,
segmentation, phonology, festival},
key = {onnis2005},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/jml.pdf},
year = 2005
}
@article{chang05,
author = {S. Chang and M. Wester and S. Greenberg},
title = {An elitist approach to automatic articulatory-acoustic
feature classification for phonetic characterization of
spoken language},
journal = {Speech Communication},
volume = {47},
pages = {290-311},
abstract = {A novel framework for automatic articulatory-acoustic
feature extraction has been developed for enhancing the
accuracy of place- and manner-of-articulation
classification in spoken language. The "elitist"
approach provides a principled means of selecting
frames for which multi-layer perceptron neural-network
classifiers are highly confident. Using this method it
is possible to achieve a frame-level accuracy of 93\%
on "elitist" frames for manner classification on a
corpus of American English sentences passed through a
telephone network (NTIMIT). Place-of-articulation
information is extracted for each manner class
independently, resulting in an appreciable gain in
place-feature classification relative to performance
for a manner-independent system. A comparable
enhancement in classification performance for the
elitist approach is evidenced when applied to a Dutch
corpus of quasi-spontaneous telephone interactions
(VIOS). The elitist framework provides a potential
means of automatically annotating a corpus at the
phonetic level \emph{without recourse to a word-level
transcript} and could thus be of utility for developing
training materials for automatic speech recognition and
speech synthesis applications, as well as aiding the
empirical study of spoken language.},
categories = {aaf, VIOS, NTIMIT, Berkeley},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2005/elitist-final-specom.pdf},
year = 2005
}
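The "elitist" selection step described above reduces to thresholding the classifier's confidence: keep only frames whose best posterior clears a cutoff, and classify those. A minimal sketch with made-up posteriors:

import numpy as np

posteriors = np.array([[0.96, 0.03, 0.01],   # p(class | frame) from an MLP
                       [0.40, 0.35, 0.25],
                       [0.10, 0.85, 0.05]])

threshold = 0.80
confident = posteriors.max(axis=1) >= threshold  # the "elitist" frames
print(posteriors.argmax(axis=1)[confident])      # decisions on those frames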