Publications by Giulia Garau
ggarau.bib
@inproceedings{garau-interspeech05,
author = {G. Garau and S. Renals and T. Hain},
title = {Applying Vocal Tract Length Normalization to Meeting
Recordings},
booktitle = {Proc. Interspeech},
abstract = {Vocal Tract Length Normalisation (VTLN) is a commonly
used technique to normalise for inter-speaker
variability. It is based on the speaker-specific
warping of the frequency axis, parameterised by a
scalar warp factor. This factor is typically estimated
using maximum likelihood. We discuss how VTLN may be
applied to multiparty conversations, reporting a
substantial decrease in word error rate in experiments
using the ICSI meetings corpus. We investigate the
behaviour of the VTLN warping factor and show that a
stable estimate is not obtained. Instead it appears to
be influenced by the context of the meeting, in
particular the current conversational partner. These
results are consistent with predictions made by the
psycholinguistic interactive alignment account of
dialogue, when applied at the acoustic and phonological
levels.},
categories = {ami,asr,edinburgh,vtln,speaker
adaptation,lvcsr,meetings},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/giuliagarau_eurospeech05.pdf},
year = 2005
}
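
The maximum likelihood warp factor estimation described in the abstract above amounts to a one-dimensional grid search: recompute the features at each candidate warp factor and keep the factor under which the acoustic model assigns the data the highest likelihood. A minimal Python sketch of that search follows; the warp grid, the fabricated features, and the single diagonal Gaussian standing in for the acoustic model are illustrative assumptions, not the system used in the paper.

import numpy as np

# Hypothetical per-speaker data: one (frames x dims) feature matrix per
# candidate warp factor, as if the MFCC filterbank frequency axis had
# been rescaled by alpha. Faked with random numbers for brevity.
rng = np.random.default_rng(0)
warp_grid = np.round(np.arange(0.80, 1.21, 0.02), 2)
feats = {a: rng.normal(a - 1.0, 1.0, (200, 13)) for a in warp_grid}

# A single diagonal-covariance Gaussian standing in for the full
# acoustic model (an assumption made purely for illustration).
mean, var = np.zeros(13), np.ones(13)

def log_likelihood(x):
    # Sum of per-frame diagonal-Gaussian log densities.
    return -0.5 * np.sum(np.log(2 * np.pi * var) + (x - mean) ** 2 / var)

# Maximum-likelihood VTLN: keep the warp factor whose warped features
# best fit the model.
best_alpha = max(warp_grid, key=lambda a: log_likelihood(feats[a]))
print(f"selected warp factor: {best_alpha:.2f}")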
@inproceedings{hain-interspeech05,
author = {T. Hain and J. Dines and G. Garau and M. Karafiat and
D. Moore and V. Wan and R. Ordelman and S. Renals},
title = {Transcription of Conference Room Meetings: An
Investigation},
booktitle = {Proc. Interspeech},
abstract = {The automatic processing of speech collected in
conference style meetings has attracted considerable
interest with several large scale projects devoted to
this area. In this paper we explore the use of various
meeting corpora for the purpose of automatic speech
recognition. In particular we investigate the
similarity of these resources and how to efficiently
use them in the construction of a meeting transcription
system. The analysis shows distinctive features for
each resource. However, the benefit of pooling data, and
hence the similarity, seems sufficient to speak of a
generic conference meeting domain. In this context, this
paper also presents work on the development of the
AMI meeting transcription system, a joint effort by
seven sites working on the AMI (augmented multi-party
interaction) project.},
categories = {ami,asr,edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hain-eurospeech05.pdf},
year = 2005
}
@inproceedings{NistevalAMI06,
author = {T. Hain and L. Burget and J. Dines and
G. Garau and M. Karafiat and M. Lincoln and J. Vepa and
V. Wan},
title = {The {AMI} Meeting Transcription System: Progress and
Performance},
booktitle = {Proceedings of the Rich Transcription 2006 Spring
Meeting Recognition Evaluation},
abstract = {We present the AMI 2006 system for the transcription
of speech in meetings. The system was jointly developed
by multiple sites on the basis of the 2005 system for
participation in the NIST RT'05 evaluations. The paper
describes major developments such as improvements in
automatic segmentation, cross-domain model adaptation,
inclusion of MLP based features, improvements in
decoding, language modelling and vocal tract length
normalisation, the use of a new decoder, and a new
system architecture. This is followed by a
comprehensive description of the final system and its
performance in the NIST RT'06s evaluations. In
comparison to the previous year, word error rate results
on the individual headset microphone task were reduced
by 20\% relative.},
categories = {LVCSR, NIST Meeting Transcription Evaluation RT06S},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/AMIasr.nist06.pdf},
year = 2006
}
@article{garau2008,
author = {Garau, Giulia and Renals, Steve},
title = {Combining Spectral Representations for Large
Vocabulary Continuous Speech Recognition},
journal = {IEEE Transactions on Audio, Speech, and Language
Processing},
volume = {16},
number = {3},
pages = {508--518},
abstract = {In this paper we investigate the combination of
complementary acoustic feature streams in large
vocabulary continuous speech recognition (LVCSR). We
have explored the use of acoustic features obtained
using a pitch-synchronous analysis, STRAIGHT, in
combination with conventional features such as mel
frequency cepstral coefficients. Pitch-synchronous
acoustic features are of particular interest when used
with vocal tract length normalisation (VTLN) which is
known to be affected by the fundamental frequency. We
have combined these spectral representations directly
at the acoustic feature level using heteroscedastic
linear discriminant analysis (HLDA) and at the system
level using ROVER. We evaluated this approach on three
LVCSR tasks: dictated newspaper text (WSJCAM0),
conversational telephone speech (CTS), and multiparty
meeting transcription. The CTS and meeting
transcription experiments were both evaluated using
standard NIST test sets and evaluation protocols. Our
results indicate that combining conventional and
pitch-synchronous acoustic feature sets using HLDA
results in a consistent, significant decrease in word
error rate across all three tasks. Combining at the
system level using ROVER resulted in a further
significant decrease in word error rate.},
doi = {10.1109/TASL.2008.916519},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/garau-taslp08.pdf},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4443886},
year = 2008
}
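
The feature-level combination reported above concatenates the conventional and pitch-synchronous streams and projects the result to a compact space with HLDA. The sketch below shows that concatenate-then-project shape, substituting scikit-learn's ordinary LDA for HLDA and random arrays for real MFCC and STRAIGHT features; both substitutions are assumptions for illustration only.

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.default_rng(1)
mfcc = rng.normal(size=(1000, 39))      # stand-in for conventional MFCCs
straight = rng.normal(size=(1000, 39))  # stand-in for STRAIGHT features
labels = np.repeat(np.arange(40), 25)   # stand-in for HMM state labels

# Concatenate the two streams, then learn a discriminative linear
# projection back down to a conventional feature dimensionality.
# (The paper uses HLDA; plain LDA is used here only because it is
# readily available and shares the same overall structure.)
combined = np.hstack([mfcc, straight])  # shape (1000, 78)
lda = LinearDiscriminantAnalysis(n_components=39)
projected = lda.fit_transform(combined, labels)
print(projected.shape)                  # (1000, 39)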
@inproceedings{NistevalAMI05,
author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
Karafiat and M. Lincoln and I. McCowan and D. Moore and
V. Wan and R. Ordelman and S. Renals},
title = {The 2005 {AMI} System for the Transcription of Speech
in Meetings},
booktitle = {Proceedings of the Rich Transcription 2005 Spring
Meeting Recognition Evaluation},
abstract = {In this paper we describe the 2005 AMI system for the
transcription of speech in meetings used in the 2005
NIST RT evaluations. The system was designed for
participation in the speech to text part of the
evaluations, in particular for transcription of speech
recorded with multiple distant microphones and
independent headset microphones. System performance was
tested on both conference room and lecture style
meetings. Although input sources are processed using
different frontends, the recognition process is based
on a unified system architecture. The system operates
in multiple passes and makes use of state-of-the-art
technologies such as discriminative training, vocal
tract length normalisation, heteroscedastic linear
discriminant analysis, speaker adaptation with maximum
likelihood linear regression and minimum word error
rate decoding. In this paper we describe the system
performance on the official development and test sets
for the NIST RT05s evaluations. The system was jointly
developed in less than 10 months by a multi-site team
and was shown to achieve competitive performance.},
categories = {LVCSR, NIST Meeting Transcription Evaluation RT05S},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIasr.nist2005.pdf},
year = 2005
}
@inproceedings{garau2008a,
author = {Garau, Giulia and Renals, Steve},
title = {Pitch adaptive features for {LVCSR}},
booktitle = {Proc. Interspeech},
abstract = {We have investigated the use of a pitch adaptive
spectral representation on large vocabulary speech
recognition, in conjunction with speaker normalisation
techniques. We have compared the effect of a smoothed
spectrogram to the pitch adaptive spectral analysis by
decoupling these two components of STRAIGHT.
Experiments performed on a large vocabulary meeting
speech recognition task highlight the importance of
combining a pitch adaptive spectral representation with
a conventional fixed window spectral analysis. We found
evidence that STRAIGHT pitch adaptive features are more
speaker independent than conventional MFCCs without
pitch adaptation, and thus also provide better
performance when combined using feature combination
techniques such as Heteroscedastic Linear Discriminant
Analysis.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/garau2008a.pdf},
year = 2008
}
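
The pitch adaptive spectral analysis discussed above ties the analysis window to the local fundamental frequency rather than to a fixed duration. The toy Python example below illustrates only that one idea; the sample rate and the fixed number of pitch periods are assumed constants, and STRAIGHT's actual time-frequency smoothing is far more sophisticated.

sample_rate = 16000  # Hz, assumed

def window_length(f0_hz, periods=3):
    # Pitch-adaptive window: span a fixed number of pitch periods, so
    # higher-pitched speech gets a shorter analysis window.
    return int(periods * sample_rate / f0_hz)

for f0 in (100.0, 200.0, 300.0):
    n = window_length(f0)
    print(f"F0 = {f0:5.1f} Hz -> {n} samples "
          f"({1000 * n / sample_rate:.1f} ms)")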
@inproceedings{AMIMLMI05,
author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
Karafiat and M. Lincoln and I. McCowan and D. Moore and
V. Wan and R. Ordelman and S. Renals},
title = {The Development of the {AMI} System for the
Transcription of Speech in Meetings},
booktitle = {2nd Joint Workshop on Multimodal Interaction and
Related Machine Learning Algorithms},
abstract = {The automatic processing of speech collected in
conference style meetings has attracted considerable
interest with several large scale projects devoted to
this area. This paper describes the development of a
baseline automatic speech transcription system for
meetings in the context of the AMI (Augmented
Multiparty Interaction) project. We present several
techniques important to processing of this data and
show the performance in terms of word error rates
(WERs). An important aspect of transcription of this
data is the necessary flexibility in terms of audio
pre-processing. Real world systems have to deal with
flexible input, for example by using microphone arrays
or randomly placed microphones in a room. Automatic
segmentation and microphone array processing techniques
are described and the effect on WERs is discussed. The
system and its components presented in this paper yield
competitive performance and form a baseline for future
research in this domain.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIsystemMLMI05.pdf},
year = 2005
}
@inproceedings{AMIsystemICASSP2007,
author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
Karafiat and M. Lincoln and J. Vepa and V. Wan},
title = {The {AMI} System for the Transcription of Speech in
Meetings},
booktitle = {Proc. {ICASSP}},
abstract = {This paper describes the AMI transcription system for
speech in meetings developed in collaboration by five
research groups. The system includes generic techniques
such as discriminative and speaker adaptive training,
vocal tract length normalisation, heteroscedastic
linear discriminant analysis, maximum likelihood linear
regression, and phone posterior based features, as well
as techniques specifically designed for meeting data.
These include segmentation and cross-talk suppression,
beam-forming, domain adaptation, web-data collection,
and channel adaptive training. The system was improved
by more than 20\% relative in word error rate compared
to our previous system and was used in the NIST RT'06
evaluations where it was found to yield competitive
performance.},
categories = {LVCSR, NIST Meeting Transcription Evaluation RT06S},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ICASSP07.AMIasrsystem.pdf},
year = 2007
}