The Centre for Speech Technology Research, The University of Edinburgh

Publications by Giulia Garau

ggarau.bib

@inproceedings{garau-interspeech05,
  author = {G. Garau and S. Renals and T. Hain},
  title = {Applying Vocal Tract Length Normalization to Meeting
                   Recordings},
  booktitle = {Proc. Interspeech},
  abstract = {Vocal Tract Length Normalisation (VTLN) is a commonly
                   used technique to normalise for inter-speaker
                   variability. It is based on the speaker-specific
                   warping of the frequency axis, parameterised by a
                   scalar warp factor. This factor is typically estimated
                   using maximum likelihood. We discuss how VTLN may be
                   applied to multiparty conversations, reporting a
                   substantial decrease in word error rate in experiments
                   using the ICSI meetings corpus. We investigate the
                   behaviour of the VTLN warping factor and show that a
                   stable estimate is not obtained. Instead it appears to
                   be influenced by the context of the meeting, in
                   particular the current conversational partner. These
                   results are consistent with predictions made by the
                   psycholinguistic interactive alignment account of
                   dialogue, when applied at the acoustic and phonological
                   levels.},
  categories = {ami,asr,edinburgh,vtln,speaker
                   adaptation,lvcsr,meetings},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/giuliagarau_eurospeech05.pdf},
  year = 2005
}
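The warp-factor estimation described in the abstract above is, at its core, a one-dimensional maximum-likelihood search. The sketch below illustrates the idea in Python with an HTK-style piecewise-linear warp and a diagonal-covariance GMM standing in for the recogniser's acoustic model; all names, the 0.85 cut-off, and the spectral-domain simplification are illustrative assumptions, not taken from the paper.

```python
import numpy as np

def warp_axis(freqs, alpha, f_nyq):
    # Piecewise-linear VTLN warp: scale by alpha below a cut-off,
    # then continue linearly so the Nyquist frequency maps to itself.
    f_cut = 0.85 * f_nyq / max(alpha, 1.0)
    slope = (f_nyq - alpha * f_cut) / (f_nyq - f_cut)
    return np.where(freqs <= f_cut,
                    alpha * freqs,
                    alpha * f_cut + slope * (freqs - f_cut))

def warp_spectrum(frames, alpha, f_nyq):
    # Resample each magnitude-spectrum frame (rows of `frames`)
    # along the warped frequency axis.
    n_bins = frames.shape[1]
    freqs = np.linspace(0.0, f_nyq, n_bins)
    warped = warp_axis(freqs, alpha, f_nyq)
    return np.stack([np.interp(freqs, warped, f) for f in frames])

def gmm_loglik(x, weights, means, variances):
    # Total log-likelihood of frames x under a diagonal-covariance GMM,
    # with the usual log-sum-exp stabilisation.
    diff = x[:, None, :] - means[None, :, :]                 # (T, K, D)
    expo = -0.5 * np.sum(diff ** 2 / variances, axis=2)      # (T, K)
    norm = -0.5 * np.sum(np.log(2.0 * np.pi * variances), axis=1)
    comp = np.log(weights) + norm + expo
    m = comp.max(axis=1, keepdims=True)
    return float(np.sum(m[:, 0] + np.log(np.exp(comp - m).sum(axis=1))))

def estimate_warp(frames, gmm, f_nyq, grid=np.arange(0.80, 1.21, 0.02)):
    # Grid search: keep the warp factor that maximises the likelihood.
    return max(grid, key=lambda a: gmm_loglik(warp_spectrum(frames, a, f_nyq), *gmm))
```

Because the factor is re-estimated per speaker and per recording, it can drift with the conversational context, which is exactly the instability the paper reports.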
@inproceedings{hain-interspeech05,
  author = {T. Hain and J. Dines and G. Garau and M. Karafiat and
                   D. Moore and V. Wan and R. Ordelman and S. Renals},
  title = {Transcription of Conference Room Meetings: an
                   Investigation},
  booktitle = {Proc. Interspeech},
  abstract = {The automatic processing of speech collected in
                   conference style meetings has attracted considerable
                   interest with several large scale projects devoted to
                   this area. In this paper we explore the use of various
                   meeting corpora for the purpose of automatic speech
                   recognition. In particular we investigate the
                   similarity of these resources and how to efficiently
                   use them in the construction of a meeting transcription
                   system. The analysis shows distinctive features for
                    each resource. However, the benefit in pooling data and
                    hence the similarity seems sufficient to speak of a
                    generic conference meeting domain. In this context
                   this paper also presents work on development for the
                   AMI meeting transcription system, a joint effort by
                   seven sites working on the AMI (augmented multi-party
                   interaction) project.},
  categories = {ami,asr,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hain-eurospeech05.pdf},
  year = 2005
}
@inproceedings{NistevalAMI06,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and
                    M. Karafiat and M. Lincoln and J. Vepa and V. Wan},
  title = {The {AMI} Meeting Transcription System: Progress and
                   Performance},
  booktitle = {Proceedings of the Rich Transcription 2006 Spring
                   Meeting Recognition Evaluation},
  abstract = {We present the AMI 2006 system for the transcription
                   of speech in meetings. The system was jointly developed
                   by multiple sites on the basis of the 2005 system for
                   participation in the NIST RT'05 evaluations. The paper
                   describes major developments such as improvements in
                   automatic segmentation, cross-domain model adaptation,
                   inclusion of MLP based features, improvements in
                   decoding, language modelling and vocal tract length
                   normalisation, the use of a new decoder, and a new
                   system architecture. This is followed by a
                   comprehensive description of the final system and its
                   performance in the NIST RT'06s evaluations. In
                   comparison to the previous year word error rate results
                   on the individual headset microphone task were reduced
                   by 20\% relative.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT06S},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/AMIasr.nist06.pdf},
  year = 2006
}
@article{garau2008,
  author = {Garau, Giulia and Renals, Steve},
  title = {Combining Spectral Representations for Large
                   Vocabulary Continuous Speech Recognition},
  journal = {IEEE Transactions on Audio, Speech, and Language
                    Processing},
  volume = {16},
  number = {3},
  pages = {508--518},
  abstract = {In this paper we investigate the combination of
                   complementary acoustic feature streams in large
                   vocabulary continuous speech recognition (LVCSR). We
                   have explored the use of acoustic features obtained
                   using a pitch-synchronous analysis, STRAIGHT, in
                   combination with conventional features such as mel
                   frequency cepstral coefficients. Pitch-synchronous
                   acoustic features are of particular interest when used
                   with vocal tract length normalisation (VTLN) which is
                   known to be affected by the fundamental frequency. We
                   have combined these spectral representations directly
                   at the acoustic feature level using heteroscedastic
                   linear discriminant analysis (HLDA) and at the system
                   level using ROVER. We evaluated this approach on three
                   LVCSR tasks: dictated newspaper text (WSJCAM0),
                   conversational telephone speech (CTS), and multiparty
                   meeting transcription. The CTS and meeting
                   transcription experiments were both evaluated using
                   standard NIST test sets and evaluation protocols. Our
                   results indicate that combining conventional and
                   pitch-synchronous acoustic feature sets using HLDA
                   results in a consistent, significant decrease in word
                   error rate across all three tasks. Combining at the
                   system level using ROVER resulted in a further
                   significant decrease in word error rate.},
  doi = {10.1109/TASL.2008.916519},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/garau-taslp08.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4443886},
  year = 2008
}
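The feature-level combination in this paper concatenates the MFCC and STRAIGHT streams and lets a discriminant projection pick out complementary dimensions. As a rough sketch of that idea, the snippet below uses plain LDA in place of the paper's maximum-likelihood-trained HLDA (LDA is the homoscedastic special case); stream names and dimensions are illustrative.

```python
import numpy as np
from scipy.linalg import eigh

def lda_projection(x, labels, n_out):
    # Fisher discriminant directions from within- and between-class scatter.
    # Plain LDA stands in for HLDA, which relaxes the shared-covariance
    # assumption and is trained by maximum likelihood.
    mean = x.mean(axis=0)
    d = x.shape[1]
    sw = np.zeros((d, d))
    sb = np.zeros((d, d))
    for c in np.unique(labels):
        xc = x[labels == c]
        mc = xc.mean(axis=0)
        sw += (xc - mc).T @ (xc - mc)
        diff = (mc - mean)[:, None]
        sb += len(xc) * (diff @ diff.T)
    # Generalised eigenproblem sb v = lambda sw v; keep top n_out directions.
    vals, vecs = eigh(sb, sw + 1e-6 * np.eye(d))
    return vecs[:, np.argsort(vals)[::-1][:n_out]]

def combine_streams(mfcc, straight, labels, n_out=39):
    # Frame-synchronous concatenation of the two spectral representations,
    # followed by the discriminant projection down to n_out dimensions.
    stacked = np.hstack([mfcc, straight])
    return stacked @ lda_projection(stacked, labels, n_out)
```

System-level combination with ROVER, by contrast, aligns the word hypotheses of independently decoded systems and votes, which is why the paper can apply it on top of the feature-level combination for a further gain.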
@inproceedings{NistevalAMI05,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and I. McCowan and D. Moore and
                   V. Wan and R. Ordelman and S. Renals},
  title = {The 2005 {AMI} System for the Transcription of Speech
                    in Meetings},
  booktitle = {Proceedings of the Rich Transcription 2005 Spring
                   Meeting Recognition Evaluation},
  abstract = {In this paper we describe the 2005 AMI system for the
                   transcription of speech in meetings used in the 2005
                   NIST RT evaluations. The system was designed for
                   participation in the speech to text part of the
                   evaluations, in particular for transcription of speech
                   recorded with multiple distant microphones and
                   independent headset microphones. System performance was
                   tested on both conference room and lecture style
                   meetings. Although input sources are processed using
                   different frontends, the recognition process is based
                   on a unified system architecture. The system operates
                   in multiple passes and makes use of state of the art
                   technologies such as discriminative training, vocal
                   tract length normalisation, heteroscedastic linear
                   discriminant analysis, speaker adaptation with maximum
                   likelihood linear regression and minimum word error
                   rate decoding. In this paper we describe the system
                   performance on the official development and test sets
                   for the NIST RT05s evaluations. The system was jointly
                   developed in less than 10 months by a multi-site team
                   and was shown to achieve competitive performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT05S},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIasr.nist2005.pdf},
  year = 2005
}
@inproceedings{garau2008a,
  author = {Garau, Giulia and Renals, Steve},
  title = {Pitch adaptive features for {LVCSR}},
  booktitle = {Proc. Interspeech '08},
  abstract = {We have investigated the use of a pitch adaptive
                   spectral representation on large vocabulary speech
                   recognition, in conjunction with speaker normalisation
                   techniques. We have compared the effect of a smoothed
                   spectrogram to the pitch adaptive spectral analysis by
                   decoupling these two components of STRAIGHT.
                   Experiments performed on a large vocabulary meeting
                   speech recognition task highlight the importance of
                   combining a pitch adaptive spectral representation with
                   a conventional fixed window spectral analysis. We found
                   evidence that STRAIGHT pitch adaptive features are more
                   speaker independent than conventional MFCCs without
                   pitch adaptation, thus they also provide better
                   performances when combined using feature combination
                   techniques such as Heteroscedastic Linear Discriminant
                   Analysis.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/garau2008a.pdf},
  year = 2008
}
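The pitch-adaptive analysis compared here differs from a conventional fixed-window analysis mainly in that the window length tracks the local fundamental period. A heavily simplified illustration of that single idea follows (STRAIGHT itself additionally smooths the resulting spectrogram; the function name, the 10 ms shift, and the unvoiced fallback are assumptions):

```python
import numpy as np

def pitch_adaptive_frames(signal, f0_track, sample_rate, periods=3):
    # Cut one frame per 10 ms hop, with a window length proportional to
    # the local pitch period rather than fixed, in the spirit of
    # pitch-synchronous analysis.
    hop = int(0.010 * sample_rate)
    frames = []
    for i, f0 in enumerate(f0_track):
        centre = i * hop
        f0 = f0 if f0 > 0 else 100.0   # crude fallback for unvoiced frames
        half = int(periods * sample_rate / (2.0 * f0))
        lo, hi = max(0, centre - half), min(len(signal), centre + half)
        frames.append(signal[lo:hi] * np.hanning(hi - lo))
    return frames
```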
@inproceedings{AMIMLMI05,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and I. McCowan and D. Moore and
                   V. Wan and R. Ordelman and S. Renals},
  title = {The Development of the {AMI} System for the
                   Transcription of Speech in Meetings},
  booktitle = {2nd Joint Workshop on Multimodal Interaction and
                   Related Machine Learning Algorithms},
  abstract = {The automatic processing of speech collected in
                   conference style meetings has attracted considerable
                   interest with several large scale projects devoted to
                   this area. This paper describes the development of a
                   baseline automatic speech transcription system for
                   meetings in the context of the AMI (Augmented
                   Multiparty Interaction) project. We present several
                   techniques important to processing of this data and
                   show the performance in terms of word error rates
                   (WERs). An important aspect of transcription of this
                   data is the necessary flexibility in terms of audio
                   pre-processing. Real world systems have to deal with
                   flexible input, for example by using microphone arrays
                   or randomly placed microphones in a room. Automatic
                   segmentation and microphone array processing techniques
                   are described and the effect on WERs is discussed. The
                   system and its components presented in this paper yield
                    competitive performance and form a baseline for future
                   research in this domain.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIsystemMLMI05.pdf},
  year = 2005
}
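Most entries on this page report performance as word error rate. For reference, WER is the length-normalised edit distance between the reference and hypothesis word sequences; the NIST evaluations compute it with dedicated scoring tools such as sclite, but the core metric is just this (a minimal sketch):

```python
def word_error_rate(reference, hypothesis):
    # WER = (substitutions + deletions + insertions) / reference length,
    # computed by Levenshtein dynamic programming over words.
    ref, hyp = reference.split(), hypothesis.split()
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,          # deletion
                          d[i][j - 1] + 1,          # insertion
                          d[i - 1][j - 1] + cost)   # substitution or match
    return d[len(ref)][len(hyp)] / len(ref)
```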
@inproceedings{AMIsystemICASSP2007,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and J. Vepa and V. Wan},
  title = {The {AMI} System for the Transcription of Speech in
                    Meetings},
  booktitle = {Proc. {ICASSP}},
  abstract = {This paper describes the AMI transcription system for
                   speech in meetings developed in collaboration by five
                   research groups. The system includes generic techniques
                   such as discriminative and speaker adaptive training,
                   vocal tract length normalisation, heteroscedastic
                   linear discriminant analysis, maximum likelihood linear
                   regression, and phone posterior based features, as well
                   as techniques specifically designed for meeting data.
                   These include segmentation and cross-talk suppression,
                   beam-forming, domain adaptation, web-data collection,
                   and channel adaptive training. The system was improved
                   by more than 20\% relative in word error rate compared
                    to our previous system and was used in the NIST RT'06
                   evaluations where it was found to yield competitive
                   performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT06S},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ICASSP07.AMIasrsystem.pdf},
  year = 2007
}
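Among the meeting-specific techniques this abstract lists, beamforming has the simplest core: time-align the distant microphone channels and average them. The sketch below shows plain delay-and-sum given per-channel delays; in a real front end the delays would be estimated from pairwise cross-correlation, and the names here are illustrative rather than taken from the AMI system.

```python
import numpy as np

def delay_and_sum(channels, sample_rate, delays):
    # Delay-and-sum beamforming: advance each microphone channel by its
    # estimated delay (in seconds), zero-pad the ends, then average.
    n = min(len(ch) for ch in channels)
    out = np.zeros(n)
    for ch, tau in zip(channels, delays):
        shift = int(round(tau * sample_rate))
        aligned = np.zeros(n)
        if shift >= 0:
            aligned[:n - shift] = ch[shift:n]
        else:
            aligned[-shift:] = ch[:n + shift]
        out += aligned
    return out / len(channels)
```

Averaging helps the multiple-distant-microphone condition because time-aligned speech adds coherently while uncorrelated room noise averages down.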