The Centre for Speech Technology Research, The University of Edinburgh

Publications by Giulia Garau

ggarau.bib

@inproceedings{garau-interspeech05,
  author = {Garau, G. and Renals, S. and Hain, T.},
  title = {Applying Vocal Tract Length Normalization to Meeting Recordings},
  booktitle = {Proc. Interspeech},
  month = sep,
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/giuliagarau_eurospeech05.pdf},
  abstract = {Vocal Tract Length Normalisation (VTLN) is a commonly used technique to normalise for inter-speaker variability. It is based on the speaker-specific warping of the frequency axis, parameterised by a scalar warp factor. This factor is typically estimated using maximum likelihood. We discuss how VTLN may be applied to multiparty conversations, reporting a substantial decrease in word error rate in experiments using the ICSI meetings corpus. We investigate the behaviour of the VTLN warping factor and show that a stable estimate is not obtained. Instead it appears to be influenced by the context of the meeting, in particular the current conversational partner. These results are consistent with predictions made by the psycholinguistic interactive alignment account of dialogue, when applied at the acoustic and phonological levels.},
  categories = {ami,asr,edinburgh,vtln,speaker adaptation,lvcsr,meetings}
}
@inproceedings{hain-interspeech05,
  author = {Hain, T. and Dines, J. and Garau, G. and Karafiat, M. and Moore, D. and Wan, V. and Ordelman, R. and Renals, S.},
  title = {Transcription of Conference Room Meetings: an Investigation},
  booktitle = {Proc. Interspeech},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hain-eurospeech05.pdf},
  abstract = {The automatic processing of speech collected in conference style meetings has attracted considerable interest with several large scale projects devoted to this area. In this paper we explore the use of various meeting corpora for the purpose of automatic speech recognition. In particular we investigate the similarity of these resources and how to efficiently use them in the construction of a meeting transcription system. The analysis shows distinctive features for each resource. However the benefit in pooling data and hence the similarity seems sufficient to speak of a generic conference meeting domain. In this context this paper also presents work on development for the AMI meeting transcription system, a joint effort by seven sites working on the AMI (augmented multi-party interaction) project.},
  categories = {ami,asr,edinburgh}
}
@inproceedings{NistevalAMI06,
  author = {Hain, T. and Burget, L. and Dines, J. and Garau, G. and Karafiat, M. and Lincoln, M. and Vepa, J. and Wan, V.},
  title = {The {AMI} Meeting Transcription System: Progress and Performance},
  booktitle = {Proceedings of the Rich Transcription 2006 Spring Meeting Recognition Evaluation},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/AMIasr.nist06.pdf},
  abstract = {We present the AMI 2006 system for the transcription of speech in meetings. The system was jointly developed by multiple sites on the basis of the 2005 system for participation in the NIST RT'05 evaluations. The paper describes major developments such as improvements in automatic segmentation, cross-domain model adaptation, inclusion of MLP based features, improvements in decoding, language modelling and vocal tract length normalisation, the use of a new decoder, and a new system architecture. This is followed by a comprehensive description of the final system and its performance in the NIST RT'06s evaluations. In comparison to the previous year word error rate results on the individual headset microphone task were reduced by 20\% relative.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT06S}
}
@article{garau2008,
  author = {Garau, Giulia and Renals, Steve},
  title = {Combining Spectral Representations for Large Vocabulary Continuous Speech Recognition},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  volume = {16},
  number = {3},
  pages = {508--518},
  year = {2008},
  doi = {10.1109/TASL.2008.916519},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4443886},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/garau-taslp08.pdf},
  abstract = {In this paper we investigate the combination of complementary acoustic feature streams in large vocabulary continuous speech recognition (LVCSR). We have explored the use of acoustic features obtained using a pitch-synchronous analysis, STRAIGHT, in combination with conventional features such as mel frequency cepstral coefficients. Pitch-synchronous acoustic features are of particular interest when used with vocal tract length normalisation (VTLN) which is known to be affected by the fundamental frequency. We have combined these spectral representations directly at the acoustic feature level using heteroscedastic linear discriminant analysis (HLDA) and at the system level using ROVER. We evaluated this approach on three LVCSR tasks: dictated newspaper text (WSJCAM0), conversational telephone speech (CTS), and multiparty meeting transcription. The CTS and meeting transcription experiments were both evaluated using standard NIST test sets and evaluation protocols. Our results indicate that combining conventional and pitch-synchronous acoustic feature sets using HLDA results in a consistent, significant decrease in word error rate across all three tasks. Combining at the system level using ROVER resulted in a further significant decrease in word error rate.}
}
@inproceedings{NistevalAMI05,
  author = {Hain, T. and Burget, L. and Dines, J. and Garau, G. and Karafiat, M. and Lincoln, M. and McCowan, I. and Moore, D. and Wan, V. and Ordelman, R. and Renals, S.},
  title = {The 2005 {AMI} System for the Transcription of Speech in Meetings},
  booktitle = {Proceedings of the Rich Transcription 2005 Spring Meeting Recognition Evaluation},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIasr.nist2005.pdf},
  abstract = {In this paper we describe the 2005 AMI system for the transcription of speech in meetings used in the 2005 NIST RT evaluations. The system was designed for participation in the speech to text part of the evaluations, in particular for transcription of speech recorded with multiple distant microphones and independent headset microphones. System performance was tested on both conference room and lecture style meetings. Although input sources are processed using different frontends, the recognition process is based on a unified system architecture. The system operates in multiple passes and makes use of state of the art technologies such as discriminative training, vocal tract length normalisation, heteroscedastic linear discriminant analysis, speaker adaptation with maximum likelihood linear regression and minimum word error rate decoding. In this paper we describe the system performance on the official development and test sets for the NIST RT05s evaluations. The system was jointly developed in less than 10 months by a multi-site team and was shown to achieve competitive performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT05S}
}
@inproceedings{garau2008a,
  author = {Garau, Giulia and Renals, Steve},
  title = {Pitch adaptive features for {LVCSR}},
  booktitle = {Proc. Interspeech},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/garau2008a.pdf},
  abstract = {We have investigated the use of a pitch adaptive spectral representation on large vocabulary speech recognition, in conjunction with speaker normalisation techniques. We have compared the effect of a smoothed spectrogram to the pitch adaptive spectral analysis by decoupling these two components of STRAIGHT. Experiments performed on a large vocabulary meeting speech recognition task highlight the importance of combining a pitch adaptive spectral representation with a conventional fixed window spectral analysis. We found evidence that STRAIGHT pitch adaptive features are more speaker independent than conventional MFCCs without pitch adaptation, thus they also provide better performances when combined using feature combination techniques such as Heteroscedastic Linear Discriminant Analysis.}
}
@inproceedings{AMIMLMI05,
  author = {Hain, T. and Burget, L. and Dines, J. and Garau, G. and Karafiat, M. and Lincoln, M. and McCowan, I. and Moore, D. and Wan, V. and Ordelman, R. and Renals, S.},
  title = {The Development of the {AMI} System for the Transcription of Speech in Meetings},
  booktitle = {2nd Joint Workshop on Multimodal Interaction and Related Machine Learning Algorithms},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/AMIsystemMLMI05.pdf},
  abstract = {The automatic processing of speech collected in conference style meetings has attracted considerable interest with several large scale projects devoted to this area. This paper describes the development of a baseline automatic speech transcription system for meetings in the context of the AMI (Augmented Multiparty Interaction) project. We present several techniques important to processing of this data and show the performance in terms of word error rates (WERs). An important aspect of transcription of this data is the necessary flexibility in terms of audio pre-processing. Real world systems have to deal with flexible input, for example by using microphone arrays or randomly placed microphones in a room. Automatic segmentation and microphone array processing techniques are described and the effect on WERs is discussed. The system and its components presented in this paper yield competitive performance and form a baseline for future research in this domain.}
}
@inproceedings{AMIsystemICASSP2007,
  author = {Hain, T. and Burget, L. and Dines, J. and Garau, G. and Karafiat, M. and Lincoln, M. and Vepa, J. and Wan, V.},
  title = {The {AMI} System for the Transcription of Speech in Meetings},
  booktitle = {Proc. {ICASSP}},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ICASSP07.AMIasrsystem.pdf},
  abstract = {This paper describes the AMI transcription system for speech in meetings developed in collaboration by five research groups. The system includes generic techniques such as discriminative and speaker adaptive training, vocal tract length normalisation, heteroscedastic linear discriminant analysis, maximum likelihood linear regression, and phone posterior based features, as well as techniques specifically designed for meeting data. These include segmentation and cross-talk suppression, beam-forming, domain adaptation, web-data collection, and channel adaptive training. The system was improved by more than 20\% relative in word error rate compared to our previous system and was used in the NIST RT'06 evaluations where it was found to yield competitive performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT06S}
}