The Centre for Speech Technology Research, The University of Edinburgh

Publications by Vipperla Ravi Chander

s0680896.bib

@inproceedings{vipperla08,
  author = {Ravichander Vipperla and Steve Renals and Joe Frankel},
  title = {Longitudinal study of {ASR} performance on ageing
                   voices},
  booktitle = {Proc.~Interspeech},
  address = {Brisbane},
  abstract = {This paper presents the results of a longitudinal
                    study of ASR performance on ageing voices. Experiments
                    were conducted on the audio recordings of the
                    proceedings of the Supreme Court of the United States
                    (SCOTUS). Results show that Automatic Speech
                    Recognition (ASR) Word Error Rates (WERs) for elderly
                    voices are significantly higher than those for adult
                    voices, and that the word error rate rises gradually as
                    the age of the elderly speakers increases. Maximum
                    likelihood linear regression (MLLR) based speaker
                    adaptation reduces the WER for ageing voices, though
                    performance remains considerably worse than for adult
                    voices. Speaker adaptation does, however, slow the
                    increase in WER with age during old age.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/vipperla_is08.pdf},
  year = 2008
}
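
As a rough illustration of the MLLR-based speaker adaptation mentioned in the abstract above: MLLR replaces each Gaussian mean of the acoustic model with an affine transform of it, the transform being estimated by maximising the likelihood of the target speaker's adaptation data. The NumPy sketch below shows only how an already-estimated transform is applied; the function name and toy dimensions are illustrative assumptions, not code from the paper.

    import numpy as np

    def apply_mllr_means(means, W):
        # means: (n_gaussians, d) array of Gaussian mean vectors
        # W: (d, d+1) regression matrix [b | A] estimated on adaptation data
        extended = np.hstack([np.ones((means.shape[0], 1)), means])  # prepend bias term
        return extended @ W.T                                        # adapted means

    d = 39                                          # e.g. MFCCs with deltas and accelerations
    W = np.hstack([np.zeros((d, 1)), np.eye(d)])    # [b | A]: zero bias, identity rotation
    means = np.random.randn(8, d)
    adapted = apply_mllr_means(means, W)            # equals means for this trivial W
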
@incollection{vipperla2009a,
  author = {Vipperla, Ravi Chander and Wolters, Maria and
                   Georgila, Kallirroi and Renals, Steve},
  title = {Speech Input from Older Users in Smart Environments:
                   Challenges and Perspectives},
  booktitle = {Proc. HCI International: Universal Access in
                   Human-Computer Interaction. Intelligent and Ubiquitous
                   Interaction Environments},
  publisher = {Springer},
  number = {5615},
  series = {Lecture Notes in Computer Science},
  abstract = {Although older people are an important user group for
                   smart environments, there has been relatively little
                   work on adapting natural language interfaces to their
                   requirements. In this paper, we focus on a particularly
                   thorny problem: processing speech input from older
                   users. Our experiments on the MATCH corpus show clearly
                   that we need age-specific adaptation in order to
                   recognize older users' speech reliably. Language models
                   need to cover typical interaction patterns of older
                   people, and acoustic models need to accommodate older
                   voices. Further research is needed into intelligent
                   adaptation techniques that will allow existing large,
                   robust systems to be adapted with relatively small
                    amounts of in-domain, age-appropriate data. In
                   addition, older users need to be supported with
                   adequate strategies for handling speech recognition
                   errors.},
  doi = {10.1007/978-3-642-02710-9},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/vipperla2009a.pdf},
  url = {http://www.springerlink.com/content/27r01345r1683251/?p=ad2394d646814db59cf9868b0f74b11e&pi=13},
  year = 2009
}
@inproceedings{wolters-is:09,
  author = {Wolters, Maria and Vipperla, Ravichander and Renals,
                   Steve},
  title = {Age Recognition for Spoken Dialogue Systems: Do We
                   Need It?},
  booktitle = {Proc. Interspeech},
  abstract = {When deciding whether to adapt relevant aspects of the
                   system to the particular needs of older users, spoken
                   dialogue systems often rely on automatic detection of
                   chronological age. In this paper, we show that vocal
                   ageing as measured by acoustic features is an
                   unreliable indicator of the need for adaptation. Simple
                   lexical features greatly improve the prediction of both
                    relevant aspects of cognition and interaction style.
                   Lexical features also boost age group prediction. We
                   suggest that adaptation should be based on observed
                   behaviour, not on chronological age, unless it is not
                   feasible to build classifiers for relevant adaptation
                   decisions.},
  categories = {age recognition, spoken dialogue systems},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/is09.pdf},
  year = 2009
}
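
The abstract above reports that simple lexical features boost age-group prediction. A minimal, purely illustrative sketch of that kind of classifier (bag-of-words features and a linear model via scikit-learn); the utterances and labels below are invented, not MATCH corpus data.

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline

    # Hypothetical transcripts and age-group labels, for illustration only
    transcripts = ["yes please remind me about the doctor",
                   "cancel the appointment thank you dear",
                   "set a reminder for seven",
                   "play the next one"]
    age_group = ["older", "older", "adult", "adult"]

    clf = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))
    clf.fit(transcripts, age_group)                   # learn word-count cues for age group
    print(clf.predict(["please remind me thank you"]))
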
@inproceedings{vipperla2010a,
  author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
                   Joe},
  title = {Augmentation of adaptation data},
  booktitle = {Proc. Interspeech},
  pages = {530--533},
  address = {Makuhari, Japan},
  abstract = {Linear regression based speaker adaptation approaches
                   can improve Automatic Speech Recognition (ASR) accuracy
                   significantly for a target speaker. However, when the
                   available adaptation data is limited to a few seconds,
                    the accuracy of speaker-adapted models is often worse
                    than that of speaker-independent models. In this
                   paper, we propose an approach to select a set of
                   reference speakers acoustically close to the target
                   speaker whose data can be used to augment the
                   adaptation data. To determine the acoustic similarity
                   of two speakers, we propose a distance metric based on
                   transforming sample points in the acoustic space with
                   the regression matrices of the two speakers. We show
                   the validity of this approach through a speaker
                    identification task. ASR results on the SCOTUS and AMI
                    corpora, with 10 to 15 seconds of adaptation data
                    augmented by data from selected reference speakers,
                    show a significant reduction in Word Error Rate
                    relative to both speaker-independent and
                    speaker-adapted models.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-is2010.pdf},
  year = 2010
}
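
A minimal sketch of the speaker-similarity idea described in this abstract, assuming each speaker's MLLR regression matrix is available: sample points from the acoustic space are passed through both speakers' transforms, and the average distance between the transformed points serves as the dissimilarity score. The function name, the Euclidean norm, and plain averaging are assumptions for illustration, not the paper's exact formulation.

    import numpy as np

    def speaker_distance(W_a, W_b, sample_points):
        # W_a, W_b: (d, d+1) MLLR regression matrices of the two speakers
        # sample_points: (n, d) points drawn from the acoustic feature space
        ext = np.hstack([np.ones((sample_points.shape[0], 1)), sample_points])
        diff = ext @ (W_a - W_b).T      # difference between the two transformed point sets
        return float(np.mean(np.linalg.norm(diff, axis=1)))

    # Reference speakers with the smallest distance to the target would then be
    # pooled with the target's few seconds of data before re-estimating the transform.
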
@article{vipperla2010,
  author = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
                   Joe},
  title = {Ageing voices: The effect of changes in voice
                   parameters on {ASR} performance},
  journal = {EURASIP Journal on Audio, Speech, and Music Processing},
  abstract = {With ageing, human voices undergo several changes,
                    typically characterized by increased hoarseness and
                    changes in articulation patterns. In this study, we
                    have examined the effect of these changes on Automatic
                    Speech Recognition (ASR) and found that the Word Error
                    Rate (WER) for older voices is about 9\% absolute
                    higher than for adult voices. Subsequently, we compared
                    several voice source parameters, including fundamental
                    frequency, jitter, shimmer, harmonicity and cepstral
                    peak prominence, of adult and older males. Several of
                    these parameters show statistically significant
                    differences between the two groups. However,
                    artificially increasing the jitter and shimmer measures
                    does not affect ASR accuracy significantly.
                    Artificially lowering the fundamental frequency
                    degrades ASR performance marginally, but this drop in
                    performance can be overcome to some extent using Vocal
                    Tract Length Normalisation (VTLN). Overall, we observe
                    that changes in the voice source parameters do not have
                    a significant impact on ASR performance. Comparison of
                    the likelihood scores of all the phonemes for the two
                    age groups shows a systematic mismatch between the
                    acoustic spaces of the two age groups. Comparison of
                    the phoneme recognition rates shows that mid vowels,
                    nasals and phonemes that depend on the ability to
                    create constrictions with the tongue tip for
                    articulation are more affected by ageing than other
                    phonemes.},
  doi = {10.1155/2010/525783},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-eurasip10.pdf},
  url = {http://dx.doi.org/10.1155/2010/525783},
  year = 2010
}
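
Two of the voice source measures compared in this article, jitter and shimmer, can be sketched from their standard definitions: local jitter is the mean absolute difference between consecutive pitch periods divided by the mean period, and local shimmer is the analogous ratio over consecutive peak amplitudes. The NumPy sketch below uses those textbook definitions rather than the tooling used in the study.

    import numpy as np

    def local_jitter(periods):
        # periods: durations (in seconds) of consecutive pitch periods
        p = np.asarray(periods, dtype=float)
        return np.mean(np.abs(np.diff(p))) / np.mean(p)

    def local_shimmer(amplitudes):
        # amplitudes: peak amplitudes of consecutive pitch periods
        a = np.asarray(amplitudes, dtype=float)
        return np.mean(np.abs(np.diff(a))) / np.mean(a)

    # Example: a mildly irregular voice with a nominal 120 Hz fundamental frequency
    periods = 1.0 / 120 + np.random.normal(0.0, 1e-4, size=50)
    print(local_jitter(periods))
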