Publications by Vipperla Ravi Chander
s0680896.bib
% Interspeech 2008 conference paper: longitudinal study of ASR word error
% rates on ageing voices, based on Supreme Court (SCOTUS) recordings.
% Authors are written in "Last, First" form, like the other entries in this
% file; the address uses the "City, Country" form.
% The pdf and abstract fields are non-standard and ignored by standard styles.
@inproceedings{vipperla08,
  author    = {Vipperla, Ravichander and Renals, Steve and Frankel, Joe},
  title     = {Longitudinal study of {ASR} performance on ageing voices},
  booktitle = {Proc.~Interspeech},
  address   = {Brisbane, Australia},
  year      = {2008},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/vipperla_is08.pdf},
  abstract  = {This paper presents the results of a longitudinal
               study of ASR performance on ageing voices. Experiments
               were conducted on the audio recordings of the
               proceedings of the Supreme Court Of The United States
               (SCOTUS). Results show that the Automatic Speech
               Recognition (ASR) Word Error Rates (WERs) for elderly
               voices are significantly higher than those of adult
               voices. The word error rate increases gradually as the
               age of the elderly speakers increase. Use of maximum
               likelihood linear regression (MLLR) based speaker
               adaptation on ageing voices improves the WER though the
               performance is still considerably lower compared to
               adult voices. Speaker adaptation however reduces the
               increase in WER with age during old age.},
}
% Paper in the HCI International 2009 proceedings (Springer LNCS 5615) on
% recognising speech input from older users in smart environments.
% booktitle uses "Proc.~" so TeX does not set an end-of-sentence space after
% the abbreviation.
% NOTE(review): the doi has no chapter suffix and looks like the DOI of the
% whole book rather than of this chapter -- confirm and replace if so.
% NOTE(review): the url carries query parameters that look session-specific;
% verify that it still resolves.
@incollection{vipperla2009a,
  author    = {Vipperla, Ravi Chander and Wolters, Maria and
               Georgila, Kallirroi and Renals, Steve},
  title     = {Speech Input from Older Users in Smart Environments:
               Challenges and Perspectives},
  booktitle = {Proc.~HCI International: Universal Access in
               Human-Computer Interaction. Intelligent and Ubiquitous
               Interaction Environments},
  series    = {Lecture Notes in Computer Science},
  number    = {5615},
  publisher = {Springer},
  year      = {2009},
  doi       = {10.1007/978-3-642-02710-9},
  url       = {http://www.springerlink.com/content/27r01345r1683251/?p=ad2394d646814db59cf9868b0f74b11e&pi=13},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/vipperla2009a.pdf},
  abstract  = {Although older people are an important user group for
               smart environments, there has been relatively little
               work on adapting natural language interfaces to their
               requirements. In this paper, we focus on a particularly
               thorny problem: processing speech input from older
               users. Our experiments on the MATCH corpus show clearly
               that we need age-specific adaptation in order to
               recognize older users' speech reliably. Language models
               need to cover typical interaction patterns of older
               people, and acoustic models need to accommodate older
               voices. Further research is needed into intelligent
               adaptation techniques that will allow existing large,
               robust systems to be adapted with relatively small
               amounts of in-domain, age appropriate data. In
               addition, older users need to be supported with
               adequate strategies for handling speech recognition
               errors.},
}
% Interspeech 2009 conference paper on whether spoken dialogue systems need
% automatic age recognition to decide on adaptation for older users.
% booktitle uses "Proc.~Interspeech", the form already used by the
% vipperla08 entry, which also avoids an end-of-sentence space after "Proc.".
% The categories, pdf and abstract fields are non-standard and ignored by
% standard styles.
@inproceedings{wolters-is:09,
  author     = {Wolters, Maria and Vipperla, Ravichander and Renals,
                Steve},
  title      = {Age Recognition for Spoken Dialogue Systems: Do We
                Need It?},
  booktitle  = {Proc.~Interspeech},
  year       = {2009},
  month      = sep,
  categories = {age recognition, spoken dialogue systems},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/is09.pdf},
  abstract   = {When deciding whether to adapt relevant aspects of the
                system to the particular needs of older users, spoken
                dialogue systems often rely on automatic detection of
                chronological age. In this paper, we show that vocal
                ageing as measured by acoustic features is an
                unreliable indicator of the need for adaptation. Simple
                lexical features greatly improve the prediction of both
                relevant aspects of cognition and interactions style.
                Lexical features also boost age group prediction. We
                suggest that adaptation should be based on observed
                behaviour, not on chronological age, unless it is not
                feasible to build classifiers for relevant adaptation
                decisions.},
}
% Interspeech 2010 conference paper: augmenting limited speaker-adaptation
% data with data from acoustically similar reference speakers.
% month uses the predefined "sep" macro (unquoted) so the style controls how
% the month is printed; booktitle uses "Proc.~Interspeech", the form already
% used by the vipperla08 entry.
% The pdf and abstract fields are non-standard and ignored by standard styles.
@inproceedings{vipperla2010a,
  author    = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
               Joe},
  title     = {Augmentation of adaptation data},
  booktitle = {Proc.~Interspeech},
  pages     = {530--533},
  address   = {Makuhari, Japan},
  year      = {2010},
  month     = sep,
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-is2010.pdf},
  abstract  = {Linear regression based speaker adaptation approaches
               can improve Automatic Speech Recognition (ASR) accuracy
               significantly for a target speaker. However, when the
               available adaptation data is limited to a few seconds,
               the accuracy of the speaker adapted models is often
               worse compared with speaker independent models. In this
               paper, we propose an approach to select a set of
               reference speakers acoustically close to the target
               speaker whose data can be used to augment the
               adaptation data. To determine the acoustic similarity
               of two speakers, we propose a distance metric based on
               transforming sample points in the acoustic space with
               the regression matrices of the two speakers. We show
               the validity of this approach through a speaker
               identification task. ASR results on SCOTUS and AMI
               corpora with limited adaptation data of 10 to 15
               seconds augmented by data from selected reference
               speakers show a significant improvement in Word Error
               Rate over speaker independent and speaker adapted
               models.},
}
% Journal article (EURASIP Journal on Audio, Speech, and Music Processing,
% 2010) on how age-related changes in voice parameters affect ASR performance.
% The url field is the dx.doi.org resolver link for the same DOI as the doi
% field.
% NOTE(review): no volume or pages are recorded; the DOI suffix 525783 may be
% the article ID -- confirm against the publisher before adding.
@article{vipperla2010,
  author   = {Vipperla, Ravi Chander and Renals, Steve and Frankel,
              Joe},
  title    = {Ageing voices: The effect of changes in voice
              parameters on {ASR} performance},
  journal  = {EURASIP Journal on Audio, Speech, and Music Processing},
  year     = {2010},
  doi      = {10.1155/2010/525783},
  url      = {http://dx.doi.org/10.1155/2010/525783},
  pdf      = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/vipperla-eurasip10.pdf},
  abstract = {With ageing, human voices undergo several changes
              which are typically characterized by increased
              hoarseness and changes in articulation patterns. In
              this study, we have examined the effect on Automatic
              Speech Recognition (ASR) and found that the Word Error
              Rates (WER) on older voices is about 9\% absolute
              higher compared to those of adult voices. Subsequently,
              we compared several voice source parameters including
              fundamental frequency, jitter, shimmer, harmonicity and
              cepstral peak prominence of adult and older males.
              Several of these parameters show statistically
              significant difference for the two groups. However,
              artificially increasing jitter and shimmer measures do
              not effect the ASR accuracies significantly.
              Artificially lowering the fundamental frequency
              degrades the ASR performance marginally but this drop
              in performance can be overcome to some extent using
              Vocal Tract Length Normalisation (VTLN). Overall, we
              observe that the changes in the voice source parameters
              do not have a significant impact on ASR performance.
              Comparison of the likelihood scores of all the phonemes
              for the two age groups show that there is a systematic
              mismatch in the acoustic space of the two age groups.
              Comparison of the phoneme recognition rates show that
              mid vowels, nasals and phonemes that depend on the
              ability to create constrictions with tongue tip for
              articulation are more affected by ageing than other
              phonemes.},
}