@comment{Publications by Mirjam Wester -- mwester.bib}
@inproceedings{Sturm-03,
author = {J. Sturm and J. M. Kessens and M. Wester and F. de Wet
and E. Sanders and H. Strik},
title = {Automatic Transcription of Football Commentaries in
the {MUMIS} Project},
booktitle = {Proc. Eurospeech '03},
abstract = {This paper describes experiments carried out to
automatically transcribe football commentaries in
Dutch, English and German for multimedia indexing. Our
results show that the high levels of stadium noise in
the material create a task that is extremely difficult
for conventional ASR. The baseline WERs vary from 83\%
to 94\% for the three languages investigated. Employing
state-of-the-art noise robustness techniques leads to
relative reductions of 9-10\% WER. Application specific
words such as players names are recognized correctly in
about 50\% of cases. Although this result is
substantially better than the overall result, it is
inadequate. Much better results can be obtained if the
football commentaries are recorded separately from the
stadium noise. This would make the automatic
transcriptions more useful for multimedia indexing.},
categories = {asr, MUMIS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/mumis_final.pdf},
year = 2003
}
@inproceedings{karhila_interspeech:11,
author = {Reima Karhila and Mirjam Wester},
title = {Rapid Adaptation of Foreign-accented {HMM}-based
Speech Synthesis},
booktitle = {Proc. Interspeech},
address = {Florence, Italy},
abstract = {This paper presents findings of listeners'
perception of speaker identity in synthetic speech.
Specifically, we investigated what the effect is on the
perceived identity of a speaker when using differently
accented average voice models and limited amounts (five
and fifteen sentences) of a speaker's data to create
the synthetic stimuli. A speaker discrimination task
was used to measure speaker identity. Native English
listeners were presented with natural and synthetic
speech stimuli in English and were asked to decide
whether they thought the sentences were spoken by the
same person or not. An accent rating task was also
carried out to measure the perceived accents of the
synthetic speech stimuli. The results show that
listeners, for the most part, perform as well at
speaker discrimination when the stimuli have been
created using five or fifteen adaptation sentences as
when using 105 sentences. Furthermore, the accent of
the average voice model does not affect listeners'
speaker discrimination performance even though the
accent rating task shows listeners are perceiving
different accents in the synthetic stimuli. Listeners
do not base their speaker similarity decisions on
perceived accent.},
categories = {speech synthesis, rapid adaptation},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/karhila_wester_interspeech_2011.pdf},
year = 2011
}
@inproceedings{kessens-00,
author = {J.M. Kessens and M. Wester and H. Strik},
title = {Automatic Detection and Verification of {D}utch
Phonological Rules},
booktitle = {PHONUS 5: Proceedings of the ``Workshop on Phonetics
and Phonology in ASR''},
pages = {117--128},
address = {Saarbr{\"u}cken},
abstract = {In this paper, we propose two methods for
automatically obtaining hypotheses about pronunciation
variation. To this end, we used two different
approaches in which we employed a continuous speech
recognizer to derive this information from the speech
signal. For the first method, the output of a phone
recognition was compared to a reference transcription
in order to obtain hypotheses about pronunciation
variation. Since phone recognition contains errors, we
used forced recognition in order to exclude unreliable
hypotheses. For the second method, forced recognition
was also used, but the hypotheses about the deletion of
phones were not constrained beforehand. This was
achieved by allowing each phone to be deleted. After
forced recognition, we selected the most frequently
applied rules as the set of deletion rules. Since
previous research showed that forced recognition is a
reliable tool for testing hypotheses about
pronunciation variation, we can expect that this will
also hold for the hypotheses about pronunciation
variation which we found using each of the two methods.
Another reason for expecting the rule hypotheses to be
reliable is that we found that 37-53\% of the rules are
related to Dutch phonological processes that have been
described in the literature.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/kessens.2000.2.pdf},
year = 2000
}
@article{Kessens-Wester-99,
author = {J.M. Kessens and M. Wester and H. Strik},
title = {Improving the performance of a {D}utch {CSR} by
modeling within-word and cross-word pronunciation
variation},
journal = {Speech Communication},
volume = {29},
pages = {193--207},
abstract = {This article describes how the performance of a Dutch
continuous speech recognizer was improved by modeling
pronunciation variation. We propose a general procedure
for modeling pronunciation variation. In short, it
consists of adding pronunciation variants to the
lexicon, retraining phone models and using language
models to which the pronunciation variants have been
added. First, within-word pronunciation variants were
generated by applying a set of five optional
phonological rules to the words in the baseline
lexicon. Next, a limited number of cross-word processes
were modeled, using two different methods. In the first
approach, cross-word processes were modeled by directly
adding the cross-word variants to the lexicon, and in
the second approach this was done by using multi-words.
Finally, the combination of the within-word method with
the two cross-word methods was tested. The word error
rate (WER) measured for the baseline system was
12.75\%. Compared to the baseline, a small but
statistically significant improvement of 0.68\% in WER
was measured for the within-word method, whereas both
cross-word methods in isolation led to small,
non-significant improvements. The combination of the
within-word method and cross-word method 2 led to the
best result: an absolute improvement of 1.12\% in WER
was found compared to the baseline, which is a relative
improvement of 8.8\% in WER.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/journalversion.pdf},
year = 1999
}
@inproceedings{kessens-CLS-97,
author = {J.M. Kessens and M. Wester},
title = {Improving Recognition Performance by Modelling
Pronunciation Variation},
booktitle = {Proc. of the CLS opening Academic Year '97 '98},
pages = {1--20},
address = {Nijmegen},
abstract = {This paper describes a method for improving the
performance of a continuous speech recognizer by
modelling pronunciation variation. Although the
improvements obtained with this method are small, they
are in line with those reported by other authors. A
series of experiments was carried out to model
pronunciation variation. In the first set of
experiments word internal pronunciation variation was
modelled by applying a set of four phonological rules
to the words in the lexicon. In the second set of
experiments, variation across word boundaries was also
modelled. The results obtained with both methods are
presented in detail. Furthermore, statistics are given
on the application of the four phonological rules on
the training database. We will explain why the
improvements obtained with this method are small and
how we intend to increase the improvements in our
future research.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/1997/kessens.1997.1.pdf},
year = 1997
}
@inproceedings{Kessens-ICPhS-99,
author = {J.M. Kessens and M. Wester and H. Strik},
title = {Modeling within-word and cross-word pronunciation
variation to improve the performance of a {D}utch {CSR}},
booktitle = {Proc. of ICPhS '99},
pages = {1665--1668},
address = {San Francisco},
abstract = {This paper describes how the performance of a
continuous speech recognizer for Dutch has been
improved by modeling within-word and cross-word
pronunciation variation. Within-word variants were
automatically generated by applying five phonological
rules to the words in the lexicon. For the within-word
method, a significant improvement is found compared to
the baseline. Cross-word pronunciation variation was
modeled using two different methods: 1) adding
cross-word variants directly to the lexicon, 2) only
adding multi-words and their variants to the lexicon.
Overall, cross-word method 2 leads to better results
than cross-word method 1. The best results were
obtained when cross-word method 2 was combined with the
within-word method: a relative improvement of 8.8\% WER
was found compared to the baseline.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/kessens.1999.1.pdf},
year = 1999
}
@inproceedings{wester_interspeech:11,
  author     = {Mirjam Wester and Hui Liang},
  title      = {Cross-Lingual Speaker Discrimination Using Natural and
                Synthetic Speech},
  booktitle  = {Proc. Interspeech},
  address    = {Florence, Italy},
  year       = 2011,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_liang_interspeech_2011.pdf},
  categories = {speaker discrimination, speaker adaptation, HMM-based
                speech synthesis},
  abstract   = {This paper describes speaker discrimination experiments in
                which native English listeners were presented with either
                natural speech stimuli in English and Mandarin, synthetic
                speech stimuli in English and Mandarin, or natural Mandarin
                speech and synthetic English speech stimuli. In each
                experiment, listeners were asked to decide whether they
                thought the sentences were spoken by the same person or not.
                We found that the results for Mandarin/English speaker
                discrimination are very similar to results found in previous
                work on German/English and Finnish/English speaker
                discrimination. We conclude from this and previous work that
                listeners are able to identify speakers across languages and
                they are able to identify speakers across speech types, but
                the combination of these two factors leads to a speaker
                discrimination task which is too difficult for listeners to
                perform successfully, given the quality of across-language
                speaker adapted speech synthesis at present.},
}
@techreport{wester_accent2010,
author = {Wester, M.},
title = {The {EMIME} {B}ilingual {D}atabase},
institution = {The University of Edinburgh},
number = {EDI-INF-RR-1388},
abstract = {This paper describes the collection of a bilingual
database of Finnish/English and German/English data. In
addition, the accents of the talkers in the database
have been rated. English, German and Finnish listeners
assessed the English, German and Finnish talkers'
degree of foreign accent in English. Native English
listeners showed higher inter-listener agreement than
non-native listeners. Further analyses showed that
non-native listeners judged Finnish and German female
talkers to be significantly less accented than do
English listeners. German males are judged less
accented by Finnish listeners than they are by English
and German listeners and there is no difference between
listeners as to how they judge the accent of Finnish
males. Finally, all English talkers are judged more
accented by non-native listeners than they are by
native English listeners.},
categories = {evaluation,cross-lingual, accent rating},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_accent_2010.pdf},
year = 2010
}
@inproceedings{wester-98-sposs,
author = {M. Wester and J.M. Kessens and C. Cucchiarini and H.
Strik},
title = {Selection of Pronunciation Variants in Spontaneous
Speech: Comparing the Performance of Man and Machine},
booktitle = {Proc. of the ESCA Workshop on the Sound Patterns of
Spontaneous Speech: Production and Perception},
pages = {157--160},
address = {Aix-en-Provence},
abstract = {Dans cet article, les performances d'un outil de
transcription automatique sont évaluées. L'outil de
transcription est un reconnaisseur de parole continue
(CSR) fonctionnant en mode de reconnaissance forcée.
Pour l'évaluation les performances du CSR ont été
comparées à celles de neuf auditeurs experts. La
machine et l'humain ont effectué exactement la même
tâche: décider si un segment était présent ou non
dans 467 cas. Il s'est avéré que les performances du
CSR étaient comparables à celle des experts.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/wester.1998.5.pdf},
year = 1998
}
@article{frankel07:AF_DBN,
author = {Frankel, J. and Wester, M. and King, S.},
title = {Articulatory feature recognition using dynamic
{B}ayesian networks},
journal = {Computer Speech \& Language},
volume = {21},
number = {4},
pages = {620--640},
abstract = {We describe a dynamic Bayesian network for
articulatory feature recognition. The model is intended
to be a component of a speech recognizer that avoids
the problems of conventional ``beads-on-a-string''
phoneme-based models. We demonstrate that the model
gives superior recognition of articulatory features
from the speech signal compared with a state-of-the-art
neural network system. We also introduce a training
algorithm that offers two major advances: it does not
require time-aligned feature labels and it allows the
model to learn a set of asynchronous feature changes in
a data-driven manner.},
month = oct,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_etal_CSL2007.pdf},
year = 2007
}
@inproceedings{Wester-icslp-02,
author = {M. Wester and J.M. Kessens and H. Strik},
title = {Goal-directed {ASR} in a multimedia indexing and
searching environment ({MUMIS})},
booktitle = {Proc. of ICSLP},
pages = {1993--1996},
address = {Denver},
abstract = {This paper describes the contribution of automatic
speech recognition (ASR) within the framework of MUMIS
(Multimedia Indexing and Searching Environment). The
domain is football commentaries. The initial results of
carrying out ASR on Dutch and English football
commentaries are presented. We found that overall word
error rates are high, but application specific words
are recognized reasonably well. The difficulty of the
ASR task is greatly increased by the high levels of
noise present in the material.},
categories = {asr, MUMIS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/wester.2002.2.pdf},
year = 2002
}
@inproceedings{wester-97,
author = {M. Wester and J.M. Kessens and C. Cucchiarini and H.
Strik},
title = {Modelling pronunciation variation: some preliminary
results},
booktitle = {Proc. of the Dept. of Language \& Speech},
pages = {127--137},
address = {Nijmegen},
abstract = {In this paper we describe a method for improving the
performance of a continuous speech recognizer by
modelling pronunciation variation. Although the results
obtained with this method are in line with those
reported by other authors, the magnitude of the
improvements is very small. In looking for possible
explanations for these results, we computed various
sorts of statistics about the material. Since these
data proved to be very useful in understanding the
effects of our method, they are discussed in this
paper. Moreover, on the basis of these statistics we
discuss how the system can be improved in the future.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/wester.1997.1.pdf},
year = 1997
}
@article{wester:specom:12,
author = {Mirjam Wester},
title = {Talker discrimination across languages},
journal = {Speech Communication},
volume = {54},
pages = {781--790},
abstract = {This study investigated the extent to which listeners
are able to discriminate between bilingual talkers in
three language pairs -- English--German,
English--Finnish and English--Mandarin. Native
English listeners were presented with two sentences
spoken by bilingual talkers and were asked to judge
whether they thought the sentences were spoken by the
same person. Equal amounts of cross-language and
matched-language trials were presented. The results
show that native English listeners are able to carry
out this task well; achieving percent correct levels at
well above chance for all three language pairs.
Previous research has shown this for English--German,
this research shows listeners also extend this to
Finnish and Mandarin, languages that are quite distinct
from English from a genetic and phonetic similarity
perspective. However, listeners are significantly less
accurate on cross-language talker trials
(English--foreign) than on matched-language trials
(English--English and foreign--foreign).
Understanding listeners' behaviour in cross-language
talker discrimination using natural speech is the first
step in developing principled evaluation techniques for
synthesis systems in which the goal is for the
synthesised voice to sound like the original speaker,
for instance, in speech-to-speech translation systems,
voice conversion and reconstruction.},
categories = {evaluation},
doi = {10.1016/j.specom.2012.01.006},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/wester_specom_12.pdf},
year = 2012
}
@inproceedings{Wester-00,
author = {M. Wester and J.M. Kessens and H. Strik},
title = {Pronunciation variation in {ASR}: Which variation to
model?},
booktitle = {Proc. of {ICSLP} '00},
volume = {IV},
pages = {488--491},
address = {Beijing},
abstract = {This paper describes how the performance of a
continuous speech recognizer for Dutch has been
improved by modeling within-word and cross-word
pronunciation variation. A relative improvement of
8.8\% in WER was found compared to baseline system
performance. However, as WERs do not reveal the full
effect of modeling pronunciation variation, we
performed a detailed analysis of the differences in
recognition results that occur due to modeling
pronunciation variation and found that indeed a lot of
the differences in recognition results are not
reflected in the error rates. Furthermore, error
analysis revealed that testing sets of variants in
isolation does not predict their behavior in
combination. However, these results appeared to be
corpus dependent.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/wester.2000.1.pdf},
year = 2000
}
@inproceedings{wester04:asynch,
author = {Wester, M. and Frankel, J. and King, S.},
title = {Asynchronous Articulatory Feature Recognition Using
Dynamic {B}ayesian Networks},
booktitle = {Proc. IEICE Beyond HMM Workshop},
address = {Kyoto},
abstract = {This paper builds on previous work where dynamic
Bayesian networks (DBN) were proposed as a model for
articulatory feature recognition. Using DBNs makes it
possible to model the dependencies between features, an
addition to previous approaches which was found to
improve feature recognition performance. The DBN
results were promising, giving close to the accuracy of
artificial neural nets (ANNs). However, the system was
trained on canonical labels, leading to an overly
strong set of constraints on feature co-occurrence. In
this study, we describe an embedded training scheme
which learns a set of data-driven asynchronous feature
changes where supported in the data. Using a subset of
the OGI Numbers corpus, we describe articulatory
feature recognition experiments using both
canonically-trained and asynchronous DBNs. Performance
using DBNs is found to exceed that of ANNs trained on
an identical task, giving a higher recognition
accuracy. Furthermore, inter-feature dependencies
result in a more structured model, giving rise to fewer
feature combinations in the recognition output. In
addition to an empirical evaluation of this modelling
approach, we give a qualitative analysis, comparing
asynchrony found through our data-driven methods to the
asynchrony which may be expected on the basis of
linguistic knowledge.},
categories = {am,artic,asr,dbn,oginumbers,edinburgh},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.ps},
year = 2004
}
@inproceedings{Wester-03,
author = {M. Wester},
title = {Syllable classification using articulatory-acoustic
features},
booktitle = {Proc. of Eurospeech '03},
address = {Geneva},
abstract = {This paper investigates the use of
articulatory-acoustic features for the classification
of syllables in TIMIT. The main motivation for this
study is to circumvent the ``beads-on-a-string''
problem, i.e. the assumption that words can be
described as a simple concatenation of phones.
Posterior probabilities for articulatory-acoustic
features are obtained from artificial neural nets and
are used to classify speech within the scope of
syllables instead of phones. This gives the opportunity
to account for asynchronous feature changes, exploiting
the strengths of the articulatory-acoustic features,
instead of losing the potential by reverting to phones.},
categories = {aaf, syllable, TIMIT, Edinburgh},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/wester.2003.1.pdf},
year = 2003
}
@article{Dines2011,
  author   = {John Dines and Hui Liang and Lakshmi Saheer and
              Matthew Gibson and William Byrne and Keiichiro Oura and
              Keiichi Tokuda and Junichi Yamagishi and Simon King and
              Mirjam Wester and Teemu Hirsimäki and Reima
              Karhila and Mikko Kurimo},
  title    = {Personalising speech-to-speech translation:
              Unsupervised cross-lingual speaker adaptation for
              {HMM}-based speech synthesis},
  journal  = {Computer Speech and Language},
  volume   = {27},
  number   = {2},
  pages    = {420--437},
  month    = feb,
  year     = 2013,
  doi      = {10.1016/j.csl.2011.08.003},
  issn     = {0885-2308},
  url      = {http://www.sciencedirect.com/science/article/pii/S0885230811000441},
  keywords = {Speech-to-speech translation, Cross-lingual speaker
              adaptation, HMM-based speech synthesis, Speaker
              adaptation, Voice conversion},
  abstract = {In this paper we present results of unsupervised
              cross-lingual speaker adaptation applied to text-to-speech
              synthesis. The application of our research is the
              personalisation of speech-to-speech translation in which we
              employ a HMM statistical framework for both speech
              recognition and synthesis. This framework provides a logical
              mechanism to adapt synthesised speech output to the voice of
              the user by way of speech recognition. In this work we
              present results of several different unsupervised and
              cross-lingual adaptation approaches as well as an end-to-end
              speaker adaptive speech-to-speech translation system. Our
              experiments show that we can successfully apply speaker
              adaptation in both unsupervised and cross-lingual scenarios
              and our proposed algorithms seem to generalise well for
              several language pairs. We also discuss important future
              directions including the need for better evaluation
              metrics.},
}
@inproceedings{wester:interspeech:10,
author = {Mirjam Wester},
title = {Cross-lingual talker discrimination},
booktitle = {Proc. of Interspeech},
address = {Makuhari, Japan},
abstract = {This paper describes a talker discrimination
experiment in which native English listeners were
presented with two sentences spoken by bilingual
talkers (English/German and English/Finnish) and were
asked to judge whether they thought the sentences were
spoken by the same person or not. Equal amounts of
cross-lingual and matched-language trials were
presented. The experiments showed that listeners are
able to complete this task well, they can discriminate
between talkers significantly better than chance.
However, listeners are significantly less accurate on
cross-lingual talker trials than on matched-language
pairs. No significant differences were found on this
task between German and Finnish. Bias (B'') and
Sensitivity (A') values are presented to analyse the
listeners' behaviour in more detail. The results are
promising for the evaluation of EMIME, a project
covering speech-to-speech translation with speaker
adaptation.},
categories = {evaluation},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_interspeech2010.pdf},
year = 2010
}
@inproceedings{frankel04:artic_dbn,
  author     = {Frankel, J. and Wester, M. and King, S.},
  title      = {Articulatory feature recognition using dynamic
                {B}ayesian networks},
  booktitle  = {Proc. {ICSLP}},
  month      = sep,
  year       = 2004,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.ps},
  categories = {am,artic,asr,dbn,timit,edinburgh},
  abstract   = {This paper describes the use of dynamic Bayesian networks
                for the task of articulatory feature recognition. We show
                that by modeling the dependencies between a set of 6
                multi-leveled articulatory features, recognition accuracy
                is increased over an equivalent system in which features
                are considered independent. Results are compared to those
                found using artificial neural networks on an identical
                task.},
}
@inproceedings{cooke:lista:12,
author = {Martin Cooke and Maria Luisa Garc{\'\i}a Lecumberri and
Yan Tang and Mirjam Wester},
title = {Do non-native listeners benefit from speech
modifications designed to promote intelligibility for
native listeners?},
booktitle = {Proceedings of The Listening Talker Workshop},
pages = {59},
url = {http://listening-talker.org/workshop/programme.html},
year = 2012
}
@article{Wester-01,
author = {M. Wester and J. M. Kessens and C. Cucchiarini and H.
Strik},
title = {Obtaining phonetic transcriptions: a comparison
between expert listeners and a continuous speech
recognizer},
journal = {Language and Speech},
volume = {44},
number = {3},
pages = {377--403},
abstract = {In this article, we address the issue of using a
continuous speech recognition tool to obtain phonetic
or phonological representations of speech. Two
experiments were carried out in which the performance
of a continuous speech recognizer (CSR) was compared to
the performance of expert listeners in a task of
judging whether a number of prespecified phones had
been realized in an utterance. In the first experiment,
nine expert listeners and the CSR carried out exactly
the same task: deciding whether a segment was present
or not in 467 cases. In the second experiment, we
expanded on the first experiment by focusing on two
phonological processes: schwa-deletion and
schwa-insertion. The results of these experiments show
that significant differences in performance were found
between the CSR and the listeners, but also between
individual listeners. Although some of these
differences appeared to be statistically significant,
their magnitude is such that they may very well be
acceptable depending on what the transcriptions are
needed for. In other words, although the CSR is not
infallible, it makes it possible to explore large
datasets, which might outweigh the errors introduced by
the mistakes the CSR makes. For these reasons, we can
conclude that the CSR can be used instead of a listener
to carry out this type of task: deciding whether a
phone is present or not.},
categories = {automatic transcription, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/wester.2001.3.pdf},
year = 2001
}
@article{Wester-CSL-03,
author = {M. Wester},
title = {Pronunciation modeling for {ASR} -- knowledge-based
and data-derived methods},
journal = {Computer Speech and Language},
volume = {17},
pages = {69--85},
abstract = {This article focuses on modeling pronunciation
variation in two different ways: data-derived and
knowledge-based. The knowledge-based approach consists
of using phonological rules to generate variants. The
data-derived approach consists of performing phone
recognition, followed by smoothing using decision trees
(D-trees) to alleviate some of the errors in the phone
recognition. Using phonological rules led to a small
improvement in WER; a data-derived approach in which
the phone recognition was smoothed using D-trees prior
to lexicon generation led to larger improvements
compared to the baseline. The lexicon was employed in
two different recognition systems: a hybrid HMM/ANN
system and a HMM-based system, to ascertain whether
pronunciation variation was truly being modeled. This
proved to be the case as no significant differences
were found between the results obtained with the two
systems. Furthermore, we found that 10\% of variants
generated by the phonological rules were also found
using phone recognition, and this increased to 28\%
when the phone recognition output was smoothed by using
D-trees. This indicates that the D-trees generalize
beyond what has been seen in the training material,
whereas when the phone recognition approach is employed
directly, unseen pronunciations cannot be predicted. In
addition, we propose a metric to measure confusability
in the lexicon. Using this confusion metric to prune
variants results in roughly the same improvement as
using the D-tree method.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/CSL-pronvar.pdf},
year = 2003
}
@inproceedings{Wester-Fosler-00,
author = {M. Wester and E. Fosler-Lussier},
title = {A comparison of data-derived and knowledge-based
modeling of pronunciation variation},
booktitle = {Proc. of ICSLP '00},
volume = {I},
pages = {270--273},
address = {Beijing},
abstract = {This paper focuses on modeling pronunciation variation
in two different ways: data-derived and
knowledge-based. The knowledge-based approach consists
of using phonological rules to generate variants. The
data-derived approach consists of performing phone
recognition, followed by various pruning and smoothing
methods to alleviate some of the errors in the phone
recognition. Using phonological rules led to a small
improvement in WER; whereas, using a data-derived
approach in which the phone recognition was smoothed
using simple decision trees (d-trees) prior to lexicon
generation led to a significant improvement compared to
the baseline. Furthermore, we found that 10\% of
variants generated by the phonological rules were also
found using phone recognition, and this increased to
23\% when the phone recognition output was smoothed by
using d-trees. In addition, we propose a metric to
measure confusability in the lexicon and we found that
employing this confusion metric to prune variants
results in roughly the same improvement as using the
d-tree method.},
categories = {asr, pm, VIOS, Berkeley},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/wester.2000.2.pdf},
year = 2000
}
@article{Oura2012703,
  author   = {Keiichiro Oura and Junichi Yamagishi and Mirjam Wester
              and Simon King and Keiichi Tokuda},
  title    = {Analysis of unsupervised cross-lingual speaker
              adaptation for {HMM}-based speech synthesis using
              {KLD}-based transform mapping},
  journal  = {Speech Communication},
  volume   = {54},
  number   = {6},
  pages    = {703--714},
  year     = 2012,
  doi      = {10.1016/j.specom.2011.12.004},
  issn     = {0167-6393},
  url      = {http://www.sciencedirect.com/science/article/pii/S0167639311001774},
  keywords = {HMM-based speech synthesis, Unsupervised speaker
              adaptation, Cross-lingual speaker adaptation,
              Speech-to-speech translation},
  abstract = {In the EMIME project, we developed a mobile device that
              performs personalized speech-to-speech translation such that
              a user's spoken input in one language is used to produce
              spoken output in another language, while continuing to sound
              like the user's voice. We integrated two techniques into a
              single architecture: unsupervised adaptation for HMM-based
              TTS using word-based large-vocabulary continuous speech
              recognition, and cross-lingual speaker adaptation (CLSA) for
              HMM-based TTS. The CLSA is based on a state-level transform
              mapping learned using minimum Kullback-Leibler divergence
              between pairs of HMM states in the input and output
              languages. Thus, an unsupervised cross-lingual speaker
              adaptation system was developed. End-to-end speech-to-speech
              translation systems for four languages (English, Finnish,
              Mandarin, and Japanese) were constructed within this
              framework. In this paper, the English-to-Japanese adaptation
              is evaluated. Listening tests demonstrate that adapted
              voices sound more similar to a target speaker than average
              voices and that differences between supervised and
              unsupervised cross-lingual speaker adaptation are small.
              Calculating the KLD state-mapping on only the first 10
              mel-cepstral coefficients leads to huge savings in
              computational costs, without any detrimental effect on the
              quality of the synthetic speech.},
}
@inproceedings{badinoclark_interspeech12,
author = {Leonardo Badino and Robert A. J. Clark and Mirjam
Wester},
title = {Towards Hierarchical Prosodic Prominence Generation in
{TTS} Synthesis},
booktitle = {Proc. Interspeech},
address = {Portland, USA},
categories = {speech synthesis, prosody},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/badinoclark_IS_2012.pdf},
year = 2012
}
@inproceedings{wester-98-icslp,
author = {M. Wester and J. M. Kessens and H. Strik},
title = {Modeling pronunciation variation for a {D}utch {CSR}:
testing three methods},
booktitle = {Proc. ICSLP '98},
pages = {2535--2538},
address = {Sydney},
abstract = {This paper describes how the performance of a
continuous speech recognizer for Dutch has been
improved by modeling pronunciation variation. We used
three methods to model pronunciation variation. First,
within-word variation was dealt with. Phonological
rules were applied to the words in the lexicon, thus
automatically generating pronunciation variants.
Secondly, cross-word pronunciation variation was
modeled using two different approaches. The first
approach was to model cross-word processes by adding
the variants as separate words to the lexicon and in
the second approach this was done by using multi-words.
For each of the methods, recognition experiments were
carried out. A significant improvement was found for
modeling within-word variation. Furthermore, modeling
crossword processes using multi-words leads to
significantly better results than modeling them using
separate words in the lexicon.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/wester.1998.3.pdf},
year = 1998
}
@inproceedings{wester-98-kerkrade,
author = {M. Wester and J. M. Kessens and H. Strik},
title = {Improving the Performance of a {D}utch {CSR} by
Modeling Pronunciation Variation},
booktitle = {Proc. of the Workshop Modeling Pronunciation Variation
for Automatic Speech Recognition},
pages = {145--150},
address = {Kerkrade},
abstract = {This paper describes how the performance of a
continuous speech recognizer for Dutch has been
improved by modeling pronunciation variation. We used
three methods in order to model pronunciation
variation. First, within-word variation was dealt with.
Phonological rules were applied to the words in the
lexicon, thus automatically generating pronunciation
variants. Secondly, cross-word pronunciation variation
was accounted for by adding multi-words and their
variants to the lexicon. Thirdly, probabilities of
pronunciation variants were incorporated in the
language model (LM), and thresholds were used to choose
which pronunciation variants to add to the LMs. For
each of the methods, recognition experiments were
carried out. A significant improvement in error rates
was measured.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/wester.1998.4.pdf},
year = 1998
}
@inproceedings{wester-98-sd,
author = {M. Wester and J. M. Kessens and H. Strik},
title = {Two automatic approaches for analyzing the frequency
of connected speech processes in {D}utch},
booktitle = {Proc. ICSLP Student Day '98},
pages = {3351--3356},
address = {Sydney},
abstract = {This paper describes two automatic approaches used to
study connected speech processes (CSPs) in Dutch. The
first approach was from a linguistic point of view -
the top-down method. This method can be used for
verification of hypotheses about CSPs. The second
approach - the bottom-up method - uses a constrained
phone recognizer to generate phone transcriptions. An
alignment was carried out between the two
transcriptions and a reference transcription. A
comparison between the two methods showed that 68\%
agreement was achieved on the CSPs. Although phone
accuracy is only 63\%, the bottom-up approach is useful
for studying CSPs. From the data generated using the
bottom-up method, indications of which CSPs are present
in the material can be found. These indications can be
used to generate hypotheses which can then be tested
using the top-down method.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/wester.1998.2.pdf},
year = 1998
}
@inproceedings{wester_icassp:11,
author = {Mirjam Wester and Reima Karhila},
title = {Speaker Similarity Evaluation of Foreign-accented
Speech Synthesis using {HMM}-based Speaker Adaptation},
booktitle = {Proc. ICASSP},
pages = {5372--5375},
address = {Prague, Czech Republic},
abstract = {This paper describes a speaker discrimination
experiment in which native English listeners were
presented with natural and synthetic speech stimuli in
English and were asked to judge whether they thought
the sentences were spoken by the same person or not.
The natural speech consisted of recordings of Finnish
speakers speaking English. The synthetic stimuli were
created using adaptation data from the same Finnish
speakers. Two average voice models were compared: one
trained on Finnish-accented English and the other on
American-accented English. The experiments illustrate
that listeners perform well at speaker discrimination
when the stimuli are both natural or both synthetic,
but when the speech types are crossed performance drops
significantly. We also found that the type of accent in
the average voice model had no effect on the
listeners' speaker discrimination performance.},
categories = {Similarity Evaluation, Speaker Adaptation,
HMM-synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_icassp_2011.pdf},
year = 2011
}
@inproceedings{Chang-Euro-01,
author = {S. Chang and S. Greenberg and M. Wester},
title = {An Elitist Approach to Articulatory-Acoustic Feature
Classification},
booktitle = {Proc. of Eurospeech '01},
pages = {1729--1733},
address = {Aalborg},
abstract = {A novel framework for automatic articulatory-acoustic
feature extraction has been developed for enhancing the
accuracy of place- and manner-of-articulation
classification in spoken language. The elitist approach
focuses on frames for which neural network (MLP)
classifiers are highly confident, and discards the
rest. Using this method, it is possible to achieve a
frame-level accuracy of 93\% for manner information on
a corpus of American English sentences passed through a
telephone network (NTIMIT). Place information is
extracted for each manner class independently,
resulting in an appreciable gain in place-feature
classification relative to performance for a manner-
independent system. The elitist framework provides a
potential means of automatically annotating a corpus at
the phonetic level without recourse to a word-level
transcript and could thus be of utility for developing
training materials for automatic speech recognition and
speech synthesis applications, as well as aid the
empirical study of spoken language.},
categories = {aaf, NTIMIT, Berkeley},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/wester.2001.2.pdf},
year = 2001
}
@inproceedings{oura:icassp:10,
author = {Keiichiro Oura and Keiichi Tokuda and Junichi
Yamagishi and Mirjam Wester and Simon King},
title = {Unsupervised Cross-lingual Speaker Adaptation for
{HMM}-based Speech Synthesis},
booktitle = {Proc. of ICASSP},
volume = {I},
pages = {4954--4957},
abstract = {In the EMIME project, we are developing a mobile
device that performs personalized speech-to-speech
translation such that a user's spoken input in one
language is used to produce spoken output in another
language, while continuing to sound like the user's
voice. We integrate two techniques, unsupervised
adaptation for HMM-based TTS using a word-based
large-vocabulary continuous speech recognizer and
cross-lingual speaker adaptation for HMM-based TTS,
into a single architecture. Thus, an unsupervised
cross-lingual speaker adaptation system can be
developed. Listening tests show very promising results,
demonstrating that adapted voices sound similar to the
target speaker and that differences between supervised
and unsupervised cross-lingual speaker adaptation are
small.},
categories = {speaker adaptation, TTS},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/oura_icassp2010.pdf},
year = 2010
}
@phdthesis{Wester-02,
  author     = {Mirjam Wester},
  title      = {Pronunciation Variation Modeling for {D}utch Automatic Speech Recognition},
  school     = {University of Nijmegen},
  year       = {2002},
  abstract   = {This thesis consists of an introductory review to pronunciation variation modeling, followed by four papers in which the PhD research is described.},
  categories = {asr, pm, Nijmegen},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/thesis.pdf},
}
@inproceedings{Wester-ICPhS-99,
author = {M. Wester and J. M. Kessens},
title = {Comparison between expert listeners and continuous
speech recognizers in selecting pronunciation variants},
booktitle = {Proc. of ICPhS '99},
pages = {723--726},
address = {San Francisco},
abstract = {In this paper, the performance of an automatic
transcription tool is evaluated. The transcription tool
is a continuous speech recognizer (CSR) which can be
used to select pronunciation variants (i.e. detect
insertions and deletions of phones). The performance of
the CSR was compared to a reference transcription based
on the judgments of expert listeners. We investigated
to what extent the degree of agreement between the
listeners and the CSR was affected by employing various
sets of phone models (PMs). Overall, the PMs perform
more similarly to the listeners when pronunciation
variation is modeled. However, the various sets of PMs
lead to different results for insertion and deletion
processes. Furthermore, we found that to a certain
degree, word error rates can be used to predict which
set of PMs to use in the transcription tool.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/wester.1999.1.pdf},
year = 1999
}
@inproceedings{Gutkin:etal:ets-cam04,
author = {Alexander Gutkin and David Gay and Lev Goldfarb and
Mirjam Wester},
title = {On the {A}rticulatory {R}epresentation of {S}peech
within the {E}volving {T}ransformation {S}ystem
{F}ormalism},
booktitle = {Pattern Representation and the Future of Pattern
Recognition (Proc. Satellite Workshop of 17th
International Conference on Pattern Recognition)},
editor = {Lev Goldfarb},
pages = {57--76},
address = {Cambridge, UK},
abstract = {This paper deals with the formulation of an
alternative, structural, approach to the speech
representation and recognition problem. In this
approach, we require both the representation and the
learning algorithms to be linguistically meaningful and
to naturally represent the linguistic data at hand.
This allows the speech recognition system to discover
the emergent combinatorial structure of the linguistic
classes. The proposed approach is developed within the
ETS formalism, the first formalism in applied
mathematics specifically designed to address the issues
of class and object/event representation. We present an
initial application of ETS to the articulatory
modelling of speech based on elementary physiological
gestures that can be reliably represented as the ETS
primitives. We discuss the advantages of this gestural
approach over prevalent methods and its promising
potential to mathematical modelling and representation
in linguistics.},
categories = {structural, recognition, ets, artic, mocha, edinburgh, unb},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ets_cam04_dasr.pdf},
ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ets_cam04_dasr.ps.gz},
year = 2004
}
@inproceedings{wester00:_using_dutch_asr,
author = {M. Wester and J. M. Kessens and H. Strik},
title = {Using {D}utch phonological rules to model
pronunciation variation in {ASR}},
booktitle = {Phonus 5: proceedings of the "workshop on phonetics
and phonology in {ASR}"},
pages = {105--116},
address = {Saarbruecken},
abstract = {In this paper, we describe how the performance of a
continuous speech recognizer for Dutch has been
improved by modeling within-word and cross-word
pronunciation variation. Within-word variants were
automatically generated by applying five phonological
rules to the words in the lexicon. Cross-word
pronunciation variation was modeled by adding
multi-words and their variants to the lexicon. The best
results were obtained when the cross-word method was
combined with the within-word method: a relative
improvement of 8.8\% in the WER was found compared to
baseline system performance. We also describe an error
analysis that was carried out to investigate whether
rules in isolation can predict the performance of rules
in combination.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/wester.2000.3.pdf},
year = 2000
}
@inproceedings{kurimo:acl:10,
author = {Mikko Kurimo and William Byrne and John Dines and
Philip N. Garner and Matthew Gibson and Yong Guan and
Teemu Hirsim\"{a}ki and Reima Karhila and Simon King
and Hui Liang and Keiichiro Oura and Lakshmi Saheer and
Matt Shannon and Sayaka Shiota and Jilei Tian and
Keiichi Tokuda and Mirjam Wester and Yi-Jian Wu and
Junichi Yamagishi},
title = {Personalising speech-to-speech translation in the
{EMIME} project},
booktitle = {Proc. of the ACL 2010 System Demonstrations},
address = {Uppsala, Sweden},
abstract = {In the EMIME project we have studied unsupervised
cross-lingual speaker adaptation. We have employed an
HMM statistical framework for both speech recognition
and synthesis which provides transformation mechanisms
to adapt the synthesized voice in TTS (text-to-speech)
using the recognized voice in ASR (automatic speech
recognition). An important application for this
research is personalised speech-to-speech translation
that will use the voice of the speaker in the input
language to utter the translated sentences in the
output language. In mobile environments this enhances
the users' interaction across language barriers by
making the output speech sound more like the original
speaker's way of speaking, even if she or he could not
speak the output language.},
categories = {speaker adaptation},
month = jul,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/kurimo_acl_2010.pdf},
year = 2010
}
@inproceedings{wester-98-voicedata,
author = {M. Wester},
title = {Automatic Classification of Voice Quality: Comparing
Regression Models and Hidden {M}arkov Models},
booktitle = {Proc. of VOICEDATA98, Symposium on Databases in Voice
Quality Research and Education},
pages = {92--97},
address = {Utrecht},
abstract = {In this paper, two methods for automatically
classifying voice quality are compared: regression
analysis and hidden Markov models (HMMs). The findings
of this research show that HMMs can be used to classify
voice quality. The HMMs performed better than the
regression models in classifying breathiness and
overall degree of deviance, and the two methods showed
similar results on the roughness scale. However, the
results are not spectacular. This is mainly due to the
type of material that was available and the number of
listeners who assessed the material. Nonetheless, I
argue in this paper that these findings are interesting
because they are a promising step towards developing a
system for classifying voice quality.},
categories = {voice quality, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/wester.1998.1.pdf},
year = 1998
}
@techreport{wester_mandarin:11,
author = {Mirjam Wester and Hui Liang},
title = {The {EMIME} {M}andarin {B}ilingual {D}atabase},
institution = {The University of Edinburgh},
number = {EDI-INF-RR-1396},
abstract = {This paper describes the collection of a bilingual
database of Mandarin/English data. In addition, the
accents of the talkers in the database have been rated.
English and Mandarin listeners assessed the English and
Mandarin talkers' degree of foreign accent in English.},
categories = {evaluation, cross-lingual, accent rating},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_mandarin_2011.pdf},
year = 2011
}
@inproceedings{wester:ssw7:10,
author = {Mirjam Wester and John Dines and Matthew Gibson and
Hui Liang and Yi-Jian Wu and Lakshmi Saheer and Simon
King and Keiichiro Oura and Philip N. Garner and
William Byrne and Yong Guan and Teemu Hirsim\"{a}ki and
Reima Karhila and Mikko Kurimo and Matt Shannon and
Sayaka Shiota and Jilei Tian and Keiichi Tokuda and
Junichi Yamagishi},
title = {Speaker adaptation and the evaluation of speaker
similarity in the {EMIME} speech-to-speech translation
project},
booktitle = {Proc. of 7th ISCA Speech Synthesis Workshop},
address = {Kyoto, Japan},
abstract = {This paper provides an overview of speaker adaptation
research carried out in the EMIME speech-to-speech
translation (S2ST) project. We focus on how speaker
adaptation transforms can be learned from speech in one
language and applied to the acoustic models of another
language. The adaptation is transferred across
languages and/or from recognition models to synthesis
models. The various approaches investigated can all be
viewed as a process in which a mapping is defined in
terms of either acoustic model states or linguistic
units. The mapping is used to transfer either speech
data or adaptation transforms between the two models.
Because the success of speaker adaptation in
text-to-speech synthesis is measured by judging speaker
similarity, we also discuss issues concerning
evaluation of speaker similarity in an S2ST scenario.},
categories = {speaker adaptation, evaluation},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_ssw7_2010.pdf},
year = 2010
}
@inproceedings{kessens-COST-97,
author = {J. M. Kessens and M. Wester and C. Cucchiarini and H.
Strik},
title = {Testing a Method for Modelling Pronunciation Variation},
booktitle = {Proceedings of the COST workshop},
pages = {37--40},
address = {Rhodos},
abstract = {In this paper we describe a method for improving the
performance of a continuous speech recognizer by
modelling pronunciation variation. Although the results
obtained with this method are in line with those
reported by other authors, the magnitude of the
improvements is very small. In looking for possible
explanations for these results, we computed various
sorts of statistics about the material. Since these
data proved to be very useful in understanding the
effects of our method, they are discussed in this
paper. Moreover, on the basis of these statistics we
discuss how the system can be improved in the future.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/1997/kessens.1997.2.pdf},
year = 1997
}
@inproceedings{Wester-Chang-01,
author = {M. Wester and S. Greenberg and S. Chang},
title = {A {D}utch Treatment of an Elitist Approach to
Articulatory-Acoustic Feature Classification},
booktitle = {Proc. of Eurospeech '01},
pages = {1729--1732},
address = {Aalborg},
abstract = {A novel approach to articulatory-acoustic feature
extraction has been developed for enhancing the
accuracy of classification associated with place and
manner of articulation information. This elitist
approach is tested on a corpus of spontaneous Dutch
using two different systems, one trained on a subset of
the same corpus, the other trained on a corpus from a
different language (American English). The feature
dimensions, voicing and manner of articulation transfer
relatively well between the two languages. However,
place information transfers less well. Manner-specific
training can be used to improve classification of
articulatory place information.},
categories = {aaf, NTIMIT, VIOS, Berkeley},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/wester.2001.1.pdf},
year = 2001
}
@inproceedings{Kessens98,
author = {J. M. Kessens and M. Wester and C. Cucchiarini and H.
Strik},
title = {The Selection of Pronunciation Variants: Comparing the
Performance of Man and Machine},
booktitle = {Proc. of ICSLP '98},
pages = {2715--2718},
address = {Sydney},
abstract = {In this paper the performance of an automatic
transcription tool is evaluated. The transcription tool
is a Continuous Speech Recognizer (CSR) running in
forced recognition mode. For evaluation the performance
of the CSR was compared to that of nine expert
listeners. Both man and the machine carried out exactly
the same task: deciding whether a segment was present
or not in 467 cases. It turned out that the performance
of the CSR is comparable to that of the experts.},
categories = {asr, pm, VIOS, Nijmegen},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/kessens.1998.1.pdf},
year = 1998
}
@article{king07:JASA2007,
  author   = {King, S. and Frankel, J. and Livescu, K. and McDermott, E. and Richmond, K. and Wester, M.},
  title    = {Speech production knowledge in automatic speech recognition},
  journal  = {Journal of the Acoustical Society of America},
  year     = {2007},
  month    = feb,
  volume   = {121},
  number   = {2},
  pages    = {723--742},
  abstract = {Although much is known about how speech is produced, and research into speech production has resulted in measured articulatory data, feature systems of different kinds and numerous models, speech production knowledge is almost totally ignored in current mainstream approaches to automatic speech recognition. Representations of speech production allow simple explanations for many phenomena observed in speech which cannot be easily analyzed from either acoustic signal or phonetic transcription alone. In this article, we provide a survey of a growing body of work in which such representations are used to improve automatic speech recognition.},
  pdf      = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/King_et_al_review.pdf},
}
@article{chang05,
author = {S. Chang and M. Wester and S. Greenberg},
title = {An elitist approach to automatic articulatory-acoustic
feature classification for phonetic characterization of
spoken language},
journal = {Speech Communication},
volume = {47},
pages = {290--311},
abstract = {A novel framework for automatic articulatory-acoustic
feature extraction has been developed for enhancing the
accuracy of place- and manner-of-articulation
classification in spoken language. The "elitist"
approach provides a principled means of selecting
frames for which multi-layer perceptron, neural-network
classifiers are highly confident. Using this method it
is possible to achieve a frame-level accuracy of 93\%
on "elitist" frames for manner classification on a
corpus of American English sentences passed through a
telephone network (NTIMIT). Place-of-articulation
information is extracted for each manner class
independently, resulting in an appreciable gain in
place-feature classification relative to performance
for a manner-independent system. A comparable
enhancement in classification performance for the
elitist approach is evidenced when applied to a Dutch
corpus of quasi-spontaneous telephone interactions
(VIOS). The elitist framework provides a potential
means of automatically annotating a corpus at the
phonetic level \emph{without recourse to a word-level
transcript} and could thus be of utility for developing
training materials for automatic speech recognition and
speech synthesis applications, as well as aid the
empirical study of spoken language. \copyright 2005
Elsevier B.V. All rights reserved.},
categories = {aaf, VIOS, NTIMIT, Berkeley},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2005/elitist-final-specom.pdf},
year = 2005
}