The Centre for Speech Technology Research, The University of Edinburgh

Publications by Mirjam Wester

mwester.bib

@inproceedings{Sturm-03,
  author = {J. Sturm and J. M. Kessens and M. Wester and F. de Wet
                   and E. Sanders and H. Strik },
  title = {Automatic Transcription of Football Commentaries in
                   the {MUMIS} Project},
  booktitle = {Proc. Eurospeech '03},
  abstract = {This paper describes experiments carried out to
                   automatically transcribe football commentaries in
                   Dutch, English and German for multimedia indexing. Our
                   results show that the high levels of stadium noise in
                   the material create a task that is extremely difficult
                   for conventional ASR. The baseline WERs vary from 83\%
                   to 94\% for the three languages investigated. Employing
                   state-of-the-art noise robustness techniques leads to
                   relative reductions of 9-10\% WER. Application-specific
                   words such as players' names are recognized correctly in
                   about 50\% of cases. Although this result is
                   substantially better than the overall result, it is
                   inadequate. Much better results can be obtained if the
                   football commentaries are recorded separately from the
                   stadium noise. This would make the automatic
                   transcriptions more useful for multimedia indexing.},
  categories = {asr, MUMIS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/mumis_final.pdf},
  year = 2003
}
@inproceedings{karhila_interspeech:11,
  author = {Reima Karhila and Mirjam Wester},
  title = {Rapid Adaptation of Foreign-accented {HMM}-based
                   Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Florence, Italy},
  abstract = {This paper presents findings of listeners’
                   perception of speaker identity in synthetic speech.
                   Specifically, we investigated what the effect is on the
                   perceived identity of a speaker when using differently
                   accented average voice models and limited amounts (five
                   and fifteen sentences) of a speaker’s data to create
                   the synthetic stimuli. A speaker discrimination task
                   was used to measure speaker identity. Native English
                   listeners were presented with natural and synthetic
                   speech stimuli in English and were asked to decide
                   whether they thought the sentences were spoken by the
                   same person or not. An accent rating task was also
                   carried out to measure the perceived accents of the
                   synthetic speech stimuli. The results show that
                   listeners, for the most part, perform as well at
                   speaker discrimination when the stimuli have been
                   created using five or fifteen adaptation sentences as
                   when using 105 sentences. Furthermore, the accent of
                   the average voice model does not affect listeners’
                   speaker discrimination performance even though the
                   accent rating task shows listeners are perceiving
                   different accents in the synthetic stimuli. Listeners
                   do not base their speaker similarity decisions on
                   perceived accent.},
  categories = {speech synthesis, rapid adaptation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/karhila_wester_interspeech_2011.pdf},
  year = 2011
}
@inproceedings{kessens-00,
  author = {J.M. Kessens and M. Wester and H. Strik},
  title = {Automatic Detection and Verification of {D}utch
                   Phonological Rules},
  booktitle = {PHONUS 5: Proceedings of the "Workshop on Phonetics
                   and Phonology in ASR"},
  pages = {117-128},
  address = {Saarbruecken},
  abstract = {In this paper, we propose two methods for
                   automatically obtaining hypotheses about pronunciation
                   variation. To this end, we used two different
                   approaches in which we employed a continuous speech
                   recognizer to derive this information from the speech
                   signal. For the first method, the output of a phone
                   recognition was compared to a reference transcription
                   in order to obtain hypotheses about pronunciation
                   variation. Since phone recognition contains errors, we
                   used forced recognition in order to exclude unreliable
                   hypotheses. For the second method, forced recognition
                   was also used, but the hypotheses about the deletion of
                   phones were not constrained beforehand. This was
                   achieved by allowing each phone to be deleted. After
                   forced recognition, we selected the most frequently
                   applied rules as the set of deletion rules. Since
                   previous research showed that forced recognition is a
                   reliable tool for testing hypotheses about
                   pronunciation variation, we can expect that this will
                   also hold for the hypotheses about pronunciation
                   variation which we found using each of the two methods.
                   Another reason for expecting the rule hypotheses to be
                   reliable is that we found that 37-53\% of the rules are
                   related to Dutch phonological processes that have been
                   described in the literature.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/kessens.2000.2.pdf},
  year = 2000
}
@article{Kessens-Wester-99,
  author = {J.M. Kessens and M. Wester and H. Strik},
  title = {Improving the performance of a {D}utch {CSR} by
                   modeling within-word and cross-word pronunciation
                   variation},
  journal = {Speech Communication},
  volume = {29},
  pages = {193-207},
  abstract = {This article describes how the performance of a Dutch
                   continuous speech recognizer was improved by modeling
                   pronunciation variation. We propose a general procedure
                   for modeling pronunciation variation. In short, it
                   consists of adding pronunciation variants to the
                   lexicon, retraining phone models and using language
                   models to which the pronunciation variants have been
                   added. First, within-word pronunciation variants were
                   generated by applying a set of five optional
                   phonological rules to the words in the baseline
                   lexicon. Next, a limited number of cross-word processes
                   were modeled, using two different methods. In the first
                   approach, cross-word processes were modeled by directly
                   adding the cross-word variants to the lexicon, and in
                   the second approach this was done by using multi-words.
                   Finally, the combination of the within-word method with
                   the two cross-word methods was tested. The word error
                   rate (WER) measured for the baseline system was
                   12.75\%. Compared to the baseline, a small but
                   statistically significant improvement of 0.68\% in WER
                   was measured for the within-word method, whereas both
                   cross-word methods in isolation led to small,
                   non-significant improvements. The combination of the
                   within-word method and cross-word method 2 led to the
                   best result: an absolute improvement of 1.12\% in WER
                   was found compared to the baseline, which is a relative
                   improvement of 8.8\% in WER.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/journalversion.pdf},
  year = 1999
}
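
The relative improvement quoted in the abstract above follows directly from the absolute figures. A quick arithmetic check, as a minimal Python sketch (variable names are illustrative only):

# Check of the relative-improvement arithmetic reported in the abstract:
# an absolute WER reduction of 1.12 points from a 12.75% baseline.
baseline_wer = 12.75    # baseline word error rate, in percent
absolute_gain = 1.12    # absolute WER reduction of the best combined method
relative_gain = 100 * absolute_gain / baseline_wer
print(f"{relative_gain:.1f}% relative")  # prints "8.8% relative", matching the abstract
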
@inproceedings{kessens-CLS-97,
  author = {J.M. Kessens and M. Wester},
  title = {Improving Recognition Performance by Modelling
                   Pronunciation Variation},
  booktitle = {Proc. of the CLS opening Academic Year '97-'98},
  pages = {1-20},
  address = {Nijmegen},
  abstract = {This paper describes a method for improving the
                   performance of a continuous speech recognizer by
                   modelling pronunciation variation. Although the
                   improvements obtained with this method are small, they
                   are in line with those reported by other authors. A
                   series of experiments was carried out to model
                   pronunciation variation. In the first set of
                   experiments, word-internal pronunciation variation was
                   modelled by applying a set of four phonological rules
                   to the words in the lexicon. In the second set of
                   experiments, variation across word boundaries was also
                   modelled. The results obtained with both methods are
                   presented in detail. Furthermore, statistics are given
                   on the application of the four phonological rules on
                   the training database. We will explain why the
                   improvements obtained with this method are small and
                   how we intend to increase the improvements in our
                   future research.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/1997/kessens.1997.1.pdf},
  year = 1997
}
@inproceedings{Kessens-ICPhS-99,
  author = {J.M. Kessens and M. Wester and H. Strik},
  title = {Modeling within-word and cross-word pronunciation
                   variation to improve the performance of a {D}utch {CSR}},
  booktitle = {Proc. of ICPhS '99},
  pages = {1665-1668},
  address = {San Francisco},
  abstract = {This paper describes how the performance of a
                   continuous speech recognizer for Dutch has been
                   improved by modeling within-word and cross-word
                   pronunciation variation. Within-word variants were
                   automatically generated by applying five phonological
                   rules to the words in the lexicon. For the within-word
                   method, a significant improvement is found compared to
                   the baseline. Cross-word pronunciation variation was
                   modeled using two different methods: 1) adding
                   cross-word variants directly to the lexicon, 2) only
                   adding multi-words and their variants to the lexicon.
                   Overall, cross-word method 2 leads to better results
                   than cross-word method 1. The best results were
                   obtained when cross-word method 2 was combined with the
                   within-word method: a relative improvement of 8.8\% WER
                   was found compared to the baseline.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/kessens.1999.1.pdf},
  year = 1999
}
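
The within-word method described above generates variants by applying phonological rules to lexicon entries. A minimal Python sketch of the idea, assuming a toy lexicon and a single hypothetical rule (optional word-final /n/-deletion after schwa, a well-known Dutch process); the actual five-rule set is specified in the paper:

def n_deletion(pron):
    """Optionally delete a word-final /n/ after schwa ('ax'); return all outputs."""
    variants = [pron]
    if pron.endswith("ax n"):
        variants.append(pron[:-2].rstrip())  # drop the trailing " n"
    return variants

# Hypothetical canonical lexicon; phones are space-separated symbols.
lexicon = {"spreken": "s p r e k ax n", "lopen": "l o p ax n"}

for word, canonical in lexicon.items():
    for variant in n_deletion(canonical):
        print(word, "->", variant)
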
@inproceedings{wester_interspeech:11,
  author = {Mirjam Wester and Hui Liang},
  title = {Cross-Lingual Speaker Discrimination Using Natural and
                   Synthetic Speech},
  booktitle = {Proc. Interspeech},
  address = {Florence, Italy},
  abstract = {This paper describes speaker discrimination
                   experiments in which native English listeners were
                   presented with either natural speech stimuli in English
                   and Mandarin, synthetic speech stimuli in English and
                   Mandarin, or natural Mandarin speech and synthetic
                   English speech stimuli. In each experiment, listeners
                   were asked to decide whether they thought the sentences
                   were spoken by the same person or not. We found that
                   the results for Mandarin/English speaker discrimination
                   are very similar to results found in previous work on
                   German/English and Finnish/English speaker
                   discrimination. We conclude from this and previous work
                   that listeners are able to identify speakers across
                   languages and they are able to identify speakers across
                   speech types, but the combination of these two factors
                   leads to a speaker discrimination task which is too
                   difficult for listeners to perform successfully, given
                   the quality of across-language speaker adapted speech
                   synthesis at present.},
  categories = {speaker discrimination, speaker adaptation, HMM-based
                   speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_liang_interspeech_2011.pdf},
  year = 2011
}
@techreport{wester_accent2010,
  author = {Wester, M.},
  title = {The {EMIME} {B}ilingual {D}atabase},
  institution = {The University of Edinburgh},
  number = {EDI-INF-RR-1388},
  abstract = {This paper describes the collection of a bilingual
                   database of Finnish/English and German/English data. In
                   addition, the accents of the talkers in the database
                   have been rated. English, German and Finnish listeners
                   assessed the English, German and Finnish talkers'
                   degree of foreign accent in English. Native English
                   listeners showed higher inter-listener agreement than
                   non-native listeners. Further analyses showed that
                   non-native listeners judged Finnish and German female
                   talkers to be significantly less accented than
                   English listeners do. German males are judged less
                   accented by Finnish listeners than they are by English
                   and German listeners and there is no difference between
                   listeners as to how they judge the accent of Finnish
                   males. Finally, all English talkers are judged more
                   accented by non-native listeners than they are by
                   native English listeners.},
  categories = {evaluation, cross-lingual, accent rating},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_accent_2010.pdf},
  year = 2010
}
@inproceedings{wester-98-sposs,
  author = {M. Wester and J.M. Kessens and C. Cucchiarini and H.
                   Strik},
  title = {Selection of Pronunciation Variants in Spontaneous
                   Speech: Comparing the Performance of Man and Machine},
  booktitle = {Proc. of the ESCA Workshop on the Sound Patterns of
                   Spontaneous Speech: Production and Perception},
  pages = {157-160},
  address = {Aix-en-Provence},
  abstract = {In this paper, the performance of an automatic
                   transcription tool is evaluated. The transcription tool
                   is a continuous speech recognizer (CSR) operating in
                   forced recognition mode. For the evaluation, the
                   performance of the CSR was compared to that of nine
                   expert listeners. Machine and human listeners carried
                   out exactly the same task: deciding whether a segment
                   was present or not in 467 cases. The CSR's performance
                   proved to be comparable to that of the expert
                   listeners.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/wester.1998.5.pdf},
  year = 1998
}
@article{frankel07:AF_DBN,
  author = {Frankel, J. and Wester, M. and King, S.},
  title = {Articulatory feature recognition using dynamic
                   {B}ayesian networks},
  journal = {Computer Speech \& Language},
  volume = {21},
  number = {4},
  pages = {620--640},
  abstract = {We describe a dynamic Bayesian network for
                   articulatory feature recognition. The model is intended
                   to be a component of a speech recognizer that avoids
                   the problems of conventional ``beads-on-a-string''
                   phoneme-based models. We demonstrate that the model
                   gives superior recognition of articulatory features
                   from the speech signal compared with a state-of-the-art
                   neural network system. We also introduce a training
                   algorithm that offers two major advances: it does not
                   require time-aligned feature labels and it allows the
                   model to learn a set of asynchronous feature changes in
                   a data-driven manner.},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_etal_CSL2007.pdf},
  year = 2007
}
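
The benefit of modelling inter-feature dependencies, as described in the abstract above, can be shown with a toy one-frame example. This is not the paper's DBN, only a sketch with hypothetical classifier posteriors and a hypothetical joint prior over feature combinations:

import itertools

# Hypothetical per-frame posteriors from independent feature classifiers.
p_voicing = {"voiced": 0.55, "voiceless": 0.45}
p_manner = {"stop": 0.48, "fricative": 0.52}

# Hypothetical prior over feature combinations (learned from data in a real
# system); it down-weights combinations that rarely co-occur.
joint_prior = {
    ("voiced", "stop"): 0.35,
    ("voiced", "fricative"): 0.15,
    ("voiceless", "stop"): 0.20,
    ("voiceless", "fricative"): 0.30,
}

# Independent decoding: per-feature argmax, ignoring dependencies.
indep = (max(p_voicing, key=p_voicing.get), max(p_manner, key=p_manner.get))

# Joint decoding: rescore every combination with the dependency factor.
scores = {
    (v, m): p_voicing[v] * p_manner[m] * joint_prior[(v, m)]
    for v, m in itertools.product(p_voicing, p_manner)
}
joint = max(scores, key=scores.get)

print("independent:", indep)  # ('voiced', 'fricative')
print("joint:      ", joint)  # ('voiced', 'stop') -- the dependency flips the decision
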
@inproceedings{Wester-icslp-02,
  author = {M. Wester and J.M. Kessens and H. Strik},
  title = {Goal-directed {ASR} in a multimedia indexing and
                   searching environment ({MUMIS})},
  booktitle = {Proc. of ICSLP},
  pages = {1993-1996},
  address = {Denver},
  abstract = {This paper describes the contribution of automatic
                   speech recognition (ASR) within the framework of MUMIS
                   (Multimedia Indexing and Searching Environment). The
                   domain is football commentaries. The initial results of
                   carrying out ASR on Dutch and English football
                   commentaries are presented. We found that overall word
                   error rates are high, but application specific words
                   are recognized reasonably well. The difficulty of the
                   ASR task is greatly increased by the high levels of
                   noise present in the material.},
  categories = {asr, MUMIS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/wester.2002.2.pdf},
  year = 2002
}
@inproceedings{wester-97,
  author = {M. Wester and J.M. Kessens and C. Cucchiarini and H.
                   Strik},
  title = {Modelling pronunciation variation: some preliminary
                   results},
  booktitle = {Proc. of the Dept. of Language \& Speech},
  pages = {127-137},
  address = {Nijmegen},
  abstract = {In this paper we describe a method for improving the
                   performance of a continuous speech recognizer by
                   modelling pronunciation variation. Although the results
                   obtained with this method are in line with those
                   reported by other authors, the magnitude of the
                   improvements is very small. In looking for possible
                   explanations for these results, we computed various
                   sorts of statistics about the material. Since these
                   data proved to be very useful in understanding the
                   effects of our method, they are discussed in this
                   paper. Moreover, on the basis of these statistics we
                   discuss how the system can be improved in the future.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/wester.1997.1.pdf},
  year = 1997
}
@article{wester:specom:12,
  author = {Mirjam Wester},
  title = {Talker discrimination across languages},
  journal = {Speech Communication},
  volume = {54},
  pages = {781--790},
  abstract = {This study investigated the extent to which listeners
                   are able to discriminate between bilingual talkers in
                   three language pairs – English–German,
                   English–Finnish and English–Mandarin. Native
                   English listeners were presented with two sentences
                   spoken by bilingual talkers and were asked to judge
                   whether they thought the sentences were spoken by the
                   same person. Equal amounts of cross-language and
                   matched-language trials were presented. The results
                   show that native English listeners are able to carry
                   out this task well, achieving percent-correct levels
                   well above chance for all three language pairs.
                   Previous research has shown this for English–German;
                   this study shows that listeners extend this ability to
                   Finnish and Mandarin, languages that are quite distinct
                   from English from a genetic and phonetic similarity
                   perspective. However, listeners are significantly less
                   accurate on cross-language talker trials
                   (English–foreign) than on matched-language trials
                   (English–English and foreign–foreign).
                   Understanding listeners’ behaviour in cross-language
                   talker discrimination using natural speech is the first
                   step in developing principled evaluation techniques for
                   synthesis systems in which the goal is for the
                   synthesised voice to sound like the original speaker,
                   for instance, in speech-to-speech translation systems,
                   voice conversion and reconstruction.},
  categories = {evaluation},
  doi = {10.1016/j.specom.2012.01.006},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/wester_specom_12.pdf},
  year = 2012
}
@inproceedings{Wester-00,
  author = {M. Wester and J.M. Kessens and H. Strik},
  title = {Pronunciation variation in {ASR}: Which variation to
                   model?},
  booktitle = {Proc. of {ICSLP} '00},
  volume = {IV},
  pages = {488-491},
  address = {Beijing},
  abstract = {This paper describes how the performance of a
                   continuous speech recognizer for Dutch has been
                   improved by modeling within-word and cross-word
                   pronunciation variation. A relative improvement of
                   8.8\% in WER was found compared to baseline system
                   performance. However, as WERs do not reveal the full
                   effect of modeling pronunciation variation, we
                   performed a detailed analysis of the differences in
                   recognition results that occur due to modeling
                   pronunciation variation and found that indeed many of
                   the differences in recognition results are not
                   reflected in the error rates. Furthermore, error
                   analysis revealed that testing sets of variants in
                   isolation does not predict their behavior in
                   combination. However, these results appeared to be
                   corpus dependent.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/wester.2000.1.pdf},
  year = 2000
}
@inproceedings{wester04:asynch,
  author = {Wester, M. and Frankel, J. and King, S.},
  title = {Asynchronous Articulatory Feature Recognition Using
                   Dynamic {B}ayesian Networks},
  booktitle = {Proc. IEICE Beyond HMM Workshop},
  address = {Kyoto},
  abstract = {This paper builds on previous work where dynamic
                   Bayesian networks (DBN) were proposed as a model for
                   articulatory feature recognition. Using DBNs makes it
                   possible to model the dependencies between features, an
                   addition to previous approaches which was found to
                   improve feature recognition performance. The DBN
                   results were promising, giving close to the accuracy of
                   artificial neural nets (ANNs). However, the system was
                   trained on canonical labels, leading to an overly
                   strong set of constraints on feature co-occurrence. In
                   this study, we describe an embedded training scheme
                   which learns a set of data-driven asynchronous feature
                   changes where supported in the data. Using a subset of
                   the OGI Numbers corpus, we describe articulatory
                   feature recognition experiments using both
                   canonically-trained and asynchronous DBNs. Performance
                   using DBNs is found to exceed that of ANNs trained on
                   an identical task, giving a higher recognition
                   accuracy. Furthermore, inter-feature dependencies
                   result in a more structured model, giving rise to fewer
                   feature combinations in the recognition output. In
                   addition to an empirical evaluation of this modelling
                   approach, we give a qualitative analysis, comparing
                   asynchrony found through our data-driven methods to the
                   asynchrony which may be expected on the basis of
                   linguistic knowledge.},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.ps},
  year = 2004
}
@inproceedings{Wester-03,
  author = {M. Wester},
  title = {Syllable classification using articulatory-acoustic
                   features},
  booktitle = {Proc. of Eurospeech '03},
  address = {Geneva},
  abstract = {This paper investigates the use of
                   articulatory-acoustic features for the classification
                   of syllables in TIMIT. The main motivation for this
                   study is to circumvent the ``beads-on-a-string''
                   problem, i.e. the assumption that words can be
                   described as a simple concatenation of phones.
                   Posterior probabilities for articulatory-acoustic
                   features are obtained from artificial neural nets and
                   are used to classify speech within the scope of
                   syllables instead of phones. This gives the opportunity
                   to account for asynchronous feature changes, exploiting
                   the strengths of the articulatory-acoustic features,
                   instead of losing the potential by reverting to phones.},
  categories = {aaf, syllable, TIMIT, Edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/wester.2003.1.pdf},
  year = 2003
}
@article{Dines2011,
  author = {John Dines and Hui Liang and Lakshmi Saheer and
                   Matthew Gibson and William Byrne and Keiichiro Oura and
                   Keiichi Tokuda and Junichi Yamagishi and Simon King and
                   Mirjam Wester and Teemu Hirsimäki and Reima
                   Karhila and Mikko Kurimo},
  title = {Personalising speech-to-speech translation:
                   Unsupervised cross-lingual speaker adaptation for
                   {HMM}-based speech synthesis},
  journal = {Computer Speech and Language},
  volume = {27},
  number = {2},
  pages = {420--437},
  abstract = {In this paper we present results of unsupervised
                   cross-lingual speaker adaptation applied to
                   text-to-speech synthesis. The application of our
                   research is the personalisation of speech-to-speech
                   translation in which we employ an HMM statistical
                   framework for both speech recognition and synthesis.
                   This framework provides a logical mechanism to adapt
                   synthesised speech output to the voice of the user by
                   way of speech recognition. In this work we present
                   results of several different unsupervised and
                   cross-lingual adaptation approaches as well as an
                   end-to-end speaker adaptive speech-to-speech
                   translation system. Our experiments show that we can
                   successfully apply speaker adaptation in both
                   unsupervised and cross-lingual scenarios and our
                   proposed algorithms seem to generalise well for several
                   language pairs. We also discuss important future
                   directions including the need for better evaluation
                   metrics.},
  doi = {10.1016/j.csl.2011.08.003},
  issn = {0885-2308},
  keywords = {Speech-to-speech translation, Cross-lingual speaker
                   adaptation, HMM-based speech synthesis, Speaker
                   adaptation, Voice conversion},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230811000441},
  month = feb,
  year = 2013
}
@inproceedings{wester:interspeech:10,
  author = {Mirjam Wester},
  title = {Cross-lingual talker discrimination},
  booktitle = {Proc. of Interspeech},
  address = {Makuhari, Japan},
  abstract = {This paper describes a talker discrimination
                   experiment in which native English listeners were
                   presented with two sentences spoken by bilingual
                   talkers (English/German and English/Finnish) and were
                   asked to judge whether they thought the sentences were
                   spoken by the same person or not. Equal amounts of
                   cross-lingual and matched-language trials were
                   presented. The experiments showed that listeners are
                   able to complete this task well; they can discriminate
                   between talkers significantly better than chance.
                   However, listeners are significantly less accurate on
                   cross-lingual talker trials than on matched-language
                   pairs. No significant differences were found on this
                   task between German and Finnish. Bias (B'') and
                   Sensitivity (A') values are presented to analyse the
                   listeners' behaviour in more detail. The results are
                   promising for the evaluation of EMIME, a project
                   covering speech-to-speech translation with speaker
                   adaptation.},
  categories = {evaluation},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_interspeech2010.pdf},
  year = 2010
}
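
The sensitivity (A') and bias (B'') measures named in the abstract above are standard non-parametric detection statistics computed from hit and false-alarm rates. A small sketch with hypothetical rates (the formulas assume the hit rate exceeds the false-alarm rate):

def a_prime(h, f):
    """Non-parametric sensitivity A' (assumes h >= f)."""
    return 0.5 + ((h - f) * (1 + h - f)) / (4 * h * (1 - f))

def b_double_prime(h, f):
    """Grier's bias B''; the sign indicates the direction of response bias."""
    return (h * (1 - h) - f * (1 - f)) / (h * (1 - h) + f * (1 - f))

hit_rate, fa_rate = 0.85, 0.20  # hypothetical same-talker hits / false alarms
print(round(a_prime(hit_rate, fa_rate), 3))         # 0.894
print(round(b_double_prime(hit_rate, fa_rate), 3))  # -0.113
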
@inproceedings{frankel04:artic_dbn,
  author = {Frankel, J. and Wester, M. and King, S.},
  title = {Articulatory feature recognition using dynamic
                   {B}ayesian networks},
  booktitle = {Proc. {ICSLP}},
  abstract = {This paper describes the use of dynamic Bayesian
                   networks for the task of articulatory feature
                   recognition. We show that by modeling the dependencies
                   between a set of 6 multi-leveled articulatory features,
                   recognition accuracy is increased over an equivalent
                   system in which features are considered independent.
                   Results are compared to those found using artificial
                   neural networks on an identical task.},
  categories = {am,artic,asr,dbn,timit,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.ps},
  year = 2004
}
@inproceedings{cooke:lista:12,
  author = {Martin Cooke and Maria Luisa García Lecumberri and
                   Yan Tang and Mirjam Wester},
  title = {Do non-native listeners benefit from speech
                   modifications designed to promote intelligibility for
                   native listeners?},
  booktitle = {Proceedings of The Listening Talker Workshop},
  pages = 59,
  note = {http://listening-talker.org/workshop/programme.html},
  year = 2012
}
@article{Wester-01,
  author = {M. Wester and J. M. Kessens and C. Cucchiarini and H.
                   Strik},
  title = {Obtaining phonetic transcriptions: a comparison
                   between expert listeners and a continuous speech
                   recognizer},
  journal = {Language and Speech},
  volume = {44},
  number = {3},
  pages = {377-403},
  abstract = {In this article, we address the issue of using a
                   continuous speech recognition tool to obtain phonetic
                   or phonological representations of speech. Two
                   experiments were carried out in which the performance
                   of a continuous speech recognizer (CSR) was compared to
                   the performance of expert listeners in a task of
                   judging whether a number of prespecified phones had
                   been realized in an utterance. In the first experiment,
                   nine expert listeners and the CSR carried out exactly
                   the same task: deciding whether a segment was present
                   or not in 467 cases. In the second experiment, we
                   expanded on the first experiment by focusing on two
                   phonological processes: schwa-deletion and
                   schwa-insertion. The results of these experiments show
                   that significant differences in performance were found
                   between the CSR and the listeners, but also between
                   individual listeners. Although some of these
                   differences appeared to be statistically significant,
                   their magnitude is such that they may very well be
                   acceptable depending on what the transcriptions are
                   needed for. In other words, although the CSR is not
                   infallible, it makes it possible to explore large
                   datasets, which might outweigh the errors introduced by
                   the mistakes the CSR makes. For these reasons, we can
                   conclude that the CSR can be used instead of a listener
                   to carry out this type of task: deciding whether a
                   phone is present or not.},
  categories = {automatic transcription, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/wester.2001.3.pdf},
  year = 2001
}
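
A minimal sketch of the kind of comparison reported above: percent agreement between a listener's and the recognizer's binary "phone realized?" decisions over the same cases. The data here are toy values, not from the study:

# Hypothetical binary judgements (1 = phone realized, 0 = not realized).
listener = [1, 1, 0, 1, 0, 0, 1, 0]
csr      = [1, 0, 0, 1, 0, 1, 1, 0]

agreement = sum(a == b for a, b in zip(listener, csr)) / len(listener)
print(f"{agreement:.0%} agreement")  # 75% agreement
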
@article{Wester-CSL-03,
  author = {M. Wester},
  title = {Pronunciation modeling for {ASR} -- knowledge-based
                   and data-derived methods},
  journal = {Computer Speech and Language},
  volume = {17},
  pages = {69-85},
  abstract = {This article focuses on modeling pronunciation
                   variation in two different ways: data-derived and
                   knowledge-based. The knowledge-based approach consists
                   of using phonological rules to generate variants. The
                   data-derived approach consists of performing phone
                   recognition, followed by smoothing using decision trees
                   (D-trees) to alleviate some of the errors in the phone
                   recognition. Using phonological rules led to a small
                   improvement in WER; a data-derived approach in which
                   the phone recognition was smoothed using D-trees prior
                   to lexicon generation led to larger improvements
                   compared to the baseline. The lexicon was employed in
                   two different recognition systems: a hybrid HMM/ANN
                   system and an HMM-based system, to ascertain whether
                   pronunciation variation was truly being modeled. This
                   proved to be the case as no significant differences
                   were found between the results obtained with the two
                   systems. Furthermore, we found that 10\% of variants
                   generated by the phonological rules were also found
                   using phone recognition, and this increased to 28\%
                   when the phone recognition output was smoothed by using
                   D-trees. This indicates that the D-trees generalize
                   beyond what has been seen in the training material,
                   whereas when the phone recognition approach is employed
                   directly, unseen pronunciations cannot be predicted. In
                   addition, we propose a metric to measure confusability
                   in the lexicon. Using this confusion metric to prune
                   variants results in roughly the same improvement as
                   using the D-tree method.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/CSL-pronvar.pdf},
  year = 2003
}
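
The article proposes a confusability metric for pruning variants; its exact definition is given in the paper. As a loose, hypothetical illustration only, one simple indicator of lexical confusability is a pronunciation variant shared by more than one word, since identical variants cannot be separated acoustically:

from collections import defaultdict

# Toy variant lexicon; pronunciations are space-separated phone symbols.
lexicon_variants = {
    "meer": ["m e r"],
    "meneer": ["m ax n e r", "m e r"],  # hypothetical reduced variant collides
    "lopen": ["l o p ax n", "l o p ax"],
}

by_pron = defaultdict(list)
for word, prons in lexicon_variants.items():
    for p in prons:
        by_pron[p].append(word)

confusable = {p: ws for p, ws in by_pron.items() if len(ws) > 1}
print(confusable)  # {'m e r': ['meer', 'meneer']}
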
@inproceedings{Wester-Fosler-00,
  author = {M. Wester and E. Fosler-Lussier},
  title = {A comparison of data-derived and knowledge-based
                   modeling of pronunciation variation},
  booktitle = {Proc. of ICSLP '00},
  volume = {I},
  pages = {270-273},
  address = {Beijing},
  abstract = {This paper focuses on modeling pronunciation variation
                   in two different ways: data-derived and
                   knowledge-based. The knowledge-based approach consists
                   of using phonological rules to generate variants. The
                   data-derived approach consists of performing phone
                   recognition, followed by various pruning and smoothing
                   methods to alleviate some of the errors in the phone
                   recognition. Using phonological rules led to a small
                   improvement in WER; whereas, using a data-derived
                   approach in which the phone recognition was smoothed
                   using simple decision trees (d-trees) prior to lexicon
                   generation led to a significant improvement compared to
                   the baseline. Furthermore, we found that 10\% of
                   variants generated by the phonological rules were also
                   found using phone recognition, and this increased to
                   23\% when the phone recognition output was smoothed by
                   using d-trees. In addition, we propose a metric to
                   measure confusability in the lexicon and we found that
                   employing this confusion metric to prune variants
                   results in roughly the same improvement as using the
                   d-tree method.},
  categories = {asr, pm, VIOS, Berkeley},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/wester.2000.2.pdf},
  year = 2000
}
@article{Oura2012703,
  author = {Keiichiro Oura and Junichi Yamagishi and Mirjam Wester
                   and Simon King and Keiichi Tokuda},
  title = {Analysis of unsupervised cross-lingual speaker
                   adaptation for {HMM}-based speech synthesis using
                   {KLD}-based transform mapping},
  journal = {Speech Communication},
  volume = {54},
  number = {6},
  pages = {703--714},
  abstract = {In the EMIME project, we developed a mobile device
                   that performs personalized speech-to-speech translation
                   such that a user's spoken input in one language is used
                   to produce spoken output in another language, while
                   continuing to sound like the user's voice. We
                   integrated two techniques into a single architecture:
                   unsupervised adaptation for HMM-based TTS using
                   word-based large-vocabulary continuous speech
                   recognition, and cross-lingual speaker adaptation
                   (CLSA) for HMM-based TTS. The CLSA is based on a
                   state-level transform mapping learned using minimum
                   Kullback-Leibler divergence between pairs of HMM states
                   in the input and output languages. Thus, an
                   unsupervised cross-lingual speaker adaptation system
                   was developed. End-to-end speech-to-speech translation
                   systems for four languages (English, Finnish, Mandarin,
                   and Japanese) were constructed within this framework.
                   In this paper, the English-to-Japanese adaptation is
                   evaluated. Listening tests demonstrate that adapted
                   voices sound more similar to a target speaker than
                   average voices and that differences between supervised
                   and unsupervised cross-lingual speaker adaptation are
                   small. Calculating the KLD state-mapping on only the
                   first 10 mel-cepstral coefficients leads to huge
                   savings in computational costs, without any detrimental
                   effect on the quality of the synthetic speech.},
  doi = {10.1016/j.specom.2011.12.004},
  issn = {0167-6393},
  keywords = {HMM-based speech synthesis, Unsupervised speaker
                   adaptation, Cross-lingual speaker adaptation,
                   Speech-to-speech translation},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639311001774},
  year = 2012
}
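
The KLD-based state mapping described above relies on the closed-form Kullback-Leibler divergence between Gaussian state output distributions. A minimal sketch for single diagonal-covariance Gaussians with toy parameters; a real system maps full HMM state sets across languages this way:

import math

def kld_diag_gauss(mu0, var0, mu1, var1):
    """KL( N(mu0, var0) || N(mu1, var1) ) for diagonal covariances."""
    return 0.5 * sum(
        v0 / v1 + (m1 - m0) ** 2 / v1 - 1.0 + math.log(v1 / v0)
        for m0, v0, m1, v1 in zip(mu0, var0, mu1, var1)
    )

# Hypothetical 2-dimensional state statistics: (means, variances).
input_states = {"in_a": ([0.0, 1.0], [1.0, 0.5]),
                "in_b": ([2.0, -1.0], [0.8, 0.9])}
output_states = {"out_1": ([0.1, 0.9], [1.1, 0.6]),
                 "out_2": ([1.9, -1.2], [0.7, 1.0])}

# Map each input-language state to the output-language state with minimum KLD.
mapping = {
    name_i: min(output_states,
                key=lambda name_o: kld_diag_gauss(*stats_i, *output_states[name_o]))
    for name_i, stats_i in input_states.items()
}
print(mapping)  # {'in_a': 'out_1', 'in_b': 'out_2'}
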
@inproceedings{badinoclark_interspeech12,
  author = {Leonardo Badino and Robert A.J. Clark and Mirjam
                   Wester},
  title = {Towards Hierarchical Prosodic Prominence Generation in
                   {TTS} Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  categories = {speech synthesis, prosody},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/badinoclark_IS_2012.pdf},
  year = 2012
}
@inproceedings{wester-98-icslp,
  author = {M. Wester and J.M. Kessens and H. Strik},
  title = {Modeling pronunciation variation for a {D}utch {CSR}:
                   testing three methods},
  booktitle = {Proc. ICSLP '98},
  pages = {2535-2538},
  address = {Sydney},
  abstract = {This paper describes how the performance of a
                   continuous speech recognizer for Dutch has been
                   improved by modeling pronunciation variation. We used
                   three methods to model pronunciation variation. First,
                   within-word variation was dealt with. Phonological
                   rules were applied to the words in the lexicon, thus
                   automatically generating pronunciation variants.
                   Secondly, cross-word pronunciation variation was
                   modeled using two different approaches. The first
                   approach was to model cross-word processes by adding
                   the variants as separate words to the lexicon and in
                   the second approach this was done by using multi-words.
                   For each of the methods, recognition experiments were
                   carried out. A significant improvement was found for
                   modeling within-word variation. Furthermore, modeling
                   cross-word processes using multi-words leads to
                   significantly better results than modeling them using
                   separate words in the lexicon.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/wester.1998.3.pdf},
  year = 1998
}
@inproceedings{wester-98-kerkrade,
  author = { M. Wester and J.M. Kessens and H. Strik},
  title = {Improving the Performance of a {D}utch {CSR} by
                   Modeling Pronunciation Variation},
  booktitle = {Proc. of the Workshop Modeling Pronunciation Variation
                   for Automatic Speech Recognition},
  pages = {145-150},
  address = {Kerkrade},
  abstract = {This paper describes how the performance of a
                   continuous speech recognizer for Dutch has been
                   improved by modeling pronunciation variation. We used
                   three methods in order to model pronunciation
                   variation. First, within-word variation was dealt with.
                   Phonological rules were applied to the words in the
                   lexicon, thus automatically generating pronunciation
                   variants. Secondly, cross-word pronunciation variation
                   was accounted for by adding multi-words and their
                   variants to the lexicon. Thirdly, probabilities of
                   pronunciation variants were incorporated in the
                   language model (LM), and thresholds were used to choose
                   which pronunciation variants to add to the LMs. For
                   each of the methods, recognition experiments were
                   carried out. A significant improvement in error rates
                   was measured.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/wester.1998.4.pdf},
  year = 1998
}
@inproceedings{wester-98-sd,
  author = {M. Wester and J.M. Kessens and H. Strik},
  title = {Two automatic approaches for analyzing the frequency
                   of connected speech processes in {D}utch},
  booktitle = {Proc. ICSLP Student Day '98},
  pages = {3351-3356},
  address = {Sydney},
  abstract = {This paper describes two automatic approaches used to
                   study connected speech processes (CSPs) in Dutch. The
                   first approach was from a linguistic point of view -
                   the top-down method. This method can be used for
                   verification of hypotheses about CSPs. The second
                   approach - the bottom-up method - uses a constrained
                   phone recognizer to generate phone transcriptions. An
                   alignment was carried out between the two
                   transcriptions and a reference transcription. A
                   comparison between the two methods showed that 68\%
                   agreement was achieved on the CSPs. Although phone
                   accuracy is only 63\%, the bottom-up approach is useful
                   for studying CSPs. From the data generated using the
                   bottom-up method, indications of which CSPs are present
                   in the material can be found. These indications can be
                   used to generate hypotheses which can then be tested
                   using the top-down method.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/wester.1998.2.pdf},
  year = 1998
}
@inproceedings{wester_icassp:11,
  author = {Mirjam Wester and Reima Karhila},
  title = {Speaker Similarity Evaluation of Foreign-accented
                   Speech Synthesis using {HMM}-based Speaker Adaptation},
  booktitle = {Proc. ICASSP},
  pages = {5372--5375},
  address = {Prague, Czech Republic},
  abstract = {This paper describes a speaker discrimination
                   experiment in which native English listeners were
                   presented with natural and synthetic speech stimuli in
                   English and were asked to judge whether they thought
                   the sentences were spoken by the same person or not.
                   The natural speech consisted of recordings of Finnish
                   speakers speaking English. The synthetic stimuli were
                   created using adaptation data from the same Finnish
                   speakers. Two average voice models were compared: one
                   trained on Finnish-accented English and the other on
                   American-accented English. The experiments illustrate
                   that listeners perform well at speaker discrimination
                   when the stimuli are either both natural or both synthetic,
                   but when the speech types are crossed performance drops
                   significantly. We also found that the type of accent in
                   the average voice model had no effect on the
                   listeners’ speaker discrimination performance.},
  categories = {Similarity Evaluation, Speaker Adaptation,
                   HMM-synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_icassp_2011.pdf},
  year = 2011
}
@inproceedings{Chang-Euro-01,
  author = {S. Chang and S. Greenberg and M. Wester},
  title = {An Elitist Approach to Articulatory-Acoustic Feature
                   Classification},
  booktitle = {Proc. of Eurospeech '01},
  pages = {1729-1733},
  address = {Aalborg},
  abstract = {A novel framework for automatic articulatory-acoustic
                   feature extraction has been developed for enhancing the
                   accuracy of place- and manner-of-articulation
                   classification in spoken language. The elitist approach
                   focuses on frames for which neural network (MLP)
                   classifiers are highly confident, and discards the
                   rest. Using this method, it is possible to achieve a
                   frame-level accuracy of 93\% for manner information on
                   a corpus of American English sentences passed through a
                   telephone network (NTIMIT). Place information is
                   extracted for each manner class independently,
                   resulting in an appreciable gain in place-feature
                   classification relative to performance for a
                   manner-independent system. The elitist framework provides a
                   potential means of automatically annotating a corpus at
                   the phonetic level without recourse to a word-level
                   transcript and could thus be of utility for developing
                   training materials for automatic speech recognition and
                   speech synthesis applications, as well as aid the
                   empirical study of spoken language.},
  categories = {aaf, NTIMIT, Berkeley},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/wester.2001.2.pdf},
  year = 2001
}
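
The elitist approach above keeps only frames on which the MLP classifiers are confident. A toy sketch of that selection step, assuming hypothetical per-frame posteriors and an illustrative threshold:

# Hypothetical per-frame manner posteriors from an MLP classifier.
frames = [
    {"stop": 0.95, "fricative": 0.03, "vowel": 0.02},
    {"stop": 0.40, "fricative": 0.35, "vowel": 0.25},  # low confidence
    {"stop": 0.05, "fricative": 0.07, "vowel": 0.88},
]

THRESHOLD = 0.7  # illustrative confidence cut-off
elite = [(i, max(p, key=p.get)) for i, p in enumerate(frames)
         if max(p.values()) >= THRESHOLD]
print(elite)  # [(0, 'stop'), (2, 'vowel')] -- frame 1 is discarded
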
@inproceedings{oura:icassp:10,
  author = {Keiichiro Oura and Keiichi Tokuda and Junichi
                   Yamagishi and Mirjam Wester and Simon King},
  title = {Unsupervised Cross-lingual Speaker Adaptation for
                   {HMM}-based Speech Synthesis},
  booktitle = {Proc. of ICASSP},
  volume = {I},
  pages = {4954-4957},
  abstract = {In the EMIME project, we are developing a mobile
                   device that performs personalized speech-to-speech
                   translation such that a user's spoken input in one
                   language is used to produce spoken output in another
                   language, while continuing to sound like the user's
                   voice. We integrate two techniques, unsupervised
                   adaptation for HMM-based TTS using a word-based
                   large-vocabulary continuous speech recognizer and
                   cross-lingual speaker adaptation for HMM-based TTS,
                   into a single architecture. Thus, an unsupervised
                   cross-lingual speaker adaptation system can be
                   developed. Listening tests show very promising results,
                   demonstrating that adapted voices sound similar to the
                   target speaker and that differences between supervised
                   and unsupervised cross-lingual speaker adaptation are
                   small.},
  categories = {speaker adaptation, TTS},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/oura_icassp2010.pdf},
  year = 2010
}
@phdthesis{Wester-02,
  author = {Mirjam Wester},
  title = {Pronunciation Variation Modeling for {D}utch Automatic
                   Speech Recognition},
  school = {University of Nijmegen},
  abstract = {This thesis consists of an introductory review to
                   pronunciation variation modeling, followed by four
                   papers in which the PhD research is described.},
  categories = {asr, pm, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/thesis.pdf},
  year = 2002
}
@inproceedings{Wester-ICPhS-99,
  author = {M. Wester and J.M. Kessens},
  title = {Comparison between expert listeners and continuous
                   speech recognizers in selecting pronunciation variants},
  booktitle = {Proc. of ICPhS '99},
  pages = {723-726},
  address = {San Francisco},
  abstract = {In this paper, the performance of an automatic
                   transcription tool is evaluated. The transcription tool
                   is a continuous speech recognizer (CSR) which can be
                   used to select pronunciation variants (i.e. detect
                   insertions and deletions of phones). The performance of
                   the CSR was compared to a reference transcription based
                   on the judgments of expert listeners. We investigated
                   to what extent the degree of agreement between the
                   listeners and the CSR was affected by employing various
                   sets of phone models (PMs). Overall, the PMs perform
                   more similarly to the listeners when pronunciation
                   variation is modeled. However, the various sets of PMs
                   lead to different results for insertion and deletion
                   processes. Furthermore, we found that to a certain
                   degree, word error rates can be used to predict which
                   set of PMs to use in the transcription tool.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/wester.1999.1.pdf},
  year = 1999
}
@inproceedings{Gutkin:etal:ets-cam04,
  author = {Alexander Gutkin and David Gay and Lev Goldfarb and
                   Mirjam Wester},
  title = {On the {A}rticulatory {R}epresentation of {S}peech
                   within the {E}volving {T}ransformation {S}ystem
                   {F}ormalism},
  booktitle = {Pattern Representation and the Future of Pattern
                   Recognition (Proc. Satellite Workshop of 17th
                   International Conference on Pattern Recognition)},
  editor = {Lev Goldfarb},
  pages = {57--76},
  address = {Cambridge, UK},
  abstract = { This paper deals with the formulation of an
                   alternative, structural, approach to the speech
                   representation and recognition problem. In this
                   approach, we require both the representation and the
                   learning algorithms to be linguistically meaningful and
                   to naturally represent the linguistic data at hand.
                   This allows the speech recognition system to discover
                   the emergent combinatorial structure of the linguistic
                   classes. The proposed approach is developed within the
                   ETS formalism, the first formalism in applied
                   mathematics specifically designed to address the issues
                   of class and object/event representation. We present an
                   initial application of ETS to the articulatory
                   modelling of speech based on elementary physiological
                   gestures that can be reliably represented as the ETS
                   primitives. We discuss the advantages of this gestural
                   approach over prevalent methods and its promising
                   potential to mathematical modelling and representation
                   in linguistics. },
  categories = {structural,recognition,ets,artic,mocha,edinburgh,unb},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ets_cam04_dasr.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ets_cam04_dasr.ps.gz},
  year = 2004
}
@inproceedings{wester00:_using_dutch_asr,
  author = {M. Wester and J.M. Kessens and H. Strik},
  title = {Using {D}utch phonological rules to model
                   pronunciation variation in {ASR}},
  booktitle = {Phonus 5: proceedings of the "workshop on phonetics
                   and phonology in {ASR}"},
  pages = {105--116},
  address = {Saarbr\"{u}cken},
  abstract = {In this paper, we describe how the performance of a
                   continuous speech recognizer for Dutch has been
                   improved by modeling within-word and cross-word
                   pronunciation variation. Within-word variants were
                   automatically generated by applying five phonological
                   rules to the words in the lexicon. Cross-word
                   pronunciation variation was modeled by adding
                   multi-words and their variants to the lexicon. The best
                   results were obtained when the cross-word method was
                   combined with the within-word method: a relative
                   improvement of 8.8\% in the WER was found compared to
                   baseline system performance. We also describe an error
                   analysis that was carried out to investigate whether
                   rules in isolation can predict the performance of rules
                   in combination.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/wester.2000.3.pdf},
  year = 2000
}
@inproceedings{kurimo:acl:10,
  author = {Mikko Kurimo and William Byrne and John Dines and
                   Philip N. Garner and Matthew Gibson and Yong Guan and
                   Teemu Hirsim\"{a}ki and Reima Karhila and Simon King
                   and Hui Liang and Keiichiro Oura and Lakshmi Saheer and
                   Matt Shannon and Sayaka Shiota and Jilei Tian and
                   Keiichi Tokuda and Mirjam Wester and Yi-Jian Wu and
                   Junichi Yamagishi},
  title = {Personalising speech-to-speech translation in the
                   {EMIME} project},
  booktitle = {Proc. of the ACL 2010 System Demonstrations},
  address = {Uppsala, Sweden},
  abstract = {In the EMIME project we have studied unsupervised
                   cross-lingual speaker adaptation. We have employed an
                   HMM statistical framework for both speech recognition
                   and synthesis which provides transformation mechanisms
                   to adapt the synthesized voice in TTS (text-to-speech)
                   using the recognized voice in ASR (automatic speech
                   recognition). An important application for this
                   research is personalised speech-to-speech translation
                   that will use the voice of the speaker in the input
                   language to utter the translated sentences in the
                   output language. In mobile environments, this enhances
                   the users' interaction across language barriers by
                   making the output speech sound more like the original
                   speaker's way of speaking, even if she or he cannot
                   speak the output language.},
  categories = {speaker adaptation},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/kurimo_acl_2010.pdf},
  year = 2010
}
@inproceedings{wester-98-voicedata,
  author = {M. Wester},
  title = {Automatic Classification of Voice Quality: Comparing
                   Regression Models and Hidden {M}arkov Models},
  booktitle = {Proc. of VOICEDATA98, Symposium on Databases in Voice
                   Quality Research and Education},
  pages = {92--97},
  address = {Utrecht},
  abstract = {In this paper, two methods for automatically
                   classifying voice quality are compared: regression
                   analysis and hidden Markov models (HMMs). The findings
                   of this research show that HMMs can be used to classify
                   voice quality. The HMMs performed better than the
                   regression models in classifying breathiness and
                   overall degree of deviance, and the two methods showed
                   similar results on the roughness scale. However, the
                   results are not spectacular. This is mainly due to the
                   type of material that was available and the number of
                   listeners who assessed it. Nonetheless, I
                   argue in this paper that these findings are interesting
                   because they are a promising step towards developing a
                   system for classifying voice quality.},
  categories = {voice quality, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/wester.1998.1.pdf},
  year = 1998
}
@techreport{wester_mandarin:11,
  author = {Mirjam Wester and Hui Liang},
  title = {The {EMIME} {M}andarin {B}ilingual {D}atabase},
  institution = {The University of Edinburgh},
  number = {EDI-INF-RR-1396},
  abstract = {This paper describes the collection of a bilingual
                   database of Mandarin/English data. In addition, the
                   accents of the talkers in the database have been rated.
                   English and Mandarin listeners assessed the English and
                   Mandarin talkers' degree of foreign accent in English.},
  categories = {evaluation,cross-lingual, accent rating},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/wester_mandarin_2011.pdf},
  year = 2011
}
@inproceedings{wester:ssw7:10,
  author = {Mirjam Wester and John Dines and Matthew Gibson and
                   Hui Liang and Yi-Jian Wu and Lakshmi Saheer and Simon
                   King and Keiichiro Oura and Philip N. Garner and
                   William Byrne and Yong Guan and Teemu Hirsim\"{a}ki and
                   Reima Karhila and Mikko Kurimo and Matt Shannon and
                   Sayaka Shiota and Jilei Tian and Keiichi Tokuda and
                   Junichi Yamagishi},
  title = {Speaker adaptation and the evaluation of speaker
                   similarity in the {EMIME} speech-to-speech translation
                   project},
  booktitle = {Proc. of 7th ISCA Speech Synthesis Workshop},
  address = {Kyoto, Japan},
  abstract = {This paper provides an overview of speaker adaptation
                   research carried out in the EMIME speech-to-speech
                   translation (S2ST) project. We focus on how speaker
                   adaptation transforms can be learned from speech in one
                   language and applied to the acoustic models of another
                   language. The adaptation is transferred across
                   languages and/or from recognition models to synthesis
                   models. The various approaches investigated can all be
                   viewed as a process in which a mapping is defined in
                   terms of either acoustic model states or linguistic
                   units. The mapping is used to transfer either speech
                   data or adaptation transforms between the two models.
                   Because the success of speaker adaptation in
                   text-to-speech synthesis is measured by judging speaker
                   similarity, we also discuss issues concerning
                   evaluation of speaker similarity in an S2ST scenario.},
  categories = {speaker adaptation, evaluation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_ssw7_2010.pdf},
  year = 2010
}
@inproceedings{kessens-COST-97,
  author = {J.M. Kessens and M. Wester and C. Cucchiarini and H.
                   Strik},
  title = {Testing a Method for Modelling Pronunciation Variation},
  booktitle = {Proceedings of the COST workshop},
  pages = {37--40},
  address = {Rhodes},
  abstract = {In this paper we describe a method for improving the
                   performance of a continuous speech recognizer by
                   modelling pronunciation variation. Although the results
                   obtained with this method are in line with those
                   reported by other authors, the magnitude of the
                   improvements is very small. In looking for possible
                   explanations for these results, we computed various
                   sorts of statistics about the material. Since these
                   data proved to be very useful in understanding the
                   effects of our method, they are discussed in this
                   paper. Moreover, on the basis of these statistics we
                   discuss how the system can be improved in the future.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/1997/kessens.1997.2.pdf},
  year = 1997
}
@inproceedings{Wester-Chang-01,
  author = {M. Wester and S. Greenberg and S. Chang},
  title = {A {D}utch Treatment of an Elitist Approach to
                   Articulatory-Acoustic Feature Classification},
  booktitle = {Proc. of Eurospeech '01},
  pages = {1729--1732},
  address = {Aalborg},
  abstract = {A novel approach to articulatory-acoustic feature
                   extraction has been developed for enhancing the
                   accuracy of classification associated with place and
                   manner of articulation information. This elitist
                   approach is tested on a corpus of spontaneous Dutch
                   using two different systems, one trained on a subset of
                   the same corpus, the other trained on a corpus from a
                   different language (American English). The feature
                   dimensions, voicing and manner of articulation, transfer
                   relatively well between the two languages. However,
                   place information transfers less well. Manner-specific
                   training can be used to improve classification of
                   articulatory place information.},
  categories = {aaf, NTIMIT, VIOS, Berkeley},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/wester.2001.1.pdf},
  year = 2001
}
@inproceedings{Kessens98,
  author = {J.M. Kessens and M. Wester and C. Cucchiarini and H.
                   Strik},
  title = {The Selection of Pronunciation Variants: Comparing the
                   Performance of Man and Machine},
  booktitle = {Proc. of ICSLP '98},
  pages = {2715--2718},
  address = {Sydney},
  abstract = {In this paper the performance of an automatic
                   transcription tool is evaluated. The transcription tool
                   is a Continuous Speech Recognizer (CSR) running in
                   forced recognition mode. For evaluation, the performance
                   of the CSR was compared to that of nine expert
                   listeners. Both man and machine carried out exactly
                   the same task: deciding whether a segment was present
                   or not in 467 cases. It turned out that the performance
                   of the CSR was comparable to that of the experts.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/kessens.1998.1.pdf},
  year = 1998
}
@article{king07:JASA2007,
  author = {King, S. and Frankel, J. and Livescu, K. and
                   McDermott, E. and Richmond, K. and Wester, M.},
  title = {Speech production knowledge in automatic speech
                   recognition},
  journal = {Journal of the Acoustical Society of America},
  volume = 121,
  number = 2,
  pages = {723--742},
  abstract = {Although much is known about how speech is produced,
                   and research into speech production has resulted in
                   measured articulatory data, feature systems of
                   different kinds, and numerous models, speech production
                   knowledge is almost totally ignored in current
                   mainstream approaches to automatic speech recognition.
                   Representations of speech production allow simple
                   explanations for many phenomena observed in speech
                   which cannot be easily analyzed from either the acoustic
                   signal or the phonetic transcription alone. In this
                   article, we provide a survey of a growing body of work
                   in which such representations are used to improve
                   automatic speech recognition.},
  month = feb,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/King_et_al_review.pdf},
  year = 2007
}
@article{chang05,
  author = {S. Chang and M. Wester and S. Greenberg},
  title = {An elitist approach to automatic articulatory-acoustic
                   feature classification for phonetic characterization of
                   spoken language},
  journal = {Speech Communication},
  volume = {47},
  pages = {290--311},
  abstract = {A novel framework for automatic articulatory-acoustic
                   feature extraction has been developed for enhancing the
                   accuracy of place- and manner-of-articulation
                   classification in spoken language. The "elitist"
                   approach provides a principled means of selecting
                   frames for which multi-layer perceptron neural-network
                   classifiers are highly confident. Using this method, it
                   is possible to achieve a frame-level accuracy of 93\%
                   on "elitist" frames for manner classification on a
                   corpus of American English sentences passed through a
                   telephone network (NTIMIT). Place-of-articulation
                   information is extracted for each manner class
                   independently, resulting in an appreciable gain in
                   place-feature classification relative to performance
                   for a manner-independent system. A comparable
                   enhancement in classification performance for the
                   elitist approach is evidenced when applied to a Dutch
                   corpus of quasi-spontaneous telephone interactions
                   (VIOS). The elitist framework provides a potential
                   means of automatically annotating a corpus at the
                   phonetic level \emph{without recourse to a word-level
                   transcript} and could thus be of utility for developing
                   training materials for automatic speech recognition and
                   speech synthesis applications, as well as aiding the
                   empirical study of spoken language. \copyright 2005
                   Elsevier B.V. All rights reserved.},
  categories = {aaf, VIOS, NTIMIT, Berkeley},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2005/elitist-final-specom.pdf},
  year = 2005
}