# mwester.bib

% pages omitted: the previous value was a bare hyphen placeholder, not a page range
@inproceedings{Sturm-03,
  author     = {Sturm, J. and Kessens, J. M. and Wester, M. and de Wet, F. and Sanders, E. and Strik, H.},
  title      = {Automatic Transcription of Football Commentaries in the {MUMIS} Project},
  booktitle  = {Proc. Eurospeech '03},
  year       = {2003},
  categories = {asr, MUMIS, Nijmegen},
  abstract   = {This paper describes experiments carried out to automatically transcribe football commentaries in Dutch, English and German for multimedia indexing. Our results show that the high levels of stadium noise in the material create a task that is extremely difficult for conventional ASR. The baseline WERs vary from 83\% to 94\% for the three languages investigated. Employing state-of-the-art noise robustness techniques leads to relative reductions of 9-10\% WER. Application specific words such as players names are recognized correctly in about 50\% of cases. Although this result is substantially better than the overall result, it is inadequate. Much better results can be obtained if the football commentaries are recorded separately from the stadium noise. This would make the automatic transcriptions more useful for multimedia indexing.}
}

@inproceedings{karhila_interspeech:11,
  author     = {Karhila, Reima and Wester, Mirjam},
  title      = {Rapid Adaptation of Foreign-accented {HMM}-based Speech Synthesis},
  booktitle  = {Proc. Interspeech},
  year       = {2011},
  categories = {speech synthesis, rapid adaptation},
  abstract   = {This paper presents findings of listeners' perception of speaker identity in synthetic speech. Specifically, we investigated what the effect is on the perceived identity of a speaker when using differently accented average voice models and limited amounts (five and fifteen sentences) of a speaker's data to create the synthetic stimuli. A speaker discrimination task was used to measure speaker identity. Native English listeners were presented with natural and synthetic speech stimuli in English and were asked to decide whether they thought the sentences were spoken by the same person or not. An accent rating task was also carried out to measure the perceived accents of the synthetic speech stimuli. The results show that listeners, for the most part, perform as well at speaker discrimination when the stimuli have been created using five or fifteen adaptation sentences as when using 105 sentences. Furthermore, the accent of the average voice model does not affect listeners' speaker discrimination performance even though the accent rating task shows listeners are perceiving different accents in the synthetic stimuli. Listeners do not base their speaker similarity decisions on perceived accent.}
}

@inproceedings{kessens-00,
  author     = {Kessens, J. M. and Wester, M. and Strik, H.},
  title      = {Automatic Detection and Verification of {Dutch} Phonological Rules},
  booktitle  = {PHONUS 5: Proceedings of the ``Workshop on Phonetics and Phonology in ASR''},
  pages      = {117--128},
  year       = {2000},
  categories = {asr, pm, VIOS, Nijmegen},
  abstract   = {In this paper, we propose two methods for automatically obtaining hypotheses about pronunciation variation. To this end, we used two different approaches in which we employed a continuous speech recognizer to derive this information from the speech signal. For the first method, the output of a phone recognition was compared to a reference transcription in order to obtain hypotheses about pronunciation variation. Since phone recognition contains errors, we used forced recognition in order to exclude unreliable hypotheses. For the second method, forced recognition was also used, but the hypotheses about the deletion of phones were not constrained beforehand. This was achieved by allowing each phone to be deleted. After forced recognition, we selected the most frequently applied rules as the set of deletion rules. Since previous research showed that forced recognition is a reliable tool for testing hypotheses about pronunciation variation, we can expect that this will also hold for the hypotheses about pronunciation variation which we found using each of the two methods. Another reason for expecting the rule hypotheses to be reliable is that we found that 37-53\% of the rules are related to Dutch phonological processes that have been described in the literature.}
}

@article{Kessens-Wester-99,
  author     = {Kessens, J. M. and Wester, M. and Strik, H.},
  title      = {Improving the performance of a {Dutch} {CSR} by modeling within-word and cross-word pronunciation variation},
  journal    = {Speech Communication},
  volume     = {29},
  pages      = {193--207},
  year       = {1999},
  categories = {asr, pm, VIOS, Nijmegen},
  abstract   = {This article describes how the performance of a Dutch continuous speech recognizer was improved by modeling pronunciation variation. We propose a general procedure for modeling pronunciation variation. In short, it consists of adding pronunciation variants to the lexicon, retraining phone models and using language models to which the pronunciation variants have been added. First, within-word pronunciation variants were generated by applying a set of five optional phonological rules to the words in the baseline lexicon. Next, a limited number of cross-word processes were modeled, using two different methods. In the first approach, cross-word processes were modeled by directly adding the cross-word variants to the lexicon, and in the second approach this was done by using multi-words. Finally, the combination of the within-word method with the two cross-word methods was tested. The word error rate (WER) measured for the baseline system was 12.75\%. Compared to the baseline, a small but statistically significant improvement of 0.68\% in WER was measured for the within-word method, whereas both cross-word methods in isolation led to small, non-significant improvements. The combination of the within-word method and cross-word method 2 led to the best result: an absolute improvement of 1.12\% in WER was found compared to the baseline, which is a relative improvement of 8.8\% in WER.}
}

@inproceedings{kessens-CLS-97,
  author     = {Kessens, J. M. and Wester, M.},
  title      = {Improving Recognition Performance by Modelling Pronunciation Variation},
  booktitle  = {Proc. CLS opening Academic Year '97 '98},
  pages      = {1--20},
  year       = {1997},
  categories = {asr, pm, VIOS, Nijmegen},
  abstract   = {This paper describes a method for improving the performance of a continuous speech recognizer by modelling pronunciation variation. Although the improvements obtained with this method are small, they are in line with those reported by other authors. A series of experiments was carried out to model pronunciation variation. In the first set of experiments word internal pronunciation variation was modelled by applying a set of four phonological rules to the words in the lexicon. In the second set of experiments, variation across word boundaries was also modelled. The results obtained with both methods are presented in detail. Furthermore, statistics are given on the application of the four phonological rules on the training database. We will explain why the improvements obtained with this method are small and how we intend to increase the improvements in our future research.}
}

@inproceedings{Kessens-ICPhS-99,
  author     = {Kessens, J. M. and Wester, M. and Strik, H.},
  title      = {Modeling within-word and cross-word pronunciation variation to improve the performance of a {Dutch} {CSR}},
  booktitle  = {Proc. ICPhS '99},
  pages      = {1665--1668},
  year       = {1999},
  categories = {asr, pm, VIOS, Nijmegen},
  abstract   = {This paper describes how the performance of a continuous speech recognizer for Dutch has been improved by modeling within-word and cross-word pronunciation variation. Within-word variants were automatically generated by applying five phonological rules to the words in the lexicon. For the within-word method, a significant improvement is found compared to the baseline. Cross-word pronunciation variation was modeled using two different methods: 1) adding cross-word variants directly to the lexicon, 2) only adding multi-words and their variants to the lexicon. Overall, cross-word method 2 leads to better results than cross-word method 1. The best results were obtained when cross-word method 2 was combined with the within-word method: a relative improvement of 8.8\% WER was found compared to the baseline.}
}

@inproceedings{wester_interspeech:11,
  author     = {Wester, Mirjam and Liang, Hui},
  title      = {Cross-Lingual Speaker Discrimination Using Natural and Synthetic Speech},
  booktitle  = {Proc. Interspeech},
  year       = {2011},
  categories = {speaker discrimination, speaker adaptation, HMM-based speech synthesis},
  abstract   = {This paper describes speaker discrimination experiments in which native English listeners were presented with either natural speech stimuli in English and Mandarin, synthetic speech stimuli in English and Mandarin, or natural Mandarin speech and synthetic English speech stimuli. In each experiment, listeners were asked to decide whether they thought the sentences were spoken by the same person or not. We found that the results for Mandarin/English speaker discrimination are very similar to results found in previous work on German/English and Finnish/English speaker discrimination. We conclude from this and previous work that listeners are able to identify speakers across languages and they are able to identify speakers across speech types, but the combination of these two factors leads to a speaker discrimination task which is too difficult for listeners to perform successfully, given the quality of across-language speaker adapted speech synthesis at present.}
}

@techreport{wester_accent2010,
  author      = {Wester, M.},
  title       = {The {EMIME} {Bilingual} {Database}},
  institution = {The University of Edinburgh},
  number      = {EDI-INF-RR-1388},
  year        = {2010},
  categories  = {evaluation,cross-lingual, accent rating},
  abstract    = {This paper describes the collection of a bilingual database of Finnish/English and German/English data. In addition, the accents of the talkers in the database have been rated. English, German and Finnish listeners assessed the English, German and Finnish talkers' degree of foreign accent in English. Native English listeners showed higher inter-listener agreement than non-native listeners. Further analyses showed that non-native listeners judged Finnish and German female talkers to be significantly less accented than do English listeners. German males are judged less accented by Finnish listeners than they are by English and German listeners and there is no difference between listeners as to how they judge the accent of Finnish males. Finally, all English talkers are judged more accented by non-native listeners than they are by native English listeners.}
}

@inproceedings{wester-98-sposs,
  author     = {Wester, M. and Kessens, J. M. and Cucchiarini, C. and Strik, H.},
  title      = {Selection of Pronunciation Variants in Spontaneous Speech: Comparing the Performance of Man and Machine},
  booktitle  = {Proc. ESCA Workshop on the Sound Patterns of Spontaneous Speech: Production and Perception},
  pages      = {157--160},
  year       = {1998},
  categories = {asr, pm, VIOS, Nijmegen}
}

@article{frankel07:AF_DBN,
  author   = {Frankel, J. and Wester, M. and King, S.},
  title    = {Articulatory feature recognition using dynamic {Bayesian} networks},
  journal  = {Computer Speech and Language},
  volume   = {21},
  number   = {4},
  pages    = {620--640},
  month    = oct,
  year     = {2007},
  abstract = {We describe a dynamic Bayesian network for articulatory feature recognition. The model is intended to be a component of a speech recognizer that avoids the problems of conventional ``beads-on-a-string'' phoneme-based models. We demonstrate that the model gives superior recognition of articulatory features from the speech signal compared with a state-of-the-art neural network system. We also introduce a training algorithm that offers two major advances: it does not require time-aligned feature labels and it allows the model to learn a set of asynchronous feature changes in a data-driven manner.}
}

@inproceedings{Wester-icslp-02,
  author     = {Wester, M. and Kessens, J. M. and Strik, H.},
  title      = {Goal-directed {ASR} in a multimedia indexing and searching environment ({MUMIS})},
  booktitle  = {Proc. ICSLP},
  pages      = {1993--1996},
  year       = {2002},
  categories = {asr, MUMIS, Nijmegen},
  abstract   = {This paper describes the contribution of automatic speech recognition (ASR) within the framework of MUMIS (Multimedia Indexing and Searching Environment). The domain is football commentaries. The initial results of carrying out ASR on Dutch and English football commentaries are presented. We found that overall word error rates are high, but application specific words are recognized reasonably well. The difficulty of the ASR task is greatly increased by the high levels of noise present in the material.}
}

@inproceedings{wester-97,
  author     = {Wester, M. and Kessens, J. M. and Cucchiarini, C. and Strik, H.},
  title      = {Modelling pronunciation variation: some preliminary results},
  booktitle  = {Proc. Dept. of Language \& Speech},
  pages      = {127--137},
  year       = {1997},
  categories = {asr, pm, VIOS, Nijmegen},
  abstract   = {In this paper we describe a method for improving the performance of a continuous speech recognizer by modelling pronunciation variation. Although the results obtained with this method are in line with those reported by other authors, the magnitude of the improvements is very small. In looking for possible explanations for these results, we computed various sorts of statistics about the material. Since these data proved to be very useful in understanding the effects of our method, they are discussed in this paper. Moreover, on the basis of these statistics we discuss how the system can be improved in the future.}
}

@article{wester:specom:12,
  author     = {Wester, Mirjam},
  title      = {Talker discrimination across languages},
  journal    = {Speech Communication},
  volume     = {54},
  pages      = {781--790},
  year       = {2012},
  doi        = {10.1016/j.specom.2012.01.006},
  categories = {evaluation},
  abstract   = {This study investigated the extent to which listeners are able to discriminate between bilingual talkers in three language pairs -- English--German, English--Finnish and English--Mandarin. Native English listeners were presented with two sentences spoken by bilingual talkers and were asked to judge whether they thought the sentences were spoken by the same person. Equal amounts of cross-language and matched-language trials were presented. The results show that native English listeners are able to carry out this task well; achieving percent correct levels at well above chance for all three language pairs. Previous research has shown this for English--German, this research shows listeners also extend this to Finnish and Mandarin, languages that are quite distinct from English from a genetic and phonetic similarity perspective. However, listeners are significantly less accurate on cross-language talker trials (English--foreign) than on matched-language trials (English--English and foreign--foreign). Understanding listeners' behaviour in cross-language talker discrimination using natural speech is the first step in developing principled evaluation techniques for synthesis systems in which the goal is for the synthesised voice to sound like the original speaker, for instance, in speech-to-speech translation systems, voice conversion and reconstruction.}
}

@inproceedings{Wester-00,
  author     = {Wester, M. and Kessens, J. M. and Strik, H.},
  title      = {Pronunciation variation in {ASR}: Which variation to model?},
  booktitle  = {Proc. {ICSLP} '00},
  volume     = {IV},
  pages      = {488--491},
  year       = {2000},
  categories = {asr, pm, VIOS, Nijmegen},
  abstract   = {This paper describes how the performance of a continuous speech recognizer for Dutch has been improved by modeling within-word and cross-word pronunciation variation. A relative improvement of 8.8\% in WER was found compared to baseline system performance. However, as WERs do not reveal the full effect of modeling pronunciation variation, we performed a detailed analysis of the differences in recognition results that occur due to modeling pronunciation variation and found that indeed a lot of the differences in recognition results are not reflected in the error rates. Furthermore, error analysis revealed that testing sets of variants in isolation does not predict their behavior in combination. However, these results appeared to be corpus dependent.}
}

% NOTE(review): booktitle acronym changed from IEICI to IEICE -- confirm against the workshop proceedings
@inproceedings{wester04:asynch,
  author     = {Wester, M. and Frankel, J. and King, S.},
  title      = {Asynchronous Articulatory Feature Recognition Using Dynamic {Bayesian} Networks},
  booktitle  = {Proc. IEICE Beyond HMM Workshop},
  month      = dec,
  year       = {2004},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh},
  abstract   = {This paper builds on previous work where dynamic Bayesian networks (DBN) were proposed as a model for articulatory feature recognition. Using DBNs makes it possible to model the dependencies between features, an addition to previous approaches which was found to improve feature recognition performance. The DBN results were promising, giving close to the accuracy of artificial neural nets (ANNs). However, the system was trained on canonical labels, leading to an overly strong set of constraints on feature co-occurrence. In this study, we describe an embedded training scheme which learns a set of data-driven asynchronous feature changes where supported in the data. Using a subset of the OGI Numbers corpus, we describe articulatory feature recognition experiments using both canonically-trained and asynchronous DBNs. Performance using DBNs is found to exceed that of ANNs trained on an identical task, giving a higher recognition accuracy. Furthermore, inter-feature dependencies result in a more structured model, giving rise to fewer feature combinations in the recognition output. In addition to an empirical evaluation of this modelling approach, we give a qualitative analysis, comparing asynchrony found through our data-driven methods to the asynchrony which may be expected on the basis of linguistic knowledge.}
}

% pages omitted: the previous value was a bare hyphen placeholder, not a page range
@inproceedings{Wester-03,
  author     = {Wester, M.},
  title      = {Syllable classification using articulatory-acoustic features},
  booktitle  = {Proc. Eurospeech '03},
  year       = {2003},
  categories = {aaf, syllable, TIMIT, Edinburgh},
  abstract   = {This paper investigates the use of articulatory-acoustic features for the classification of syllables in TIMIT. The main motivation for this study is to circumvent the ``beads-on-a-string'' problem, i.e. the assumption that words can be described as a simple concatenation of phones. Posterior probabilities for articulatory-acoustic features are obtained from artificial neural nets and are used to classify speech within the scope of syllables instead of phones. This gives the opportunity to account for asynchronous feature changes, exploiting the strengths of the articulatory-acoustic features, instead of losing the potential by reverting to phones.}
}

@article{Dines2011,
  author   = {Dines, John and Liang, Hui and Saheer, Lakshmi and Gibson, Matthew and Byrne, William and Oura, Keiichiro and Tokuda, Keiichi and Yamagishi, Junichi and King, Simon and Wester, Mirjam and Hirsim{\"a}ki, Teemu and Karhila, Reima and Kurimo, Mikko},
  title    = {Personalising speech-to-speech translation: Unsupervised cross-lingual speaker adaptation for {HMM}-based speech synthesis},
  journal  = {Computer Speech and Language},
  volume   = {27},
  number   = {2},
  pages    = {420--437},
  month    = feb,
  year     = {2013},
  doi      = {10.1016/j.csl.2011.08.003},
  url      = {http://www.sciencedirect.com/science/article/pii/S0885230811000441},
  issn     = {0885-2308},
  keywords = {Speech-to-speech translation, Cross-lingual speaker adaptation, HMM-based speech synthesis, Speaker adaptation, Voice conversion},
  abstract = {In this paper we present results of unsupervised cross-lingual speaker adaptation applied to text-to-speech synthesis. The application of our research is the personalisation of speech-to-speech translation in which we employ a HMM statistical framework for both speech recognition and synthesis. This framework provides a logical mechanism to adapt synthesised speech output to the voice of the user by way of speech recognition. In this work we present results of several different unsupervised and cross-lingual adaptation approaches as well as an end-to-end speaker adaptive speech-to-speech translation system. Our experiments show that we can successfully apply speaker adaptation in both unsupervised and cross-lingual scenarios and our proposed algorithms seem to generalise well for several language pairs. We also discuss important future directions including the need for better evaluation metrics.}
}

@inproceedings{wester:interspeech:10,
  author     = {Wester, Mirjam},
  title      = {Cross-lingual talker discrimination},
  booktitle  = {Proc. Interspeech},
  month      = sep,
  year       = {2010},
  categories = {evaluation},
  abstract   = {This paper describes a talker discrimination experiment in which native English listeners were presented with two sentences spoken by bilingual talkers (English/German and English/Finnish) and were asked to judge whether they thought the sentences were spoken by the same person or not. Equal amounts of cross-lingual and matched-language trials were presented. The experiments showed that listeners are able to complete this task well, they can discriminate between talkers significantly better than chance. However, listeners are significantly less accurate on cross-lingual talker trials than on matched-language pairs. No significant differences were found on this task between German and Finnish. Bias (B'') and Sensitivity (A') values are presented to analyse the listeners' behaviour in more detail. The results are promising for the evaluation of EMIME, a project covering speech-to-speech translation with speaker adaptation.}
}

@inproceedings{frankel04:artic_dbn,
  author     = {Frankel, J. and Wester, M. and King, S.},
  title      = {Articulatory feature recognition using dynamic {Bayesian} networks},
  booktitle  = {Proc. {ICSLP}},
  month      = sep,
  year       = {2004},
  categories = {am,artic,asr,dbn,timit,edinburgh},
  abstract   = {This paper describes the use of dynamic Bayesian networks for the task of articulatory feature recognition. We show that by modeling the dependencies between a set of 6 multi-leveled articulatory features, recognition accuracy is increased over an equivalent system in which features are considered independent. Results are compared to those found using artificial neural networks on an identical task.}
}

% NOTE(review): second author entered with the compound surname "Garcia Lecumberri" -- confirm against the paper
@inproceedings{cooke:lista:12,
  author    = {Cooke, Martin and Garc{\'\i}a Lecumberri, Maria Luisa and Tang, Yan and Wester, Mirjam},
  title     = {Do non-native listeners benefit from speech modifications designed to promote intelligibility for native listeners?},
  booktitle = {Proceedings of The Listening Talker Workshop},
  pages     = {59},
  year      = {2012},
  url       = {http://listening-talker.org/workshop/programme.html}
}

@article{Wester-01,
  author     = {Wester, M. and Kessens, J. M. and Cucchiarini, C. and Strik, H.},
  title      = {Obtaining phonetic transcriptions: a comparison between expert listeners and a continuous speech recognizer},
  journal    = {Language and Speech},
  volume     = {44},
  number     = {3},
  pages      = {377--403},
  year       = {2001},
  categories = {automatic transcription, pm, VIOS, Nijmegen},
  abstract   = {In this article, we address the issue of using a continuous speech recognition tool to obtain phonetic or phonological representations of speech. Two experiments were carried out in which the performance of a continuous speech recognizer (CSR) was compared to the performance of expert listeners in a task of judging whether a number of prespecified phones had been realized in an utterance. In the first experiment, nine expert listeners and the CSR carried out exactly the same task: deciding whether a segment was present or not in 467 cases. In the second experiment, we expanded on the first experiment by focusing on two phonological processes: schwa-deletion and schwa-insertion. The results of these experiments show that significant differences in performance were found between the CSR and the listeners, but also between individual listeners. Although some of these differences appeared to be statistically significant, their magnitude is such that they may very well be acceptable depending on what the transcriptions are needed for. In other words, although the CSR is not infallible, it makes it possible to explore large datasets, which might outweigh the errors introduced by the mistakes the CSR makes. For these reasons, we can conclude that the CSR can be used instead of a listener to carry out this type of task: deciding whether a phone is present or not.}
}

@article{Wester-CSL-03,
  author     = {Wester, M.},
  title      = {Pronunciation modeling for {ASR} -- knowledge-based and data-derived methods},
  journal    = {Computer Speech and Language},
  volume     = {17},
  pages      = {69--85},
  year       = {2003},
  categories = {asr, pm, VIOS, Nijmegen},
  abstract   = {This article focuses on modeling pronunciation variation in two different ways: data-derived and knowledge-based. The knowledge-based approach consists of using phonological rules to generate variants. The data-derived approach consists of performing phone recognition, followed by smoothing using decision trees (D-trees) to alleviate some of the errors in the phone recognition. Using phonological rules led to a small improvement in WER; a data-derived approach in which the phone recognition was smoothed using D-trees prior to lexicon generation led to larger improvements compared to the baseline. The lexicon was employed in two different recognition systems: a hybrid HMM/ANN system and a HMM-based system, to ascertain whether pronunciation variation was truly being modeled. This proved to be the case as no significant differences were found between the results obtained with the two systems. Furthermore, we found that 10\% of variants generated by the phonological rules were also found using phone recognition, and this increased to 28\% when the phone recognition output was smoothed by using D-trees. This indicates that the D-trees generalize beyond what has been seen in the training material, whereas when the phone recognition approach is employed directly, unseen pronunciations cannot be predicted. In addition, we propose a metric to measure confusability in the lexicon. Using this confusion metric to prune variants results in roughly the same improvement as using the D-tree method.}
}

@inproceedings{Wester-Fosler-00,
  author     = {Wester, M. and Fosler-Lussier, E.},
  title      = {A comparison of data-derived and knowledge-based modeling of pronunciation variation},
  booktitle  = {Proc. ICSLP '00},
  volume     = {I},
  pages      = {270--273},
  year       = {2000},
  categories = {asr, pm, VIOS, Berkeley},
  abstract   = {This paper focuses on modeling pronunciation variation in two different ways: data-derived and knowledge-based. The knowledge-based approach consists of using phonological rules to generate variants. The data-derived approach consists of performing phone recognition, followed by various pruning and smoothing methods to alleviate some of the errors in the phone recognition. Using phonological rules led to a small improvement in WER; whereas, using a data-derived approach in which the phone recognition was smoothed using simple decision trees (d-trees) prior to lexicon generation led to a significant improvement compared to the baseline. Furthermore, we found that 10\% of variants generated by the phonological rules were also found using phone recognition, and this increased to 23\% when the phone recognition output was smoothed by using d-trees. In addition, we propose a metric to measure confusability in the lexicon and we found that employing this confusion metric to prune variants results in roughly the same improvement as using the d-tree method.}
}

@article{Oura2012703,
  author   = {Oura, Keiichiro and Yamagishi, Junichi and Wester, Mirjam and King, Simon and Tokuda, Keiichi},
  title    = {Analysis of unsupervised cross-lingual speaker adaptation for {HMM}-based speech synthesis using {KLD}-based transform mapping},
  journal  = {Speech Communication},
  volume   = {54},
  number   = {6},
  pages    = {703--714},
  year     = {2012},
  doi      = {10.1016/j.specom.2011.12.004},
  url      = {http://www.sciencedirect.com/science/article/pii/S0167639311001774},
  issn     = {0167-6393},
  keywords = {HMM-based speech synthesis, Unsupervised speaker adaptation, Cross-lingual speaker adaptation, Speech-to-speech translation},
  abstract = {In the EMIME project, we developed a mobile device that performs personalized speech-to-speech translation such that a user's spoken input in one language is used to produce spoken output in another language, while continuing to sound like the user's voice. We integrated two techniques into a single architecture: unsupervised adaptation for HMM-based TTS using word-based large-vocabulary continuous speech recognition, and cross-lingual speaker adaptation (CLSA) for HMM-based TTS. The CLSA is based on a state-level transform mapping learned using minimum Kullback-Leibler divergence between pairs of HMM states in the input and output languages. Thus, an unsupervised cross-lingual speaker adaptation system was developed. End-to-end speech-to-speech translation systems for four languages (English, Finnish, Mandarin, and Japanese) were constructed within this framework. In this paper, the English-to-Japanese adaptation is evaluated. Listening tests demonstrate that adapted voices sound more similar to a target speaker than average voices and that differences between supervised and unsupervised cross-lingual speaker adaptation are small. Calculating the KLD state-mapping on only the first 10 mel-cepstral coefficients leads to huge savings in computational costs, without any detrimental effect on the quality of the synthetic speech.}
}

@inproceedings{badinoclark_interspeech12,
  author     = {Badino, Leonardo and Clark, Robert A. J. and Wester, Mirjam},
  title      = {Towards Hierarchical Prosodic Prominence Generation in {TTS} Synthesis},
  booktitle  = {Proc. Interspeech},
  year       = {2012},
  categories = {speech synthesis, prosody}
}

@inproceedings{wester-98-icslp,
  author     = {Wester, M. and Kessens, J. M. and Strik, H.},
  title      = {Modeling pronunciation variation for a {Dutch} {CSR}: testing three methods},
  booktitle  = {Proc. ICSLP '98},
  pages      = {2535--2538},
  year       = {1998},
  categories = {asr, pm, VIOS, Nijmegen},
  abstract   = {This paper describes how the performance of a continuous speech recognizer for Dutch has been improved by modeling pronunciation variation. We used three methods to model pronunciation variation. First, within-word variation was dealt with. Phonological rules were applied to the words in the lexicon, thus automatically generating pronunciation variants. Secondly, cross-word pronunciation variation was modeled using two different approaches. The first approach was to model cross-word processes by adding the variants as separate words to the lexicon and in the second approach this was done by using multi-words. For each of the methods, recognition experiments were carried out. A significant improvement was found for modeling within-word variation. Furthermore, modeling cross-word processes using multi-words leads to significantly better results than modeling them using separate words in the lexicon.}
}

@inproceedings{wester-98-kerkrade,
  author     = {Wester, M. and Kessens, J. M. and Strik, H.},
  title      = {Improving the Performance of a {Dutch} {CSR} by Modeling Pronunciation Variation},
  booktitle  = {Proc. Workshop Modeling Pronunciation Variation for Automatic Speech Recognition},
  pages      = {145--150},
  year       = {1998},
  categories = {asr, pm, VIOS, Nijmegen},
  abstract   = {This paper describes how the performance of a continuous speech recognizer for Dutch has been improved by modeling pronunciation variation. We used three methods in order to model pronunciation variation. First, within-word variation was dealt with. Phonological rules were applied to the words in the lexicon, thus automatically generating pronunciation variants. Secondly, cross-word pronunciation variation was accounted for by adding multi-words and their variants to the lexicon. Thirdly, probabilities of pronunciation variants were incorporated in the language model (LM), and thresholds were used to choose which pronunciation variants to add to the LMs. For each of the methods, recognition experiments were carried out. A significant improvement in error rates was measured.}
}

@inproceedings{wester-98-sd,
  author     = {Wester, M. and Kessens, J. M. and Strik, H.},
  title      = {Two automatic approaches for analyzing the frequency of connected speech processes in {D}utch},
  booktitle  = {Proc. ICSLP Student Day '98},
  pages      = {3351--3356},
  year       = {1998},
  abstract   = {This paper describes two automatic approaches used to study connected speech processes (CSPs) in Dutch. The first approach was from a linguistic point of view - the top-down method. This method can be used for verification of hypotheses about CSPs. The second approach - the bottom-up method - uses a constrained phone recognizer to generate phone transcriptions. An alignment was carried out between the two transcriptions and a reference transcription. A comparison between the two methods showed that 68\% agreement was achieved on the CSPs. Although phone accuracy is only 63\%, the bottom-up approach is useful for studying CSPs. From the data generated using the bottom-up method, indications of which CSPs are present in the material can be found. These indications can be used to generate hypotheses which can then be tested using the top-down method.},
  categories = {asr, pm, VIOS, Nijmegen}
}

@inproceedings{wester_icassp:11,
  author     = {Wester, Mirjam and Karhila, Reima},
  title      = {Speaker Similarity Evaluation of Foreign-accented Speech Synthesis using {HMM}-based Speaker Adaptation},
  booktitle  = {Proc. ICASSP},
  pages      = {5372--5375},
  year       = {2011},
  abstract   = {This paper describes a speaker discrimination experiment in which native English listeners were presented with natural and synthetic speech stimuli in English and were asked to judge whether they thought the sentences were spoken by the same person or not. The natural speech consisted of recordings of Finnish speakers speaking English. The synthetic stimuli were created using adaptation data from the same Finnish speakers. Two average voice models were compared: one trained on Finnish-accented English and the other on American-accented English. The experiments illustrate that listeners perform well at speaker discrimination when the stimuli are both natural or both synthetic, but when the speech types are crossed performance drops significantly. We also found that the type of accent in the average voice model had no effect on the listeners' speaker discrimination performance.},
  categories = {Similarity Evaluation, Speaker Adaptation, HMM-synthesis}
}

% NOTE(review): this page range overlaps the one recorded for Wester-Chang-01
% (pages 1729-1732, same Eurospeech '01 proceedings), so one of the two is
% probably wrong -- TODO verify against the published proceedings.
@inproceedings{Chang-Euro-01,
  author     = {Chang, S. and Greenberg, S. and Wester, M.},
  title      = {An Elitist Approach to Articulatory-Acoustic Feature Classification},
  booktitle  = {Proc. Eurospeech '01},
  pages      = {1729--1733},
  year       = {2001},
  abstract   = {A novel framework for automatic articulatory-acoustic feature extraction has been developed for enhancing the accuracy of place- and manner-of-articulation classification in spoken language. The elitist approach focuses on frames for which neural network (MLP) classifiers are highly confident, and discards the rest. Using this method, it is possible to achieve a frame-level accuracy of 93\% for manner information on a corpus of American English sentences passed through a telephone network (NTIMIT). Place information is extracted for each manner class independently, resulting in an appreciable gain in place-feature classification relative to performance for a manner-independent system. The elitist framework provides a potential means of automatically annotating a corpus at the phonetic level without recourse to a word-level transcript and could thus be of utility for developing training materials for automatic speech recognition and speech synthesis applications, as well as aid the empirical study of spoken language.},
  categories = {aaf, NTIMIT, Berkeley}
}

@inproceedings{oura:icassp:10,
  author    = {Oura, Keiichiro and Tokuda, Keiichi and Yamagishi, Junichi and Wester, Mirjam and King, Simon},
  title     = {Unsupervised Cross-lingual Speaker Adaptation for {HMM}-based Speech Synthesis},
  booktitle = {Proc. ICASSP},
  volume    = {I},
  pages     = {4954--4957},
  year      = {2010},
  abstract  = {In the EMIME project, we are developing a mobile device that performs personalized speech-to-speech translation such that a user's spoken input in one language is used to produce spoken output in another language, while continuing to sound like the user's voice. We integrate two techniques, unsupervised adaptation for HMM-based TTS using a word-based large-vocabulary continuous speech recognizer and cross-lingual speaker adaptation for HMM-based TTS, into a single architecture. Thus, an unsupervised cross-lingual speaker adaptation system can be developed. Listening tests show very promising results, demonstrating that adapted voices sound similar to the target speaker and that differences between supervised and unsupervised cross-lingual speaker adaptation are small.}
}

@phdthesis{Wester-02,
  author     = {Wester, Mirjam},
  title      = {Pronunciation Variation Modeling for {D}utch Automatic Speech Recognition},
  school     = {University of Nijmegen},
  year       = {2002},
  abstract   = {This thesis consists of an introductory review to pronunciation variation modeling, followed by four papers in which the PhD research is described.},
  categories = {asr, pm, Nijmegen}
}

@inproceedings{Wester-ICPhS-99,
  author     = {Wester, M. and Kessens, J. M.},
  title      = {Comparison between expert listeners and continuous speech recognizers in selecting pronunciation variants},
  booktitle  = {Proc. ICPhS '99},
  pages      = {723--726},
  year       = {1999},
  abstract   = {In this paper, the performance of an automatic transcription tool is evaluated. The transcription tool is a continuous speech recognizer (CSR) which can be used to select pronunciation variants (i.e. detect insertions and deletions of phones). The performance of the CSR was compared to a reference transcription based on the judgments of expert listeners. We investigated to what extent the degree of agreement between the listeners and the CSR was affected by employing various sets of phone models (PMs). Overall, the PMs perform more similarly to the listeners when pronunciation variation is modeled. However, the various sets of PMs lead to different results for insertion and deletion processes. Furthermore, we found that to a certain degree, word error rates can be used to predict which set of PMs to use in the transcription tool.},
  categories = {asr, pm, VIOS, Nijmegen}
}

@inproceedings{Gutkin:etal:ets-cam04,
  author     = {Gutkin, Alexander and Gay, David and Goldfarb, Lev and Wester, Mirjam},
  editor     = {Goldfarb, Lev},
  title      = {On the {A}rticulatory {R}epresentation of {S}peech within the {E}volving {T}ransformation {S}ystem {F}ormalism},
  booktitle  = {Pattern Representation and the Future of Pattern Recognition (Proc. Satellite Workshop of 17th International Conference on Pattern Recognition)},
  pages      = {57--76},
  month      = {August},
  year       = {2004},
  abstract   = {This paper deals with the formulation of an alternative, structural, approach to the speech representation and recognition problem. In this approach, we require both the representation and the learning algorithms to be linguistically meaningful and to naturally represent the linguistic data at hand. This allows the speech recognition system to discover the emergent combinatorial structure of the linguistic classes. The proposed approach is developed within the ETS formalism, the first formalism in applied mathematics specifically designed to address the issues of class and object/event representation. We present an initial application of ETS to the articulatory modelling of speech based on elementary physiological gestures that can be reliably represented as the ETS primitives. We discuss the advantages of this gestural approach over prevalent methods and its promising potential to mathematical modelling and representation in linguistics.},
  categories = {structural,recognition,ets,artic,mocha,edinburgh,unb}
}

@inproceedings{wester00:_using_dutch_asr,
  author     = {Wester, M. and Kessens, J. M. and Strik, H.},
  title      = {Using {D}utch phonological rules to model pronunciation variation in {ASR}},
  booktitle  = {Phonus 5: proceedings of the ``workshop on phonetics and phonology in {ASR}''},
  pages      = {105--116},
  year       = {2000},
  abstract   = {In this paper, we describe how the performance of a continuous speech recognizer for Dutch has been improved by modeling within-word and cross-word pronunciation variation. Within-word variants were automatically generated by applying five phonological rules to the words in the lexicon. Cross-word pronunciation variation was modeled by adding multi-words and their variants to the lexicon. The best results were obtained when the cross-word method was combined with the within-word method: a relative improvement of 8.8\% in the WER was found compared to baseline system performance. We also describe an error analysis that was carried out to investigate whether rules in isolation can predict the performance of rules in combination.},
  categories = {asr, pm, VIOS, Nijmegen}
}

@inproceedings{kurimo:acl:10,
  author    = {Kurimo, Mikko and Byrne, William and Dines, John and Garner, Philip N. and Gibson, Matthew and Guan, Yong and Hirsim{\"a}ki, Teemu and Karhila, Reima and King, Simon and Liang, Hui and Oura, Keiichiro and Saheer, Lakshmi and Shannon, Matt and Shiota, Sayaka and Tian, Jilei and Tokuda, Keiichi and Wester, Mirjam and Wu, Yi-Jian and Yamagishi, Junichi},
  title     = {Personalising speech-to-speech translation in the {EMIME} project},
  booktitle = {Proc. ACL 2010 System Demonstrations},
  month     = {July},
  year      = {2010},
  abstract  = {In the EMIME project we have studied unsupervised cross-lingual speaker adaptation. We have employed an HMM statistical framework for both speech recognition and synthesis which provides transformation mechanisms to adapt the synthesized voice in TTS (text-to-speech) using the recognized voice in ASR (automatic speech recognition). An important application for this research is personalised speech-to-speech translation that will use the voice of the speaker in the input language to utter the translated sentences in the output language. In mobile environments this enhances the users' interaction across language barriers by making the output speech sound more like the original speaker's way of speaking, even if she or he could not speak the output language.}
}

@inproceedings{wester-98-voicedata,
  author     = {Wester, M.},
  title      = {Automatic Classification of Voice Quality: Comparing Regression Models and Hidden {M}arkov Models},
  booktitle  = {Proc. VOICEDATA98, Symposium on Databases in Voice Quality Research and Education},
  pages      = {92--97},
  year       = {1998},
  abstract   = {In this paper, two methods for automatically classifying voice quality are compared: regression analysis and hidden Markov models (HMMs). The findings of this research show that HMMs can be used to classify voice quality. The HMMs performed better than the regression models in classifying breathiness and overall degree of deviance, and the two methods showed similar results on the roughness scale. However, the results are not spectacular. This is mainly due to the type of material that was available and the number of listeners who assessed the material. Nonetheless, I argue in this paper that these findings are interesting because they are a promising step towards developing a system for classifying voice quality.},
  categories = {voice quality, Nijmegen}
}

@techreport{wester_mandarin:11,
  author      = {Wester, Mirjam and Liang, Hui},
  title       = {The {EMIME} {M}andarin {B}ilingual {D}atabase},
  institution = {The University of Edinburgh},
  number      = {EDI-INF-RR-1396},
  year        = {2011},
  abstract    = {This paper describes the collection of a bilingual database of Mandarin/English data. In addition, the accents of the talkers in the database have been rated. English and Mandarin listeners assessed the English and Mandarin talkers' degree of foreign accent in English.},
  categories  = {evaluation,cross-lingual, accent rating}
}

@inproceedings{wester:ssw7:10,
  author    = {Wester, Mirjam and Dines, John and Gibson, Matthew and Liang, Hui and Wu, Yi-Jian and Saheer, Lakshmi and King, Simon and Oura, Keiichiro and Garner, Philip N. and Byrne, William and Guan, Yong and Hirsim{\"a}ki, Teemu and Karhila, Reima and Kurimo, Mikko and Shannon, Matt and Shiota, Sayaka and Tian, Jilei and Tokuda, Keiichi and Yamagishi, Junichi},
  title     = {Speaker adaptation and the evaluation of speaker similarity in the {EMIME} speech-to-speech translation project},
  booktitle = {Proc. 7th ISCA Speech Synthesis Workshop},
  month     = {September},
  year      = {2010},
  abstract  = {This paper provides an overview of speaker adaptation research carried out in the EMIME speech-to-speech translation (S2ST) project. We focus on how speaker adaptation transforms can be learned from speech in one language and applied to the acoustic models of another language. The adaptation is transferred across languages and/or from recognition models to synthesis models. The various approaches investigated can all be viewed as a process in which a mapping is defined in terms of either acoustic model states or linguistic units. The mapping is used to transfer either speech data or adaptation transforms between the two models. Because the success of speaker adaptation in text-to-speech synthesis is measured by judging speaker similarity, we also discuss issues concerning evaluation of speaker similarity in an S2ST scenario.}
}

@inproceedings{kessens-COST-97,
  author     = {Kessens, J. M. and Wester, M. and Cucchiarini, C. and Strik, H.},
  title      = {Testing a Method for Modelling Pronunciation Variation},
  booktitle  = {Proceedings of the COST workshop},
  pages      = {37--40},
  year       = {1997},
  abstract   = {In this paper we describe a method for improving the performance of a continuous speech recognizer by modelling pronunciation variation. Although the results obtained with this method are in line with those reported by other authors, the magnitude of the improvements is very small. In looking for possible explanations for these results, we computed various sorts of statistics about the material. Since these data proved to be very useful in understanding the effects of our method, they are discussed in this paper. Moreover, on the basis of these statistics we discuss how the system can be improved in the future.},
  categories = {asr, pm, VIOS, Nijmegen}
}

@inproceedings{Wester-Chang-01,
  author     = {Wester, M. and Greenberg, S. and Chang, S.},
  title      = {A {D}utch Treatment of an Elitist Approach to Articulatory-Acoustic Feature Classification},
  booktitle  = {Proc. Eurospeech '01},
  pages      = {1729--1732},
  year       = {2001},
  abstract   = {A novel approach to articulatory-acoustic feature extraction has been developed for enhancing the accuracy of classification associated with place and manner of articulation information. This elitist approach is tested on a corpus of spontaneous Dutch using two different systems, one trained on a subset of the same corpus, the other trained on a corpus from a different language (American English). The feature dimensions, voicing and manner of articulation transfer relatively well between the two languages. However, place information transfers less well. Manner-specific training can be used to improve classification of articulatory place information.},
  categories = {aaf, NTIMIT, VIOS, Berkeley}
}

@inproceedings{Kessens98,
  author     = {Kessens, J. M. and Wester, M. and Cucchiarini, C. and Strik, H.},
  title      = {The Selection of Pronunciation Variants: Comparing the Performance of Man and Machine},
  booktitle  = {Proc. ICSLP '98},
  pages      = {2715--2718},
  year       = {1998},
  abstract   = {In this paper the performance of an automatic transcription tool is evaluated. The transcription tool is a Continuous Speech Recognizer (CSR) running in forced recognition mode. For evaluation the performance of the CSR was compared to that of nine expert listeners. Both man and the machine carried out exactly the same task: deciding whether a segment was present or not in 467 cases. It turned out that the performance of the CSR is comparable to that of the experts.},
  categories = {asr, pm, VIOS, Nijmegen}
}

@article{king07:JASA2007,
  author   = {King, S. and Frankel, J. and Livescu, K. and McDermott, E. and Richmond, K. and Wester, M.},
  title    = {Speech production knowledge in automatic speech recognition},
  journal  = {Journal of the Acoustical Society of America},
  volume   = {121},
  number   = {2},
  pages    = {723--742},
  month    = {February},
  year     = {2007},
  abstract = {Although much is known about how speech is produced, and research into speech production has resulted in measured articulatory data, feature systems of different kinds and numerous models, speech production knowledge is almost totally ignored in current mainstream approaches to automatic speech recognition. Representations of speech production allow simple explanations for many phenomena observed in speech which cannot be easily analyzed from either acoustic signal or phonetic transcription alone. In this article, we provide a survey of a growing body of work in which such representations are used to improve automatic speech recognition.}
}

@article{chang05,
  author     = {Chang, S. and Wester, M. and Greenberg, S.},
  title      = {An elitist approach to automatic articulatory-acoustic feature classification for phonetic characterization of spoken language},
  journal    = {Speech Communication},
  volume     = {47},
  pages      = {290--311},
  year       = {2005},
  abstract   = {A novel framework for automatic articulatory-acoustic feature extraction has been developed for enhancing the accuracy of place- and manner-of-articulation classification in spoken language. The "elitist" approach provides a principled means of selecting frames for which multi-layer perceptron, neural-network classifiers are highly confident. Using this method it is possible to achieve a frame-level accuracy of 93\% on "elitist" frames for manner classification on a corpus of American English sentences passed through a telephone network (NTIMIT). Place-of-articulation information is extracted for each manner class independently, resulting in an appreciable gain in place-feature classification relative to performance for a manner-independent system. A comparable enhancement in classification performance for the elitist approach is evidenced when applied to a Dutch corpus of quasi-spontaneous telephone interactions (VIOS). The elitist framework provides a potential means of automatically annotating a corpus at the phonetic level \emph{without recourse to a word-level transcript} and could thus be of utility for developing training materials for automatic speech recognition and speech synthesis applications, as well as aid the empirical study of spoken language. \copyright{} 2005 Elsevier B.V. All rights reserved.},
  categories = {aaf, VIOS, NTIMIT, Berkeley}
}

@inproceedings{Valentini-Botinhao_SSW8,
  author    = {Valentini-Botinhao, Cassia and Wester, Mirjam and Yamagishi, Junichi and King, Simon},
  title     = {Using neighbourhood density and selective {SNR} boosting to increase the intelligibility of synthetic speech in noise},
  booktitle = {Proc. 8th ISCA Speech Synthesis Workshop},
  pages     = {133--138},
  month     = {August},
  year      = {2013},
  abstract  = {Motivated by the fact that words are not equally confusable, we explore the idea of using word-level intelligibility predictions to selectively boost the harder-to-understand words in a sentence, aiming to improve overall intelligibility in the presence of noise. First, the intelligibility of a set of words from dense and sparse phonetic neighbourhoods was evaluated in isolation. The resulting intelligibility scores were used to inform two sentence-level experiments. In the first experiment the signal-to-noise ratio of one word was boosted to the detriment of another word. Sentence intelligibility did not generally improve. The intelligibility of words in isolation and in a sentence were found to be significantly different, both in clean and in noisy conditions. For the second experiment, one word was selectively boosted while slightly attenuating all other words in the sentence. This strategy was successful for words that were poorly recognised in that particular context. However, a reliable predictor of word-in-context intelligibility remains elusive, since this involves -- as our results indicate -- semantic, syntactic and acoustic information about the word and the sentence.}
}

@inproceedings{wester:icassp:14,
  author     = {Wester, Mirjam and Mayo, Cassie},
  title      = {Accent rating by native and non-native listeners},
  booktitle  = {Proc. ICASSP},
  pages      = {7749--7753},
  month      = {May},
  year       = {2014},
  abstract   = {This study investigates the influence of listener native language with respect to talker native language on perception of degree of foreign accent in English. Listeners from native English, Finnish, German and Mandarin backgrounds rated the accentedness of native English, Finnish, German and Mandarin talkers producing a controlled set of English sentences. Results indicate that non-native listeners, like native listeners, are able to classify non-native talkers as foreign-accented, and native talkers as unaccented. However, while non-native talkers received higher accentedness ratings than native talkers from all listener groups, non-native listeners judged talkers with non-native accents less harshly than did native English listeners. Similarly, non-native listeners assigned higher degrees of foreign accent to native English talkers than did native English listeners. It seems that non-native listeners give accentedness ratings that are less extreme, or closer to the centre of the rating scale in both directions, than those used by native listeners.},
  categories = {accent rating}
}

@inproceedings{Valentini_IS14,
  author     = {Valentini-Botinhao, C. and Wester, M.},
  title      = {Using linguistic predictability and the {Lombard} effect to increase the intelligibility of synthetic speech in noise},
  booktitle  = {Proc. Interspeech},
  pages      = {2063--2067},
  month      = {September},
  year       = {2014},
  abstract   = {In order to predict which words in a sentence are harder to understand in noise it is necessary to consider not only audibility but also semantic or linguistic information. This paper focuses on using linguistic predictability to inform an intelligibility enhancement method that uses Lombard-adapted synthetic speech to modify low predictable words in Speech Perception in Noise (SPIN) test sentences. Word intelligibility in the presence of speech-shaped noise was measured using plain, Lombard and a combination of the two synthetic voices. The findings show that the Lombard voice increases intelligibility in noise but the intelligibility gap between words in a high and low predictable context still remains. Using a Lombard voice when a word is unpredictable is a good strategy, but if a word is predictable from its context the Lombard benefit only occurs when other words in the sentence are also modified.},
  categories = {intelligibility enhancement, speech in noise, HMM-based speech synthesis, SPIN test}
}

@inproceedings{dall_IS14,
  author     = {Dall, Rasmus and Wester, Mirjam and Corley, Martin},
  title      = {The Effect of Filled Pauses and Speaking Rate on Speech Comprehension in Natural, Vocoded and Synthetic Speech},
  booktitle  = {Proc. Interspeech},
  year       = {2014},
  abstract   = {It has been shown that in natural speech filled pauses can be beneficial to a listener. In this paper, we attempt to discover whether listeners react in a similar way to filled pauses in synthetic and vocoded speech compared to natural speech. We present two experiments focusing on reaction time to a target word. In the first, we replicate earlier work in natural speech, namely that listeners respond faster to a target word following a filled pause than following a silent pause. This is replicated in vocoded but not in synthetic speech. Our second experiment investigates the effect of speaking rate on reaction times as this was potentially a confounding factor in the first experiment. Evidence suggests that slower speech rates lead to slower reaction times in synthetic \emph{and} in natural speech. Moreover, in synthetic speech the response to a target word after a filled pause is slower than after a silent pause. This finding, combined with an overall slower reaction time, demonstrates a shortfall in current synthesis techniques. Remedying this could help make synthesis less demanding and more pleasant for the listener, and reaction time experiments could thus provide a measure of improvement in synthesis techniques.},
  categories = {HMM-synthesis, speech synthesis, reaction time, filled pause, disfluency, speaking rate, speech perception}
}

@inproceedings{wester_IS14,
  author     = {Wester, Mirjam and Garcia Lecumberri, M. Luisa and Cooke, Martin},
  title      = {{DIAPIX-FL}: A symmetric corpus of problem-solving dialogues in first and second languages},
  booktitle  = {Proc. Interspeech},
  year       = {2014},
  abstract   = {This paper describes a corpus of conversations recorded using an extension of the DiapixUK task: the Diapix Foreign Language corpus (DIAPIX-FL). English and Spanish native talkers were recorded speaking both English and Spanish. The bidirectionality of the corpus makes it possible to separate language (English or Spanish) from speaking in a first language (L1) or second language (L2). An acoustic analysis was carried out to analyse changes in F0, voicing, intensity, spectral tilt and formants that might result from speaking in an L2. The effect of L1 and nativeness on turn types was also studied. Factors that were investigated were pausing, elongations, and incomplete words. Speakers displayed certain patterns that suggest an on-going process of L2 phonological acquisition, such as the overall percentage of voicing in their speech. Results also show an increase in hesitation phenomena (pauses, elongations, incomplete turns), a decrease in produced speech and speech rate, a reduction of F0 range, raising of minimum F0 when speaking in the non-native language which are consistent with more tentative speech and may be used as indicators of non-nativeness.},
  categories = {L1-L2, DIAPIX, non-native}
}

@inproceedings{Dall_Tomalin_IS14,
  author     = {Dall, Rasmus and Tomalin, Marcus and Wester, Mirjam and Byrne, William and King, Simon},
  title      = {Investigating Automatic \& Human Filled Pause Insertion for Speech Synthesis},
  booktitle  = {Proc. Interspeech},
  year       = {2014},
  abstract   = {Filled pauses are pervasive in conversational speech and have been shown to serve several psychological and structural purposes. Despite this, they are seldom modelled overtly by state-of-the-art speech synthesis systems. This paper seeks to motivate the incorporation of filled pauses into speech synthesis systems by exploring their use in conversational speech, and by comparing the performance of several automatic systems inserting filled pauses into fluent text. Two initial experiments are described which seek to determine whether people's predicted insertion points are consistent with actual practice and/or with each other. The experiments also investigate whether there are `right' and `wrong' places to insert filled pauses. The results show good consistency between people's predictions of usage and their actual practice, as well as a perceptual preference for the `right' placement. The third experiment contrasts the performance of several automatic systems that insert filled pauses into fluent sentences. The best performance (determined by F-score) was achieved through the by-word interpolation of probabilities predicted by Recurrent Neural Network and 4gram Language Models. The results offer insights into the use and perception of filled pauses by humans, and how automatic systems can be used to predict their locations.},
  categories = {filled pause, HMM TTS, SVM, RNN}
}

@inproceedings{wester:icphs:2015,
  author     = {Wester, Mirjam and Garcia Lecumberri, M. Luisa and Cooke, Martin},
  title      = {/u/-fronting in {English} speakers' {L1} but not in their {L2}},
  booktitle  = {Proc. ICPhS},
  month      = {August},
  year       = {2015},
  abstract   = {This paper presents an acoustic analysis of the three corner vowels in the Diapix Foreign Language corpus (DIAPIX-FL) which contains material from English and Spanish native speakers from both their L1 and L2. We investigated how L1 vowel characteristics influence the production of L2 vowels, and to what extent a current sound change in one of the languages is reflected in the other. We find that /u/-fronting in English occurs for both native and non-native speakers, although the degree of /u/-fronting is much larger for the English group. English speakers appear to create a separate category for the L2 /u/ rather than use their L1 sound. Spanish speakers show some adjustment to their English /u/ and /a/ realisations. These findings suggest that despite limited exposure to the L2 sounds, learners are aware of realisational differences between the languages and implement them to different degrees even for non-standard variants.},
  categories = {/u/-fronting, L1, L2, non-native}
}

@inproceedings{wester:listeners:IS2015,
  author     = {Wester, Mirjam and Valentini-Botinhao, Cassia and Henter, Gustav Eje},
  title      = {Are we using enough listeners? {No! An empirically-supported critique of Interspeech 2014 TTS evaluations}},
  booktitle  = {Proc. Interspeech},
  pages      = {3476--3480},
  month      = {September},
  year       = {2015},
  abstract   = {Tallying the numbers of listeners that took part in subjective evaluations of synthetic speech at Interspeech 2014 showed that in more than 60\% of papers conclusions are based on listening tests with less than 20 listeners. Our analysis of Blizzard 2013 data shows that for a MOS test measuring naturalness a stable level of significance is only reached when more than 30 listeners are used. In this paper, we set out a list of guidelines, i.e., a checklist for carrying out meaningful subjective evaluations. We further illustrate the importance of sentence coverage and number of listeners by presenting changes to rank order and number of significant pairs by re-analysing data from the Blizzard Challenge 2013.},
  categories = {Subjective evaluation, text-to-speech, MOS test}
}

@inproceedings{wester:artificial:IS2015,
  author     = {Wester, Mirjam and Aylett, Matthew and Tomalin, Marcus and Dall, Rasmus},
  title      = {Artificial Personality and Disfluency},
  booktitle  = {Proc. Interspeech},
  month      = {September},
  year       = {2015},
  abstract   = {The focus of this paper is artificial voices with different personalities. Previous studies have shown links between an individual's use of disfluencies in their speech and their perceived personality. Here, filled pauses (uh and um) and discourse markers (like, you know, I mean) have been included in synthetic speech as a way of creating an artificial voice with different personalities. We discuss the automatic insertion of filled pauses and discourse markers (i.e., fillers) into otherwise fluent texts. The automatic system is compared to a ground truth of human "acted" filler insertion. Perceived personality (as defined by the big five personality dimensions) of the synthetic speech is assessed by means of a standardised questionnaire. Synthesis without fillers is compared to synthesis with either spontaneous or synthetic fillers. Our findings explore how the inclusion of disfluencies influences the way in which subjects rate the perceived personality of an artificial voice.},
  categories = {artificial personality, TTS, disfluency}
}

@inproceedings{tomalin:diss:2015,
  author     = {Tomalin, Marcus and Wester, Mirjam and Dall, Rasmus and Byrne, William and King, Simon},
  title      = {A Lattice-based Approach to Automatic Filled Pause Insertion},
  booktitle  = {Proc. DiSS 2015},
  month      = {August},
  year       = {2015},
  abstract   = {This paper describes a novel method for automatically inserting filled pauses (e.g., UM) into fluent texts. Although filled pauses are known to serve a wide range of psychological and structural functions in conversational speech, they have not traditionally been modelled overtly by state-of-the-art speech synthesis systems. However, several recent systems have started to model disfluencies specifically, and so there is an increasing need to create disfluent speech synthesis input by automatically inserting filled pauses into otherwise fluent text. The approach presented here interpolates Ngrams and Full-Output Recurrent Neural Network Language Models (f-RNNLMs) in a lattice-rescoring framework. It is shown that the interpolated system outperforms separate Ngram and f-RNNLM systems, where performance is analysed using the Precision, Recall, and F-score metrics.},
  categories = {Disfluency, Filled Pauses, f-RNNLMs, Ngrams, Lattices}
}

@inproceedings{Wester:diss:2015,
author = {Wester, Mirjam and Corley, Martin and Dall, Rasmus},
title = {The Temporal Delay Hypothesis: Natural, Vocoded and Synthetic Speech},
booktitle = {Proc. DiSS 2015},
year = {2015},
month = aug,
abstract = {Including disfluencies in synthetic speech is being explored as a way of making synthetic speech sound more natural and conversational. How to measure whether the resulting speech is actually more natural, however, is not straightforward. Conventional approaches to synthetic speech evaluation fall short as a listener is either primed to prefer stimuli with filled pauses or, when they aren't primed they prefer more fluent speech. Psycholinguistic reaction time experiments may circumvent this issue. In this paper, we revisit one such reaction time experiment. For natural speech, delays in word onset were found to facilitate word recognition regardless of the type of delay; be they a filled pause (um), silence or a tone. We expand these experiments by examining the effect of using vocoded and synthetic speech. Our results partially replicate previous findings. For natural and vocoded speech, if the delay is a silent pause, significant increases in the speed of word recognition are found. If the delay comprises a filled pause there is a significant increase in reaction time for vocoded speech but not for natural speech. For synthetic speech, no clear effects of delay on word recognition are found. We hypothesise this is because it takes longer (requires more cognitive resources) to process synthetic speech than natural or vocoded speech.},
categories = {delay hypothesis, disfluency}
}

@inproceedings{dall:diss2015,
author = {Dall, Rasmus and Wester, Mirjam and Corley, Martin},
title = {Disfluencies in change detection in natural, vocoded and synthetic speech},
booktitle = {Proc. DiSS 2015},
year = {2015},
month = aug,
abstract = {In this paper, we investigate the effect of filled pauses, a discourse marker and silent pauses in a change detection experiment in natural, vocoded and synthetic speech. In natural speech change detection has been found to increase in the presence of filled pauses, we extend this work by replicating earlier findings and explore the effect of a discourse marker, like, and silent pauses. Furthermore we report how the use of "unnatural" speech, namely synthetic and vocoded, affects change detection rates. It was found that the filled pauses, the discourse marker and silent pauses all increase change detection rates in natural speech, however in neither synthetic nor vocoded speech did this effect appear. Rather, change detection rates decreased in both types of "unnatural" speech compared to natural speech. The natural results suggests that while each type of pause increase detection rates, the type of pause may have a further effect. The "unnatural" results suggest that it is not the full pipeline of synthetic speech that causes the degradation, but rather that something in the pre-processing, i.e. vocoding, of the speech database limits the resulting synthesis.},
categories = {change detection, filled pauses, speech synthesis}
}

@inproceedings{wester:human:IS2015,
author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
title = {Human vs Machine Spoofing Detection on Wideband and Narrowband Data},
booktitle = {Proc. Interspeech},
year = {2015},
month = sep,
abstract = {How well do humans detect spoofing attacks directed at automatic speaker verification systems? This paper investigates the performance of humans at detecting spoofing attacks from speech synthesis and voice conversion systems. Two speaker verification tasks, in which the speakers were either humans or machines, were also conducted. The three tasks were carried out with two types of data: wideband (16kHz) and narrowband (8kHz) telephone line simulated data. Spoofing detection by humans was compared to automatic spoofing detection (ASD) algorithms. Listening tests were carefully constructed to ensure the human and automatic tasks were as similar as possible taking into consideration listener’s constraints (e.g., fatigue and memory limitations). Results for human trials show the error rates on narrowband data double compared to on wideband data. The second verification task, which included only artificial speech, showed equal overall acceptance rates for both 8kHz and 16kHz. In the spoofing detection task, there was a drop in performance on most of the artificial trials as well as on human trials. At 8kHz, 20\% of human trials were incorrectly classified as artificial, compared to 12\% at 16kHz. The ASD algorithms also showed a drop in performance on 8kHz data, but outperformed human listeners across the board.},
categories = {spoofing, human performance, automatic spoofing detection}
}

@inproceedings{bell15_mgb_challenge,
author = {Bell, Peter and Gales, Mark and Hain, Thomas and Kilgour, Jonathan and Lanchantin, Pierre and Liu, Xunying and McParland, Andrew and Renals, Steve and Saz, Oscar and Wester, Mirjam and Woodland, Phil},
booktitle = {Proc. ASRU},
title = {The {MGB} challenge: Evaluating multi-genre broadcast media recognition},
abstract = {This paper describes the Multi-Genre Broadcast (MGB) Challenge at ASRU~2015, an evaluation focused on speech recognition, speaker diarization, and ``lightly supervised'' alignment of BBC TV recordings. The challenge training data covered the whole range of seven weeks BBC TV output across four channels, resulting in about 1,600 hours of broadcast audio. In addition several hundred million words of BBC subtitle text was provided for language modelling. A novel aspect of the evaluation was the exploration of speech recognition and speaker diarization in a longitudinal setting -- i.e. recognition of several episodes of the same show, and speaker diarization across these episodes, linking speakers. The longitudinal tasks also offered the opportunity for systems to make use of supplied metadata including show title, genre tag, and date/time of transmission. This paper describes the task data and evaluation process used in the MGB challenge, and summarises the results obtained.},
year = {2015}
}

@inproceedings{wester2016evaluating,
author = {Wester, Mirjam and Watts, Oliver and Henter, Gustav Eje},
title = {Evaluating comprehension of natural and synthetic conversational speech},
url = {http://www.isca-speech.org/archive/sp2016/pdfs_stamped/41.pdf},
abstract = {Current speech synthesis methods typically operate on isolated sentences and lack convincing prosody when generating longer segments of speech. Similarly, prevailing TTS evaluation paradigms, such as intelligibility (transcription word error rate) or MOS, only score sentences in isolation, even though overall comprehension is arguably more important for speech-based communication. In an effort to develop more ecologically-relevant evaluation techniques that go beyond isolated sentences, we investigated comprehension of natural and synthetic speech dialogues. Specifically, we tested listener comprehension on long segments of spontaneous and engaging conversational speech (three 10-minute radio interviews of comedians). Interviews were reproduced either as natural speech, synthesised from carefully prepared transcripts, or synthesised using durations from forced-alignment against the natural speech, all in a balanced design. Comprehension was measured using multiple choice questions. A significant difference was measured between the comprehension/retention of natural speech (74\% correct responses) and synthetic speech with forced-aligned durations (61\% correct responses). However, no significant difference was observed between natural and regular synthetic speech (70\% correct responses). Effective evaluation of comprehension remains elusive.},
month = jun,
volume = {8},
year = {2016},
keywords = {evaluation, comprehension, conversational speech, statistical parametric speech synthesis},
booktitle = {Speech Prosody},
pages = {736--740},
categories = {evaluation, comprehension, conversational speech, statistical parametric speech synthesis}
}

@inproceedings{henter2016robust,
author = {Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon},
title = {Robust {TTS} duration modelling using {DNN}s},
url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7472655},
abstract = {Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and expressive speech are easy to acquire for training them. Unfortunately, found datasets are seldom subject to the quality-control that traditional synthesis methods expect. Common issues likely to affect duration modelling include transcription errors, reductions, filled pauses, and forced-alignment inaccuracies. To combat this, we propose to improve modelling and prediction of speech durations using methods from robust statistics, which are able to disregard ill-fitting points in the training material. We describe a robust fitting criterion based on the density power divergence (the beta-divergence) and a robust generation heuristic using mixture density networks (MDNs). Perceptual tests indicate that subjects prefer synthetic speech generated using robust models of duration over the baselines.},
month = mar,
volume = {41},
year = {2016},
keywords = {Speech synthesis, duration modelling, robust statistics},
booktitle = {Proc. ICASSP},
pages = {5130--5134},
categories = {Speech synthesis, duration modelling, robust statistics}
}

@inproceedings{toda2016voice,
author = {Toda, Tomoki and Chen, Ling-Hui and Saito, Daisuke and Villavicencio, Fernando and Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
booktitle = {Proc. Interspeech},
title = {The {Voice Conversion Challenge} 2016},
abstract = {This paper describes the Voice Conversion Challenge 2016 devised by the authors to better understand different voice conversion (VC) techniques by comparing their performance on a common dataset. The task of the challenge was speaker conversion, i.e., to transform the voice identity of a source speaker into that of a target speaker while preserving the linguistic content. Using a common dataset consisting of 162 utterances for training and 54 utterances for evaluation from each of 5 source and 5 target speakers, 17 groups working in VC around the world developed their own VC systems for every combination of the source and target speakers, i.e., 25 systems in total, and generated voice samples converted by the developed systems. These samples were evaluated in terms of target speaker similarity and naturalness by 200 listeners in a controlled environment. This paper summarizes the design of the challenge, its result, and a future plan to share views about unsolved problems and challenges faced by the current VC techniques.},
year = {2016}
}

@inproceedings{wester2016analysis,
author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
booktitle = {Proc. Interspeech},
title = {Analysis of the {Voice Conversion Challenge} 2016 Evaluation Results},
abstract = {The Voice Conversion Challenge 2016 is the first Voice Conversion Challenge in which different voice conversion systems and approaches using the same voice data were compared. This paper describes the design of the evaluation, it presents the results and statistical analyses of the results.},
year = {2016}
}

@inproceedings{wester2016multidimensional,
author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
title = {Multidimensional scaling of systems in the {Voice Conversion Challenge} 2016},
booktitle = {Proc. Speech Synthesis Workshop 9},
year = {2016},
abstract = {This study investigates how listeners judge the similarity of voice converted voices using a talker discrimination task. The data used is from the Voice Conversion Challenge 2016. 17 participants from around the world took part in building voice converted voices from a shared data set of source and target speakers. This paper describes the evaluation of similarity for four of the source-target pairs (two intra-gender and two cross-gender) in more detail. Multidimensional scaling was performed to illustrate where each system was perceived to be in an acoustic space compared to the source and target speakers and to each other.}
}

@inproceedings{Dall2016e,
author = {Dall, Rasmus and Tomalin, Marcus and Wester, Mirjam},
title = {Synthesising Filled Pauses: Representation and Datamixing},
booktitle = {Proc. SSW9},
year = {2016},
abstract = {Filled pauses occur frequently in spontaneous human speech, yet modern text-to-speech synthesis systems rarely model these disfluencies overtly, and consequently they do not output convincing synthetic filled pauses. This paper presents a text-to-speech system that is specifically designed to model these particular disfluencies more effectively. A preparatory investigation shows that a synthetic voice trained exclusively on spontaneous speech is perceived to be inferior in quality to a voice trained entirely on read speech, even though the latter does not handle filled pauses well. This motivates an investigation into the phonetic representation of filled pauses which show that, in a preference test, the use of a distinct phone for filled pauses is preferred over the standard /V/ phone and the alternative /@/ phone. In addition, we present a variety of data-mixing techniques to combine the strengths of standard synthesis systems trained on read speech corpora with the supplementary advantages offered by systems trained on spontaneous speech. In a MUSHRA-style test, it is found that the best overall quality is obtained by combining the two types of corpora using a source marking technique. Specifically, general speech is synthesised with a standard mark, while filled pauses are synthesised with a spontaneous mark, which has the added benefit of also producing filled pauses that are comparatively well synthesised.},
categories = {TTS, Filled Pauses, HMM, Phonetic Representation, Speech Synthesis}
}

@article{aylett2017speech,
  author     = {Aylett, Matthew P. and Vinciarelli, Alessandro and Wester, Mirjam},
  title      = {Speech Synthesis for the Generation of Artificial Personality},
  journal    = {IEEE Transactions on Affective Computing},
  year       = {2017},
  doi        = {10.1109/TAFFC.2017.2763134},
  categories = {TTS, Artificial Personality},
  abstract   = {A synthetic voice personifies the system using it. In this work we examine the impact text content, voice quality and synthesis system have on the perceived personality of two synthetic voices. Subjects rated synthetic utterances based on the Big-Five personality traits and naturalness. The naturalness rating of synthesis output did not correlate significantly with any Big-Five characteristic except for a marginal correlation with openness. Although text content is dominant in personality judgments, results showed that voice quality change implemented using a unit selection synthesis system significantly affected the perception of the Big-Five, for example tense voice being associated with being disagreeable and lax voice with lower conscientiousness. In addition a comparison between a parametric implementation and unit selection implementation of the same voices showed that parametric voices were rated as significantly less neurotic than both the text alone and the unit selection system, while the unit selection was rated as more open than both the text alone and the parametric system. The results have implications for synthesis voice and system type selection for applications such as personal assistants and embodied conversational agents where developing an emotional relationship with the user, or developing a branding experience is important.}
}

@inproceedings{yoshimura2016hierarchical,
author = {Yoshimura, Takenori and Henter, {Gustav Eje} and Watts, Oliver and Wester, Mirjam and Yamagishi, Junichi and Tokuda, Keiichi},
bdsk-url-1 = {http://dx.doi.org/10.21437/Interspeech.2016-847},
publisher = {International Speech Communication Association},
doi = {10.21437/Interspeech.2016-847},
date-modified = {2018-01-19 16:43:35 +0000},
title = {A Hierarchical Predictor of Synthetic Speech Naturalness Using Neural Networks},
abstract = {A problem when developing and tuning speech synthesis systems is that there is no well-established method of automatically rating the quality of the synthetic speech. This research attempts to obtain a new automated measure which is trained on the result of large-scale subjective evaluations employing many human listeners, i.e., the Blizzard Challenge. To exploit the data, we experiment with linear regression, feed-forward and convolutional neural network models, and combinations of them to regress from synthetic speech to the perceptual scores obtained from listeners. The biggest improvements were seen when combining stimulus- and system-level predictions.},
month = sep,
year = {2016},
`