The Centre for Speech Technology Research, The University of Edinburgh

Publications by Junichi Yamagishi

jyamagis.bib

@article{Cassia_CSL13,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S. and Maia, R.},
  doi = {10.1016/j.csl.2013.06.001},
  title = {Intelligibility enhancement of {HMM}-generated speech in additive noise by modifying Mel cepstral coefficients to increase the Glimpse Proportion},
  journal = {Computer Speech and Language},
  number = {2},
  abstract = {This paper describes speech intelligibility enhancement for hidden Markov model (HMM) generated synthetic speech in noise. We present a method for modifying the Mel cepstral coefficients generated by statistical parametric models that have been trained on plain speech. We update these coefficients such that the Glimpse Proportion – an objective measure of the intelligibility of speech in noise – increases, while keeping the speech energy fixed. An acoustic analysis reveals that the modified speech is boosted in the region 1-4kHz, particularly for vowels, nasals and approximants. Results from listening tests employing speech-shaped noise show that the modified speech is as intelligible as a synthetic voice trained on plain speech whose duration, Mel cepstral coefficients and excitation signal parameters have been adapted to Lombard speech from the same speaker. Our proposed method does not require these additional recordings of Lombard speech. In the presence of a competing talker, both modification and adaptation of spectral coefficients give more modest gains.},
  volume = {28},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Cassia_CSL14.pdf},
  pages = {665--686}
}
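The abstract above reduces to an optimisation idea: raise a glimpse-proportion-style objective by redistributing spectral energy while holding total speech energy fixed. The Python sketch below is only a toy illustration of that idea, not the paper's algorithm (which operates on Mel cepstral coefficients); the band levels, the 3 dB local-SNR threshold and the step schedule are assumptions made for this sketch.

# Toy sketch (not the paper's method): nudge per-band speech levels towards a
# higher "glimpse proportion"-style proxy while keeping total energy fixed.
import numpy as np

def glimpse_proxy(speech_db, noise_db, threshold_db=3.0):
    """Fraction of bands where speech exceeds noise by at least threshold_db."""
    return float(np.mean(speech_db > noise_db + threshold_db))

def enhance_fixed_energy(speech_db, noise_db, threshold_db=3.0,
                         step_db=0.5, iterations=200):
    best = np.asarray(speech_db, dtype=float).copy()
    best_score = glimpse_proxy(best, noise_db, threshold_db)
    total_energy = np.sum(10.0 ** (best / 10.0))       # linear energy to preserve
    current = best.copy()
    for _ in range(iterations):
        masked = current <= noise_db + threshold_db    # bands below the glimpse threshold
        if not masked.any():
            break
        current = current.copy()
        current[masked] += step_db                     # boost masked bands ...
        # ... then renormalise so the overall energy is unchanged
        current += 10.0 * np.log10(total_energy / np.sum(10.0 ** (current / 10.0)))
        score = glimpse_proxy(current, noise_db, threshold_db)
        if score > best_score:                         # keep the best spectrum seen so far
            best, best_score = current.copy(), score
    return best

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    speech = 60.0 + 6.0 * rng.standard_normal(26)      # made-up per-band speech levels (dB)
    noise = 58.0 + 6.0 * rng.standard_normal(26)       # made-up per-band noise levels (dB)
    modified = enhance_fixed_energy(speech, noise)
    print(glimpse_proxy(speech, noise), "->", glimpse_proxy(modified, noise))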
@inproceedings{Cassia_IS13,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S. and Stylianou, Y.},
  title = {{Combining perceptually-motivated spectral shaping with loudness and duration modification for intelligibility enhancement of HMM-based synthetic speech in noise}},
  booktitle = {Proc. Interspeech},
  year = {2013},
  month = {August},
  address = {Lyon, France},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2013/Cassia_IS13.pdf},
  abstract = {This paper presents our entry to a speech-in-noise intelligibility enhancement evaluation: the Hurricane Challenge. The system consists of a Text-To-Speech voice manipulated through a combination of enhancement strategies, each of which is known to be individually successful: a perceptually-motivated spectral shaper based on the Glimpse Proportion measure, dynamic range compression, and adaptation to Lombard excitation and duration patterns. We achieved substantial intelligibility improvements relative to unmodified synthetic speech: 4.9 dB in competing speaker and 4.1 dB in speech-shaped noise. An analysis conducted across this and two other similar evaluations shows that the spectral shaper and the compressor (both of which are loudness boosters) contribute most under higher SNR conditions, particularly for speech-shaped noise. Duration and excitation Lombard-adapted changes are more beneficial in lower SNR conditions, and for competing speaker noise.}
}
@inproceedings{Cassia_ICASSP13,
  author = {Valentini-Botinhao, C. and Godoy, E. and Stylianou, Y. and Sauert, B. and King, S. and Yamagishi, J.},
  title = {{Improving intelligibility in noise of HMM-generated speech via noise-dependent and -independent methods}},
  booktitle = {Proc. ICASSP},
  year = {2013},
  month = {May},
  address = {Vancouver, Canada},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Cassia_ICASSP13.pdf},
  abstract = {In order to improve the intelligibility of HMM-generated Text-to-Speech (TTS) in noise, this work evaluates several speech enhancement methods, exploring combinations of noise-independent and -dependent approaches as well as algorithms previously developed for natural speech. We evaluate one noise-dependent method proposed for TTS, based on the glimpse proportion measure, and three approaches originally proposed for natural speech - one that estimates the noise and is based on the speech intelligibility index, and two noise-independent methods based on different spectral shaping techniques followed by dynamic range compression. We demonstrate how these methods influence the average spectra for different phone classes. We then present results of a listening experiment with speech-shaped noise and a competing speaker. A few methods made the TTS voice even more intelligible than the natural one. Although noise-dependent methods did not improve gains, the intelligibility differences found in distinct noises motivate such dependency.}
}
@article{HTS,
  author = {Tokuda, Keiichi and Nankaku, Yoshihiko and Toda, Tomoki and Zen, Heiga and Yamagishi, Junichi and Oura, Keiichiro},
  volume = {101},
  title = {Speech Synthesis Based on Hidden Markov Models},
  abstract = {This paper gives a general overview of hidden Markov model (HMM)-based speech synthesis, which has recently been demonstrated to be very effective in synthesizing speech. The main advantage of this approach is its flexibility in changing speaker identities, emotions, and speaking styles. This paper also discusses the relation between the HMM-based approach and the more conventional unit-selection approach that has dominated over the last decades. Finally, advanced techniques for future developments are described.},
  number = {6},
  month = {June},
  note = {(in press)},
  year = {2013},
  journal = {Proceedings of the IEEE}
}
@inproceedings{6423522,
  author = {Yang, Chen-Yu and Brown, G. and Lu, Liang and Yamagishi, J. and King, S.},
  doi = {10.1109/ISCSLP.2012.6423522},
  title = {Noise-robust whispered speech recognition using a non-audible-murmur microphone with VTS compensation},
  abstract = {In this paper, we introduce a newly-created corpus of whispered speech simultaneously recorded via a close-talking microphone and a non-audible murmur (NAM) microphone in both clean and noisy conditions. To benchmark the corpus, which has been freely released recently, experiments on automatic recognition of continuous whispered speech were conducted. When training and test conditions are matched, the NAM microphone is found to be more robust against background noise than the close-talking microphone. In mismatched conditions (noisy data, models trained on clean speech), we found that Vector Taylor Series (VTS) compensation is particularly effective for the NAM signal.},
  year = {2012},
  booktitle = {Proc. 8th International Symposium on Chinese Spoken Language Processing (ISCSLP)},
  pages = {220--223}
}
@inproceedings{LorenzoAlbayzinProposal2012,
  author = {Lorenzo-Trueba, Jaime and Watts, Oliver and Barra-Chicote, Roberto and Yamagishi, Junichi and King, Simon and Montero, Juan M},
  title = {Simple4All proposals for the Albayzin Evaluations in Speech Synthesis},
  abstract = {Simple4All is a European funded project that aims to streamline the production of multilanguage expressive synthetic voices by means of unsupervised data extraction techniques, allowing the automatic processing of freely available data into flexible task-specific voices. In this paper we describe three different approaches for this task, the first two covering enhancements in expressivity and flexibility with the final one focusing on the development of unsupervised voices. The first technique introduces the principle of speaker adaptation from average models consisting of multiple voices, with the second being an extension of this adaptation concept into allowing the control of the expressive strength of the synthetic voice. Finally, an unsupervised approach to synthesis capable of learning from unlabelled text data is introduced in detail},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/simple4all-proposal.pdf},
  booktitle = {Proc. Iberspeech 2012},
  categories = {Albayzin challenge, expressive speech synthesis}
}
@article{child_speech_journal_2010,
  author = {Watts, O. and Yamagishi, J. and King, S. and Berkling, K.},
  doi = {10.1109/TASL.2009.2035029},
  title = {Synthesis of Child Speech with {HMM} Adaptation and Voice Conversion},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  issn = {1558-7916},
  number = {5},
  month = {July},
  volume = {18},
  pages = {1005--1016},
  year = {2010},
  keywords = {HMM adaptation techniques;child speech synthesis;hidden Markov model;speaker adaptive modeling technique;speaker dependent technique;speaker-adaptive voice;statistical parametric synthesizer;target speaker corpus;voice conversion;hidden Markov models;speech synthesis;},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/YamagishiJ_SynthesisofChildSpeech.pdf},
  abstract = {The synthesis of child speech presents challenges both in the collection of data and in the building of a synthesizer from that data. We chose to build a statistical parametric synthesizer using the hidden Markov model (HMM)-based system HTS, as this technique has previously been shown to perform well for limited amounts of data, and for data collected under imperfect conditions. Six different configurations of the synthesizer were compared, using both speaker-dependent and speaker-adaptive modeling techniques, and using varying amounts of data. For comparison with HMM adaptation, techniques from voice conversion were used to transform existing synthesizers to the characteristics of the target speaker. Speaker-adaptive voices generally outperformed child speaker-dependent voices in the evaluation. HMM adaptation outperformed voice conversion style techniques when using the full target speaker corpus; with fewer adaptation data, however, no significant listener preference for either HMM adaptation or voice conversion methods was found.}
}
@article{Ekpenyong2013,
  author = {Ekpenyong, Moses and Urua, Eno-Abasi and Watts, Oliver and King, Simon and Yamagishi, Junichi},
  numpages = {9},
  issue_date = {January, 2014},
  doi = {10.1016/j.specom.2013.02.003},
  title = {Statistical Parametric Speech Synthesis for {I}bibio},
  url = {http://dx.doi.org/10.1016/j.specom.2013.02.003},
  journal = {Speech Communication},
  issn = {0167-6393},
  abstract = {Ibibio is a Nigerian tone language, spoken in the south-east coastal region of Nigeria. Like most African languages, it is resource-limited. This presents a major challenge to conventional approaches to speech synthesis, which typically require the training of numerous predictive models of linguistic features such as the phoneme sequence (i.e., a pronunciation dictionary plus a letter-to-sound model) and prosodic structure (e.g., a phrase break predictor). This training is invariably supervised, requiring a corpus of training data labelled with the linguistic feature to be predicted. In this paper, we investigate what can be achieved in the absence of many of these expensive resources, and also with a limited amount of speech recordings. We employ a statistical parametric method, because this has been found to offer good performance even on small corpora, and because it is able to directly learn the relationship between acoustics and whatever linguistic features are available, potentially mitigating the absence of explicit representations of intermediate linguistic layers such as prosody. We present an evaluation that compares systems that have access to varying degrees of linguistic structure. The simplest system only uses phonetic context (quinphones), and this is compared to systems with access to a richer set of context features, with or without tone marking. It is found that the use of tone marking contributes significantly to the quality of synthetic speech. Future work should therefore address the problem of tone assignment using a dictionary and the building of a prediction module for out-of-vocabulary words.},
  month = {January},
  volume = {56},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Moses_Ibibio.pdf},
  pages = {243--251},
  categories = {HTS, Ibibio, Low-resource languages, Speech synthesis}
}
@article{analysis-hts-adaptation-junichi,
  author = {Yamagishi, Junichi and Kobayashi, Takao and Nakano, Yuji and Ogata, Katsumi and Isogai, Juri},
  title = {Analysis of Speaker Adaptation Algorithms for {HMM}-based Speech Synthesis and a Constrained {SMAPLR} Adaptation Algorithm},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  note = {In print},
  key = {analysis-hts-adaptation-junichi},
  year = {2008},
  abstract = {In this paper we analyze the effects of several factors and configuration choices encountered during training and model construction when we want to obtain better and more stable adaptation in HMM-based speech synthesis. We then propose a new adaptation algorithm called constrained structural maximum a posteriori linear regression (CSMAPLR) whose derivation is based on the knowledge obtained in this analysis and on the results of comparing several conventional adaptation algorithms. Here we investigate six major aspects of the speaker adaptation: initial models, transform functions, estimation criteria, and sensitivity of several linear regression adaptation algorithms. Analyzing the effect of the initial model, we compare speaker-dependent models, gender-independent models, and the simultaneous use of the gender-dependent models to the single use of the gender-dependent models. Analyzing the effect of the transform functions, we compare the transform function for only mean vectors with that for mean vectors and covariance matrices. Analyzing the effect of the estimation criteria, we compare the ML criterion with a robust estimation criterion called structural MAP. We evaluate the sensitivity of several thresholds for the piecewise linear regression algorithms and take up methods combining MAP adaptation with the linear regression algorithms. We incorporate these adaptation algorithms into our speech synthesis system and present several subjective and objective evaluation results showing the utility and effectiveness of these algorithms in speaker adaptation for HMM-based speech synthesis.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice}
}
@inproceedings{anderssoncabral09,
  author = {Andersson, J. Sebastian and Cabral, Joao P. and Badino, Leonardo and Yamagishi, Junichi and Clark, Robert A.J.},
  title = {Glottal Source and Prosodic Prominence Modelling in {HMM}-based Speech Synthesis for the {B}lizzard {C}hallenge 2009},
  booktitle = {The Blizzard Challenge 2009},
  year = {2009},
  month = {September},
  address = {Edinburgh, U.K.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
  abstract = {This paper describes the CSTR entry for the Blizzard Challenge 2009. The work focused on modifying two parts of the Nitech 2005 HTS speech synthesis system to improve naturalness and contextual appropriateness. The first part incorporated an implementation of the Liljencrants-Fant (LF) glottal source model. The second part focused on improving synthesis of prosodic prominence including emphasis through context dependent phonemes. Emphasis was assigned to the synthesised test sentences based on a handful of theory-based rules. The two parts (LF-model and prosodic prominence) were not combined and hence evaluated separately. The results on naturalness for the LF-model showed that it is not yet perceived as natural as the Benchmark HTS system for neutral speech. The results for the prosodic prominence modelling showed that it was perceived as contextually appropriate as the Benchmark HTS system, despite a low naturalness score. The Blizzard Challenge evaluation has provided valuable information on the status of our work and continued work will begin with analysing why our modifications resulted in reduced naturalness compared to the Benchmark HTS system.},
  categories = {HMM, HTS, speech synthesis, LF-model, glottal source, prosodic prominence, emphasis}
}
@article{treeboosting-junichi,
  author = {Yamagishi, Junichi and Kawai, Hisashi and Kobayashi, Takao},
  doi = {10.1016/j.specom.2007.12.003},
  title = {Phone Duration Modeling Using Gradient Tree Boosting},
  journal = {Speech Communication},
  number = {5},
  abstract = {In text-to-speech synthesis systems, phone duration influences the quality and naturalness of synthetic speech. In this study, we incorporate an ensemble learning technique called gradient tree boosting into phone duration modeling as an alternative to the conventional approach using regression trees, and objectively evaluate the prediction accuracy of Japanese, Mandarin, and English phone duration. The gradient tree boosting algorithm is a meta algorithm of regression trees: it iteratively builds regression trees from the residuals and outputs a weighted sum of the regression trees. Our evaluation results show that compared to the regression trees or other techniques related to the regression trees, the gradient tree boosting algorithm can substantially and robustly improve the predictive accuracy of the phone duration regardless of languages, speakers, or domains.},
  month = {May},
  volume = {50},
  key = {treeboosting-junichi},
  year = {2008},
  pages = {405--415},
  categories = {Text-to-speech synthesis, Phone duration modeling, Gradient tree boosting}
}
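The abstract above describes gradient tree boosting as iteratively fitting regression trees to residuals and predicting with a weighted sum of the trees. The short Python sketch below illustrates that loop for a squared-error duration regression; the random "context" features and duration targets are placeholders, not the linguistic features or corpora used in the paper.

# Minimal sketch of gradient tree boosting for squared-error duration regression:
# each new tree is fit to the residuals of the current model and the prediction
# is a shrunken sum of the trees.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def fit_gtb(X, y, n_trees=100, learning_rate=0.1, max_depth=3):
    f0 = float(np.mean(y))                    # initial constant model
    trees, pred = [], np.full(len(y), f0)
    for _ in range(n_trees):
        residual = y - pred                   # negative gradient of squared error
        tree = DecisionTreeRegressor(max_depth=max_depth).fit(X, residual)
        trees.append(tree)
        pred += learning_rate * tree.predict(X)
    return f0, trees

def predict_gtb(model, X, learning_rate=0.1):
    f0, trees = model
    pred = np.full(X.shape[0], f0)
    for tree in trees:
        pred += learning_rate * tree.predict(X)
    return pred

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    X = rng.normal(size=(2000, 10))           # stand-in "context" features
    y = 80 + 25 * np.tanh(X[:, 0]) + 10 * X[:, 1] ** 2 + rng.normal(0, 5, 2000)  # durations in ms
    Xtr, Xte, ytr, yte = X[:1500], X[1500:], y[:1500], y[1500:]
    model = fit_gtb(Xtr, ytr)
    rmse = np.sqrt(np.mean((predict_gtb(model, Xte) - yte) ** 2))
    print(f"test RMSE: {rmse:.2f} ms")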
@inproceedings{ling:richmond:yamagishi:wang:2008a,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi, Junichi and Wang, Ren-Hua},
  title = {Articulatory Control of {HMM}-based Parametric Speech Synthesis Driven by Phonetic Knowledge},
  booktitle = {Proc. Interspeech},
  year = {2008},
  abstract = {This paper presents a method to control the characteristics of synthetic speech flexibly by integrating articulatory features into a Hidden Markov Model (HMM)-based parametric speech synthesis system. In contrast to model adaptation and interpolation approaches for speaking style control, this method is driven by phonetic knowledge, and target speech samples are not required. The joint distribution of parallel acoustic and articulatory features considering cross-stream feature dependency is estimated. At synthesis time, acoustic and articulatory features are generated simultaneously based on the maximum-likelihood criterion. The synthetic speech can be controlled flexibly by modifying the generated articulatory features according to arbitrary phonetic rules in the parameter generation process. Our experiments show that the proposed method is effective in both changing the overall character of synthesized speech and in controlling the quality of a specific vowel.},
  month = {September},
  key = {ling:richmond:yamagishi:wang:2008a},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080582.PDF},
  pages = {573--576},
  categories = {speech synthesis, HMM, articulatory features, phonetic knowledge}
}
@article{Andersson2012175,
  author = {Andersson, Sebastian and Yamagishi, Junichi and Clark, Robert A.J.},
  doi = {10.1016/j.specom.2011.08.001},
  title = {Synthesis and evaluation of conversational characteristics in {HMM}-based speech synthesis},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639311001178},
  journal = {Speech Communication},
  issn = {0167-6393},
  number = {2},
  abstract = {Spontaneous conversational speech has many characteristics that are currently not modelled well by HMM-based speech synthesis and in order to build synthetic voices that can give an impression of someone partaking in a conversation, we need to utilise data that exhibits more of the speech phenomena associated with conversations than the more generally used carefully read aloud sentences. In this paper we show that synthetic voices built with HMM-based speech synthesis techniques from conversational speech data preserved segmental and prosodic characteristics of frequent conversational speech phenomena. An analysis of an evaluation investigating the perception of quality and speaking style of HMM-based voices confirms that speech with conversational characteristics is instrumental for listeners to perceive successful integration of conversational speech phenomena in synthetic speech. The achieved synthetic speech quality provides an encouraging start for the continued use of conversational speech in HMM-based speech synthesis.},
  volume = {54},
  year = {2012},
  keywords = {Speech synthesis, HMM, Conversation, Spontaneous speech, Filled pauses, Discourse marker},
  pages = {175--188}
}
@inproceedings{cereproc-hts,
  author = {Aylett, Matthew P. and Yamagishi, Junichi},
  title = {Combining Statistical Parametric Speech Synthesis and Unit-Selection for Automatic Voice Cloning},
  booktitle = {Proc. LangTech 2008},
  year = {2008},
  month = {September},
  key = {cereproc-hts},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/03_AYLETT.pdf},
  abstract = {The ability to use the recorded audio of a subject's voice to produce an open-domain synthesis system has generated much interest both in academic research and in commercial speech technology. The ability to produce synthetic versions of a subject's voice has potential commercial applications, such as virtual celebrity actors, or potential clinical applications, such as offering a synthetic replacement voice in the case of a laryngectomy. Recent developments in HMM-based speech synthesis have shown it is possible to produce synthetic voices from quite small amounts of speech data. However, mimicking the depth and variation of a speaker's prosody as well as synthesising natural voice quality is still a challenging research problem. In contrast, unit-selection systems have shown it is possible to strongly retain the character of the voice but only with sufficient original source material. Often this runs into hours and may require significant manual checking and labelling. In this paper we will present two state-of-the-art systems: an HMM-based system, HTS-2007, developed by CSTR and Nagoya Institute of Technology, and a commercial unit-selection system, CereVoice, developed by CereProc. Both systems have been used to mimic the voice of George W. Bush (43rd president of the United States) using freely available audio from the web. In addition we will present a hybrid system which combines both technologies. We demonstrate examples of synthetic voices created from 10, 40 and 210 minutes of randomly selected speech. We will then discuss the underlying problems associated with voice cloning using found audio, and the scalability of our solution.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice}
}
@inproceedings{lingIS2012,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi, Junichi},
  title = {Vowel Creation by Articulatory Control in {HMM}-based Parametric Speech Synthesis},
  booktitle = {Proc. Interspeech},
  year = {2012},
  month = {September},
  address = {Portland, Oregon, USA},
  keywords = {Speech synthesis, articulatory features, multiple-regression hidden Markov model},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/LingRichmondYamagishi_IS2012.pdf},
  abstract = {This paper presents a method to produce a new vowel by articulatory control in hidden Markov model (HMM) based parametric speech synthesis. A multiple regression HMM (MRHMM) is adopted to model the distribution of acoustic features, with articulatory features used as external auxiliary variables. The dependency between acoustic and articulatory features is modelled by a group of linear transforms that are either estimated context-dependently or determined by the distribution of articulatory features. Vowel identity is removed from the set of context features used to ensure compatibility between the context-dependent model parameters and the articulatory features of a new vowel. At synthesis time, acoustic features are predicted according to the input articulatory features as well as context information. With an appropriate articulatory feature sequence, a new vowel can be generated even when it does not exist in the training set. Experimental results show this method is effective in creating the English vowel /2/ by articulatory control without using any acoustic samples of this vowel.},
  categories = {Speech synthesis, articulatory features, multiple-regression hidden Markov model}
}
@inproceedings{jyamagis07:avss2006,
  author = {Yamagishi, Junichi and Kobayashi, Takao and Renals, Steve and King, Simon and Zen, Heiga and Toda, Tomoki and Tokuda, Keiichi},
  title = {Improved Average-Voice-based Speech Synthesis Using Gender-Mixed Modeling and a Parameter Generation Algorithm Considering {GV}},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  month = {August},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6-yamagishi.pdf},
  abstract = {For constructing a speech synthesis system which can achieve diverse voices, we have been developing a speaker independent approach of HMM-based speech synthesis in which statistical average voice models are adapted to a target speaker using a small amount of speech data. In this paper, we incorporate a high-quality speech vocoding method STRAIGHT and a parameter generation algorithm with global variance into the system for improving quality of synthetic speech. Furthermore, we introduce a feature-space speaker adaptive training algorithm and a gender mixed modeling technique for conducting further normalization of the average voice model. We build an English text-to-speech system using these techniques and show the performance of the system.},
  categories = {HMM, speech synthesis, speaker adaptation, HTS}
}
@article{6289354,
  author = {Ling, Z. and Richmond, K. and Yamagishi, J.},
  doi = {10.1109/TASL.2012.2215600},
  title = {Articulatory Control of {HMM}-based Parametric Speech Synthesis using Feature-Space-Switched Multiple Regression},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  issn = {1558-7916},
  number = {1},
  abstract = {In previous work we proposed a method to control the characteristics of synthetic speech flexibly by integrating articulatory features into a hidden Markov model (HMM) based parametric speech synthesiser. In this method, a unified acoustic-articulatory model is trained, and context-dependent linear transforms are used to model the dependency between the two feature streams. In this paper, we go significantly further and propose a feature-space-switched multiple regression HMM to improve the performance of articulatory control. A multiple regression HMM (MRHMM) is adopted to model the distribution of acoustic features, with articulatory features used as exogenous explanatory variables. A separate Gaussian mixture model (GMM) is introduced to model the articulatory space, and articulatory-to-acoustic regression matrices are trained for each component of this GMM, instead of for the context-dependent states in the HMM. Furthermore, we propose a task-specific context feature tailoring method to ensure compatibility between state context features and articulatory features that are manipulated at synthesis time. The proposed method is evaluated on two tasks, using a speech database with acoustic waveforms and articulatory movements recorded in parallel by electromagnetic articulography (EMA). In a vowel identity modification task, the new method achieves better performance when reconstructing target vowels by varying articulatory inputs than our previous approach. A second vowel creation task shows our new method is highly effective at producing a new vowel from appropriate articulatory representations which, even though no acoustic samples for this vowel are present in the training data, is shown to sound highly natural.},
  volume = {21},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/06289354.pdf},
  pages = {207--219}
}
@inproceedings{zen:HTSoverview,
  author = {Zen, Heiga and Oura, Keiichiro and Nose, Takashi and Yamagishi, Junichi and Sako, Shinji and Toda, Tomoki and Masuko, Takashi and Black, Alan W. and Tokuda, Keiichi},
  title = {Recent development of the {HMM}-based speech synthesis system ({HTS})},
  booktitle = {Proc. 2009 Asia-Pacific Signal and Information Processing Association (APSIPA)},
  year = {2009},
  month = {October},
  address = {Sapporo, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/zen_APSIPA2009.pdf},
  abstract = {A statistical parametric approach to speech synthesis based on hidden Markov models (HMMs) has grown in popularity over the last few years. In this approach, spectrum, excitation, and duration of speech are simultaneously modeled by context-dependent HMMs, and speech waveforms are generated from the HMMs themselves. Since December 2002, we have publicly released an open-source software toolkit named “HMM-based speech synthesis system (HTS)” to provide a research and development toolkit for statistical parametric speech synthesis. This paper describes recent developments of HTS in detail, as well as future release plans.}
}
@inproceedings{hirai07:5ms2007,
  author = {Hirai, Toshio and Yamagishi, Junichi and Tenpaku, Seiichi},
  title = {Utilization of an {HMM}-Based Feature Generation Module in 5 ms Segment Concatenative Speech Synthesis},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  month = {August},
  year = {2007},
  abstract = {If a concatenative speech synthesis system uses shorter speech segments, its potential to generate natural speech increases because the variation available at concatenation becomes greater. Recently, a synthesis approach was proposed in which very short (5 ms) segments are used. In this paper, we describe the implementation of an HMM-based feature generation module in a very short segment concatenative synthesis system, which has the advantage of modularity, together with a synthesis experiment.},
  categories = {speech synthesis, HTS, hybrid algorithm}
}
@inproceedings{CassiaICASSP12,
  author = {Valentini-Botinhao, C. and Maia, R. and Yamagishi, J. and King, S. and Zen, H.},
  doi = {10.1109/ICASSP.2012.6288794},
  title = {{Cepstral analysis based on the Glimpse proportion measure for improving the intelligibility of {HMM}-based synthetic speech in noise}},
  booktitle = {Proc. ICASSP},
  year = {2012},
  abstract = {In this paper we introduce a new cepstral coefficient extraction method based on an intelligibility measure for speech in noise, the Glimpse Proportion measure. This new method aims to increase the intelligibility of speech in noise by modifying the clean speech, and has applications in scenarios such as public announcement and car navigation systems. We first explain how the Glimpse Proportion measure operates and further show how we approximated it to integrate it into an existing spectral envelope parameter extraction method commonly used in the HMM-based speech synthesis framework. We then demonstrate how this new method changes the modelled spectrum according to the characteristics of the noise and show results for a listening test with vocoded and HMM-based synthetic speech. The test indicates that the proposed method can significantly improve intelligibility of synthetic speech in speech shaped noise.},
  month = {March},
  address = {Kyoto, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_ICASSP12.pdf},
  pages = {3997--4000},
  categories = {HMM-based speech synthesis, intelligibility enhancement, speech analysis}
}
@inproceedings{ling_interspeech2010,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi, Junichi},
  title = {{HMM}-based Text-to-Articulatory-Movement Prediction and Analysis of Critical Articulators},
  booktitle = {Proc. Interspeech},
  year = {2010},
  abstract = {In this paper we present a method to predict the movement of a speaker's mouth from text input using hidden Markov models (HMM). We have used a corpus of human articulatory movements, recorded by electromagnetic articulography (EMA), to train HMMs. To predict articulatory movements from text, a suitable model sequence is selected and the maximum-likelihood parameter generation (MLPG) algorithm is used to generate output articulatory trajectories. In our experiments, we find that fully context-dependent models outperform monophone and quinphone models, achieving an average root mean square (RMS) error of 1.945mm when state durations are predicted from text, and 0.872mm when natural state durations are used. Finally, we go on to analyze the prediction error for different EMA dimensions and phone types. We find a clear pattern emerges that the movements of so-called critical articulators can be predicted more accurately than the average performance.},
  month = {September},
  address = {Makuhari, Japan},
  keywords = {Hidden Markov model, articulatory features, parameter generation, critical articulators},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100349.pdf},
  pages = {2194--2197}
}
@inproceedings{PhillipIS2012,
  author = {De Leon, Phillip L. and Stewart, Bryan and Yamagishi, Junichi},
  title = {Synthetic Speech Discrimination using Pitch Pattern Statistics Derived from Image Analysis},
  booktitle = {Proc. Interspeech},
  year = {2012},
  month = {September},
  address = {Portland, Oregon, USA},
  abstract = {In this paper, we extend the work by Ogihara, et al. to discriminate between human and synthetic speech using features based on pitch patterns. As previously demonstrated, significant differences in pitch patterns between human and synthetic speech can be leveraged to classify speech as being human or synthetic in origin. We propose using mean pitch stability, mean pitch stability range, and jitter as features extracted after image analysis of pitch patterns. We have observed that for synthetic speech, these features lie in a small and distinct space as compared to human speech and have modeled them with a multivariate Gaussian distribution. Our classifier is trained using synthetic speech collected from the 2008 and 2011 Blizzard Challenge along with Festival pre-built voices and human speech from the NIST2002 corpus. We evaluate the classifier on a much larger corpus than previously studied using human speech from the Switchboard corpus, synthetic speech from the Resource Management corpus, and synthetic speech generated from Festival trained on the Wall Street Journal corpus. Results show 98% accuracy in correctly classifying human speech and 96% accuracy in correctly classifying synthetic speech.}
}
@article{roberto:specom2010,
  author = {Barra-Chicote, R. and Yamagishi, J. and King, S. and Montero, J. Manuel and Macias-Guarasa, J.},
  doi = {10.1016/j.specom.2009.12.007},
  title = {Analysis of Statistical Parametric and Unit-Selection Speech Synthesis Systems Applied to Emotional Speech},
  journal = {Speech Communication},
  number = {5},
  abstract = {We have applied two state-of-the-art speech synthesis techniques (unit selection and HMM-based synthesis) to the synthesis of emotional speech. A series of carefully designed perceptual tests to evaluate speech quality, emotion identification rates and emotional strength were used for the six emotions which we recorded -- happiness, sadness, anger, surprise, fear, disgust. For the HMM-based method, we evaluated spectral and source components separately and identified which components contribute to which emotion. Our analysis shows that, although the HMM method produces significantly better neutral speech, the two methods produce emotional speech of similar quality, except for emotions having context-dependent prosodic patterns. Whilst synthetic speech produced using the unit selection method has better emotional strength scores than the HMM-based method, the HMM-based method has the ability to manipulate the emotional strength. For emotions that are characterized by both spectral and prosodic components, synthetic speech using unit selection methods was more accurately identified by listeners. For emotions mainly characterized by prosodic components, HMM-based synthetic speech was more accurately identified. This finding differs from previous results regarding listener judgements of speaker similarity for neutral speech. We conclude that unit selection methods require improvements to prosodic modeling and that HMM-based methods require improvements to spectral modeling for emotional speech. Certain emotions cannot be reproduced well by either method.},
  month = {May},
  volume = {52},
  year = {2010},
  keywords = {Emotional speech synthesis; HMM-based synthesis; Unit selection},
  pages = {394--404}
}
@inproceedings{5947571,
  author = {Andraszewicz, S. and Yamagishi, J. and King, S.},
  doi = {10.1109/ICASSP.2011.5947571},
  title = {Vocal attractiveness of statistical speech synthesisers},
  booktitle = {Proc. ICASSP},
  issn = {1520-6149},
  abstract = {Our previous analysis of speaker-adaptive HMM-based speech synthesis methods suggested that there are two possible reasons why average voices can obtain higher subjective scores than any individual adapted voice: 1) model adaptation degrades speech quality proportionally to the distance 'moved' by the transforms, and 2) psychoacoustic effects relating to the attractiveness of the voice. This paper is a follow-on from that analysis and aims to separate these effects out. Our latest perceptual experiments focus on attractiveness, using average voices and speaker-dependent voices without model transformation, and show that using several speakers to create a voice improves smoothness (measured by Harmonics-to-Noise Ratio), reduces distance from the average voice in the log F0-F1 space of the final voice and hence makes it more attractive at the segmental level. However, this is weakened or overridden at supra-segmental or sentence levels.},
  month = {May},
  year = {2011},
  keywords = {speaker-adaptive HMM-based speech synthesis methods;speaker-dependent voices;statistical speech synthesisers;vocal attractiveness;hidden Markov models;speaker recognition;speech synthesis;},
  pages = {5368--5371}
}
@inproceedings{Jaime2IS2012,
  author = {Lorenzo, J. and Martinez, B. and Barra-Chicote, R. and Lopez-Ludena, V. and Ferreiros, J. and Yamagishi, J. and Montero, J.M.},
  title = {Towards an Unsupervised Speaking Style Voice Building Framework: Multi-Style Speaker Diarization},
  booktitle = {Proc. Interspeech},
  year = {2012},
  month = {September},
  address = {Portland, Oregon, USA},
  abstract = {Current text-to-speech systems are developed using studio-recorded speech in a neutral style or based on acted emotions. However, the proliferation of media sharing sites would allow developing a new generation of speech-based systems which could cope with spontaneous and styled speech. This paper proposes an architecture to deal with realistic recordings and carries out some experiments on unsupervised speaker diarization. In order to maximize the speaker purity of the clusters while keeping a high speaker coverage, the paper evaluates the F-measure of a diarization module, achieving high scores (>85%) especially when the clusters are longer than 30 seconds, even for the more spontaneous and expressive styles (such as talk shows or sports).}
}
@incollection{sarah:hts09,
  editor = {Mullennix, John W. and Stern, Steven E.},
  author = {Creer, Sarah and Green, Phil and Cunningham, Stuart and Yamagishi, Junichi},
  publisher = {IGI Global},
  title = {Building personalised synthesised voices for individuals with dysarthria using the {HTS} toolkit},
  booktitle = {Computer Synthesized Speech Technologies: Tools for Aiding Impairment},
  note = {in press},
  edition = {1st},
  year = {2009},
  abstract = {When the speech of an individual becomes unintelligible due to a neurological disorder, a synthesized voice can replace that of the individual. To fully replace all functions of human speech communication: communication of information, maintenance of social relationships and displaying identity, the voice must be intelligible, natural-sounding and retain the vocal identity of the speaker. For speakers with dysarthria, achieving this output with minimal data recordings and deteriorating speech is difficult. An alternative is to use hidden Markov models (HMMs), which require much less speech data than concatenative methods, to adapt a robust statistical model of speech towards the speaker characteristics captured in the data recorded by the individual. This chapter implements this technique using the HTS toolkit to build personalized synthetic voices for two individuals with dysarthria. An evaluation of the voices by the participants themselves suggests that this technique shows promise for building and reconstructing personalized voices for individuals with dysarthria once deterioration has begun.}
}
@article{tuomo:ieee2011,
  author = {Raitio, T. and Suni, A. and Yamagishi, J. and Pulakka, H. and Nurminen, J. and Vainio, M. and Alku, P.},
  doi = {10.1109/TASL.2010.2045239},
  title = {{HMM}-Based Speech Synthesis Utilizing Glottal Inverse Filtering},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {1},
  abstract = {This paper describes a hidden Markov model (HMM)-based speech synthesizer that utilizes glottal inverse filtering for generating natural sounding synthetic speech. In the proposed method, speech is first decomposed into the glottal source signal and the model of the vocal tract filter through glottal inverse filtering, and thus parametrized into excitation and spectral features. The source and filter features are modeled individually in the framework of HMM and generated in the synthesis stage according to the text input. The glottal excitation is synthesized through interpolating and concatenating natural glottal flow pulses, and the excitation signal is further modified according to the spectrum of the desired voice source characteristics. Speech is synthesized by filtering the reconstructed source signal with the vocal tract filter. Experiments show that the proposed system is capable of generating natural sounding speech, and the quality is clearly better compared to two HMM-based speech synthesis systems based on widely used vocoder techniques.},
  month = {January},
  volume = {19},
  year = {2011},
  keywords = {Glottal inverse filtering , hidden Markov model (HMM) , speech synthesis},
  pages = {153--165}
}
@inproceedings{Ayletetal09,
  author = {Aylett, Matthew P. and King, Simon and Yamagishi, Junichi},
  title = {Speech Synthesis Without a Phone Inventory},
  booktitle = {Proc. Interspeech},
  abstract = {In speech synthesis the unit inventory is decided using phonological and phonetic expertise. This process is resource intensive and potentially sub-optimal. In this paper we investigate how acoustic clustering, together with lexicon constraints, can be used to build a self-organised inventory. Six English speech synthesis systems were built using two frameworks, unit selection and parametric HTS for three inventory conditions: 1) a traditional phone set, 2) a system using orthographic units, and 3) a self-organised inventory. A listening test showed a strong preference for the classic system, and for the orthographic system over the self-organised system. Results also varied by letter to sound complexity and database coverage. This suggests the self-organised approach failed to generalise pronunciation as well as introducing noise above and beyond that caused by orthographic sound mismatch.},
  address = {Brighton, U.K.},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/ma_interspeech09.pdf},
  pages = {2087--2090},
  categories = {speech synthesis, unit selection, parametric synthesis, phone inventory, orthographic synthesis}
}
@inproceedings{phillip:icassp2010,
  author = {De Leon, P. L. and Apsingekar, V. R. and Pucher, M. and Yamagishi, J.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/main_r2.pdf},
  booktitle = {{Proc. ICASSP 2010}},
  year = {2010},
  title = {Revisiting the security of speaker verification systems against imposture using synthetic speech},
  address = {Dallas, Texas, USA}
}
@inproceedings{lingvowel,
  author = {Ling, Zhenhua and Richmond, Korin and Yamagishi, Junichi},
  title = {Vowel Creation by Articulatory Control in {HMM}-based Parametric Speech Synthesis},
  booktitle = {Proc. The Listening Talker Workshop},
  year = {2012},
  month = {May},
  address = {Edinburgh, UK},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/Ling_etal_LISTA.pdf},
  pages = {72}
}
@article{JunichiHTS06,
  author = {Yamagishi, Junichi and Kobayashi, Takao},
  title = {Average-Voice-based Speech Synthesis using HSMM-based Speaker Adaptation and Adaptive Training},
  journal = {IEICE Trans. Information and Systems},
  number = {2},
  abstract = {In speaker adaptation for speech synthesis, it is desirable to convert both voice characteristics and prosodic features such as F0 and phone duration. For simultaneous adaptation of spectrum, F0 and phone duration within the HMM framework, we need to transform not only the state output distributions corresponding to spectrum and F0 but also the duration distributions corresponding to phone duration. However, it is not straightforward to adapt the state duration because the original HMM does not have explicit duration distributions. Therefore, we utilize the framework of the hidden semi-Markov model (HSMM), which is an HMM having explicit state duration distributions, and we apply an HSMM-based model adaptation algorithm to simultaneously transform both the state output and state duration distributions. Furthermore, we propose an HSMM-based adaptive training algorithm to simultaneously normalize the state output and state duration distributions of the average voice model. We incorporate these techniques into our HSMM-based speech synthesis system, and show their effectiveness from the results of subjective and objective evaluation tests.},
  month = {February},
  volume = {E90-D},
  year = {2007},
  pages = {533--543}
}
@inproceedings{watts_yamagishi_king_2011,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {Unsupervised continuous-valued word features for phrase-break prediction without a part-of-speech tagger},
  booktitle = {Proc. Interspeech},
  year = {2011},
  abstract = {Part of speech (POS) tags are foremost among the features conventionally used to predict intonational phrase-breaks for text to speech (TTS) conversion. The construction of such systems therefore presupposes the availability of a POS tagger for the relevant language, or of a corpus manually tagged with POS. However, such tools and resources are not available in the majority of the world’s languages, and manually labelling text with POS tags is an expensive and time-consuming process. We therefore propose the use of continuous-valued features that summarise the distributional characteristics of word types as surrogates for POS features. Importantly, such features are obtained in an unsupervised manner from an untagged text corpus. We present results on the phrase-break prediction task, where use of the features closes the gap in performance between a baseline system (using only basic punctuation-related features) and a topline system (incorporating a state-of-the-art POS tagger).},
  month = {August},
  address = {Florence, Italy},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
  pages = {2157--2160}
}
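The entry above relies on continuous-valued word features learned from untagged text as surrogates for POS tags. One common way to obtain such distributional features (not necessarily the paper's exact recipe) is to reduce a word co-occurrence matrix with an SVD, as in the hypothetical Python sketch below; the tiny corpus, window size and dimensionality are placeholders. In a phrase-break predictor these vectors would simply replace the POS features.

# Rough sketch: continuous word features from co-occurrence counts + SVD.
import numpy as np

def word_features(sentences, dim=4, window=1):
    vocab = sorted({w for s in sentences for w in s})
    index = {w: i for i, w in enumerate(vocab)}
    counts = np.zeros((len(vocab), len(vocab)))
    for s in sentences:
        for i, w in enumerate(s):
            for j in range(max(0, i - window), min(len(s), i + window + 1)):
                if j != i:
                    counts[index[w], index[s[j]]] += 1.0
    # log-scaled co-occurrence counts reduced by SVD give dense word vectors
    U, S, _ = np.linalg.svd(np.log1p(counts), full_matrices=False)
    feats = U[:, :dim] * S[:dim]
    return {w: feats[index[w]] for w in vocab}

if __name__ == "__main__":
    corpus = [
        "the cat sat on the mat".split(),
        "the dog sat on the rug".split(),
        "a cat and a dog ran".split(),
    ]
    for word, vector in word_features(corpus).items():
        print(f"{word:>4}", np.round(vector, 2))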
@inproceedings{Cassia_IS11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and King, Simon},
  title = {Can Objective Measures Predict the Intelligibility of Modified {HMM}-based Synthetic Speech in Noise?},
  booktitle = {Proc. Interspeech},
  month = {August},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_IS11.pdf},
  abstract = {{Synthetic speech can be modified to improve intelligibility in noise. In order to perform modifications automatically, it would be useful to have an objective measure that could predict the intelligibility of modified synthetic speech for human listeners. We analysed the impact on intelligibility – and on how well objective measures predict it – when we separately modify speaking rate, fundamental frequency, line spectral pairs and spectral peaks. Shifting LSPs can increase intelligibility for human listeners; other modifications had weaker effects. Among the objective measures we evaluated, the Dau model and the Glimpse proportion were the best predictors of human performance.}},
  categories = {HMM-based speech synthesis, objective measures of intelligibility}
}
@inproceedings{robust-hts,
  author = {Yamagishi, Junichi and Ling, Zhenhua and King, Simon},
  title = {Robustness of HMM-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2008},
  year = {2008},
  abstract = {As speech synthesis techniques become more advanced, we are able to consider building high-quality voices from data collected outside the usual highly-controlled recording studio environment. This presents new challenges that are not present in conventional text-to-speech synthesis: the available speech data are not perfectly clean, the recording conditions are not consistent, and/or the phonetic balance of the material is not ideal. Although a clear picture of the performance of various speech synthesis techniques (e.g., concatenative, HMM-based or hybrid) under good conditions is provided by the Blizzard Challenge, it is not well understood how robust these algorithms are to less favourable conditions. In this paper, we analyse the performance of several speech synthesis methods under such conditions. This is, as far as we know, a new research topic: ``Robust speech synthesis.'' As a consequence of our investigations, we propose a new robust training method for HMM-based speech synthesis for use with speech data collected in unfavourable conditions.},
  month = {September},
  key = {robust-hts},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/RobustnessHTS.pdf},
  pages = {581--584},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice, unit selection}
}
@inproceedings{5947440,
  author = {De Leon, P.L. and Hernaez, I. and Saratxaga, I. and Pucher, M. and Yamagishi, J.},
  doi = {10.1109/ICASSP.2011.5947440},
  title = {Detection of synthetic speech for the problem of imposture},
  booktitle = {Proc. ICASSP},
  issn = {1520-6149},
  abstract = {In this paper, we present new results from our research into the vulnerability of a speaker verification (SV) system to synthetic speech. We use an HMM-based speech synthesizer, which creates synthetic speech for a targeted speaker through adaptation of a background model, and both GMM-UBM and support vector machine (SVM) SV systems. Using 283 speakers from the Wall Street Journal (WSJ) corpus, our SV systems have a 0.35% EER. When the systems are tested with synthetic speech generated from speaker models derived from the WSJ corpus, over 91% of the matched claims are accepted. We propose the use of relative phase shift (RPS) in order to detect synthetic speech and develop a GMM-based synthetic speech classifier (SSC). Using the SSC, we are able to correctly classify human speech in 95% of tests and synthetic speech in 88% of tests, thus significantly reducing the vulnerability.},
  month = {May},
  year = {2011},
  keywords = {EER;GMM-UBM;GMM-based synthetic speech classifier;HMM-based speech synthesizer;RPS;SSC;SV system;WSJ corpus;Wall-Street Journal corpus;relative phase shift;speaker verification system;support vector machine;hidden Markov models;speaker recognition;speech synthesis;support vector machines;},
  pages = {4844--4847}
}
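The synthetic speech classifier described above compares class-conditional GMM likelihoods. The Python sketch below shows that decision rule with scikit-learn; the random feature vectors merely stand in for the relative phase shift (RPS) features used in the paper, and the component counts are assumptions for this sketch.

# Sketch of a GMM likelihood-comparison classifier (human vs. synthetic speech).
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(2)
human_feats = rng.normal(loc=0.0, scale=1.0, size=(500, 20))   # placeholder "human" features
synth_feats = rng.normal(loc=0.8, scale=1.2, size=(500, 20))   # placeholder "synthetic" features

gmm_human = GaussianMixture(n_components=8, covariance_type="diag",
                            random_state=0).fit(human_feats)
gmm_synth = GaussianMixture(n_components=8, covariance_type="diag",
                            random_state=0).fit(synth_feats)

def classify(utterance_feats):
    """Label an utterance (frames x dims) by the higher average log-likelihood."""
    ll_human = gmm_human.score(utterance_feats)    # mean log-likelihood per frame
    ll_synth = gmm_synth.score(utterance_feats)
    return "human" if ll_human > ll_synth else "synthetic"

test_utterance = rng.normal(loc=0.8, scale=1.2, size=(200, 20))  # synthetic-like test data
print(classify(test_utterance))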
@article{nose07:mrhsmm,
  author = {Nose, Takashi and Yamagishi, Junichi and Kobayashi, Takao},
  title = {A Style Control Technique for {HMM}-based Expressive Speech Synthesis},
  url = {http://search.ieice.org/bin/summary.php?id=e90-d_9_1406&category=D&lang=E&year=2007&abst=},
  journal = {IEICE Trans. Information and Systems},
  number = {9},
  abstract = {This paper describes a technique for controlling the degree of expressivity of a desired emotional expression and/or speaking style of synthesized speech in an HMM-based speech synthesis framework. With this technique, multiple emotional expressions and speaking styles of speech are modeled in a single model by using a multiple-regression hidden semi-Markov model (MRHSMM). A set of control parameters, called the style vector, is defined, and each speech synthesis unit is modeled by using the MRHSMM, in which mean parameters of the state output and duration distributions are expressed by multiple-regression of the style vector. In the synthesis stage, the mean parameters of the synthesis units are modified by transforming an arbitrarily given style vector that corresponds to a point in a low-dimensional space, called style space, each of whose coordinates represents a certain specific speaking style or emotion of speech. The results of subjective evaluation tests show that style and its intensity can be controlled by changing the style vector},
  month = {September},
  volume = {E90-D},
  year = {2007},
  pages = {1406--1413},
  categories = {HMM-based speech synthesis, speaking style, emotional expression, style interpolation, hidden semi-Markov model (HSMM)}
}
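At its core, the style control described above expresses mean parameters as a multiple regression on a style vector, so scaling the vector scales the expressive strength. The numpy sketch below illustrates only that linear relationship; the dimensionalities, style axes and regression matrix are invented for illustration, and this is not the MRHSMM implementation itself.

# Toy sketch of multiple-regression style control: mu(style) = H [1; style].
import numpy as np

rng = np.random.default_rng(3)
dim_mean, dim_style = 5, 2                        # e.g. hypothetical (reading, joyful) style axes
H = rng.normal(size=(dim_mean, dim_style + 1))    # regression matrix including a bias column

def state_mean(style_vector):
    s = np.concatenate(([1.0], style_vector))     # prepend bias term
    return H @ s

neutral = state_mean([0.0, 0.0])
joyful = state_mean([0.0, 1.0])
very_joyful = state_mean([0.0, 1.5])              # extrapolate the style strength beyond 1
print(np.round(joyful - neutral, 3))
print(np.round(very_joyful - neutral, 3))         # exactly 1.5x the joyful offset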
@article{Ximera06,
  author = {Kawai, Hisashi and Toda, Tomoki and Yamagishi, Junichi and Hirai, Toshio and Ni, Jinfu and Nishizawa, Nobuyuki and Tsuzaki, Minoru and Tokuda, Keiichi},
  title = {XIMERA: a concatenative speech synthesis system with large scale corpora},
  journal = {IEICE Trans. Information and Systems},
  number = {12},
  month = {December},
  volume = {J89-D-II},
  year = {2006},
  pages = {2688--2698}
}
@article{ling2008,
  author = {Ling, Z. and Richmond, K. and Yamagishi, J. and Wang, R.},
  note = {\textbf{IEEE SPS 2010 Young Author Best Paper Award}},
  doi = {10.1109/TASL.2009.2014796},
  title = {Integrating Articulatory Features into {HMM}-based Parametric Speech Synthesis},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {6},
  abstract = {This paper presents an investigation of ways to integrate articulatory features into Hidden Markov Model (HMM)-based parametric speech synthesis, primarily with the aim of improving the performance of acoustic parameter generation. The joint distribution of acoustic and articulatory features is estimated during training and is then used for parameter generation at synthesis time in conjunction with a maximum-likelihood criterion. Different model structures are explored to allow the articulatory features to influence acoustic modeling: model clustering, state synchrony and cross-stream feature dependency. The results of objective evaluation show that the accuracy of acoustic parameter prediction can be improved when shared clustering and asynchronous-state model structures are adopted for combined acoustic and articulatory features. More significantly, our experiments demonstrate that modeling the dependency between these two feature streams can make speech synthesis more flexible. The characteristics of synthetic speech can be easily controlled by modifying generated articulatory features as part of the process of acoustic parameter generation.},
  month = {August},
  volume = {17},
  key = {ling2008},
  year = {2009},
  pages = {1171--1185},
  categories = {Speech synthesis, articulation, HMM-based synthesis}
}
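The integration described above rests on a joint acoustic-articulatory distribution from which acoustics can be generated conditioned on (possibly modified) articulatory features. As a stripped-down illustration of just that dependency, the numpy sketch below conditions a single joint Gaussian of stacked articulatory and acoustic features; the dimensions and synthetic data are invented, and the real system uses context-dependent HMM streams rather than one global Gaussian.

# Sketch: condition a joint Gaussian of [articulatory x, acoustic y] to predict
# how changing articulation shifts the expected acoustics.
import numpy as np

rng = np.random.default_rng(4)
n, dx, dy = 5000, 3, 4
x = rng.normal(size=(n, dx))                      # placeholder articulatory features
A_true = rng.normal(size=(dy, dx))
y = x @ A_true.T + 0.1 * rng.normal(size=(n, dy)) # placeholder acoustics with a linear dependency

z = np.hstack([x, y])                             # fit one joint Gaussian over z = [x, y]
mu = z.mean(axis=0)
cov = np.cov(z, rowvar=False)
mu_x, mu_y = mu[:dx], mu[dx:]
C_xx, C_yx = cov[:dx, :dx], cov[dx:, :dx]

def acoustics_given_articulation(x_new):
    """Conditional mean E[y | x] of the joint Gaussian."""
    return mu_y + C_yx @ np.linalg.solve(C_xx, x_new - mu_x)

baseline = acoustics_given_articulation(np.zeros(dx))
modified = acoustics_given_articulation(np.array([0.5, 0.0, 0.0]))  # nudge one articulatory dim
print(np.round(modified - baseline, 3))           # predicted acoustic change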
@inproceedings{child_synthesis_2009,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon and Berkling, Kay},
  title = {{HMM} Adaptation and Voice Conversion for the Synthesis of Child Speech: A Comparison},
  booktitle = {Proc. Interspeech 2009},
  year = {2009},
  abstract = {This study compares two different methodologies for producing data-driven synthesis of child speech from existing systems that have been trained on the speech of adults. On one hand, an existing statistical parametric synthesiser is transformed using model adaptation techniques, informed by linguistic and prosodic knowledge, to the speaker characteristics of a child speaker. This is compared with the application of voice conversion techniques to convert the output of an existing waveform concatenation synthesiser with no explicit linguistic or prosodic knowledge. In a subjective evaluation of the similarity of synthetic speech to natural speech from the target speaker, the HMM-based systems evaluated are generally preferred, although this is at least in part due to the higher dimensional acoustic features supported by these techniques.},
  month = {September},
  address = {Brighton, U.K.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/child_synthesis_2009.pdf},
  pages = {2627--2630}
}
@article{anderssonyamagishi12,
  author = {Andersson, S. and Yamagishi, J. and Clark, R.A.J.},
  doi = {10.1016/j.specom.2011.08.001},
  title = {Synthesis and Evaluation of Conversational Characteristics in {HMM}-Based Speech Synthesis},
  journal = {Speech Communication},
  number = {2},
  abstract = {Spontaneous conversational speech has many characteristics that are currently not modelled well by HMM-based speech synthesis and in order to build synthetic voices that can give an impression of someone partaking in a conversation, we need to utilise data that exhibits more of the speech phenomena associated with conversations than the more generally used carefully read aloud sentences. In this paper we show that synthetic voices built with HMM-based speech synthesis techniques from conversational speech data, preserved segmental and prosodic characteristics of frequent conversational speech phenomena. An analysis of an evaluation investigating the perception of quality and speaking style of HMM-based voices confirms that speech with conversational characteristics are instrumental for listeners to perceive successful integration of conversational speech phenomena in synthetic speech. The achieved synthetic speech quality provides an encouraging start for the continued use of conversational speech in HMM-based speech synthesis.},
  volume = {54},
  year = {2012},
  pages = {175--188}
}
@inproceedings{yong:ssw7,
  author = {Guan, Yong and Tian, Jilei and Wu, Yi-Jian and Yamagishi, Junichi and Nurminen, Jani},
  title = {A Unified and Automatic Approach Of {M}andarin {HTS} System},
  booktitle = {{Proc. SSW7}},
  year = {2010},
  month = {September},
  address = {Kyoto, Japan},
  keywords = {HTS, speech synthesis, mandarin},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/HTS_Yong_ssw7.pdf}
}
@article{Dines2011,
  author = {Dines, John and Liang, Hui and Saheer, Lakshmi and Gibson, Matthew and Byrne, William and Oura, Keiichiro and Tokuda, Keiichi and Yamagishi, Junichi and King, Simon and Wester, Mirjam and Hirsimäki, Teemu and Karhila, Reima and Kurimo, Mikko},
  doi = {10.1016/j.csl.2011.08.003},
  title = {Personalising speech-to-speech translation: Unsupervised cross-lingual speaker adaptation for {HMM}-based speech synthesis},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230811000441},
  journal = {Computer Speech and Language},
  issn = {0885-2308},
  number = {2},
  abstract = {In this paper we present results of unsupervised cross-lingual speaker adaptation applied to text-to-speech synthesis. The application of our research is the personalisation of speech-to-speech translation in which we employ a HMM statistical framework for both speech recognition and synthesis. This framework provides a logical mechanism to adapt synthesised speech output to the voice of the user by way of speech recognition. In this work we present results of several different unsupervised and cross-lingual adaptation approaches as well as an end-to-end speaker adaptive speech-to-speech translation system. Our experiments show that we can successfully apply speaker adaptation in both unsupervised and cross-lingual scenarios and our proposed algorithms seem to generalise well for several language pairs. We also discuss important future directions including the need for better evaluation metrics.},
  month = {February},
  volume = {27},
  year = {2013},
  keywords = {Speech-to-speech translation, Cross-lingual speaker adaptation, HMM-based speech synthesis, Speaker adaptation, Voice conversion},
  pages = {420--437}
}
@inproceedings{avss-icassp07,
  author = {Yamagishi, J. and Kobayashi, T. and Tachibana, M. and Ogata, K. and Nakano, Y.},
  booktitle = {Proc. ICASSP},
  year = {2007},
  pages = {1233--1236},
  abstract = {In human computer interaction and dialogue systems, it is often desirable for text-to-speech synthesis to be able to generate natural sounding speech with an arbitrary speaker's voice and with varying speaking styles and/or emotional expressions. We have developed an average-voice-based speech synthesis method using statistical average voice models and model adaptation techniques for this purpose. In this paper, we describe an overview of the speech synthesis system and show the current performance with several experimental results.},
  title = {Model adaptation approach to speech synthesis with diverse voices and styles}
}
@inproceedings{lei2011a,
  author = {Lei, Ming and Yamagishi, Junichi and Richmond, Korin and Ling, Zhen-Hua and King, Simon and Dai, Li-Rong},
  title = {Formant-controlled {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  year = {2011},
  abstract = {This paper proposes a novel framework that enables us to manipulate and control formants in HMM-based speech synthesis. In this framework, the dependency between formants and spectral features is modelled by piecewise linear transforms; formant parameters are effectively mapped by these to the means of Gaussian distributions over the spectral synthesis parameters. The spectral envelope features generated under the influence of formants in this way may then be passed to high-quality vocoders to generate the speech waveform. This provides two major advantages over conventional frameworks. First, we can achieve spectral modification by changing formants only in those parts where we want control, whereas the user must specify all formants manually in conventional formant synthesisers (e.g. Klatt). Second, this can produce high-quality speech. Our results show the proposed method can control vowels in the synthesized speech by manipulating F1 and F2 without any degradation in synthesis quality.},
  month = {August},
  address = {Florence, Italy},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110769.pdf},
  pages = {2777--2780},
  categories = {speech synthesis, hidden Markov model, formants, controllability}
}
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
  author = {Cabral, J. and Renals, S. and Richmond, K. and Yamagishi, J.},
  title = {Glottal Spectral Separation for Parametric Speech Synthesis},
  booktitle = {Proc. Interspeech},
  year = {2008},
  abstract = {This paper presents a method to control the characteristics of synthetic speech flexibly by integrating articulatory features into a Hidden Markov Model (HMM)-based parametric speech synthesis system. In contrast to model adaptation and interpolation approaches for speaking style control, this method is driven by phonetic knowledge, and target speech samples are not required. The joint distribution of parallel acoustic and articulatory features considering cross-stream feature dependency is estimated. At synthesis time, acoustic and articulatory features are generated simultaneously based on the maximum-likelihood criterion. The synthetic speech can be controlled flexibly by modifying the generated articulatory features according to arbitrary phonetic rules in the parameter generation process. Our experiments show that the proposed method is effective in both changing the overall character of synthesized speech and in controlling the quality of a specific vowel.},
  month = {September},
  key = {cabral:renals:richmond:yamagishi:2008a},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
  pages = {1829--1832},
  categories = {HMM speech synthesis, Glottal Spectral Separation, LF-model}
}
@inproceedings{zen07:hts-2,
  author = {Zen, Heiga and Nose, Takashi and Yamagishi, Junichi and Sako, Shinji and Masuko, Takashi and Black, Alan and Tokuda, Keiichi},
  title = {The {HMM}-based speech synthesis system ({HTS}) version 2.0},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  month = {August},
  year = {2007},
  abstract = {A statistical parametric speech synthesis system based on hidden Markov models (HMMs) has grown in popularity over the last few years. This system simultaneously models spectrum, excitation, and duration of speech using context-dependent HMMs and generates speech waveforms from the HMMs themselves. Since December 2002, we have publicly released an open-source software toolkit named HMM-based speech synthesis system (HTS) to provide a research and development platform for the speech synthesis community. In December 2006, HTS version 2.0 was released. This version includes a number of new features which are useful for both speech synthesis researchers and developers. This paper describes HTS version 2.0 in detail, as well as future release plans.},
  categories = {HMM, speech synthesis, HTS}
}
@article{john:ieee2011,
  author = {Dines, J. and Yamagishi, J. and King, S.},
  doi = {10.1109/JSTSP.2010.2079315},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  journal = {IEEE Journal of Selected Topics in Signal Processing},
  note = {(in press)},
  year = {2011},
  keywords = {Acoustics, Adaptation model, Context modeling, Hidden Markov models, Speech, Speech recognition, Training, speech recognition, speech synthesis, unified models},
  abstract = {The EMIME European project is conducting research in the development of technologies for mobile, personalised speech-to-speech translation systems. The hidden Markov model (HMM) is being used as the underlying technology in both automatic speech recognition (ASR) and text-to-speech synthesis (TTS) components, thus, the investigation of unified statistical modelling approaches has become an implicit goal of our research. As one of the first steps towards this goal, we have been investigating commonalities and differences between HMM-based ASR and TTS. In this paper we present results and analysis of a series of experiments that have been conducted on English ASR and TTS systems measuring their performance with respect to phone set and lexicon; acoustic feature type and dimensionality; HMM topology; and speaker adaptation. Our results show that, although the fundamental statistical model may be essentially the same, optimal ASR and TTS performance often demands diametrically opposed system designs. This represents a major challenge to be addressed in the investigation of such unified modelling approaches.}
}
@inproceedings{Dall_Veaux_Yamagishi_King_Interspeech2012,
  author = {Dall, Rasmus and Veaux, Christophe and Yamagishi, Junichi and King, Simon},
  title = {Analysis of speaker clustering techniques for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Dall_Veaux_Yamagishi_King_Interspeech2012.pdf},
  abstract = {This paper describes a method for speaker clustering, with the application of building average voice models for speaker-adaptive HMM-based speech synthesis that are a good basis for adapting to specific target speakers. Our main hypothesis is that using perceptually similar speakers to build the average voice model will be better than use unselected speakers, even if the amount of data available from perceptually similar speakers is smaller. We measure the perceived similarities among a group of 30 female speakers in a listening test and then apply multiple linear regression to automatically predict these listener judgements of speaker similarity and thus to identify similar speakers automatically. We then compare a variety of average voice models trained on either speakers who were perceptually judged to be similar to the target speaker, or speakers selected by the multiple linear regression, or a large global set of unselected speakers. We find that the average voice model trained on perceptually similar speakers provides better performance than the global model, even though the latter is trained on more data, confirming our main hypothesis. However, the average voice model using speakers selected automatically by the multiple linear regression does not reach the same level of performance.},
  categories = {Statistical parametric speech synthesis, hidden Markov models, speaker adaptation}
}
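
The clustering study above predicts listener judgements of speaker similarity with multiple linear regression and then selects speakers for the average voice model. A toy sketch of that selection step, using scikit-learn and invented predictor features and scores (the paper's actual predictors and listening-test data are not reproduced here), might look like this:

# Sketch of predicting perceptual similarity with multiple linear regression and
# selecting the most similar speakers; data and feature names are hypothetical.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(1)
n_speakers = 30

# Hypothetical per-speaker predictors, e.g. distances to the target speaker in
# spectral, mean-F0 and speaking-rate terms.
X = rng.normal(size=(n_speakers, 3))
# Hypothetical mean listener similarity scores for those speakers.
y = 3.0 - 0.8 * X[:, 0] - 0.3 * X[:, 1] + 0.1 * rng.normal(size=n_speakers)

model = LinearRegression().fit(X, y)
predicted = model.predict(X)

# Keep the speakers predicted to be most similar for average voice training.
selected = np.argsort(predicted)[::-1][:10]
print("selected speaker indices:", selected)
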
@inproceedings{JaimeIS2012,
  author = {Lorenzo-Trueba, Jaime and Barra-Chicote, Roberto and Raitio, Tuomo and Obin, Nicolas and Alku, Paavo and Yamagishi, Junichi and Montero, Juan M},
  title = {Towards Glottal Source Controllability in Expressive Speech Synthesis},
  booktitle = {Proc. Interspeech},
  year = {2012},
  month = {September},
  address = {Portland, Oregon, USA},
  abstract = {In order to obtain more human-like sounding human-machine interfaces we must first be able to give them expressive capabilities in the way of emotional and stylistic features so as to closely adequate them to the intended task. If we want to replicate those features it is not enough to merely replicate the prosodic information of fundamental frequency and speaking rhythm. The proposed additional layer is the modification of the glottal model, for which we make use of the GlottHMM parameters. This paper analyzes the viability of such an approach by verifying that the expressive nuances are captured by the aforementioned features, obtaining 95% recognition rates on styled speaking and 82% on emotional speech. Then we evaluate the effect of speaker bias and recording environment on the source modeling in order to quantify possible problems when analyzing multi-speaker databases. Finally we propose a speaking styles separation for Spanish based on prosodic features and check its perceptual significance.}
}
@inproceedings{hts-child-oliver,
  author = {Watts, Oliver and Yamagishi, Junichi and Berkling, Kay and King, Simon},
  title = {{HMM}-based synthesis of child speech},
  booktitle = {Proc. 1st Workshop on Child, Computer and Interaction (ICMI'08 post-conference workshop)},
  year = {2008},
  month = {October},
  key = {hts-child-oliver},
  address = {Crete, Greece},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/child-hts-oliver.pdf},
  abstract = {The synthesis of child speech presents challenges both in the collection of data and in the building of a synthesiser from that data. Because only limited data can be collected, and the domain of that data is constrained, it is difficult to obtain the type of phonetically-balanced corpus usually used in speech synthesis. As a consequence, building a synthesiser from this data is difficult. Concatenative synthesisers are not robust to corpora with many missing units (as is likely when the corpus content is not carefully designed), so we chose to build a statistical parametric synthesiser using the HMM-based system HTS. This technique has previously been shown to perform well for limited amounts of data, and for data collected under imperfect conditions. We compared 6 different configurations of the synthesiser, using both speaker-dependent and speaker-adaptive modelling techniques, and using varying amounts of data. The output from these systems was evaluated alongside natural and vocoded speech, in a Blizzard-style listening test.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice, child speech}
}
@article{Oura2012703,
  author = {Oura, Keiichiro and Yamagishi, Junichi and Wester, Mirjam and King, Simon and Tokuda, Keiichi},
  doi = {10.1016/j.specom.2011.12.004},
  title = {Analysis of unsupervised cross-lingual speaker adaptation for {HMM}-based speech synthesis using {KLD}-based transform mapping},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639311001774},
  journal = {Speech Communication},
  issn = {0167-6393},
  number = {6},
  abstract = {In the EMIME project, we developed a mobile device that performs personalized speech-to-speech translation such that a user's spoken input in one language is used to produce spoken output in another language, while continuing to sound like the user's voice. We integrated two techniques into a single architecture: unsupervised adaptation for HMM-based TTS using word-based large-vocabulary continuous speech recognition, and cross-lingual speaker adaptation (CLSA) for HMM-based TTS. The CLSA is based on a state-level transform mapping learned using minimum Kullback-Leibler divergence between pairs of HMM states in the input and output languages. Thus, an unsupervised cross-lingual speaker adaptation system was developed. End-to-end speech-to-speech translation systems for four languages (English, Finnish, Mandarin, and Japanese) were constructed within this framework. In this paper, the English-to-Japanese adaptation is evaluated. Listening tests demonstrate that adapted voices sound more similar to a target speaker than average voices and that differences between supervised and unsupervised cross-lingual speaker adaptation are small. Calculating the KLD state-mapping on only the first 10 mel-cepstral coefficients leads to huge savings in computational costs, without any detrimental effect on the quality of the synthetic speech.},
  volume = {54},
  year = {2012},
  keywords = {HMM-based speech synthesis, Unsupervised speaker adaptation, Cross-lingual speaker adaptation, Speech-to-speech translation},
  pages = {703--714}
}
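
The key step reported above is the KLD-based state mapping: each input-language HMM state is linked to the output-language state with the smallest Kullback-Leibler divergence, computed on a handful of low-order mel-cepstral coefficients. The sketch below assumes diagonal-covariance Gaussian state output distributions, a symmetrised divergence and random toy statistics; it illustrates the mapping idea and is not the project's code.

# KLD-based state mapping sketch with toy statistics.
import numpy as np

rng = np.random.default_rng(2)
dim = 10   # e.g. only the first 10 mel-cepstral coefficients, as in the paper

def kl_diag(mu_p, var_p, mu_q, var_q):
    """KL(p || q) between diagonal-covariance Gaussians."""
    return 0.5 * np.sum(np.log(var_q / var_p)
                        + (var_p + (mu_p - mu_q) ** 2) / var_q - 1.0)

# Hypothetical state statistics for input- and output-language average voice models.
n_in, n_out = 5, 8
mu_in = rng.normal(size=(n_in, dim))
var_in = rng.uniform(0.5, 2.0, size=(n_in, dim))
mu_out = rng.normal(size=(n_out, dim))
var_out = rng.uniform(0.5, 2.0, size=(n_out, dim))

mapping = {}
for i in range(n_in):
    divs = [kl_diag(mu_in[i], var_in[i], mu_out[j], var_out[j])
            + kl_diag(mu_out[j], var_out[j], mu_in[i], var_in[i])   # symmetrise
            for j in range(n_out)]
    mapping[i] = int(np.argmin(divs))
print(mapping)   # input-language state index -> closest output-language state
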
@inproceedings{cabral_ssw7,
  author = {Cabral, Jo{\~a}o and Renals, Steve and Richmond, Korin and Yamagishi, Junichi},
  title = {Transforming Voice Source Parameters in a {HMM}-based Speech Synthesiser with Glottal Post-Filtering},
  booktitle = {Proc. 7th ISCA Speech Synthesis Workshop (SSW7)},
  year = {2010},
  abstract = {Control over voice quality, e.g. breathy and tense voice, is important for speech synthesis applications. For example, transformations can be used to modify aspects of the voice related to speaker's identity and to improve expressiveness. However, it is hard to modify voice characteristics of the synthetic speech, without degrading speech quality. State-of-the-art statistical speech synthesisers, in particular, do not typically allow control over parameters of the glottal source, which are strongly correlated with voice quality. Consequently, the control of voice characteristics in these systems is limited. In contrast, the HMM-based speech synthesiser proposed in this paper uses an acoustic glottal source model. The system passes the glottal signal through a whitening filter to obtain the excitation of voiced sounds. This technique, called glottal post-filtering, allows to transform voice characteristics of the synthetic speech by modifying the source model parameters. We evaluated the proposed synthesiser in a perceptual experiment, in terms of speech naturalness, intelligibility, and similarity to the original speaker's voice. The results show that it performed as well as a HMM-based synthesiser, which generates the speech signal with a commonly used high-quality speech vocoder.},
  month = {September},
  address = {NICT/ATR, Kyoto, Japan},
  keywords = {HMM-based speech synthesis, voice quality, glottal post-filter},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/cabral_ssw7.pdf},
  pages = {365--370}
}
@inproceedings{tachibana07:styleclassify07,
  author = {Tachibana, Makoto and Kawashima, Keigo and Yamagishi, Junichi and Kobayashi, Takao},
  title = {Performance Evaluation of {HMM}-Based Style Classification with a Small Amount of Training Data},
  booktitle = {Proc. Interspeech 2007},
  month = {August},
  year = {2007},
  abstract = {This paper describes a classification technique for emotional expressions and speaking styles of speech using only a small amount of training data of a target speaker. We model spectral and fundamental frequency (F0) features simultaneously using multi-space probability distribution HMM (MSD-HMM), and adapt a speaker-independent neutral style model to a certain target speaker’s style model with a small amount of data using MSD-MLLR which is extended MLLR for MSD-HMM. We perform classification experiments for professional narrators’ speech and non-professional speakers' speech and evaluate the performance of proposed technique by comparing with other commonly used classifiers. We show that the proposed technique gives better result than the other classifiers when using a few sentences of target speaker’s style data.},
  categories = {emotion, speaking style, classification}
}
@article{Ling2010834,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi, Junichi},
  doi = {10.1016/j.specom.2010.06.006},
  title = {An Analysis of {HMM}-based prediction of articulatory movements},
  journal = {Speech Communication},
  issn = {0167-6393},
  number = {10},
  abstract = {This paper presents an investigation into predicting the movement of a speaker's mouth from text input using hidden Markov models (HMM). A corpus of human articulatory movements, recorded by electromagnetic articulography (EMA), is used to train HMMs. To predict articulatory movements for input text, a suitable model sequence is selected and a maximum-likelihood parameter generation (MLPG) algorithm is used to generate output articulatory trajectories. Unified acoustic-articulatory HMMs are introduced to integrate acoustic features when an acoustic signal is also provided with the input text. Several aspects of this method are analyzed in this paper, including the effectiveness of context-dependent modeling, the role of supplementary acoustic input, and the appropriateness of certain model structures for the unified acoustic-articulatory models. When text is the sole input, we find that fully context-dependent models significantly outperform monophone and quinphone models, achieving an average root mean square (RMS) error of 1.945 mm and an average correlation coefficient of 0.600. When both text and acoustic features are given as input to the system, the difference between the performance of quinphone models and fully context-dependent models is no longer significant. The best performance overall is achieved using unified acoustic-articulatory quinphone HMMs with separate clustering of acoustic and articulatory model parameters, a synchronous-state sequence, and a dependent-feature model structure, with an RMS error of 0.900 mm and a correlation coefficient of 0.855 on average. Finally, we also apply the same quinphone HMMs to the acoustic-articulatory, or inversion, mapping problem, where only acoustic input is available. An average root mean square (RMS) error of 1.076 mm and an average correlation coefficient of 0.812 are achieved. Taken together, our results demonstrate how text and acoustic inputs both contribute to the prediction of articulatory movements in the method used.},
  month = {October},
  volume = {52},
  year = {2010},
  keywords = {Hidden Markov model; Articulatory features; Parameter generation},
  pages = {834--846}
}
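
The objective results quoted above are the average RMS error (in millimetres) and correlation coefficient between predicted and measured articulator trajectories. For readers unfamiliar with the metrics, a small self-contained sketch on a synthetic one-channel trajectory (not EMA data from the paper) is:

# RMS error and correlation coefficient for predicted vs. reference trajectories.
import numpy as np

def rms_error(pred, ref):
    return float(np.sqrt(np.mean((pred - ref) ** 2)))

def correlation(pred, ref):
    return float(np.corrcoef(pred, ref)[0, 1])

# Toy 1-D trajectory standing in for one articulator channel (values in mm).
t = np.linspace(0.0, 1.0, 200)
ref = 10.0 * np.sin(2 * np.pi * 2 * t)
pred = ref + np.random.default_rng(3).normal(scale=1.0, size=t.shape)

print("RMS error (mm):", round(rms_error(pred, ref), 3))
print("correlation:   ", round(correlation(pred, ref), 3))
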
@inproceedings{Cassia_ICASSP11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and King, Simon},
  doi = {10.1109/ICASSP.2011.5947507},
  title = {Evaluation of objective measures for intelligibility prediction of {HMM}-based synthetic speech in noise},
  booktitle = {Proc. ICASSP},
  issn = {1520-6149},
  abstract = {{In this paper we evaluate four objective measures of speech with regards to intelligibility prediction of synthesized speech in diverse noisy situations. We evaluated three intelligibility measures, the Dau measure, the glimpse proportion and the Speech Intelligibility Index (SII) and a quality measure, the Perceptual Evaluation of Speech Quality (PESQ). For the generation of synthesized speech we used a state of the art HMM-based speech synthesis system. The noisy conditions comprised four additive noises. The measures were compared with subjective intelligibility scores obtained in listening tests. The results show the Dau and the glimpse measures to be the best predictors of intelligibility, with correlations of around 0.83 to subjective scores. All measures gave less accurate predictions of intelligibility for synthetic speech than have previously been found for natural speech; in particular the SII measure. In additional experiments, we processed the synthesized speech by an ideal binary mask before adding noise. The Glimpse measure gave the most accurate intelligibility predictions in this situation.}},
  month = {May},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_ICASSP11.pdf},
  pages = {5112--5115},
  categories = {HMM-based speech synthesis, objective measures of intelligibility}
}
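
The evaluation protocol in the entry above comes down to correlating each objective measure's output with subjective intelligibility from listening tests; the reported correlations of around 0.83 are coefficients of this kind. A minimal illustration with invented per-condition numbers (not the paper's data) is:

# Correlating an objective measure with subjective intelligibility (toy values).
import numpy as np
from scipy.stats import pearsonr

objective = np.array([0.21, 0.35, 0.48, 0.60, 0.72, 0.80])    # measure outputs
subjective = np.array([0.15, 0.30, 0.55, 0.58, 0.78, 0.85])   # listening-test scores

r, p = pearsonr(objective, subjective)
print("Pearson r =", round(r, 2), " p =", round(p, 3))
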
@inproceedings{leo_09-1,
  author = {Badino, Leonardo and Andersson, J. Sebastian and Yamagishi, Junichi and Clark, Robert A.J.},
  title = {Identification of Contrast and Its Emphatic Realization in {HMM}-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2009},
  year = {2009},
  month = {September},
  address = {Brighton, U.K.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090749.PDF},
  abstract = {The work presented in this paper proposes to identify contrast in the form of contrastive word pairs and prosodically signal it with emphatic accents in a Text-to-Speech (TTS) application using a Hidden-Markov-Model (HMM) based speech synthesis system. We first describe a novel method to automatically detect contrastive word pairs using textual features only and report its performance on a corpus of spontaneous conversations in English. Subsequently we describe the set of features selected to train a HMM-based speech synthesis system and attempting to properly control prosodic prominence (including emphasis). Results from a large scale perceptual test show that in the majority of cases listeners judge emphatic contrastive word pairs as acceptable as their non-emphatic counterpart, while emphasis on non-contrastive pairs is almost never acceptable.}
}
@misc{Hofer_Shimodaira:sigg:2007,
  author = {Hofer, Gregor and Shimodaira, Hiroshi and Yamagishi, Junichi},
  howpublished = {Poster at SIGGRAPH 2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/siggraph07.pdf},
  year = {2007},
  title = {Speech-driven Head Motion Synthesis based on a Trajectory Model},
  address = {San Diego, USA}
}
@article{yamagishi2009,
  author = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga and Ling, Zhenhua and Toda, Tomoki and Tokuda, Keiichi and King, Simon and Renals, Steve},
  title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech Synthesis},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5109758&arnumber=5153555&count=14&index=12},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {6},
  abstract = {This paper describes a speaker-adaptive HMM-based speech synthesis system. The new system, called ``HTS-2007,'' employs speaker adaptation (CSMAPLR+MAP), feature-space adaptive training, mixed-gender modeling, and full-covariance modeling using CSMAPLR transforms, in addition to several other techniques that have proved effective in our previous systems. Subjective evaluation results show that the new system generates significantly better quality synthetic speech than speaker-dependent approaches with realistic amounts of speech data, and that it bears comparison with speaker-dependent approaches even when large amounts of speech data are available. In addition, a comparison study with several speech synthesis techniques shows the new system is very robust: It is able to build voices from less-than-ideal speech data and synthesize good-quality speech even for out-of-domain sentences.},
  volume = {17},
  year = {2009},
  pages = {1208--1230}
}
@inproceedings{phillip:odyssey2010,
  author = {De Leon, P. L. and Pucher, M. and Yamagishi, J.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/main_v2.pdf},
  booktitle = {{Proc. Odyssey 2010 (The Speaker and Language Recognition Workshop)}},
  year = {2010},
  title = {Evaluation of the Vulnerability of Speaker Verification to Synthetic Speech},
  address = {Brno, Czech Republic}
}
@article{Stan2011442,
  author = {Stan, Adriana and Yamagishi, Junichi and King, Simon and Aylett, Matthew},
  doi = {10.1016/j.specom.2010.12.002},
  title = {The {R}omanian speech synthesis ({RSS}) corpus: Building a high quality {HMM}-based speech synthesis system using a high sampling rate},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639310002074},
  journal = {Speech Communication},
  issn = {0167-6393},
  number = {3},
  abstract = {This paper first introduces a newly-recorded high quality Romanian speech corpus designed for speech synthesis, called ``RSS'', along with Romanian front-end text processing modules and HMM-based synthetic voices built from the corpus. All of these are now freely available for academic use in order to promote Romanian speech technology research. The RSS corpus comprises 3500 training sentences and 500 test sentences uttered by a female speaker and was recorded using multiple microphones at 96 kHz sampling frequency in a hemianechoic chamber. The details of the new Romanian text processor we have developed are also given. Using the database, we then revisit some basic configuration choices of speech synthesis, such as waveform sampling frequency and auditory frequency warping scale, with the aim of improving speaker similarity, which is an acknowledged weakness of current HMM-based speech synthesisers. As we demonstrate using perceptual tests, these configuration choices can make substantial differences to the quality of the synthetic speech. Contrary to common practice in automatic speech recognition, higher waveform sampling frequencies can offer enhanced feature extraction and improved speaker similarity for HMM-based speech synthesis.},
  volume = {53},
  year = {2011},
  keywords = {Speech synthesis, HTS, Romanian, HMMs, Sampling frequency, Auditory scale},
  pages = {442--450}
}
@inproceedings{oura:icassp:10,
  author = {Oura, Keiichiro and Tokuda, Keiichi and Yamagishi, Junichi and Wester, Mirjam and King, Simon},
  title = {Unsupervised Cross-lingual Speaker Adaptation for {HMM}-based Speech Synthesis},
  booktitle = {Proc. ICASSP},
  abstract = {In the EMIME project, we are developing a mobile device that performs personalized speech-to-speech translation such that a user's spoken input in one language is used to produce spoken output in another language, while continuing to sound like the user's voice. We integrate two techniques, unsupervised adaptation for HMM-based TTS using a word-based large-vocabulary continuous speech recognizer and cross-lingual speaker adaptation for HMM-based TTS, into a single architecture. Thus, an unsupervised cross-lingual speaker adaptation system can be developed. Listening tests show very promising results, demonstrating that adapted voices sound similar to the target speaker and that differences between supervised and unsupervised cross-lingual speaker adaptation are small.},
  volume = {I},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/oura_icassp2010.pdf},
  pages = {4954--4957},
  categories = {speaker adaptation, TTS}
}
@inproceedings{cabral_yrwst,
  author = {Cabral, J. and Renals, S. and Richmond, K. and Yamagishi, J.},
  title = {{HMM}-based Speech Synthesis with an Acoustic Glottal Source Model},
  booktitle = {Proc. The First Young Researchers Workshop in Speech Technology},
  month = {April},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf},
  abstract = {A major cause of degradation of speech quality in HMM-based speech synthesis is the use of a simple delta pulse signal to generate the excitation of voiced speech. This paper describes a new approach to using an acoustic glottal source model in HMM-based synthesisers. The goal is to improve speech quality and parametric flexibility to better model and transform voice characteristics.},
  categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral Separation}
}
@article{Hashimoto2012857,
  author = {Hashimoto, Kei and Yamagishi, Junichi and Byrne, William and King, Simon and Tokuda, Keiichi},
  doi = {10.1016/j.specom.2012.02.004},
  title = {Impacts of machine translation and speech synthesis on speech-to-speech translation},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639312000283},
  journal = {Speech Communication},
  issn = {0167-6393},
  number = {7},
  abstract = {This paper analyzes the impacts of machine translation and speech synthesis on speech-to-speech translation systems. A typical speech-to-speech translation system consists of three components: speech recognition, machine translation and speech synthesis. Many techniques have been proposed for integration of speech recognition and machine translation. However, corresponding techniques have not yet been considered for speech synthesis. The focus of the current work is machine translation and speech synthesis, and we present a subjective evaluation designed to analyze their impact on speech-to-speech translation. The results of these analyses show that the naturalness and intelligibility of the synthesized speech are strongly affected by the fluency of the translated sentences. In addition, several features were found to correlate well with the average fluency of the translated sentences and the average naturalness of the synthesized speech.},
  volume = {54},
  year = {2012},
  keywords = {Speech-to-speech translation, Machine translation, Speech synthesis, Subjective evaluation},
  pages = {857--866}
}
@article{6205335,
  author = {De Leon, P. L. and Pucher, M. and Yamagishi, J. and Hernaez, I. and Saratxaga, I.},
  doi = {10.1109/TASL.2012.2201472},
  title = {Evaluation of Speaker Verification Security and Detection of {HMM}-Based Synthetic Speech},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  issn = {1558-7916},
  number = {8},
  abstract = {In this paper, we evaluate the vulnerability of speaker verification (SV) systems to synthetic speech. The SV systems are based on either the Gaussian mixture model–universal background model (GMM-UBM) or support vector machine (SVM) using GMM supervectors. We use a hidden Markov model (HMM)-based text-to-speech (TTS) synthesizer, which can synthesize speech for a target speaker using small amounts of training data through model adaptation of an average voice or background model. Although the SV systems have a very low equal error rate (EER), when tested with synthetic speech generated from speaker models derived from the Wall Street Journal (WSJ) speech corpus, over 81% of the matched claims are accepted. This result suggests vulnerability in SV systems and thus a need to accurately detect synthetic speech. We propose a new feature based on relative phase shift (RPS), demonstrate reliable detection of synthetic speech, and show how this classifier can be used to improve security of SV systems.},
  month = {October},
  volume = {20},
  year = {2012},
  pages = {2280--2290}
}
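
Two of the quantities discussed above, the equal error rate of the verification systems and the acceptance rate for synthetic-speech claims, are threshold statistics over score distributions. The following sketch shows how an EER can be computed from genuine and impostor scores; the score distributions are simulated and do not reproduce the paper's results.

# EER from verification scores (simulated scores, illustrative only).
import numpy as np

def eer(genuine_scores, impostor_scores):
    thresholds = np.sort(np.concatenate([genuine_scores, impostor_scores]))
    best = None
    for th in thresholds:
        far = np.mean(impostor_scores >= th)   # false acceptance rate
        frr = np.mean(genuine_scores < th)     # false rejection rate
        gap = abs(far - frr)
        if best is None or gap < best[0]:
            best = (gap, (far + frr) / 2.0)
    return best[1]

rng = np.random.default_rng(5)
genuine = rng.normal(2.0, 1.0, size=500)    # target-speaker trial scores
impostor = rng.normal(0.0, 1.0, size=500)   # impostor trial scores
print("EER:", round(eer(genuine, impostor), 3))
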
@inproceedings{CassiaSAPA12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Evaluating speech intelligibility enhancement for {HMM}-based synthetic speech in noise}},
  booktitle = {Proc. SAPA Workshop},
  address = {Portland, USA},
  month = {September},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Sapa12.pdf},
  abstract = {It is possible to increase the intelligibility of speech in noise by enhancing the clean speech signal. In this paper we demonstrate the effects of modifying the spectral envelope of synthetic speech according to the environmental noise. To achieve this, we modify Mel cepstral coefficients according to an intelligibility measure that accounts for glimpses of speech in noise: the Glimpse Proportion measure. We evaluate this method against a baseline synthetic voice trained only with normal speech and a topline voice trained with Lombard speech, as well as natural speech. The intelligibility of these voices was measured when mixed with speech-shaped noise and with a competing speaker at three different levels. The Lombard voices, both natural and synthetic, were more intelligible than the normal voices in all conditions. For speech-shaped noise, the proposed modified voice was as intelligible as the Lombard synthetic voice without requiring any recordings of Lombard speech, which are hard to obtain. However, in the case of competing talker noise, the Lombard synthetic voice was more intelligible than the proposed modified voice.},
  categories = {HMM-based speech synthesis, intelligibility enhancement}
}
@inproceedings{jyamagis:emime,
  author = {Yamagishi, Junichi and Lincoln, Mike and King, Simon and Dines, John and Gibson, Matthew and Tian, Jilei and Guan, Yong},
  title = {Analysis of Unsupervised and Noise-Robust Speaker-Adaptive {HMM}-Based Speech Synthesis Systems toward a Unified {ASR} and {TTS} Framework},
  booktitle = {Proc. Interspeech 2009},
  year = {2009},
  month = {September},
  address = {Brighton, U.K.},
  abstract = {For the 2009 Blizzard Challenge we have built an unsupervised version of the HTS-2008 speaker-adaptive HMM-based speech synthesis system for English, and a noise robust version of the systems for Mandarin. They are designed from a multidisciplinary application point of view in that we attempt to integrate the components of the TTS system with other technologies such as ASR. All the average voice models are trained exclusively from recognized, publicly available, ASR databases. Multi-pass LVCSR and confidence scores calculated from confusion network are used for the unsupervised systems, and noisy data recorded in cars or public spaces is used for the noise robust system. We believe the developed systems form solid benchmarks and provide good connections to ASR fields. This paper describes the development of the systems and reports the results and analysis of their evaluation.}
}
@inproceedings{CassiaLista12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Using an intelligibility measure to create noise robust cepstral coefficients for {HMM}-based speech synthesis}},
  booktitle = {Proc. LISTA Workshop},
  year = {2012},
  month = {May},
  address = {Edinburgh, UK},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Lista12.pdf},
  categories = {HMM-based speech synthesis, intelligibility enhancement}
}
@inproceedings{cabral2011a,
  author = {Cabral, J.P. and Renals, S. and Yamagishi, J. and Richmond, K.},
  doi = {10.1109/ICASSP.2011.5947405},
  title = {{HMM}-based speech synthesiser using the {LF}-model of the glottal source},
  booktitle = {Proc. ICASSP},
  issn = {1520-6149},
  abstract = {A major factor which causes a deterioration in speech quality in {HMM}-based speech synthesis is the use of a simple delta pulse signal to generate the excitation of voiced speech. This paper sets out a new approach to using an acoustic glottal source model in HMM-based synthesisers instead of the traditional pulse signal. The goal is to improve speech quality and to better model and transform voice characteristics. We have found the new method decreases buzziness and also improves prosodic modelling. A perceptual evaluation has supported this finding by showing a 55.6% preference for the new system, as against the baseline. This improvement, while not being as significant as we had initially expected, does encourage us to work on developing the proposed speech synthesiser further.},
  month = {May},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
  pages = {4704--4707},
  categories = {HMM-based speech synthesiser, acoustic glottal source model, LF-model, delta pulse signal, perceptual evaluation, prosodic modelling, speech quality, voiced speech generation, hidden Markov models, speech synthesis}
}
@misc{Hofer_Shimodaira:sca:2007,
  author = {Hofer, Gregor and Shimodaira, Hiroshi and Yamagishi, Junichi},
  howpublished = {Poster at SCA 2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sca07.pdf},
  year = {2007},
  title = {Lip motion synthesis using a context dependent trajectory hidden {M}arkov model},
  address = {San Diego, USA}
}
@inproceedings{higher_level,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {The role of higher-level linguistic features in {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  year = {2010},
  abstract = {We analyse the contribution of higher-level elements of the linguistic specification of a data-driven speech synthesiser to the naturalness of the synthetic speech which it generates. The system is trained using various subsets of the full feature-set, in which features relating to syntactic category, intonational phrase boundary, pitch accent and boundary tones are selectively removed. Utterances synthesised by the different configurations of the system are then compared in a subjective evaluation of their naturalness. The work presented forms background analysis for an ongoing set of experiments in performing text-to-speech (TTS) conversion based on shallow features: features that can be trivially extracted from text. By building a range of systems, each assuming the availability of a different level of linguistic annotation, we obtain benchmarks for our on-going work.},
  month = {September},
  address = {Makuhari, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100565.pdf},
  pages = {841--844}
}
@inproceedings{tts_barra08,
  author = {Barra-Chicote, R. and Yamagishi, J. and Montero, J.M. and King, S. and Lutfi, S. and Macias-Guarasa, J.},
  title = {Generación de una voz sintética en {C}astellano basada en {HSMM} para la {E}valuación {A}lbayzin 2008: conversión texto a voz},
  booktitle = {V Jornadas en Tecnología del Habla},
  month = {November},
  note = {(in Spanish)},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/tts-jth08.pdf},
  pages = {115--118}
}
@inproceedings{lips08-gregpr,
  author = {Hofer, Gregor and Yamagishi, Junichi and Shimodaira, Hiroshi},
  title = {Speech-driven Lip Motion Generation with a Trajectory {HMM}},
  booktitle = {Proc. Interspeech 2008},
  year = {2008},
  abstract = {Automatic speech animation remains a challenging problem that can be described as finding the optimal sequence of animation parameter configurations given some speech. In this paper we present a novel technique to automatically synthesise lip motion trajectories from a speech signal. The developed system predicts lip motion units from the speech signal and generates animation trajectories automatically employing a "Trajectory Hidden Markov Model". Using the MLE criterion, its parameter generation algorithm produces the optimal smooth motion trajectories that are used to drive control points on the lips directly. Additionally, experiments were carried out to find a suitable model unit that produces the most accurate results. Finally a perceptual evaluation was conducted, that showed that the developed motion units perform better than phonemes.},
  month = {September},
  key = {lips08-gregpr},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/GregorLipsChallenge08.pdf},
  pages = {2314--2317},
  categories = {visual speech synthesis, trajectory HMM, HTS}
}
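
The trajectory HMM in the entry above generates lip motion with a maximum-likelihood parameter generation (MLPG) step of the kind used for speech parameters: static and delta statistics are stacked and a smooth static trajectory is solved for in closed form. A compact one-dimensional sketch with toy means and variances (not the system's models) is:

# MLPG sketch for one stream with static + delta features.
# The smooth trajectory c solves (W^T D W) c = W^T D m, with D = diag(1/var).
import numpy as np

T = 6
# Hypothetical per-frame [static, delta] means and variances.
mean = np.array([[0.0, 0.0], [1.0, 0.5], [2.0, 0.5],
                 [2.0, 0.0], [1.0, -0.5], [0.0, -0.5]])
var = np.full((T, 2), 0.5)

# W stacks a static row and a delta row (0.5 * (c[t+1] - c[t-1])) per frame.
W = np.zeros((2 * T, T))
for t in range(T):
    W[2 * t, t] = 1.0
    if t > 0:
        W[2 * t + 1, t - 1] = -0.5
    if t < T - 1:
        W[2 * t + 1, t + 1] = 0.5

m = mean.reshape(-1)
d = 1.0 / var.reshape(-1)
A = W.T.dot(np.diag(d)).dot(W)
b = W.T.dot(d * m)
trajectory = np.linalg.solve(A, b)
print(np.round(trajectory, 3))
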
@inproceedings{john:HTSGAP,
  author = {Dines, J. and Yamagishi, J. and King, S.},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  booktitle = {Proc. Interspeech},
  year = {2009},
  abstract = {The EMIME European project is conducting research in the development of technologies for mobile, personalised speech-to-speech translation systems. The hidden Markov model is being used as the underlying technology in both automatic speech recognition (ASR) and text-to-speech synthesis (TTS) components, thus, the investigation of unified statistical modelling approaches has become an implicit goal of our research. As one of the first steps towards this goal, we have been investigating commonalities and differences between HMM-based ASR and TTS. In this paper we present results and analysis of a series of experiments that have been conducted on English ASR and TTS systems, measuring their performance with respect to phone set and lexicon, acoustic feature type and dimensionality and HMM topology. Our results show that, although the fundamental statistical model may be essentially the same, optimal ASR and TTS performance often demands diametrically opposed system designs. This represents a major challenge to be addressed in the investigation of such unified modelling approaches.},
  month = {September},
  address = {Brighton, U.K.},
  pages = {1391--1394}
}
@inproceedings{kurimo:acl:10,
  author = {Kurimo, Mikko and Byrne, William and Dines, John and Garner, Philip N. and Gibson, Matthew and Guan, Yong and Hirsim\"{a}ki, Teemu and Karhila, Reima and King, Simon and Liang, Hui and Oura, Keiichiro and Saheer, Lakshmi and Shannon, Matt and Shiota, Sayaka and Tian, Jilei and Tokuda, Keiichi and Wester, Mirjam and Wu, Yi-Jian and Yamagishi, Junichi},
  title = {Personalising speech-to-speech translation in the {EMIME} project},
  booktitle = {Proc. ACL 2010 System Demonstrations},
  year = {2010},
  month = {July},
  address = {Uppsala, Sweden},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/kurimo_acl_2010.pdf},
  abstract = {In the EMIME project we have studied unsupervised cross-lingual speaker adaptation. We have employed an HMM statistical framework for both speech recognition and synthesis which provides transformation mechanisms to adapt the synthesized voice in TTS (text-to-speech) using the recognized voice in ASR (automatic speech recognition). An important application for this research is personalised speech-to-speech translation that will use the voice of the speaker in the input language to utter the translated sentences in the output language. In mobile environments this enhances the users' interaction across language barriers by making the output speech sound more like the original speaker's way of speaking, even if she or he could not speak the output language.},
  categories = {speaker adaptation}
}
@inproceedings{cabral07,
  author = {Cabral, J. and Renals, S. and Richmond, K. and Yamagishi, J.},
  title = {Towards an Improved Modeling of the Glottal Source in Statistical Parametric Speech Synthesis},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  year = {2007},
  address = {Bonn, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
  abstract = {This paper proposes the use of the Liljencrants-Fant model (LF-model) to represent the glottal source signal in HMM-based speech synthesis systems. These systems generally use a pulse train to model the periodicity of the excitation signal of voiced speech. However, this model produces a strong and uniform harmonic structure throughout the spectrum of the excitation which makes the synthetic speech sound buzzy. The use of a mixed band excitation and phase manipulation reduces this effect but it can result in degradation of the speech quality if the noise component is not weighted carefully. In turn, the LF-waveform has a decaying spectrum at higher frequencies, which is more similar to the real glottal source excitation signal. We conducted a perceptual experiment to test the hypothesis that the LF-model can perform as well as or better than the pulse train in a HMM-based speech synthesizer. In the synthesis, we used the mean values of the LF-parameters, calculated by measurements of the recorded speech. The result of this study is important not only regarding the improvement in speech quality of these type of systems, but also because the LF-model can be used to model many characteristics of the glottal source, such as voice quality, which are important for voice transformation and generation of expressive speech.},
  categories = {LF-model, Statistical parametric speech synthesis, HMM-based speech synthesis}
}
@article{michael09:dialectHTS,
  author = {Pucher, Michael and Schabus, Dietmar and Yamagishi, Junichi and Neubarth, Friedrich and Strom, Volker},
  doi = {10.1016/j.specom.2009.09.004},
  title = {Modeling and Interpolation of {Austrian German and Viennese} Dialect in {HMM}-based Speech Synthesis},
  journal = {Speech Communication},
  number = {2},
  abstract = {An HMM-based speech synthesis framework is applied to both Standard Austrian German and a Viennese dialectal variety and several training strategies for multi-dialect modeling such as dialect clustering and dialect-adaptive training are investigated. For bridging the gap between processing on the level of HMMs and on the linguistic level, we add phonological transformations to the HMM interpolation and apply them to dialect interpolation. The crucial steps are to employ several formalized phonological rules between Austrian German and Viennese dialect as constraints for the HMM interpolation. We verify the effectiveness of this strategy in a number of perceptual evaluations. Since the HMM space used is not articulatory but acoustic space, there are some variations in evaluation results between the phonological rules. However, in general we obtained good evaluation results which show that listeners can perceive both continuous and categorical changes of dialect varieties by using phonological transformations employed as switching rules in the HMM interpolation.},
  volume = {52},
  year = {2010},
  pages = {164--179},
  categories = {speech synthesis, hidden Markov model, dialect, sociolect, Austrian German}
}
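
At the model level, the dialect interpolation discussed above amounts to forming weighted combinations of the variety-specific HMM parameters, with phonological rules deciding where interpolation applies and where a categorical switch is used. A bare-bones sketch of interpolating one state mean, with invented values, is:

# Interpolating one Gaussian state mean between two dialect-specific models.
import numpy as np

mu_standard = np.array([1.0, -0.5, 0.2])   # hypothetical Standard Austrian German mean
mu_viennese = np.array([0.4, 0.3, 0.6])    # hypothetical Viennese dialect mean

def interpolate(lam):
    """lam = 0 gives the standard variety, lam = 1 the Viennese dialect."""
    return (1.0 - lam) * mu_standard + lam * mu_viennese

for lam in (0.0, 0.25, 0.5, 0.75, 1.0):
    print(lam, np.round(interpolate(lam), 3))
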
@inproceedings{king:tokuda:zen:yamagishi:interspeech2008,
  author = {King, Simon and Tokuda, Keiichi and Zen, Heiga and Yamagishi, Junichi},
  title = {Unsupervised adaptation for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  year = {2008},
  abstract = {It is now possible to synthesise speech using HMMs with a comparable quality to unit-selection techniques. Generating speech from a model has many potential advantages over concatenating waveforms. The most exciting is model adaptation. It has been shown that supervised speaker adaptation can yield high-quality synthetic voices with an order of magnitude less data than required to train a speaker-dependent model or to build a basic unit-selection system. Such supervised methods require labelled adaptation data for the target speaker. In this paper, we introduce a method capable of unsupervised adaptation, using only speech from the target speaker without any labelling.},
  month = {September},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080299.PDF},
  pages = {1869--1872},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, trajectory HMMs, speaker adaptation, MLLR}
}
@inproceedings{ling2011a,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi, Junichi},
  title = {Feature-space transform tying in unified acoustic-articulatory modelling of articulatory control of {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  year = {2011},
  abstract = {In previous work, we have proposed a method to control the characteristics of synthetic speech flexibly by integrating articulatory features into hidden Markov model (HMM) based parametric speech synthesis. A unified acoustic-articulatory model was trained and a piecewise linear transform was adopted to describe the dependency between these two feature streams. The transform matrices were trained for each HMM state and were tied based on each state's context. In this paper, an improved acoustic-articulatory modelling method is proposed. A Gaussian mixture model (GMM) is introduced to model the articulatory space and the cross-stream transform matrices are trained for each Gaussian mixture instead of context-dependently. This means the dependency relationship can vary with the change of articulatory features flexibly. Our results show this method improves the effectiveness of control over vowel quality by modifying articulatory trajectories without degrading naturalness.},
  month = {August},
  address = {Florence, Italy},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110482.pdf},
  pages = {117--120},
  categories = {speech synthesis, articulatory features, hidden Markov model, Gaussian mixture model}
}
@inproceedings{5947506,
  author = {Hashimoto, K. and Yamagishi, J. and Byrne, W. and King, S. and Tokuda, K.},
  doi = {10.1109/ICASSP.2011.5947506},
  title = {An analysis of machine translation and speech synthesis in speech-to-speech translation system},
  booktitle = {Proc. ICASSP},
  issn = {1520-6149},
  abstract = {This paper provides an analysis of the impacts of machine translation and speech synthesis on speech-to-speech translation systems. The speech-to-speech translation system consists of three components: speech recognition, machine translation and speech synthesis. Many techniques for integration of speech recognition and machine translation have been proposed. However, speech synthesis has not yet been considered. Therefore, in this paper, we focus on machine translation and speech synthesis, and report a subjective evaluation to analyze the impact of each component. The results of these analyses show that the naturalness and intelligibility of synthesized speech are strongly affected by the fluency of the translated sentences.},
  month = {May},
  year = {2011},
  keywords = {machine translation, speech recognition, speech synthesis, speech-to-speech translation system},
  pages = {5108--5111}
}
@inproceedings{junichi:interspeech2010,
  author = {Yamagishi, Junichi and Watts, Oliver and King, Simon and Usabaev, Bela},
  title = {Roles of the Average Voice in Speaker-adaptive {HMM}-based Speech Synthesis},
  booktitle = {{Proc. Interspeech}},
  year = {2010},
  abstract = {In speaker-adaptive HMM-based speech synthesis, there are typically a few speakers for which the output synthetic speech sounds worse than that of other speakers, despite having the same amount of adaptation data from within the same corpus. This paper investigates these fluctuations in quality and concludes that as mel-cepstral distance from the average voice becomes larger, the MOS naturalness scores generally become worse. Although this negative correlation is not that strong, it suggests a way to improve the training and adaptation strategies. We also draw comparisons between our findings and the work of other researchers regarding ``vocal attractiveness.''},
  month = {September},
  address = {Makuhari, Japan},
  keywords = {speech synthesis, HMM, average voice, speaker adaptation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100361.pdf},
  pages = {418--421}
}
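
The analysis above relates naturalness scores to each adapted voice's mel-cepstral distance from the average voice. The distance is the usual dB-scaled mel-cepstral distortion averaged over frames; the sketch below uses random cepstra purely to show the computation, not the paper's data.

# Mel-cepstral distance (dB) between two cepstral sequences, averaged over frames.
import numpy as np

def mel_cepstral_distance(c_a, c_b):
    """c_a, c_b: (frames, order) mel-cepstra excluding the 0th (energy) coefficient."""
    diff = c_a - c_b
    per_frame = (10.0 / np.log(10.0)) * np.sqrt(2.0 * np.sum(diff ** 2, axis=1))
    return float(np.mean(per_frame))

rng = np.random.default_rng(4)
adapted_voice = rng.normal(size=(100, 24))
average_voice = adapted_voice + rng.normal(scale=0.1, size=(100, 24))
print("MCD (dB):", round(mel_cepstral_distance(adapted_voice, average_voice), 2))
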
@inproceedings{junichi:icassp2010,
  author = {Yamagishi, J. and King, S.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/JunichiICASSP10.pdf},
  booktitle = {{Proc. ICASSP 2010}},
  year = {2010},
  title = {Simple methods for improving speaker-similarity of {HMM}-based speech synthesis},
  address = {Dallas, Texas, USA}
}
@inproceedings{hts2007-icassp,
  author = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga and Toda, Tomoki and Tokuda, Keiichi},
  doi = {10.1109/ICASSP.2008.4518520},
  title = {Performance Evaluation of the Speaker-Independent {HMM}-based Speech Synthesis System ``{HTS}-2007'' for the {Blizzard Challenge 2007}},
  booktitle = {Proc. ICASSP 2008},
  year = {2008},
  abstract = {This paper describes a speaker-independent/adaptive HMM-based speech synthesis system developed for the Blizzard Challenge 2007. The new system, named "HTS-2007", employs speaker adaptation (CSMAPLR+MAP), feature-space adaptive training, mixed-gender modeling, and full-covariance modeling using CSMAPLR transforms, in addition to several other techniques that have proved effective in our previous systems. Subjective evaluation results show that the new system generates significantly better quality synthetic speech than that of speaker-dependent approaches with realistic amounts of speech data, and that it bears comparison with speaker-dependent approaches even when large amounts of speech data are available.},
  month = {April},
  key = {hts2007-icassp},
  address = {Las Vegas, U.S.A},
  pages = {3957--3960},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice}
}
@article{junichi:ieee2010,
  author = {Yamagishi, J. and Usabaev, B. and King, S. and Watts, O. and Dines, J. and Tian, J. and Hu, R. and Guan, Y. and Oura, K. and Tokuda, K. and Karhila, R. and Kurimo, M.},
  doi = {10.1109/TASL.2010.2045237},
  title = {Thousands of Voices for {HMM}-based Speech Synthesis -- Analysis and Application of {TTS} Systems Built on Various {ASR} Corpora},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {5},
  abstract = {In conventional speech synthesis, large amounts of phonetically balanced speech data recorded in highly controlled recording studio environments are typically required to build a voice. Although using such data is a straightforward solution for high quality synthesis, the number of voices available will always be limited, because recording costs are high. On the other hand, our recent experiments with HMM-based speech synthesis systems have demonstrated that speaker-adaptive HMM-based speech synthesis (which uses an ``average voice model'' plus model adaptation) is robust to non-ideal speech data that are recorded under various conditions and with varying microphones, that are not perfectly clean, and/or that lack phonetic balance. This enables us to consider building high-quality voices on ``non-TTS'' corpora such as ASR corpora. Since ASR corpora generally include a large number of speakers, this leads to the possibility of producing an enormous number of voices automatically. In this paper, we demonstrate the thousands of voices for HMM-based speech synthesis that we have made from several popular ASR corpora such as the Wall Street Journal (WSJ0, WSJ1, and WSJCAM0), Resource Management, Globalphone, and SPEECON databases. We also present the results of associated analysis based on perceptual evaluation, and discuss remaining issues.},
  month = {July},
  volume = {18},
  year = {2010},
  keywords = {Automatic speech recognition (ASR), H Triple S (HTS), SPEECON database, WSJ database, average voice, hidden Markov model (HMM)-based speech synthesis, speaker adaptation, speech synthesis, voice conversion},
  pages = {984--1004}
}
@inproceedings{CassiaIS12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Mel cepstral coefficient modification based on the Glimpse Proportion measure for improving the intelligibility of {HMM}-generated synthetic speech in noise}},
  booktitle = {Proc. Interspeech},
  year = {2012},
  month = {September},
  address = {Portland, USA},
  abstract = {We propose a method that modifies the Mel cepstral coefficients of HMM-generated synthetic speech in order to increase the intelligibility of the generated speech when heard by a listener in the presence of a known noise. This method is based on an approximation we previously proposed for the Glimpse Proportion measure. Here we show how to update the Mel cepstral coefficients using this measure as an optimization criterion and how to control the amount of distortion by limiting the frequency resolution of the modifications. To evaluate the method we built eight different voices from normal read-text speech data from a male speaker. Some voices were also built from Lombard speech data produced by the same speaker. Listening experiments with speech-shaped noise and with a single competing talker indicate that our method significantly improves intelligibility when compared to unmodified synthetic speech. The voices built from Lombard speech outperformed the proposed method particularly for the competing talker case. However, compared to a voice using only the spectral parameters from Lombard speech, the proposed method obtains similar or higher performance.},
  categories = {HMM-based speech synthesis, intelligibility enhancement, Mel cepstral coefficients}
}
@inproceedings{wester:ssw7:10,
  author = {Wester, Mirjam and Dines, John and Gibson, Matthew and Liang, Hui and Wu, Yi-Jian and Saheer, Lakshmi and King, Simon and Oura, Keiichiro and Garner, Philip N. and Byrne, William and Guan, Yong and Hirsim\"{a}ki, Teemu and Karhila, Reima and Kurimo, Mikko and Shannon, Matt and Shiota, Sayaka and Tian, Jilei and Tokuda, Keiichi and Yamagishi, Junichi},
  title = {Speaker adaptation and the evaluation of speaker similarity in the {EMIME} speech-to-speech translation project},
  booktitle = {Proc. 7th ISCA Speech Synthesis Workshop},
  year = {2010},
  month = {September},
  address = {Kyoto, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_ssw7_2010.pdf},
  abstract = {This paper provides an overview of speaker adaptation research carried out in the EMIME speech-to-speech translation (S2ST) project. We focus on how speaker adaptation transforms can be learned from speech in one language and applied to the acoustic models of another language. The adaptation is transferred across languages and/or from recognition models to synthesis models. The various approaches investigated can all be viewed as a process in which a mapping is defined in terms of either acoustic model states or linguistic units. The mapping is used to transfer either speech data or adaptation transforms between the two models. Because the success of speaker adaptation in text-to-speech synthesis is measured by judging speaker similarity, we also discuss issues concerning evaluation of speaker similarity in an S2ST scenario.},
  categories = {speaker adaptation, evaluation}
}
@inproceedings{hts2008,
  author = {Yamagishi, Junichi and Zen, Heiga and Wu, Yi-Jian and Toda, Tomoki and Tokuda, Keiichi},
  title = {The {HTS}-2008 System: Yet Another Evaluation of the Speaker-Adaptive {HMM}-based Speech Synthesis System in The {2008 Blizzard Challenge}},
  booktitle = {Proc. Blizzard Challenge 2008},
  year = {2008},
  month = {September},
  key = {hts2008},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/HTS2008.pdf},
  abstract = {For the 2008 Blizzard Challenge, we used the same speaker-adaptive approach to HMM-based speech synthesis that was used in the HTS entry to the 2007 challenge, but an improved system was built in which the multi-accented English average voice model was trained on 41 hours of speech data with high-order mel-cepstral analysis using an efficient forward-backward algorithm for the HSMM. The listener evaluation scores for the synthetic speech generated from this system were much better than in 2007: the system had the equal best naturalness on the small English data set and the equal best intelligibility on both small and large data sets for English, and had the equal best naturalness on the Mandarin data. In fact, the English system was found to be as intelligible as human speech.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice, Blizzard Challenge}
}
@inproceedings{6287948,
  author = {Saheer, L. and Yamagishi, J. and Garner, P.N. and Dines, J.},
  doi = {10.1109/ICASSP.2012.6287948},
  title = {Combining vocal tract length normalization with hierarchical linear transformations},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2012 IEEE International Conference on},
  issn = {1520-6149},
  abstract = {Recent research has demonstrated the effectiveness of vocal tract length normalization (VTLN) as a rapid adaptation technique for statistical parametric speech synthesis. VTLN produces speech with naturalness preferable to that of MLLR-based adaptation techniques, being much closer in quality to that generated by the original average voice model. However, with only a single parameter, VTLN captures very few speaker-specific characteristics when compared to linear transform based adaptation techniques. This paper proposes that the merits of VTLN can be combined with those of linear transform based adaptation in a hierarchical Bayesian framework, where VTLN is used as the prior information. A novel technique for propagating the gender information from the VTLN prior through constrained structural maximum a posteriori linear regression (CSMAPLR) adaptation is presented. Experiments show that the resulting transformation has improved speech quality with better naturalness, intelligibility and improved speaker similarity.},
  month = {March},
  year = {2012},
  keywords = {CSMAPLR adaptation;MLLR based adaptation technique;constrained structural maximum a posteriori linear regression;hierarchical Bayesian framework;hierarchical linear transformation;intelligibility;rapid adaptation technique;speaker similarity;statistical parametric speech synthesis;vocal tract length normalization;Bayes methods;speech intelligibility;},
  pages = {4493--4496}
}
@inproceedings{jyamagis07:hts2007,
  author = {Yamagishi, Junichi and Zen, Heiga and Toda, Tomoki and Tokuda, Keiichi},
  title = {Speaker-Independent {HMM}-based Speech Synthesis System -- {HTS-2007} System for the {Blizzard Challenge 2007}},
  booktitle = {Proc. Blizzard Challenge 2007},
  month = {August},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007-HTS.pdf},
  abstract = {This paper describes an HMM-based speech synthesis system developed by the HTS working group for the Blizzard Challenge 2007. To further explore the potential of HMM-based speech synthesis, we incorporate new features in our conventional system which underpin a speaker-independent approach: speaker adaptation techniques; adaptive training for HSMMs; and full covariance modeling using the CSMAPLR transforms.},
  categories = {HMM, speech synthesis, speaker adaptation, HTS, Blizzard Challenge}
}
@inproceedings{michael:interspeech2010,
  author = {Pucher, Michael and Schabus, Dietmar and Yamagishi, Junichi},
  title = {Synthesis of fast speech with interpolation of adapted {HSMMs} and its evaluation by blind and sighted listeners},
  booktitle = {Proc. Interspeech},
  year = {2010},
  abstract = {In this paper we evaluate a method for generating synthetic speech at high speaking rates based on the interpolation of hidden semi-Markov models (HSMMs) trained on speech data recorded at normal and fast speaking rates. The subjective evaluation was carried out with both blind listeners, who are used to very fast speaking rates, and sighted listeners. We show that we can achieve a better intelligibility rate and higher voice quality with this method compared to standard HSMM-based duration modeling. We also evaluate duration modeling with the interpolation of all the acoustic features including not only duration but also spectral and F0 models. An analysis of the mean squared error (MSE) of standard HSMM-based duration modeling for fast speech identifies problematic linguistic contexts for duration modeling.},
  month = {September},
  address = {Makuhari, Japan},
  keywords = {speech synthesis, fast speech, hidden semi-Markov model},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100294.pdf},
  pages = {2186--2189}
}
@inproceedings{richmond2007b,
  author = {Richmond, K. and Strom, V. and Clark, R. and Yamagishi, J. and Fitt, S.},
  title = {Festival Multisyn Voices for the 2007 Blizzard Challenge},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc. SSW6)},
  year = {2007},
  month = {August},
  key = {richmond2007b},
  address = {Bonn, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007paper.pdf},
  abstract = {This paper describes selected aspects of the Festival Multisyn entry to the Blizzard Challenge 2007. We provide an overview of the process of building the three required voices from the speech data provided. This paper focuses on new features of Multisyn which are currently under development and which have been employed in the system used for this Blizzard Challenge. These differences are the application of a more flexible phonetic lattice representation during forced alignment labelling and the use of a pitch accent target cost component. Finally, we also examine aspects of the speech data provided for this year's Blizzard Challenge and raise certain issues for discussion concerning the aim of comparing voices made with differing subsets of the data provided.},
  categories = {tts, blizzard, multisyn, unit selection}
}
@article{2012E121001,
  author = {Yamagishi, Junichi and Veaux, Christophe and King, Simon and Renals, Steve},
  doi = {10.1250/ast.33.1},
  title = {Speech synthesis technologies for individuals with vocal disabilities: Voice banking and reconstruction},
  url = {http://www.jstage.jst.go.jp/browse/ast/33/1/_contents},
  journal = {Acoustical Science and Technology},
  number = {1},
  pages = {1--5},
  volume = {33},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/AST-33_1.pdf},
  abstract = {In this invited paper, we overview the clinical applications of speech synthesis technologies and describe a few selected studies. We also introduce the University of Edinburgh’s new project ``Voice Banking and reconstruction'' for patients with degenerative diseases, such as motor neurone disease and Parkinson's disease, and show how speech synthesis technologies can improve the quality of life for the patients.}
}
@article{Creer2012,
  author = {Creer, Sarah and Cunningham, Stuart and Green, Phil and Yamagishi, Junichi},
  doi = {10.1016/j.csl.2012.10.001},
  title = {Building personalised synthetic voices for individuals with severe speech impairment},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230812000836?v=s5},
  journal = {Computer Speech and Language},
  issn = {0885-2308},
  number = {6},
  abstract = {For individuals with severe speech impairment accurate spoken communication can be difficult and require considerable effort. Some may choose to use a voice output communication aid (or VOCA) to support their spoken communication needs. A VOCA typically takes input from the user through a keyboard or switch-based interface and produces spoken output using either synthesised or recorded speech. The type and number of synthetic voices that can be accessed with a VOCA is often limited and this has been implicated as a factor for rejection of the devices. Therefore, there is a need to be able to provide voices that are more appropriate and acceptable for users. This paper reports on a study that utilises recent advances in speech synthesis to produce personalised synthetic voices for 3 speakers with mild to severe dysarthria, one of the most common speech disorders. Using a statistical parametric approach to synthesis, an average voice trained on data from several unimpaired speakers was adapted using recordings of the impaired speech of 3 dysarthric speakers. By careful selection of the speech data and the model parameters, several exemplar voices were produced for each speaker. A qualitative evaluation was conducted with the speakers and listeners who were familiar with the speaker. The evaluation showed that for one of the 3 speakers a voice could be created which conveyed many of his personal characteristics, such as regional identity, sex and age.},
  volume = {27},
  year = {2012},
  keywords = {Speech synthesis, Augmentative and alternative communication, Disordered speech, Voice output communication aid},
  pages = {1178-1193}
}
@inproceedings{jyamagis:1000sHTS,
  author = {Yamagishi, J. and Usabaev, Bela and King, Simon and Watts, Oliver and Dines, John and Tian, Jilei and Hu, Rile and Guan, Yong and Oura, Keiichiro and Tokuda, Keiichi and Karhila, Reima and Kurimo, Mikko},
  title = {Thousands of voices for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  year = {2009},
  abstract = {Our recent experiments with HMM-based speech synthesis systems have demonstrated that speaker-adaptive HMM-based speech synthesis (which uses an ‘average voice model’ plus model adaptation) is robust to non-ideal speech data that are recorded under various conditions and with varying microphones, that are not perfectly clean, and/or that lack phonetic balance. This enables us to consider building high-quality voices on ‘non-TTS’ corpora such as ASR corpora. Since ASR corpora generally include a large number of speakers, this leads to the possibility of producing an enormous number of voices automatically. In this paper we show thousands of voices for HMM-based speech synthesis that we have made from several popular ASR corpora such as the Wall Street Journal databases (WSJ0/WSJ1/WSJCAM0), Resource Management, Globalphone and Speecon. We report some perceptual evaluation results and outline the outstanding issues.},
  month = {September},
  address = {Brighton, U.K.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/index.php},
  pages = {420--423}
}
@inproceedings{anderssonetal2010_ssw7,
  author = {Andersson, Sebastian and Yamagishi, Junichi and Clark, Robert},
  title = {Utilising Spontaneous Conversational Speech in {HMM}-Based Speech Synthesis},
  booktitle = {The 7th ISCA Tutorial and Research Workshop on Speech Synthesis},
  month = {September},
  year = {2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7_paper.pdf},
  abstract = {Spontaneous conversational speech has many characteristics that are currently not well modelled in unit selection and HMM-based speech synthesis. But in order to build synthetic voices more suitable for interaction we need data that exhibits more conversational characteristics than the generally used read aloud sentences. In this paper we show how carefully selected utterances from a spontaneous conversation were instrumental in building an HMM-based synthetic voice with more natural-sounding conversational characteristics than a voice based on carefully read aloud sentences. We also investigated a style blending technique as a solution to the inherent problem of phonetic coverage in spontaneous speech data. However, the lack of an appropriate representation of spontaneous speech phenomena probably contributed to results showing that we could not yet compete with the speech quality achieved for grammatical sentences.},
  categories = {HMM, speech synthesis, spontaneous speech, conversation, lexical fillers, filled pauses}
}
@inproceedings{letter_based_TTS,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {Letter-based speech synthesis},
  booktitle = {Proc. Speech Synthesis Workshop 2010},
  year = {2010},
  abstract = {Initial attempts at performing text-to-speech conversion based on standard orthographic units are presented, forming part of a larger scheme of training TTS systems on features that can be trivially extracted from text. We evaluate the possibility of using the technique of decision-tree-based context clustering conventionally used in HMM-based systems for parameter-tying to handle letter-to-sound conversion. We present the application of a method of compound-feature discovery to corpus-based speech synthesis. Finally, an evaluation of intelligibility of letter-based systems and more conventional phoneme-based systems is presented.},
  month = {September},
  address = {Nara, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7.pdf},
  pages = {317-322}
}
@inproceedings{Valentini-Botinhao_SSW8,
  author = {Valentini-Botinhao, Cassia and Wester, Mirjam and Yamagishi, Junichi and King, Simon},
  title = {Using neighbourhood density and selective {SNR} boosting to increase the intelligibility of synthetic speech in noise},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  abstract = {Motivated by the fact that words are not equally confusable, we explore the idea of using word-level intelligibility predictions to selectively boost the harder-to-understand words in a sentence, aiming to improve overall intelligibility in the presence of noise. First, the intelligibility of a set of words from dense and sparse phonetic neighbourhoods was evaluated in isolation. The resulting intelligibility scores were used to inform two sentence-level experiments. In the first experiment the signal-to-noise ratio of one word was boosted to the detriment of another word. Sentence intelligibility did not generally improve. The intelligibility of words in isolation and in a sentence was found to be significantly different, both in clean and in noisy conditions. For the second experiment, one word was selectively boosted while slightly attenuating all other words in the sentence. This strategy was successful for words that were poorly recognised in that particular context. However, a reliable predictor of word-in-context intelligibility remains elusive, since this involves – as our results indicate – semantic, syntactic and acoustic information about the word and the sentence.},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Cassia_SSW13.pdf},
  pages = {133--138}
}
@article{Creer20131178,
  author = {Creer, Sarah and Cunningham, Stuart and Green, Phil and Yamagishi, Junichi},
  note = {Special Issue on Speech and Language Processing for Assistive Technology},
  doi = {10.1016/j.csl.2012.10.001},
  title = {Building personalised synthetic voices for individuals with severe speech impairment},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230812000836},
  journal = {Computer Speech & Language},
  issn = {0885-2308},
  number = {6},
  volume = {27},
  year = {2013},
  keywords = {Voice output communication aid},
  pages = {1178--1193}
}
@inproceedings{EURECOM+4018,
  author = {Evans, Nicholas W D and Kinnunen, Tomi and Yamagishi, Junichi},
  title = {Spoofing and countermeasures for automatic speaker verification},
  booktitle = {Interspeech 2013, 14th Annual Conference of the International Speech Communication Association, August 25-29, 2013, Lyon, France},
  year = {2013},
  month = {August},
  address = {Lyon, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/mm-publi-4018.pdf}
}
@inproceedings{Astrinaki_SSW8,
  author = {Astrinaki, Maria and Moinet, Alexis and Yamagishi, Junichi and Richmond, Korin and Ling, Zhen-Hua and King, Simon and Dutoit, Thierry},
  title = {Mage - Reactive articulatory feature control of {HMM}-based parametric speech synthesis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_OS5-1_Astrinaki.pdf},
  pages = {227--231}
}
@inproceedings{Hu_SSW8,
  author = {Hu, Qiong and Richmond, Korin and Yamagishi, Junichi and Latorre, Javier},
  title = {An experimental comparison of multiple vocoder types},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  abstract = {This paper presents an experimental comparison of a broad range of the leading vocoder types which have been previously described. We use a reference implementation of each of these to create stimuli for a listening test using copy synthesis. The listening test is performed using both Lombard and normal read speech stimuli, and with two types of question for comparison. Multi-dimensional Scaling (MDS) is conducted on the listener responses to analyse similarities in terms of quality between the vocoders. Our MDS and clustering results show that the vocoders which use a sinusoidal synthesis approach are perceptually distinguishable from the source-filter vocoders. To help further interpret the axes of the resulting MDS space, we test for correlations with standard acoustic quality metrics and find one axis is strongly correlated with PESQ scores. We also find both speech style and the format of the listening test question may influence test results. Finally, we also present preference test results which compare each vocoder with the natural speech.},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_OS4-3_Hu.pdf},
  pages = {155--160}
}
@inproceedings{stan13_lightly_supervised_discriminative,
  author = {Stan, Adriana and Bell, Peter and Yamagishi, Junichi and King, Simon},
  title = {Lightly Supervised Discriminative Training of Grapheme Models for Improved Sentence-level Alignment of Speech and Text Data},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lightly_supervised_discriminative_is2013.pdf},
  abstract = {This paper introduces a method for lightly supervised discriminative training using MMI to improve the alignment of speech and text data for use in training HMM-based TTS systems for low-resource languages. In TTS applications, due to the use of long-span contexts, it is important to select training utterances which have wholly correct transcriptions. In a low-resource setting, when using poorly trained grapheme models, we show that the use of MMI discriminative training at the grapheme-level enables us to increase the amount of correctly aligned data by 40\%, while maintaining a 7\% sentence error rate and 0.8\% word error rate. We present the procedure for lightly supervised discriminative training with regard to the objective of minimising sentence error rate.}
}
@inproceedings{Mamiya_SSW8,
  author = {Mamiya, Yoshitaka and Stan, Adriana and Yamagishi, Junichi and Bell, Peter and Watts, Oliver and Clark, Robert and King, Simon},
  title = {Using Adaptation to Improve Speech Transcription Alignment in Noisy and Reverberant Environments},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  abstract = {When using data retrieved from the internet to create new speech databases, the recording conditions can often be highly variable within and between sessions. This variance influences the overall performance of any automatic speech and text alignment techniques used to process this data. In this paper we discuss the use of speaker adaptation methods to address this issue. Starting from a baseline system for automatic sentence-level segmentation and speech and text alignment based on GMMs and grapheme HMMs, respectively, we employ Maximum A Posteriori (MAP) and Constrained Maximum Likelihood Linear Regression (CMLLR) techniques to model the variation in the data in order to increase the amount of confidently aligned speech. We tested 29 different scenarios, which include reverberation, 8 talker babble noise and white noise, each in various combinations and SNRs. Results show that the MAP-based segmentation's performance is very much influenced by the noise type, as well as the presence or absence of reverberation. On the other hand, the CMLLR adaptation of the acoustic models gives an average 20\% increase in the aligned data percentage for the majority of the studied scenarios.},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS1-4_Mamiya.pdf},
  pages = {61--66}
}
@inproceedings{Watts_SSW8,
  author = {Watts, Oliver and Stan, Adriana and Clark, Rob and Mamiya, Yoshitaka and Giurgiu, Mircea and Yamagishi, Junichi and King, Simon},
  title = {Unsupervised and lightly-supervised learning for rapid construction of {TTS} systems in multiple languages from 'found' data: evaluation and analysis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  abstract = {This paper presents techniques for building text-to-speech front-ends in a way that avoids the need for language-specific expert knowledge, but instead relies on universal resources (such as the Unicode character database) and unsupervised learning from unannotated data to ease system development. The acquisition of expert language-specific knowledge and expert annotated data is a major bottleneck in the development of corpus-based TTS systems in new languages. The methods presented here side-step the need for such resources as pronunciation lexicons, phonetic feature sets, part of speech tagged data, etc. The paper explains how the techniques introduced are applied to the 14 languages of a corpus of `found' audiobook data. Results of an evaluation of the intelligibility of the systems resulting from applying these novel techniques to this data are presented.},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_OS2-3_Watts.pdf},
  pages = {121--126}
}
@inproceedings{Lorenzo-Trueba_SSW8,
  author = {Lorenzo-Trueba, Jaime and Barra-Chicote, Roberto and Yamagishi, Junichi and Watts, Oliver and Montero, Juan M.},
  title = {Towards Speaking Style Transplantation in Speech Synthesis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  abstract = {One of the biggest challenges in speech synthesis is the production of natural-sounding synthetic voices. This means that the resulting voice must be not only of high enough quality but also able to capture the natural expressiveness imbued in human speech. This paper focuses on solving the expressiveness problem by proposing a set of different techniques that could be used for extrapolating the expressiveness of proven high-quality speaking style models into neutral speakers in HMM-based synthesis. As an additional advantage, the proposed techniques are based on adaptation approaches, which means that they can be used with little training data (around 15 minutes of training data are used for each style in this paper). For the final implementation, a set of 4 speaking styles was considered: news broadcasts, live sports commentary, interviews and parliamentary speech. Finally, the implementations of the 5 techniques were tested through a perceptual evaluation, which shows that the deviations between neutral and speaking style average models can be learned and used to imbue expressiveness into target neutral speakers as intended.},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS2-3_Lorenzo-Trueba.pdf},
  pages = {179--183}
}
@inproceedings{Stan_IS13,
  author = {Stan, Adriana and Watts, Oliver and Mamiya, Yoshitaka and Giurgiu, Mircea and Clark, Rob and Yamagishi, Junichi and King, Simon},
  title = {{TUNDRA: A Multilingual Corpus of Found Data for TTS Research Created with Light Supervision}},
  booktitle = {Proc. Interspeech},
  year = {2013},
  month = {August},
  address = {Lyon, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/IS131055.pdf},
  abstract = {Simple4All Tundra (version 1.0) is the first release of a standardised multilingual corpus designed for text-to-speech research with imperfect or found data. The corpus consists of approximately 60 hours of speech data from audiobooks in 14 languages, as well as utterance-level alignments obtained with a lightly-supervised process. Future versions of the corpus will include finer-grained alignment and prosodic annotation, all of which will be made freely available. This paper gives a general outline of the data collected so far, as well as a detailed description of how this has been done, emphasizing the minimal language-specific knowledge and manual intervention used to compile the corpus. To demonstrate its potential use, text-to-speech systems have been built for all languages using unsupervised or lightly supervised methods, also briefly presented in the paper.}
}
@inproceedings{Mamiya_13a,
  author = {Mamiya, Yoshitaka and Yamagishi, Junichi and Watts, Oliver and Clark, Robert A.J. and King, Simon and Stan, Adriana},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/0007987.pdf},
  booktitle = {Proc. ICASSP},
  year = {2013},
  abstract = {Audiobooks have attracted attention as promising data for training Text-to-Speech (TTS) systems. However, they usually do not have a correspondence between audio and text data. Moreover, they are usually divided only into chapter units. In practice, we have to establish a correspondence between audio and text data before we can use them for building TTS synthesisers. However, aligning audio and text data is time-consuming and involves manual labor. It also requires persons skilled in speech processing. Previously, we have proposed to use graphemes for automatically aligning speech and text data. This paper further integrates a lightly supervised voice activity detection (VAD) technique to detect sentence boundaries as a pre-processing step before the grapheme approach. This lightly supervised technique requires time stamps of speech and silence only for the first fifty sentences. Combining these, we can semi-automatically build TTS systems from audiobooks with minimum manual intervention. From subjective evaluations we analyse how the grapheme-based aligner and/or the proposed VAD technique impact the quality of HMM-based speech synthesisers trained on audiobooks.},
  title = {Lightly supervised {GMM} {VAD} to use audiobook for speech synthesiser}
}
@inproceedings{richmond_IS2013,
  author = {Richmond, Korin and Ling, Zhenhua and Yamagishi, Junichi and Uría, Benigno},
  title = {On the Evaluation of Inversion Mapping Performance in the Acoustic Domain},
  abstract = {The two measures typically used to assess the performance of an inversion mapping method, where the aim is to estimate what articulator movements gave rise to a given acoustic signal, are root mean squared (RMS) error and correlation. In this paper, we investigate whether ``task-based'' evaluation using an articulatory-controllable HMM-based speech synthesis system can give useful additional information to complement these measures. To assess the usefulness of this evaluation approach, we use articulator trajectories estimated by a range of different inversion mapping methods as input to the synthesiser, and measure their performance in the acoustic domain in terms of RMS error of the generated acoustic parameters and with a listening test involving 30 participants. We then compare these results with the standard RMS error and correlation measures calculated in the articulatory domain. Interestingly, in the acoustic evaluation we observe one method performs with no statistically significant difference from measured articulatory data, and cases where statistically significant differences between methods exist which are not reflected in the results of the two standard measures. From our results, we conclude such task-based evaluation can indeed provide interesting extra information, and gives a useful way to compare inversion methods.},
  year = {2013},
  month = {August},
  address = {Lyon, France},
  keywords = {Inversion mapping, evaluation, HMM synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/richmond_IS2013.pdf},
  booktitle = {Proc. Interspeech},
  pages = {1012--1016}
}
@article{6690120,
  author = {Saheer, L. and Yamagishi, J. and Garner, P.N. and Dines, J.},
  doi = {10.1109/JSTSP.2013.2295554},
  title = {Combining Vocal Tract Length Normalization With Hierarchical Linear Transformations},
  journal = {Selected Topics in Signal Processing, IEEE Journal of},
  issn = {1932-4553},
  number = {2},
  month = {April},
  volume = {8},
  year = {2014},
  keywords = {Bayes methods;regression analysis;speaker recognition;speech synthesis;ASR system;CSMAPLR adaptation;MLLR-based adaptation techniques;TTS synthesis;VTLN;age information;automatic speech recognition system;combination techniques;constrained structural maximum a posteriori linear regression adaptation;gender information;hierarchical Bayesian framework;hierarchical linear transformations;mismatched conditions;speaker similarity;speaker specific characteristics;statistical parametric speech synthesis;text-to-speech synthesis;vocal tract length normalization;Adaptation models;Estimation;Hidden Markov models;Regression tree analysis;Speech;Speech synthesis;Transforms;Constrained structural maximum a posteriori linear regression;hidden Markov models;speaker adaptation;statistical parametric speech synthesis;vocal tract length normalization},
  pages = {262-272}
}
@inproceedings{Lan14,
  author = {Lanchantin, P. and Gales, M. J. F. and King, S. and Yamagishi, J.},
  booktitle = {Proc. ICASSP},
  title = {Multiple-Average-Voice-based Speech Synthesis},
  abstract = {This paper describes a novel approach for the speaker adaptation of statistical parametric speech synthesis systems based on the interpolation of a set of average voice models (AVM). Recent results have shown that the quality/naturalness of adapted voices directly depends on the distance from the average voice model that the speaker adaptation starts from. This suggests the use of several AVMs trained on carefully chosen speaker clusters, from which a more suitable AVM can be selected/interpolated during the adaptation. In the proposed approach, a Multiple-AVM is trained on clusters of speakers, iteratively re-assigned during the estimation process initialised according to metadata. In contrast with the cluster adaptive training (CAT) framework, the training stage is computationally less expensive as the amount of training data and the number of clusters get larger. Additionally, during adaptation, each AVM constituting the multiple-AVM is first adapted towards the speaker, which allows the space in which the interpolation takes place to be better tuned to the individual speaker. It is shown via experiments, run on a corpus of British speakers with various regional accents, that the quality/naturalness of synthetic speech of adapted voices is significantly higher than when considering a single factor-independent AVM selected according to the target speaker characteristics.},
  year = {2014}
}
@article{cabral2014a,
  author = {Cabral, J.P. and Richmond, K. and Yamagishi, J. and Renals, S.},
  doi = {10.1109/JSTSP.2014.2307274},
  title = {Glottal Spectral Separation for Speech Synthesis},
  journal = {Selected Topics in Signal Processing, IEEE Journal of},
  issn = {1932-4553},
  number = {2},
  abstract = {This paper proposes an analysis method to separate the glottal source and vocal tract components of speech that is called Glottal Spectral Separation (GSS). This method can produce high-quality synthetic speech using an acoustic glottal source model. In the source-filter models commonly used in speech technology applications it is assumed that the source is a spectrally flat excitation signal and the vocal tract filter can be represented by the spectral envelope of speech. Although this model can produce high-quality speech, it has limitations for voice transformation because it does not allow control over glottal parameters which are correlated with voice quality. The main problem with using a speech model that better represents the glottal source and the vocal tract filter is that current analysis methods for separating these components are not robust enough to produce the same speech quality as using a model based on the spectral envelope of speech. The proposed GSS method is an attempt to overcome this problem, and consists of the following three steps. Initially, the glottal source signal is estimated from the speech signal. Then, the speech spectrum is divided by the spectral envelope of the glottal source signal in order to remove the glottal source effects from the speech signal. Finally, the vocal tract transfer function is obtained by computing the spectral envelope of the resulting signal. In this work, the glottal source signal is represented using the Liljencrants-Fant model (LF-model). The experiments we present here show that the analysis-synthesis technique based on GSS can produce speech comparable to that of a high-quality vocoder that is based on the spectral envelope representation. However, it also permits control over voice qualities, namely transforming a modal voice into breathy and tense voices, by modifying the glottal parameters.},
  month = {April},
  volume = {8},
  year = {2014},
  keywords = {Analytical models;Computational modeling;Estimation;Hidden Markov models;Mathematical model;Speech;Speech synthesis;Glottal spectral separation;LF-model;parametric speech synthesis;voice quality transformation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/gss-ieee-2014-final.pdf},
  pages = {195-208}
}
@inproceedings{watts-2014,
  author = {Watts, Oliver and Gangireddy, Siva and Yamagishi, Junichi and King, Simon and Renals, Steve and Stan, Adriana and Giurgiu, Mircea},
  title = {Neural Net Word Representations for Phrase-Break Prediction Without a Part of Speech Tagger},
  booktitle = {Proc. ICASSP},
  year = {2014},
  month = {May},
  pages = {2618--2622},
  address = {Florence, Italy},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/watts-2014.pdf},
  abstract = {The use of shared projection neural nets of the sort used in language modelling is proposed as a way of sharing parameters between multiple text-to-speech system components. We experiment with pretraining the weights of such a shared projection on an auxiliary language modelling task and then apply the resulting word representations to the task of phrase-break prediction. Doing so allows us to build phrase-break predictors that rival conventional systems without any reliance on conventional knowledge-based resources such as part of speech taggers.},
  categories = {Speech synthesis, TTS, unsupervised learning, neural net language modelling, multitask learning}
}
@inproceedings{Dall_Yamagishi_King_SpeechProsody2014,
  author = {Dall, Rasmus and Yamagishi, Junichi and King, Simon},
  title = {Rating Naturalness in Speech Synthesis: The Effect of Style and Expectation},
  booktitle = {Proc. Speech Prosody},
  month = {May},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Dall_Yamagishi_King_SpeechProsody2014.pdf},
  abstract = {In this paper we present evidence that speech produced spontaneously in a conversation is considered more natural than read prompts. We also explore the relationship between participants' expectations of the speech style under evaluation and their actual ratings. In successive listening tests subjects rated the naturalness of either spontaneously produced, read aloud or written sentences, with instructions toward either conversational, reading or general naturalness. It was found that, when presented with spontaneous or read aloud speech, participants consistently rated spontaneous speech more natural - even when asked to rate naturalness in the reading case. Presented with only text, participants generally preferred transcriptions of spontaneous utterances, except when asked to evaluate naturalness in terms of reading aloud. This has implications for the application of MOS-scale naturalness ratings in Speech Synthesis, and potentially on the type of data suitable for use both in general TTS, dialogue systems and specifically in Conversational TTS, in which the goal is to reproduce speech as it is produced in a spontaneous conversational setting.},
  categories = {speech synthesis, evaluation, naturalness, MOS, spontaneous speech, read speech, TTS}
}
@inproceedings{Hu_IC14,
  author = {Hu, Qiong and Stylianou, Yannis and Richmond, Korin and Maia, Ranniery and Yamagishi, Junichi and Latorre, Javier},
  title = {A Fixed Dimension and Perceptually Based Dynamic Sinusoidal Model of Speech},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  address = {Florence, Italy},
  month = {May},
  pages = {6311--6315},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Qiong_IC14.pdf},
  abstract = {This paper presents a fixed- and low-dimensional, perceptually based dynamic sinusoidal model of speech referred to as PDM (Perceptual Dynamic Model). To decrease and fix the number of sinusoidal components typically used in the standard sinusoidal model, we propose to use only one dynamic sinusoidal component per critical band. For each band, the sinusoid with the maximum spectral amplitude is selected and associated with the centre frequency of that critical band. The model is expanded at low frequencies by incorporating sinusoids at the boundaries of the corresponding bands while at the higher frequencies a modulated noise component is used. A listening test is conducted to compare speech reconstructed with PDM and state-of-the-art models of speech, where all models are constrained to use an equal number of parameters. The results show that PDM is clearly preferred in terms of quality over the other systems.}
}
@incollection{nicktomijunichi2014antispoofing,
  author = {Evans, Nicholas W. D. and Kinnunen, Tomi and Yamagishi, Junichi and Wu, Zhizheng and Alegre, Federico and De Leon, Phillip},
  title = {Speaker recognition anti-spoofing},
  booktitle = {Handbook of Biometric Anti-spoofing},
  editor = {Marcel, S. and Li, S. and Nixon, M.},
  publisher = {Springer},
  doi = {10.1007/978-1-4471-6524-8_7},
  abstract = {Progress in the development of spoofing countermeasures for automatic speaker recognition is less advanced than equivalent work related to other biometric modalities. This chapter outlines the potential for even state-of-the-art automatic speaker recognition systems to be spoofed. While the use of a multitude of different datasets, protocols and metrics complicates the meaningful comparison of different vulnerabilities, we review previous work related to impersonation, replay, speech synthesis and voice conversion spoofing attacks. The article also presents an analysis of the early work to develop spoofing countermeasures. The literature shows that there is significant potential for automatic speaker verification systems to be spoofed, that significant further work is required to develop generalised countermeasures, that there is a need for standard datasets, evaluation protocols and metrics and that greater emphasis should be placed on text-dependent scenarios.},
  month = {June},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/chapter7_anti-spoofing.pdf},
  categories = {Speaker recognition, spoofing attack, anti-spoofing, countermeasure}
}
@inproceedings{Hu_Interspeech14,
  author = {Hu, Qiong and Stylianou, Yannis and Maia, Ranniery and Richmond, Korin and Yamagishi, Junichi and Latorre, Javier},
  title = {An investigation of the application of dynamic sinusoidal models to statistical parametric speech synthesis},
  booktitle = {Proc. Interspeech},
  year = {2014},
  abstract = {This paper applies a dynamic sinusoidal synthesis model to statistical parametric speech synthesis (HTS). For this, we utilise regularised cepstral coefficients to represent both the static amplitude and dynamic slope of selected sinusoids for statistical modelling. During synthesis, a dynamic sinusoidal model is used to reconstruct speech. A preference test is conducted to compare the selection of different sinusoids for cepstral representation. Our results show that when integrated with HTS, a relatively small number of sinusoids selected according to a perceptual criterion can produce quality comparable to using all harmonics. A Mean Opinion Score (MOS) test shows that our proposed statistical system is preferred to one using mel-cepstra from pitch synchronous spectral analysis.},
  month = {September},
  address = {Singapore},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Q_Interspeech14.pdf},
  pages = {780--784}
}
@inproceedings{postfilter_IS14,
  author = {Chen, L.-H. and Raitio, T. and Valentini-Botinhao, C. and Yamagishi, J. and Ling, Z.-H.},
  title = {{DNN-Based Stochastic Postfilter for HMM-Based Speech Synthesis}},
  booktitle = {Proc. Interspeech},
  year = {2014},
  abstract = {In this paper we propose a deep neural network to model the conditional probability of the spectral differences between natural and synthetic speech. This allows us to reconstruct the spectral fine structures in speech generated by HMMs. We compared the new stochastic data-driven postfilter with global variance based parameter generation and modulation spectrum enhancement. Our results confirm that the proposed method significantly improves the segmental quality of synthetic speech compared to the conventional methods.},
  month = {September},
  address = {Singapore},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/postfilter_IS14.pdf},
  pages = {1954--1958}
}
@inproceedings{salb_IS14,
  author = {Valentini-Botinhao, C. and Toman, M. and Pucher, M. and Schabus, D. and Yamagishi, J.},
  title = {{Intelligibility Analysis of Fast Synthesized Speech}},
  booktitle = {Proc. Interspeech},
  year = {2014},
  abstract = {In this paper we analyse the effect of speech corpus and compression method on the intelligibility of synthesized speech at fast rates. We recorded English and German language voice talents at a normal and a fast speaking rate and trained an HSMM-based synthesis system based on the normal and the fast data of each speaker. We compared three compression methods: scaling the variance of the state duration model, interpolating the duration models of the fast and the normal voices, and applying a linear compression method to generated speech. Word recognition results for the English voices show that generating speech at normal speaking rate and then applying linear compression resulted in the most intelligible speech at all tested rates. A similar result was found when evaluating the intelligibility of the natural speech corpus. For the German voices, interpolation was found to be better at moderate speaking rates but the linear method was again more successful at very high rates, for both blind and sighted participants. These results indicate that using fast speech data does not necessarily create more intelligible voices and that linear compression can more reliably provide higher intelligibility, particularly at higher rates.},
  month = {September},
  address = {Singapore},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/salb_IS14.pdf},
  pages = {2922--2926}
}
@inproceedings{salb_IS15,
  author = {Pucher, M. and Toman, M. and Schabus, D. and Valentini-Botinhao, C. and Yamagishi, J. and Zillinger, B. and Schmid, E.},
  title = {{Influence of speaker familiarity on blind and visually impaired children's perception of synthetic voices in audio games}},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/salb_IS15.pdf},
  abstract = {In this paper we evaluate how speaker familiarity influences the engagement times and performance of blind school children when playing audio games made with different synthetic voices. We developed synthetic voices of school children, their teachers and of speakers that were unfamiliar to them and used each of these voices to create variants of two audio games: a memory game and a labyrinth game. Results show that pupils had significantly longer engagement times and better performance when playing games that used synthetic voices built with their own voices. This result was observed even though the children reported not recognising the synthetic voice as their own after the experiment was over. These findings could be used to improve the design of audio games and lecture books for blind and visually impaired children.}
}
@article{Cassia_SPCOM15,
  author = {Valentini-Botinhao, Cassia and Toman, Markus and Pucher, Michael and Schabus, Dietmar and Yamagishi, Junichi},
  doi = {10.1016/j.specom.2015.09.002},
  title = {Intelligibility of time-compressed synthetic speech: Compression method and speaking style},
  journal = {Speech Communication},
  month = {October},
  year = {2015},
  abstract = {We present a series of intelligibility experiments performed on natural and synthetic speech time-compressed at a range of rates and analyze the effect of speech corpus and compression method on the intelligibility scores of sighted and blind individuals. Particularly we are interested in comparing linear and non-linear compression methods applied to normal and fast speech of different speakers. We recorded English and German language voice talents reading prompts at a normal and a fast rate. To create synthetic voices we trained a statistical parametric speech synthesis system based on the normal and the fast data of each speaker. We compared three compression methods: scaling the variance of the state duration model, interpolating the duration models of the fast and the normal voices, and applying a linear compression method to the generated speech waveform. Word recognition results for the English voices show that generating speech at a normal speaking rate and then applying linear compression resulted in the most intelligible speech at all tested rates. A similar result was found when evaluating the intelligibility of the natural speech corpus. For the German voices, interpolation was found to be better at moderate speaking rates but the linear method was again more successful at very high rates, particularly when applied to the fast data. Phonemic level annotation of the normal and fast databases showed that the German speaker was able to reproduce speech at a fast rate with fewer deletion and substitution errors compared to the English speaker, supporting the intelligibility benefits observed when compressing his fast speech. This shows that the use of fast speech data to create faster synthetic voices does not necessarily lead to more intelligible voices as results are highly dependent on how successful the speaker was at speaking fast while maintaining intelligibility. Linear compression applied to normal rate speech can more reliably provide higher intelligibility, particularly at ultra fast rates.}
}
@article{7169536,
  author = {Chen, Ling-Hui and Raitio, T. and Valentini-Botinhao, C. and Ling, Z. and Yamagishi, J.},
  doi = {10.1109/TASLP.2015.2461448},
  title = {A Deep Generative Architecture for Postfiltering in Statistical Parametric Speech Synthesis},
  journal = {Audio, Speech, and Language Processing, IEEE/ACM Transactions on},
  issn = {2329-9290},
  number = {11},
  pages = {2003-2014},
  volume = {23},
  year = {2015},
  keywords = {HMM;deep generative architecture;modulation spectrum;postfilter;segmental quality;speech synthesis},
  abstract = {The generated speech of hidden Markov model (HMM)-based statistical parametric speech synthesis still sounds muffled. One cause of this degradation in speech quality may be the loss of fine spectral structures. In this paper, we propose to use a deep generative architecture, a deep neural network (DNN) generatively trained, as a postfilter. The network models the conditional probability of the spectrum of natural speech given that of synthetic speech to compensate for such gap between synthetic and natural speech. The proposed probabilistic postfilter is generatively trained by cascading two restricted Boltzmann machines (RBMs) or deep belief networks (DBNs) with one bidirectional associative memory (BAM). We devised two types of DNN postfilters: one operating in the mel-cepstral domain and the other in the higher dimensional spectral domain. We compare these two new data-driven postfilters with other types of postfilters that are currently used in speech synthesis: a fixed mel-cepstral based postfilter, the global variance based parameter generation, and the modulation spectrum-based enhancement. Subjective evaluations using the synthetic voices of a male and female speaker confirmed that the proposed DNN-based postfilter in the spectral domain significantly improved the segmental quality of synthetic speech compared to that with conventional methods.}
}
@inproceedings{Merritt2015RichContext,
  author = {Merritt, Thomas and Yamagishi, Junichi and Wu, Zhizheng and Watts, Oliver and King, Simon},
  title = {{Deep neural network context embeddings for model selection in rich-context HMM synthesis}},
  booktitle = {{Proc. Interspeech}},
  address = {Dresden},
  month = {September},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/merritt2015RichContext.pdf},
  abstract = {{This paper introduces a novel form of parametric synthesis that uses context embeddings produced by the bottleneck layer of a deep neural network to guide the selection of models in a rich-context HMM-based synthesiser. Rich-context synthesis – in which Gaussian distributions estimated from single linguistic contexts seen in the training data are used for synthesis, rather than more conventional decision tree-tied models – was originally proposed to address over-smoothing due to averaging across contexts. Our previous investigations have confirmed experimentally that averaging across different contexts is indeed one of the largest factors contributing to the limited quality of statistical parametric speech synthesis. However, a possible weakness of the rich context approach as previously formulated is that a conventional tied model is still used to guide selection of Gaussians at synthesis time. Our proposed approach replaces this with context embeddings derived from a neural network.}},
  categories = {{speech synthesis, hidden Markov model, deep neural networks, rich context, embedding}}
}
@inproceedings{ribeiro2015perceptual,
  author = {Ribeiro, Manuel Sam and Yamagishi, Junichi and Clark, Robert A. J.},
  title = {A Perceptual Investigation of Wavelet-based Decomposition of f0 for Text-to-Speech Synthesis},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/ribeiro_et_al_IS15.pdf},
  abstract = {The Continuous Wavelet Transform (CWT) has been recently proposed to model f0 in the context of speech synthesis. It was shown that systems using signal decomposition with the CWT tend to outperform systems that model the signal directly. The f0 signal is typically decomposed into various scales of differing frequency. In these experiments, we reconstruct f0 with selected frequencies and ask native listeners to judge the naturalness of synthesized utterances with respect to natural speech. Results indicate that HMM-generated f0 is comparable to the CWT low frequencies, suggesting it mostly generates utterances with neutral intonation. Middle frequencies achieve very high levels of naturalness, while very high frequencies are mostly noise.},
  categories = {speech synthesis, prosody, f0 modeling, continuous wavelet transform, perceptual experiments}
}
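A rough illustration of the f0 decomposition described in the abstract above. This is not the authors' code: it assumes PyWavelets, a Mexican-hat mother wavelet and ten dyadic scales, and the scale-weighted sum is only an approximate inverse of the CWT.

import numpy as np
import pywt

def cwt_decompose(logf0, num_scales=10, base_scale=2.0):
    """Split an interpolated log-f0 contour into per-scale components."""
    scales = base_scale ** np.arange(1, num_scales + 1)        # dyadic scales
    coefs, _ = pywt.cwt(logf0 - logf0.mean(), scales, 'mexh')  # (num_scales, T)
    # Scale-weighted components; summing all of them approximates the shape
    # of the input contour up to an overall gain.
    return coefs / np.sqrt(scales)[:, None], logf0.mean()

def reconstruct(components, mean, keep=None):
    """Rebuild log-f0 from the selected scale indices (None = all scales)."""
    selected = components if keep is None else components[list(keep)]
    return selected.sum(axis=0) + mean

# Toy usage: keep only the slower scales, i.e. phrase-level intonation.
t = np.linspace(0, 2, 400)
f0_hz = 120 + 20 * np.sin(2 * np.pi * 0.7 * t) + 5 * np.random.randn(400)
components, mean = cwt_decompose(np.log(f0_hz))
phrase_level_f0 = np.exp(reconstruct(components, mean, keep=range(5, 10)))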
@inproceedings{wester:human:IS2015,
  author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
  title = {Human vs Machine Spoofing Detection on Wideband and Narrowband Data},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/wester:human:IS2015.pdf},
  abstract = {How well do humans detect spoofing attacks directed at automatic speaker verification systems? This paper investigates the performance of humans at detecting spoofing attacks from speech synthesis and voice conversion systems. Two speaker verification tasks, in which the speakers were either humans or machines, were also conducted. The three tasks were carried out with two types of data: wideband (16kHz) and narrowband (8kHz) telephone line simulated data. Spoofing detection by humans was compared to automatic spoofing detection (ASD) algorithms. Listening tests were carefully constructed to ensure the human and automatic tasks were as similar as possible, taking into consideration listeners’ constraints (e.g., fatigue and memory limitations). Results for human trials show the error rates on narrowband data double compared to wideband data. The second verification task, which included only artificial speech, showed equal overall acceptance rates for both 8kHz and 16kHz. In the spoofing detection task, there was a drop in performance on most of the artificial trials as well as on human trials. At 8kHz, 20% of human trials were incorrectly classified as artificial, compared to 12% at 16kHz. The ASD algorithms also showed a drop in performance on 8kHz data, but outperformed human listeners across the board.},
  categories = {spoofing, human performance, automatic spoofing detection}
}
@inproceedings{wu2015asvspoof,
  author = {Wu, Zhizheng and Kinnunen, Tomi and Evans, Nicholas and Yamagishi, Junichi and Hanilci, Cemal and Sahidullah, Md and Sizov, Aleksandr},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/is2015_asvspoof.pdf},
  booktitle = {Interspeech},
  year = {2015},
  title = {{ASVspoof} 2015: the First Automatic Speaker Verification Spoofing and Countermeasures Challenge}
}
@inproceedings{wu2015sas,
  author = {Wu, Zhizheng and Khodabakhsh, Ali and Demiroglu, Cenk and Yamagishi, Junichi and Saito, Daisuke and Toda, Tomoki and King, Simon},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/icassp2015_sas.pdf},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year = {2015},
  title = {{SAS}: A speaker verification spoofing database containing diverse attacks}
}
@inproceedings{Hu_ICASSP15,
  author = {Hu, Qiong and Stylianou, Yannis and Maia, Ranniery and Richmond, Korin and Yamagishi, Junichi},
  title = {Methods for applying dynamic sinusoidal models to statistical parametric speech synthesis},
  booktitle = {Proc. ICASSP},
  year = {2015},
  month = {April},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/IC15_Qiong.pdf},
  abstract = {Sinusoidal vocoders can generate high quality speech, but they have not been extensively applied to statistical parametric speech synthesis. This paper presents two ways for using dynamic sinusoidal models for statistical speech synthesis, enabling the sinusoid parameters to be modelled in HMM-based synthesis. In the first method, features extracted from a fixed- and low-dimensional, perception-based dynamic sinusoidal model (PDM) are statistically modelled directly. In the second method, we convert both static amplitude and dynamic slope from all the harmonics of a signal, which we term the Harmonic Dynamic Model (HDM), to intermediate parameters (regularised cepstral coefficients) for modelling. During synthesis, HDM is then used to reconstruct speech. We have compared the voice quality of these two methods to the STRAIGHT cepstrum-based vocoder with mixed excitation in formal listening tests. Our results show that HDM with intermediate parameters can generate quality comparable to STRAIGHT, while PDM direct modelling seems promising in terms of producing good speech quality without resorting to intermediate parameters such as cepstra.}
}
@inproceedings{Hu_Interspeech15,
  author = {Hu, Qiong and Wu, Zhizheng and Richmond, Korin and Yamagishi, Junichi and Stylianou, Yannis and Maia, Ranniery},
  title = {Fusion of multiple parameterisations for {DNN}-based sinusoidal speech synthesis with multi-task learning},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/Q_Interspeech15.pdf},
  abstract = {It has recently been shown that deep neural networks (DNN) can improve the quality of statistical parametric speech synthesis (SPSS) when using a source-filter vocoder. Our own previous work has furthermore shown that a dynamic sinusoidal model (DSM) is also highly suited to DNN-based SPSS, whereby sinusoids may either be used themselves as a “direct parameterisation” (DIR), or they may be encoded using an “intermediate spectral parameterisation” (INT). The approach in that work was effectively to replace a decision tree with a neural network. However, waveform parameterisation and synthesis steps that have been developed to suit HMMs may not fully exploit DNN capabilities. Here, in contrast, we investigate ways to combine INT and DIR at the levels of both DNN modelling and waveform generation. For DNN training, we propose to use multi-task learning to model cepstra (from INT) and log amplitudes (from DIR) as primary and secondary tasks. Our results show combining these improves modelling accuracy for both tasks. Next, during synthesis, instead of discarding parameters from the second task, a fusion method using harmonic amplitudes derived from both tasks is applied. Preference tests show the proposed method gives improved performance, and that this applies to synthesising both with and without global variance parameters.}
}
@article{stan-2016,
  author = {Stan, Adriana and Mamiya, Yoshitaka and Yamagishi, Junichi and Bell, Peter and Watts, Oliver and Clark, Rob and King, Simon},
  doi = {10.1016/j.csl.2015.06.006},
  title = {{ALISA}: An automatic lightly supervised speech segmentation and alignment tool},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230815000650},
  journal = {Computer Speech and Language},
  issn = {0885-2308},
  abstract = {This paper describes the ALISA tool, which implements a lightly supervised method for sentence-level alignment of speech with imperfect transcripts. Its intended use is to enable the creation of new speech corpora from a multitude of resources in a language-independent fashion, thus avoiding the need to record or transcribe speech data. The method is designed so that it requires minimum user intervention and expert knowledge, and it is able to align data in languages which employ alphabetic scripts. It comprises a GMM-based voice activity detector and a highly constrained grapheme-based speech aligner. The method is evaluated objectively against a gold standard segmentation and transcription, as well as subjectively through building and testing speech synthesis systems from the retrieved data. Results show that on average, 70% of the original data is correctly aligned, with a word error rate of less than 0.5%. In one case, subjective listening tests show a statistically significant preference for voices built on the gold transcript, but this is small and in other tests, no statistically significant differences between the systems built from the fully supervised training data and the one which uses the proposed method are found.},
  volume = {35},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/stan-2016.pdf},
  pages = {116--133},
  categories = {Speech segmentation, speech and text alignment, grapheme acoustic models, lightly supervised system, imperfect transcripts}
}
@article{LorenzoTrueba2015292,
  author = {Lorenzo-Trueba, Jaime and Barra-Chicote, Roberto and San-Segundo, Rubén and Ferreiros, Javier and Yamagishi, Junichi and Montero, Juan M.},
  doi = {10.1016/j.csl.2015.03.008},
  title = {Emotion transplantation through adaptation in HMM-based speech synthesis},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230815000376},
  journal = {Computer Speech and Language},
  issn = {0885-2308},
  number = {1},
  abstract = {This paper proposes an emotion transplantation method capable of modifying a synthetic speech model through the use of CSMAPLR adaptation in order to incorporate emotional information learned from a different speaker model while maintaining the identity of the original speaker as much as possible. The proposed method relies on learning both emotional and speaker identity information by means of their adaptation function from an average voice model, and combining them into a single cascade transform capable of imbuing the desired emotion into the target speaker. This method is then applied to the task of transplanting four emotions (anger, happiness, sadness and surprise) into 3 male speakers and 3 female speakers and evaluated in a number of perceptual tests. The results of the evaluations show how the perceived naturalness for emotional text significantly favors the use of the proposed transplanted emotional speech synthesis when compared to traditional neutral speech synthesis, evidenced by a big increase in the perceived emotional strength of the synthesized utterances at a slight cost in speech quality. A final evaluation with a robotic laboratory assistant application shows how by using emotional speech we can significantly increase the students’ satisfaction with the dialog system, proving how the proposed emotion transplantation system provides benefits in real applications.},
  volume = {34},
  year = {2015},
  keywords = {Emotion transplantation},
  pages = {292--307}
}
@inproceedings{lecumberri2014generating,
  author = {Lecumberri, Mar{\'i}a Luisa Garc{\'i}a and Barra-Chicote, Roberto and Ram{\'o}n, Rub{\'e}n P{\'e}rez and Yamagishi, Junichi and Cooke, Martin},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/generating_segmental_foreign_accent.pdf},
  booktitle = {Fifteenth Annual Conference of the International Speech Communication Association},
  year = {2014},
  abstract = {For most of us, speaking in a non-native language involves deviating to some extent from native pronunciation norms. However, the detailed basis for foreign accent (FA) remains elusive, in part due to methodological challenges in isolating segmental from suprasegmental factors. The current study examines the role of segmental features in conveying FA through the use of a generative approach in which accent is localised to single consonantal segments. Three techniques are evaluated: the first requires a highly proficient bilingual to produce words with isolated accented segments; the second uses cross-splicing of context-dependent consonants from the non-native language into native words; the third employs hidden Markov model synthesis to blend voice models for both languages. Using English and Spanish as the native/non-native languages respectively, listener cohorts from both languages identified words and rated their degree of FA. All techniques were capable of generating accented words, but to differing degrees. Naturally-produced speech led to the strongest FA ratings and synthetic speech the weakest, which we interpret as the outcome of over-smoothing. Nevertheless, the flexibility offered by synthesising localised accent encourages further development of the method.},
  title = {Generating segmental foreign accent}
}
@inproceedings{CassiaIOS14,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and King, Simon},
  title = {Intelligibility Enhancement of Speech in Noise},
  booktitle = {Proceedings of the Institute of Acoustics},
  year = {2014},
  abstract = {To maintain communication success, humans change the way they speak and hear according to many factors, like the age, gender, native language and social relationship between talker and listener. Other factors are dictated by how communication takes place, such as environmental factors like an active competing speaker or limitations on the communication channel. As in natural interaction, we expect to communicate with and use synthetic voices that can also adapt to different listening scenarios and keep the level of intelligibility high. Research in speech technology needs to account for this to change the way we transmit, store and artificially generate speech accordingly.},
  month = {October},
  volume = {36 Pt. 2},
  address = {Birmingham, UK},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/CassiaIOS14.pdf},
  pages = {96-103}
}
@inproceedings{astrinaki2013b,
  author = {Astrinaki, Maria and Moinet, Alexis and Yamagishi, Junichi and Richmond, Korin and Ling, Zhen-Hua and King, Simon and Dutoit, Thierry},
  title = {Mage - {HMM}-based speech synthesis reactively controlled by the articulators},
  abstract = {In this paper, we present the recent progress in the MAGE project. MAGE is a library for realtime and interactive (reactive) parametric speech synthesis using hidden Markov models (HMMs). Here, it is broadened in order to support not only the standard acoustic features (spectrum and f0) to model and synthesize speech but also to combine acoustic and articulatory features, such as tongue, lips and jaw positions. Such an integration enables the user to have a straightforward and meaningful control space to intuitively modify the synthesized phones in real time only by configuring the position of the articulators.},
  year = {2013},
  month = {August},
  address = {Barcelona, Spain},
  keywords = {speech synthesis, reactive, articulators},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ICPHS0724.pdf},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  pages = {243}
}
@inproceedings{dall2016testing,
  author = {Dall, Rasmus and Brognaux, Sandrine and Richmond, Korin and Valentini-Botinhao, Cassia and Henter, Gustav Eje and Hirschberg, Julia and Yamagishi, Junichi},
  title = {Testing the consistency assumption: pronunciation variant forced alignment in read and spontaneous speech synthesis},
  abstract = {Forced alignment for speech synthesis traditionally aligns a phoneme sequence predetermined by the front-end text processing system. This sequence is not altered during alignment, i.e., it is forced, despite possibly being faulty. The consistency assumption is the assumption that these mistakes do not degrade models, as long as the mistakes are consistent across training and synthesis. We present evidence that in the alignment of both standard read prompts and spontaneous speech this phoneme sequence is often wrong, and that this is likely to have a negative impact on acoustic models. A lattice-based forced alignment system allowing for pronunciation variation is implemented, resulting in improved phoneme identity accuracy for both types of speech. A perceptual evaluation of HMM-based voices showed that spontaneous models trained on this improved alignment also improved standard synthesis, despite breaking the consistency assumption.},
  month = {March},
  pages = {5155-5159},
  year = {2016},
  keywords = {speech synthesis, TTS, forced alignment, HMM},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2016/dall2016testing.pdf},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}
}
@inproceedings{hu2016initial,
  author = {Hu, Qiong and Yamagishi, Junichi and Richmond, Korin and Subramanian, Kartick and Stylianou, Yannis},
  title = {Initial investigation of speech synthesis based on complex-valued neural networks},
  abstract = {Although frequency analysis often leads us to a speech signal in the complex domain, the acoustic models we frequently use are designed for real-valued data. Phase is usually ignored or modelled separately from spectral amplitude. Here, we propose a complex-valued neural network (CVNN) for directly modelling the results of the frequency analysis in the complex domain (such as the complex amplitude). We also introduce a phase encoding technique to map real-valued data (e.g. cepstra or log amplitudes) into the complex domain so we can use the same CVNN processing seamlessly. In this paper, a fully complex-valued neural network, namely a neural network where all of the weight matrices, activation functions and learning algorithms are in the complex domain, is applied for speech synthesis. Results show its ability to model both complex-valued and real-valued data.},
  month = {March},
  pages = {5630-5634},
  year = {2016},
  keywords = {complex-valued neural network, speech synthesis, complex amplitude, phase modelling},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2016/hu2016initial.pdf},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}
}
@article{richmond2015use,
  author = {Richmond, Korin and Ling, Zhen-Hua and Yamagishi, Junichi},
  doi = {10.1250/ast.36.467},
  title = {The use of articulatory movement data in speech synthesis applications: An overview - Application of articulatory movements using machine learning algorithms [Invited Review]},
  journal = {Acoustical Science and Technology},
  number = {6},
  volume = {36},
  year = {2015},
  pages = {467-477}
}
@article{richmond2015applications,
  author = {Richmond, Korin and Yamagishi, Junichi and Ling, Zhen-Hua},
  title = {Applications of articulatory movements based on machine learning},
  journal = {Journal of the Acoustical Society of Japan},
  number = {10},
  volume = {70},
  year = {2015},
  pages = {539--545}
}
@inproceedings{toda2016voice,
  author = {Toda, Tomoki and Chen, Ling-Hui and Saito, Daisuke and Villavicencio, Fernando and Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/toda2016voice.pdf},
  booktitle = {Proc. Interspeech},
  title = {The Voice Conversion Challenge 2016},
  abstract = {This paper describes the Voice Conversion Challenge 2016 devised by the authors to better understand different voice conversion (VC) techniques by comparing their performance on a common dataset. The task of the challenge was speaker conversion, i.e., to transform the voice identity of a source speaker into that of a target speaker while preserving the linguistic content. Using a common dataset consisting of 162 utterances for training and 54 utterances for evaluation from each of 5 source and 5 target speakers, 17 groups working in VC around the world developed their own VC systems for every combination of the source and target speakers, i.e., 25 systems in total, and generated voice samples converted by the developed systems. These samples were evaluated in terms of target speaker similarity and naturalness by 200 listeners in a controlled environment. This paper summarizes the design of the challenge, its result, and a future plan to share views about unsolved problems and challenges faced by the current VC techniques.},
  year = {2016}
}
@inproceedings{wester2016analysis,
  author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/wester2016analysis.pdf},
  booktitle = {Proc. Interspeech},
  title = {Analysis of the Voice Conversion Challenge 2016 Evaluation Results},
  abstract = {The Voice Conversion Challenge 2016 is the first Voice Conversion Challenge in which different voice conversion systems and approaches using the same voice data were compared. This paper describes the design of the evaluation and presents the results together with statistical analyses of the results.},
  year = {2016}
}
@inproceedings{wester2016multidimensional,
  author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
  title = {Multidimensional scaling of systems in the Voice Conversion Challenge 2016},
  booktitle = {Proc. Speech Synthesis Workshop 9},
  year = {2016},
  address = {Sunnyvale, CA.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/wester2016multidimensional.pdf},
  abstract = {This study investigates how listeners judge the similarity of voice converted voices using a talker discrimination task. The data used is from the Voice Conversion Challenge 2016. 17 participants from around the world took part in building voice converted voices from a shared data set of source and target speakers. This paper describes the evaluation of similarity for four of the source-target pairs (two intra-gender and two cross-gender) in more detail. Multidimensional scaling was performed to illustrate where each system was perceived to be in an acoustic space compared to the source and target speakers and to each other.}
}
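As a rough illustration of the multidimensional scaling analysis mentioned in the abstract above (not the paper's own code or data): given pairwise dissimilarities between systems and reference speakers, e.g. the fraction of "different speaker" responses per pair, metric MDS from scikit-learn projects them into a 2-D perceptual space. The labels and numbers below are invented, and the use of sklearn's metric MDS is an assumption.

import numpy as np
from sklearn.manifold import MDS

labels = ["source", "target", "system_A", "system_B"]
# Fraction of "different speaker" responses for each pair (symmetric, zero diagonal).
dissim = np.array([
    [0.00, 0.95, 0.60, 0.80],
    [0.95, 0.00, 0.35, 0.25],
    [0.60, 0.35, 0.00, 0.40],
    [0.80, 0.25, 0.40, 0.00],
])

mds = MDS(n_components=2, dissimilarity='precomputed', random_state=0)
coords = mds.fit_transform(dissim)
for name, (x, y) in zip(labels, coords):
    print(f"{name:9s} -> ({x:+.2f}, {y:+.2f})")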
@inproceedings{merritt2016hybrid,
  author = {Merritt, Thomas and Clark, Robert A J and Wu, Zhizheng and Yamagishi, Junichi and King, Simon},
  title = {Deep neural network-guided unit selection synthesis},
  booktitle = {Proc. ICASSP},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Merritt_ICASSP2016.pdf},
  abstract = {Vocoding of speech is a standard part of statistical parametric speech synthesis systems. It imposes an upper bound on the naturalness that can possibly be achieved. Hybrid systems using parametric models to guide the selection of natural speech units can combine the benefits of robust statistical models with the high level of naturalness of waveform concatenation. Existing hybrid systems use Hidden Markov Models (HMMs) as the statistical model. This paper demonstrates that the superiority of Deep Neural Network (DNN) acoustic models over HMMs in conventional statistical parametric speech synthesis also carries over to hybrid synthesis. We compare various DNN and HMM hybrid configurations, guiding the selection of waveform units in either the vocoder parameter domain, or in the domain of embeddings (bottleneck features).},
  categories = {speech synthesis, hybrid synthesis, deep neural networks, embedding, unit selection}
}
@inproceedings{ribeiro2016wavelet,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi and Clark, Robert A. J.},
  title = {Wavelet-based decomposition of f0 as a secondary task for {DNN-based} speech synthesis with multi-task learning},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year = {2016},
  month = {March},
  address = {Shanghai, China},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ribeiro-et-al-icassp16.pdf},
  abstract = {We investigate two wavelet-based decomposition strategies of the f0 signal and their usefulness as a secondary task for speech synthesis using multi-task deep neural networks (MTL-DNN). The first decomposition strategy uses a static set of scales for all utterances in the training data. We propose a second strategy, where the scale of the mother wavelet is dynamically adjusted to the rate of each utterance. This approach is able to capture f0 variations related to the syllable, word, clitic-group, and phrase units. This method also constrains the wavelet components to be within the frequency range that previous experiments have shown to be more natural. These two strategies are evaluated as a secondary task in multi-task deep neural networks (MTL-DNNs). Results indicate that on an expressive dataset there is a strong preference for the systems using multi-task learning when compared to the baseline system.},
  categories = {speech synthesis, f0 modelling, deep neural network, multi-task learning, continuous wavelet transform}
}
@inproceedings{ribeiro2016syllable,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi},
  title = {Syllable-level representations of suprasegmental features for {DNN-based} text-to-speech synthesis},
  booktitle = {Proceedings of Interspeech},
  year = {2016},
  month = {September},
  address = {San Francisco, United States},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/1034.PDF},
  abstract = {A top-down hierarchical system based on deep neural networks is investigated for the modeling of prosody in speech synthesis. Suprasegmental features are processed separately from segmental features and a compact distributed representation of high-level units is learned at syllable-level. The suprasegmental representation is then integrated into a frame-level network. Objective measures show that balancing segmental and suprasegmental features can be useful for the frame-level network. Additional features incorporated into the hierarchical system are then tested. At the syllable-level, a bag-of-phones representation is proposed and, at the word-level, embeddings learned from text sources are used. It is shown that the hierarchical system is able to leverage new features at higher levels more efficiently than a system which exploits them directly at the frame-level. A perceptual evaluation of the proposed systems is conducted and followed by a discussion of the results.},
  categories = {speech synthesis, prosody, deep neural networks, suprasegmental representations}
}
@inproceedings{ribeiro2016parallel,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi},
  title = {Parallel and cascaded deep neural networks for text-to-speech synthesis},
  booktitle = {9th ISCA Workshop on Speech Synthesis (SSW9)},
  year = {2016},
  month = {September},
  address = {Sunnyvale, United States},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ribeiro-et-al-ssw9.pdf},
  abstract = {An investigation of cascaded and parallel deep neural networks for speech synthesis is conducted. In these systems, suprasegmental linguistic features (syllable-level and above) are processed separately from segmental features (phone-level and below). The suprasegmental component of the networks learns compact distributed representations of high-level linguistic units without any segmental influence. These representations are then integrated into a frame-level system using a cascaded or a parallel approach. In the cascaded network, suprasegmental representations are used as input to the frame-level network. In the parallel network, segmental and suprasegmental features are processed separately and concatenated at a later stage. These experiments are conducted with a standard set of high-dimensional linguistic features as well as a hand-pruned one. It is observed that hierarchical systems are consistently preferred over the baseline feedforward systems. Similarly, parallel networks are preferred over cascaded networks.},
  categories = {speech synthesis, prosody, deep neural networks, embeddings, suprasegmental representations}
}
@inproceedings{ribeiro2017learning,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi},
  title = {Learning word vector representations based on acoustic counts},
  booktitle = {Proceedings of Interspeech},
  year = {2017},
  month = {August},
  address = {Stockholm, Sweden},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/1340.PDF},
  abstract = {This paper presents a simple count-based approach to learning word vector representations by leveraging statistics of co-occurrences between text and speech. This type of representation requires two discrete sequences of units defined across modalities. Two possible methods for the discretization of an acoustic signal are presented, which are then applied to fundamental frequency and energy contours of a transcribed corpus of speech, yielding a sequence of textual objects (e.g. words, syllables) aligned with a sequence of discrete acoustic events. Constructing a matrix recording the co-occurrence of textual objects with acoustic events and reducing its dimensionality with matrix decomposition results in a set of context-independent representations of word types. These are applied to the task of acoustic modelling for speech synthesis; objective and subjective results indicate that these representations are useful for the generation of acoustic parameters in a text-to-speech (TTS) system. In general, we observe that the more discretization approaches, acoustic signals, and levels of linguistic analysis are incorporated into a TTS system via these count-based representations, the better that TTS system performs.},
  categories = {speech synthesis, text-to-speech, vector representations, word embeddings, deep neural networks}
}
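A minimal sketch of the count-based representation described in the abstract above, under assumed inputs: word tokens already aligned with discrete acoustic events (the paper derives these from quantised f0 and energy contours, which is not reproduced here). Co-occurrence counts are collected and reduced with a truncated SVD to give context-independent word vectors; the toy alignments and dimensionality below are hypothetical.

import numpy as np

def count_based_vectors(aligned_pairs, dim=8):
    words = sorted({w for w, _ in aligned_pairs})
    events = sorted({e for _, e in aligned_pairs})
    w_idx = {w: i for i, w in enumerate(words)}
    e_idx = {e: i for i, e in enumerate(events)}

    counts = np.zeros((len(words), len(events)))
    for w, e in aligned_pairs:
        counts[w_idx[w], e_idx[e]] += 1.0

    # Truncated SVD of the log-scaled count matrix gives low-dimensional,
    # context-independent word representations.
    U, S, _ = np.linalg.svd(np.log1p(counts), full_matrices=False)
    k = min(dim, len(S))
    return {w: U[i, :k] * S[:k] for w, i in w_idx.items()}

# Toy usage with invented word/acoustic-event alignments.
pairs = [("hello", "rise"), ("hello", "high"), ("world", "fall"),
         ("world", "low"), ("again", "rise"), ("again", "low")]
vectors = count_based_vectors(pairs, dim=2)
print(vectors["hello"])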
@inproceedings{Valentini16b,
  author = {{Valentini-Botinhao}, Cassia and Wang, Xin and Takaki, Shinji and Yamagishi, Junichi},
  publisher = {ISCA},
  doi = {10.21437/Interspeech.2016-159},
  title = {Speech Enhancement for a Noise-Robust Text-to-Speech Synthesis System using Deep Recurrent Neural Networks},
  booktitle = {Interspeech},
  abstract = {Quality of text-to-speech voices built from noisy recordings is diminished. In order to improve it we propose the use of a recurrent neural network to enhance acoustic parameters prior to training. We trained a deep recurrent neural network using a parallel database of noisy and clean acoustic parameters as input and output of the network. The database consisted of multiple speakers and diverse noise conditions. We investigated using text-derived features as an additional input of the network. We processed a noisy database of two other speakers using this network and used its output to train an HMM-based text-to-speech acoustic model for each voice. Listening experiment results showed that the voice built with enhanced parameters was ranked significantly higher than the ones trained with noisy speech and speech that has been enhanced using a conventional enhancement system. The text-derived features improved results only for the female voice, where it was ranked as highly as a voice trained with clean speech.},
  month = sep,
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Interspeech2016_Cassia_1.pdf},
  pages = {352--356}
}
@inproceedings{Valentini16a,
  author = {{Valentini-Botinhao}, Cassia and Wang, Xin and Takaki, Shinji and Yamagishi, Junichi},
  title = {Investigating {RNN}-based speech enhancement methods for noise-robust Text-to-Speech},
  booktitle = {Proceedings of 9th ISCA Speech Synthesis Workshop},
  abstract = {The quality of text-to-speech (TTS) voices built from noisy speech is compromised. Enhancing the speech data before training has been shown to improve quality but voices built with clean speech are still preferred. In this paper we investigate two different approaches for speech enhancement to train TTS systems. In both approaches we train a recurrent neural network (RNN) to map acoustic features extracted from noisy speech to features describing clean speech. The enhanced data is then used to train the TTS acoustic model. In one approach we use the features conventionally employed to train TTS acoustic models, i.e. Mel cepstral (MCEP) coefficients, aperiodicity values and fundamental frequency (F0). In the other approach, following conventional speech enhancement methods, we train an RNN using only the MCEP coefficients extracted from the magnitude spectrum. The enhanced MCEP features and the phase extracted from noisy speech are combined to reconstruct the waveform which is then used to extract acoustic features to train the TTS system. We show that the second approach results in larger MCEP distortion but smaller F0 errors. Subjective evaluation shows that synthetic voices trained with data enhanced with this method were rated higher, with scores similar to those of voices trained with clean speech.},
  month = sep,
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/SSW9_Cassia_1.pdf},
  pages = {159--165}
}
@inproceedings{Valentini17,
  author = {{Valentini-Botinhao}, Cassia and Yamagishi, Junichi},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/105_Paper_2.pdf},
  booktitle = {Interspeech},
  year = {2017},
  abstract = {Intelligibility of speech in noise becomes lower as the listener's age increases, even when no apparent hearing impairment is present. The losses are, however, different depending on the nature of the noise and the characteristics of the voice. In this paper we investigate the effect that age, noise type and speaking style have on the intelligibility of speech reproduced by car loudspeakers. Using a binaural mannequin we recorded a variety of voices and speaking styles played from the audio system of a car while driving in different conditions. We used this material to create a listening test where participants were asked to transcribe what they could hear and recruited groups of young and older adults to take part in it. We found that intelligibility scores of older participants were lower for the competing speaker and background music conditions. Results also indicate that clear and Lombard speech was more intelligible than plain speech for both age groups. A mixed effect model revealed that the largest effect was the noise condition, followed by sentence type, speaking style, voice, age group and pure tone average.},
  title = {Speech intelligibility in cars: the effect of speaking style, noise and listener age}
}
@inproceedings{Lorenzo17,
  author = {Lorenzo-Trueba, Jaime and {Valentini-Botinhao}, Cassia and Henter, Gustav and Yamagishi, Junichi},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/532_Paper_1.pdf},
  booktitle = {Interspeech},
  year = {2017},
  abstract = {This paper analyzes a) how often listeners interpret the emotional content of an utterance incorrectly when listening to vocoded or natural speech in adverse conditions; b) which noise conditions cause the most misperceptions; and c) which group of listeners misinterpret emotions the most. The long-term goal is to construct new emotional speech synthesizers that adapt to the environment and to the listener. We performed a large-scale listening test where over 400 listeners between the ages of 21 and 72 assessed natural and vocoded acted emotional speech stimuli. The stimuli had been artificially degraded using a room impulse response recorded in a car and various in-car noise types recorded in a real car. Experimental results show that the recognition rates for emotions and perceived emotional strength degrade as signal-to-noise ratio decreases. Interestingly, misperceptions seem to be more pronounced for negative and low-arousal emotions such as calmness or anger, while positive emotions such as happiness appear to be more robust to noise. An ANOVA analysis of listener meta-data further revealed that gender and age also influenced results, with elderly male listeners most likely to incorrectly identify emotions.},
  title = {Misperceptions of the emotional content of natural and vocoded speech in a car}
}
@article{Pucher17,
  author = {Pucher, Michael and Zillinger, Bettina and Toman, Markus and Schabus, Dietmar and {Valentini-Botinhao}, Cassia and Yamagishi, Junichi and Schmid, Erich and Woltron, Thomas},
  publisher = {Academic Press Inc.},
  doi = {10.1016/j.csl.2017.05.010},
  title = {Influence of speaker familiarity on blind and visually impaired children's and young adults' perception of synthetic voices},
  journal = {Computer Speech and Language},
  issn = {0885-2308},
  abstract = {In this paper we evaluate how speaker familiarity influences the engagement times and performance of blind children and young adults when playing audio games made with different synthetic voices. We also show how speaker familiarity influences speaker and synthetic speech recognition. For the first experiment we develop synthetic voices of school children, their teachers and of speakers that are unfamiliar to them and use each of these voices to create variants of two audio games: a memory game and a labyrinth game. Results show that pupils have significantly longer engagement times and better performance when playing games that use synthetic voices built with their own voices. These findings can be used to improve the design of audio games and lecture books for blind and visually impaired children and young adults. In the second experiment we show that blind children and young adults are better in recognising synthetic voices than their visually impaired companions. We also show that the average familiarity with a speaker and the similarity between a speaker’s synthetic and natural voice are correlated to the speaker’s synthetic voice recognition rate.},
  month = jun,
  volume = {46},
  year = {2017},
  pages = {179--195}
}
@inproceedings{Villavicencio+2016,
  author = {Villavicencio, Fernando and Yamagishi, Junichi and Bonada, Jordi and Espic, Felipe},
  doi = {10.21437/Interspeech.2016-305},
  title = {Applying Spectral Normalisation and Efficient Envelope Estimation and Statistical Transformation for the Voice Conversion Challenge 2016},
  url = {http://hdl.handle.net/10230/32891},
  booktitle = {Interspeech},
  address = {San Francisco, USA},
  abstract = {In this work we present our entry for the Voice Conversion Challenge 2016, denoting new features to previous work on GMM-based voice conversion. We incorporate frequency warping and pitch transposition strategies to perform a normalisation of the spectral conditions, with benefits confirmed by objective and perceptual means. Moreover, the results of the challenge showed our entry among the highest performing systems in terms of perceived naturalness while maintaining the target similarity performance of GMM-based conversion.},
  month = sep,
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/0305.PDF},
  pages = {1657--1661},
  categories = {voice conversion, speech synthesis, statistical spectral transformation, spectral envelope modeling}
}
@inproceedings{yoshimura2016hierarchical,
  author = {Yoshimura, Takenori and Henter, {Gustav Eje} and Watts, Oliver and Wester, Mirjam and Yamagishi, Junichi and Tokuda, Keiichi},
  publisher = {International Speech Communication Association},
  doi = {10.21437/Interspeech.2016-847},
  title = {A Hierarchical Predictor of Synthetic Speech Naturalness Using Neural Networks},
  abstract = {A problem when developing and tuning speech synthesis systems is that there is no well-established method of automatically rating the quality of the synthetic speech. This research attempts to obtain a new automated measure which is trained on the result of large-scale subjective evaluations employing many human listeners, i.e., the Blizzard Challenge. To exploit the data, we experiment with linear regression, feed-forward and convolutional neural network models, and combinations of them to regress from synthetic speech to the perceptual scores obtained from listeners. The biggest improvements were seen when combining stimulus- and system-level predictions.},
  month = sep,
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/0847.PDF},
  booktitle = {Interspeech 2016},
  pages = {342--346}
}