The Centre for Speech Technology Research, The University of Edinburgh

Publications by Junichi Yamagishi

jyamagis.bib

@article{analysis-hts-adaptation-junichi,
  author = {Junichi Yamagishi and Takao Kobayashi and Yuji Nakano
                   and Katsumi Ogata and Juri Isogai},
  title = {Analysis of Speaker Adaptation Algorithms for
                   {HMM}-based Speech Synthesis and a Constrained {SMAPLR}
                   Adaptation Algorithm},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  note = {In print},
  abstract = {In this paper we analyze the effects of several
                   factors and configuration choices encountered during
                   training and model construction when we want to obtain
                   better and more stable adaptation in HMM-based speech
                   synthesis. We then propose a new adaptation algorithm
                   called constrained structural maximum a posteriori
                   linear regression (CSMAPLR) whose derivation is based
                   on the knowledge obtained in this analysis and on the
                   results of comparing several conventional adaptation
                    algorithms. Here we investigate several major aspects of
                    speaker adaptation: initial models, transform
                    functions, estimation criteria, and the sensitivity of
                    several linear regression adaptation
                    algorithms. Analyzing the effect of the initial model,
                   we compare speaker-dependent models, gender-independent
                   models, and the simultaneous use of the
                   gender-dependent models to single use of the
                   gender-dependent models. Analyzing the effect of the
                   transform functions, we compare the transform function
                   for only mean vectors with that for mean vectors and
                   covariance matrices. Analyzing the effect of the
                   estimation criteria, we compare the ML criterion with a
                   robust estimation criterion called structural MAP. We
                   evaluate the sensitivity of several thresholds for the
                   piecewise linear regression algorithms and take up
                   methods combining MAP adaptation with the linear
                   regression algorithms. We incorporate these adaptation
                   algorithms into our speech synthesis system and present
                   several subjective and objective evaluation results
                   showing the utility and effectiveness of these
                   algorithms in speaker adaptation for HMM-based speech
                   synthesis.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice},
  key = {analysis-hts-adaptation-junichi},
  year = 2008
}
@inproceedings{anderssoncabral09,
  author = {J. Sebastian Andersson and Joao P. Cabral and Leonardo
                   Badino and Junichi Yamagishi and Robert A.J. Clark},
  title = {Glottal Source and Prosodic Prominence Modelling in
                   {HMM}-based Speech Synthesis for the {B}lizzard
                   {C}hallenge 2009},
  booktitle = {The Blizzard Challenge 2009},
  address = {Edinburgh, U.K.},
  abstract = {This paper describes the CSTR entry for the Blizzard
                   Challenge 2009. The work focused on modifying two parts
                   of the Nitech 2005 HTS speech synthesis system to
                   improve naturalness and contextual appropriateness. The
                   first part incorporated an implementation of the
                    Liljencrants-Fant (LF) glottal source model. The second
                   part focused on improving synthesis of prosodic
                   prominence including emphasis through context dependent
                   phonemes. Emphasis was assigned to the synthesised test
                    sentences based on a handful of theory-based rules. The
                   two parts (LF-model and prosodic prominence) were not
                   combined and hence evaluated separately. The results on
                   naturalness for the LF-model showed that it is not yet
                   perceived as natural as the Benchmark HTS system for
                   neutral speech. The results for the prosodic prominence
                   modelling showed that it was perceived as contextually
                   appropriate as the Benchmark HTS system, despite a low
                    naturalness score. The Blizzard Challenge evaluation
                   has provided valuable information on the status of our
                   work and continued work will begin with analysing why
                   our modifications resulted in reduced naturalness
                   compared to the Benchmark HTS system.},
  categories = {HMM, HTS, speech synthesis, LF-model, glottal source,
                   prosodic prominence, emphasis},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
  year = 2009
}
@article{treeboosting-junichi,
  author = {Junichi Yamagishi and Hisashi Kawai and Takao
                   Kobayashi},
  title = {Phone Duration Modeling Using Gradient Tree Boosting},
  journal = {Speech Communication},
  volume = 50,
  number = 5,
  pages = {405--415},
  abstract = { In text-to-speech synthesis systems, phone duration
                   influences the quality and naturalness of synthetic
                   speech. In this study, we incorporate an ensemble
                   learning technique called gradient tree boosting into
                   phone duration modeling as an alternative to the
                   conventional approach using regression trees, and
                   objectively evaluate the prediction accuracy of
                   Japanese, Mandarin, and English phone duration. The
                    gradient tree boosting algorithm is a meta-algorithm of
                    regression trees: it iteratively builds regression
                    trees from the residuals and outputs a weighted sum of
                    the regression trees. Our evaluation results show that
                   compared to the regression trees or other techniques
                   related to the regression trees, the gradient tree
                   boosting algorithm can substantially and robustly
                    improve the predictive accuracy of phone duration
                    regardless of language, speaker, or domain.},
  categories = {Text-to-speech synthesis, Phone duration modeling,
                    Gradient tree boosting},
  doi = {10.1016/j.specom.2007.12.003},
  key = {treeboosting-junichi},
  month = may,
  year = 2008
}
@inproceedings{ling:richmond:yamagishi:wang:2008a,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi and Wang, Ren-Hua },
  title = {Articulatory Control of {HMM}-based Parametric Speech
                   Synthesis Driven by Phonetic Knowledge},
  booktitle = {Proc. Interspeech},
  pages = {573--576},
  address = {Brisbane, Australia},
  abstract = {This paper presents a method to control the
                   characteristics of synthetic speech flexibly by
                   integrating articulatory features into a Hidden Markov
                   Model (HMM)-based parametric speech synthesis system.
                   In contrast to model adaptation and interpolation
                   approaches for speaking style control, this method is
                   driven by phonetic knowledge, and target speech samples
                   are not required. The joint distribution of parallel
                   acoustic and articulatory features considering
                   cross-stream feature dependency is estimated. At
                   synthesis time, acoustic and articulatory features are
                   generated simultaneously based on the
                   maximum-likelihood criterion. The synthetic speech can
                   be controlled flexibly by modifying the generated
                   articulatory features according to arbitrary phonetic
                   rules in the parameter generation process. Our
                   experiments show that the proposed method is effective
                   in both changing the overall character of synthesized
                   speech and in controlling the quality of a specific
                   vowel.},
  categories = {speech synthesis, HMM, articulatory features, phonetic
                   knowledge},
  key = {ling:richmond:yamagishi:wang:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080582.PDF},
  year = 2008
}
@article{Andersson2012175,
  author = {Sebastian Andersson and Junichi Yamagishi and Robert
                   A.J. Clark},
  title = {Synthesis and evaluation of conversational
                   characteristics in {HMM}-based speech synthesis},
  journal = {Speech Communication},
  volume = {54},
  number = {2},
  pages = {175--188},
  abstract = {Spontaneous conversational speech has many
                   characteristics that are currently not modelled well by
                   HMM-based speech synthesis and in order to build
                   synthetic voices that can give an impression of someone
                   partaking in a conversation, we need to utilise data
                   that exhibits more of the speech phenomena associated
                   with conversations than the more generally used
                   carefully read aloud sentences. In this paper we show
                   that synthetic voices built with HMM-based speech
                    synthesis techniques from conversational speech data
                   preserved segmental and prosodic characteristics of
                   frequent conversational speech phenomena. An analysis
                   of an evaluation investigating the perception of
                   quality and speaking style of HMM-based voices confirms
                    that speech with conversational characteristics is
                   instrumental for listeners to perceive successful
                   integration of conversational speech phenomena in
                   synthetic speech. The achieved synthetic speech quality
                   provides an encouraging start for the continued use of
                   conversational speech in HMM-based speech synthesis.},
  doi = {10.1016/j.specom.2011.08.001},
  issn = {0167-6393},
  keywords = {Speech synthesis, HMM, Conversation, Spontaneous
                   speech, Filled pauses, Discourse marker},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639311001178},
  year = 2012
}
@inproceedings{cereproc-hts,
  author = {Matthew P. Aylett and Junichi Yamagishi},
  title = {Combining Statistical Parametric Speech Synthesis and
                   Unit-Selection for Automatic Voice Cloning},
  booktitle = {Proc. LangTech 2008},
  address = {Brisbane, Australia},
  abstract = {The ability to use the recorded audio of a subject's
                   voice to produce an open-domain synthesis system has
                   generated much interest both in academic research and
                   in commercial speech technology. The ability to produce
                    synthetic versions of a subject's voice has potential
                   commercial applications, such as virtual celebrity
                   actors, or potential clinical applications, such as
                   offering a synthetic replacement voice in the case of a
                   laryngectomy. Recent developments in HMM-based speech
                   synthesis have shown it is possible to produce
                   synthetic voices from quite small amounts of speech
                   data. However, mimicking the depth and variation of a
                    speaker's prosody as well as synthesising natural
                   voice quality is still a challenging research problem.
                   In contrast, unit-selection systems have shown it is
                   possible to strongly retain the character of the voice
                   but only with sufficient original source material.
                   Often this runs into hours and may require significant
                    manual checking and labelling. In this paper we will
                    present two state-of-the-art systems: an HMM-based
                    system, HTS-2007, developed by CSTR and the Nagoya
                    Institute of Technology, and a commercial
                    unit-selection system, CereVoice, developed by
                    CereProc. Both systems have
                   been used to mimic the voice of George W. Bush (43rd
                   president of the United States) using freely available
                   audio from the web. In addition we will present a
                   hybrid system which combines both technologies. We
                   demonstrate examples of synthetic voices created from
                   10, 40 and 210 minutes of randomly selected speech. We
                   will then discuss the underlying problems associated
                   with voice cloning using found audio, and the
                   scalability of our solution.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice},
  key = {cereproc-hts},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/03_AYLETT.pdf},
  year = 2008
}
@inproceedings{lingIS2012,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {Vowel Creation by Articulatory Control in {HMM}-based
                   Parametric Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = { This paper presents a method to produce a new vowel
                   by articulatory control in hidden Markov model (HMM)
                   based parametric speech synthesis. A multiple
                   regression HMM (MRHMM) is adopted to model the
                   distribution of acoustic features, with articulatory
                   features used as external auxiliary variables. The
                   dependency between acoustic and articulatory features
                   is modelled by a group of linear transforms that are
                   either estimated context-dependently or determined by
                   the distribution of articulatory features. Vowel
                   identity is removed from the set of context features
                   used to ensure compatibility between the
                   context-dependent model parameters and the articulatory
                   features of a new vowel. At synthesis time, acoustic
                   features are predicted according to the input
                   articulatory features as well as context information.
                   With an appropriate articulatory feature sequence, a
                   new vowel can be generated even when it does not exist
                   in the training set. Experimental results show this
                   method is effective in creating the English vowel /2/
                   by articulatory control without using any acoustic
                   samples of this vowel.},
  categories = {Speech synthesis, articulatory features,
                   multiple-regression hidden Markov model},
  keywords = {Speech synthesis, articulatory features,
                   multiple-regression hidden Markov model},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/LingRichmondYamagishi_IS2012.pdf},
  year = 2012
}
@inproceedings{jyamagis07:avss2006,
  author = {Junichi Yamagishi and Takao Kobayashi and Steve Renals
                   and Simon King and Heiga Zen and Tomoki Toda and
                   Keiichi Tokuda },
  title = {Improved Average-Voice-based Speech Synthesis Using
                   Gender-Mixed Modeling and a Parameter Generation
                   Algorithm Considering {GV}},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  abstract = {For constructing a speech synthesis system which can
                   achieve diverse voices, we have been developing a
                    speaker-independent approach to HMM-based speech
                   synthesis in which statistical average voice models are
                   adapted to a target speaker using a small amount of
                   speech data. In this paper, we incorporate a
                   high-quality speech vocoding method STRAIGHT and a
                   parameter generation algorithm with global variance
                   into the system for improving quality of synthetic
                   speech. Furthermore, we introduce a feature-space
                    speaker adaptive training algorithm and a gender-mixed
                   modeling technique for conducting further normalization
                   of the average voice model. We build an English
                   text-to-speech system using these techniques and show
                   the performance of the system.},
  categories = {HMM, speech synthesis, speaker adaptation, HTS},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6-yamagishi.pdf},
  year = 2007
}
@article{6289354,
  author = {Ling, Z. and Richmond, K. and Yamagishi, J.},
  title = {Articulatory Control of {HMM}-based Parametric Speech
                   Synthesis using Feature-Space-Switched Multiple
                   Regression},
  journal = {IEEE Transactions on Audio, Speech, and Language
                    Processing},
  volume = {21},
  number = {1},
  pages = {207--219},
  abstract = {In previous work we proposed a method to control the
                   characteristics of synthetic speech flexibly by
                   integrating articulatory features into a hidden Markov
                   model (HMM) based parametric speech synthesiser. In
                   this method, a unified acoustic-articulatory model is
                   trained, and context-dependent linear transforms are
                   used to model the dependency between the two feature
                   streams. In this paper, we go significantly further and
                   propose a feature-space-switched multiple regression
                   HMM to improve the performance of articulatory control.
                   A multiple regression HMM (MRHMM) is adopted to model
                   the distribution of acoustic features, with
                   articulatory features used as exogenous explanatory
                   variables. A separate Gaussian mixture model (GMM) is
                   introduced to model the articulatory space, and
                   articulatory-to-acoustic regression matrices are
                   trained for each component of this GMM, instead of for
                   the context-dependent states in the HMM. Furthermore,
                   we propose a task-specific context feature tailoring
                   method to ensure compatibility between state context
                   features and articulatory features that are manipulated
                   at synthesis time. The proposed method is evaluated on
                   two tasks, using a speech database with acoustic
                   waveforms and articulatory movements recorded in
                   parallel by electromagnetic articulography (EMA). In a
                   vowel identity modification task, the new method
                   achieves better performance when reconstructing target
                   vowels by varying articulatory inputs than our previous
                   approach. A second vowel creation task shows our new
                   method is highly effective at producing a new vowel
                   from appropriate articulatory representations which,
                   even though no acoustic samples for this vowel are
                   present in the training data, is shown to sound highly
                   natural.},
  doi = {10.1109/TASL.2012.2215600},
  issn = {1558-7916},
  year = 2013
}
@inproceedings{zen:HTSoverview,
  author = {Heiga Zen and Keiichiro Oura and Takashi Nose and
                   Junichi Yamagishi and Shinji Sako and Tomoki Toda and
                   Takashi Masuko and Alan W. Black and Keiichi Tokuda},
  title = {Recent development of the {HMM}-based speech synthesis
                   system ({HTS})},
  booktitle = {Proc. 2009 Asia-Pacific Signal and Information
                   Processing Association (APSIPA)},
  address = {Sapporo, Japan},
  abstract = {A statistical parametric approach to speech synthesis
                   based on hidden Markov models (HMMs) has grown in
                   popularity over the last few years. In this approach,
                   spectrum, excitation, and duration of speech are
                   simultaneously modeled by context-dependent HMMs, and
                    speech waveforms are generated from the HMMs themselves.
                   Since December 2002, we have publicly released an
                    open-source software toolkit named ``HMM-based speech
                    synthesis system (HTS)'' to provide a research and
                   development toolkit for statistical parametric speech
                   synthesis. This paper describes recent developments of
                   HTS in detail, as well as future release plans.},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/zen_APSIPA2009.pdf},
  year = 2009
}
@inproceedings{hirai07:5ms2007,
  author = {Toshio Hirai and Junichi Yamagishi and Seiichi Tenpaku
                   },
  title = {Utilization of an {HMM}-Based Feature Generation
                   Module in 5 ms Segment Concatenative Speech Synthesis},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  abstract = {If a concatenative speech synthesis system uses shorter
                    speech segments, its potential to generate natural
                    speech increases because the concatenation
                    variation becomes greater. Recently, a synthesis
                   approach was proposed in which very short (5 ms)
                    segments are used. In this paper, the implementation of
                    an HMM-based feature generation module in a very
                    short segment concatenative synthesis system, which has
                    the advantage of modularity, is described, together
                    with a synthesis experiment.},
  categories = {speech synthesis, HTS, hybrid algorithm},
  month = aug,
  year = 2007
}
@inproceedings{CassiaICASSP12,
  author = {Valentini-Botinhao, C. and Maia, R. and Yamagishi, J.
                   and King, S. and Zen, H.},
  title = {{Cepstral analysis based on the Glimpse proportion
                   measure for improving the intelligibility of
                   {HMM}-based synthetic speech in noise}},
  booktitle = {Proc. ICASSP},
  pages = {3997--4000},
  address = {Kyoto, Japan},
  abstract = {In this paper we introduce a new cepstral coefficient
                   extraction method based on an intelligibility measure
                   for speech in noise, the Glimpse Proportion measure.
                   This new method aims to increase the intelligibility of
                   speech in noise by modifying the clean speech, and has
                   applications in scenarios such as public announcement
                   and car navigation systems. We first explain how the
                   Glimpse Proportion measure operates and further show
                   how we approximated it to integrate it into an existing
                   spectral envelope parameter extraction method commonly
                   used in the HMM-based speech synthesis framework. We
                   then demonstrate how this new method changes the
                   modelled spectrum according to the characteristics of
                   the noise and show results for a listening test with
                   vocoded and HMM-based synthetic speech. The test
                   indicates that the proposed method can significantly
                   improve intelligibility of synthetic speech in speech
                   shaped noise.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement, speech analysis},
  doi = {10.1109/ICASSP.2012.6288794},
  month = mar,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_ICASSP12.pdf},
  year = 2012
}
@inproceedings{ling_interspeech2010,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {{HMM}-based Text-to-Articulatory-Movement Prediction
                   and Analysis of Critical Articulators},
  booktitle = {Proc. Interspeech},
  pages = {2194--2197},
  address = {Makuhari, Japan},
  abstract = {In this paper we present a method to predict the
                   movement of a speaker's mouth from text input using
                   hidden Markov models (HMM). We have used a corpus of
                   human articulatory movements, recorded by
                   electromagnetic articulography (EMA), to train HMMs. To
                   predict articulatory movements from text, a suitable
                   model sequence is selected and the maximum-likelihood
                   parameter generation (MLPG) algorithm is used to
                   generate output articulatory trajectories. In our
                   experiments, we find that fully context-dependent
                   models outperform monophone and quinphone models,
                   achieving an average root mean square (RMS) error of
                    1.945 mm when state durations are predicted from text,
                    and 0.872 mm when natural state durations are used.
                   Finally, we go on to analyze the prediction error for
                   different EMA dimensions and phone types. We find a
                   clear pattern emerges that the movements of so-called
                   critical articulators can be predicted more accurately
                   than the average performance.},
  keywords = {Hidden Markov model, articulatory features, parameter
                   generation, critical articulators},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100349.pdf},
  year = 2010
}
@inproceedings{PhillipIS2012,
  author = {Phillip L. De Leon and Bryan Stewart and Junichi
                   Yamagishi},
  title = {Synthetic Speech Discrimination using Pitch Pattern
                   Statistics Derived from Image Analysis},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = { In this paper, we extend the work by Ogihara et al.
                   to discriminate between human and synthetic speech
                   using features based on pitch patterns. As previously
                   demonstrated, significant differences in pitch patterns
                   between human and synthetic speech can be leveraged to
                   classify speech as being human or synthetic in origin.
                   We propose using mean pitch stability, mean pitch
                   stability range, and jitter as features extracted after
                   image analysis of pitch patterns. We have observed that
                   for synthetic speech, these features lie in a small and
                   distinct space as compared to human speech and have
                   modeled them with a multivariate Gaussian distribution.
                   Our classifier is trained using synthetic speech
                   collected from the 2008 and 2011 Blizzard Challenge
                   along with Festival pre-built voices and human speech
                   from the NIST2002 corpus. We evaluate the classifier on
                   a much larger corpus than previously studied using
                   human speech from the Switchboard corpus, synthetic
                   speech from the Resource Management corpus, and
                   synthetic speech generated from Festival trained on the
                   Wall Street Journal corpus. Results show 98% accuracy
                   in correctly classifying human speech and 96% accuracy
                   in correctly classifying synthetic speech.},
  month = sep,
  year = 2012
}
@article{roberto:specom2010,
  author = {R. Barra-Chicote and J. Yamagishi and S. King and J.
                    Manuel Montero and J. Macias-Guarasa},
  title = {Analysis of Statistical Parametric and Unit-Selection
                   Speech Synthesis Systems Applied to Emotional Speech},
  journal = {Speech Communication},
  volume = {52},
  number = {5},
  pages = {394--404},
  abstract = {We have applied two state-of-the-art speech synthesis
                   techniques (unit selection and HMM-based synthesis) to
                   the synthesis of emotional speech. A series of
                   carefully designed perceptual tests to evaluate speech
                   quality, emotion identification rates and emotional
                   strength were used for the six emotions which we
                   recorded -- happiness, sadness, anger, surprise, fear,
                   disgust. For the HMM-based method, we evaluated
                   spectral and source components separately and
                   identified which components contribute to which
                   emotion. Our analysis shows that, although the HMM
                   method produces significantly better neutral speech,
                   the two methods produce emotional speech of similar
                   quality, except for emotions having context-dependent
                   prosodic patterns. Whilst synthetic speech produced
                   using the unit selection method has better emotional
                   strength scores than the HMM-based method, the
                   HMM-based method has the ability to manipulate the
                   emotional strength. For emotions that are characterized
                   by both spectral and prosodic components, synthetic
                   speech using unit selection methods was more accurately
                   identified by listeners. For emotions mainly
                   characterized by prosodic components, HMM-based
                   synthetic speech was more accurately identified. This
                   finding differs from previous results regarding
                   listener judgements of speaker similarity for neutral
                   speech. We conclude that unit selection methods require
                   improvements to prosodic modeling and that HMM-based
                   methods require improvements to spectral modeling for
                   emotional speech. Certain emotions cannot be reproduced
                   well by either method.},
  doi = {10.1016/j.specom.2009.12.007},
  keywords = {Emotional speech synthesis; HMM-based synthesis; Unit
                   selection},
  month = may,
  year = 2010
}
@inproceedings{5947571,
  author = {Andraszewicz, S. and Yamagishi, J. and King, S.},
  title = {Vocal attractiveness of statistical speech
                   synthesisers},
  booktitle = {Proc. 2011 IEEE International Conference on Acoustics,
                    Speech and Signal Processing (ICASSP)},
  pages = {5368--5371},
  abstract = {Our previous analysis of speaker-adaptive HMM-based
                   speech synthesis methods suggested that there are two
                   possible reasons why average voices can obtain higher
                   subjective scores than any individual adapted voice: 1)
                   model adaptation degrades speech quality proportionally
                   to the distance 'moved' by the transforms, and 2)
                   psychoacoustic effects relating to the attractiveness
                   of the voice. This paper is a follow-on from that
                   analysis and aims to separate these effects out. Our
                   latest perceptual experiments focus on attractiveness,
                   using average voices and speaker-dependent voices
                    without model transformation, and show that using
                   several speakers to create a voice improves smoothness
                   (measured by Harmonics-to-Noise Ratio), reduces
                    distance from the average voice in the log F0-F1
                   space of the final voice and hence makes it more
                   attractive at the segmental level. However, this is
                   weakened or overridden at supra-segmental or sentence
                   levels.},
  doi = {10.1109/ICASSP.2011.5947571},
  issn = {1520-6149},
  keywords = {speaker-adaptive HMM-based speech synthesis
                   methods;speaker-dependent voices;statistical speech
                   synthesisers;vocal attractiveness;hidden Markov
                   models;speaker recognition;speech synthesis;},
  month = may,
  year = 2011
}
@inproceedings{Jaime2IS2012,
  author = {J. Lorenzo and B. Martinez and R. Barra-Chicote and V.
                    Lopez-Ludena and J. Ferreiros and J. Yamagishi and
                   J.M. Montero},
  title = {Towards an Unsupervised Speaking Style Voice Building
                    Framework: Multi-Style Speaker Diarization},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = { Current text-to-speech systems are developed
                    using studio-recorded speech in a neutral style or
                    based on acted emotions. However, the proliferation of
                    media sharing sites would allow developing a new
                    generation of speech-based systems which could cope
                    with spontaneous and styled speech. This paper
                   proposes an architecture to deal with realistic
                   recordings and carries out some experiments on
                   unsupervised speaker diarization. In order to maximize
                   the speaker purity of the clusters while keeping a high
                    speaker coverage, the paper evaluates the F-measure
                   of a diarization module, achieving high scores (>85%)
                   especially when the clusters are longer than 30
                   seconds, even for the more spontaneous and expressive
                   styles (such as talk shows or sports).},
  month = sep,
  year = 2012
}
@incollection{sarah:hts09,
  author = {Sarah Creer and Phil Green and Stuart Cunningham and
                   Junichi Yamagishi},
  title = {Building personalised synthesised voices for
                   individuals with dysarthria using the {HTS} toolkit},
  booktitle = {Computer Synthesized Speech Technologies: Tools for
                   Aiding Impairment},
  publisher = {IGI Global},
  editor = {John W. Mullennix and Steven E. Stern},
  edition = {1st},
  note = {in press},
  abstract = {When the speech of an individual becomes
                   unintelligible due to a neurological disorder, a
                   synthesized voice can replace that of the individual.
                   To fully replace all functions of human speech
                   communication: communication of information,
                   maintenance of social relationships and displaying
                   identity, the voice must be intelligible,
                   natural-sounding and retain the vocal identity of the
                   speaker. For speakers with dysarthria, achieving this
                   output with minimal data recordings and deteriorating
                    speech is difficult. An alternative to this is to use
                    hidden Markov models (HMMs), which require much less
                    speech data than concatenative methods, to
                   adapt a robust statistical model of speech towards the
                   speaker characteristics captured in the data recorded
                   by the individual. This chapter implements this
                   technique using the HTS toolkit to build personalized
                   synthetic voices for two individuals with dysarthria.
                   An evaluation of the voices by the participants
                   themselves suggests that this technique shows promise
                   for building and reconstructing personalized voices for
                   individuals with dysarthria once deterioration has
                   begun.},
  year = 2009
}
@article{tuomo:ieee2011,
  author = {T. Raitio and A. Suni and J. Yamagishi and H. Pulakka
                   and J. Nurminen and M. Vainio and P. Alku},
  title = {{HMM}-Based Speech Synthesis Utilizing Glottal Inverse
                   Filtering},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = 19,
  number = 1,
  pages = {153--165},
  abstract = {This paper describes a hidden Markov model
                   (HMM)-based speech synthesizer that utilizes glottal
                   inverse filtering for generating natural sounding
                   synthetic speech. In the proposed method, speech is
                   first decomposed into the glottal source signal and the
                   model of the vocal tract filter through glottal inverse
                   filtering, and thus parametrized into excitation and
                   spectral features. The source and filter features are
                   modeled individually in the framework of HMM and
                   generated in the synthesis stage according to the text
                   input. The glottal excitation is synthesized through
                   interpolating and concatenating natural glottal flow
                   pulses, and the excitation signal is further modified
                   according to the spectrum of the desired voice source
                   characteristics. Speech is synthesized by filtering the
                   reconstructed source signal with the vocal tract
                   filter. Experiments show that the proposed system is
                   capable of generating natural sounding speech, and the
                   quality is clearly better compared to two HMM-based
                   speech synthesis systems based on widely used vocoder
                   techniques.},
  doi = {10.1109/TASL.2010.2045239},
  keywords = {Glottal inverse filtering, hidden Markov model (HMM),
                    speech synthesis},
  month = jan,
  year = 2011
}
@inproceedings{Ayletetal09,
  author = {Matthew P. Aylett and Simon King and Junichi Yamagishi},
  title = {Speech Synthesis Without a Phone Inventory},
  booktitle = {Proc. Interspeech},
  pages = {2087--2090},
  abstract = { In speech synthesis the unit inventory is decided
                   using phonological and phonetic expertise. This process
                   is resource intensive and potentially sub-optimal. In
                   this paper we investigate how acoustic clustering,
                   together with lexicon constraints, can be used to build
                   a self-organised inventory. Six English speech
                   synthesis systems were built using two frameworks, unit
                   selection and parametric HTS for three inventory
                   conditions: 1) a traditional phone set, 2) a system
                   using orthographic units, and 3) a self-organised
                   inventory. A listening test showed a strong preference
                   for the classic system, and for the orthographic system
                   over the self-organised system. Results also varied by
                    letter-to-sound complexity and database coverage. This
                   suggests the self-organised approach failed to
                   generalise pronunciation as well as introducing noise
                   above and beyond that caused by orthographic sound
                   mismatch.},
  categories = {speech synthesis, unit selection, parametric
                   synthesis, phone inventory, orthographic synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/ma_interspeech09.pdf},
  address = {Brighton, U.K.},
  year = 2009
}
@inproceedings{phillip:icassp2010,
  author = {P. L. De Leon and V. R. Apsingekar and M. Pucher and
                   J. Yamagishi},
  title = {Revisiting the security of speaker verification
                   systems against imposture using synthetic speech},
  booktitle = {{Proc. ICASSP 2010}},
  address = {Dallas, Texas, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/main_r2.pdf},
  year = 2010
}
@inproceedings{lingvowel,
  author = {Ling, Zhenhua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {Vowel Creation by Articulatory Control in {HMM}-based
                   Parametric Speech Synthesis},
  booktitle = {Proc. The Listening Talker Workshop},
  pages = {72},
  address = {Edinburgh, UK},
  month = may,
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/Ling_etal_LISTA.pdf},
  year = 2012
}
@article{JunichiHTS06,
  author = {Junichi Yamagishi and Takao Kobayashi},
  title = {Average-Voice-based Speech Synthesis using {HSMM}-based
                    Speaker Adaptation and Adaptive Training},
  journal = {IEICE Trans. Information and Systems},
  volume = {E90-D},
  number = 2,
  pages = {533--543},
  abstract = {In speaker adaptation for speech synthesis, it is
                   desirable to convert both voice characteristics and
                   prosodic features such as F0 and phone duration. For
                   simultaneous adaptation of spectrum, F0 and phone
                   duration within the HMM framework, we need to transform
                   not only the state output distributions corresponding
                   to spectrum and F0 but also the duration distributions
                   corresponding to phone duration. However, it is not
                   straightforward to adapt the state duration because the
                   original HMM does not have explicit duration
                   distributions. Therefore, we utilize the framework of
                   the hidden semi-Markov model (HSMM), which is an HMM
                   having explicit state duration distributions, and we
                   apply an HSMM-based model adaptation algorithm to
                   simultaneously transform both the state output and
                   state duration distributions. Furthermore, we propose
                   an HSMM-based adaptive training algorithm to
                   simultaneously normalize the state output and state
                   duration distributions of the average voice model. We
                   incorporate these techniques into our HSMM-based speech
                   synthesis system, and show their effectiveness from the
                   results of subjective and objective evaluation tests.},
  month = feb,
  year = 2007
}
@inproceedings{watts_yamagishi_king_2011,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {Unsupervised continuous-valued word features for
                   phrase-break prediction without a part-of-speech tagger},
  booktitle = {Proc. Interspeech},
  pages = {2157--2160},
  address = {Florence, Italy},
  abstract = {Part of speech (POS) tags are foremost among the
                   features conventionally used to predict intonational
                   phrase-breaks for text to speech (TTS) conversion. The
                   construction of such systems therefore presupposes the
                   availability of a POS tagger for the relevant language,
                   or of a corpus manually tagged with POS. However, such
                   tools and resources are not available in the majority
                   of the world’s languages, and manually labelling text
                   with POS tags is an expensive and time-consuming
                   process. We therefore propose the use of
                   continuous-valued features that summarise the
                   distributional characteristics of word types as
                   surrogates for POS features. Importantly, such features
                   are obtained in an unsupervised manner from an untagged
                   text corpus. We present results on the phrase-break
                   prediction task, where use of the features closes the
                   gap in performance between a baseline system (using
                   only basic punctuation-related features) and a topline
                   system (incorporating a state-of-the-art POS tagger).},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
  year = 2011
}
@inproceedings{Cassia_IS11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                   King, Simon},
  title = {Can Objective Measures Predict the Intelligibility of
                   Modified {HMM}-based Synthetic Speech in Noise?},
  booktitle = {Proc. Interspeech},
  abstract = {{Synthetic speech can be modified to improve
                   intelligibility in noise. In order to perform
                   modifications automatically, it would be useful to have
                   an objective measure that could predict the
                   intelligibility of modified synthetic speech for human
                   listeners. We analysed the impact on intelligibility
                   – and on how well objective measures predict it –
                   when we separately modify speaking rate, fundamental
                   frequency, line spectral pairs and spectral peaks.
                   Shifting LSPs can increase intelligibility for human
                   listeners; other modifications had weaker effects.
                   Among the objective measures we evaluated, the Dau
                   model and the Glimpse proportion were the best
                   predictors of human performance.}},
  categories = {HMM-based speech synthesis, objective measures of
                   intelligibility},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_IS11.pdf},
  year = 2011
}
@inproceedings{robust-hts,
  author = {Junichi Yamagishi and Zhenhua Ling and Simon King},
  title = {Robustness of HMM-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2008},
  pages = {581--584},
  address = {Brisbane, Australia},
  abstract = {As speech synthesis techniques become more advanced,
                   we are able to consider building high-quality voices
                   from data collected outside the usual highly-controlled
                   recording studio environment. This presents new
                   challenges that are not present in conventional
                   text-to-speech synthesis: the available speech data are
                   not perfectly clean, the recording conditions are not
                   consistent, and/or the phonetic balance of the material
                   is not ideal. Although a clear picture of the
                   performance of various speech synthesis techniques
                   (e.g., concatenative, HMM-based or hybrid) under good
                   conditions is provided by the Blizzard Challenge, it is
                   not well understood how robust these algorithms are to
                   less favourable conditions. In this paper, we analyse
                   the performance of several speech synthesis methods
                   under such conditions. This is, as far as we know, a
                   new research topic: ``Robust speech synthesis.'' As a
                   consequence of our investigations, we propose a new
                    robust training method for HMM-based speech
                    synthesis for use with speech data collected in
                    unfavourable conditions.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice,
                   unit selection},
  key = {robust-hts},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/RobustnessHTS.pdf},
  year = 2008
}
@inproceedings{5947440,
  author = {De Leon, P.L. and Hernaez, I. and Saratxaga, I. and
                   Pucher, M. and Yamagishi, J.},
  title = {Detection of synthetic speech for the problem of
                   imposture},
  booktitle = {Proc. 2011 IEEE International Conference on Acoustics,
                    Speech and Signal Processing (ICASSP)},
  pages = {4844--4847},
  abstract = {In this paper, we present new results from our
                   research into the vulnerability of a speaker
                    verification (SV) system to synthetic speech. We use an
                    HMM-based speech synthesizer, which creates synthetic
                   speech for a targeted speaker through adaptation of a
                   background model and both GMM-UBM and support vector
                   machine (SVM) SV systems. Using 283 speakers from the
                    Wall Street Journal (WSJ) corpus, our SV systems have a
                   0.35% EER. When the systems are tested with synthetic
                   speech generated from speaker models derived from the
                    WSJ corpus, over 91% of the matched claims are
                   accepted. We propose the use of relative phase shift
                   (RPS) in order to detect synthetic speech and develop a
                   GMM-based synthetic speech classifier (SSC). Using the
                   SSC, we are able to correctly classify human speech in
                   95% of tests and synthetic speech in 88% of tests thus
                   significantly reducing the vulnerability.},
  doi = {10.1109/ICASSP.2011.5947440},
  issn = {1520-6149},
  keywords = {EER;GMM-UBM;GMM-based synthetic speech
                   classifier;HMM-based speech synthesizer;RPS;SSC;SV
                   system;WSJ corpus;Wall-Street Journal corpus;relative
                   phase shift;speaker verification system;support vector
                   machine;hidden Markov models;speaker recognition;speech
                   synthesis;support vector machines;},
  month = may,
  year = 2011
}
@article{nose07:mrhsmm,
  author = {Takashi Nose and Junichi Yamagishi and Takao Kobayashi},
  title = {A Style Control Technique for {HMM}-based Expressive
                   Speech Synthesis},
  journal = {IEICE Trans. Information and Systems},
  volume = {E90-D},
  number = 9,
  pages = {1406--1413},
  abstract = {This paper describes a technique for controlling the
                   degree of expressivity of a desired emotional
                   expression and/or speaking style of synthesized speech
                   in an HMM-based speech synthesis framework. With this
                   technique, multiple emotional expressions and speaking
                   styles of speech are modeled in a single model by using
                   a multiple-regression hidden semi-Markov model
                   (MRHSMM). A set of control parameters, called the style
                   vector, is defined, and each speech synthesis unit is
                   modeled by using the MRHSMM, in which mean parameters
                   of the state output and duration distributions are
                   expressed by multiple-regression of the style vector.
                   In the synthesis stage, the mean parameters of the
                   synthesis units are modified by transforming an
                   arbitrarily given style vector that corresponds to a
                   point in a low-dimensional space, called style space,
                   each of whose coordinates represents a certain specific
                   speaking style or emotion of speech. The results of
                   subjective evaluation tests show that style and its
                   intensity can be controlled by changing the style
                    vector.},
  categories = {HMM-based speech synthesis, speaking style, emotional
                   expression, style interpolation, hidden semi-Markov
                   model (HSMM)},
  month = sep,
  url = {http://search.ieice.org/bin/summary.php?id=e90-d_9_1406&category=D&lang=E&year=2007&abst=},
  year = 2007
}
@article{Ximera06,
  author = {Hisashi Kawai and Tomoki Toda and Junichi Yamagishi
                   and Toshio Hirai and Jinfu Ni and Nobuyuki Nishizawa
                   and Minoru Tsuzaki and Keiichi Tokuda},
  title = {XIMERA: a concatenative speech synthesis system with
                   large scale corpora},
  journal = {IEICE Trans. Information and Systems},
  volume = {J89-D-II},
  number = 12,
  pages = {2688--2698},
  month = dec,
  year = 2006
}
@article{ling2008,
  author = {Ling, Z. and Richmond, K. and Yamagishi, J. and Wang,
                   R.},
  title = {Integrating Articulatory Features into {HMM}-based
                   Parametric Speech Synthesis},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing },
  volume = 17,
  number = 6,
  pages = {1171--1185},
  note = {\textbf{IEEE SPS 2010 Young Author Best Paper Award}},
  abstract = {This paper presents an investigation of ways to
                   integrate articulatory features into Hidden Markov
                   Model (HMM)-based parametric speech synthesis,
                   primarily with the aim of improving the performance of
                   acoustic parameter generation. The joint distribution
                   of acoustic and articulatory features is estimated
                   during training and is then used for parameter
                   generation at synthesis time in conjunction with a
                   maximum-likelihood criterion. Different model
                   structures are explored to allow the articulatory
                   features to influence acoustic modeling: model
                   clustering, state synchrony and cross-stream feature
                   dependency. The results of objective evaluation show
                   that the accuracy of acoustic parameter prediction can
                   be improved when shared clustering and
                   asynchronous-state model structures are adopted for
                   combined acoustic and articulatory features. More
                   significantly, our experiments demonstrate that
                   modeling the dependency between these two feature
                   streams can make speech synthesis more flexible. The
                   characteristics of synthetic speech can be easily
                   controlled by modifying generated articulatory features
                   as part of the process of acoustic parameter
                   generation.},
  categories = {Speech synthesis, articulation, HMM-based synthesis},
  doi = {10.1109/TASL.2009.2014796},
  key = {ling2008},
  month = aug,
  year = 2009
}
@inproceedings{child_synthesis_2009,
  author = {Oliver Watts and Junichi Yamagishi and Simon King and
                   Kay Berkling},
  title = {{HMM} Adaptation and Voice Conversion for the
                   Synthesis of Child Speech: A Comparison},
  booktitle = {Proc. Interspeech 2009},
  pages = {2627--2630},
  address = {Brighton, U.K.},
  abstract = {This study compares two different methodologies for
                   producing data-driven synthesis of child speech from
                   existing systems that have been trained on the speech
                   of adults. On one hand, an existing statistical
                   parametric synthesiser is transformed using model
                   adaptation techniques, informed by linguistic and
                   prosodic knowledge, to the speaker characteristics of a
                   child speaker. This is compared with the application of
                   voice conversion techniques to convert the output of an
                   existing waveform concatenation synthesiser with no
                   explicit linguistic or prosodic knowledge. In a
                   subjective evaluation of the similarity of synthetic
                   speech to natural speech from the target speaker, the
                   HMM-based systems evaluated are generally preferred,
                   although this is at least in part due to the higher
                   dimensional acoustic features supported by these
                   techniques.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/child_synthesis_2009.pdf},
  year = 2009
}
@article{anderssonyamagishi12,
  author = {Andersson, S. and Yamagishi, J. and Clark, R.A.J.},
  title = {Synthesis and Evaluation of Conversational
                   Characteristics in {HMM}-Based Speech Synthesis},
  journal = {Speech Communication},
  volume = 54,
  number = 2,
  pages = {175-188},
  abstract = {Spontaneous conversational speech has many
                   characteristics that are currently not modelled well by
                   HMM-based speech synthesis and in order to build
                   synthetic voices that can give an impression of someone
                   partaking in a conversation, we need to utilise data
                   that exhibits more of the speech phenomena associated
                   with conversations than the more generally used
                   carefully read aloud sentences. In this paper we show
                   that synthetic voices built with HMM-based speech
                   synthesis techniques from conversational speech data
                   preserved segmental and prosodic characteristics of
                   frequent conversational speech phenomena. An analysis
                   of an evaluation investigating the perception of
                   quality and speaking style of HMM-based voices confirms
                   that speech with conversational characteristics is
                   instrumental for listeners to perceive successful
                   integration of conversational speech phenomena in
                   synthetic speech. The achieved synthetic speech quality
                   provides an encouraging start for the continued use of
                   conversational speech in HMM-based speech synthesis.},
  doi = {10.1016/j.specom.2011.08.001},
  year = 2012
}
@inproceedings{yong:ssw7,
  author = {Yong Guan and Jilei Tian and Yi-Jian Wu and Junichi
                   Yamagishi and Jani Nurminen},
  title = {A Unified and Automatic Approach of {M}andarin {HTS}
                   System},
  booktitle = {{Proc. SSW7}},
  address = {Kyoto, Japan},
  keywords = {HTS, speech synthesis, mandarin},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/HTS_Yong_ssw7.pdf},
  year = 2010
}
@article{Dines2011,
  author = {John Dines and Hui Liang and Lakshmi Saheer and
                   Matthew Gibson and William Byrne and Keiichiro Oura and
                   Keiichi Tokuda and Junichi Yamagishi and Simon King and
                   Mirjam Wester and Teemu Hirsimäki and Reima
                   Karhila and Mikko Kurimo},
  title = {Personalising speech-to-speech translation:
                   Unsupervised cross-lingual speaker adaptation for
                   {HMM}-based speech synthesis},
  journal = {Computer Speech and Language},
  volume = {27},
  number = {2},
  pages = {420--437},
  abstract = {In this paper we present results of unsupervised
                   cross-lingual speaker adaptation applied to
                   text-to-speech synthesis. The application of our
                   research is the personalisation of speech-to-speech
                   translation in which we employ a HMM statistical
                   framework for both speech recognition and synthesis.
                   This framework provides a logical mechanism to adapt
                   synthesised speech output to the voice of the user by
                   way of speech recognition. In this work we present
                   results of several different unsupervised and
                   cross-lingual adaptation approaches as well as an
                   end-to-end speaker adaptive speech-to-speech
                   translation system. Our experiments show that we can
                   successfully apply speaker adaptation in both
                   unsupervised and cross-lingual scenarios and our
                   proposed algorithms seem to generalise well for several
                   language pairs. We also discuss important future
                   directions including the need for better evaluation
                   metrics.},
  doi = {10.1016/j.csl.2011.08.003},
  issn = {0885-2308},
  keywords = {Speech-to-speech translation, Cross-lingual speaker
                   adaptation, HMM-based speech synthesis, Speaker
                   adaptation, Voice conversion},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230811000441},
  month = feb,
  year = 2013
}
@inproceedings{avss-icassp07,
  author = {J. Yamagishi and T. Kobayashi and M. Tachibana and K.
                   Ogata and Y. Nakano},
  title = {Model adaptation approach to speech synthesis with
                   diverse voices and styles},
  booktitle = {Proc. ICASSP},
  pages = {1233--1236},
  abstract = {In human computer interaction and dialogue systems, it
                   is often desirable for text-to-speech synthesis to be
                   able to generate natural sounding speech with an
                   arbitrary speaker's voice and with varying speaking
                   styles and/or emotional expressions. We have developed
                   an average-voice-based speech synthesis method using
                   statistical average voice models and model adaptation
                   techniques for this purpose. In this paper, we describe
                   an overview of the speech synthesis system and show the
                   current performance with several experimental results.},
  year = 2007
}
@inproceedings{lei2011a,
  author = {Lei, Ming and Yamagishi, Junichi and Richmond, Korin
                   and Ling, Zhen-Hua and King, Simon and Dai, Li-Rong},
  title = {Formant-controlled {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {2777--2780},
  address = {Florence, Italy},
  abstract = {This paper proposes a novel framework that enables us
                   to manipulate and control formants in HMM-based speech
                   synthesis. In this framework, the dependency between
                   formants and spectral features is modelled by piecewise
                   linear transforms; formant parameters are effectively
                   mapped by these to the means of Gaussian distributions
                   over the spectral synthesis parameters. The spectral
                   envelope features generated under the influence of
                   formants in this way may then be passed to high-quality
                   vocoders to generate the speech waveform. This provides
                   two major advantages over conventional frameworks.
                   First, we can achieve spectral modification by changing
                   formants only in those parts where we want control,
                   whereas the user must specify all formants manually in
                   conventional formant synthesisers (e.g. Klatt). Second,
                   this can produce high-quality speech. Our results show
                   the proposed method can control vowels in the
                   synthesized speech by manipulating F1 and F2 without
                   any degradation in synthesis quality.},
  categories = {speech synthesis, hidden Markov model, formants,
                   controllability},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110769.pdf},
  year = 2011
}
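
A minimal sketch of the piecewise linear mapping from formants to spectral-feature
Gaussian means described in the entry above, assuming for illustration that the
pieces are indexed by F1 range (the partitioning rule and all names are assumptions,
not the cited system's):

import numpy as np

def spectral_mean_from_formants(formants, pieces):
    # pieces: list of (f1_lo, f1_hi, A, b); each (A, b) is one linear transform
    # mapping the formant vector to the mean of a spectral-feature Gaussian.
    f = np.asarray(formants, dtype=float)
    for f1_lo, f1_hi, A, b in pieces:
        if f1_lo <= f[0] < f1_hi:
            return A @ f + b
    raise ValueError("formant vector falls outside the modelled range")
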
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
  author = {Cabral, J. and Renals, S. and Richmond, K. and
                   Yamagishi, J.},
  title = {Glottal Spectral Separation for Parametric Speech
                   Synthesis},
  booktitle = {Proc. Interspeech},
  pages = {1829--1832},
  address = {Brisbane, Australia},
  abstract = {This paper presents a method to control the
                   characteristics of synthetic speech flexibly by
                   integrating articulatory features into a Hidden Markov
                   Model (HMM)-based parametric speech synthesis system.
                   In contrast to model adaptation and interpolation
                   approaches for speaking style control, this method is
                   driven by phonetic knowledge, and target speech samples
                   are not required. The joint distribution of parallel
                   acoustic and articulatory features considering
                   cross-stream feature dependency is estimated. At
                   synthesis time, acoustic and articulatory features are
                   generated simultaneously based on the
                   maximum-likelihood criterion. The synthetic speech can
                   be controlled flexibly by modifying the generated
                   articulatory features according to arbitrary phonetic
                   rules in the parameter generation process. Our
                   experiments show that the proposed method is effective
                   in both changing the overall character of synthesized
                   speech and in controlling the quality of a specific
                   vowel. },
  categories = {HMM speech synthesis, Glottal Spectral Separation,
                   LF-model},
  key = {cabral:renals:richmond:yamagishi:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
  year = 2008
}
@inproceedings{zen07:hts-2,
  author = {Heiga Zen and Takashi Nose and Junichi Yamagishi and
                   Shinji Sako and Takashi Masuko and Alan Black and
                   Keiichi Tokuda},
  title = {The {HMM}-based speech synthesis system ({HTS})
                   version 2.0},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  abstract = {A statistical parametric speech synthesis system based
                   on hidden Markov models (HMMs) has grown in popularity
                   over the last few years. This system simultaneously
                   models spectrum, excitation, and duration of speech
                   using context-dependent HMMs and generates speech
                   waveforms from the HMMs themselves. Since December
                   2002, we have publicly released an open-source software
                   toolkit named HMM-based speech synthesis system (HTS)
                   to provide a research and development platform for the
                   speech synthesis community. In December 2006, HTS
                   version 2.0 was released. This version includes a
                   number of new features which are useful for both speech
                   synthesis researchers and developers. This paper
                   describes HTS version 2.0 in detail, as well as future
                   release plans.},
  categories = {HMM, speech synthesis, HTS},
  month = aug,
  year = 2007
}
@article{john:ieee2011,
  author = {J. Dines and J. Yamagishi and S. King},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  journal = {IEEE Journal of Selected Topics in Signal Processing},
  note = {(in press)},
  abstract = {The EMIME European project is conducting research in
                   the development of technologies for mobile,
                   personalised speech-to-speech translation systems. The
                   hidden Markov model (HMM) is being used as the
                   underlying technology in both automatic speech
                   recognition (ASR) and text-to-speech synthesis (TTS)
                   components, thus, the investigation of unified
                   statistical modelling approaches has become an implicit
                   goal of our research. As one of the first steps towards
                   this goal, we have been investigating commonalities and
                   differences between HMM-based ASR and TTS. In this
                   paper we present results and analysis of a series of
                   experiments that have been conducted on English ASR and
                   TTS systems measuring their performance with respect to
                   phone set and lexicon; acoustic feature type and
                   dimensionality; HMM topology; and speaker adaptation.
                   Our results show that, although the fundamental
                   statistical model may be essentially the same, optimal
                   ASR and TTS performance often demands diametrically
                   opposed system designs. This represents a major
                   challenge to be addressed in the investigation of such
                   unified modelling approaches.},
  doi = {10.1109/JSTSP.2010.2079315},
  keywords = {Acoustics, Adaptation model, Context modeling, Hidden
                   Markov models, Speech, Speech recognition, Training,
                   speech recognition, speech synthesis, unified models},
  year = 2011
}
@inproceedings{dallIS2012,
  author = {Dall, Rasmus and Veaux, Christophe and Yamagishi,
                   Junichi and King, Simon},
  title = {Analysis of Speaker Clustering Strategies for
                   {HMM}-Based Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {This paper describes a method for speaker clustering,
                   with the application of building average voice models
                   for speaker-adaptive HMM-based speech synthesis that
                   are a good basis for adapting to specific target
                   speakers. Our main hypothesis is that using
                   perceptually similar speakers to build the average
                   voice model will be better than using unselected
                   speakers, even if the amount of data available from
                   perceptually similar speakers is smaller. We measure
                   the perceived similarities among a group of 30 female
                   speakers in a listening test and then apply multiple
                   linear regression to automatically predict these
                   listener judgements of speaker similarity and thus to
                   identify similar speakers automatically. We then
                   compare a variety of average voice models trained on
                   either speakers who were perceptually judged to be
                   similar to the target speaker, or speakers selected by
                   the multiple linear regression, or a large global set
                   of unselected speakers. We find that the average voice
                   model trained on perceptually similar speakers provides
                   better performance than the global model, even though
                   the latter is trained on more data, confirming our main
                   hypothesis. However, the average voice model using
                   speakers selected automatically by the multiple linear
                   regression does not reach the same level of
                   performance.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/DallIS2012.pdf},
  year = 2012
}
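
A minimal sketch of the multiple linear regression step described above: fit the
listener similarity scores against per-speaker summary features and rank candidate
speakers by predicted similarity (feature choice and names are illustrative
assumptions):

import numpy as np

def select_similar_speakers(feats_rated, similarity, feats_pool, n_select):
    # feats_rated: features of the speakers rated in the listening test
    # similarity: their mean perceived similarity to the target speaker
    # feats_pool: features of candidate speakers for average-voice training
    X = np.hstack([feats_rated, np.ones((len(feats_rated), 1))])   # bias column
    w, *_ = np.linalg.lstsq(X, similarity, rcond=None)             # least squares
    pred = np.hstack([feats_pool, np.ones((len(feats_pool), 1))]) @ w
    return np.argsort(pred)[::-1][:n_select]                       # most similar first
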
@inproceedings{JaimeIS2012,
  author = {Jaime Lorenzo-Trueba and Roberto Barra-Chicote and
                   Tuomo Raitio and Nicolas Obin and Paavo Alku and
                   Junichi Yamagishi and Juan M Montero},
  title = {Towards Glottal Source Controllability in Expressive
                   Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {In order to obtain more human-like sounding human-machine
                   interfaces, we must first give them expressive
                   capabilities in the form of emotional and stylistic
                   features, so as to adapt them closely to the intended
                   task. If we want to replicate those
                   features it is not enough to merely replicate the
                   prosodic information of fundamental frequency and
                   speaking rhythm. The proposed additional layer is the
                   modification of the glottal model, for which we make
                   use of the GlottHMM parameters. This paper analyzes the
                   viability of such an approach by verifying that the
                   expressive nuances are captured by the aforementioned
                   features, obtaining 95% recognition rates on styled
                   speaking and 82% on emotional speech. Then we evaluate
                   the effect of speaker bias and recording environment on
                   the source modeling in order to quantify possible
                   problems when analyzing multi-speaker databases.
                   Finally we propose a speaking-style separation for
                   Spanish based on prosodic features and check its
                   perceptual significance.},
  month = sep,
  year = 2012
}
@inproceedings{hts-child-oliver,
  author = {Oliver Watts and Junichi Yamagishi and Kay Berkling
                   and Simon King},
  title = {{HMM}-based synthesis of child speech},
  booktitle = {Proc. of The 1st Workshop on Child, Computer and
                   Interaction (ICMI'08 post-conference workshop)},
  address = {Crete, Greece},
  abstract = {The synthesis of child speech presents challenges both
                   in the collection of data and in the building of a
                   synthesiser from that data. Because only limited data
                   can be collected, and the domain of that data is
                   constrained, it is difficult to obtain the type of
                   phonetically-balanced corpus usually used in speech
                   synthesis. As a consequence, building a synthesiser
                   from this data is difficult. Concatenative synthesisers
                   are not robust to corpora with many missing units (as
                   is likely when the corpus content is not carefully
                   designed), so we chose to build a statistical
                   parametric synthesiser using the HMM-based system HTS.
                   This technique has previously been shown to perform
                   well for limited amounts of data, and for data
                   collected under imperfect conditions. We compared 6
                   different configurations of the synthesiser, using both
                   speaker-dependent and speaker-adaptive modelling
                   techniques, and using varying amounts of data. The
                   output from these systems was evaluated alongside
                   natural and vocoded speech, in a Blizzard-style
                   listening test.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice,
                   child speech},
  key = {hts-child-oliver},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/child-hts-oliver.pdf},
  year = 2008
}
@article{Oura2012703,
  author = {Keiichiro Oura and Junichi Yamagishi and Mirjam Wester
                   and Simon King and Keiichi Tokuda},
  title = {Analysis of unsupervised cross-lingual speaker
                   adaptation for {HMM}-based speech synthesis using
                   {KLD}-based transform mapping},
  journal = {Speech Communication},
  volume = {54},
  number = {6},
  pages = {703--714},
  abstract = {In the EMIME project, we developed a mobile device
                   that performs personalized speech-to-speech translation
                   such that a user's spoken input in one language is used
                   to produce spoken output in another language, while
                   continuing to sound like the user's voice. We
                   integrated two techniques into a single architecture:
                   unsupervised adaptation for HMM-based TTS using
                   word-based large-vocabulary continuous speech
                   recognition, and cross-lingual speaker adaptation
                   (CLSA) for HMM-based TTS. The CLSA is based on a
                   state-level transform mapping learned using minimum
                   Kullback-Leibler divergence between pairs of HMM states
                   in the input and output languages. Thus, an
                   unsupervised cross-lingual speaker adaptation system
                   was developed. End-to-end speech-to-speech translation
                   systems for four languages (English, Finnish, Mandarin,
                   and Japanese) were constructed within this framework.
                   In this paper, the English-to-Japanese adaptation is
                   evaluated. Listening tests demonstrate that adapted
                   voices sound more similar to a target speaker than
                   average voices and that differences between supervised
                   and unsupervised cross-lingual speaker adaptation are
                   small. Calculating the KLD state-mapping on only the
                   first 10 mel-cepstral coefficients leads to huge
                   savings in computational costs, without any detrimental
                   effect on the quality of the synthetic speech.},
  doi = {10.1016/j.specom.2011.12.004},
  issn = {0167-6393},
  keywords = {HMM-based speech synthesis, Unsupervised speaker
                   adaptation, Cross-lingual speaker adaptation,
                   Speech-to-speech translation},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639311001774},
  year = 2012
}
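
A minimal sketch of the KLD-based state mapping described above, assuming one
diagonal-covariance Gaussian per state, a symmetrised KL divergence, and truncation
to the first 10 mel-cepstral dimensions as in the paper (names are illustrative):

import numpy as np

def symmetric_kld_diag(mu1, var1, mu2, var2):
    # Symmetrised KL divergence between two diagonal-covariance Gaussians.
    kl12 = 0.5 * np.sum(np.log(var2 / var1) + (var1 + (mu1 - mu2) ** 2) / var2 - 1.0)
    kl21 = 0.5 * np.sum(np.log(var1 / var2) + (var2 + (mu2 - mu1) ** 2) / var1 - 1.0)
    return kl12 + kl21

def map_states(output_states, input_states, n_dims=10):
    # For each output-language state (mean, variance), return the index of the
    # input-language state with minimum divergence over the first n_dims dims.
    mapping = []
    for mu_o, var_o in output_states:
        dists = [symmetric_kld_diag(mu_o[:n_dims], var_o[:n_dims],
                                    mu_i[:n_dims], var_i[:n_dims])
                 for mu_i, var_i in input_states]
        mapping.append(int(np.argmin(dists)))
    return mapping
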
@inproceedings{cabral_ssw7,
  author = {Cabral, Jo{\~a}o and Renals, Steve and Richmond, Korin
                   and Yamagishi, Junichi},
  title = {Transforming Voice Source Parameters in a {HMM}-based
                   Speech Synthesiser with Glottal Post-Filtering},
  booktitle = {Proc. 7th ISCA Speech Synthesis Workshop (SSW7)},
  pages = {365--370},
  address = {NICT/ATR, Kyoto, Japan},
  abstract = {Control over voice quality, e.g. breathy and tense
                   voice, is important for speech synthesis applications.
                   For example, transformations can be used to modify
                   aspects of the voice related to the speaker's identity
                   and to improve expressiveness. However, it is hard to
                   modify voice characteristics of the synthetic speech
                   without degrading speech quality. State-of-the-art
                   statistical speech synthesisers, in particular, do not
                   typically allow control over parameters of the glottal
                   source, which are strongly correlated with voice
                   quality. Consequently, the control of voice
                   characteristics in these systems is limited. In
                   contrast, the HMM-based speech synthesiser proposed in
                   this paper uses an acoustic glottal source model. The
                   system passes the glottal signal through a whitening
                   filter to obtain the excitation of voiced sounds. This
                   technique, called glottal post-filtering, allows voice
                   characteristics of the synthetic speech to be
                   transformed by modifying the source model parameters.
                   We evaluated the proposed synthesiser in a perceptual
                   experiment, in terms of speech naturalness,
                   intelligibility, and similarity to the original
                   speaker's voice. The results show that it performed as
                   well as a HMM-based synthesiser which generates the
                   speech signal with a commonly used high-quality speech
                   vocoder.},
  keywords = {HMM-based speech synthesis, voice quality, glottal
                   post-filter},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/cabral_ssw7.pdf},
  year = 2010
}
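
A minimal sketch of the whitening step in the glottal post-filtering technique
described above: estimate a linear-prediction envelope of the glottal source
waveform and inverse-filter it so that only its fine structure remains as the
voiced excitation (the LP order and implementation are assumptions, not the cited
system's):

import numpy as np
from scipy.linalg import solve_toeplitz
from scipy.signal import lfilter

def whiten_glottal_signal(glottal, order=12):
    # Autocorrelation-method LP analysis followed by the inverse (whitening)
    # filter A(z) = 1 - sum_i a_i z^{-i}.
    r = np.correlate(glottal, glottal, mode="full")[len(glottal) - 1:]
    a = solve_toeplitz((r[:order], r[:order]), r[1:order + 1])
    return lfilter(np.concatenate(([1.0], -a)), [1.0], glottal)
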
@inproceedings{tachibana07:styleclassify07,
  author = {Makoto Tachibana and Keigo Kawashima and Junichi
                   Yamagishi and Takao Kobayashi},
  title = {Performance Evaluation of {HMM}-Based Style
                   Classification with a Small Amount of Training Data},
  booktitle = {Proc. Interspeech 2007},
  abstract = {This paper describes a classification technique for
                   emotional expressions and speaking styles of speech
                   using only a small amount of training data of a target
                   speaker. We model spectral and fundamental frequency
                   (F0) features simultaneously using multi-space
                   probability distribution HMM (MSD-HMM), and adapt a
                   speaker-independent neutral style model to a certain
                   target speaker’s style model with a small amount of
                   data using MSD-MLLR, which is an extension of MLLR for
                   MSD-HMM. We perform classification experiments on
                   professional narrators' speech and non-professional
                   speakers' speech and evaluate the performance of the
                   proposed technique by comparing it with other commonly
                   used classifiers. We show that the proposed technique
                   gives better results than the other classifiers when
                   using only a few sentences of the target speaker's
                   style data.},
  categories = {emotion, speaking style, classification},
  month = aug,
  year = 2007
}
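
A minimal sketch of the decision rule implied above: score the utterance under each
adapted style model and return the best-scoring style (the model interface is an
assumption for illustration):

def classify_style(features, style_models, loglik_fn):
    # style_models: dict mapping style name -> adapted model
    # loglik_fn(model, features): log-likelihood of the utterance under a model
    scores = {style: loglik_fn(model, features)
              for style, model in style_models.items()}
    return max(scores, key=scores.get)
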
@article{Ling2010834,
  author = {Zhen-Hua Ling and Korin Richmond and Junichi Yamagishi},
  title = {An Analysis of {HMM}-based prediction of articulatory
                   movements},
  journal = {Speech Communication},
  volume = {52},
  number = {10},
  pages = {834--846},
  abstract = { This paper presents an investigation into predicting
                   the movement of a speaker's mouth from text input using
                   hidden Markov models (HMM). A corpus of human
                   articulatory movements, recorded by electromagnetic
                   articulography (EMA), is used to train HMMs. To predict
                   articulatory movements for input text, a suitable model
                   sequence is selected and a maximum-likelihood parameter
                   generation (MLPG) algorithm is used to generate output
                   articulatory trajectories. Unified
                   acoustic-articulatory HMMs are introduced to integrate
                   acoustic features when an acoustic signal is also
                   provided with the input text. Several aspects of this
                   method are analyzed in this paper, including the
                   effectiveness of context-dependent modeling, the role
                   of supplementary acoustic input, and the
                   appropriateness of certain model structures for the
                   unified acoustic-articulatory models. When text is the
                   sole input, we find that fully context-dependent models
                   significantly outperform monophone and quinphone
                   models, achieving an average root mean square (RMS)
                   error of 1.945 mm and an average correlation
                   coefficient of 0.600. When both text and acoustic
                   features are given as input to the system, the
                   difference between the performance of quinphone models
                   and fully context-dependent models is no longer
                   significant. The best performance overall is achieved
                   using unified acoustic-articulatory quinphone HMMs with
                   separate clustering of acoustic and articulatory model
                   parameters, a synchronous-state sequence, and a
                   dependent-feature model structure, with an RMS error of
                   0.900 mm and a correlation coefficient of 0.855 on
                   average. Finally, we also apply the same quinphone HMMs
                   to the acoustic-articulatory, or inversion, mapping
                   problem, where only acoustic input is available. An
                   average root mean square (RMS) error of 1.076 mm and an
                   average correlation coefficient of 0.812 are achieved.
                   Taken together, our results demonstrate how text and
                   acoustic inputs both contribute to the prediction of
                   articulatory movements in the method used.},
  doi = {10.1016/j.specom.2010.06.006},
  issn = {0167-6393},
  keywords = {Hidden Markov model; Articulatory features; Parameter
                   generation},
  month = oct,
  year = 2010
}
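
A minimal sketch of the two objective scores quoted above, computed between a
predicted and a measured articulator trajectory (RMS error in the units of the
input, here millimetres, and Pearson correlation):

import numpy as np

def rms_error(predicted, measured):
    d = np.asarray(predicted) - np.asarray(measured)
    return float(np.sqrt(np.mean(d ** 2)))

def correlation(predicted, measured):
    return float(np.corrcoef(np.asarray(predicted), np.asarray(measured))[0, 1])
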
@inproceedings{Cassia_ICASSP11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and
                   King, Simon},
  title = {Evaluation of objective measures for intelligibility
                   prediction of {HMM}-based synthetic speech in noise},
  booktitle = {Proc. 2011 IEEE International Conference on Acoustics,
                   Speech and Signal Processing (ICASSP)},
  pages = {5112--5115},
  abstract = {{In this paper we evaluate four objective measures of
                   speech with regards to intelligibility prediction of
                   synthesized speech in diverse noisy situations. We
                   evaluated three intelligibility measures, the Dau
                   measure, the glimpse proportion and the Speech
                   Intelligibility Index (SII) and a quality measure, the
                   Perceptual Evaluation of Speech Quality (PESQ). For the
                   generation of synthesized speech we used a
                   state-of-the-art HMM-based speech synthesis system. The
                   noisy
                   conditions comprised four additive noises. The measures
                   were compared with subjective intelligibility scores
                   obtained in listening tests. The results show the Dau
                   and the glimpse measures to be the best predictors of
                   intelligibility, with correlations of around 0.83 to
                   subjective scores. All measures gave less accurate
                   predictions of intelligibility for synthetic speech
                   than have previously been found for natural speech; in
                   particular the SII measure. In additional experiments,
                   we processed the synthesized speech by an ideal binary
                   mask before adding noise. The Glimpse measure gave the
                   most accurate intelligibility predictions in this
                   situation.}},
  categories = {HMM-based speech synthesis, objective measures of
                   intelligibility},
  doi = {10.1109/ICASSP.2011.5947507},
  issn = {1520-6149},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_ICASSP11.pdf},
  year = 2011
}
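
A minimal sketch of the glimpse proportion idea referred to above: the fraction of
time-frequency regions whose local speech-to-noise ratio exceeds a threshold (3 dB
here; the published measure uses an auditory filterbank front end, which this
simplification omits):

import numpy as np

def glimpse_proportion(speech_power, noise_power, threshold_db=3.0):
    # speech_power, noise_power: time-frequency power arrays of equal shape
    eps = 1e-12
    local_snr_db = 10.0 * np.log10((speech_power + eps) / (noise_power + eps))
    return float(np.mean(local_snr_db > threshold_db))
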
@inproceedings{leo_09-1,
  author = {Leonardo Badino and J. Sebastian Andersson and Junichi
                   Yamagishi and Robert A.J. Clark},
  title = {Identification of Contrast and Its Emphatic
                   Realization in {HMM}-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2009},
  address = {Brighton, U.K.},
  abstract = {The work presented in this paper proposes to identify
                   contrast in the form of contrastive word pairs and
                   prosodically signal it with emphatic accents in a
                   Text-to-Speech (TTS) application using a
                   Hidden-Markov-Model (HMM) based speech synthesis
                   system. We first describe a novel method to
                   automatically detect contrastive word pairs using
                   textual features only and report its performance on a
                   corpus of spontaneous conversations in English.
                   Subsequently we describe the set of features selected
                   to train a HMM-based speech synthesis system in an
                   attempt to properly control prosodic prominence
                   (including emphasis). Results from a large scale
                   perceptual test show that in the majority of cases
                   listeners judge emphatic contrastive word pairs as
                   acceptable as their non-emphatic counterpart, while
                   emphasis on non-contrastive pairs is almost never
                   acceptable.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090749.PDF},
  year = 2009
}
@misc{Hofer_Shimodaira:sigg:2007,
  author = {Gregor Hofer and Hiroshi Shimodaira and Junichi
                   Yamagishi},
  title = {Speech-driven Head Motion Synthesis based on a
                   Trajectory Model},
  howpublished = {Poster at Siggraph 2007},
  address = {San Diego, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/siggraph07.pdf},
  year = 2007
}
@article{yamagishi2009,
  author = {Yamagishi, Junichi and Nose, Takashi and Zen, Heiga
                   and Ling, Zhenhua and Toda, Tomoki and Tokuda, Keiichi
                   and King, Simon and Renals, Steve},
  title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech
                   Synthesis},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {17},
  number = {6},
  pages = {1208--1230},
  abstract = {This paper describes a speaker-adaptive HMM-based
                   speech synthesis system. The new system, called
                   ``HTS-2007,'' employs speaker adaptation (CSMAPLR+MAP),
                   feature-space adaptive training, mixed-gender modeling,
                   and full-covariance modeling using CSMAPLR transforms,
                   in addition to several other techniques that have
                   proved effective in our previous systems. Subjective
                   evaluation results show that the new system generates
                   significantly better quality synthetic speech than
                   speaker-dependent approaches with realistic amounts of
                   speech data, and that it bears comparison with
                   speaker-dependent approaches even when large amounts of
                   speech data are available. In addition, a comparison
                   study with several speech synthesis techniques shows
                   the new system is very robust: It is able to build
                   voices from less-than-ideal speech data and synthesize
                   good-quality speech even for out-of-domain sentences.},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5109758&arnumber=5153555&count=14&index=12},
  year = 2009
}
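
A minimal sketch of the constrained linear-regression update at the core of the
CSMAPLR adaptation mentioned above: a single affine transform is shared by the mean
and the covariance of each Gaussian (the structural MAP prior and regression-class
tying used in CSMAPLR are omitted here):

import numpy as np

def constrained_lr_update(mu, sigma, A, b):
    # Constrained transform: mu' = A mu + b, Sigma' = A Sigma A^T
    return A @ mu + b, A @ sigma @ A.T
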
@inproceedings{phillip:odyssey2010,
  author = {P.L. De Leon and M. Pucher and J. Yamagishi},
  title = {Evaluation of the Vulnerability of Speaker
                   Verification to Synthetic Speech},
  booktitle = {{Proc. Odyssey (The speaker and language recognition
                   workshop) 2010}},
  address = {Brno, Czech Republic},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/main_v2.pdf},
  year = 2010
}
@article{Stan2011442,
  author = {Adriana Stan and Junichi Yamagishi and Simon King and
                   Matthew Aylett},
  title = {The {R}omanian speech synthesis ({RSS}) corpus:
                   Building a high quality {HMM}-based speech synthesis
                   system using a high sampling rate},
  journal = {Speech Communication},
  volume = {53},
  number = {3},
  pages = {442--450},
  abstract = {This paper first introduces a newly-recorded high
                   quality Romanian speech corpus designed for speech
                   synthesis, called ``RSS'', along with Romanian
                   front-end text processing modules and HMM-based
                   synthetic voices built from the corpus. All of these
                   are now freely available for academic use in order to
                   promote Romanian speech technology research. The RSS
                   corpus comprises 3500 training sentences and 500 test
                   sentences uttered by a female speaker and was recorded
                   using multiple microphones at 96 kHz sampling
                   frequency in a hemianechoic chamber. The details of the
                   new Romanian text processor we have developed are also
                   given. Using the database, we then revisit some basic
                   configuration choices of speech synthesis, such as
                   waveform sampling frequency and auditory frequency
                   warping scale, with the aim of improving speaker
                   similarity, which is an acknowledged weakness of
                   current HMM-based speech synthesisers. As we
                   demonstrate using perceptual tests, these configuration
                   choices can make substantial differences to the quality
                   of the synthetic speech. Contrary to common practice in
                   automatic speech recognition, higher waveform sampling
                   frequencies can offer enhanced feature extraction and
                   improved speaker similarity for HMM-based speech
                   synthesis.},
  doi = {10.1016/j.specom.2010.12.002},
  issn = {0167-6393},
  keywords = {Speech synthesis, HTS, Romanian, HMMs, Sampling
                   frequency, Auditory scale},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639310002074},
  year = 2011
}
@inproceedings{oura:icassp:10,
  author = {Keiichiro Oura and Keiichi Tokuda and Junichi
                   Yamagishi and Mirjam Wester and Simon King},
  title = {Unsupervised Cross-lingual Speaker Adaptation for
                   {HMM}-based Speech Synthesis},
  booktitle = {Proc. of ICASSP},
  volume = {I},
  pages = {4954-4957},
  abstract = {In the EMIME project, we are developing a mobile
                   device that performs personalized speech-to-speech
                   translation such that a user's spoken input in one
                   language is used to produce spoken output in another
                   language, while continuing to sound like the user's
                   voice. We integrate two techniques, unsupervised
                   adaptation for HMM-based TTS using a word-based
                   large-vocabulary continuous speech recognizer and
                   cross-lingual speaker adaptation for HMM-based TTS,
                   into a single architecture. Thus, an unsupervised
                   cross-lingual speaker adaptation system can be
                   developed. Listening tests show very promising results,
                   demonstrating that adapted voices sound similar to the
                   target speaker and that differences between supervised
                   and unsupervised cross-lingual speaker adaptation are
                   small.},
  categories = {speaker adaptation, TTS},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/oura_icassp2010.pdf},
  year = 2010
}
@inproceedings{cabral_yrwst,
  author = {J. Cabral and S. Renals and K. Richmond and J.
                   Yamagishi},
  title = {{HMM}-based Speech Synthesis with an Acoustic Glottal
                   Source Model},
  booktitle = {Proc. The First Young Researchers Workshop in Speech
                   Technology},
  abstract = {A major cause of degradation of speech quality in
                   HMM-based speech synthesis is the use of a simple delta
                   pulse signal to generate the excitation of voiced
                   speech. This paper describes a new approach to using an
                   acoustic glottal source model in HMM-based
                   synthesisers. The goal is to improve speech quality and
                   parametric flexibility to better model and transform
                   voice characteristics.},
  categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral
                   Separation},
  month = apr,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf},
  year = 2009
}
@article{Hashimoto2012857,
  author = {Kei Hashimoto and Junichi Yamagishi and William Byrne
                   and Simon King and Keiichi Tokuda},
  title = {Impacts of machine translation and speech synthesis on
                   speech-to-speech translation},
  journal = {Speech Communication},
  volume = {54},
  number = {7},
  pages = {857--866},
  abstract = {This paper analyzes the impacts of machine translation
                   and speech synthesis on speech-to-speech translation
                   systems. A typical speech-to-speech translation system
                   consists of three components: speech recognition,
                   machine translation and speech synthesis. Many
                   techniques have been proposed for integration of speech
                   recognition and machine translation. However,
                   corresponding techniques have not yet been considered
                   for speech synthesis. The focus of the current work is
                   machine translation and speech synthesis, and we
                   present a subjective evaluation designed to analyze
                   their impact on speech-to-speech translation. The
                   results of these analyses show that the naturalness and
                   intelligibility of the synthesized speech are strongly
                   affected by the fluency of the translated sentences. In
                   addition, several features were found to correlate well
                   with the average fluency of the translated sentences
                   and the average naturalness of the synthesized speech.},
  doi = {10.1016/j.specom.2012.02.004},
  issn = {0167-6393},
  keywords = {Speech-to-speech translation, Machine translation,
                   Speech synthesis, Subjective evaluation},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639312000283},
  year = 2012
}
@article{6205335,
  author = {De Leon, P. L. and Pucher, M. and Yamagishi, J. and
                   Hernaez, I. and Saratxaga, I.},
  title = {Evaluation of Speaker Verification Security and
                   Detection of {HMM}-Based Synthetic Speech},
  journal = {IEEE Transactions on Audio, Speech, and Language
                   Processing},
  volume = {20},
  number = {8},
  pages = {2280--2290},
  abstract = {In this paper, we evaluate the vulnerability of
                   speaker verification (SV) systems to synthetic speech.
                   The SV systems are based on either the Gaussian mixture
                   model--universal background model (GMM-UBM) or
                   support vector machine (SVM) using GMM supervectors. We
                   use a hidden Markov model (HMM)-based text-to-speech
                   (TTS) synthesizer, which can synthesize speech for a
                   target speaker using small amounts of training data
                   through model adaptation of an average voice or
                   background model. Although the SV systems have a very
                   low equal error rate (EER), when tested with synthetic
                   speech generated from speaker models derived from the
                   Wall Street Journal (WSJ) speech corpus, over 81% of
                   the matched claims are accepted. This result suggests
                   vulnerability in SV systems and thus a need to
                   accurately detect synthetic speech. We propose a new
                   feature based on relative phase shift (RPS),
                   demonstrate reliable detection of synthetic speech, and
                   show how this classifier can be used to improve
                   security of SV systems.},
  doi = {10.1109/TASL.2012.2201472},
  issn = {1558-7916},
  month = oct,
  year = 2012
}
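
A minimal sketch of GMM-UBM scoring as used in the SV systems above: a trial is
accepted when the average per-frame log-likelihood ratio between the target
speaker's adapted GMM and the universal background model exceeds a threshold (the
model representation and names are illustrative assumptions):

import numpy as np
from scipy.special import logsumexp

def gmm_avg_loglik(frames, weights, means, variances):
    # Average per-frame log-likelihood of a diagonal-covariance GMM.
    comp = []
    for w, mu, var in zip(weights, means, variances):
        ll = -0.5 * (np.sum(np.log(2.0 * np.pi * var))
                     + np.sum((frames - mu) ** 2 / var, axis=1))
        comp.append(np.log(w) + ll)
    return float(np.mean(logsumexp(np.stack(comp), axis=0)))

def accept_trial(frames, target_gmm, ubm, threshold):
    # target_gmm, ubm: (weights, means, variances) tuples
    llr = gmm_avg_loglik(frames, *target_gmm) - gmm_avg_loglik(frames, *ubm)
    return llr > threshold
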
@inproceedings{CassiaSAPA12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Speech intelligibility enhancement for {HMM}-based
                   synthetic speech in noise}},
  booktitle = {Proc. Sapa Workshop},
  address = {Portland, USA},
  abstract = {It is possible to increase the intelligibility of
                   speech in noise by enhancing the clean speech signal.
                   In this paper we demonstrate the effects of modifying
                   the spectral envelope of synthetic speech according to
                   the environmental noise. To achieve this, we modify Mel
                   cepstral coefficients according to an intelligibility
                   measure that accounts for glimpses of speech in noise:
                   the Glimpse Proportion measure. We evaluate this method
                   against a baseline synthetic voice trained only with
                   normal speech and a topline voice trained with Lombard
                   speech, as well as natural speech. The intelligibility
                   of these voices was measured when mixed with
                   speech-shaped noise and with a competing speaker at
                   three different levels. The Lombard voices, both
                   natural and synthetic, were more intelligible than the
                   normal voices in all conditions. For speech-shaped
                   noise, the proposed modified voice was as intelligible
                   as the Lombard synthetic voice without requiring any
                   recordings of Lombard speech, which are hard to obtain.
                   However, in the case of competing talker noise, the
                   Lombard synthetic voice was more intelligible than the
                   proposed modified voice.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Sapa12.pdf},
  year = 2012
}
@inproceedings{jyamagis:emime,
  author = {Junichi Yamagishi and Mike Lincoln and Simon King and
                   John Dines and Matthew Gibson and Jilei Tian and Yong
                   Guan},
  title = {Analysis of Unsupervised and Noise-Robust
                   Speaker-Adaptive {HMM}-Based Speech Synthesis Systems
                   toward a Unified {ASR} and {TTS} Framework},
  booktitle = {Proc. Interspeech 2009},
  address = {Edinburgh, U.K.},
  abstract = {For the 2009 Blizzard Challenge we have built an
                   unsupervised version of the HTS-2008 speaker-adaptive
                   HMM-based speech synthesis system for English, and a
                   noise robust version of the systems for Mandarin. They
                   are designed from a multidisciplinary application point
                   of view in that we attempt to integrate the components
                   of the TTS system with other technologies such as ASR.
                   All the average voice models are trained exclusively
                   from recognized, publicly available, ASR databases.
                   Multi-pass LVCSR and confidence scores calculated from
                   confusion networks are used for the unsupervised
                   systems, and noisy data recorded in cars or public
                   spaces is used for the noise robust system. We believe
                   the developed systems form solid benchmarks and provide
                   good connections to ASR fields. This paper describes
                   the development of the systems and reports the results
                   and analysis of their evaluation.},
  month = sep,
  year = 2009
}
@inproceedings{CassiaLista12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Using an intelligibility measure to create noise
                   robust cepstral coefficients for {HMM}-based speech
                   synthesis}},
  booktitle = {Proc. LISTA Workshop},
  address = {Edinburgh, UK},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Lista12.pdf},
  year = 2012
}
@inproceedings{cabral2011a,
  author = {Cabral, J.P. and Renals, S. and Yamagishi, J. and
                   Richmond, K.},
  title = {{HMM}-based speech synthesiser using the {LF}-model of
                   the glottal source},
  booktitle = {Proc. 2011 IEEE International Conference on Acoustics,
                   Speech and Signal Processing (ICASSP)},
  pages = {4704--4707},
  abstract = {A major factor which causes a deterioration in speech
                   quality in {HMM}-based speech synthesis is the use of a
                   simple delta pulse signal to generate the excitation of
                   voiced speech. This paper sets out a new approach to
                   using an acoustic glottal source model in HMM-based
                   synthesisers instead of the traditional pulse signal.
                   The goal is to improve speech quality and to better
                   model and transform voice characteristics. We have
                   found the new method decreases buzziness and also
                   improves prosodic modelling. A perceptual evaluation
                   has supported this finding by showing a 55.6%
                   preference for the new system, as against the baseline.
                   This improvement, while not being as significant as we
                   had initially expected, does encourage us to work on
                   developing the proposed speech synthesiser further.},
  categories = {HMM-based speech synthesiser;acoustic glottal source
                   model LF-model;delta pulse signal;perceptual
                   evaluation;prosodic modelling;speech quality;voiced
                   speech generation;hidden Markov models;speech
                   synthesis;},
  doi = {10.1109/ICASSP.2011.5947405},
  issn = {1520-6149},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/05947405.pdf},
  year = 2011
}
@misc{Hofer_Shimodaira:sca:2007,
  author = {Gregor Hofer and Hiroshi Shimodaira and Junichi
                   Yamagishi},
  title = {Lip motion synthesis using a context dependent
                   trajectory hidden {M}arkov model},
  howpublished = {Poster at SCA 2007},
  address = {San Diego, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sca07.pdf},
  year = 2007
}
@inproceedings{higher_level,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {The role of higher-level linguistic features in
                   {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {841-844},
  address = {Makuhari, Japan},
  abstract = {We analyse the contribution of higher-level elements
                   of the linguistic specification of a data-driven speech
                   synthesiser to the naturalness of the synthetic speech
                   which it generates. The system is trained using various
                   subsets of the full feature-set, in which features
                   relating to syntactic category, intonational phrase
                   boundary, pitch accent and boundary tones are
                   selectively removed. Utterances synthesised by the
                   different configurations of the system are then
                   compared in a subjective evaluation of their
                   naturalness. The work presented forms background
                   analysis for an ongoing set of experiments in
                   performing text-to-speech (TTS) conversion based on
                   shallow features: features that can be trivially
                   extracted from text. By building a range of systems,
                   each assuming the availability of a different level of
                   linguistic annotation, we obtain benchmarks for our
                   on-going work.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100565.pdf},
  year = 2010
}
@inproceedings{tts_barra08,
  author = {R. Barra-Chicote and J. Yamagishi and J.M. Montero and
                   S. King and S. Lutfi and J. Macias-Guarasa},
  title = {Generacion de una voz sintetica en {C}astellano basada
                   en {HSMM} para la {E}valuacion {A}lbayzin 2008:
                   conversion texto a voz},
  booktitle = {V Jornadas en Tecnologia del Habla},
  pages = {115-118},
  note = {(in Spanish)},
  month = nov,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/tts-jth08.pdf},
  year = 2008
}
@inproceedings{lips08-gregpr,
  author = {Gregor Hofer and Junichi Yamagishi and Hiroshi
                   Shimodaira},
  title = {Speech-driven Lip Motion Generation with a Trajectory
                   {HMM}},
  booktitle = {Proc. Interspeech 2008},
  pages = {2314--2317},
  address = {Brisbane, Australia},
  abstract = {Automatic speech animation remains a challenging
                   problem that can be described as finding the optimal
                   sequence of animation parameter configurations given
                   some speech. In this paper we present a novel technique
                   to automatically synthesise lip motion trajectories
                   from a speech signal. The developed system predicts lip
                   motion units from the speech signal and generates
                   animation trajectories automatically employing a
                   ``Trajectory Hidden Markov Model''. Using the MLE
                   criterion, its parameter generation algorithm produces
                   the optimal smooth motion trajectories that are used to
                   drive control points on the lips directly.
                   Additionally, experiments were carried out to find a
                   suitable model unit that produces the most accurate
                   results. Finally a perceptual evaluation was conducted,
                   that showed that the developed motion units perform
                   better than phonemes.},
  categories = {visual speech synthesis, trajectory HMM, HTS},
  key = {lips08-gregpr},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/GregorLipsChallenge08.pdf},
  year = 2008
}
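
A minimal sketch of the MLE parameter generation behind the trajectory HMM mentioned
above, for one trajectory dimension: given per-frame means and variances of static
and delta features, solve (W^T U^{-1} W) c = W^T U^{-1} mu for the smooth static
trajectory c. A simple backward-difference delta window is assumed here for brevity:

import numpy as np

def mlpg_1d(mu_static, var_static, mu_delta, var_delta):
    T = len(mu_static)
    # W stacks identity rows (statics) on first-difference rows (deltas).
    W = np.vstack([np.eye(T), np.eye(T) - np.eye(T, k=-1)])
    mu = np.concatenate([mu_static, mu_delta])
    prec = np.concatenate([1.0 / np.asarray(var_static),
                           1.0 / np.asarray(var_delta)])
    A = W.T @ (prec[:, None] * W)        # W^T U^{-1} W
    b = W.T @ (prec * mu)                # W^T U^{-1} mu
    return np.linalg.solve(A, b)
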
@inproceedings{john:HTSGAP,
  author = {J. Dines and J. Yamagishi and S. King},
  title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
  booktitle = {Proc. Interspeech},
  pages = {1391--1394},
  address = {Brighton, U.K.},
  abstract = {The EMIME European project is conducting research in
                   the development of technologies for mobile,
                   personalised speech-to-speech translation systems. The
                   hidden Markov model is being used as the underlying
                   technology in both automatic speech recognition (ASR)
                   and text-to-speech synthesis (TTS) components, thus,
                   the investigation of unified statistical modelling
                   approaches has become an implicit goal of our research.
                   As one of the first steps towards this goal, we have
                   been investigating commonalities and differences
                   between HMM-based ASR and TTS. In this paper we present
                   results and analysis of a series of experiments that
                   have been conducted on English ASR and TTS systems,
                   measuring their performance with respect to phone set
                   and lexicon, acoustic feature type and dimensionality
                   and HMM topology. Our results show that, although the
                   fundamental statistical model may be essentially the
                   same, optimal ASR and TTS performance often demands
                   diametrically opposed system designs. This represents a
                   major challenge to be addressed in the investigation of
                   such unified modelling approaches.},
  month = sep,
  year = 2009
}
@inproceedings{kurimo:acl:10,
  author = {Mikko Kurimo and William Byrne and John Dines and
                   Philip N. Garner and Matthew Gibson and Yong Guan and
                   Teemu Hirsim\"{a}ki and Reima Karhila and Simon King
                   and Hui Liang and Keiichiro Oura and Lakshmi Saheer and
                   Matt Shannon and Sayaka Shiota and Jilei Tian and
                   Keiichi Tokuda and Mirjam Wester and Yi-Jian Wu and
                   Junichi Yamagishi},
  title = {Personalising speech-to-speech translation in the
                   {EMIME} project},
  booktitle = {Proc. of the ACL 2010 System Demonstrations},
  address = {Uppsala, Sweden},
  abstract = {In the EMIME project we have studied unsupervised
                   cross-lingual speaker adaptation. We have employed an
                   HMM statistical framework for both speech recognition
                   and synthesis which provides transformation mechanisms
                   to adapt the synthesized voice in TTS (text-to-speech)
                   using the recognized voice in ASR (automatic speech
                   recognition). An important application for this
                   research is personalised speech-to-speech translation
                   that will use the voice of the speaker in the input
                   language to utter the translated sentences in the
                   output language. In mobile environments this enhances
                   the users' interaction across language barriers by
                   making the output speech sound more like the original
                   speaker's way of speaking, even if she or he could not
                   speak the output language.},
  categories = {speaker adaptation},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/kurimo_acl_2010.pdf},
  year = 2010
}
@inproceedings{cabral07,
  author = {J. Cabral and S. Renals and K. Richmond and J.
                   Yamagishi},
  title = {Towards an Improved Modeling of the Glottal Source in
                   Statistical Parametric Speech Synthesis},
  booktitle = {Proc. of the 6th ISCA Workshop on Speech Synthesis},
  address = {Bonn, Germany},
  abstract = {This paper proposes the use of the Liljencrants-Fant
                   model (LF-model) to represent the glottal source signal
                   in HMM-based speech synthesis systems. These systems
                   generally use a pulse train to model the periodicity of
                   the excitation signal of voiced speech. However, this
                   model produces a strong and uniform harmonic structure
                   throughout the spectrum of the excitation which makes
                   the synthetic speech sound buzzy. The use of a mixed
                   band excitation and phase manipulation reduces this
                   effect but it can result in degradation of the speech
                   quality if the noise component is not weighted
                   carefully. In turn, the LF-waveform has a decaying
                   spectrum at higher frequencies, which is more similar
                   to the real glottal source excitation signal. We
                   conducted a perceptual experiment to test the
                   hypothesis that the LF-model can perform as well as or
                   better than the pulse train in an HMM-based speech
                   synthesizer. In the synthesis, we used the mean values
                   of the LF-parameters, calculated by measurements of the
                   recorded speech. The result of this study is important
                   not only regarding the improvement in speech quality of
                   this type of system, but also because the LF-model
                   can be used to model many characteristics of the
                   glottal source, such as voice quality, which are
                   important for voice transformation and generation of
                   expressive speech.},
  categories = {LF-model, Statistical parametric speech synthesis,
                   HMM-based speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
  year = 2007
}
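
For illustration, a much-simplified sketch of a Liljencrants-Fant-style glottal flow derivative pulse follows; the parameter choices (tp, te, ta, alpha) and the omission of the LF area-balance and continuity constraints are simplifications for this sketch, not the excitation model used in the paper.

import numpy as np

def lf_pulse(fs=16000, f0=120.0, tp=0.45, te=0.55, ta=0.03, alpha=3.0):
    """Very simplified LF-style glottal flow derivative for one pitch period.

    tp (instant of maximum flow), te (instant of closure) and ta (return
    phase constant) are fractions of the period.  A complete LF model would
    also solve implicit equations so the pulse integrates to zero and the
    phases join smoothly; this sketch only reproduces the overall shape,
    whose spectrum decays at higher frequencies."""
    T0 = 1.0 / f0
    n = int(round(fs * T0))
    t = np.arange(n) / fs
    Te, Ta = te * T0, ta * T0
    wg = np.pi / (tp * T0)            # open-phase sinusoid peaks at tp

    e = np.zeros(n)
    open_phase = t <= Te
    e[open_phase] = np.exp(alpha * t[open_phase] / T0) * np.sin(wg * t[open_phase])

    # Return phase: exponential recovery from the value at closure towards zero.
    ret = ~open_phase
    e[ret] = e[open_phase][-1] * np.exp(-(t[ret] - Te) / Ta)
    return e

pulse = lf_pulse()   # tile copies of `pulse` to form a voiced excitation signal
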
@article{michael09:dialectHTS,
  author = {Michael Pucher and Dietmar Schabus and Junichi
                   Yamagishi and Friedrich Neubarth and Volker Strom},
  title = {Modeling and Interpolation of {Austrian German and
                   Viennese} Dialect in {HMM}-based Speech Synthesis},
  journal = {Speech Communication},
  volume = {52},
  number = {2},
  pages = {164--179},
  abstract = {An HMM-based speech synthesis framework is applied to
                   both Standard Austrian German and a Viennese dialectal
                   variety and several training strategies for
                   multi-dialect modeling such as dialect clustering and
                   dialect-adaptive training are investigated. For
                   bridging the gap between processing on the level of
                   HMMs and on the linguistic level, we add phonological
                   transformations to the HMM interpolation and apply them
                   to dialect interpolation. The crucial steps are to
                   employ several formalized phonological rules between
                   Austrian German and Viennese dialect as constraints for
                   the HMM interpolation. We verify the effectiveness of
                   this strategy in a number of perceptual evaluations.
                   Since the HMM space used is not articulatory but
                   acoustic space, there are some variations in evaluation
                   results between the phonological rules. However, in
                   general we obtained good evaluation results which show
                   that listeners can perceive both continuous and
                   categorical changes of dialect varieties by using
                   phonological transformations employed as switching
                   rules in the HMM interpolation.},
  categories = {speech synthesis, hidden Markov model, dialect,
                   sociolect, Austrian German},
  doi = {10.1016/j.specom.2009.09.004},
  year = 2010
}
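
A minimal sketch of the kind of model interpolation this abstract discusses, assuming single Gaussian state distributions; the alpha threshold used for the switching rule is purely illustrative, and the paper's constrained interpolation is not reproduced.

import numpy as np

def interpolate_gaussian(mean_a, var_a, mean_b, var_b, alpha):
    """Linear interpolation of two Gaussian state distributions; alpha=0 gives
    variety A (e.g. Standard Austrian German), alpha=1 gives variety B
    (e.g. Viennese dialect)."""
    return ((1 - alpha) * mean_a + alpha * mean_b,
            (1 - alpha) * var_a + alpha * var_b)

def interpolate_with_rule(mean_a, var_a, mean_b, var_b, alpha, rule_applies):
    """Where a formalized phonological rule applies, it acts as a switching
    rule: the state jumps categorically to the other variety once alpha
    passes a threshold (0.5 here, illustrative), instead of being mixed."""
    if rule_applies:
        return (mean_b, var_b) if alpha >= 0.5 else (mean_a, var_a)
    return interpolate_gaussian(mean_a, var_a, mean_b, var_b, alpha)
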
@inproceedings{king:tokuda:zen:yamagishi:interspeech2008,
  author = {Simon King and Keiichi Tokuda and Heiga Zen and
                   Junichi Yamagishi},
  title = {Unsupervised adaptation for HMM-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {1869--1872},
  address = {Brisbane, Australia},
  abstract = {It is now possible to synthesise speech using HMMs
                   with a comparable quality to unit-selection techniques.
                   Generating speech from a model has many potential
                   advantages over concatenating waveforms. The most
                   exciting is model adaptation. It has been shown that
                   supervised speaker adaptation can yield high-quality
                   synthetic voices with an order of magnitude less data
                   than required to train a speaker-dependent model or to
                   build a basic unit-selection system. Such supervised
                   methods require labelled adaptation data for the target
                   speaker. In this paper, we introduce a method capable
                   of unsupervised adaptation, using only speech from the
                   target speaker without any labelling.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   trajectory HMMs, speaker adaptation, MLLR},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080299.PDF},
  year = 2008
}
@article{child_speech_journal_2010,
  author = {Watts, O. and Yamagishi, J. and King, S. and Berkling,
                   K.},
  title = {Synthesis of Child Speech with {HMM} Adaptation and
                   Voice Conversion},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {18},
  number = {5},
  pages = {1005--1016},
  abstract = {The synthesis of child speech presents challenges both
                   in the collection of data and in the building of a
                   synthesizer from that data. We chose to build a
                   statistical parametric synthesizer using the hidden
                   Markov model (HMM)-based system HTS, as this technique
                   has previously been shown to perform well for limited
                   amounts of data, and for data collected under imperfect
                   conditions. Six different configurations of the
                   synthesizer were compared, using both speaker-dependent
                   and speaker-adaptive modeling techniques, and using
                   varying amounts of data. For comparison with HMM
                   adaptation, techniques from voice conversion were used
                   to transform existing synthesizers to the
                   characteristics of the target speaker. Speaker-adaptive
                   voices generally outperformed child speaker-dependent
                   voices in the evaluation. HMM adaptation outperformed
                   voice conversion style techniques when using the full
                   target speaker corpus; with fewer adaptation data,
                   however, no significant listener preference for either
                   HMM adaptation or voice conversion methods was found.},
  doi = {10.1109/TASL.2009.2035029},
  issn = {1558-7916},
  keywords = {HMM adaptation techniques;child speech
                   synthesis;hidden Markov model;speaker adaptive modeling
                   technique;speaker dependent technique;speaker-adaptive
                   voice;statistical parametric synthesizer;target speaker
                   corpus;voice conversion;hidden Markov models;speech
                   synthesis;},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/YamagishiJ_Synthesis\%20of\%20Child\%20Speech.pdf},
  year = 2010
}
@inproceedings{ling2011a,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {Feature-space transform tying in unified
                   acoustic-articulatory modelling of articulatory control
                   of {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {117--120},
  address = {Florence, Italy},
  abstract = {In previous work, we have proposed a method to control
                   the characteristics of synthetic speech flexibly by
                   integrating articulatory features into hidden Markov
                   model (HMM) based parametric speech synthesis. A
                   unified acoustic-articulatory model was trained and a
                   piecewise linear transform was adopted to describe the
                   dependency between these two feature streams. The
                   transform matrices were trained for each HMM state and
                   were tied based on each state's context. In this paper,
                   an improved acoustic-articulatory modelling method is
                   proposed. A Gaussian mixture model (GMM) is introduced
                   to model the articulatory space and the cross-stream
                   transform matrices are trained for each Gaussian
                   mixture instead of context-dependently. This means the
                   dependency relationship can vary with the change of
                   articulatory features flexibly. Our results show this
                   method improves the effectiveness of control over vowel
                   quality by modifying articulatory trajectories without
                   degrading naturalness.},
  categories = {speech synthesis, articulatory features, hidden Markov
                   model, Gaussian mixture model},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/IS110482.pdf},
  year = 2011
}
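
The cross-stream dependency described above can be sketched as follows, assuming the acoustic-stream mean is predicted from the articulatory vector through mixture-specific linear transforms weighted by GMM posteriors; the joint training procedure of the paper is not shown, and soft posterior weighting is an assumption of this sketch.

import numpy as np
from scipy.stats import multivariate_normal

def acoustic_mean(x_artic, weights, means, covs, A, b):
    """Predict the acoustic-stream mean from an articulatory vector x_artic.

    Each Gaussian mixture component m owns a linear transform (A[m], b[m]);
    the transforms are blended by the posterior responsibility of x_artic
    under the articulatory GMM, so the acoustic-articulatory dependency
    changes as the articulators move."""
    lik = np.array([w * multivariate_normal.pdf(x_artic, m, c)
                    for w, m, c in zip(weights, means, covs)])
    post = lik / lik.sum()
    return sum(p * (Am @ x_artic + bm) for p, Am, bm in zip(post, A, b))
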
@inproceedings{5947506,
  author = {Hashimoto, K. and Yamagishi, J. and Byrne, W. and
                   King, S. and Tokuda, K.},
  title = {An analysis of machine translation and speech
                   synthesis in speech-to-speech translation system},
  booktitle = {Proc. ICASSP 2011},
  pages = {5108--5111},
  abstract = {This paper provides an analysis of the impacts of
                   machine translation and speech synthesis on
                   speech-to-speech translation systems. The
                   speech-to-speech translation system consists of three
                   components: speech recognition, machine translation and
                   speech synthesis. Many techniques for integration of
                   speech recognition and machine translation have been
                   proposed. However, speech synthesis has not yet been
                   considered. Therefore, in this paper, we focus on
                   machine translation and speech synthesis, and report a
                   subjective evaluation to analyze the impact of each
                   component. The results of these analyses show that the
                   naturalness and intelligibility of synthesized speech
                   are strongly affected by the fluency of the translated
                   sentences.},
  doi = {10.1109/ICASSP.2011.5947506},
  issn = {1520-6149},
  keywords = {machine translation;speech recognition;speech
                   synthesis;speech-to-speech translation system;speech
                   recognition;speech synthesis;},
  month = may,
  year = 2011
}
@inproceedings{junichi:interspeech2010,
  author = {Junichi Yamagishi and Oliver Watts and Simon King and
                   Bela Usabaev},
  title = {Roles of the Average Voice in Speaker-adaptive
                   {HMM}-based Speech Synthesis},
  booktitle = {{Proc. Interspeech}},
  pages = {418--421},
  address = {Makuhari, Japan},
  abstract = {In speaker-adaptive HMM-based speech synthesis, there
                   are typically a few speakers for which the output
                   synthetic speech sounds worse than that of other
                   speakers, despite having the same amount of adaptation
                   data from within the same corpus. This paper
                   investigates these fluctuations in quality and
                   concludes that as mel-cepstral distance from the
                   average voice becomes larger, the MOS naturalness
                   scores generally become worse. Although this negative
                   correlation is not that strong, it suggests a way to
                   improve the training and adaptation strategies. We also
                   draw comparisons between our findings and the work of
                   other researchers regarding ``vocal attractiveness.''},
  keywords = {speech synthesis, HMM, average voice, speaker
                   adaptation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100361.pdf},
  year = 2010
}
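
A hedged sketch of the two quantities the abstract relates: mel-cepstral distortion of an adapted voice from the average voice, and its correlation with MOS naturalness scores. The exact distance computation and evaluation protocol of the paper are not reproduced.

import numpy as np

def mel_cepstral_distortion(c_ref, c_test):
    """Average mel-cepstral distortion in dB between two sequences of
    mel-cepstral vectors (frames x order), ignoring the 0th (energy) term."""
    diff = c_ref[:, 1:] - c_test[:, 1:]
    return np.mean(10.0 / np.log(10) * np.sqrt(2.0 * np.sum(diff ** 2, axis=1)))

def correlation_with_mos(distances, mos_scores):
    """Pearson correlation between per-speaker distance from the average voice
    and MOS; the abstract reports this correlation is negative but not strong."""
    return np.corrcoef(distances, mos_scores)[0, 1]
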
@inproceedings{junichi:icassp2010,
  author = {J. Yamagishi and S. King},
  title = {Simple methods for improving speaker-similarity of
                   {HMM}-based speech synthesis},
  booktitle = {{Proc. ICASSP 2010}},
  address = {Dallas, Texas, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/JunichiICASSP10.pdf},
  year = 2010
}
@inproceedings{hts2007-icassp,
  author = {Junichi Yamagishi and Takashi Nose and Heiga Zen and
                   Tomoki Toda and Keiichi Tokuda},
  title = {Performance Evaluation of The Speaker-Independent
                   {HMM}-based Speech Synthesis System "{HTS}-2007" for
                   the {Blizzard Challenge 2007}},
  booktitle = {Proc. ICASSP 2008},
  pages = {3957--3960},
  address = {Las Vegas, U.S.A.},
  abstract = {This paper describes a speaker-independent/adaptive
                   HMM-based speech synthesis system developed for the
                   Blizzard Challenge 2007. The new system, named
                   ``HTS-2007'', employs speaker adaptation
                   (CSMAPLR+MAP), feature-space adaptive training,
                   mixed-gender modeling, and full-covariance modeling
                   using CSMAPLR transforms, in addition to several other
                   techniques that have proved effective in our previous
                   systems. Subjective evaluation results show that the
                   new system generates significantly better quality
                   synthetic speech than that of speaker-dependent
                   approaches with realistic amounts of speech data, and
                   that it bears comparison with speaker-dependent
                   approaches even when large amounts of speech data are
                   available.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice},
  doi = {10.1109/ICASSP.2008.4518520},
  key = {hts2007-icassp},
  month = apr,
  year = 2008
}
@article{junichi:ieee2010,
  author = {J. Yamagishi and B. Usabaev and S. King and O. Watts
                   and J. Dines and J. Tian and R. Hu and Y. Guan and K.
                   Oura and K. Tokuda and R. Karhila and M. Kurimo},
  title = {Thousands of Voices for {HMM}-based Speech Synthesis
                   -- Analysis and Application of {TTS} Systems Built on
                   Various {ASR} Corpora},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = 18,
  number = 5,
  pages = {984--1004},
  abstract = {In conventional speech synthesis, large amounts of
                   phonetically balanced speech data recorded in highly
                   controlled recording studio environments are typically
                   required to build a voice. Although using such data is
                   a straightforward solution for high quality synthesis,
                   the number of voices available will always be limited,
                   because recording costs are high. On the other hand,
                   our recent experiments with HMM-based speech synthesis
                   systems have demonstrated that speaker-adaptive
                   HMM-based speech synthesis (which uses an ``average
                   voice model'' plus model adaptation) is robust to
                   non-ideal speech data that are recorded under various
                   conditions and with varying microphones, that are not
                   perfectly clean, and/or that lack phonetic balance.
                   This enables us to consider building high-quality
                   voices on ``non-TTS'' corpora such as ASR corpora.
                   Since ASR corpora generally include a large number of
                   speakers, this leads to the possibility of producing an
                   enormous number of voices automatically. In this paper,
                   we demonstrate the thousands of voices for HMM-based
                   speech synthesis that we have made from several popular
                   ASR corpora such as the Wall Street Journal (WSJ0,
                   WSJ1, and WSJCAM0), Resource Management, Globalphone,
                   and SPEECON databases. We also present the results of
                   associated analysis based on perceptual evaluation, and
                   discuss remaining issues.},
  doi = {10.1109/TASL.2010.2045237},
  keywords = {Automatic speech recognition (ASR), H Triple S (HTS),
                   SPEECON database, WSJ database, average voice, hidden
                   Markov model (HMM)-based speech synthesis, speaker
                   adaptation, speech synthesis, voice conversion},
  month = jul,
  year = 2010
}
@inproceedings{CassiaIS12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Mel cepstral coefficient modification based on the
                   Glimpse Proportion measure for improving the
                   intelligibility of {HMM}-generated synthetic speech in
                   noise}},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  abstract = {We propose a method that modifies the Mel cepstral
                   coefficients of HMM-generated synthetic speech in order
                   to increase the intelligibility of the generated speech
                   when heard by a listener in the presence of a known
                   noise. This method is based on an approximation we
                   previously proposed for the Glimpse Proportion measure.
                   Here we show how to update the Mel cepstral
                   coefficients using this measure as an optimization
                   criterion and how to control the amount of distortion
                   by limiting the frequency resolution of the
                   modifications. To evaluate the method we built eight
                   different voices from normal read-text speech data from
                   a male speaker. Some voices were also built from
                   Lombard speech data produced by the same speaker.
                   Listening experiments with speech-shaped noise and with
                   a single competing talker indicate that our method
                   significantly improves intelligibility when compared to
                   unmodified synthetic speech. The voices built from
                   Lombard speech outperformed the proposed method
                   particularly for the competing talker case. However,
                   compared to a voice using only the spectral parameters
                   from Lombard speech, the proposed method obtains
                   similar or higher performance.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement, Mel cepstral coefficients},
  month = sep,
  year = 2012
}
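
For orientation, a sketch of the Glimpse Proportion measure and of a generic gradient-ascent update of mel-cepstral coefficients; `objective_grad` stands in for the paper's closed-form approximation of the measure and its gradient, which are not reproduced here.

import numpy as np

def glimpse_proportion(speech_bands_db, noise_bands_db, threshold_db=3.0):
    """Fraction of time-frequency cells where the local speech level exceeds
    the noise level by `threshold_db` (a glimpse), given band energies in dB."""
    return np.mean(speech_bands_db - noise_bands_db > threshold_db)

def modify_cepstra(mcep, objective_grad, step=0.01, n_iters=50, max_order=20):
    """Generic gradient-ascent update of mel-cepstral coefficients (frames x
    order) against a differentiable intelligibility objective, restricting
    changes to the lower `max_order` coefficients to limit distortion."""
    mcep = mcep.copy()
    for _ in range(n_iters):
        g = objective_grad(mcep)
        g[:, max_order:] = 0.0          # coarse control of frequency resolution
        mcep += step * g
    return mcep
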
@inproceedings{wester:ssw7:10,
  author = {Mirjam Wester and John Dines and Matthew Gibson and
                   Hui Liang and Yi-Jian Wu and Lakshmi Saheer and Simon
                   King and Keiichiro Oura and Philip N. Garner and
                   William Byrne and Yong Guan and Teemu Hirsim\"{a}ki and
                   Reima Karhila and Mikko Kurimo and Matt Shannon and
                   Sayaka Shiota and Jilei Tian and Keiichi Tokuda and
                   Junichi Yamagishi},
  title = {Speaker adaptation and the evaluation of speaker
                   similarity in the {EMIME} speech-to-speech translation
                   project},
  booktitle = {Proc. of 7th ISCA Speech Synthesis Workshop},
  address = {Kyoto, Japan},
  abstract = {This paper provides an overview of speaker adaptation
                   research carried out in the EMIME speech-to-speech
                   translation (S2ST) project. We focus on how speaker
                   adaptation transforms can be learned from speech in one
                   language and applied to the acoustic models of another
                   language. The adaptation is transferred across
                   languages and/or from recognition models to synthesis
                   models. The various approaches investigated can all be
                   viewed as a process in which a mapping is defined in
                   terms of either acoustic model states or linguistic
                   units. The mapping is used to transfer either speech
                   data or adaptation transforms between the two models.
                   Because the success of speaker adaptation in
                   text-to-speech synthesis is measured by judging speaker
                   similarity, we also discuss issues concerning
                   evaluation of speaker similarity in an S2ST scenario.},
  categories = {speaker adaptation, evaluation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/wester_ssw7_2010.pdf},
  year = 2010
}
@inproceedings{hts2008,
  author = {Junichi Yamagishi and Heiga Zen and Yi-Jian Wu and
                   Tomoki Toda and Keiichi Tokuda},
  title = {The {HTS}-2008 System: Yet Another Evaluation of the
                   Speaker-Adaptive {HMM}-based Speech Synthesis System in
                   The {2008 Blizzard Challenge}},
  booktitle = {Proc. Blizzard Challenge 2008},
  address = {Brisbane, Australia},
  abstract = {For the 2008 Blizzard Challenge, we used the same
                   speaker-adaptive approach to HMM-based speech synthesis
                   that was used in the HTS entry to the 2007 challenge,
                   but an improved system was built in which the
                   multi-accented English average voice model was trained
                   on 41 hours of speech data with high-order mel-cepstral
                   analysis using an efficient forward-backward algorithm
                   for the HSMM. The listener evaluation scores for the
                   synthetic speech generated from this system were much
                   better than in 2007: the system had the equal best
                   naturalness on the small English data set and the equal
                   best intelligibility on both small and large data sets
                   for English, and had the equal best naturalness on the
                   Mandarin data. In fact, the English system was found to
                   be as intelligible as human speech.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice,
                   Blizzard Challenge},
  key = {hts2008},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/HTS2008.pdf},
  year = 2008
}
@inproceedings{6287948,
  author = {Saheer, L. and Yamagishi, J. and Garner, P.N. and
                   Dines, J.},
  title = {Combining vocal tract length normalization with
                   hierarchial linear transformations},
  booktitle = {Proc. ICASSP 2012},
  pages = {4493--4496},
  abstract = {Recent research has demonstrated the effectiveness of
                   vocal tract length normalization (VTLN) as a rapid
                   adaptation technique for statistical parametric speech
                   synthesis. VTLN produces speech with naturalness
                   preferable to that of MLLR-based adaptation techniques,
                   being much closer in quality to that generated by the
                   original average voice model. However with only a
                   single parameter, VTLN captures very few speaker
                   specific characteristics when compared to linear
                   transform based adaptation techniques. This paper
                   proposes that the merits of VTLN can be combined with
                   those of linear transform based adaptation in a
                   hierarchical Bayesian framework, where VTLN is used as
                   the prior information. A novel technique for
                   propagating the gender information from the VTLN prior
                   through constrained structural maximum a posteriori
                   linear regression (CSMAPLR) adaptation is presented.
                   Experiments show that the resulting transformation has
                   improved speech quality with better naturalness,
                   intelligibility and improved speaker similarity.},
  doi = {10.1109/ICASSP.2012.6287948},
  issn = {1520-6149},
  keywords = {CSMAPLR adaptation;MLLR based adaptation
                   technique;constrained structural maximum a posteriori
                   linear regression;hierarchical Bayesian
                   framework;hierarchical linear
                   transformation;intelligibility;rapid adaptation
                   technique;speaker similarity;statistical parametric
                   speech synthesis;vocal tract length normalization;Bayes
                   methods;speech intelligibility;},
  month = mar,
  year = 2012
}
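
The idea of using VTLN as prior information for a linear-transform estimate can be conveyed with a simple MAP-style combination, sketched below with an illustrative weight tau; the actual CSMAPLR estimation formulae are not reproduced.

import numpy as np

def map_combine_transform(W_vtln, W_ml, n_frames, tau=1000.0):
    """MAP-style combination of a VTLN-derived transform (used as the prior)
    with the ML linear-regression transform estimated from adaptation data.

    With little data (small n_frames) the result stays close to the VTLN
    prior; with more data it moves toward the ML transform.  This conveys
    only the flavour of VTLN-as-prior, not the paper's estimation."""
    w = n_frames / (n_frames + tau)
    return (1 - w) * np.asarray(W_vtln) + w * np.asarray(W_ml)
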
@inproceedings{jyamagis07:hts2007,
  author = {Junichi Yamagishi and Heiga Zen and Tomoki Toda and
                   Keiichi Tokuda},
  title = {Speaker-Independent {HMM}-based Speech Synthesis
                   System -- {HTS-2007} System for the {Blizzard Challenge
                   2007}},
  booktitle = {Proc. Blizzard Challenge 2007},
  abstract = {This paper describes an HMM-based speech synthesis
                   system developed by the HTS working group for the
                   Blizzard Challenge 2007. To further explore the
                   potential of HMM-based speech synthesis, we incorporate
                   new features in our conventional system which underpin
                   a speaker-independent approach: speaker adaptation
                   techniques; adaptive training for HSMMs; and full
                   covariance modeling using the CSMAPLR transforms.},
  categories = {HMM, speech synthesis, speaker adaptation, HTS,
                   Blizzard Challenge},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007-HTS.pdf},
  year = 2007
}
@inproceedings{michael:interspeech2010,
  author = {Michael Pucher and Dietmar Schabus and Junichi
                   Yamagishi},
  title = {Synthesis of fast speech with interpolation of adapted
                   {HSMMs} and its evaluation by blind and sighted
                   listeners},
  booktitle = {Proc. Interspeech},
  pages = {2186--2189},
  address = {Makuhari, Japan},
  abstract = {In this paper we evaluate a method for generating
                   synthetic speech at high speaking rates based on the
                   interpolation of hidden semi-Markov models (HSMMs)
                   trained on speech data recorded at normal and fast
                   speaking rates. The subjective evaluation was carried
                   out with both blind listeners, who are used to very
                   fast speaking rates, and sighted listeners. We show
                   that we can achieve a better intelligibility rate and
                   higher voice quality with this method compared to
                   standard HSMM-based duration modeling. We also evaluate
                   duration modeling with the interpolation of all the
                   acoustic features including not only duration but also
                   spectral and F0 models. An analysis of the mean squared
                   error (MSE) of standard HSMM-based duration modeling
                   for fast speech identifies problematic linguistic
                   contexts for duration modeling.},
  keywords = {speech synthesis, fast speech, hidden semi-Markov
                   model},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100294.pdf},
  year = 2010
}
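
A minimal sketch of interpolating explicit HSMM state-duration Gaussians between a normal-rate and a fast-rate model, as the abstract describes; the variance flooring and the extrapolation remark are illustrative details of this sketch rather than the paper's procedure.

import numpy as np

def interpolate_duration(mean_normal, var_normal, mean_fast, var_fast, alpha):
    """Interpolate explicit HSMM state-duration Gaussians; alpha=0 is the
    normal-rate model, alpha=1 the fast-rate model, and values outside [0,1]
    would extrapolate the speaking rate."""
    mean = (1 - alpha) * mean_normal + alpha * mean_fast
    var = (1 - alpha) * var_normal + alpha * var_fast
    return mean, np.maximum(var, 1e-6)   # keep variances positive

# e.g. durations (in frames) for a 30% interpolation toward the fast voice
m, v = interpolate_duration(np.array([8., 6.]), np.array([2., 2.]),
                            np.array([5., 4.]), np.array([1., 1.]), alpha=0.3)
durations = np.maximum(1, np.round(m)).astype(int)
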
@inproceedings{richmond2007b,
  author = {Richmond, K. and Strom, V. and Clark, R. and
                   Yamagishi, J. and Fitt, S.},
  title = {Festival Multisyn Voices for the 2007 Blizzard
                   Challenge},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc. SSW6)},
  address = {Bonn, Germany},
  abstract = {This paper describes selected aspects of the Festival
                   Multisyn entry to the Blizzard Challenge 2007. We
                   provide an overview of the process of building the
                   three required voices from the speech data provided.
                   This paper focuses on new features of Multisyn which
                   are currently under development and which have been
                   employed in the system used for this Blizzard
                   Challenge. These differences are the application of a
                   more flexible phonetic lattice representation during
                   forced alignment labelling and the use of a pitch
                   accent target cost component. Finally, we also examine
                   aspects of the speech data provided for this year's
                   Blizzard Challenge and raise certain issues for
                   discussion concerning the aim of comparing voices made
                   with differing subsets of the data provided.},
  categories = {tts, blizzard, multisyn, unit selection},
  key = {richmond2007b},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007paper.pdf},
  year = 2007
}
@article{2012E121001,
  author = {Junichi Yamagishi and Christophe Veaux and Simon King
                   and Steve Renals},
  title = {Speech synthesis technologies for individuals with
                   vocal disabilities: Voice banking and reconstruction},
  journal = {Acoustical Science and Technology},
  volume = {33},
  number = {1},
  pages = {1--5},
  url = {http://www.jstage.jst.go.jp/browse/ast/33/1/_contents},
  year = 2012
}
@article{Creer2012,
  author = {Sarah Creer and Stuart Cunningham and Phil Green and
                   Junichi Yamagishi},
  title = {Building personalised synthetic voices for individuals
                   with severe speech impairment},
  journal = {Computer Speech and Language},
  abstract = {For individuals with severe speech impairment accurate
                   spoken communication can be difficult and require
                   considerable effort. Some may choose to use a voice
                   output communication aid (or VOCA) to support their
                   spoken communication needs. A VOCA typically takes
                   input from the user through a keyboard or switch-based
                   interface and produces spoken output using either
                   synthesised or recorded speech. The type and number of
                   synthetic voices that can be accessed with a VOCA is
                   often limited and this has been implicated as a factor
                   for rejection of the devices. Therefore, there is a
                   need to be able to provide voices that are more
                   appropriate and acceptable for users. This paper
                   reports on a study that utilises recent advances in
                   speech synthesis to produce personalised synthetic
                   voices for 3 speakers with mild to severe dysarthria,
                   one of the most common speech disorders. Using a
                   statistical parametric approach to synthesis, an
                   average voice trained on data from several unimpaired
                   speakers was adapted using recordings of the impaired
                   speech of 3 dysarthric speakers. By careful selection
                   of the speech data and the model parameters, several
                   exemplar voices were produced for each speaker. A
                   qualitative evaluation was conducted with the speakers
                   and listeners who were familiar with the speaker. The
                   evaluation showed that for one of the 3 speakers a
                   voice could be created which conveyed many of his
                   personal characteristics, such as regional identity,
                   sex and age.},
  doi = {10.1016/j.csl.2012.10.001},
  issn = {0885-2308},
  keywords = {Speech synthesis, Augmentative and alternative
                   communication, Disordered speech, Voice output
                   communication aid},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230812000836?v=s5},
  year = 2012
}
@inproceedings{jyamagis:1000sHTS,
  author = {J. Yamagishi and Bela Usabaev and Simon King and
                   Oliver Watts and John Dines and Jilei Tian and Rile Hu
                   and Yong Guan and Keiichiro Oura and Keiichi Tokuda and
                   Reima Karhila and Mikko Kurimo},
  title = {Thousands of voices for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {420--423},
  address = {Brighton, U.K.},
  abstract = {Our recent experiments with HMM-based speech synthesis
                   systems have demonstrated that speaker-adaptive
                   HMM-based speech synthesis (which uses an ``average
                   voice model'' plus model adaptation) is robust to
                   non-ideal speech data that are recorded under various
                   conditions and with varying microphones, that are not
                   perfectly clean, and/or that lack phonetic balance.
                   This enables us to consider building high-quality voices
                   on ``non-TTS'' corpora such as ASR corpora. Since ASR
                   corpora generally include a large number of speakers,
                   this leads to the possibility of producing an enormous
                   number of voices automatically. In this paper we show
                   thousands of voices for HMM-based speech synthesis that
                   we have made from several popular ASR corpora such as
                   the Wall Street Journal databases (WSJ0/WSJ1/WSJCAM0),
                   Resource Management, Globalphone and Speecon. We report
                   some perceptual evaluation results and outline the
                   outstanding issues.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/index.php},
  year = 2009
}
@inproceedings{anderssonetal2010_ssw7,
  author = {Sebastian Andersson and Junichi Yamagishi and Robert
                   Clark},
  title = {Utilising Spontaneous Conversational Speech in
                   {HMM}-Based Speech Synthesis},
  booktitle = {The 7th ISCA Tutorial and Research Workshop on Speech
                   Synthesis},
  abstract = {Spontaneous conversational speech has many
                   characteristics that are currently not well modelled in
                   unit selection and HMM-based speech synthesis. But in
                   order to build synthetic voices more suitable for
                   interaction we need data that exhibits more
                   conversational characteristics than the generally used
                   read aloud sentences. In this paper we will show how
                   carefully selected utterances from a spontaneous
                   conversation were instrumental in building an HMM-based
                   synthetic voice with more natural-sounding
                   conversational characteristics than a voice based on
                   carefully read aloud sentences. We also investigated a
                   style blending technique as a solution to the inherent
                   problem of phonetic coverage in spontaneous speech
                   data. But the lack of an appropriate representation of
                   spontaneous speech phenomena probably contributed to
                   results showing that we could not yet compete with the
                   speech quality achieved for grammatical sentences.},
  categories = {HMM, speech synthesis, spontaneous speech,
                   conversation, lexical fillers, filled pauses},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7_paper.pdf},
  year = 2010
}
@inproceedings{letter_based_TTS,
  author = {Oliver Watts and Junichi Yamagishi and Simon King},
  title = {Letter-based speech synthesis},
  booktitle = {Proc. Speech Synthesis Workshop 2010},
  pages = {317--322},
  address = {Nara, Japan},
  abstract = {Initial attempts at performing text-to-speech
                   conversion based on standard orthographic units are
                   presented, forming part of a larger scheme of training
                   TTS systems on features that can be trivially extracted
                   from text. We evaluate the possibility of using the
                   technique of decision-tree-based context clustering
                   conventionally used in HMM-based systems for
                   parameter-tying to handle letter-to-sound conversion. We
                   present the application of a method of compound-feature
                   discovery to corpus-based speech synthesis. Finally, an
                   evaluation of intelligibility of letter-based systems
                   and more conventional phoneme-based systems is
                   presented.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7.pdf},
  year = 2010
}
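
A toy sketch of decision-tree context clustering driven by letter contexts rather than phone contexts, as in the abstract above; a summed-variance criterion replaces the likelihood-gain criterion of HMM systems to keep the example short, and the question set is purely illustrative.

import numpy as np

VOWEL_LETTERS = set("aeiou")

def question_prev_is_vowel(context):
    # context = (ll, l, c, r, rr): the centre letter with two letters of context
    return context[1] in VOWEL_LETTERS

def split_gain(values, contexts, question):
    """Decrease in summed variance when a pool of observations is split by a
    yes/no question on the letter context; the question with the largest
    gain would be chosen at each node of the clustering tree."""
    values = np.asarray(values, dtype=float)
    yes = np.array([question(c) for c in contexts])
    if yes.all() or (~yes).all():
        return 0.0
    def ssq(v):
        return ((v - v.mean()) ** 2).sum() if len(v) else 0.0
    return ssq(values) - ssq(values[yes]) - ssq(values[~yes])
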