Publications by Leonardo Badino
s0679204.bib
% CSTR system description paper for the Blizzard Challenge 2009 workshop.
% `month` uses the predefined BibTeX macro so each style can format/localise it;
% initials are space-separated so styles that abbreviate names keep both.
@inproceedings{anderssoncabral09,
  author     = {Andersson, J. Sebastian and Cabral, Joao P. and Badino, Leonardo and Yamagishi, Junichi and Clark, Robert A. J.},
  title      = {Glottal Source and Prosodic Prominence Modelling in {HMM}-based Speech Synthesis for the {Blizzard} {Challenge} 2009},
  booktitle  = {The Blizzard Challenge 2009},
  year       = {2009},
  month      = sep,
  address    = {Edinburgh, U.K.},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cstr_Blizzard2009.pdf},
  abstract   = {This paper describes the CSTR entry for the Blizzard Challenge 2009. The work focused on modifying two parts of the Nitech 2005 HTS speech synthesis system to improve naturalness and contextual appropriateness. The first part incorporated an implementation of the Liljencrants-Fant (LF) glottal source model. The second part focused on improving synthesis of prosodic prominence including emphasis through context dependent phonemes. Emphasis was assigned to the synthesised test sentences based on a handful of theory based rules. The two parts (LF-model and prosodic prominence) were not combined and hence evaluated separately. The results on naturalness for the LF-model showed that it is not yet perceived as natural as the Benchmark HTS system for neutral speech. The results for the prosodic prominence modelling showed that it was perceived as contextually appropriate as the Benchmark HTS system, despite a low naturalness score. The Blizzard challenge evaluation has provided valuable information on the status of our work and continued work will begin with analysing why our modifications resulted in reduced naturalness compared to the Benchmark HTS system.},
  categories = {HMM, HTS, speech synthesis, LF-model, glottal source, prosodic prominence, emphasis}
}
% ICSLP 2004 paper: prosodic analysis of a multi-style corpus.
@inproceedings{leo_04-2,
  author    = {Zovato, Enrico and Sandri, Stefano and Quazza, Silvia and Badino, Leonardo},
  title     = {Prosodic analysis of a multi-style corpus in the perspective of emotional speech synthesis},
  booktitle = {Proc. ICSLP 2004},
  year      = {2004},
  address   = {Jeju, Korea},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ThA1404p.8_p890.pdf}
}
% 6th ISCA Speech Synthesis Workshop paper on pitch accent optionality.
% Initials are space-separated so styles that abbreviate names keep both.
@inproceedings{leo_07-1,
  author    = {Badino, Leonardo and Clark, Robert A. J.},
  title     = {Issues of Optionality in Pitch Accent Placement},
  booktitle = {Proc. 6th ISCA Speech Synthesis Workshop},
  year      = {2007},
  address   = {Bonn, Germany},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6_252.pdf},
  abstract  = {When comparing the prosodic realization of different English speakers reading the same text, a significant disagreement is usually found amongst the pitch accent patterns of the speakers. Assuming that such disagreement is due to a partial optionality of pitch accent placement, it has been recently proposed to evaluate pitch accent predictors by comparing them with multi-speaker reference data. In this paper we face the issue of pitch accent optionality at different levels. At first we propose a simple mathematical definition of intra-speaker optionality which allows us to introduce a function for evaluating pitch accent predictors which we show being more accurate and robust than those used in previous works. Subsequently we compare a pitch accent predictor trained on single speaker data with a predictor trained on multi-speaker data in order to point out the large overlapping between intra-speaker and inter-speaker optionality. Finally, we show our successful results in predicting intra-speaker optionality and we suggest how this achievement could be exploited to improve the performances of a unit selection text-to-speech synthesis (TTS) system.}
}
% Joint CSTR/CereProc system description for the Blizzard Challenge 2008.
% Names use the unambiguous "Last, First Middle" form; title braces wrap
% whole words so sentence-casing styles keep the proper nouns intact.
@inproceedings{leo_08-3,
  author    = {Andersson, J. Sebastian and Badino, Leonardo and Watts, Oliver S. and Aylett, Matthew P.},
  title     = {The {CSTR/Cereproc} {Blizzard} Entry 2008: The Inconvenient Data},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc. Interspeech 2008)},
  year      = {2008},
  address   = {Brisbane, Australia},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/cstr-cereproc_Blizzard2008.pdf},
  abstract  = {In a commercial system data used for unit selection systems is collected with a heavy emphasis on homogeneous neutral data that has sufficient coverage for the units that will be used in the system. In this year's Blizzard entry CSTR and CereProc present a joint entry where the emphasis has been to explore techniques to deal with data which is not homogeneous (the English entry) and did not have appropriate coverage for a diphone based system (the Mandarin entry where tone/phone combinations were treated as distinct phone categories). In addition, two further problems were addressed, 1) Making use of non-homogeneous data for creating a voice that can realise both expressive and neutral speaking styles (the English entry) 2) Building a unit selection system with no native understanding of the language but depending instead on external native evaluation (the Mandarin Entry).}
}
% Interspeech 2012 paper on hierarchical prosodic prominence generation.
% Initials are space-separated so styles that abbreviate names keep both.
@inproceedings{badinoclark_interspeech12,
  author     = {Badino, Leonardo and Clark, Robert A. J. and Wester, Mirjam},
  title      = {Towards Hierarchical Prosodic Prominence Generation in {TTS} Synthesis},
  booktitle  = {Proc. Interspeech},
  year       = {2012},
  address    = {Portland, USA},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/badinoclark_IS_2012.pdf},
  categories = {speech synthesis, prosody}
}
% Interspeech 2009 paper on detecting contrast and realising it with emphasis.
% `month` uses the predefined BibTeX macro so each style can format/localise it;
% initials are space-separated so styles that abbreviate names keep both.
@inproceedings{leo_09-1,
  author    = {Badino, Leonardo and Andersson, J. Sebastian and Yamagishi, Junichi and Clark, Robert A. J.},
  title     = {Identification of Contrast and Its Emphatic Realization in {HMM}-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2009},
  year      = {2009},
  month     = sep,
  address   = {Brighton, U.K.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/IS090749.PDF},
  abstract  = {The work presented in this paper proposes to identify contrast in the form of contrastive word pairs and prosodically signal it with emphatic accents in a Text-to-Speech (TTS) application using a Hidden-Markov-Model (HMM) based speech synthesis system. We first describe a novel method to automatically detect contrastive word pairs using textual features only and report its performance on a corpus of spontaneous conversations in English. Subsequently we describe the set of features selected to train a HMM-based speech synthesis system and attempting to properly control prosodic prominence (including emphasis). Results from a large scale perceptual test show that in the majority of cases listeners judge emphatic contrastive word pairs as acceptable as their non-emphatic counterpart, while emphasis on non-contrastive pairs is almost never acceptable.}
}
% ICSLP 2004 paper on Chinese word segmentation.
% "Chinese" is braced so sentence-casing styles do not lowercase the proper noun.
@inproceedings{leo_04-3,
  author    = {Badino, Leonardo},
  title     = {{Chinese} Text Word Segmentation Considering Semantic Links among Sentences},
  booktitle = {Proc. ICSLP 2004},
  year      = {2004},
  address   = {Jeju, Korea},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ThB2202p.22_p965.pdf}
}
% CereProc system description for the Blizzard Challenge Workshop 2007.
% Title braces wrap whole words so sentence-casing styles keep the proper nouns.
% NOTE(review): the registered-trademark sign after "CereProc" in the abstract
% is reconstructed from a stray "R" -- confirm against the paper.
@inproceedings{leo_07-2,
  author    = {Aylett, Matthew P. and Andersson, J. Sebastian and Badino, Leonardo and Pidcock, Christopher J.},
  title     = {The {Cerevoice} {Blizzard} Entry 2007: Are Small Database Errors Worse than Compression Artifacts?},
  booktitle = {Proc. Blizzard Challenge Workshop 2007},
  year      = {2007},
  address   = {Bonn, Germany},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_004.pdf},
  abstract  = {In commercial systems the memory footprint of unit selection systems is often a key issue. This is especially true for PDAs and other embedded devices. In this year's Blizzard entry CereProc{\textregistered} gave itself the criteria that the full database system entered would have a smaller memory footprint than either of the two smaller database entries. This was accomplished by applying Speex speech compression to the full database entry. In turn a set of small database techniques used to improve the quality of small database systems in last year's entry were extended. Finally, for all systems, two quality control methods were applied to the underlying database to improve the lexicon and transcription match to the underlying data. Results suggest that mild audio quality artifacts introduced by lossy compression have almost as much impact on MOS perceived quality as concatenation errors introduced by sparse data in the smaller systems with bulked diphones.}
}
% 5th ISCA Speech Synthesis Workshop paper: phoneme mapping for foreign-language TTS.
@inproceedings{leo_04-4,
  author    = {Badino, Leonardo and Barolo, Claudia and Quazza, Silvia},
  title     = {Language independent phoneme mapping for foreign {TTS}},
  booktitle = {Proc. 5th ISCA Speech Synthesis Workshop},
  year      = {2004},
  address   = {Pittsburgh, USA},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/2026.pdf}
}
% Interspeech 2008 paper on pitch accent optionality in unit selection TTS.
% Venue spelling and location follow the form used by the other Interspeech
% and Brisbane entries in this file; initials are space-separated so styles
% that abbreviate names keep both.
@inproceedings{leo_08-2,
  author     = {Badino, Leonardo and Clark, Robert A. J. and Strom, Volker},
  title      = {Including Pitch Accent Optionality in Unit Selection Text-to-Speech Synthesis},
  booktitle  = {Proc. Interspeech},
  year       = {2008},
  address    = {Brisbane, Australia},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.ps},
  abstract   = {A significant variability in pitch accent placement is found when comparing the patterns of prosodic prominence realized by different English speakers reading the same sentences. In this paper we describe a simple approach to incorporate this variability to synthesize prosodic prominence in unit selection text-to-speech synthesis. The main motivation of our approach is that by taking into account the variability of accent placements we enlarge the set of prosodically acceptable speech units, thus increasing the chances of selecting a good quality sequence of units, both in prosodic and segmental terms. Results on a large scale perceptual test show the benefits of our approach and indicate directions for further improvements.},
  categories = {speech synthesis, unit selection, prosodic prominence, pitch accents}
}
% 2008 IEEE/ACL Spoken Language Technology workshop paper on contrast labeling.
% `booktitle` carries no leading "in": bibliography styles add that word
% themselves. "English" is braced so sentence-casing styles keep its capital.
@inproceedings{leo_08-1,
  author    = {Badino, Leonardo and Clark, Robert A. J.},
  title     = {Automatic labeling of contrastive word pairs from spontaneous spoken {English}},
  booktitle = {2008 IEEE/ACL Workshop on Spoken Language Technology},
  year      = {2008},
  address   = {Goa, India},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/0000101.pdf},
  abstract  = {This paper addresses the problem of automatically labeling contrast in spontaneous spoken speech, where contrast here is meant as a relation that ties two words that explicitly contrast with each other. Detection of contrast is certainly relevant in the analysis of discourse and information structure and also, because of the prosodic correlates of contrast, could play an important role in speech applications, such as text-to-speech synthesis, that need an accurate and discourse context related modeling of prosody. With this prospect we investigate the feasibility of automatic contrast labeling by training and evaluating on the Switchboard corpus a novel contrast tagger, based on Support Vector Machines (SVM), that combines lexical features, syntactic dependencies and WordNet semantic relations.}
}
% ICSLP 2004 paper: TTS reading of mixed-language texts.
@inproceedings{leo_04-1,
  author    = {Badino, Leonardo and Barolo, Claudia and Quazza, Silvia},
  title     = {A General Approach to {TTS} Reading of Mixed-Language Texts},
  booktitle = {Proc. ICSLP 2004},
  year      = {2004},
  address   = {Jeju, Korea},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/WeA2401o.5_p1083.pdf}
}