Publications of EdSST fellows
@inproceedings{cabral_yrwst,
author = {J. Cabral and S. Renals and K. Richmond and J. Yamagishi},
title = {{HMM}-based Speech Synthesis with an Acoustic Glottal
Source Model},
booktitle = {The First Young Researchers Workshop in Speech
Technology},
abstract = {A major cause of degradation of speech quality in
HMM-based speech synthesis is the use of a simple delta
pulse signal to generate the excitation of voiced
speech. This paper describes a new approach to using an
acoustic glottal source model in HMM-based
synthesisers. The goal is to improve speech quality and
parametric flexibility to better model and transform
voice characteristics.},
categories = {HMM-based Speech Synthesis, LF-Model, Glottal Spectral
Separation},
month = apr,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/jscabral-yrwss2009.pdf},
year = 2009
}
@article{liker2008,
author = {Marko Liker and Damir Horga and Ines Šafarić},
title = {Koartikulacijski pritisak i koartikulacijski otpor: ultrazvučno istraživanje (Coarticulatory pressure and coarticulatory resistance: an ultrasound study)},
journal = {Govor/Speech},
year = 2008,
key = {liker2008},
volume = 25,
number = 2,
pages = {171--188}
}
@inproceedings{tietze:09,
author = {Martin I. Tietze and Andi Winterboer and Johanna D.
Moore},
title = {The effect of linguistic devices in information
presentation messages on recall and comprehension},
booktitle = {Proc. ENLG09},
categories = {discourse cues, verbal information presentation,
recall, eye-tracking, Mechanical Turk},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/tietze.ENLG09.pdf},
year = 2009
}
@inproceedings{kocjancic_issp08,
author = {Kocjancic, Tanja},
title = {Ultrasound investigation of tongue movements in
syllables with different onset structure},
booktitle = {Proc. of the Eighth International Seminar on Speech
Production (ISSP)},
abstract = {This study uses ultrasound to describe syllables with
             different onset structure, not only in terms of
             durational changes but also in terms of the distance
             the tongue travels over a syllable, and compares the
             ratio between the two parameters, expressed as speed.
             Results indicate that both measures increase with an
             increasing number of onset segments, but not to the
             same degree for all targets, so speed was not constant
             across them. Additionally, the type of onset
             constituent greatly influenced all three parameters,
             and there were large between-speaker similarities in
             the case of durational changes.},
categories = {tongue movements, ultrasound},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/Kocjancic_ISSP_2008.pdf},
year = 2008
}
@inproceedings{kocjancic_exling08,
author = {Kocjancic, Tanja},
title = {Tongue movement and syllable onset complexity:
ultrasound study},
booktitle = {Proc. of ISCA Experimental Linguistics ExLing 2008},
abstract = {In this study ultrasound was used to investigate
             tongue movements in syllables with a different number
             and type of onset consonants. Ultrasound recordings
             provided information about the distance the tongue
             travels over a target, and audio recordings the time
             needed. The speed of the tongue's travel was
             calculated from these two measurements. Results from
             ten speakers show that both duration and distance
             travelled increase with an increased number of onset
             segments, but that distance travelled is additionally
             influenced by the type of the segment, as is speed.
             Duration also appeared to be the least
             speaker-dependent of the three parameters.},
categories = {tongue movements, ultrasound},
month = aug,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/Kocjancic_ISCA_ExLing_2008.pdf},
year = 2008
}
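
The speed measure used in the two Kocjancic studies above is simply tongue-travel
distance (from ultrasound tracking) divided by syllable duration (from the audio
recording). A minimal sketch in Python; all values and names below are invented
for illustration:

    def path_length(points):
        """Euclidean length of a tracked tongue-point trajectory [(x, y), ...]."""
        return sum(((x2 - x1) ** 2 + (y2 - y1) ** 2) ** 0.5
                   for (x1, y1), (x2, y2) in zip(points, points[1:]))

    trajectory = [(0.0, 0.0), (2.0, 1.0), (5.0, 1.5), (7.0, 0.5)]  # fictitious, in mm
    duration_s = 0.25  # syllable duration measured from the audio

    speed = path_length(trajectory) / duration_s
    print(f"distance = {path_length(trajectory):.1f} mm, speed = {speed:.1f} mm/s")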
@inproceedings{ling:richmond:yamagishi:wang:2008a,
author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
Junichi and Wang, Ren-Hua },
title = {Articulatory Control of {HMM}-based Parametric Speech
Synthesis Driven by Phonetic Knowledge},
booktitle = {Proc. Interspeech},
pages = {573--576},
address = {Brisbane, Australia},
abstract = {This paper presents a method to control the
characteristics of synthetic speech flexibly by
integrating articulatory features into a Hidden Markov
Model (HMM)-based parametric speech synthesis system.
In contrast to model adaptation and interpolation
approaches for speaking style control, this method is
driven by phonetic knowledge, and target speech samples
are not required. The joint distribution of parallel
acoustic and articulatory features considering
cross-stream feature dependency is estimated. At
synthesis time, acoustic and articulatory features are
generated simultaneously based on the
maximum-likelihood criterion. The synthetic speech can
be controlled flexibly by modifying the generated
articulatory features according to arbitrary phonetic
rules in the parameter generation process. Our
experiments show that the proposed method is effective
in both changing the overall character of synthesized
speech and in controlling the quality of a specific
vowel.},
categories = {speech synthesis, HMM, articulatory features, phonetic
knowledge},
key = {ling:richmond:yamagishi:wang:2008a},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080582.PDF},
year = 2008
}
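
The controllability described in the Ling et al. abstract above rests on modelling
the joint distribution of acoustic and articulatory features: with a Gaussian
cross-stream dependency, modifying the articulatory stream shifts the most likely
acoustics via the standard conditional-Gaussian formula. A hedged toy sketch with
invented numbers, not the paper's actual HMM state distributions:

    import numpy as np

    mu_ac = np.array([1.0, -0.5])          # acoustic mean (e.g. two cepstral dims)
    mu_art = np.array([0.2])               # articulatory mean (e.g. tongue height)
    cov_ac_art = np.array([[0.8], [-0.3]]) # cross-stream covariance
    cov_art = np.array([[0.5]])            # articulatory covariance

    def conditional_acoustic_mean(art):
        """E[acoustic | articulatory = art] for a joint Gaussian."""
        return mu_ac + cov_ac_art @ np.linalg.inv(cov_art) @ (art - mu_art)

    print(conditional_acoustic_mean(np.array([0.2])))  # unmodified: the prior mean
    print(conditional_acoustic_mean(np.array([0.8])))  # raised tongue: acoustics follow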
@inproceedings{tietze:08:sci,
author = {Martin Tietze and Vera Demberg and Johanna D. Moore},
title = {Syntactic Complexity induces Explicit Grounding in the
{MapTask} corpus},
booktitle = {Proc. Interspeech},
abstract = {This paper provides evidence for theories of grounding
and dialogue management in human conversation. For each
utterance in a corpus of task-oriented dialogues, we
calculated integration costs, which are based on
syntactic sentence complexity. We compared the
integration costs and grounding behavior under two
conditions, namely face-to-face and a no-eye-contact
condition. The results show that integration costs were
significantly higher for explicitly grounded utterances
in the no-eye-contact condition, but not in the
face-to-face condition.},
categories = {dialogue, syntactic complexity, grounding},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081130.pdf},
year = 2008
}
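
A toy illustration of a distance-based integration cost, in the spirit of the
syntactic-complexity measure used above; the paper's actual cost definition may
differ. Cost here is the summed word distance between each dependent and its head:

    def integration_cost(heads):
        """heads[i] is the (0-based) head index of word i, or None for the root."""
        return sum(abs(i - h) for i, h in enumerate(heads) if h is not None)

    # "the cat the dog chased ran" -- centre embedding gives long dependencies
    print(integration_cost([1, 5, 3, 4, 1, None]))  # higher cost
    # "the dog chased the cat" -- local dependencies only
    print(integration_cost([1, 2, None, 4, 2]))     # lower cost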
@inproceedings{wang:frankel:tejedor:king:icassp2008,
author = {Dong Wang and Joe Frankel and Javier Tejedor and Simon
King},
title = {A comparison of phone and grapheme-based spoken term
detection},
booktitle = {Proc. ICASSP},
pages = {4969--4972},
abstract = {We propose grapheme-based sub-word units for spoken
term detection (STD). Compared to phones, graphemes
have a number of potential advantages. For
out-of-vocabulary search terms, phone-based approaches
must generate a pronunciation using letter-to-sound
rules. Using graphemes obviates this potentially
error-prone hard decision, shifting pronunciation
modelling into the statistical models describing the
observation space. In addition, long-span grapheme
language models can be trained directly from large text
corpora. We present experiments on Spanish and English
data, comparing phone and grapheme-based STD. For
Spanish, where phone and grapheme-based systems give
similar transcription word error rates (WERs),
grapheme-based STD significantly outperforms a
phone-based approach. The converse is found for English,
where the phone-based system outperforms a grapheme
approach. However, we present additional analysis which
suggests that phone-based STD performance levels may be
achieved by a grapheme-based approach despite lower
transcription accuracy, and that the two approaches may
usefully be combined. We propose a number of directions
for future development of these ideas, and suggest that
if grapheme-based STD can match phone-based
performance, the inherent flexibility in dealing with
out-of-vocabulary terms makes this a desirable
approach.},
doi = {10.1109/ICASSP.2008.4518773},
month = {March-April},
year = 2008
}
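
The indexing-unit difference discussed in this abstract can be made concrete: for
an out-of-vocabulary term, a phone-based system must commit to a single
letter-to-sound guess, while a grapheme-based system simply uses the letters. A
sketch with a stand-in lookup table in place of a real letter-to-sound module:

    def graphemes(term):
        return list(term.lower())

    def phones_via_lts(term, lts={"speex": ["s", "p", "iy", "k", "s"]}):
        # one hard, possibly wrong pronunciation decision per OOV term
        return lts.get(term.lower(), None)

    print(graphemes("Speex"))       # ['s', 'p', 'e', 'e', 'x'] -- no hard decision
    print(phones_via_lts("Speex"))  # a single guessed pronunciation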
@inproceedings{robust-hts,
author = {Junichi Yamagishi and Zhen-Hua Ling and Simon King},
title = {Robustness of HMM-based Speech Synthesis},
booktitle = {Proc. Interspeech},
pages = {581--584},
address = {Brisbane, Australia},
abstract = {As speech synthesis techniques become more advanced,
we are able to consider building high-quality voices
from data collected outside the usual highly-controlled
recording studio environment. This presents new
challenges that are not present in conventional
text-to-speech synthesis: the available speech data are
not perfectly clean, the recording conditions are not
consistent, and/or the phonetic balance of the material
is not ideal. Although a clear picture of the
performance of various speech synthesis techniques
(e.g., concatenative, HMM-based or hybrid) under good
conditions is provided by the Blizzard Challenge, it is
not well understood how robust these algorithms are to
less favourable conditions. In this paper, we analyse
the performance of several speech synthesis methods
under such conditions. This is, as far as we know, a
new research topic: ``Robust speech synthesis.'' As a
consequence of our investigations, we propose a new
robust training method for HMM-based speech
synthesis, for use with speech data collected in
unfavourable conditions.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
speaker adaptation, voice conversion, average voice,
unit selection},
key = {robust-hts},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/RobustnessHTS.pdf},
year = 2008
}
@article{ling2008,
author = {Ling, Z. and Richmond, K. and Yamagishi, J. and Wang,
R.},
title = {Integrating Articulatory Features into {HMM}-based
Parametric Speech Synthesis},
journal = {IEEE Transactions on Audio, Speech and Language
Processing },
note = {Accepted for publication},
abstract = {This paper presents an investigation of ways to
integrate articulatory features into Hidden Markov
Model (HMM)-based parametric speech synthesis,
primarily with the aim of improving the performance of
acoustic parameter generation. The joint distribution
of acoustic and articulatory features is estimated
during training and is then used for parameter
generation at synthesis time in conjunction with a
maximum-likelihood criterion. Different model
structures are explored to allow the articulatory
features to influence acoustic modeling: model
clustering, state synchrony and cross-stream feature
dependency. The results of objective evaluation show
that the accuracy of acoustic parameter prediction can
be improved when shared clustering and
asynchronous-state model structures are adopted for
combined acoustic and articulatory features. More
significantly, our experiments demonstrate that
modeling the dependency between these two feature
streams can make speech synthesis more flexible. The
characteristics of synthetic speech can be easily
controlled by modifying generated articulatory features
as part of the process of acoustic parameter
generation.},
categories = {Speech synthesis, articulation, HMM-based synthesis},
key = {ling2008},
year = 2009
}
@inproceedings{qin:perpinan:richmond:wrench:renals:2008a,
author = {Qin, C. and Carreira-Perpiñán, M. and Richmond, K.
and Wrench, A. and Renals, S.},
title = {Predicting Tongue Shapes from a Few Landmark Locations},
booktitle = {Proc. Interspeech},
pages = {2306--2309},
address = {Brisbane, Australia},
abstract = {We present a method for predicting the midsagittal
tongue contour from the locations of a few landmarks
(metal pellets) on the tongue surface, as used in
articulatory databases such as MOCHA and the Wisconsin
XRDB. Our method learns a mapping using ground-truth
tongue contours derived from ultrasound data and
drastically improves over spline interpolation. We also
determine the optimal locations of the landmarks, and
the number of landmarks required to achieve a desired
prediction error: 3-4 landmarks are enough to achieve
0.3-0.2 mm error per point on the tongue.},
categories = {ultrasound, tongue contour, articulation},
key = {qin:perpinan:richmond:wrench:renals:2008a},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080929.PDF},
year = 2008
}
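
A hedged sketch of the idea in Qin et al.: learn a mapping from a few landmark
coordinates to the full midsagittal contour instead of spline-interpolating.
Synthetic curves stand in for the ultrasound ground truth, and a plain ridge
regression stands in for the paper's learned mapping; the paper additionally
optimises where the landmarks are placed:

    import numpy as np

    rng = np.random.default_rng(0)
    n_train, n_landmarks, n_contour = 200, 3, 40

    # synthetic "contours": smooth random curves sampled at 40 points
    t = np.linspace(0, np.pi, n_contour)
    contours = np.array([np.sin(t * rng.uniform(0.8, 1.2)) * rng.uniform(5, 15)
                         for _ in range(n_train)])
    landmark_idx = [5, 20, 35]        # fixed landmark (pellet) positions
    X = contours[:, landmark_idx]     # landmark heights
    Y = contours                      # full contour to predict

    # ridge regression: W = (X'X + lam*I)^-1 X'Y
    lam = 1e-3
    W = np.linalg.solve(X.T @ X + lam * np.eye(n_landmarks), X.T @ Y)

    rmse = np.sqrt(np.mean((X @ W - Y) ** 2))
    print(f"per-point RMSE on training data: {rmse:.3f} (arbitrary units)")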
@article{hts2007-junichi,
author = {Junichi Yamagishi and Takashi Nose and Zhen-Hua Ling
and Heiga Zen and Tomoki Toda and Keiichi Tokuda and
Simon King and Steve Renals},
title = {Robust Speaker-Adaptive {HMM}-based Text-to-Speech
Synthesis},
journal = {IEEE Transactions on Audio, Speech and Language
Processing},
note = {In review},
abstract = {This paper describes a speaker-adaptive HMM-based
speech synthesis system. The new system, called
``HTS-2007'', employs speaker adaptation (CSMAPLR+MAP),
feature-space adaptive training, mixed-gender modeling,
and full-covariance modeling using CSMAPLR transforms,
in addition to several other techniques that have
proved effective in our previous systems. Subjective
evaluation results show that the new system generates
significantly better quality synthetic speech than
speaker-dependent approaches with realistic amounts of
speech data, and that it bears comparison with
speaker-dependent approaches even when large amounts of
speech data are available. In addition, a comparison
study with several speech synthesis techniques shows
the new system is very robust: It is able to build
voices from less-than-ideal speech data and synthesize
good-quality speech even for out-of-domain sentences.},
categories = {speech synthesis, HMM-based speech synthesis, HTS,
speaker adaptation, voice conversion, average voice},
key = {hts2007-junichi},
year = 2008
}
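
At the heart of the MLLR-family adaptation used in HTS-2007 (e.g. the CSMAPLR
transforms mentioned above) is an affine transform of the average-voice Gaussian
means, mu' = A mu + b, shared across many states. A minimal sketch with invented
transform values:

    import numpy as np

    A = np.array([[0.9, 0.1], [0.0, 1.1]])  # regression matrix (estimated from adaptation data)
    b = np.array([0.5, -0.2])               # bias term

    average_voice_means = np.array([[1.0, 2.0], [3.0, -1.0]])  # one mean per row
    adapted_means = average_voice_means @ A.T + b               # mu' = A mu + b
    print(adapted_means)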
@inproceedings{leo_08-3,
author = {J. Sebastian Andersson and Leonardo Badino and Oliver
S. Watts and Matthew P. Aylett},
title = {The {CSTR/CereProc B}lizzard Entry 2008: The
Inconvenient Data},
booktitle = {Proc. Blizzard Challenge Workshop (in Proc.
Interspeech 2008)},
address = {Brisbane, Australia},
abstract = {In a commercial system, the data used for unit
             selection is collected with a heavy emphasis on
             homogeneous neutral data with sufficient coverage of
             the units that will be used in the system. In this
             year's Blizzard entry, CSTR and CereProc present a
             joint entry where the emphasis has been on exploring
             techniques to deal with data which is not homogeneous
             (the English entry) and did not have appropriate
             coverage for a diphone-based system (the Mandarin
             entry, where tone/phone combinations were treated as
             distinct phone categories). In addition, two further
             problems were addressed: 1) making use of
             non-homogeneous data to create a voice that can
             realise both expressive and neutral speaking styles
             (the English entry); 2) building a unit selection
             system with no native understanding of the language,
             depending instead on external native evaluation (the
             Mandarin entry).},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/cstr-cereproc_Blizzard2008.pdf},
year = 2008
}
@inproceedings{dong_ivan_joe_simon_interspeech08_marray,
author = {Dong Wang and Ivan Himawan and Joe Frankel and Simon
King},
title = {A Posterior Approach for Microphone Array Based Speech
Recognition},
booktitle = {Proc. Interspeech},
pages = {996--999},
abstract = {Automatic speech recognition (ASR) becomes rather
             difficult in the meetings domain because of the
             adverse acoustic conditions, including background
             noise, echo and reverberation, and frequent
             cross-talk. Microphone arrays, using various
             beamforming algorithms, have been demonstrated to
             boost ASR performance dramatically in such noisy and
             reverberant environments. However, almost all
             existing beamforming measures work in the acoustic
             domain, resorting to signal processing theory and
             geometric explanation. This limits their application,
             and induces significant performance degradation when
             the geometric properties are unavailable or hard to
             estimate, or if heterogeneous channels exist in the
             audio system. In this paper, we present a new
             posterior-based approach for array-based speech
             recognition. The main idea is that, instead of
             enhancing speech signals, we enhance the posterior
             probabilities that frames belong to recognition
             units, e.g., phones. These enhanced posteriors are
             then transformed into posterior-probability-based
             features and modelled by HMMs, leading to a tandem
             ANN-HMM hybrid system as presented by Hermansky et
             al. Experimental results demonstrate the validity of
             this posterior approach. With posterior accumulation
             or enhancement, significant improvement was achieved
             over the single-channel baseline. Moreover, we can
             combine acoustic enhancement and posterior
             enhancement, leading to a hybrid acoustic-posterior
             beamforming approach which works significantly better
             than acoustic beamforming alone, especially in
             scenarios with moving speakers.},
categories = {speech recognition, microphone array, beamforming,
tandem approach},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/marray.a.pdf},
year = 2008
}
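
A sketch of the posterior-enhancement idea above: rather than beamforming
waveforms, combine per-frame phone posteriors estimated independently from each
microphone channel (simple averaging here; the paper explores accumulation and
enhancement schemes with a tandem ANN-HMM back-end). All posterior values are
invented:

    import numpy as np

    # per-channel posteriors over 3 phone classes for one frame
    channel_posteriors = np.array([
        [0.70, 0.20, 0.10],   # close channel: confident
        [0.40, 0.35, 0.25],   # distant channel: smeared by reverberation
        [0.55, 0.30, 0.15],
    ])

    enhanced = channel_posteriors.mean(axis=0)
    enhanced /= enhanced.sum()   # renormalise
    print(enhanced)              # sharper than the worst channel
    # In a tandem setup these would be log-transformed/decorrelated and
    # modelled by HMMs as regular feature vectors.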
@inproceedings{steiner:richmond:2008a,
author = {Steiner, I. and Richmond, K.},
title = {Generating gestural timing from {EMA} data using
articulatory resynthesis},
booktitle = {Proc. 8th International Seminar on Speech Production},
address = {Strasbourg, France},
abstract = {As part of ongoing work to integrate an articulatory
synthesizer into a modular TTS platform, a method is
presented which allows gestural timings to be generated
automatically from EMA data. Further work is outlined
which will adapt the vocal tract model and phoneset to
English using new articulatory data, and use
statistical trajectory models. },
categories = {articulatory synthesis, EMA, VocalTractLab },
key = {steiner:richmond:2008a},
month = dec,
year = 2008
}
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
author = {Cabral, J. and Renals, S. and Richmond, K. and
Yamagishi, J.},
title = {Glottal Spectral Separation for Parametric Speech
Synthesis},
booktitle = {Proc. Interspeech},
pages = {1829--1832},
address = {Brisbane, Australia},
categories = {HMM speech synthesis, Glottal Spectral Separation,
LF-model},
key = {cabral:renals:richmond:yamagishi:2008a},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
year = 2008
}
@inproceedings{leo_07-2,
author = {Matthew P. Aylett and J. Sebastian Andersson and
Leonardo Badino and Christopher J. Pidcock},
title = {The {C}erevoice {B}lizzard Entry 2007: Are Small
Database Errors Worse than Compression Artifacts?},
booktitle = {Proc. Blizzard Challenge Workshop 2007},
address = {Bonn, Germany},
abstract = {In commercial systems, the memory footprint of unit
             selection systems is often a key issue. This is
             especially true for PDAs and other embedded devices.
             In this year's Blizzard entry, CereProc gave itself
             the criterion that the full database system entered
             would have a smaller memory footprint than either of
             the two smaller database entries. This was
             accomplished by applying Speex speech compression to
             the full database entry. In turn, a set of small
             database techniques used to improve the quality of
             small database systems in last year's entry were
             extended. Finally, for all systems, two quality
             control methods were applied to the underlying
             database to improve the match of the lexicon and
             transcription to the underlying data. Results suggest
             that the mild audio quality artifacts introduced by
             lossy compression have almost as much impact on
             MOS-perceived quality as the concatenation errors
             introduced by sparse data in the smaller systems with
             bulked diphones.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_004.pdf},
year = 2007
}
@inproceedings{joe_dong_simon_interspeech08_bottle,
author = {Joe Frankel and Dong Wang and Simon King},
title = {Growing bottleneck features for tandem {ASR}},
booktitle = {Proc. Interspeech},
pages = {1549},
abstract = { We present a method for training bottleneck MLPs for
use in tandem ASR. Experiments on meetings data show
that this approach leads to improved performance
compared with training MLPs from a random
initialization. },
categories = {tandem ASR, bottleneck MLP},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bottlenet.a.pdf},
month = sep,
year = 2008
}
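
A minimal sketch of a bottleneck MLP for tandem ASR as described above: a narrow
hidden layer whose activations become features for a conventional HMM system.
Sizes and weights are illustrative, and the sketch does not reproduce the paper's
contribution, which is how such networks are grown from smaller ones rather than
randomly initialised:

    import numpy as np

    rng = np.random.default_rng(1)
    sizes = [39 * 9, 1024, 26, 1024, 45]  # spliced input, hidden, bottleneck, hidden, phone targets
    weights = [rng.normal(0, 0.1, (m, n)) for m, n in zip(sizes, sizes[1:])]

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def bottleneck_features(x):
        """Forward pass up to (and including) the 26-unit bottleneck layer."""
        h = sigmoid(x @ weights[0])
        return sigmoid(h @ weights[1])

    x = rng.normal(size=(8, 39 * 9))      # a batch of spliced frames
    print(bottleneck_features(x).shape)   # (8, 26)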
@inproceedings{cabral07,
author = {J. Cabral and S. Renals and K. Richmond and J.
Yamagishi},
title = {Towards an Improved Modeling of the Glottal Source in
Statistical Parametric Speech Synthesis},
booktitle = {Proc. of the 6th ISCA Workshop on Speech Synthesis},
address = {Bonn, Germany},
abstract = {This paper proposes the use of the Liljencrants-Fant
model (LF-model) to represent the glottal source signal
in HMM-based speech synthesis systems. These systems
generally use a pulse train to model the periodicity of
the excitation signal of voiced speech. However, this
model produces a strong and uniform harmonic structure
throughout the spectrum of the excitation which makes
the synthetic speech sound buzzy. The use of a mixed
band excitation and phase manipulation reduces this
effect but it can result in degradation of the speech
quality if the noise component is not weighted
carefully. In turn, the LF-waveform has a decaying
spectrum at higher frequencies, which is more similar
to the real glottal source excitation signal. We
conducted a perceptual experiment to test the
hypothesis that the LF-model can perform as well as or
better than the pulse train in an HMM-based speech
synthesizer. In the synthesis, we used the mean values
of the LF-parameters, calculated by measurements of the
recorded speech. The result of this study is important
not only regarding the improvement in speech quality of
this type of system, but also because the LF-model
can be used to model many characteristics of the
glottal source, such as voice quality, which are
important for voice transformation and generation of
expressive speech.},
categories = {LF-model, Statistical parametric speech synthesis,
HMM-based speech synthesis},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
year = 2007
}
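
A toy comparison of the two excitation types discussed in the cabral07 abstract:
a delta pulse train has a flat harmonic spectrum (the source of the buzziness),
while a smoother glottal-like pulse decays at higher frequencies, as the
LF-waveform does. The smooth pulse below is a simple stand-in, not the actual
LF model, and all parameters are invented:

    import numpy as np

    fs, f0, n = 16000, 100, 16000         # sample rate, pitch, one second
    period = fs // f0

    pulse_train = np.zeros(n)
    pulse_train[::period] = 1.0           # delta pulses -> flat, buzzy spectrum

    t = np.arange(period) / period
    glottal_pulse = np.sin(np.pi * t) ** 2        # smooth one-period pulse (stand-in)
    glottal_train = np.tile(glottal_pulse, n // period)

    for name, sig in [("delta train", pulse_train), ("smooth pulse", glottal_train)]:
        spec = np.abs(np.fft.rfft(sig))
        lo = spec[: len(spec) // 8].mean()        # average magnitude at low frequencies
        hi = spec[-(len(spec) // 8):].mean()      # ... and at high frequencies
        print(f"{name}: high/low spectral ratio = {hi / lo:.3f}")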
@article{tejedor:wang:frankel:king:colas:specom2008,
author = {Javier Tejedor and Dong Wang and Joe Frankel and Simon
King and José Colás},
title = {A comparison of grapheme and phoneme-based units for
{S}panish spoken term detection},
journal = {Speech Communication},
volume = {50},
number = {11-12},
pages = {980--991},
abstract = {The ever-increasing volume of audio data available
online through the world wide web means that automatic
methods for indexing and search are becoming essential.
Hidden Markov model (HMM) keyword spotting and lattice
search techniques are the two most common approaches
used by such systems. In keyword spotting, models or
templates are defined for each search term prior to
accessing the speech and used to find matches. Lattice
search (referred to as spoken term detection), uses a
pre-indexing of speech data in terms of word or
sub-word units, which can then quickly be searched for
arbitrary terms without referring to the original
audio. In both cases, the search term can be modelled
in terms of sub-word units, typically phonemes. For
in-vocabulary words (i.e. words that appear in the
pronunciation dictionary), the letter-to-sound
conversion systems are accepted to work well. However,
for out-of-vocabulary (OOV) search terms,
letter-to-sound conversion must be used to generate a
pronunciation for the search term. This is usually a
hard decision (i.e. not probabilistic and with no
possibility of backtracking), and errors introduced at
this step are difficult to recover from. We therefore
propose the direct use of graphemes (i.e., letter-based
sub-word units) for acoustic modelling. This is
expected to work particularly well in languages such as
Spanish, where despite the letter-to-sound mapping
being very regular, the correspondence is not
one-to-one, and there will be benefits from avoiding
hard decisions at early stages of processing. In this
article, we compare three approaches for Spanish
keyword spotting or spoken term detection, and within
each of these we compare acoustic modelling based on
phone and grapheme units. Experiments were performed
using the Spanish geographical-domain Albayzin corpus.
Results achieved in the two approaches proposed for
spoken term detection show us that trigrapheme units
for acoustic modelling match or exceed the performance
of phone-based acoustic models. In the method proposed
for keyword spotting, the results achieved with each
acoustic model are very similar.},
categories = {Spoken term detection; Keyword spotting; Graphemes;
Spanish},
doi = {10.1016/j.specom.2008.03.005},
month = {November-December},
year = 2008
}
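
The trigrapheme units used for acoustic modelling above are context-dependent
letter units, built exactly like triphones but over graphemes, so an OOV Spanish
term needs no pronunciation step. A minimal sketch:

    def trigraphemes(word):
        letters = ["#"] + list(word.lower()) + ["#"]   # word-boundary padding
        return [f"{l}-{c}+{r}"
                for l, c, r in zip(letters, letters[1:], letters[2:])]

    print(trigraphemes("Albayzin"))
    # ['#-a+l', 'a-l+b', 'l-b+a', ..., 'i-n+#']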