@comment{Publications by Yoshinori Shiga (s0197813.bib)}
@inproceedings{shig042,
  author     = {Yoshinori Shiga and Simon King},
  title      = {Source-Filter Separation for Articulation-to-Speech
                Synthesis},
  booktitle  = {Proc. ICSLP},
  address    = {Jeju, Korea},
  month      = oct,
  year       = 2004,
  abstract   = {In this paper we examine a method for separating out the
                vocal-tract filter response from the voice source
                characteristic using a large articulatory database. The
                method realises such separation for voiced speech using an
                iterative approximation procedure under the assumption that
                the speech production process is a linear system composed of
                a voice source and a vocal-tract filter, and that each of
                the components is controlled independently by different sets
                of factors. Experimental results show that the spectral
                variation is evidently influenced by the fundamental
                frequency or the power of speech, and that the tendency of
                the variation may be related closely to speaker identity.
                The method enables independent control over the voice source
                characteristic in our articulation-to-speech synthesis.},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.ps},
}
@inproceedings{shig043,
  author     = {Yoshinori Shiga and Simon King},
  title      = {Estimating detailed spectral envelopes using articulatory
                clustering},
  booktitle  = {Proc. ICSLP},
  address    = {Jeju, Korea},
  month      = oct,
  year       = 2004,
  abstract   = {This paper presents an articulatory-acoustic mapping where
                detailed spectral envelopes are estimated. During the
                estimation, the harmonics of a range of F0 values are
                derived from the spectra of multiple voiced speech signals
                vocalized with similar articulator settings. The envelope
                formed by these harmonics is represented by a cepstrum,
                which is computed by fitting the peaks of all the harmonics
                based on the weighted least square method in the frequency
                domain. The experimental result shows that the spectral
                envelopes are estimated with the highest accuracy when the
                cepstral order is 48--64 for a female speaker, which
                suggests that representing the real response of the vocal
                tract requires high-quefrency elements that conventional
                speech synthesis methods are forced to discard in order to
                eliminate the pitch component of speech.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope,
                edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.ps},
}
@inproceedings{shig031,
  author     = {Yoshinori Shiga and Simon King},
  title      = {Estimating the Spectral Envelope of Voiced Speech Using
                Multi-frame Analysis},
  booktitle  = {Proc. Eurospeech},
  volume     = 3,
  pages      = {1737--1740},
  address    = {Geneva, Switzerland},
  month      = sep,
  year       = 2003,
  abstract   = {This paper proposes a novel approach for estimating the
                spectral envelope of voiced speech independently of its
                harmonic structure. Because of the quasi-periodicity of
                voiced speech, its spectrum indicates harmonic structure and
                only has energy at frequencies corresponding to integral
                multiples of F0. It is hence impossible to identify transfer
                characteristics between the adjacent harmonics. In order to
                resolve this problem, Multi-frame Analysis (MFA) is
                introduced. The MFA estimates a spectral envelope using many
                portions of speech which are vocalised using the same
                vocal-tract shape. Since each of the portions usually has a
                different F0 and ensuing different harmonic structure, a
                number of harmonics can be obtained at various frequencies
                to form a spectral envelope. The method thereby gives a
                closer approximation to the vocal-tract transfer function.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope,
                edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.ps},
}
@inproceedings{shig94,
  author     = {Yoshinori Shiga and Yoshiyuki Hara and Tsuneo Nitta},
  title      = {A novel segment-concatenation algorithm for a cepstrum-based
                synthesizer},
  booktitle  = {Proc. ICSLP},
  volume     = 4,
  pages      = {1783--1786},
  year       = 1994,
  categories = {speech, synthesis, unit, concatenation, cepstrum, toshiba},
}
@inproceedings{shig040,
  author     = {Yoshinori Shiga},
  title      = {Source-filter separation based on an articulatory corpus},
  booktitle  = {One day meeting for young speech researchers ({UK} meeting)},
  address    = {University College London, London, United Kingdom},
  month      = apr,
  year       = 2004,
  abstract   = {A new approach is presented for estimating voice source and
                vocal-tract filter characteristics based on an articulatory
                database. From the viewpoint of acoustics, in order to
                estimate the transfer function of a system, both the input
                and output of the system need to be observed. In the case of
                the source-filter separation problem, however, only the
                output (i.e. speech) is observable, and the response of the
                system (vocal tract) and the input (voice source) must be
                estimated simultaneously. The estimation is hence
                theoretically impossible, and consequently the estimation
                problem is generally solved approximately by applying rather
                oversimplified models. The proposed approach separates these
                two characteristics under the assumption that each of the
                characteristics is controlled independently by a different
                set of factors. The separation is achieved by iterative
                approximation based on the above assumption using a large
                speech corpus including electro-magnetic articulograph data.
                The proposed approach enables the independent control of the
                source and filter characteristics, and thus contributes
                toward improving speech quality in speech synthesis.},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh},
}
@inproceedings{shig98,
  author     = {Yoshinori Shiga and Hiroshi Matsuura and Tsuneo Nitta},
  title      = {Segmental duration control based on an articulatory model},
  booktitle  = {Proc. ICSLP},
  volume     = 5,
  pages      = {2035--2038},
  year       = 1998,
  abstract   = {This paper proposes a new method that determines segmental
                duration for text-to-speech conversion based on the movement
                of articulatory organs which compose an articulatory model.
                The articulatory model comprises four time-variable
                articulatory parameters representing the conditions of
                articulatory organs whose physical restriction seems to
                significantly influence the segmental duration. The
                parameters are controlled according to an input sequence of
                phonetic symbols, following which segmental duration is
                determined based on the variation of the articulatory
                parameters. The proposed method is evaluated through an
                experiment using a Japanese speech database that consists of
                150 phonetically balanced sentences. The results indicate
                that the mean square error of predicted segmental duration
                is approximately 15[ms] for the closed set and 15--17[ms]
                for the open set. The error is within 20[ms], the level of
                acceptability for distortion of segmental duration without
                loss of naturalness, and hence the method is proved to
                effectively predict segmental duration.},
  categories = {speech, synthesis, duration, articulatory model, toshiba},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/shiga_icslp98.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/shiga_icslp98.ps},
}
@inproceedings{shig041,
  author     = {Yoshinori Shiga and Simon King},
  title      = {Accurate spectral envelope estimation for
                articulation-to-speech synthesis},
  booktitle  = {Proc. 5th {ISCA} Speech Synthesis Workshop},
  pages      = {19--24},
  address    = {CMU, Pittsburgh, USA},
  month      = jun,
  year       = 2004,
  abstract   = {This paper introduces a novel articulatory-acoustic mapping
                in which detailed spectral envelopes are estimated based on
                the cepstrum, inclusive of the high-quefrency elements which
                are discarded in conventional speech synthesis to eliminate
                the pitch component of speech. For this estimation, the
                method deals with the harmonics of multiple voiced-speech
                spectra so that several sets of harmonics can be obtained at
                various pitch frequencies to form a spectral envelope. The
                experimental result shows that the method estimates spectral
                envelopes with the highest accuracy when the cepstral order
                is 48--64, which suggests that the higher order coefficients
                are required to represent detailed envelopes reflecting the
                real vocal-tract responses.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope,
                edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.ps},
}
@phdthesis{shiga05,
  author     = {Shiga, Yoshinori},
  title      = {Precise Estimation of Vocal Tract and Voice Source
                Characteristics},
  school     = {The Centre for Speech Technology Research, Edinburgh
                University},
  year       = 2005,
  abstract   = {This thesis addresses the problem of quality degradation in
                speech produced by parameter-based speech synthesis, within
                the framework of an articulatory-acoustic forward mapping. I
                first investigate current problems in speech
                parameterisation, and point out the fact that conventional
                parameterisation inaccurately extracts the vocal tract
                response due to interference from the harmonic structure of
                voiced speech. To overcome this problem, I introduce a
                method for estimating filter responses more precisely from
                periodic signals. The method achieves such estimation in the
                frequency domain by approximating all the harmonics observed
                in several frames based on a least squares criterion. It is
                shown that the proposed method is capable of estimating the
                response more accurately than widely-used frame-by-frame
                parameterisation, for simulations using synthetic speech and
                for an articulatory-acoustic mapping using actual speech. I
                also deal with the source-filter separation problem and
                independent control of the voice source characteristic
                during speech synthesis. I propose a statistical approach to
                separating out the vocal-tract filter response from the
                voice source characteristic using a large articulatory
                database. The approach realises such separation for voiced
                speech using an iterative approximation procedure under the
                assumption that the speech production process is a linear
                system composed of a voice source and a vocal-tract filter,
                and that each of the components is controlled independently
                by different sets of factors. Experimental results show that
                controlling the source characteristic greatly improves the
                accuracy of the articulatory-acoustic mapping, and that the
                spectral variation of the source characteristic is evidently
                influenced by the fundamental frequency or the power of
                speech. The thesis provides more accurate acoustical
                approximation of the vocal tract response, which will be
                beneficial in a wide range of speech technologies, and lays
                the groundwork in speech science for a new type of
                corpus-based statistical solution to the source-filter
                separation problem.},
  categories = {mfa, multiframe, forward, mapping, source-filter, artic,
                mocha, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/phd_thesis_shiga.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/phd_thesis_shiga.ps.gz},
}
@inproceedings{shig032,
  author     = {Yoshinori Shiga and Simon King},
  title      = {Estimation of voice source and vocal tract characteristics
                based on multi-frame analysis},
  booktitle  = {Proc. Eurospeech},
  volume     = 3,
  pages      = {1749--1752},
  address    = {Geneva, Switzerland},
  month      = sep,
  year       = 2003,
  abstract   = {This paper presents a new approach for estimating voice
                source and vocal tract filter characteristics of voiced
                speech. When it is required to know the transfer function of
                a system in signal processing, the input and output of the
                system are experimentally observed and used to calculate the
                function. However, in the case of source-filter separation
                we deal with in this paper, only the output (speech) is
                observed and the characteristics of the system (vocal tract)
                and the input (voice source) must simultaneously be
                estimated. Hence the estimate becomes extremely difficult,
                and it is usually solved approximately using oversimplified
                models. We demonstrate that these characteristics are
                separable under the assumption that they are independently
                controlled by different factors. The separation is realised
                using an iterative approximation along with the Multi-frame
                Analysis method, which we have proposed to find spectral
                envelopes of voiced speech with minimum interference of the
                harmonic structure.},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.ps},
}