The Centre for Speech Technology Research, The University of Edinburgh

Publications by Yoshinori Shiga

s0197813.bib

@inproceedings{shig042,
  author     = {Shiga, Yoshinori and King, Simon},
  title      = {Source-Filter Separation for Articulation-to-Speech Synthesis},
  booktitle  = {Proc. ICSLP},
  address    = {Jeju, Korea},
  month      = oct,
  year       = 2004,
  abstract   = {In this paper we examine a method for separating out the
    vocal-tract filter response from the voice source characteristic using a
    large articulatory database. The method realises such separation for
    voiced speech using an iterative approximation procedure under the
    assumption that the speech production process is a linear system composed
    of a voice source and a vocal-tract filter, and that each of the
    components is controlled independently by different sets of factors.
    Experimental results show that the spectral variation is evidently
    influenced by the fundamental frequency or the power of speech, and that
    the tendency of the variation may be related closely to speaker identity.
    The method enables independent control over the voice source
    characteristic in our articulation-to-speech synthesis.},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.ps}
}
@inproceedings{shig043,
  author     = {Shiga, Yoshinori and King, Simon},
  title      = {Estimating detailed spectral envelopes using articulatory
    clustering},
  booktitle  = {Proc. ICSLP},
  address    = {Jeju, Korea},
  month      = oct,
  year       = 2004,
  abstract   = {This paper presents an articulatory-acoustic mapping where
    detailed spectral envelopes are estimated. During the estimation, the
    harmonics of a range of F0 values are derived from the spectra of
    multiple voiced speech signals vocalized with similar articulator
    settings. The envelope formed by these harmonics is represented by a
    cepstrum, which is computed by fitting the peaks of all the harmonics
    based on the weighted least square method in the frequency domain. The
    experimental result shows that the spectral envelopes are estimated with
    the highest accuracy when the cepstral order is 48--64 for a female
    speaker, which suggests that representing the real response of the vocal
    tract requires high-quefrency elements that conventional speech synthesis
    methods are forced to discard in order to eliminate the pitch component
    of speech.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.ps}
}
@inproceedings{shig031,
  author     = {Shiga, Yoshinori and King, Simon},
  title      = {Estimating the Spectral Envelope of Voiced Speech Using
    {Multi-frame Analysis}},
  booktitle  = {Proc. Eurospeech},
  volume     = 3,
  pages      = {1737--1740},
  address    = {Geneva, Switzerland},
  month      = sep,
  year       = 2003,
  abstract   = {This paper proposes a novel approach for estimating the
    spectral envelope of voiced speech independently of its harmonic
    structure. Because of the quasi-periodicity of voiced speech, its
    spectrum indicates harmonic structure and only has energy at frequencies
    corresponding to integral multiples of F0. It is hence impossible to
    identify transfer characteristics between the adjacent harmonics. In
    order to resolve this problem, Multi-frame Analysis (MFA) is introduced.
    The MFA estimates a spectral envelope using many portions of speech which
    are vocalised using the same vocal-tract shape. Since each of the
    portions usually has a different F0 and ensuing different harmonic
    structure, a number of harmonics can be obtained at various frequencies
    to form a spectral envelope. The method thereby gives a closer
    approximation to the vocal-tract transfer function.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03a.ps}
}
@inproceedings{shig94,
  author     = {Shiga, Yoshinori and Hara, Yoshiyuki and Nitta, Tsuneo},
  title      = {A novel segment-concatenation algorithm for a cepstrum-based
    synthesizer},
  booktitle  = {Proc. ICSLP},
  volume     = 4,
  pages      = {1783--1786},
  year       = 1994,
  categories = {speech, synthesis, unit, concatenation, cepstrum, toshiba}
}
@inproceedings{shig040,
  author     = {Shiga, Yoshinori},
  title      = {Source-filter separation based on an articulatory corpus},
  booktitle  = {One day meeting for young speech researchers ({UK} meeting)},
  address    = {University College London, London, United Kingdom},
  month      = apr,
  year       = 2004,
  abstract   = {A new approach is presented for estimating voice source and
    vocal-tract filter characteristics based on an articulatory database.
    From the viewpoint of acoustics, in order to estimate the transfer
    function of a system, both the input and output of the system need to be
    observed. In the case of the source-filter separation problem, however,
    only the output (i.e. speech) is observable, and the response of the
    system (vocal tract) and the input (voice source) must be estimated
    simultaneously. The estimation is hence theoretically impossible, and
    consequently the estimation problem is generally solved approximately by
    applying rather oversimplified models. The proposed approach separates
    these two characteristics under the assumption that each of the
    characteristics is controlled independently by a different set of
    factors. The separation is achieved by iterative approximation based on
    the above assumption using a large speech corpus including
    electro-magnetic articulograph data. The proposed approach enables the
    independent control of the source and filter characteristics, and thus
    contributes toward improving speech quality in speech synthesis.},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh}
}
@inproceedings{shig98,
  author     = {Shiga, Yoshinori and Matsuura, Hiroshi and Nitta, Tsuneo},
  title      = {Segmental duration control based on an articulatory model},
  booktitle  = {Proc. ICSLP},
  volume     = 5,
  pages      = {2035--2038},
  year       = 1998,
  abstract   = {This paper proposes a new method that determines segmental
    duration for text-to-speech conversion based on the movement of
    articulatory organs which compose an articulatory model. The articulatory
    model comprises four time-variable articulatory parameters representing
    the conditions of articulatory organs whose physical restriction seems to
    significantly influence the segmental duration. The parameters are
    controlled according to an input sequence of phonetic symbols, following
    which segmental duration is determined based on the variation of the
    articulatory parameters. The proposed method is evaluated through an
    experiment using a Japanese speech database that consists of 150
    phonetically balanced sentences. The results indicate that the mean
    square error of predicted segmental duration is approximately 15[ms] for
    the closed set and 15--17[ms] for the open set. The error is within
    20[ms], the level of acceptability for distortion of segmental duration
    without loss of naturalness, and hence the method is proved to
    effectively predict segmental duration.},
  categories = {speech, synthesis, duration, articulatory model, toshiba},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/shiga_icslp98.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/shiga_icslp98.ps}
}
@inproceedings{shig041,
  author     = {Shiga, Yoshinori and King, Simon},
  title      = {Accurate spectral envelope estimation for
    articulation-to-speech synthesis},
  booktitle  = {Proc. 5th {ISCA} Speech Synthesis Workshop},
  pages      = {19--24},
  address    = {CMU, Pittsburgh, USA},
  month      = jun,
  year       = 2004,
  abstract   = {This paper introduces a novel articulatory-acoustic mapping
    in which detailed spectral envelopes are estimated based on the cepstrum,
    inclusive of the high-quefrency elements which are discarded in
    conventional speech synthesis to eliminate the pitch component of speech.
    For this estimation, the method deals with the harmonics of multiple
    voiced-speech spectra so that several sets of harmonics can be obtained
    at various pitch frequencies to form a spectral envelope. The
    experimental result shows that the method estimates spectral envelopes
    with the highest accuracy when the cepstral order is 48--64, which
    suggests that the higher order coefficients are required to represent
    detailed envelopes reflecting the real vocal-tract responses.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.ps}
}
@phdthesis{shiga05,
  author     = {Shiga, Yoshinori},
  title      = {Precise Estimation of Vocal Tract and Voice Source
    Characteristics},
  school     = {The Centre for Speech Technology Research, University of
    Edinburgh},
  year       = 2005,
  abstract   = {This thesis addresses the problem of quality degradation in
    speech produced by parameter-based speech synthesis, within the framework
    of an articulatory-acoustic forward mapping. I first investigate current
    problems in speech parameterisation, and point out the fact that
    conventional parameterisation inaccurately extracts the vocal tract
    response due to interference from the harmonic structure of voiced
    speech. To overcome this problem, I introduce a method for estimating
    filter responses more precisely from periodic signals. The method
    achieves such estimation in the frequency domain by approximating all the
    harmonics observed in several frames based on a least squares criterion.
    It is shown that the proposed method is capable of estimating the
    response more accurately than widely-used frame-by-frame
    parameterisation, for simulations using synthetic speech and for an
    articulatory-acoustic mapping using actual speech. I also deal with the
    source-filter separation problem and independent control of the voice
    source characteristic during speech synthesis. I propose a statistical
    approach to separating out the vocal-tract filter response from the voice
    source characteristic using a large articulatory database. The approach
    realises such separation for voiced speech using an iterative
    approximation procedure under the assumption that the speech production
    process is a linear system composed of a voice source and a vocal-tract
    filter, and that each of the components is controlled independently by
    different sets of factors. Experimental results show that controlling the
    source characteristic greatly improves the accuracy of the
    articulatory-acoustic mapping, and that the spectral variation of the
    source characteristic is evidently influenced by the fundamental
    frequency or the power of speech. The thesis provides more accurate
    acoustical approximation of the vocal tract response, which will be
    beneficial in a wide range of speech technologies, and lays the
    groundwork in speech science for a new type of corpus-based statistical
    solution to the source-filter separation problem.},
  categories = {mfa, multiframe, forward, mapping, source-filter, artic,
    mocha, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/phd_thesis_shiga.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/phd_thesis_shiga.ps.gz}
}
@inproceedings{shig032,
  author     = {Shiga, Yoshinori and King, Simon},
  title      = {Estimation of voice source and vocal tract characteristics
    based on multi-frame analysis},
  booktitle  = {Proc. Eurospeech},
  volume     = 3,
  pages      = {1749--1752},
  address    = {Geneva, Switzerland},
  month      = sep,
  year       = 2003,
  abstract   = {This paper presents a new approach for estimating voice
    source and vocal tract filter characteristics of voiced speech. When it
    is required to know the transfer function of a system in signal
    processing, the input and output of the system are experimentally
    observed and used to calculate the function. However, in the case of
    source-filter separation we deal with in this paper, only the output
    (speech) is observed and the characteristics of the system (vocal tract)
    and the input (voice source) must simultaneously be estimated. Hence the
    estimate becomes extremely difficult, and it is usually solved
    approximately using oversimplified models. We demonstrate that these
    characteristics are separable under the assumption that they are
    independently controlled by different factors. The separation is realised
    using an iterative approximation along with the Multi-frame Analysis
    method, which we have proposed to find spectral envelopes of voiced
    speech with minimum interference of the harmonic structure.},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/shiga_eurospeech03b.ps}
}