The Centre for Speech Technology Research, The University of Edinburgh

Publications by Olga Goubanova

s9808417.bib

% ICPhS 2003 paper on a Bayesian belief network model of vowel duration.
% abstract, categories and ps are non-standard fields; standard styles ignore them.
@inproceedings{Goubanova:2003,
  author     = {Goubanova, Olga},
  title      = {{Bayesian} Modelling Of Vowel Segment Duration For
                Text-to-Speech Synthesis Using Distinctive Features},
  booktitle  = {Proc. ICPhS 2003},
  volume     = {3},
  pages      = {2349},
  address    = {Barcelona, Spain},
  abstract   = {We report the results of applying the Bayesian Belief
                Network (BN) approach to predicting vowel duration. A
                Bayesian inference of the vowel duration is performed
                on a hybrid Bayesian network consisting of discrete and
                continuous nodes, with the nodes in the network
                representing the linguistic factors that affect segment
                duration. New to the present research, we model segment
                identity factor as a set of distinctive features. The
                features chosen were height, frontness, length, and
                roundness. We also experimented with a word class
                feature that implicitly represents word frequency
                information. We contrasted the results of the belief
                network model with those of the sums of products (SoP)
                model and classification and regression tree (CART)
                model. We trained and tested all three models on the
                same data. In terms of the RMS error and correlation
                coefficient, our BN model performs no worse than SoP
                model, and it significantly outperforms CART model.},
  categories = {Bayesian, text-to-speech synthesis, duration modelling},
  ps         = {http://www.cstr.ed.ac.uk/downloads/publications/2003/OGoubanova_icphs2k3.ps},
  year       = {2003}
}
% 2001 ISCA speech synthesis workshop paper on segmental duration prediction.
@inproceedings{Goubanova:2001,
  author    = {Goubanova, Olga},
  title     = {Predicting segmental durations using {Bayesian}
               {Belief} Networks},
  booktitle = {CD-ROM Proc. 4th ISCA Tutorial and Research Workshop
               on Speech Synthesis},
  address   = {Scotland, UK},
  year      = {2001}
}
% Speech Communication 50(4) journal article on Bayesian networks for phone duration.
% abstract and categories are non-standard fields; standard styles ignore them.
@article{goubanova:king:specom2008,
  author     = {Goubanova, Olga and King, Simon},
  title      = {{Bayesian} networks for phone duration prediction},
  journal    = {Speech Communication},
  volume     = {50},
  number     = {4},
  pages      = {301--311},
  abstract   = {In a text-to-speech system, the duration of each phone
                may be predicted by a duration model. This model is
                usually trained using a database of phones with known
                durations; each phone (and the context it appears in)
                is characterised by a feature vector that is composed
                of a set of linguistic factor values. We describe the
                use of a graphical model -- a Bayesian network -- for
                predicting the duration of a phone, given the values
                for these factors. The network has one discrete
                variable for each of the linguistic factors and a
                single continuous variable for the phone's duration.
                Dependencies between variables (or the lack of them)
                are represented in the BN structure by arcs (or missing
                arcs) between pairs of nodes. During training, both the
                topology of the network and its parameters are learned
                from labelled data. We compare the results of the BN
                model with results for sums of products and CART models
                on the same data. In terms of the root mean square
                error, the BN model performs much better than both CART
                and SoP models. In terms of correlation coefficient,
                the BN model performs better than the SoP model, and as
                well as the CART model. A BN model has certain
                advantages over CART and SoP models. Training SoP
                models requires a high degree of expertise. CART models
                do not deal with interactions between factors in any
                explicit way. As we demonstrate, a BN model can also
                make accurate predictions of a phone's duration, even
                when the values for some of the linguistic factors are
                unknown.},
  categories = {Text-to-speech; Bayesian networks; Duration modelling;
                Sums of products; Classification and regression trees},
  doi        = {10.1016/j.specom.2007.10.002},
  month      = apr,
  year       = {2008}
}
% ICSLP 2002 paper on forms of introduction in Map Task dialogues.
@inproceedings{Goubanova:2002,
  author    = {Goubanova, Olga},
  title     = {Forms of Introduction in {Map} {Task} Dialogues: Case of
               {L2} {Russian} Speakers},
  booktitle = {Proc. ICSLP 2002},
  address   = {Denver, USA},
  year      = {2002}
}
% ICSLP 2000 paper on Bayesian belief networks for duration in text-to-speech systems.
@inproceedings{Goubanova-Taylor:2000,
  author    = {Goubanova, Olga and Taylor, P.},
  title     = {Using {Bayesian} {Belief} Networks for model duration
               in text-to-speech systems},
  booktitle = {CD-ROM Proc. ICSLP 2000},
  address   = {Beijing, China},
  year      = {2000}
}
% Interspeech 2005 paper on predicting consonant duration with Bayesian belief networks.
% abstract and pdf are non-standard fields; standard styles ignore them.
@inproceedings{goubanova_king_isp05,
  author    = {Goubanova, Olga and King, Simon},
  title     = {Predicting Consonant Duration with {Bayesian} Belief
               Networks},
  booktitle = {Proc. Interspeech 2005},
  address   = {Lisbon, Portugal},
  abstract  = {Consonant duration is influenced by a number of
               linguistic factors such as the consonant's identity,
               within-word position, stress level of the previous and
               following vowels, phrasal position of the word
               containing the target consonant, its syllabic position,
               identity of the previous and following segments. In our
               work, consonant duration is predicted from a Bayesian
               belief network (BN) consisting of discrete nodes for
               the linguistic factors and a single continuous node for
               the consonant's duration. Interactions between factors
               are represented as conditional dependency arcs in this
               graphical model. Given the parameters of the belief
               network, the duration of each consonant in the test set
               is then predicted as the value with the maximum
               probability. We compare the results of the belief
               network model with those of sums-of-products (SoP) and
               classification and regression tree (CART) models using
               the same data. In terms of RMS error, our BN model
               performs better than both CART and SoP models. In terms
               of the correlation coefficient, our BN model performs
               better than SoP model, and no worse than CART model. In
               addition, the Bayesian model reliably predicts
               consonant duration in cases of missing or hidden
               linguistic factors.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/goubanova_king_isp2005.pdf},
  year      = {2005}
}