The Centre for Speech Technology Research, The University of Edinburgh

Publications by Michael Berger


@misc{Carnival_SIGGRAPH_2010,
  author = {Michael Berger and Gregor Hofer and Hiroshi Shimodaira},
  title = {Carnival: a modular framework for automated facial
                   animation},
  howpublished = {Poster at SIGGRAPH 2010},
  note = {Bronze award winner, ACM Student Research Competition},
  abstract = {We present a software framework for speech- or
                   text-driven animation--including a platform-independent
                   API and an application implementing it--which unifies
                   state-of-the-art speech technology and graphics
                   technology within a single system.},
  address = {Los Angeles, CA, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/carnival.pdf},
  year = 2010
}
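
The abstract above gives only a high-level picture of the framework. As a purely hypothetical sketch of what a "modular, platform-independent" speech-to-animation pipeline could look like (none of these names or interfaces come from Carnival itself), in Python:

from typing import Protocol

class SpeechAnalyser(Protocol):
    """Hypothetical stage: turns text or audio into a time-aligned
    phone sequence, each entry (phone label, start time, end time)."""
    def analyse(self, utterance: str) -> list[tuple[str, float, float]]: ...

class FaceAnimator(Protocol):
    """Hypothetical stage: turns a time-aligned phone sequence into
    animation parameters (e.g. blendshape curves)."""
    def animate(self, phones: list[tuple[str, float, float]]) -> dict: ...

def run_pipeline(utterance: str,
                 analyser: SpeechAnalyser,
                 animator: FaceAnimator) -> dict:
    # Each stage is swappable behind a fixed interface, which is the
    # point of a modular design: a text front end or a speech recogniser
    # can feed the same animation back end.
    return animator.animate(analyser.analyse(utterance))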
@inproceedings{felps_interspeech2010,
  author = {Felps, Daniel and Geng, Christian and Berger, Michael
                   and Richmond, Korin and Gutierrez-Osuna, Ricardo},
  title = {Relying on critical articulators to estimate vocal
                   tract spectra in an articulatory-acoustic database},
  booktitle = {Proc. Interspeech},
  pages = {1990--1993},
  abstract = {We present a new phone-dependent feature weighting
                   scheme that can be used to map articulatory
                   configurations (e.g. EMA) onto vocal tract spectra
                   (e.g. MFCC) through table lookup. The approach consists
                   of assigning feature weights according to a feature's
                   ability to predict the acoustic distance between
                   frames. Since an articulator's predictive accuracy is
                   phone-dependent (e.g., lip location is a better
                   predictor for bilabial sounds than for palatal sounds),
                   a unique weight vector is found for each phone.
                   Inspection of the weights reveals a correspondence with
                   the expected critical articulators for many phones. The
                   proposed method reduces overall cepstral error by 6\%
                   when compared to a uniform weighting scheme. Vowels
                   show the greatest benefit, though improvements occur
                   for 80\% of the tested phones.},
  keywords = {speech production, speech synthesis},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100076.pdf},
  year = 2010
}
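
The weighting scheme this abstract describes, mapping articulatory frames to acoustic frames by table lookup with phone-dependent feature weights, can be sketched as a weighted nearest-neighbour search. The following Python is an illustration of that idea, not the paper's implementation; all names and the choice of a weighted Euclidean distance are assumptions:

import numpy as np

def weighted_lookup(ema_frame, phone, table_ema, table_mfcc, phone_weights):
    """Map an articulatory (EMA) frame to a vocal tract spectrum (MFCC)
    by table lookup, using a phone-dependent feature weight vector.

    ema_frame     : (D,) articulatory features of the query frame
    phone         : phone label of the query frame
    table_ema     : (N, D) articulatory features in the lookup table
    table_mfcc    : (N, M) corresponding acoustic (MFCC) frames
    phone_weights : dict mapping phone -> (D,) weights, each reflecting
                    how well that articulator predicts acoustic distance
                    for this phone (hypothetical data layout)
    """
    w = phone_weights[phone]
    # Weighted Euclidean distance between the query and every table entry.
    d = np.sqrt(((table_ema - ema_frame) ** 2 * w).sum(axis=1))
    # Return the acoustic frame of the closest articulatory configuration.
    return table_mfcc[np.argmin(d)]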
@article{McGowanBerger2009,
  author = {Richard S. McGowan and Michael A. Berger},
  title = {Acoustic-articulatory mapping in vowels by locally
                   weighted regression},
  journal = {Journal of the Acoustical Society of America},
  volume = {126},
  number = {4},
  pages = {2011--2032},
  abstract = {A method for mapping between simultaneously measured
                   articulatory and acoustic data is proposed. The method
                   uses principal components analysis on the articulatory
                   and acoustic variables, and mapping between the domains
                   by locally weighted linear regression, or loess
                   [Cleveland, W. S. (1979) J. Am. Stat. Assoc. 74,
                   829--836]. The latter method permits local variation in
                   the slopes of the linear regression, assuming that the
                   function being approximated is smooth. The methodology
                   is applied to vowels of four speakers in the Wisconsin
                   X-ray Microbeam Speech Production Database, with
                   formant analysis. Results are examined in terms of (1)
                   examples of forward (articulation-to-acoustics)
                   mappings and inverse mappings, (2) distributions of
                   local slopes and constants, (3) examples of
                   correlations among slopes and constants, (4)
                   root-mean-square error, and (5) sensitivity of formant
                   frequencies to articulatory change. It is shown that
                   the results are qualitatively correct and that loess
                   performs better than global regression. The forward
                   mappings show different root-mean-square error
                   properties than the inverse mappings indicating that
                   this method is better suited for the forward mappings
                   than the inverse mappings, at least for the data chosen
                   for the current study. Some preliminary results on
                   sensitivity of the first two formant frequencies to the
                   two most important articulatory principal components
                   are presented.},
  categories = {Articulatory inversion, locally weighted regression,
                   X-ray microbeam, formant analysis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/aam.pdf},
  year = 2009
}
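
The core of the method above is loess, locally weighted linear regression in the sense of Cleveland (1979): a separate weighted linear fit around each query point, so the local slope of the mapping can vary. A minimal generic loess predictor in Python, assuming the articulatory and acoustic variables have already been reduced by PCA as in the paper (function and parameter names are hypothetical):

import numpy as np

def loess_predict(x_query, X, Y, frac=0.3):
    """Predict acoustic variables from articulatory ones by locally
    weighted linear regression (loess).

    x_query : (D,) query point in the articulatory PCA space
    X       : (N, D) articulatory training data (PCA scores)
    Y       : (N, M) corresponding acoustic data (e.g. formant values)
    frac    : fraction of the data used in each local fit
    """
    # Use the nearest frac*N points, but at least enough for the fit.
    n_local = max(int(frac * len(X)), X.shape[1] + 1)
    dist = np.linalg.norm(X - x_query, axis=1)
    idx = np.argsort(dist)[:n_local]
    # Tricube weights, as in Cleveland (1979).
    u = dist[idx] / max(dist[idx][-1], 1e-12)
    w = (1.0 - u ** 3) ** 3
    # Weighted least squares fit of a local linear model with intercept.
    A = np.hstack([np.ones((n_local, 1)), X[idx]])
    W = np.diag(w)
    beta, *_ = np.linalg.lstsq(W @ A, W @ Y[idx], rcond=None)
    return np.concatenate(([1.0], x_query)) @ beta

Evaluating the fitted local slopes (the rows of beta) at many query points is what gives the paper's distributions of slopes and its sensitivity analysis of formant frequencies to articulatory change.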
@misc{Hofer_Berger:sigg2010,
  author = {Gregor Hofer and Korin Richmond and Michael Berger},
  title = {Lip Synchronization by Acoustic Inversion},
  howpublished = {Poster at SIGGRAPH 2010},
  address = {Los Angeles, CA, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/lipsync-sig10.pdf},
  year = 2010
}
@article{10.1109/MCG.2011.71,
  author = {Michael A. Berger and Gregor Hofer and Hiroshi
                   Shimodaira},
  title = {Carnival -- Combining Speech Technology and Computer
                   Animation},
  journal = {IEEE Computer Graphics and Applications},
  volume = {31},
  pages = {80--89},
  address = {Los Alamitos, CA, USA},
  doi = {10.1109/MCG.2011.71},
  issn = {0272-1716},
  publisher = {IEEE Computer Society},
  year = 2011
}