The Centre for Speech Technology Research, The University of Edinburgh

Publications by Gregor Hofer

ghofer.bib

@misc{Carnival_SIGGRAPH_2010,
  author = {Michael Berger and Gregor Hofer and Hiroshi Shimodaira},
  title = {Carnival: a modular framework for automated facial
                   animation},
  howpublished = {Poster at SIGGRAPH 2010},
  note = {Bronze award winner, ACM Student Research Competition},
  abstract = {We present a software framework for speech- or
                   text-driven animation--including a platform-independent
                   API and an application implementing it--which unifies
                   state-of-the-art speech technology and graphics
                   technology within a single system.},
  address = {Los Angeles, Calif., USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/carnival.pdf},
  year = 2010
}
@inproceedings{Shimodaira:mlmi05,
  author = {Hiroshi Shimodaira and Keisuke Uematsu and Shin'ichi
                   Kawamoto and Gregor Hofer and Mitsuru Nakai},
  title = {{Analysis and Synthesis of Head Motion for Lifelike
                   Conversational Agents}},
  booktitle = {Proc. MLMI2005},
  categories = {lifelike agents},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mlmi2005.pdf},
  year = 2005
}
@inproceedings{wilson_hofer:iui2011,
  author = {Theresa Wilson and Gregor Hofer},
  title = {Using Linguistic and Vocal Expressiveness in Social
                   Role Recognition},
  booktitle = {Proc.~Int.~Conf.~on Intelligent User Interfaces,
                   IUI2011},
  address = {Palo Alto, USA},
  publisher = {ACM},
  abstract = {In this paper, we investigate two types of
                   expressiveness, linguistic and vocal, and whether they
                   are useful for recognising the social roles of
                   participants in meetings. Our experiments show that
                   combining expressiveness features with speech activity
                   does improve social role recognition over speech
                   activity features alone.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/WilsonHoferIUI2010sub.pdf},
  year = 2011
}
@inproceedings{friedrich:lrec2010,
  author = {Michael Pucher and Friedrich Neubarth and Volker Strom
                   and Sylvia Moosmüller and Gregor Hofer and Christian
                   Kranzler and Gudrun Schuchmann and Dietmar Schabus},
  title = {Resources for speech synthesis of Viennese varieties},
  booktitle = {Proc.~Int.~Conf.~on Language Resources and Evaluation,
                   LREC'10},
  address = {Malta},
  publisher = {European Language Resources Association (ELRA)},
  abstract = {This paper describes our work on developing corpora of
                   three varieties of Viennese for unit selection speech
                   synthesis. The synthetic voices for Viennese varieties,
                   implemented with the open domain unit selection speech
                   synthesis engine Multisyn of Festival, will also be
                   released within Festival. The paper especially focuses
                   on two questions: how we selected the appropriate
                   speakers and how we obtained the text sources needed
                   for the recording of these non-standard varieties.
                   Regarding the first question, it turned out that working
                   with a ‘prototypical’ professional speaker was far
                   preferable to striving for authenticity. In
                   addition, we give a brief outline about the differences
                   between the Austrian standard and its dialectal
                   varieties and how we solved certain technical problems
                   that are related to these differences. In particular,
                   the specific set of phones applicable to each variety
                   had to be determined by applying various constraints.
                   Since such a set does not serve any descriptive
                   purposes but rather influences the quality of
                   speech synthesis, a careful design of such a (in most
                   cases reduced) set was an important task.},
  categories = {speech synthesis, language varieties, phonetic
                   encoding, grapheme-to-phone, pronunciation lexicon},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/lrec2010_viennese.pdf},
  ps = {http://www.cstr.ed.ac.uk/downloads/publications/2010/lrec2010_viennese.ps},
  year = 2010
}
@inproceedings{dziemianko_interspeech2009,
  author = {Michal Dziemianko and Gregor Hofer and Hiroshi
                   Shimodaira},
  title = {{HMM}-Based Automatic Eye-Blink Synthesis from Speech},
  booktitle = {Proc. Interspeech},
  pages = {1799--1802},
  address = {Brighton, UK},
  abstract = {In this paper we present a novel technique to
                   automatically synthesise eye blinking from a speech
                   signal. Animating the eyes of a talking head is
                   important as they are a major focus of attention during
                   interaction. The developed system predicts eye blinks
                   from the speech signal and generates animation
                   trajectories automatically employing a ``Trajectory
                   Hidden Markov Model''. The evaluation of the
                   recognition performance showed that the timing of
                   blinking can be predicted from speech with an F-score
                   value upwards of 52\%, which is well above chance.
                   Additionally, a preliminary perceptual evaluation was
                   conducted, which confirmed that adding eye blinking
                   significantly improves the perception of the character.
                   Finally, it showed that the speech-synchronised
                   synthesised blinks outperform random blinking in
                   naturalness ratings.},
  categories = {animation, motion synthesis, time series analysis,
                   trajectory model},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/dziemianko_interspeech2009.pdf},
  year = 2009
}
@misc{Hofer_Shimodaira:sigg:2007,
  author = {Gregor Hofer and Hiroshi Shimodaira and Junichi
                   Yamagishi},
  title = {Speech-driven Head Motion Synthesis based on a
                   Trajectory Model},
  howpublished = {Poster at SIGGRAPH 2007},
  address = {San Diego, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/siggraph07.pdf},
  year = 2007
}
@misc{Hofer_Berger:sigg2010,
  author = {Gregor Hofer and Korin Richmond and Michael Berger},
  title = {Lip Synchronization by Acoustic Inversion},
  howpublished = {Poster at SIGGRAPH 2010},
  address = {Los Angeles, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/lipsync-sig10.pdf},
  year = 2010
}
@inproceedings{Hofer_Shimodaira:proc:2007,
  author = {Gregor Hofer and Hiroshi Shimodaira},
  title = {Automatic Head Motion Prediction from Speech Data},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  abstract = {In this paper we present a novel approach to generate
                   a sequence of head motion units given some speech. The
                   modelling approach is based on the notion that head
                   motion can be divided into a number of short
                   homogeneous units that can each be modelled
                   individually. The system is based on Hidden Markov
                   Models (HMM), which are trained on motion units and act
                   as a sequence generator. They can be evaluated by an
                   accuracy measure. A database of motion capture data was
                   collected and manually annotated for head motion and is
                   used to train the models. It was found that the model
                   is good at distinguishing high-activity regions from
                   regions with less activity, with accuracies around 75
                   percent. Furthermore, the model is able to distinguish
                   different head motion patterns based on speech features
                   somewhat reliably, with accuracies reaching almost 70
                   percent.},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/interspeech07.pdf},
  year = 2007
}
@misc{Hofer_Shimodaira:sca:2007,
  author = {Gregor Hofer and Hiroshi Shimodaira and Junichi
                   Yamagishi},
  title = {Lip motion synthesis using a context dependent
                   trajectory hidden {M}arkov model},
  howpublished = {Poster at SCA 2007},
  address = {San Diego, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sca07.pdf},
  year = 2007
}
@inproceedings{lips08-gregpr,
  author = {Gregor Hofer and Junichi Yamagishi and Hiroshi
                   Shimodaira},
  title = {Speech-driven Lip Motion Generation with a Trajectory
                   {HMM}},
  booktitle = {Proc. Interspeech 2008},
  pages = {2314--2317},
  address = {Brisbane, Australia},
  abstract = {Automatic speech animation remains a challenging
                   problem that can be described as finding the optimal
                   sequence of animation parameter configurations given
                   some speech. In this paper we present a novel technique
                   to automatically synthesise lip motion trajectories
                   from a speech signal. The developed system predicts lip
                   motion units from the speech signal and generates
                   animation trajectories automatically employing a
                   ``Trajectory Hidden Markov Model''. Using the MLE
                   criterion, its parameter generation algorithm produces
                   the optimal smooth motion trajectories that are used to
                   drive control points on the lips directly.
                   Additionally, experiments were carried out to find a
                   suitable model unit that produces the most accurate
                   results. Finally, a perceptual evaluation was conducted,
                   which showed that the developed motion units perform
                   better than phonemes.},
  categories = {visual speech synthesis, trajectory HMM, HTS},
  key = {lips08-gregpr},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/GregorLipsChallenge08.pdf},
  year = 2008
}
@inproceedings{hofer_interspeech2010,
  author = {Hofer, Gregor and Richmond, Korin},
  title = {Comparison of {HMM} and {TMDN} Methods for Lip
                   Synchronisation},
  booktitle = {Proc. Interspeech},
  pages = {454--457},
  address = {Makuhari, Japan},
  abstract = {This paper presents a comparison between a hidden
                   Markov model (HMM) based method and a novel artificial
                   neural network (ANN) based method for lip
                   synchronisation. Both model types were trained on
                   motion tracking data, and a perceptual evaluation was
                   carried out comparing the output of the models, both to
                   each other and to the original tracked data. It was
                   found that the ANN-based method was judged
                   significantly better than the HMM-based method.
                   Furthermore, the original data was not judged
                   significantly better than the output of the ANN method.},
  keywords = {hidden Markov model (HMM), mixture density network,
                   lip synchronisation, inversion mapping},
  month = {September},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100668.pdf},
  year = 2010
}
@article{10.1109/MCG.2011.71,
  author = {Michael A. Berger and Gregor Hofer and Hiroshi
                   Shimodaira},
  title = {Carnival -- Combining Speech Technology and Computer
                   Animation},
  journal = {IEEE Computer Graphics and Applications},
  volume = {31},
  pages = {80--89},
  address = {Los Alamitos, CA, USA},
  doi = {10.1109/MCG.2011.71},
  issn = {0272-1716},
  publisher = {IEEE Computer Society},
  year = 2011
}
@inproceedings{hofer-eurosp05,
  author = {G. Hofer and K. Richmond and R. Clark},
  title = {Informed Blending of Databases for Emotional Speech
                   Synthesis},
  booktitle = {Proc. Interspeech},
  abstract = {The goal of this project was to build a unit selection
                   voice that could portray emotions with varying
                   intensities. A suitable definition of an emotion was
                   developed along with a descriptive framework that
                   supported the work carried out. A single speaker was
                   recorded portraying happy and angry speaking styles.
                   Additionally a neutral database was also recorded. A
                   target cost function was implemented that chose units
                   according to emotion mark-up in the database. The
                   Dictionary of Affect supported the emotional target
                   cost function by providing an emotion rating for words
                   in the target utterance. If a word was particularly
                   'emotional', units from that emotion were favoured. In
                   addition, intensity could be varied, which resulted in a
                   bias to select a greater number of emotional units. A
                   perceptual evaluation was carried out, and subjects were
                   able to reliably recognise emotions with varying
                   numbers of emotional units present in the target
                   utterance.},
  categories = {speech synthesis,emotion,edinburgh},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.ps},
  year = 2005
}