The Centre for Speech Technology Research, The University of Edinburgh

Publications by Gregor Hofer

ghofer.bib

@misc{Carnival_SIGGRAPH_2010,
  author = {Berger, Michael and Hofer, Gregor and Shimodaira, Hiroshi},
  title = {Carnival: a modular framework for automated facial animation},
  year = {2010},
  note = {Bronze award winner, ACM Student Research Competition},
  address = {Los Angeles, USA},
  howpublished = {Poster at SIGGRAPH 2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/carnival.pdf},
  abstract = {We present a software framework for speech- or text-driven animation--including a platform-independent API and an application implementing it--which unifies state-of-the-art speech technology and graphics technology within a single system.}
}
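
The Carnival abstract above describes a modular design in which speech analysis and facial animation are decoupled behind a platform-independent API. As a rough illustration only, and not the actual Carnival API (every name below is hypothetical), such a decoupling might look like this in Python:

    from abc import ABC, abstractmethod
    from typing import List, Tuple

    class SpeechAnalyser(ABC):
        # Turns audio into a timed phone track, e.g. [(0.0, 'sil'), (0.12, 'k')].
        @abstractmethod
        def analyse(self, audio: bytes) -> List[Tuple[float, str]]: ...

    class AnimationBackend(ABC):
        # Drives facial controls from a timed phone track.
        @abstractmethod
        def apply(self, track: List[Tuple[float, str]]) -> None: ...

    def animate(audio: bytes, analyser: SpeechAnalyser, backend: AnimationBackend) -> None:
        # The framework itself only routes analyser output to the backend, so
        # either side can be swapped independently (the 'modular' part).
        backend.apply(analyser.analyse(audio))
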
@inproceedings{Shimodaira:mlmi05,
  author = {Shimodaira, Hiroshi and Uematsu, Keisuke and Kawamoto, Shin'ichi and Hofer, Gregor and Nakai, Mitsuru},
  title = {{Analysis and Synthesis of Head Motion for Lifelike Conversational Agents}},
  booktitle = {Proc. MLMI2005},
  month = {July},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mlmi2005.pdf},
  categories = {lifelike agents}
}
@inproceedings{wilson_hofer:iui2011,
  author = {Wilson, Theresa and Hofer, Gregor},
  publisher = {ACM},
  title = {Using Linguistic and Vocal Expressiveness in Social Role Recognition},
  booktitle = {Proc.~Int.~Conf.~on Intelligent User Interfaces, IUI2011},
  year = {2011},
  address = {Palo Alto, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/WilsonHoferIUI2010sub.pdf},
  abstract = {In this paper, we investigate two types of expressiveness, linguistic and vocal, and whether they are useful for recognising the social roles of participants in meetings. Our experiments show that combining expressiveness features with speech activity does improve social role recognition over speech activity features alone.}
}
@inproceedings{friedrich:lrec2010,
  author = {Pucher, Michael and Neubarth, Friedrich and Strom, Volker and Moosmüller, Sylvia and Hofer, Gregor and Kranzler, Christian and Schuchmann, Gudrun and Schabus, Dietmar},
  publisher = {European Language Resources Association (ELRA)},
  title = {Resources for speech synthesis of Viennese varieties},
  booktitle = {Proc.~Int.~Conf.~on Language Resources and Evaluation, LREC'10},
  year = {2010},
  ps = {http://www.cstr.ed.ac.uk/downloads/publications/2010/lrec2010_viennese.ps},
  address = {Malta},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2010/lrec2010_viennese.pdf},
  abstract = {This paper describes our work on developing corpora of three varieties of Viennese for unit selection speech synthesis. The synthetic voices for Viennese varieties, implemented with the open domain unit selection speech synthesis engine Multisyn of Festival, will also be released within Festival. The paper especially focuses on two questions: how we selected the appropriate speakers and how we obtained the text sources needed for the recording of these non-standard varieties. Regarding the first question, it turned out that working with a ‘prototypical’ professional speaker was far preferable to striving for authenticity. In addition, we give a brief outline of the differences between the Austrian standard and its dialectal varieties and how we solved certain technical problems that are related to these differences. In particular, the specific set of phones applicable to each variety had to be determined by applying various constraints. Since such a set does not serve any descriptive purposes but rather influences the quality of speech synthesis, the careful design of such a (in most cases reduced) set was an important task.},
  categories = {speech synthesis, language varieties, phonetic encoding, grapheme-to-phone, pronunciation lexicon}
}
@inproceedings{dziemianko_interspeech2009,
  author = {Dziemianko, Michal and Hofer, Gregor and Shimodaira, Hiroshi},
  title = {{HMM}-Based Automatic Eye-Blink Synthesis from Speech},
  booktitle = {Proc. Interspeech},
  year = {2009},
  abstract = {In this paper we present a novel technique to automatically synthesise eye blinking from a speech signal. Animating the eyes of a talking head is important as they are a major focus of attention during interaction. The developed system predicts eye blinks from the speech signal and generates animation trajectories automatically employing a ''Trajectory Hidden Markov Model''. The evaluation of the recognition performance showed that the timing of blinking can be predicted from speech with an F-score value upwards of 52\%, which is well above chance. Additionally, a preliminary perceptual evaluation was conducted, which confirmed that adding eye blinking significantly improves the perception of the character. Finally, it showed that the speech-synchronised synthesised blinks outperform random blinking in naturalness ratings.},
  month = {September},
  address = {Brighton, UK},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/dziemianko_interspeech2009.pdf},
  pages = {1799--1802},
  categories = {animation, motion synthesis, time series analysis, trajectory model}
}
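
For reference, the F-score quoted in the blink-synthesis abstract above is, assuming the standard balanced definition (the paper may use a variant), the harmonic mean of precision P and recall R:

    \[ F_1 = \frac{2PR}{P + R} \]

so a value upwards of 52\% means both the precision and the recall of the predicted blink timings are substantially better than chance for an event as sparse as blinking.
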
@misc{Hofer_Shimodaira:sigg:2007,
  author = {Hofer, Gregor and Shimodaira, Hiroshi and Yamagishi, Junichi},
  howpublished = {Poster at SIGGRAPH 2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/siggraph07.pdf},
  year = {2007},
  title = {Speech-driven Head Motion Synthesis based on a Trajectory Model},
  address = {San Diego, USA}
}
@misc{Hofer_Berger:sigg2010,
  author = {Hofer, Gregor and Richmond, Korin and Berger, Michael},
  howpublished = {Poster at SIGGRAPH 2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/lipsync-sig10.pdf},
  year = {2010},
  title = {Lip Synchronization by Acoustic Inversion},
  address = {Los Angeles, USA}
}
@inproceedings{Hofer_Shimodaira:proc:2007,
  author = {Hofer, Gregor and Shimodaira, Hiroshi},
  title = {Automatic Head Motion Prediction from Speech Data},
  booktitle = {Proc. Interspeech 2007},
  year = {2007},
  month = {August},
  address = {Antwerp, Belgium},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/interspeech07.pdf},
  abstract = {In this paper we present a novel approach to generate a sequence of head motion units given some speech. The modelling approach is based on the notion that head motion can be divided into a number of short homogeneous units that can each be modelled individually. The system is based on Hidden Markov Models (HMM), which are trained on motion units and act as a sequence generator. They can be evaluated by an accuracy measure. A database of motion capture data was collected and manually annotated for head motion and is used to train the models. It was found that the model is good at distinguishing high activity regions from regions with less activity with accuracies around 75 percent. Furthermore the model is able to distinguish different head motion patterns based on speech features somewhat reliably, with accuracies reaching almost 70 percent.}
}
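
The abstract above casts head-motion prediction as classifying short motion units from speech features, with one HMM per unit class acting as recogniser. A minimal sketch of the recognition side, using the hmmlearn package's GaussianHMM purely for illustration (the authors' toolchain is not specified and predates that library):

    import numpy as np
    from hmmlearn.hmm import GaussianHMM

    def train_unit_models(units, n_states=3):
        # units maps a motion-unit label to a list of (frames x features)
        # speech-feature arrays annotated with that unit.
        models = {}
        for label, sequences in units.items():
            X = np.concatenate(sequences)
            lengths = [len(s) for s in sequences]
            models[label] = GaussianHMM(n_components=n_states).fit(X, lengths)
        return models

    def classify(models, features):
        # Pick the unit whose model gives the segment the highest log-likelihood.
        return max(models, key=lambda label: models[label].score(features))
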
@misc{Hofer_Shimodaira:sca:2007,
  author = {Hofer, Gregor and Shimodaira, Hiroshi and Yamagishi, Junichi},
  howpublished = {Poster at SCA 2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sca07.pdf},
  year = {2007},
  title = {Lip motion synthesis using a context dependent trajectory hidden {M}arkov model},
  address = {San Diego, USA}
}
@inproceedings{lips08-gregpr,
  author = {Hofer, Gregor and Yamagishi, Junichi and Shimodaira, Hiroshi},
  title = {Speech-driven Lip Motion Generation with a Trajectory {HMM}},
  booktitle = {Proc. Interspeech 2008},
  year = {2008},
  abstract = {Automatic speech animation remains a challenging problem that can be described as finding the optimal sequence of animation parameter configurations given some speech. In this paper we present a novel technique to automatically synthesise lip motion trajectories from a speech signal. The developed system predicts lip motion units from the speech signal and generates animation trajectories automatically employing a "Trajectory Hidden Markov Model". Using the MLE criterion, its parameter generation algorithm produces the optimal smooth motion trajectories that are used to drive control points on the lips directly. Additionally, experiments were carried out to find a suitable model unit that produces the most accurate results. Finally, a perceptual evaluation was conducted, which showed that the developed motion units perform better than phonemes.},
  month = {September},
  key = {lips08-gregpr},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/GregorLipsChallenge08.pdf},
  pages = {2314--2317},
  categories = {visual speech synthesis, trajectory HMM, HTS}
}
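
The trajectory-HMM generation step mentioned in the abstract above has, in the standard speech parameter generation formulation (assumed here; the paper may differ in detail), a closed-form MLE solution. Stacking static features c with their delta windows gives observations o = Wc for a fixed window matrix W; for a state sequence q with stacked means \mu_q and covariances \Sigma_q, maximising the Gaussian log-likelihood over c yields

    \[ \hat{c} = \left( W^{\top} \Sigma_q^{-1} W \right)^{-1} W^{\top} \Sigma_q^{-1} \mu_q \]

The delta constraints couple neighbouring frames, which is why the generated control-point trajectories come out smooth rather than stepwise.
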
@inproceedings{hofer_interspeech2010,
  author = {Hofer, Gregor and Richmond, Korin},
  title = {Comparison of {HMM} and {TMDN} Methods for Lip Synchronisation},
  booktitle = {Proc. Interspeech},
  year = {2010},
  abstract = {This paper presents a comparison between a hidden Markov model (HMM) based method and a novel artificial neural network (ANN) based method for lip synchronisation. Both model types were trained on motion tracking data, and a perceptual evaluation was carried out comparing the output of the models, both to each other and to the original tracked data. It was found that the ANN-based method was judged significantly better than the HMM-based method. Furthermore, the original data was not judged significantly better than the output of the ANN method.},
  month = {September},
  address = {Makuhari, Japan},
  keywords = {hidden Markov model (HMM), mixture density network, lip synchronisation, inversion mapping},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100668.pdf},
  pages = {454--457}
}
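
A mixture density network, named in the keywords above, makes the network output the parameters of a Gaussian mixture over the targets instead of the targets themselves. A minimal NumPy sketch of that output-layer transformation (illustrative only; the trajectory MDN compared in the paper is more elaborate):

    import numpy as np

    def mdn_split(z, n_mix, dim):
        # Map a raw output vector z of length n_mix * (2 + dim) to mixture
        # weights, component means, and per-component spherical variances.
        logits = z[:n_mix]
        weights = np.exp(logits - logits.max())
        weights /= weights.sum()                     # softmax -> mixture weights
        means = z[n_mix:n_mix + n_mix * dim].reshape(n_mix, dim)
        variances = np.exp(z[n_mix + n_mix * dim:])  # exp keeps variances positive
        return weights, means, variances
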
@article{10.1109/MCG.2011.71,
  author = {Berger, Michael A. and Hofer, Gregor and Shimodaira, Hiroshi},
  publisher = {IEEE Computer Society},
  doi = {10.1109/MCG.2011.71},
  title = {Carnival -- Combining Speech Technology and Computer Animation},
  journal = {IEEE Computer Graphics and Applications},
  issn = {0272-1716},
  volume = {31},
  year = {2011},
  pages = {80--89},
  address = {Los Alamitos, CA, USA}
}
@inproceedings{hofer-eurosp05,
  author = {Hofer, Gregor and Richmond, Korin and Clark, Robert},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.ps},
  title = {Informed Blending of Databases for Emotional Speech Synthesis},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hofer_emosyn.pdf},
  abstract = {The goal of this project was to build a unit selection voice that could portray emotions with varying intensities. A suitable definition of an emotion was developed along with a descriptive framework that supported the work carried out. A single speaker was recorded portraying happy and angry speaking styles. Additionally, a neutral database was also recorded. A target cost function was implemented that chose units according to emotion mark-up in the database. The Dictionary of Affect supported the emotional target cost function by providing an emotion rating for words in the target utterance. If a word was particularly 'emotional', units from that emotion were favoured. In addition, intensity could be varied, which resulted in a bias to select a greater number of emotional units. A perceptual evaluation was carried out, and subjects were able to reliably recognise emotions with varying amounts of emotional units present in the target utterance.},
  categories = {speech synthesis, emotion, edinburgh}
}
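
The emotion-sensitive target cost described in the abstract above can be pictured as a weighted penalty added to the usual unit-selection target cost. A hypothetical Python illustration (names and weighting scheme invented, not the paper's implementation):

    def emotional_target_cost(unit_emotion, target_emotion,
                              word_emotion_rating, intensity, base_cost):
        # word_emotion_rating in [0, 1] would come from a resource such as
        # the Dictionary of Affect; intensity biases selection towards
        # emotional units, as the abstract describes.
        mismatch = 0.0 if unit_emotion == target_emotion else 1.0
        return base_cost + intensity * word_emotion_rating * mismatch

With intensity set high, mismatching units on emotionally loaded words become expensive, so more units from the matching emotional database are selected.
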