The Centre for Speech Technology Research, The University of Edinburgh

Publications by Volker Strom

vstrom.bib

@inproceedings{VTTS,
  author     = {Graf, H. P. and Cosatto, E. and Strom, V. and Huang, F. J.},
  title      = {Visual Prosody: Facial Movements Accompanying Speech},
  booktitle  = {Proc. Fifth Int. Conf. Automatic Face and Gesture Recognition},
  pages      = {397--401},
  year       = 2002,
  abstract   = {As we articulate speech, we usually move the head and
    exhibit various facial expressions. This visual aspect
    of speech aids understanding and helps communicating
    additional information, such as the speaker's mood. In
    this paper we analyze quantitatively head and facial
    movements that accompany speech and investigate how
    they relate to the text's prosodic structure. We
    recorded several hours of speech and measured the
    locations of the speaker's main facial features as well
    as their head poses. The text was evaluated with a
    prosody prediction tool, identifying phrase boundaries
    and pitch accents. Characteristic for most speakers are
    simple motion patterns that are repeatedly applied in
    synchrony with the main prosodic events. Direction and
    strength of head movements vary widely from one speaker
    to another, yet their timing is typically well
    synchronized with the spoken text. Understanding
    quantitatively the correlations between head movements
    and spoken text is important for synthesizing
    photo-realistic talking heads. Talking heads appear
    much more engaging when they exhibit realistic motion
    patterns.},
  categories = {VTTS},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.vtts.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.vtts.ps}
}
@inproceedings{strom00,
  author    = {Ann K. Syrdal and Colin W. Wightman and Alistair Conkie and
    Yannis Stylianou and Mark Beutnagel and Juergen Schroeter and
    Volker Strom and Ki-Seung Lee},
  title     = {Corpus-based Techniques in the {AT\&T} {NEXTGEN} Synthesis
    System},
  booktitle = {Proc.~Int.~Conf.~on Spoken Language Processing},
  address   = {Beijing},
  year      = 2000,
  abstract  = {The AT\&T text-to-speech (TTS) synthesis system has
    been used as a framework for experimenting with a
    perceptually-guided data-driven approach to speech
    synthesis, with a primary focus on data-driven elements
    in the ``back end''. Statistical training techniques
    applied to a large corpus are used to make decisions
    about predicted speech events and selected speech
    inventory units. Our recent advances in automatic
    phonetic and prosodic labelling and a new faster
    harmonic plus noise model (HMM) and unit preselection
    implementations have significantly improved TTS quality
    and speeded up both development time and runtime.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/strom00.pdf},
  ps        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/strom00.ps}
}
@inproceedings{strom99,
  author    = {V. Strom and H. Heine},
  title     = {Utilizing Prosody for Unconstrained Morpheme
    Recognition},
  booktitle = {Proc. European Conf. on Speech Communication and
    Technology},
  address   = {Budapest},
  year      = 1999,
  abstract  = {Speech recognition systems for languages with a rich
    inflectional morphology (like German) suffer from the
    limitations of a word--based full--form lexicon.
    Although the morphological and acoustical knowledge
    about words is coded implicitly within the lexicon
    entries (which are usually closely related to the
    orthography of the language at hand) this knowledge is
    usually not explicitly available for other tasks (e.g.
    detecting OOV words, prosodic analysis). This paper
    presents an HMM--based `word' recognizer that uses
    morpheme--like units on the string level for
    recognizing spontaneous German conversational speech
    (Verbmobil corpus). The system has no explicit word
    knowledge but uses a morpheme--bigram to capture the
    German word and sentence structure to some extent. The
    morpheme recognizer is tightly coupled with a prosodic
    classifier in order to compensate for some of the
    additional ambiguity introduced by using morphemes
    instead of words.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/paper.eurospeech99.pdf},
  ps        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/paper.eurospeech99.ps}
}
@inproceedings{friedrich:COST2102,
  author     = {Michael Pucher and Friedrich Neubarth and Volker Strom},
  title      = {Optimizing Phonetic Encoding for {V}iennese Unit
    Selection Speech Synthesis},
  booktitle  = {COST 2102 Int. Training School 2009, LNCS},
  editor     = {Esposito, A. and others},
  address    = {Heidelberg},
  publisher  = {Springer-Verlag},
  year       = 2010,
  abstract   = {While developing lexical resources for a particular
    language variety (Viennese), we experimented with a set
    of 5 different phonetic encodings, termed phone sets,
    used for unit selection speech synthesis. We started
    with a very rich phone set based on phonological
    considerations and covering as much phonetic
    variability as possible, which was then reduced to
    smaller sets by applying transformation rules that map
    or merge phone symbols. The optimal trade-off was found
    measuring the phone error rates of automatically learnt
    grapheme-to-phone rules and by a perceptual evaluation
    of 27 representative synthesized sentences. Further, we
    describe a method to semi-automatically enlarge the
    lexical resources for the target language variety using
    a lexicon base for Standard Austrian German.},
  categories = {speech synthesis, language varieties, phonetic
    encoding, grapheme-to-phone, pronunciation lexicon.},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/COST2102.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/COST2102.ps}
}
@inproceedings{friedrich:lrec2010,
  author     = {Michael Pucher and Friedrich Neubarth and Volker Strom
    and Sylvia Moosm{\"u}ller and Gregor Hofer and Christian
    Kranzler and Gudrun Schuchmann and Dietmar Schabus},
  title      = {Resources for speech synthesis of Viennese varieties},
  booktitle  = {Proc.~Int.~Conf.~on Language Resources and Evaluation,
    LREC'10},
  address    = {Malta},
  publisher  = {European Language Resources Association (ELRA)},
  year       = 2010,
  abstract   = {This paper describes our work on developing corpora of
    three varieties of Viennese for unit selection speech
    synthesis. The synthetic voices for Viennese varieties,
    implemented with the open domain unit selection speech
    synthesis engine Multisyn of Festival will also be
    released within Festival. The paper especially focuses
    on two questions: how we selected the appropriate
    speakers and how we obtained the text sources needed
    for the recording of these non-standard varieties.
    Regarding the first one, it turned out that working
    with a `prototypical' professional speaker was much
    more preferable than striving for authenticity. In
    addition, we give a brief outline about the differences
    between the Austrian standard and its dialectal
    varieties and how we solved certain technical problems
    that are related to these differences. In particular,
    the specific set of phones applicable to each variety
    had to be determined by applying various constraints.
    Since such a set does not serve any descriptive
    purposes but rather is influencing the quality of
    speech synthesis, a careful design of such a (in most
    cases reduced) set was an important task.},
  categories = {speech synthesis, language varieties, phonetic
    encoding, grapheme-to-phone, pronunciation lexicon.},
  pdf        = {http://www.cstr.ed.ac.uk/downloads/publications/2010/lrec2010_viennese.pdf},
  ps         = {http://www.cstr.ed.ac.uk/downloads/publications/2010/lrec2010_viennese.ps}
}
@inproceedings{strom08,
  author     = {Volker Strom and Simon King},
  title      = {Investigating {F}estival's target cost function using
    perceptual experiments},
  booktitle  = {Proc.~Interspeech},
  address    = {Brisbane},
  year       = 2008,
  abstract   = {We describe an investigation of the target cost used
    in the Festival unit selection speech synthesis system.
    Our ultimate goal is to automatically learn a
    perceptually optimal target cost function. In this
    study, we investigated the behaviour of the target cost
    for one segment type. The target cost is based on
    counting the mismatches in several context features. A
    carrier sentence (``My name is Roger'') was synthesised
    using all 147,820 possible combinations of the diphones
    /n_ei/ and /ei_m/. 92 representative versions were
    selected and presented to listeners as 460 pairwise
    comparisons. The listeners' preference votes were used
    to analyse the behaviour of the target cost, with
    respect to the values of its component linguistic
    context features.},
  categories = {speech synthesis, unit selection, target costs},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080514.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080514.ps}
}
@inproceedings{strom:etal:interspeech2007,
  author     = {Volker Strom and Ani Nenkova and Robert Clark and
    Yolanda Vazquez-Alvarez and Jason Brenier and Simon
    King and Dan Jurafsky},
  title      = {Modelling Prominence and Emphasis Improves
    Unit-Selection Synthesis},
  booktitle  = {Proc. Interspeech 2007},
  address    = {Antwerp, Belgium},
  month      = aug,
  year       = 2007,
  abstract   = {We describe the results of large scale perception
    experiments showing improvements in synthesising two
    distinct kinds of prominence: standard pitch-accent and
    strong emphatic accents. Previously prominence
    assignment has been mainly evaluated by computing
    accuracy on a prominence-labelled test set. By contrast
    we integrated an automatic pitch-accent classifier into
    the unit selection target cost and showed that
    listeners preferred these synthesised sentences. We
    also describe an improved recording script for
    collecting emphatic accents, and show that generating
    emphatic accents leads to further improvements in the
    fiction genre over incorporating pitch accent only.
    Finally, we show differences in the effects of
    prominence between child-directed speech and news and
    fiction genres. Index Terms: speech synthesis, prosody,
    prominence, pitch accent, unit selection},
  categories = {speech synthesis},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/p540.pdf}
}
@inproceedings{clark_blizzard2006,
  author     = {Clark, R. and Richmond, K. and Strom, V. and King, S.},
  title      = {Multisyn Voices for the {B}lizzard {C}hallenge 2006},
  booktitle  = {Proc. Blizzard Challenge Workshop (Interspeech
    Satellite)},
  address    = {Pittsburgh, USA},
  month      = sep,
  year       = 2006,
  note       = {(http://festvox.org/blizzard/blizzard2006.html)},
  abstract   = {This paper describes the process of building unit
    selection voices for the Festival Multisyn engine using
    the ATR dataset provided for the Blizzard Challenge
    2006. We begin by discussing recent improvements that
    we have made to the Multisyn voice building process,
    prompted by our participation in the Blizzard Challenge
    2006. We then go on to discuss our interpretation of
    the results observed. Finally, we conclude with some
    comments and suggestions for the formulation of future
    Blizzard Challenges.},
  categories = {tts, blizzard, multisyn, unit selection},
  key        = {clark_blizzard2006},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/cstr_blizzard2006.pdf}
}
@inproceedings{strom10d,
  author     = {Volker Strom and Simon King},
  title      = {A classifier-based target cost for unit selection
    speech synthesis trained on perceptual data},
  booktitle  = {Proc.~Interspeech},
  address    = {Makuhari, Japan},
  year       = 2010,
  abstract   = {Our goal is to automatically learn a
    PERCEPTUALLY-optimal target cost function for a unit
    selection speech synthesiser. The approach we take here
    is to train a classifier on human perceptual judgements
    of synthetic speech. The output of the classifier is
    used to make a simple three-way distinction rather than
    to estimate a continuously-valued cost. In order to
    collect the necessary perceptual data, we synthesised
    145,137 short sentences with the usual target cost
    switched off, so that the search was driven by the join
    cost only. We then selected the 7200 sentences with the
    best joins and asked 60 listeners to judge them,
    providing their ratings for each syllable. From this,
    we derived a rating for each demiphone. Using as input
    the same context features employed in our conventional
    target cost function, we trained a classifier on these
    human perceptual ratings. We synthesised two sets of
    test sentences with both our standard target cost and
    the new target cost based on the classifier. A/B
    preference tests showed that the classifier-based
    target cost, which was learned completely automatically
    from modest amounts of perceptual data, is almost as
    good as our carefully- and expertly-tuned standard
    target cost.},
  categories = {speech synthesis, unit selection, target cost},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/strom10d.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/strom10d.ps}
}
@inproceedings{strom02,
  author    = {V. Strom},
  title     = {From Text to Speech Without {ToBI}},
  booktitle = {Proc. ICSLP},
  address   = {Denver},
  year      = 2002,
  abstract  = {A new method for predicting prosodic parameters, i.e.
    phone durations and F0 targets, from preprocessed text
    is presented. The prosody model comprises a set of
    CARTs, which are learned from a large database of
    labeled speech. This database need not be annotated
    with Tone and Break Indices (ToBI labels). Instead, a
    simpler symbolic prosodic description is created by a
    bootstrapping method. The method had been applied to
    one Spanish and two German speakers. For the German
    voices, two listening tests showed a significant
    preference for the new method over a more traditional
    approach of prosody prediction, based on hand-crafted
    rules.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.icslp02.pdf},
  ps        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/paper.icslp02.ps}
}
@incollection{ATR-Buch,
  author    = {W. Hess and A. Batliner and A. Kie{\ss}ling and R. Kompe
    and E. N{\"o}th and A. Petzold and M. Reyelt and V. Strom},
  title     = {Prosodic Modules for Speech Recognition and
    Understanding in {VERBMOBIL}},
  booktitle = {Computing Prosody},
  publisher = {Springer-Verlag},
  editor    = {Yoshinori Sagisaka and Nick Campbell and Norio Higuchi},
  chapter   = {23},
  pages     = {363--383},
  address   = {New York},
  year      = 1995,
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/hess_et_al.pdf},
  ps        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/hess_et_al.ps}
}
@phdthesis{DissStrom,
  author = {V. Strom},
  title  = {Automatische Erkennung von Satzmodus, Akzentuierung
    und Phrasengrenzen},
  school = {University of Bonn},
  year   = 1998,
  pdf    = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/Diss.Volker.pdf},
  ps     = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/Diss.Volker.ps}
}
@inproceedings{leo_08-2,
  author     = {Leonardo Badino and Robert A. J. Clark and Volker Strom},
  title      = {Including Pitch Accent Optionality in Unit Selection
    Text-to-Speech Synthesis},
  booktitle  = {Proc.~Interspeech},
  address    = {Brisbane},
  year       = 2008,
  abstract   = {A significant variability in pitch accent placement is
    found when comparing the patterns of prosodic
    prominence realized by different English speakers
    reading the same sentences. In this paper we describe a
    simple approach to incorporate this variability to
    synthesize prosodic prominence in unit selection
    text-to-speech synthesis. The main motivation of our
    approach is that by taking into account the variability
    of accent placements we enlarge the set of prosodically
    acceptable speech units, thus increasing the chances of
    selecting a good quality sequence of units, both in
    prosodic and segmental terms. Results on a large scale
    perceptual test show the benefits of our approach and
    indicate directions for further improvements.},
  categories = {speech synthesis, unit selection, prosodic prominence,
    pitch accents},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.ps}
}
@inproceedings{strom06,
  author    = {Volker Strom and Robert Clark and Simon King},
  title     = {Expressive Prosody for Unit-selection Speech Synthesis},
  booktitle = {Proc.~Interspeech},
  address   = {Pittsburgh},
  year      = 2006,
  abstract  = {Current unit selection speech synthesis voices cannot
    produce emphasis or interrogative contours because of a
    lack of the necessary prosodic variation in the
    recorded speech database. A method of recording script
    design is proposed which addresses this shortcoming.
    Appropriate components were added to the target cost
    function of the Festival Multisyn engine, and a
    perceptual evaluation showed a clear preference over
    the baseline system.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/strom06.pdf},
  ps        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/strom06.ps}
}
@inproceedings{cs-CL-9907021,
  author    = {G{\"u}nther G{\"o}rz and J{\"o}rg Spilker and Volker Strom and
    Hans Weber},
  title     = {Architectural Considerations for Conversational
    Systems -- The {Verbmobil/INTARC} Experience},
  booktitle = {Proceedings of First International Workshop on Human
    Computer Conversation},
  address   = {Bellagio, Italy},
  year      = 1999,
  note      = {arXiv:cs.CL/9907021},
  abstract  = { The paper describes the speech to speech translation
    system INTARC, developed during the first phase of the
    Verbmobil project. The general design goals of the
    INTARC system architecture were time synchronous
    processing as well as incrementality and interactivity
    as a means to achieve a higher degree of robustness and
    scalability. Interactivity means that in addition to
    the bottom-up (in terms of processing levels) data flow
    the ability to process top-down restrictions
    considering the same signal segment for all processing
    levels. The construction of INTARC 2.0, which has been
    operational since fall 1996, followed an engineering
    approach focussing on the integration of symbolic
    (linguistic) and stochastic (recognition) techniques
    which led to a generalization of the concept of a ``one
    pass'' beam search.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/INTARC99.pdf},
  ps        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/INTARC99.ps}
}
@inproceedings{strom97,
  author    = {V. Strom and A. Elsner and G. G{\"o}rz and W. Hess and
    W. Kasper and A. Klein and H. U. Krieger and J. Spilker
    and H. Weber},
  title     = {On the use of prosody in a speech-to-speech translator},
  booktitle = {Proc. European Conf. on Speech Communication and
    Technology},
  address   = {Rhodes},
  year      = 1997,
  abstract  = {In this paper a speech-to-speech translator from
    German to English is presented. Beside the traditional
    processing steps it takes advantage of acoustically
    detected prosodic phrase boundaries and focus. The
    prosodic phrase boundaries reduce search space during
    syntactic parsing and rule out analysis trees during
    semantic parsing. The prosodic focus facilitates a
    ``shallow'' translation based on the best word chain in
    cases where the deep analysis fails.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/paper.eurospeech97.pdf},
  ps        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/paper.eurospeech97.ps}
}
@article{michael09:dialectHTS,
  author     = {Michael Pucher and Dietmar Schabus and Junichi
    Yamagishi and Friedrich Neubarth and Volker Strom},
  title      = {Modeling and Interpolation of {Austrian German and
    Viennese} Dialect in {HMM}-based Speech Synthesis},
  journal    = {Speech Communication},
  volume     = {52},
  number     = {2},
  pages      = {164--179},
  doi        = {10.1016/j.specom.2009.09.004},
  year       = 2010,
  abstract   = {An HMM-based speech synthesis framework is applied to
    both Standard Austrian German and a Viennese dialectal
    variety and several training strategies for
    multi-dialect modeling such as dialect clustering and
    dialect-adaptive training are investigated. For
    bridging the gap between processing on the level of
    HMMs and on the linguistic level, we add phonological
    transformations to the HMM interpolation and apply them
    to dialect interpolation. The crucial steps are to
    employ several formalized phonological rules between
    Austrian German and Viennese dialect as constraints for
    the HMM interpolation. We verify the effectiveness of
    this strategy in a number of perceptual evaluations.
    Since the HMM space used is not articulatory but
    acoustic space, there are some variations in evaluation
    results between the phonological rules. However, in
    general we obtained good evaluation results which show
    that listeners can perceive both continuous and
    categorical changes of dialect varieties by using
    phonological transformations employed as switching
    rules in the HMM interpolation.},
  categories = {speech synthesis, hidden Markov model, dialect,
    sociolect, Austrian German}
}
@inproceedings{strom95,
  author    = {V. Strom},
  title     = {Detection of accents, phrase boundaries and sentence
    modality in {G}erman with prosodic features},
  booktitle = {Proc. European Conf. on Speech Communication and
    Technology},
  volume    = {3},
  pages     = {2039--2041},
  address   = {Madrid},
  year      = 1995,
  abstract  = {In this paper detectors for accents, phrase
    boundaries, and sentence modality are described which
    derive prosodic features only from the speech signal
    and its fundamental frequency to support other modules
    of a speech understanding system in an early analysis
    stage, or in cases where no word hypotheses are
    available. A new method for interpolating and
    decomposing the fundamental frequency is suggested. The
    detectors' underlying Gaussian distribution classifiers
    were trained and tested with approximately 50 minutes
    of spontaneous speech, yielding recognition rates of 78
    percent for accents, 81 percent for phrase boundaries,
    and 85 percent for sentence modality.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/paper.eurospeech95.pdf},
  ps        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/paper.eurospeech95.ps}
}
@inproceedings{richmond2007b,
  author     = {Richmond, K. and Strom, V. and Clark, R. and
    Yamagishi, J. and Fitt, S.},
  title      = {Festival Multisyn Voices for the 2007 Blizzard
    Challenge},
  booktitle  = {Proc. Blizzard Challenge Workshop (in Proc. SSW6)},
  address    = {Bonn, Germany},
  month      = aug,
  year       = 2007,
  abstract   = {This paper describes selected aspects of the Festival
    Multisyn entry to the Blizzard Challenge 2007. We
    provide an overview of the process of building the
    three required voices from the speech data provided.
    This paper focuses on new features of Multisyn which
    are currently under development and which have been
    employed in the system used for this Blizzard
    Challenge. These differences are the application of a
    more flexible phonetic lattice representation during
    forced alignment labelling and the use of a pitch
    accent target cost component. Finally, we also examine
    aspects of the speech data provided for this year's
    Blizzard Challenge and raise certain issues for
    discussion concerning the aim of comparing voices made
    with differing subsets of the data provided.},
  categories = {tts, blizzard, multisyn, unit selection},
  key        = {richmond2007b},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007paper.pdf}
}
@inproceedings{strom96b,
  author    = {V. Strom and C. Widera},
  title     = {What's in the ``pure'' prosody?},
  booktitle = {Proc. ICSLP},
  address   = {Philadelphia},
  year      = 1996,
  abstract  = {Detectors for accents and phrase boundaries have been
    developed which derive prosodic features from the
    speech signal and its fundamental frequency to support
    other modules of a speech understanding system in an
    early analysis stage, or in cases where no word
    hypotheses are available. The detectors' underlying
    Gaussian distribution classifiers were trained with 50
    minutes and tested with 30 minutes of spontaneous
    speech, yielding recognition rates of 74\% for accents
    and 86\% for phrase boundaries. Since this material was
    prosodically hand labelled, the question was, which
    labels for phrase boundaries and accentuation were only
    guided by syntactic or semantic knowledge, and which
    ones are really prosodically marked. Therefore a small
    test subset has been resynthesized in such a way that
    comprehensibility was lost, but the prosodic
    characteristics were kept. This subset has been
    re-labelled by 11 listeners with nearly the same
    accuracy as the detectors.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/paper.icslp96.pdf},
  ps        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/paper.icslp96.ps}
}
@inproceedings{strom02a,
  author    = {Juergen Schroeter and Alistair Conkie and Ann Syrdal
    and Mark Beutnagel and Matthias Jilka and Volker Strom
    and Yeon-Jun Kim and Hong-Goo Kang and David Kapilow},
  title     = {A perspective on the next challenges for {TTS}},
  booktitle = {IEEE 2002 Workshop in Speech Synthesis},
  pages     = {11--13},
  address   = {Santa Monica, CA},
  year      = 2002,
  abstract  = {The quality of speech synthesis has come a long way
    since Homer Dudley's ``Vocoder'' in 1939. In fact, with
    the wide-spread use of unit-selection synthesizers, the
    naturalness of the synthesized speech is now high
    enough to pass the Turing test for short utterances,
    such as prompts. Therefore, it seems valid to ask the
    question ``what are the next challenges for TTS
    Research?'' This paper tries to identify unresolved
    issues, the solution of which would greatly enhance the
    state of the art in TTS.},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/strom02a.pdf},
  ps        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/strom02a.ps}
}
@inproceedings{Niemann94:PDCa,
  author     = {H. Niemann and J. Denzler and B. Kahles and R. Kompe
    and A. Kie{\ss}ling and E. N{\"o}th and V. Strom},
  title      = {Pitch Determination Considering Laryngealization
    Effects in Spoken Dialogs},
  booktitle  = {Proc. Int. Conf. on Neural Networks},
  volume     = {7},
  pages      = {4457--4461},
  address    = {Orlando},
  year       = 1994,
  abstract   = {A frequent phenomenon in spoken dialogs of the
    information seeking type are short elliptic utterances
    whose mood (declarative or interrogative) can only be
    distinguished by intonation. The main acoustic evidence
    is conveyed by the fundamental frequency or F0 contour.
    Many algorithms for F0 determination have been reported
    in the literature. A common problem are irregularities
    of speech known as laryngealizations. This article
    describes an approach based on neuronal network
    techniques for the improved determination of
    fundamental frequency. First, an improved version of
    our neuronal network algorithm for reconstruction of
    the voice source signal (glottis signal) is presented.
    Second, the reconstructed voice source signal is used
    as input to another neuronal network distinguishing the
    three classes 'voiceless', 'voiced-non-laryngealized',
    and 'voiced-laryngealized'. Third, the results are used
    to improve an existing F0 algorithm. Results of this
    approach are presented and discussed in the context of
    the application in a spoken dialog system.},
  categories = {prosody, laryngealisation},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/report-33-94.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/report-33-94.ps}
}