2008.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2008-citations -ob /home/korin/projects/publications/new_output/transitdata/2008.bib -c 'year : "2008"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@article{analysis-hts-adaptation-junichi,
  author = {Junichi Yamagishi and Takao Kobayashi and Yuji Nakano
                   and Katsumi Ogata and Juri Isogai},
  title = {Analysis of Speaker Adaptation Algorithms for
                   {HMM}-based Speech Synthesis and a Constrained {SMAPLR}
                   Adaptation Algorithm},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  note = {In press},
  abstract = {In this paper we analyze the effects of several
                   factors and configuration choices encountered during
                   training and model construction when we want to obtain
                   better and more stable adaptation in HMM-based speech
                   synthesis. We then propose a new adaptation algorithm
                   called constrained structural maximum a posteriori
                   linear regression (CSMAPLR) whose derivation is based
                   on the knowledge obtained in this analysis and on the
                   results of comparing several conventional adaptation
                   algorithms. Here we investigate six major aspects of
                   the speaker adaptation: initial models, transform
                   functions, estimation criteria, and sensitivity of
                   several linear regression adaptation
                   algorithms. Analyzing the effect of the initial model,
                   we compare speaker-dependent models, gender-independent
                   models, and the simultaneous use of the
                   gender-dependent models to single use of the
                   gender-dependent models. Analyzing the effect of the
                   transform functions, we compare the transform function
                   for only mean vectors with that for mean vectors and
                   covariance matrices. Analyzing the effect of the
                   estimation criteria, we compare the ML criterion with a
                   robust estimation criterion called structural MAP. We
                   evaluate the sensitivity of several thresholds for the
                   piecewise linear regression algorithms and take up
                   methods combining MAP adaptation with the linear
                   regression algorithms. We incorporate these adaptation
                   algorithms into our speech synthesis system and present
                   several subjective and objective evaluation results
                   showing the utility and effectiveness of these
                   algorithms in speaker adaptation for HMM-based speech
                   synthesis.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice},
  key = {analysis-hts-adaptation-junichi},
  year = 2008
}
@inproceedings{renals2008,
  author = {Renals, Steve and Hain, Thomas and Bourlard, Hervé},
  title = {Interpretation of Multiparty Meetings: The {AMI} and
                   {AMIDA} Projects},
  booktitle = {IEEE Workshop on Hands-Free Speech Communication and
                   Microphone Arrays, 2008. HSCMA 2008},
  pages = {115--118},
  abstract = {The AMI and AMIDA projects are collaborative EU
                   projects concerned with the automatic recognition and
                   interpretation of multiparty meetings. This paper
                   provides an overview of the advances we have made in
                   these projects with a particular focus on the
                   multimodal recording infrastructure, the publicly
                   available AMI corpus of annotated meeting recordings,
                   and the speech recognition framework that we have
                   developed for this domain.},
  doi = {10.1109/HSCMA.2008.4538700},
  keywords = {AMI corpus; Meetings; evaluation; speech recognition},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/renals2008.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4538666&arnumber=4538700&count=68&index=33},
  year = 2008
}
@inproceedings{vipperla08,
  author = {Ravichander Vipperla and Steve Renals and Joe Frankel},
  title = {Longitudinal study of {ASR} performance on ageing
                   voices},
  booktitle = {Proc.~Interspeech},
  address = {Brisbane},
  abstract = {This paper presents the results of a longitudinal
                   study of ASR performance on ageing voices. Experiments
                   were conducted on the audio recordings of the
                   proceedings of the Supreme Court of the United States
                   (SCOTUS). Results show that the Automatic Speech
                   Recognition (ASR) Word Error Rates (WERs) for elderly
                   voices are significantly higher than those of adult
                   voices. The word error rate increases gradually as the
                   age of the elderly speakers increases. Use of maximum
                   likelihood linear regression (MLLR) based speaker
                   adaptation on ageing voices improves the WER, though the
                   performance is still considerably lower compared to
                   adult voices. Speaker adaptation, however, reduces the
                   increase in WER with age during old age.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/vipperla_is08.pdf},
  year = 2008
}
@article{zhang-spl2008,
  author = {Le Zhang and Steve Renals},
  title = {Acoustic-Articulatory Modelling with the Trajectory
                   {HMM}},
  journal = {IEEE Signal Processing Letters},
  volume = 15,
  pages = {245--248},
  abstract = { In this letter, we introduce a hidden Markov model
                   (HMM)-based inversion system to recover articulatory
                   movements from speech acoustics. Trajectory HMMs are
                   used as generative models for modelling articulatory
                   data. Experiments on the MOCHA-TIMIT corpus indicate
                   that the jointly trained acoustic-articulatory models
                   are more accurate (lower RMS error) than the separately
                   trained ones, and that trajectory HMM training results
                   in greater accuracy compared with conventional maximum
                   likelihood HMM training. Moreover, the system has the
                   ability to synthesize articulatory movements directly
                   from a textual representation. },
  key = {articulatory inversion},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/zhang-spl.pdf},
  year = 2008
}
@article{treeboosting-junichi,
  author = {Junichi Yamagishi and Hisashi Kawai and Takao
                   Kobayashi},
  title = {Phone Duration Modeling Using Gradient Tree Boosting},
  journal = {Speech Communication},
  volume = 50,
  number = 5,
  pages = {405--415},
  abstract = { In text-to-speech synthesis systems, phone duration
                   influences the quality and naturalness of synthetic
                   speech. In this study, we incorporate an ensemble
                   learning technique called gradient tree boosting into
                   phone duration modeling as an alternative to the
                   conventional approach using regression trees, and
                   objectively evaluate the prediction accuracy of
                   Japanese, Mandarin, and English phone duration. The
                   gradient tree boosting algorithm is a meta algorithm of
                   regression trees: it iteratively builds regression
                   trees from the residuals and outputs a weighted sum of
                   the regression trees. Our evaluation results show that
                   compared to the regression trees or other techniques
                   related to the regression trees, the gradient tree
                   boosting algorithm can substantially and robustly
                   improve the predictive accuracy of the phone duration
                   regardless of languages, speakers, or domains.},
  categories = {Text-to-speech synthesis, Phone duration modeling,
                   Gradient tree boosting},
  doi = {10.1016/j.specom.2007.12.003},
  key = {treeboosting-junichi},
  month = may,
  year = 2008
}
@inproceedings{ling:richmond:yamagishi:wang:2008a,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi and Wang, Ren-Hua },
  title = {Articulatory Control of {HMM}-based Parametric Speech
                   Synthesis Driven by Phonetic Knowledge},
  booktitle = {Proc. Interspeech},
  pages = {573--576},
  address = {Brisbane, Australia},
  abstract = {This paper presents a method to control the
                   characteristics of synthetic speech flexibly by
                   integrating articulatory features into a Hidden Markov
                   Model (HMM)-based parametric speech synthesis system.
                   In contrast to model adaptation and interpolation
                   approaches for speaking style control, this method is
                   driven by phonetic knowledge, and target speech samples
                   are not required. The joint distribution of parallel
                   acoustic and articulatory features considering
                   cross-stream feature dependency is estimated. At
                   synthesis time, acoustic and articulatory features are
                   generated simultaneously based on the
                   maximum-likelihood criterion. The synthetic speech can
                   be controlled flexibly by modifying the generated
                   articulatory features according to arbitrary phonetic
                   rules in the parameter generation process. Our
                   experiments show that the proposed method is effective
                   in both changing the overall character of synthesized
                   speech and in controlling the quality of a specific
                   vowel.},
  categories = {speech synthesis, HMM, articulatory features, phonetic
                   knowledge},
  key = {ling:richmond:yamagishi:wang:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080582.PDF},
  year = 2008
}
@inproceedings{goedde:08,
  author = {Florian G\"odde and Sebastian M\"oller and Klaus-Peter
                   Engelbrecht and Christine K\"uhnel and Robert
                   Schleicher and Anja Naumann and Maria Wolters},
  title = {Study of a Speech-based Smart Home System with Older
                   Users},
  booktitle = {International Workshop on Intelligent User Interfaces
                   for Ambient Assisted Living},
  pages = {17--22},
  year = 2008
}
@inproceedings{cereproc-hts,
  author = {Matthew P. Aylett and Junichi Yamagishi},
  title = {Combining Statistical Parametric Speech Synthesis and
                   Unit-Selection for Automatic Voice Cloning},
  booktitle = {Proc. LangTech 2008},
  address = {Brisbane, Australia},
  abstract = {The ability to use the recorded audio of a subject's
                   voice to produce an open-domain synthesis system has
                   generated much interest both in academic research and
                   in commercial speech technology. The ability to produce
                   synthetic versions of a subject's voice has potential
                   commercial applications, such as virtual celebrity
                   actors, or potential clinical applications, such as
                   offering a synthetic replacement voice in the case of a
                   laryngectomy. Recent developments in HMM-based speech
                   synthesis have shown it is possible to produce
                   synthetic voices from quite small amounts of speech
                   data. However, mimicking the depth and variation of a
                   speaker's prosody as well as synthesising natural
                   voice quality is still a challenging research problem.
                   In contrast, unit-selection systems have shown it is
                   possible to strongly retain the character of the voice
                   but only with sufficient original source material.
                   Often this runs into hours and may require significant
                   manual checking and labelling. In this paper we will
                   present two state-of-the-art systems: an HMM-based
                   system, HTS-2007, developed by CSTR and Nagoya Institute
                   of Technology, and a commercial unit-selection system,
                   CereVoice, developed by CereProc. Both systems have
                   been used to mimic the voice of George W. Bush (43rd
                   president of the United States) using freely available
                   audio from the web. In addition we will present a
                   hybrid system which combines both technologies. We
                   demonstrate examples of synthetic voices created from
                   10, 40 and 210 minutes of randomly selected speech. We
                   will then discuss the underlying problems associated
                   with voice cloning using found audio, and the
                   scalability of our solution.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice},
  key = {cereproc-hts},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/03_AYLETT.pdf},
  year = 2008
}
@inproceedings{tietze:08:sci,
  author = {Martin Tietze and Vera Demberg and Johanna D. Moore},
  title = {Syntactic Complexity induces Explicit Grounding in the
                   {MapTask} corpus},
  booktitle = {Proc. Interspeech},
  abstract = {This paper provides evidence for theories of grounding
                   and dialogue management in human conversation. For each
                   utterance in a corpus of task-oriented dialogues, we
                   calculated integration costs, which are based on
                   syntactic sentence complexity. We compared the
                   integration costs and grounding behavior under two
                   conditions, namely face-to-face and a no-eye-contact
                   condition. The results show that integration costs were
                   significantly higher for explicitly grounded utterances
                   in the no-eye-contact condition, but not in the
                   face-to-face condition.},
  categories = {dialogue, syntactic complexity, grounding},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081130.pdf},
  year = 2008
}
@inproceedings{bell_king_shrinkage_is2008,
  author = {Bell, Peter and King, Simon},
  title = {A Shrinkage Estimator for Speech Recognition with Full
                   Covariance {HMM}s},
  booktitle = {Proc. Interspeech},
  address = {Brisbane, Australia},
  note = {Shortlisted for best student paper award.},
  abstract = {We consider the problem of parameter estimation in
                   full-covariance Gaussian mixture systems for automatic
                   speech recognition. Due to the high dimensionality of
                   the acoustic feature vector, the standard sample
                   covariance matrix has a high variance and is often
                   poorly-conditioned when the amount of training data is
                   limited. We explain how the use of a shrinkage
                   estimator can solve these problems, and derive a
                   formula for the optimal shrinkage intensity. We present
                   results of experiments on a phone recognition task,
                   showing that the estimator gives a performance
                   improvement over a standard full-covariance system.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/shrinkage_is2008.pdf},
  year = 2008
}
@incollection{murray2008c,
  author = {Murray, Gabriel and Kleinbauer, Thomas and Poller,
                   Peter and Renals, Steve and Kilgour, Jonathan},
  title = {Extrinsic Summarization Evaluation: A Decision Audit
                   Task},
  booktitle = {Machine Learning for Multimodal Interaction (Proc.
                   MLMI '08)},
  publisher = {Springer},
  number = {5237},
  series = {Lecture Notes in Computer Science},
  pages = {349--361},
  abstract = {In this work we describe a large-scale extrinsic
                   evaluation of automatic speech summarization
                   technologies for meeting speech. The particular task is
                   a decision audit, wherein a user must satisfy a complex
                   information need, navigating several meetings in order
                   to gain an understanding of how and why a given
                   decision was made. We compare the usefulness of
                   extractive and abstractive technologies in satisfying
                   this information need, and assess the impact of
                   automatic speech recognition (ASR) errors on user
                   performance. We employ several evaluation methods for
                   participant performance, including post-questionnaire
                   data, human subjective and objective judgments, and an
                   analysis of participant browsing behaviour.},
  doi = {10.1007/978-3-540-85853-9_32},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008c.pdf},
  year = 2008
}
@inproceedings{wang:frankel:tejedor:king:icassp2008,
  author = {Dong Wang and Joe Frankel and Javier Tejedor and Simon
                   King},
  title = {A comparison of phone and grapheme-based spoken term
                   detection},
  booktitle = {Proc. ICASSP},
  pages = {4969--4972 },
  abstract = {We propose grapheme-based sub-word units for spoken
                   term detection (STD). Compared to phones, graphemes
                   have a number of potential advantages. For
                   out-of-vocabulary search terms, phone-based approaches
                   must generate a pronunciation using letter-to-sound
                   rules. Using graphemes obviates this potentially
                   error-prone hard decision, shifting pronunciation
                   modelling into the statistical models describing the
                   observation space. In addition, long-span grapheme
                   language models can be trained directly from large text
                   corpora. We present experiments on Spanish and English
                   data, comparing phone and grapheme-based STD. For
                   Spanish, where phone and grapheme-based systems give
                   similar transcription word error rates (WERs),
                   grapheme-based STD significantly outperforms a
                   phone-based approach. The converse is found for English,
                   where the phone-based system outperforms a grapheme
                   approach. However, we present additional analysis which
                   suggests that phone-based STD performance levels may be
                   achieved by a grapheme-based approach despite lower
                   transcription accuracy, and that the two approaches may
                   usefully be combined. We propose a number of directions
                   for future development of these ideas, and suggest that
                   if grapheme-based STD can match phone-based
                   performance, the inherent flexibility in dealing with
                   out-of-vocabulary terms makes this a desirable
                   approach.},
  doi = {10.1109/ICASSP.2008.4518773},
  month = {March-April},
  year = 2008
}
@inproceedings{huang2008-is,
  author = {Songfang Huang and Steve Renals},
  title = {Unsupervised Language Model Adaptation Based on Topic
                   and Role Information in Multiparty Meetings},
  booktitle = {Proc. Interspeech'08},
  pages = {833--836},
  address = {Brisbane, Australia},
  abstract = {We continue our previous work on the modeling of topic
                   and role information from multiparty meetings using a
                   hierarchical Dirichlet process (HDP), in the context of
                   language model adaptation. In this paper we focus on
                   three problems: 1) an empirical analysis of the HDP as
                   a nonparametric topic model; 2) the mismatch problem of
                   vocabularies of the baseline n-gram model and the HDP;
                   and 3) an automatic speech recognition experiment to
                   further verify the effectiveness of our adaptation
                   framework. Experiments on a large meeting corpus of
                   more than 70 hours speech data show consistent and
                   significant improvements in terms of word error rate
                   for language model adaptation based on the topic and
                   role information.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/interspeech08.pdf},
  year = 2008
}
@inproceedings{kocjancic_issp08,
  author = {Kocjancic, Tanja},
  title = {Ultrasound investigation of tongue movements in
                   syllables with different onset structure},
  booktitle = {Proc. of the Eighth International Seminar on Speech
                   Production (ISSP)},
  abstract = {This study is an attempt to describe syllables with
                   different onset structure not only in terms of
                   durational changes but also in terms of the distance
                   the tongue travels over a syllable by using ultrasound
                   and to compare the ratio between the two parameters,
                   expressed as speed. Results indicate that both measures
                   increase with an increasing number of onset segments
                   but not to the same degree for all targets; therefore,
                   speed was not constant over all of them. Additionally,
                   the type of onset constituent greatly influenced the three
                   parameters, and there were large between-speaker
                   similarities in the case of durational changes.},
  categories = {tongue movements, ultrasound},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/Kocjancic_ISSP_2008.pdf},
  year = 2008
}
@inproceedings{gibbonmayo:08,
  author = {Gibbon, F. and Mayo, C.},
  title = {Adults' perception of conflicting acoustic cues
                   associated with EPG-defined undifferentiated gestures},
  booktitle = {4th International EPG Symposium, Edinburgh, UK.},
  categories = {speech perception, cue weighting, undifferentiated
                   gestures, electropalatography},
  year = 2008
}
@article{goubanova:king:specom2008,
  author = {Olga Goubanova and Simon King},
  title = {Bayesian networks for phone duration prediction},
  journal = {Speech Communication},
  volume = {50},
  number = {4},
  pages = {301--311},
  abstract = {In a text-to-speech system, the duration of each phone
                   may be predicted by a duration model. This model is
                   usually trained using a database of phones with known
                   durations; each phone (and the context it appears in)
                   is characterised by a feature vector that is composed
                   of a set of linguistic factor values. We describe the
                   use of a graphical model -- a Bayesian network -- for
                   predicting the duration of a phone, given the values
                   for these factors. The network has one discrete
                   variable for each of the linguistic factors and a
                   single continuous variable for the phone's duration.
                   Dependencies between variables (or the lack of them)
                   are represented in the BN structure by arcs (or missing
                   arcs) between pairs of nodes. During training, both the
                   topology of the network and its parameters are learned
                   from labelled data. We compare the results of the BN
                   model with results for sums of products and CART models
                   on the same data. In terms of the root mean square
                   error, the BN model performs much better than both CART
                   and SoP models. In terms of correlation coefficient,
                   the BN model performs better than the SoP model, and as
                   well as the CART model. A BN model has certain
                   advantages over CART and SoP models. Training SoP
                   models requires a high degree of expertise. CART models
                   do not deal with interactions between factors in any
                   explicit way. As we demonstrate, a BN model can also
                   make accurate predictions of a phone's duration, even
                   when the values for some of the linguistic factors are
                   unknown.},
  categories = {Text-to-speech; Bayesian networks; Duration modelling;
                   Sums of products; Classification and regression trees},
  doi = {10.1016/j.specom.2007.10.002},
  month = {April},
  year = 2008
}
@inproceedings{Aylett+King08,
  author = {Matthew P. Aylett and Simon King},
  title = {Single Speaker Segmentation and Inventory Selection
                   Using Dynamic Time Warping Self Organization and Joint
                   Multigram Mapping},
  booktitle = {SSW06},
  pages = {258--263},
  abstract = {In speech synthesis the inventory of units is decided
                   by inspection and on the basis of phonological and
                   phonetic expertise. The ephone (or emergent phone)
                   project at CSTR is investigating how self organisation
                   techniques can be applied to build an inventory based
                   on collected acoustic data together with the
                   constraints of a synthesis lexicon. In this paper we
                   will describe a prototype inventory creation method
                   using dynamic time warping (DTW) for acoustic
                   clustering and a joint multigram approach for relating
                   a series of symbols that represent the speech to these
                   emerged units. We initially examined two symbol sets:
                   1) a baseline of standard phones; 2) orthographic
                   symbols. The success of the approach is evaluated by
                   comparing word boundaries generated by the emergent
                   phones against those created using state-of-the-art HMM
                   segmentation. Initial results suggest the DTW
                   segmentation can match word boundaries with a root mean
                   square error (RMSE) of 35ms. Results from mapping units
                   onto phones resulted in a higher RMSE of 103ms. This
                   error was increased when multiple multigram types were
                   added and when the default unit clustering was altered
                   from 40 (our baseline) to 10. Results for orthographic
                   matching had a higher RMSE of 125ms. To conclude we
                   discuss future work that we believe can reduce this
                   error rate to a level sufficient for the techniques to
                   be applied to a unit selection synthesis system. },
  categories = {speech synthesis, unit selection, parametric
                   synthesis, phone inventory, orthographic synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ssw06.pdf},
  place = {Bonn},
  year = 2008
}
@incollection{murray2008b,
  author = {Murray, Gabriel and Renals, Steve},
  title = {Detecting Action Items in Meetings},
  booktitle = {Machine Learning for Multimodal Interaction (Proc.
                   MLMI '08)},
  publisher = {Springer},
  number = {5237},
  series = {Lecture Notes in Computer Science},
  pages = {208--213},
  abstract = {We present a method for detecting action items in
                   spontaneous meeting speech. Using a supervised approach
                   incorporating prosodic, lexical and structural
                   features, we can classify such items with a high degree
                   of accuracy. We also examine how well various feature
                   subclasses can perform this task on their own.},
  doi = {10.1007/978-3-540-85853-9_19},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008b.pdf},
  url = {http://dx.doi.org/10.1007/978-3-540-85853-9_19},
  year = 2008
}
@inproceedings{robust-hts,
  author = {Junichi Yamagishi and Zhenhua Ling and Simon King},
  title = {Robustness of HMM-based Speech Synthesis},
  booktitle = {Proc. Interspeech 2008},
  pages = {581--584},
  address = {Brisbane, Australia},
  abstract = {As speech synthesis techniques become more advanced,
                   we are able to consider building high-quality voices
                   from data collected outside the usual highly-controlled
                   recording studio environment. This presents new
                   challenges that are not present in conventional
                   text-to-speech synthesis: the available speech data are
                   not perfectly clean, the recording conditions are not
                   consistent, and/or the phonetic balance of the material
                   is not ideal. Although a clear picture of the
                   performance of various speech synthesis techniques
                   (e.g., concatenative, HMM-based or hybrid) under good
                   conditions is provided by the Blizzard Challenge, it is
                   not well understood how robust these algorithms are to
                   less favourable conditions. In this paper, we analyse
                   the performance of several speech synthesis methods
                   under such conditions. This is, as far as we know, a
                   new research topic: ``Robust speech synthesis.'' As a
                   consequence of our investigations, we propose a new
                   robust training method for HMM-based speech
                   synthesis for use with speech data collected in
                   unfavourable conditions.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice,
                   unit selection},
  key = {robust-hts},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/RobustnessHTS.pdf},
  year = 2008
}
@inproceedings{kocjancic_exling08,
  author = {Kocjancic, Tanja},
  title = {Tongue movement and syllable onset complexity:
                   ultrasound study},
  booktitle = {Proc. of ISCA Experimental Linguistics ExLing 2008},
  abstract = {In this study ultrasound was used to investigate
                   tongue movements in syllables with different number and
                   type of onset consonants. Ultrasound recordings
                   provided the information of the distance the tongue
                   travels over a target, and audio recordings of the time
                   needed. The speed of the tongue's travel was calculated
                   from the two measurements. Results of ten speakers have
                   shown that both duration and distance travelled
                   increase with an increased number of onset segments,
                   but that distance travelled is additionally influenced
                   by the type of the segment, as is speed. Duration also
                   seemed to be the least speaker-dependent of the three
                   parameters.},
  categories = {tongue movements, ultrasound},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/Kocjancic_ISCA_ExLing_2008.pdf},
  year = 2008
}
@inproceedings{georgila:08,
  author = {Kallirroi Georgila and Maria Wolters and Vasilis
                   Karaiskos and Melissa Kronenthal and Robert Logie and
                   Neil Mayo and Johanna Moore and Matt Watson},
  title = {A Fully Annotated Corpus for Studying the Effect of
                   Cognitive Ageing on Users' Interactions with Spoken
                   Dialogue Systems},
  booktitle = {Proceedings of the 6th International Conference on
                   Language Resources and Evaluation},
  year = 2008
}
@article{garau2008,
  author = {Garau, Giulia and Renals, Steve},
  title = {Combining Spectral Representations for Large
                   Vocabulary Continuous Speech Recognition},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {16},
  number = {3},
  pages = {508--518},
  abstract = {In this paper we investigate the combination of
                   complementary acoustic feature streams in large
                   vocabulary continuous speech recognition (LVCSR). We
                   have explored the use of acoustic features obtained
                   using a pitch-synchronous analysis, STRAIGHT, in
                   combination with conventional features such as mel
                   frequency cepstral coefficients. Pitch-synchronous
                   acoustic features are of particular interest when used
                   with vocal tract length normalisation (VTLN) which is
                   known to be affected by the fundamental frequency. We
                   have combined these spectral representations directly
                   at the acoustic feature level using heteroscedastic
                   linear discriminant analysis (HLDA) and at the system
                   level using ROVER. We evaluated this approach on three
                   LVCSR tasks: dictated newspaper text (WSJCAM0),
                   conversational telephone speech (CTS), and multiparty
                   meeting transcription. The CTS and meeting
                   transcription experiments were both evaluated using
                   standard NIST test sets and evaluation protocols. Our
                   results indicate that combining conventional and
                   pitch-synchronous acoustic feature sets using HLDA
                   results in a consistent, significant decrease in word
                   error rate across all three tasks. Combining at the
                   system level using ROVER resulted in a further
                   significant decrease in word error rate.},
  doi = {10.1109/TASL.2008.916519},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/garau-taslp08.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4443886},
  year = 2008
}
@inproceedings{qin:perpinan:richmond:wrench:renals:2008a,
  author = {Qin, C. and Carreira-Perpiñán, M. and Richmond, K.
                   and Wrench, A. and Renals, S.},
  title = {Predicting Tongue Shapes from a Few Landmark Locations},
  booktitle = {Proc. Interspeech},
  pages = {2306--2309},
  address = {Brisbane, Australia},
  abstract = {We present a method for predicting the midsagittal
                   tongue contour from the locations of a few landmarks
                   (metal pellets) on the tongue surface, as used in
                   articulatory databases such as MOCHA and the Wisconsin
                   XRDB. Our method learns a mapping using ground-truth
                   tongue contours derived from ultrasound data and
                   drastically improves over spline interpolation. We also
                   determine the optimal locations of the landmarks, and
                   the number of landmarks required to achieve a desired
                   prediction error: 3-4 landmarks are enough to achieve
                   0.3-0.2 mm error per point on the tongue.},
  categories = {ultrasound, tongue contour, articulation},
  key = {qin:perpinan:richmond:wrench:renals:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080929.PDF},
  year = 2008
}
@inproceedings{cabral:renals:richmond:yamagishi:2008a,
  author = {Cabral, J. and Renals, S. and Richmond, K. and
                   Yamagishi, J.},
  title = {Glottal Spectral Separation for Parametric Speech
                   Synthesis},
  booktitle = {Proc. Interspeech},
  pages = {1829--1832},
  address = {Brisbane, Australia},
  categories = {HMM speech synthesis, Glottal Spectral Separation,
                   LF-model},
  key = {cabral:renals:richmond:yamagishi:2008a},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS081086.PDF},
  year = 2008
}
@inproceedings{leo_08-3,
  author = {J. Sebastian Andersson and Leonardo Badino and Oliver
                   S. Watts and Matthew P. Aylett},
  title = {The {CSTR/Cereproc B}lizzard Entry 2008: The
                   Inconvenient Data},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc.
                   Interspeech 2008)},
  address = {Brisbane, Australia},
  abstract = {In a commercial system, data used for unit selection
                   systems is collected with a heavy emphasis on
                   homogeneous neutral data that has sufficient coverage
                   for the units that will be used in the system. In this
                   year's Blizzard entry, CSTR and CereProc present a joint
                   entry where the emphasis has been to explore techniques
                   to deal with data which is not homogeneous (the English
                   entry) and did not have appropriate coverage for a
                   diphone-based system (the Mandarin entry, where
                   tone/phone combinations were treated as distinct phone
                   categories). In addition, two further problems were
                   addressed: 1) making use of non-homogeneous data for
                   creating a voice that can realise both expressive and
                   neutral speaking styles (the English entry); 2) building
                   a unit selection system with no native understanding of
                   the language but depending instead on external native
                   evaluation (the Mandarin entry).},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/cstr-cereproc_Blizzard2008.pdf},
  year = 2008
}
@inproceedings{hts-child-oliver,
  author = {Oliver Watts and Junichi Yamagishi and Kay Berkling
                   and Simon King},
  title = {{HMM}-based synthesis of child speech},
  booktitle = {Proc. of The 1st Workshop on Child, Computer and
                   Interaction (ICMI'08 post-conference workshop)},
  address = {Crete, Greece},
  abstract = {The synthesis of child speech presents challenges both
                   in the collection of data and in the building of a
                   synthesiser from that data. Because only limited data
                   can be collected, and the domain of that data is
                   constrained, it is difficult to obtain the type of
                   phonetically-balanced corpus usually used in speech
                   synthesis. As a consequence, building a synthesiser
                   from this data is difficult. Concatenative synthesisers
                   are not robust to corpora with many missing units (as
                   is likely when the corpus content is not carefully
                   designed), so we chose to build a statistical
                   parametric synthesiser using the HMM-based system HTS.
                   This technique has previously been shown to perform
                   well for limited amounts of data, and for data
                   collected under imperfect conditions. We compared 6
                   different configurations of the synthesiser, using both
                   speaker-dependent and speaker-adaptive modelling
                   techniques, and using varying amounts of data. The
                   output from these systems was evaluated alongside
                   natural and vocoded speech, in a Blizzard-style
                   listening test.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice,
                   child speech},
  key = {hts-child-oliver},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/child-hts-oliver.pdf},
  year = 2008
}
@inproceedings{strom08,
  author = {Volker Strom and Simon King},
  title = {Investigating {F}estival's target cost function using
                   perceptual experiments},
  booktitle = {Proc.~Interspeech},
  address = {Brisbane},
  abstract = {We describe an investigation of the target cost used
                   in the Festival unit selection speech synthesis system.
                   Our ultimate goal is to automatically learn a
                   perceptually optimal target cost function. In this
                   study, we investigated the behaviour of the target cost
                   for one segment type. The target cost is based on
                   counting the mismatches in several context features. A
                   carrier sentence (``My name is Roger'') was synthesised
                   using all 147,820 possible combinations of the diphones
                   /n_ei/ and /ei_m/. 92 representative versions were
                   selected and presented to listeners as 460 pairwise
                   comparisons. The listeners' preference votes were used
                   to analyse the behaviour of the target cost, with
                   respect to the values of its component linguistic
                   context features.},
  categories = {speech synthesis, unit selection, target costs},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080514.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080514.ps},
  year = 2008
}
@inproceedings{dong_ivan_joe_simon_interspeech08_marray,
  author = {Dong Wang and Ivan Himawan and Joe Frankel and Simon
                   King},
  title = {A Posterior Approach for Microphone Array Based Speech
                   Recognition},
  booktitle = {Proc. Interspeech},
  pages = {996--999},
  abstract = {Automatic speech recognition (ASR) becomes rather
                   difficult in the meetings domain because of the adverse
                   acoustic conditions, including more background noise,
                   more echo and reverberation, and frequent cross-talk.
                   Microphone arrays have been demonstrated to boost
                   ASR performance dramatically in such noisy and
                   reverberant environments, with various beamforming
                   algorithms. However, almost all existing beamforming
                   measures work in the acoustic domain, resorting to
                   signal processing theories and geometric explanation.
                   This limits their application, and induces significant
                   performance degradation when the geometric property is
                   unavailable or hard to estimate, or if heterogeneous
                   channels exist in the audio system. In this paper, we
                   present a new posterior-based approach for array-based
                   speech recognition. The main idea is that, instead of
                   enhancing speech signals, we try to enhance the
                   posterior probabilities that frames belong to
                   recognition units, e.g., phones. These enhanced
                   posteriors are then converted to posterior-probability-based
                   features and are modeled by HMMs,
                   leading to a tandem ANN-HMM hybrid system presented by
                   Hermansky et al. Experimental results demonstrated the
                   validity of this posterior approach. With the posterior
                   accumulation or enhancement, significant improvement
                   was achieved over the single-channel baseline.
                   Moreover, we can combine the acoustic enhancement and
                   posterior enhancement together, leading to a hybrid
                   acoustic-posterior beamforming approach, which works
                   significantly better than the acoustic beamforming
                   alone, especially in the scenario with moving
                   speakers.},
  categories = {speech recognition, microphone array, beamforming,
                   tandem approach},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/marray.a.pdf},
  year = 2008
}
@article{christensen2008,
  author = {Christensen, Heidi and Gotoh, Yoshihiko and Renals,
                   Steve},
  title = {A Cascaded Broadcast News Highlighter},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {16},
  pages = {151--161},
  abstract = {This paper presents a fully automatic news skimming
                   system which takes a broadcast news audio stream and
                   provides the user with the segmented, structured and
                   highlighted transcript. This constitutes a system with
                   three different, cascading stages: converting the audio
                   stream to text using an automatic speech recogniser,
                   segmenting into utterances and stories and finally
                   determining which utterance should be highlighted using
                   a saliency score. Each stage must operate on the
                   erroneous output from the previous stage in the system;
                   an effect which is naturally amplified as the data
                   progresses through the processing stages. We present a
                   large corpus of transcribed broadcast news data
                   enabling us to investigate to which degree information
                   worth highlighting survives this cascading of
                   processes. Both extrinsic and intrinsic experimental
                   results indicate that mistakes in the story boundary
                   detection have a strong impact on the quality of
                   highlights, whereas erroneous utterance boundaries
                   cause only minor problems. Further, the difference in
                   transcription quality does not affect the overall
                   performance greatly.},
  doi = {10.1109/TASL.2007.910746},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2008/christensen-tasl08.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4407525&arnumber=4383075&count=28&index=16},
  year = 2008
}
@incollection{huang2008-mlmi,
  author = {Songfang Huang and Steve Renals},
  title = {Modeling Topic and Role Information in Meetings using
                   the Hierarchical {D}irichlet Process},
  booktitle = {Machine Learning for Multimodal Interaction V},
  publisher = {Springer},
  editor = {Popescu-Belis, A. and Stiefelhagen, R.},
  volume = {5237},
  series = {Lecture Notes in Computer Science},
  pages = {214--225},
  abstract = {In this paper, we address the modeling of topic and
                   role information in multiparty meetings, via a
                   nonparametric Bayesian model called the hierarchical
                   Dirichlet process. This model provides a powerful
                   solution to topic modeling and a flexible framework for
                   the incorporation of other cues such as speaker role
                   information. We present our modeling framework for
                   topic and role on the AMI Meeting Corpus, and
                   illustrate the effectiveness of the approach in the
                   context of adapting a baseline language model in a
                   large-vocabulary automatic speech recognition system
                   for multiparty meetings. The adapted LM produces
                   significant improvements in terms of both perplexity
                   and word error rate.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/mlmi08.pdf},
  year = 2008
}
@inproceedings{wolters-itg:08,
  author = {Wolters, Maria and Campbell, Pauline and DePlacido,
                   Christine and Liddell, Amy and Owens, David},
  title = {Adapting {S}peech {S}ynthesis {S}ystems to {U}sers
                   with {A}ge-{R}elated {H}earing {L}oss},
  booktitle = {Beitr{\"a}ge der 8. {ITG} {F}achtagung
                   {S}prachkommunikation},
  abstract = {This paper summarises the main results of a pilot
                   study into the effect of auditory ageing on the
                   intelligibility of synthetic speech. 32 older and 12
                   younger users had to answer simple questions about a
                   series of meeting reminders and medication reminders.
                   They also underwent an extensive battery of
                   audiological and cognitive assessments. Older users
                   only had more difficulty understanding the synthetic
                   voice than younger people if they had elevated
                   pure-tone thresholds and if they were asked about
                   unfamiliar medication names. We suggest that these
                   problems can be remedied by better prompt design. User
                   interviews show that the synthetic voice used was quite
                   natural. Problems mentioned by users fit the results of
                   a previous error analysis. },
  categories = {speech synthesis, older users},
  month = sep,
  url = {http://homepages.inf.ed.ac.uk/mwolters/itg08.pdf},
  year = 2008
}
@inproceedings{steiner:richmond:2008a,
  author = {Steiner, I. and Richmond, K.},
  title = {Generating gestural timing from {EMA} data using
                   articulatory resynthesis},
  booktitle = {Proc. 8th International Seminar on Speech Production},
  address = {Strasbourg, France},
  abstract = {As part of ongoing work to integrate an articulatory
                   synthesizer into a modular TTS platform, a method is
                   presented which allows gestural timings to be generated
                   automatically from EMA data. Further work is outlined
                   which will adapt the vocal tract model and phoneset to
                   English using new articulatory data, and use
                   statistical trajectory models. },
  categories = {articulatory synthesis, EMA, VocalTractLab },
  key = {steiner:richmond:2008a},
  month = dec,
  year = 2008
}
@inproceedings{leo_08-2,
  author = {Leonardo Badino and Robert A.J. Clark and Volker Strom},
  title = {Including Pitch Accent Optionality in Unit Selection
                   Text-to-Speech Synthesis},
  booktitle = {Proc.~Interspeech},
  address = {Brisbane},
  abstract = {A significant variability in pitch accent placement is
                   found when comparing the patterns of prosodic
                   prominence realized by different English speakers
                   reading the same sentences. In this paper we describe a
                   simple approach to incorporate this variability to
                   synthesize prosodic prominence in unit selection
                   text-to-speech synthesis. The main motivation of our
                   approach is that by taking into account the variability
                   of accent placements we enlarge the set of prosodically
                   acceptable speech units, thus increasing the chances of
                   selecting a good quality sequence of units, both in
                   prosodic and segmental terms. Results on a large scale
                   perceptual test show the benefits of our approach and
                   indicate directions for further improvements.},
  categories = {speech synthesis, unit selection, prosodic prominence,
                   pitch accents},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080159.ps},
  year = 2008
}
@inproceedings{joe_dong_simon_interspeech08_bottle,
  author = {Joe Frankel and Dong Wang and Simon King},
  title = {Growing bottleneck features for tandem {ASR}},
  booktitle = {Proc. Interspeech},
  pages = {1549},
  abstract = { We present a method for training bottleneck MLPs for
                   use in tandem ASR. Experiments on meetings data show
                   that this approach leads to improved performance
                   compared with training MLPs from a random
                   initialization. },
  categories = {tandem ASR, bottleneck MLP},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bottlenet.a.pdf},
  year = 2008
}
@inproceedings{tts_barra08,
  author = {R. Barra-Chicote and J. Yamagishi and J.M. Montero and
                   S. King and S. Lutfi and J. Macias-Guarasa},
  title = {Generacion de una voz sintetica en {C}astellano basada
                   en {HSMM} para la {E}valuacion {A}lbayzin 2008:
                   conversion texto a voz},
  booktitle = {V Jornadas en Tecnologia del Habla},
  pages = {115--118},
  note = {(in Spanish)},
  month = nov,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/tts-jth08.pdf},
  year = 2008
}
@inproceedings{lips08-gregpr,
  author = {Gregor Hofer and Junichi Yamagishi and Hiroshi
                   Shimodaira},
  title = {Speech-driven Lip Motion Generation with a Trajectory
                   {HMM}},
  booktitle = {Proc. Interspeech 2008},
  pages = {2314--2317},
  address = {Brisbane, Australia},
  abstract = {Automatic speech animation remains a challenging
                   problem that can be described as finding the optimal
                   sequence of animation parameter configurations given
                   some speech. In this paper we present a novel technique
                   to automatically synthesise lip motion trajectories
                   from a speech signal. The developed system predicts lip
                   motion units from the speech signal and generates
                   animation trajectories automatically employing a
                   Trajectory Hidden Markov Model. Using the MLE
                   criterion, its parameter generation algorithm produces
                   the optimal smooth motion trajectories that are used to
                   drive control points on the lips directly.
                   Additionally, experiments were carried out to find a
                   suitable model unit that produces the most accurate
                   results. Finally, a perceptual evaluation was conducted,
                   which showed that the developed motion units perform
                   better than phonemes.},
  categories = {visual speech synthesis, trajectory HMM, HTS},
  key = {lips08-gregpr},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/GregorLipsChallenge08.pdf},
  year = 2008
}
@incollection{murray2008a,
  author = {Murray, Gabriel and Renals, Steve},
  title = {Meta Comments for Summarizing Meeting Speech},
  booktitle = {Machine Learning for Multimodal Interaction (Proc.
                   MLMI '08)},
  publisher = {Springer},
  number = {5237},
  series = {Lecture Notes in Computer Science},
  pages = {236--247},
  abstract = {This paper is about the extractive summarization of
                   meeting speech, using the ICSI and AMI corpora. In the
                   first set of experiments we use prosodic, lexical,
                   structural and speaker-related features to select the
                   most informative dialogue acts from each meeting, with
                   the hypothesis being that such a rich mixture of
                   features will yield the best results. In the second
                   part, we present an approach in which the
                   identification of ``meta-comments'' is used to create
                   more informative summaries that provide an increased
                   level of abstraction. We find that the inclusion of
                   these meta comments improves summarization performance
                   according to several evaluation metrics.},
  doi = {10.1007/978-3-540-85853-9_22},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/murray2008a.pdf},
  url = {http://dx.doi.org/10.1007/978-3-540-85853-9_22},
  year = 2008
}
@inproceedings{moeller:08,
  author = {Sebastian M\"oller and Florian G\"odde and Maria
                   Wolters},
  title = {A Corpus Analysis of Spoken Smart-Home Interactions
                   with Older Users},
  booktitle = {Proceedings of the 6th International Conference on
                   Language Resources and Evaluation},
  year = 2008
}
@inproceedings{king:tokuda:zen:yamagishi:interspeech2008,
  author = {Simon King and Keiichi Tokuda and Heiga Zen and
                   Junichi Yamagishi},
  title = {Unsupervised adaptation for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  pages = {1869-1872},
  address = {Brisbane, Australia},
  abstract = {It is now possible to synthesise speech using HMMs
                   with a comparable quality to unit-selection techniques.
                   Generating speech from a model has many potential
                   advantages over concatenating waveforms. The most
                   exciting is model adaptation. It has been shown that
                   supervised speaker adaptation can yield high-quality
                   synthetic voices with an order of magnitude less data
                   than required to train a speaker-dependent model or to
                   build a basic unit-selection system. Such supervised
                   methods require labelled adaptation data for the target
                   speaker. In this paper, we introduce a method capable
                   of unsupervised adaptation, using only speech from the
                   target speaker without any labelling.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   trajectory HMMs, speaker adaptation, MLLR},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080299.PDF},
  year = 2008
}
@inproceedings{toth:frankel:goztolya:king:interspeech2008,
  author = {Laszlo Toth and Joe Frankel and Gabor Gosztolya and
                   Simon King},
  title = {Cross-lingual Portability of {MLP}-Based Tandem Features
                   -- A Case Study for English and Hungarian},
  booktitle = {Proc. Interspeech},
  pages = {2695-2698},
  address = {Brisbane, Australia},
  abstract = {One promising approach for building ASR systems for
                   less-resourced languages is cross-lingual adaptation.
                   Tandem ASR is particularly well suited to such
                   adaptation, as it includes two cascaded modelling
                   steps: feature extraction using multi-layer perceptrons
                   (MLPs), followed by modelling using a standard HMM. The
                   language-specific tuning can be performed by adjusting
                   the HMM only, leaving the MLP untouched. Here we
                   examine the portability of feature extractor MLPs
                   between an Indo-European (English) and a Finno-Ugric
                   (Hungarian) language. We present experiments which use
                   both conventional phone-posterior and articulatory
                   feature (AF) detector MLPs, both trained on a much
                   larger quantity of (English) data than the monolingual
                   (Hungarian) system. We find that the cross-lingual
                   configurations achieve similar performance to the
                   monolingual system, and that, interestingly, the AF
                   detectors lead to slightly worse performance, despite
                   the expectation that they should be more
                   language-independent than phone-based MLPs. However,
                   the cross-lingual system outperforms all other
                   configurations when the English phone MLP is adapted on
                   the Hungarian data. },
  keywords = {tandem, ASR},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/IS080729.PDF},
  year = 2008
}
@inproceedings{garau2008a,
  author = {Garau, Giulia and Renals, Steve},
  title = {Pitch adaptive features for {LVCSR}},
  booktitle = {Proc. Interspeech '08},
  abstract = {We have investigated the use of a pitch adaptive
                   spectral representation on large vocabulary speech
                   recognition, in conjunction with speaker normalisation
                   techniques. We have compared the effect of a smoothed
                   spectrogram to the pitch adaptive spectral analysis by
                   decoupling these two components of STRAIGHT.
                   Experiments performed on a large vocabulary meeting
                   speech recognition task highlight the importance of
                   combining a pitch adaptive spectral representation with
                   a conventional fixed window spectral analysis. We found
                   evidence that STRAIGHT pitch adaptive features are more
                   speaker independent than conventional MFCCs without
                   pitch adaptation, and thus also provide better
                   performance when combined using feature combination
                   techniques such as Heteroscedastic Linear Discriminant
                   Analysis.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/garau2008a.pdf},
  year = 2008
}
@article{tejedor:wang:frankel:king:colas:specom2008,
  author = {Javier Tejedor and Dong Wang and Joe Frankel and Simon
                   King and José Colás},
  title = {A comparison of grapheme and phoneme-based units for
                   {S}panish spoken term detection},
  journal = {Speech Communication},
  volume = {50},
  number = {11-12},
  pages = {980-991},
  abstract = {The ever-increasing volume of audio data available
                   online through the world wide web means that automatic
                   methods for indexing and search are becoming essential.
                   Hidden Markov model (HMM) keyword spotting and lattice
                   search techniques are the two most common approaches
                   used by such systems. In keyword spotting, models or
                   templates are defined for each search term prior to
                   accessing the speech and used to find matches. Lattice
                   search (referred to as spoken term detection) uses a
                   pre-indexing of speech data in terms of word or
                   sub-word units, which can then quickly be searched for
                   arbitrary terms without referring to the original
                   audio. In both cases, the search term can be modelled
                   in terms of sub-word units, typically phonemes. For
                   in-vocabulary words (i.e. words that appear in the
                   pronunciation dictionary), the letter-to-sound
                   conversion systems are accepted to work well. However,
                   for out-of-vocabulary (OOV) search terms,
                   letter-to-sound conversion must be used to generate a
                   pronunciation for the search term. This is usually a
                   hard decision (i.e. not probabilistic and with no
                   possibility of backtracking), and errors introduced at
                   this step are difficult to recover from. We therefore
                   propose the direct use of graphemes (i.e., letter-based
                   sub-word units) for acoustic modelling. This is
                   expected to work particularly well in languages such as
                   Spanish, where despite the letter-to-sound mapping
                   being very regular, the correspondence is not
                   one-to-one, and there will be benefits from avoiding
                   hard decisions at early stages of processing. In this
                   article, we compare three approaches for Spanish
                   keyword spotting or spoken term detection, and within
                   each of these we compare acoustic modelling based on
                   phone and grapheme units. Experiments were performed
                   using the Spanish geographical-domain Albayzin corpus.
                   Results achieved in the two approaches proposed for
                   spoken term detection show us that trigrapheme units
                   for acoustic modelling match or exceed the performance
                   of phone-based acoustic models. In the method proposed
                   for keyword spotting, the results achieved with each
                   acoustic model are very similar.},
  categories = {Spoken term detection; Keyword spotting; Graphemes;
                   Spanish},
  doi = {10.1016/j.specom.2008.03.005},
  month = {November-December},
  year = 2008
}
@inproceedings{morgan:08,
  author = {Maggie Morgan and Marilyn R. McGee-Lennon and Nick
                   Hine and John Arnott and Chris Martin and Julia S.
                   Clark and Maria Wolters},
  title = {Requirements Gathering with Diverse User Groups and
                   Stakeholders},
  booktitle = {Proc. 26th Conference on Computer-Human Interaction,
                   Florence},
  year = 2008
}
@inproceedings{hts2007-icassp,
  author = {Junichi Yamagishi and Takashi Nose and Heiga Zen and
                   Tomoki Toda and Keiichi Tokuda},
  title = {Performance Evaluation of the Speaker-Independent
                   {HMM}-based Speech Synthesis System ``{HTS}-2007'' for
                   the {Blizzard Challenge 2007}},
  booktitle = {Proc. ICASSP 2008},
  pages = {3957--3960},
  address = {Las Vegas, U.S.A.},
  abstract = {This paper describes a speaker-independent/adaptive
                   HMM-based speech synthesis system developed for the
                   Blizzard Challenge 2007. The new system, named
                   HTS-2007, employs speaker adaptation
                   (CSMAPLR+MAP), feature-space adaptive training,
                   mixed-gender modeling, and full-covariance modeling
                   using CSMAPLR transforms, in addition to several other
                   techniques that have proved effective in our previous
                   systems. Subjective evaluation results show that the
                   new system generates significantly better quality
                   synthetic speech than that of speaker-dependent
                   approaches with realistic amounts of speech data, and
                   that it bears comparison with speaker-dependent
                   approaches even when large amounts of speech data are
                   available.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice},
  doi = {10.1109/ICASSP.2008.4518520},
  key = {hts2007-icassp},
  month = apr,
  year = 2008
}
@inproceedings{karaiskos:king:clark:mayo:blizzard2008,
  author = {Vasilis Karaiskos and Simon King and Robert A. J.
                   Clark and Catherine Mayo},
  title = {The Blizzard Challenge 2008},
  booktitle = {Proc. Blizzard Challenge Workshop},
  address = {Brisbane, Australia},
  abstract = {The Blizzard Challenge 2008 was the fourth annual
                   Blizzard Challenge. This year, participants were asked
                   to build two voices from a UK English corpus and one
                   voice from a Mandarin Chinese corpus. This is the
                   first time that a language other than English has been
                   included and also the first time that a large UK
                   English corpus has been available. In addition, the
                   English corpus contained somewhat more expressive
                   speech than that found in corpora used in previous
                   Blizzard Challenges. To assist participants with
                   limited resources or limited experience in
                   UK-accented English or Mandarin, unaligned labels
                   were provided for both corpora and for the test
                   sentences. Participants could use the provided labels
                   or create their own. An accent-specific pronunciation
                   dictionary was also available for the English speaker.
                   A set of test sentences was released to participants,
                   who were given a limited time in which to synthesise
                   them and submit the synthetic speech. An online
                   listening test was conducted to evaluate
                   naturalness, intelligibility and degree of similarity
                   to the original speaker.},
  keywords = {Blizzard},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/summary_Blizzard2008.pdf},
  year = 2008
}
@inproceedings{huang2008-ptkl,
  author = {Songfang Huang and Steve Renals},
  title = {Using Participant Role in Multiparty Meetings as Prior
                   Knowledge for Nonparametric Topic Modeling},
  booktitle = {Proc. ICML/UAI/COLT Workshop on Prior Knowledge for
                   Text and Language Processing},
  pages = {21--24},
  address = {Helsinki, Finland},
  abstract = {In this paper we introduce our attempts to incorporate
                   the participant role information in multiparty meetings
                   for document modeling using the hierarchical Dirichlet
                   process. The perplexity and automatic speech
                   recognition results demonstrate that the participant
                   role information is a promising prior knowledge source
                   to be combined with language models for automatic
                   speech recognition and interaction modeling for
                   multiparty meetings.},
  month = jul,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/ptkl.pdf},
  year = 2008
}
@inproceedings{hts2008,
  author = {Junichi Yamagishi and Heiga Zen and Yi-Jian Wu and
                   Tomoki Toda and Keiichi Tokuda},
  title = {The {HTS}-2008 System: Yet Another Evaluation of the
                   Speaker-Adaptive {HMM}-based Speech Synthesis System in
                   The {2008 Blizzard Challenge}},
  booktitle = {Proc. Blizzard Challenge 2008},
  address = {Brisbane, Australia},
  abstract = {For the 2008 Blizzard Challenge, we used the same
                   speaker-adaptive approach to HMM-based speech synthesis
                   that was used in the HTS entry to the 2007 challenge,
                   but an improved system was built in which the
                   multi-accented English average voice model was trained
                   on 41 hours of speech data with high-order mel-cepstral
                   analysis using an efficient forward-backward algorithm
                   for the HSMM. The listener evaluation scores for the
                   synthetic speech generated from this system were much
                   better than in 2007: the system had the equal best
                   naturalness on the small English data set and the equal
                   best intelligibility on both small and large data sets
                   for English, and had the equal best naturalness on the
                   Mandarin data. In fact, the English system was found to
                   be as intelligible as human speech.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS,
                   speaker adaptation, voice conversion, average voice,
                   Blizzard Challenge},
  key = {hts2008},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/HTS2008.pdf},
  year = 2008
}
@inproceedings{bell_king_lineSearch_is2008,
  author = {Bell, Peter and King, Simon},
  title = {Covariance Updates for Discriminative Training by
                   Constrained Line Search},
  booktitle = {Proc. Interspeech},
  address = {Brisbane, Australia},
  abstract = {We investigate the recent Constrained Line Search
                   algorithm for discriminative training of HMMs and
                   propose an alternative formula for variance update. We
                   compare the method to standard techniques on a phone
                   recognition task.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/lineSearch_is2008.pdf},
  year = 2008
}
@inproceedings{leo_08-1,
  author = {Leonardo Badino and Robert A.J. Clark},
  title = {Automatic labeling of contrastive word pairs from
                   spontaneous spoken English},
  booktitle = {Proc. 2008 IEEE/ACL Workshop on Spoken Language
                   Technology},
  address = {Goa, India},
  abstract = {This paper addresses the problem of automatically
                   labeling contrast in spontaneous spoken speech, where
                   contrast here is meant as a relation that ties two
                   words that explicitly contrast with each other.
                   Detection of contrast is certainly relevant in the
                   analysis of discourse and information structure and
                   also, because of the prosodic correlates of contrast,
                   could play an important role in speech applications,
                   such as text-to-speech synthesis, that need an accurate
                   and discourse context related modeling of prosody. With
                   this prospect we investigate the feasibility of
                   automatic contrast labeling by training and evaluating
                   on the Switchboard corpus a novel contrast tagger,
                   based on Support Vector Machines (SVM), that combines
                   lexical features, syntactic dependencies and WordNet
                   semantic relations.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/0000101.pdf},
  year = 2008
}
@article{dielmann2008,
  author = {Dielmann, Alfred and Renals, Steve},
  title = {Recognition of Dialogue Acts in Multiparty Meetings
                   using a Switching {DBN}},
  journal = {IEEE Transactions on Audio, Speech and Language
                   Processing},
  volume = {16},
  number = {7},
  pages = {1303--1314},
  abstract = {This paper is concerned with the automatic recognition
                   of dialogue acts (DAs) in multiparty conversational
                   speech. We present a joint generative model for DA
                   recognition in which segmentation and classification of
                   DAs are carried out in parallel. Our approach to DA
                   recognition is based on a switching dynamic Bayesian
                   network (DBN) architecture. This generative approach
                   models a set of features, related to lexical content
                   and prosody, and incorporates a weighted interpolated
                   factored language model. The switching DBN coordinates
                   the recognition process by integrating the component
                   models. The factored language model, which is estimated
                   from multiple conversational data corpora, is used in
                   conjunction with additional task-specific language
                   models. In conjunction with this joint generative
                   model, we have also investigated the use of a
                   discriminative approach, based on conditional random
                   fields, to perform a reclassification of the segmented
                   DAs. We have carried out experiments on the AMI corpus
                   of multimodal meeting recordings, using both manually
                   transcribed speech, and the output of an automatic
                   speech recognizer, and using different configurations
                   of the generative model. Our results indicate that the
                   system performs well both on reference and fully
                   automatic transcriptions. A further significant
                   improvement in recognition accuracy is obtained by the
                   application of the discriminative reranking approach
                   based on conditional random fields.},
  doi = {10.1109/TASL.2008.922463},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/dielmann2008.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4599391&arnumber=4497831&count=18&index=9},
  year = 2008
}
@inproceedings{bourlard2008,
  author = {Bourlard, Herve and Renals, Steve},
  title = {Recognition and Understanding of Meetings: Overview of
                   the {European} {AMI} and {AMIDA} Projects},
  booktitle = {Proc. LangTech 2008},
  abstract = {The AMI and AMIDA projects are concerned with the
                   recognition and interpretation of multiparty
                   (face-to-face and remote) meetings. Within these
                   projects we have developed the following: (1) an
                   infrastructure for recording meetings using multiple
                   microphones and cameras; (2) a one hundred hour,
                   manually annotated meeting corpus; (3) a number of
                   techniques for indexing and summarizing meeting
                   videos using automatic speech recognition and computer
                   vision; and (4) an extensible framework for browsing
                   and searching meeting videos. We give an overview of
                   the various techniques developed in AMI (mainly
                   involving face-to-face meetings), their integration
                   into our meeting browser framework, and future plans
                   for AMIDA (Augmented Multiparty Interaction with
                   Distant Access), the follow-up project to AMI.
                   Technical and business information related to these two
                   projects can be found at www.amiproject.org, on the
                   Scientific and Business portals respectively.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/bourlard2008.pdf},
  year = 2008
}