2007.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2007-citations -ob /home/korin/projects/publications/new_output/transitdata/2007.bib -c 'year : "2007"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{frankel07:AF_MLP,
  author = {Frankel, J. and Magimai-Doss, M. and King, S. and
                   Livescu, K. and Çetin, Ö.},
  title = {Articulatory Feature Classifiers Trained on 2000 hours
                   of Telephone Speech},
  booktitle = {Proc. Interspeech},
  address = {Antwerp, Belgium},
  abstract = {This paper is intended to advertise the public
                   availability of the articulatory feature (AF)
                   classification multi-layer perceptrons (MLPs) which
                   were used in the Johns Hopkins 2006 summer workshop. We
                   describe the design choices, data preparation, AF label
                   generation, and the training of MLPs for feature
                   classification on close to 2000 hours of telephone
                   speech. In addition, we present some analysis of the
                   MLPs in terms of classification accuracy and confusions
                   along with a brief summary of the results obtained
                   during the workshop using the MLPs. We invite
                   interested parties to make use of these MLPs.},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/frankel_AF-MLP.pdf},
  year = 2007
}
@inproceedings{wolters-icphs:07,
  author = {Maria Wolters and Pauline Campbell and Christine
                   DePlacido and Amy Liddell and David Owens},
  title = {The Effect of Hearing Loss on the Intelligibility of
                   Synthetic Speech},
  booktitle = {Proc. Intl. Conf. Phon. Sci.},
  abstract = {Many factors affect the intelligibility of synthetic
                   speech. One aspect that has been severely neglected in
                   past work is hearing loss. In this study, we
                   investigate whether pure-tone audiometry thresholds
                   across a wide range of frequencies (0.25--20kHz) are
                   correlated with participants’ performance on a simple
                   task that involves accurately recalling and processing
                   reminders. Participants’ scores correlate not only with
                   thresholds in the frequency ranges commonly associated
                   with speech, but also with extended high-frequency
                   thresholds.},
  categories = {},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/WoltersetalICPhS2007.pdf},
  year = 2007
}
@inproceedings{jyamagis07:avss2006,
  author = {Junichi Yamagishi and Takao Kobayashi and Steve Renals
                   and Simon King and Heiga Zen and Tomoki Toda and
                   Keiichi Tokuda },
  title = {Improved Average-Voice-based Speech Synthesis Using
                   Gender-Mixed Modeling and a Parameter Generation
                   Algorithm Considering {GV}},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  abstract = {For constructing a speech synthesis system which can
                   achieve diverse voices, we have been developing a
                    speaker-independent approach to HMM-based speech
                   synthesis in which statistical average voice models are
                   adapted to a target speaker using a small amount of
                   speech data. In this paper, we incorporate a
                   high-quality speech vocoding method STRAIGHT and a
                   parameter generation algorithm with global variance
                   into the system for improving quality of synthetic
                   speech. Furthermore, we introduce a feature-space
                    speaker adaptive training algorithm and a gender-mixed
                   modeling technique for conducting further normalization
                   of the average voice model. We build an English
                   text-to-speech system using these techniques and show
                   the performance of the system.},
  categories = {HMM, speech synthesis, speaker adaptation, HTS},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6-yamagishi.pdf},
  year = 2007
}
@inproceedings{leo_07-1,
  author = {Leonardo Badino and Robert A.J. Clark},
  title = {Issues of Optionality in Pitch Accent Placement},
  booktitle = {Proc. 6th ISCA Speech Synthesis Workshop},
  address = {Bonn, Germany},
  abstract = {When comparing the prosodic realization of different
                   English speakers reading the same text, a significant
                   disagreement is usually found amongst the pitch accent
                   patterns of the speakers. Assuming that such
                   disagreement is due to a partial optionality of pitch
                   accent placement, it has been recently proposed to
                   evaluate pitch accent predictors by comparing them with
                    multi-speaker reference data. In this paper we address
                    the issue of pitch accent optionality at different
                    levels. First, we propose a simple mathematical
                    definition of intra-speaker optionality which allows us
                    to introduce a function for evaluating pitch accent
                    predictors that we show to be more accurate and robust
                    than those used in previous work. Subsequently we
                    compare a pitch accent predictor trained on single
                    speaker data with a predictor trained on multi-speaker
                    data in order to highlight the large overlap between
                    intra-speaker and inter-speaker optionality. Finally,
                    we show our successful results in predicting
                    intra-speaker optionality and suggest how this
                    achievement could be exploited to improve the
                    performance of a unit selection text-to-speech (TTS)
                    synthesis system.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ssw6_252.pdf},
  year = 2007
}
@article{beaver:07,
  author = {David Beaver and Brady Zack Clark and Edward Flemming
                   and T. Florian Jaeger and Maria Wolters},
  title = {When Semantics meets Phonetics: {A}coustical studies
                   of second occurrence focus},
  journal = {Language},
  volume = 83,
  number = 2,
  pages = {245--276},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/BeaverLanguage2007.pdf},
  year = 2007
}
@inproceedings{wolters-ssw:07,
  author = {Maria Wolters and Pauline Campbell and Christine
                   DePlacido and Amy Liddell and David Owens},
  title = {Making Synthetic Speech Accessible to Older People},
  booktitle = {Proc. Sixth ISCA Workshop on Speech Synthesis, Bonn,
                   Germany },
  abstract = {In this paper, we report on an experiment that tested
                   users’ ability to understand the content of spoken
                   auditory reminders. Users heard meeting reminders and
                   medication reminders spoken in both a natural and a
                   synthetic voice. Our results show that older users can
                   understand synthetic speech as well as younger users
                   provided that the prompt texts are well-designed, using
                   familiar words and contextual cues. As soon as
                   unfamiliar and complex words are introduced, users’
                   hearing affects how well they can understand the
                   synthetic voice, even if their hearing would pass
                   common screening tests for speech synthesis
                   experiments. Although hearing thresholds correlate best
                   with users’ performance, central auditory processing
                   may also influence performance, especially when complex
                   errors are made.},
  categories = {},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/WoltersetalSSW2007.pdf},
  year = 2007
}
@inproceedings{hirai07:5ms2007,
  author = {Toshio Hirai and Junichi Yamagishi and Seiichi Tenpaku},
  title = {Utilization of an {HMM}-Based Feature Generation
                   Module in 5 ms Segment Concatenative Speech Synthesis},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  abstract = {If a concatenative speech synthesis system uses
                    shorter speech segments, its potential to generate
                    natural speech increases because the concatenation
                    variation becomes greater. Recently, a synthesis
                    approach was proposed in which very short (5 ms)
                    segments are used. In this paper, we describe the
                    integration of an HMM-based feature generation module,
                    which has the advantage of modularity, into a very
                    short segment concatenative synthesis system, and
                    report a synthesis experiment.},
  categories = {speech synthesis, HTS, hybrid algorithm},
  month = aug,
  year = 2007
}
@incollection{dielmann-mlmi06,
  author = {A. Dielmann and S. Renals},
  title = {Automatic Dialogue Act Recognition using a Dynamic
                   {Bayesian} Network},
  booktitle = {Proc. Multimodal Interaction and Related Machine
                   Learning Algorithms Workshop (MLMI--06)},
  publisher = {Springer},
  editor = {S. Renals and S. Bengio and J. Fiscus},
  pages = {178--189},
  abstract = {We propose a joint segmentation and classification
                   approach for the dialogue act recognition task on
                   natural multi-party meetings ({ICSI} Meeting Corpus).
                   Five broad DA categories are automatically recognised
                   using a generative Dynamic {Bayesian} Network based
                   infrastructure. Prosodic features and a switching
                   graphical model are used to estimate DA boundaries, in
                   conjunction with a factored language model which is
                   used to relate words and DA categories. This easily
                   generalizable and extensible system promotes a rational
                   approach to the joint DA segmentation and recognition
                   task, and is capable of good recognition performance.},
  categories = {ami,dialogue act,dbn,factored language
                   model,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-mlmi06.pdf},
  year = 2007
}
@article{frankel07:factoring,
  author = {Frankel, J. and King, S.},
  title = {Factoring {G}aussian Precision Matrices for Linear
                   Dynamic Models},
  journal = {Pattern Recognition Letters},
  volume = {28},
  number = {16},
  pages = {2264--2272},
  abstract = {The linear dynamic model (LDM), also known as the
                   Kalman filter model, has been the subject of research
                   in the engineering, control, and more recently, machine
                   learning and speech technology communities. The
                   Gaussian noise processes are usually assumed to have
                   diagonal, or occasionally full, covariance matrices. A
                   number of recent papers have considered modelling the
                   precision rather than covariance matrix of a Gaussian
                   distribution, and this work applies such ideas to the
                    LDM. A Gaussian precision matrix P can be factored
                    into the form P = U^T S U, where U is a transform and S
                    a diagonal matrix. By varying the form of U, the
                   covariance can be specified as being diagonal or full,
                   or used to model a given set of spatial dependencies.
                   Furthermore, the transform and scaling components can
                   be shared between models, allowing richer distributions
                   with only marginally more parameters than required to
                   specify diagonal covariances. The method described in
                   this paper allows the construction of models with an
                   appropriate number of parameters for the amount of
                   available training data. We provide illustrative
                   experimental results on synthetic and real speech data
                   in which models with factored precision matrices and
                   automatically-selected numbers of parameters are as
                   good as or better than models with diagonal covariances
                   on small data sets and as good as models with full
                   covariance matrices on larger data sets.},
  categories = {LDM},
  doi = {10.1016/j.patrec.2007.07.008},
  month = {December},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_LDM_covar.pdf},
  year = 2007
}
@inproceedings{cetin07:crosslingual,
  author = {Çetin, Ö. and Magimai-Doss, M. and Kantor, A. and
                   King, S. and Bartels, C. and Frankel, J. and Livescu,
                   K.},
  title = {Monolingual and crosslingual comparison of tandem
                   features derived from articulatory and phone {MLP}s},
  booktitle = {Proc. ASRU},
  address = {Kyoto},
  organization = {IEEE},
  abstract = {In recent years, the features derived from posteriors
                   of a multilayer perceptron (MLP), known as tandem
                   features, have proven to be very effective for
                   automatic speech recognition. Most tandem features to
                   date have relied on MLPs trained for phone
                   classification. We recently showed on a relatively
                   small data set that MLPs trained for articulatory
                   feature classification can be equally effective. In
                   this paper, we provide a similar comparison using MLPs
                   trained on a much larger data set - 2000 hours of
                   English conversational telephone speech. We also
                   explore how portable phone- and articulatory feature-
                   based tandem features are in an entirely different
                   language - Mandarin - without any retraining. We find
                   that while phone-based features perform slightly better
                   in the matched-language condition, they perform
                   significantly better in the cross-language condition.
                   Yet, in the cross-language condition, neither approach
                   is as effective as the tandem features extracted from
                   an MLP trained on a relatively small amount of
                   in-domain data. Beyond feature concatenation, we also
                   explore novel observation modelling schemes that allow
                   for greater flexibility in combining the tandem and
                   standard features at hidden Markov model (HMM) outputs.},
  month = {December},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Cetin_etal_ASRU2007.pdf},
  year = 2007
}
@inproceedings{clark:podsiadlo:mayo:king:blizzard2007,
  author = {Robert A. J. Clark and Monika Podsiadlo and Mark
                   Fraser and Catherine Mayo and Simon King },
  title = {Statistical Analysis of the {B}lizzard {C}hallenge
                   2007 Listening Test Results },
  booktitle = {Proc. Blizzard 2007 (in Proc. Sixth {ISCA} Workshop on
                   Speech Synthesis)},
  address = {Bonn, Germany},
  abstract = {Blizzard 2007 is the third Blizzard Challenge, in
                   which participants build voices from a common dataset.
                   A large listening test is conducted which allows
                   comparison of systems in terms of naturalness and
                   intelligibility. New sections were added to the
                   listening test for 2007 to test the perceived
                   similarity of the speaker's identity between natural
                   and synthetic speech. In this paper, we present the
                   results of the listening test and the subsequent
                   statistical analysis. },
  categories = {blizzard,listening test},
  keywords = {Blizzard},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_003.pdf},
  year = 2007
}
@inproceedings{wolters-interspeech:07,
  author = {Maria Wolters and Pauline Campbell and Christine
                   DePlacido and Amy Liddell and David Owens},
  title = {The Role of Outer Hair Cell Function in the Perception
                   of Synthetic versus Natural Speech},
  booktitle = {Proc. Interspeech},
  abstract = {Hearing loss as assessed by pure-tone audiometry (PTA)
                   is significantly correlated with the intelligibility of
                   synthetic speech. However, PTA is a subjective
                   audiological measure that assesses the entire auditory
                   pathway and does not discriminate between the different
                   afferent and efferent contributions. In this paper, we
                   focus on one particular aspect of hearing that has been
                   shown to correlate with hearing loss: outer hair cell
                   (OHC) function. One role of OHCs is to increase
                   sensitivity and frequency selectivity. This function of
                   OHCs can be assessed quickly and objectively through
                   otoacoustic emissions (OAE) testing, which is little
                   known outside the field of audiology. We find that OHC
                   function affects the perception of human speech, but
                   not that of synthetic speech. This has important
                   implications not just for audiological and
                   electrophysiological research, but also for adapting
                   speech synthesis to ageing ears.},
  categories = {},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/WoltersetalInterspeech2007.pdf},
  year = 2007
}
@article{frankel07:AF_DBN,
  author = {Frankel, J. and Wester, M. and King, S.},
  title = {Articulatory feature recognition using dynamic
                   {B}ayesian networks},
  journal = {Computer Speech \& Language},
  volume = {21},
  number = {4},
  pages = {620--640},
  abstract = {We describe a dynamic Bayesian network for
                   articulatory feature recognition. The model is intended
                   to be a component of a speech recognizer that avoids
                   the problems of conventional ``beads-on-a-string''
                   phoneme-based models. We demonstrate that the model
                   gives superior recognition of articulatory features
                    from the speech signal compared with a state-of-the-art
                    neural network system. We also introduce a training
                   algorithm that offers two major advances: it does not
                   require time-aligned feature labels and it allows the
                   model to learn a set of asynchronous feature changes in
                   a data-driven manner.},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_etal_CSL2007.pdf},
  year = 2007
}
@article{JunichiHTS06,
  author = {Junichi Yamagishi and Takao Kobayashi},
  title = {Average-Voice-based Speech Synthesis using HSMM-based
                    Speaker Adaptation and Adaptive Training},
  journal = {IEICE Trans. Information and Systems},
  volume = {E90-D},
  number = 2,
  pages = {533--543},
  abstract = {In speaker adaptation for speech synthesis, it is
                   desirable to convert both voice characteristics and
                   prosodic features such as F0 and phone duration. For
                   simultaneous adaptation of spectrum, F0 and phone
                   duration within the HMM framework, we need to transform
                   not only the state output distributions corresponding
                   to spectrum and F0 but also the duration distributions
                   corresponding to phone duration. However, it is not
                   straightforward to adapt the state duration because the
                   original HMM does not have explicit duration
                   distributions. Therefore, we utilize the framework of
                   the hidden semi-Markov model (HSMM), which is an HMM
                   having explicit state duration distributions, and we
                   apply an HSMM-based model adaptation algorithm to
                   simultaneously transform both the state output and
                   state duration distributions. Furthermore, we propose
                   an HSMM-based adaptive training algorithm to
                   simultaneously normalize the state output and state
                   duration distributions of the average voice model. We
                   incorporate these techniques into our HSMM-based speech
                   synthesis system, and show their effectiveness from the
                   results of subjective and objective evaluation tests.},
  month = feb,
  year = 2007
}
@inproceedings{Cetin07:tandem,
  author = {Çetin, Ö. and Kantor, A. and King, S. and Bartels,
                   C. and Magimai-Doss, M. and Frankel, J. and Livescu, K.},
  title = {An articulatory feature-based tandem approach and
                   factored observation modeling},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  abstract = {The so-called tandem approach, where the posteriors of
                   a multilayer perceptron (MLP) classifier are used as
                   features in an automatic speech recognition (ASR)
                   system has proven to be a very effective method. Most
                    tandem approaches to date have relied on MLPs
                   trained for phone classification, and appended the
                   posterior features to some standard feature hidden
                   Markov model (HMM). In this paper, we develop an
                   alternative tandem approach based on MLPs trained for
                   articulatory feature (AF) classification. We also
                   develop a factored observation model for characterizing
                   the posterior and standard features at the HMM outputs,
                   allowing for separate hidden mixture and state-tying
                   structures for each factor. In experiments on a subset
                    of Switchboard, we show that the AF-based tandem
                   approach is as effective as the phone-based approach,
                   and that the factored observation model significantly
                   outperforms the simple feature concatenation approach
                   while using fewer parameters.},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Cetin_icassp07_tandem.pdf},
  year = 2007
}
@inproceedings{fraser:king:blizzard2007,
  author = {Mark Fraser and Simon King},
  title = {The {B}lizzard {C}hallenge 2007},
  booktitle = {Proc. Blizzard 2007 (in Proc. Sixth ISCA Workshop on
                   Speech Synthesis)},
  address = {Bonn, Germany},
  abstract = {In Blizzard 2007, the third Blizzard Challenge,
                   participants were asked to build voices from a dataset,
                   a defined subset and, following certain constraints, a
                   subset of their choice. A set of test sentences was
                   then released to be synthesised. An online evaluation
                   of the submitted synthesised sentences focused on
                    naturalness and intelligibility, and added new sections
                    for degree of similarity to the original speaker,
                   and similarity in terms of naturalness of pairs of
                   sentences from different systems. We summarise this
                   year's Blizzard Challenge and look ahead to possible
                   designs for Blizzard 2008 in the light of participant
                   and listener feedback. },
  categories = {blizzard, listening test},
  keywords = {Blizzard},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_001.pdf},
  year = 2007
}
@article{nose07:mrhsmm,
  author = {Takashi Nose and Junichi Yamagishi and Takao Kobayashi},
  title = {A Style Control Technique for {HMM}-based Expressive
                   Speech Synthesis},
  journal = {IEICE Trans. Information and Systems},
  volume = {E90-D},
  number = 9,
  pages = {1406--1413},
  abstract = {This paper describes a technique for controlling the
                   degree of expressivity of a desired emotional
                   expression and/or speaking style of synthesized speech
                   in an HMM-based speech synthesis framework. With this
                   technique, multiple emotional expressions and speaking
                   styles of speech are modeled in a single model by using
                   a multiple-regression hidden semi-Markov model
                   (MRHSMM). A set of control parameters, called the style
                   vector, is defined, and each speech synthesis unit is
                   modeled by using the MRHSMM, in which mean parameters
                   of the state output and duration distributions are
                   expressed by multiple-regression of the style vector.
                   In the synthesis stage, the mean parameters of the
                   synthesis units are modified by transforming an
                   arbitrarily given style vector that corresponds to a
                   point in a low-dimensional space, called style space,
                   each of whose coordinates represents a certain specific
                   speaking style or emotion of speech. The results of
                   subjective evaluation tests show that style and its
                   intensity can be controlled by changing the style
                    vector.},
  categories = {HMM-based speech synthesis, speaking style, emotional
                   expression, style interpolation, hidden semi-Markov
                   model (HSMM)},
  month = sep,
  url = {http://search.ieice.org/bin/summary.php?id=e90-d_9_1406&category=D&lang=E&year=2007&abst=},
  year = 2007
}
@incollection{huang2007-mlmi,
  author = {Huang, Songfang and Renals, Steve},
  title = {Modeling Prosodic Features in Language Models for
                   Meetings},
  booktitle = {Machine Learning for Multimodal Interaction IV},
  publisher = {Springer},
  editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
  volume = {4892},
  series = {Lecture Notes in Computer Science},
  pages = {191--202},
  abstract = {Prosody has been actively studied as an important
                   knowledge source for speech recognition and
                   understanding. In this paper, we are concerned with the
                   question of exploiting prosody for language models to
                   aid automatic speech recognition in the context of
                   meetings. Using an automatic syllable detection
                   algorithm, the syllable-based prosodic features are
                   extracted to form the prosodic representation for each
                   word. Two modeling approaches are then investigated.
                   One is based on a factored language model, which
                   directly uses the prosodic representation and treats it
                   as a `word'. Instead of direct association, the second
                   approach provides a richer probabilistic structure
                   within a hierarchical Bayesian framework by introducing
                   an intermediate latent variable to represent similar
                   prosodic patterns shared by groups of words. Four-fold
                   cross-validation experiments on the ICSI Meeting Corpus
                   show that exploiting prosody for language modeling can
                    significantly reduce the perplexity, and also yield
                    marginal reductions in word error rate.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/mlmi07.pdf},
  year = 2007
}
@inproceedings{livescu07:JHU_summary,
  author = {Livescu, K. and Çetin, Ö. and Hasegawa-Johnson, M.
                   and King, S. and Bartels, C. and Borges, N. and Kantor,
                    A. and Lal, P. and Yung, L. and Bezman, A. and
                    Dawson-Haggerty, S. and Woods, B. and Frankel, J. and
                   Magimai-Doss, M. and Saenko, K.},
  title = {Articulatory feature-based methods for acoustic and
                   audio-visual speech recognition: {S}ummary from the
                   2006 {JHU} {S}ummer {W}orkshop},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  abstract = {We report on investigations, conducted at the 2006
                    Johns Hopkins Workshop, into the use of articulatory
                   features (AFs) for observation and pronunciation models
                   in speech recognition. In the area of observation
                    modeling, we use the outputs of AF classifiers both
                   directly, in an extension of hybrid HMM/neural network
                   models, and as part of the observation vector, an
                   extension of the tandem approach. In the area of
                   pronunciation modeling, we investigate a model having
                   multiple streams of AF states with soft synchrony
                   constraints, for both audio-only and audio-visual
                   recognition. The models are implemented as dynamic
                   Bayesian networks, and tested on tasks from the
                   Small-Vocabulary Switchboard (SVitchboard) corpus and
                   the CUAVE audio-visual digits corpus. Finally, we
                    analyze AF classification and forced alignment using a
                   newly collected set of feature-level manual
                   transcriptions.},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/livescu_icassp07_sum.pdf},
  year = 2007
}
@inproceedings{avss-icassp07,
  author = {J. Yamagishi and T. Kobayashi and M. Tachibana and K.
                   Ogata and Y. Nakano},
  title = {Model adaptation approach to speech synthesis with
                   diverse voices and styles},
  booktitle = {Proc. ICASSP},
  pages = {1233--1236},
  abstract = {In human computer interaction and dialogue systems, it
                   is often desirable for text-to-speech synthesis to be
                   able to generate natural sounding speech with an
                    arbitrary speaker's voice and with varying speaking
                   styles and/or emotional expressions. We have developed
                   an average-voice-based speech synthesis method using
                   statistical average voice models and model adaptation
                   techniques for this purpose. In this paper, we describe
                   an overview of the speech synthesis system and show the
                   current performance with several experimental results.},
  year = 2007
}
@inproceedings{jaimes2007,
  author = {Jaimes, Alejandro and Bourlard, Hervé and Renals,
                   Steve and Carletta, Jean},
  title = {Recording, Indexing, Summarizing, and Accessing
                   Meeting Videos: An Overview of the {AMI} Project},
  booktitle = {Proc. IEEE ICIAPW},
  pages = {59--64},
  abstract = {In this paper we give an overview of the AMI project.
                   AMI developed the following: (1) an infrastructure for
                   recording meetings using multiple microphones and
                   cameras; (2) a one hundred hour, manually annotated
                   meeting corpus; (3) a number of techniques for
                   indexing, and summarizing of meeting videos using
                   automatic speech recognition and computer vision, and
                   (4) an extensible framework for browsing, and searching
                   of meeting videos. We give an overview of the various
                   techniques developed in AMI, their integration into our
                   meeting browser framework, and future plans for AMIDA
                   (Augmented Multiparty Interaction with Distant Access),
                   the follow-up project to AMI.},
  doi = {10.1109/ICIAPW.2007.36},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/jaimes2007.pdf},
  url = {http://ieeexplore.ieee.org/search/srchabstract.jsp?arnumber=4427477&isnumber=4427459&punumber=4427458&k2dockey=4427477@ieeecnfs&query=%28+%28%28renals%29%3Cin%3Eau+%29+%29+%3Cand%3E+%28pyr+%3E%3D+2006+%3Cand%3E+pyr+%3C%3D+2008%29&pos=6&access=no},
  year = 2007
}
@inproceedings{zen07:hts-2,
  author = {Heiga Zen and Takashi Nose and Junichi Yamagishi and
                   Shinji Sako and Takashi Masuko and Alan Black and
                   Keiichi Tokuda},
  title = {The {HMM}-based speech synthesis system ({HTS})
                   version 2.0},
  booktitle = {Proc. 6th ISCA Workshop on Speech Synthesis (SSW-6)},
  abstract = {A statistical parametric speech synthesis system based
                   on hidden Markov models (HMMs) has grown in popularity
                   over the last few years. This system simultaneously
                   models spectrum, excitation, and duration of speech
                   using context-dependent HMMs and generates speech
                   waveforms from the HMMs themselves. Since December
                   2002, we have publicly released an open-source software
                   toolkit named HMM-based speech synthesis system (HTS)
                   to provide a research and development platform for the
                   speech synthesis community. In December 2006, HTS
                   version 2.0 was released. This version includes a
                   number of new features which are useful for both speech
                   synthesis researchers and developers. This paper
                   describes HTS version 2.0 in detail, as well as future
                   release plans.},
  categories = {HMM, speech synthesis, HTS},
  month = aug,
  year = 2007
}
@inproceedings{renals2007,
  author = {Renals, Steve and Hain, Thomas and Bourlard, Hervé},
  title = {Recognition and interpretation of meetings: The {AMI}
                   and {AMIDA} projects},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
                   and Understanding (ASRU '07)},
  abstract = {The AMI and AMIDA projects are concerned with the
                   recognition and interpretation of multiparty meetings.
                   Within these projects we have: developed an
                   infrastructure for recording meetings using multiple
                   microphones and cameras; released a 100 hour annotated
                   corpus of meetings; developed techniques for the
                   recognition and interpretation of meetings based
                   primarily on speech recognition and computer vision;
                   and developed an evaluation framework at both component
                   and system levels. In this paper we present an overview
                   of these projects, with an emphasis on speech
                   recognition and content extraction. },
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ami-asru2007.pdf},
  year = 2007
}
@inproceedings{dielmann-icassp07,
  author = {A. Dielmann and S. Renals},
  title = {{DBN} based joint Dialogue Act recognition of
                   multiparty meetings},
  booktitle = {Proc. IEEE ICASSP},
  volume = 4,
  pages = {133--136},
  abstract = {Joint Dialogue Act segmentation and classification of
                   the new {AMI} meeting corpus has been performed through
                   an integrated framework based on a switching dynamic
                   {Bayesian} network and a set of continuous features and
                   language models. The recognition process is based on a
                   dictionary of 15 {DA} classes tailored for group
                   decision-making. Experimental results show that a novel
                   interpolated Factored Language Model results in a low
                   error rate on the automatic segmentation task, and thus
                   good recognition results can be achieved on {AMI}
                   multiparty conversational speech.},
  categories = {ami,dialogue act,dbn,factored language
                   model,meetings,edinburgh},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-icassp07.pdf},
  year = 2007
}
@inproceedings{tachibana07:styleclassify07,
  author = {Makoto Tachibana and Keigo Kawashima and Junichi
                   Yamagishi and Takao Kobayashi},
  title = {Performance Evaluation of {HMM}-Based Style
                   Classification with a Small Amount of Training Data},
  booktitle = {Proc. Interspeech 2007},
  abstract = {This paper describes a classification technique for
                   emotional expressions and speaking styles of speech
                   using only a small amount of training data of a target
                   speaker. We model spectral and fundamental frequency
                   (F0) features simultaneously using multi-space
                   probability distribution HMM (MSD-HMM), and adapt a
                   speaker-independent neutral style model to a certain
                   target speaker’s style model with a small amount of
                    data using MSD-MLLR, which is an extension of MLLR for
                    MSD-HMM. We perform classification experiments for
                    professional narrators’ speech and non-professional
                    speakers' speech and evaluate the performance of the
                    proposed technique by comparing it with other commonly
                    used classifiers. We show that the proposed technique
                    gives better results than the other classifiers when
                    using only a few sentences of the target speaker’s
                    style data.},
  categories = {emotion, speaking style, classification},
  month = aug,
  year = 2007
}
@inproceedings{huang2007-asru,
  author = {Huang, Songfang and Renals, Steve},
  title = {Hierarchical {Pitman-Yor} Language Models for {ASR} in
                   Meetings},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition
                   and Understanding (ASRU'07)},
  pages = {124--129},
  address = {Kyoto, Japan},
  abstract = {In this paper we investigate the application of a
                   novel technique for language modeling --- a
                   hierarchical Bayesian language model (LM) based on the
                   Pitman-Yor process --- on automatic speech recognition
                   (ASR) for multiparty meetings. The hierarchical
                   Pitman-Yor language model (HPYLM), which was originally
                   proposed in the machine learning field, provides a
                   Bayesian interpretation to language modeling. An
                   approximation to the HPYLM recovers the exact
                   formulation of the interpolated Kneser-Ney smoothing
                   method in n-gram models. This paper focuses on the
                   application and scalability of HPYLM on a practical
                   large vocabulary ASR system. Experimental results on
                   NIST RT06s evaluation meeting data verify that HPYLM is
                   a competitive and promising language modeling
                   technique, which consistently performs better than
                   interpolated Kneser-Ney and modified Kneser-Ney n-gram
                   LMs in terms of both perplexity (PPL) and word error
                   rate (WER).},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/asru07.pdf},
  year = 2007
}
@inproceedings{owens-efas:07,
  author = {David Owens and Pauline Campbell and Amy Liddell and
                   Christine DePlacido and Maria Wolters},
  title = {Random Gap Detection Threshold: A Useful Measure of
                   Auditory Ageing?},
  booktitle = {Proc. Europ. Cong. Fed. Audiol. Heidelberg, Germany},
  abstract = {},
  categories = {},
  month = jun,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Owensetal2007EFAS.pdf},
  year = 2007
}
@misc{Hofer_Shimodaira:sigg:2007,
  author = {Gregor Hofer and Hiroshi Shimodaira and Junichi
                   Yamagishi},
  title = {Speech-driven Head Motion Synthesis based on a
                   Trajectory Model},
  howpublished = {Poster at Siggraph 2007},
  address = {San Diego, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/siggraph07.pdf},
  year = 2007
}
@inproceedings{strom:etal:interspeech2007,
  author = {Volker Strom and Ani Nenkova and Robert Clark and
                   Yolanda Vazquez-Alvarez and Jason Brenier and Simon
                   King and Dan Jurafsky},
  title = {Modelling Prominence and Emphasis Improves
                   Unit-Selection Synthesis},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  abstract = {We describe the results of large scale perception
                   experiments showing improvements in synthesising two
                   distinct kinds of prominence: standard pitch-accent and
                   strong emphatic accents. Previously prominence
                   assignment has been mainly evaluated by computing
                   accuracy on a prominence-labelled test set. By contrast
                   we integrated an automatic pitch-accent classifier into
                   the unit selection target cost and showed that
                   listeners preferred these synthesised sentences. We
                   also describe an improved recording script for
                   collecting emphatic accents, and show that generating
                   emphatic accents leads to further improvements in the
                   fiction genre over incorporating pitch accent only.
                   Finally, we show differences in the effects of
                   prominence between child-directed speech and news and
                    fiction genres.},
  categories = {speech synthesis, prosody, prominence, pitch accent,
                    unit selection},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/p540.pdf},
  year = 2007
}
@inproceedings{murray2007-interspeech,
  author = {Murray, Gabriel and Renals, Steve},
  title = {Towards online speech summarization},
  booktitle = {Proc. Interspeech '07},
  abstract = {The majority of speech summarization research has
                    focused on extracting the most informative dialogue
                    acts from recorded, archived data. However, a potential
                    use case for speech summarization in the meetings
                    domain is to facilitate a meeting in progress by
                    providing the participants - whether they are attending
                    in-person or remotely - with an indication of the most
                    important parts of the discussion so far. This requires
                    being able to determine whether a dialogue act is
                    extract-worthy before the global meeting context is
                    available. This paper introduces a novel method for
                    weighting dialogue acts using only very limited local
                    context, and shows that high summary precision is
                    possible even when information about the meeting as a
                    whole is lacking. A new evaluation framework consisting
                    of weighted precision, recall and f-score is detailed,
                    and the novel online summarization method is shown to
                    significantly increase recall and f-score compared with
                    a method using no contextual information.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/IS070966.PDF},
  year = 2007
}
@inproceedings{nenkova:07,
  author = {Nenkova, Ani and Jason Brenier and Anubha Kothari and
                   Sasha Calhoun and Laura Whitton and David Beaver and
                   Dan Jurafsky},
  title = {To Memorize or to Predict: Prominence labeling in
                   Conversational Speech},
  booktitle = {NAACL Human Language Technology Conference},
  address = {Rochester, NY},
  abstract = {The immense prosodic variation of natural
                   conversational speech makes it challenging to predict
                   which words are prosodically prominent in this genre.
                   In this paper, we examine a new feature, accent ratio,
                   which captures how likely it is that a word will be
                   realized as prominent or not. We compare this feature
                    with traditional accent-prediction features (based on
                   part of speech and N-grams) as well as with several
                   linguistically motivated and manually labeled
                   information structure features, such as whether a word
                   is given, new, or contrastive. Our results show that
                   the linguistic features do not lead to significant
                   improvements, while accent ratio alone can yield
                   prediction performance almost as good as the
                   combination of any other subset of features. Moreover,
                   this feature is useful even across genres; an
                   accent-ratio classifier trained only on conversational
                   speech predicts prominence with high accuracy in
                   broadcast news. Our results suggest that carefully
                   chosen lexicalized features can outperform less
                   fine-grained features.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/nenkovaetalhlt07.pdf},
  year = 2007
}
@inproceedings{leo_07-2,
  author = {Matthew P. Aylett and J. Sebastian Andersson and
                   Leonardo Badino and Christopher J. Pidcock},
  title = {The {C}erevoice {B}lizzard Entry 2007: Are Small
                   Database Errors Worse than Compression Artifacts?},
  booktitle = {Proc. Blizzard Challenge Workshop 2007},
  address = {Bonn, Germany},
  abstract = {In commercial systems the memory footprint of unit
                   selection systems is often a key issue. This is
                   especially true for PDAs and other embedded devices. In
                    this year's Blizzard entry CereProc gave itself the
                    criterion that the full database system entered would
                   have a smaller memory footprint than either of the two
                   smaller database entries. This was accomplished by
                   applying Speex speech compression to the full database
                   entry. In turn a set of small database techniques used
                   to improve the quality of small database systems in
                    last year's entry were extended. Finally, for all
                   systems, two quality control methods were applied to
                   the underlying database to improve the lexicon and
                   transcription match to the underlying data. Results
                   suggest that mild audio quality artifacts introduced by
                   lossy compression have almost as much impact on MOS
                   perceived quality as concatenation errors introduced by
                   sparse data in the smaller systems with bulked
                   diphones.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_004.pdf},
  year = 2007
}
@article{frankel07:ldm,
  author = {Frankel, J. and King, S.},
  title = {Speech Recognition using Linear Dynamic Models},
  journal = {IEEE {T}ransactions on {S}peech and {A}udio
                   {P}rocessing},
  volume = 15,
  number = 1,
  pages = {246--256},
  abstract = {The majority of automatic speech recognition (ASR)
                   systems rely on hidden Markov models, in which Gaussian
                   mixtures model the output distributions associated with
                   sub-phone states. This approach, whilst successful,
                   models consecutive feature vectors (augmented to
                   include derivative information) as statistically
                   independent. Furthermore, spatial correlations present
                   in speech parameters are frequently ignored through the
                   use of diagonal covariance matrices. This paper
                   continues the work of Digalakis and others who proposed
                   instead a first-order linear state-space model which
                   has the capacity to model underlying dynamics, and
                   furthermore give a model of spatial correlations. This
                   paper examines the assumptions made in applying such a
                   model and shows that the addition of a hidden dynamic
                   state leads to increases in accuracy over otherwise
                   equivalent static models. We also propose a
                   time-asynchronous decoding strategy suited to
                   recognition with segment models. We describe
                   implementation of decoding for linear dynamic models
                   and present TIMIT phone recognition results.},
  categories = {am,asr,ldm,timit,search,edinburgh},
  month = {January},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_King_IEEE2007.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/Frankel_King_IEEE2007.ps},
  year = 2007
}
@inproceedings{livescu07:manual,
  author = {Livescu, K. and Bezman, A. and Borges, N. and Yung, L.
                   and Çetin, Ö. and Frankel, J. and King, S. and
                   Magimai-Doss, M. and Chi, X. and Lavoie, L.},
  title = {Manual transcription of conversational speech at the
                   articulatory feature level},
  booktitle = {Proc. ICASSP},
  address = {Honolulu},
  abstract = {We present an approach for the manual labeling of
                   speech at the articulatory feature level, and a new set
                   of labeled conversational speech collected using this
                   approach. A detailed transcription, including
                   overlapping or reduced gestures, is useful for studying
                   the great pronunciation variability in conversational
                   speech. It also facilitates the testing of feature
                    classifiers, such as those used in articulatory
                   approaches to automatic speech recognition. We describe
                   an effort to transcribe a small set of utterances drawn
                   from the Switchboard database using eight articulatory
                   tiers. Two transcribers have labeled these utterances
                   in a multi-pass strategy, allowing for correction of
                   errors. We describe the data collection methods and
                   analyze the data to determine how quickly and reliably
                   this type of transcription can be done. Finally, we
                   demonstrate one use of the new data set by testing a
                    set of multilayer perceptron feature classifiers against
                   both the manual labels and forced alignments.},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/livescu_icassp07_trans.pdf},
  year = 2007
}
@incollection{murray2007-mlmi,
  author = {Murray, Gabriel and Renals, Steve},
  title = {Term-weighting for summarization of multi-party spoken
                   dialogues},
  booktitle = {Machine Learning for Multimodal Interaction IV },
  publisher = {Springer},
  editor = {Popescu-Belis, A. and Renals, S. and Bourlard, H.},
  volume = {4892},
  series = {Lecture Notes in Computer Science},
  pages = {155--166},
  abstract = {This paper explores the issue of term-weighting in the
                   genre of spontaneous, multi-party spoken dialogues,
                   with the intent of using such term-weights in the
                   creation of extractive meeting summaries. The field of
                   text information retrieval has yielded many
                    term-weighting techniques to import for our purposes;
                   this paper implements and compares several of these,
                   namely tf.idf, Residual IDF and Gain. We propose that
                   term-weighting for multi-party dialogues can exploit
                    patterns in word usage among participant speakers,
                   and introduce the su.idf metric as one attempt to do
                   so. Results for all metrics are reported on both manual
                   and automatic speech recognition (ASR) transcripts, and
                   on both the ICSI and AMI meeting corpora. },
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/48920155.pdf},
  year = 2007
}
@inproceedings{liddell-efas:07,
  author = {Amy Liddell and David Owens and Pauline Campbell and
                   Christine DePlacido and Maria Wolters},
  title = {Can Extended High Frequency Hearing Thresholds be Used
                   to Detect Auditory Processing Difficulties in an Ageing
                   Population?},
  booktitle = {Proc. Europ. Cong. Fed. Audiol. Heidelberg, Germany},
  abstract = {},
  categories = {},
  month = jun,
  year = 2007
}
@inproceedings{Hofer_Shimodaira:proc:2007,
  author = {Gregor Hofer and Hiroshi Shimodaira},
  title = {Automatic Head Motion Prediction from Speech Data},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  abstract = {In this paper we present a novel approach to generate
                   a sequence of head motion units given some speech. The
                   modelling approach is based on the notion that head
                   motion can be divided into a number of short
                   homogeneous units that can each be modelled
                   individually. The system is based on Hidden Markov
                   Models (HMM), which are trained on motion units and act
                   as a sequence generator. They can be evaluated by an
                   accuracy measure. A database of motion capture data was
                   collected and manually annotated for head motion and is
                   used to train the models. It was found that the model
                   is good at distinguishing high activity regions from
                   regions with less activity with accuracies around 75
                   percent. Furthermore the model is able to distinguish
                   different head motion patterns based on speech features
                   somewhat reliably, with accuracies reaching almost 70
                   percent.},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/interspeech07.pdf},
  year = 2007
}
@misc{Hofer_Shimodaira:sca:2007,
  author = {Gregor Hofer and Hiroshi Shimodaira and Junichi
                   Yamagishi},
  title = {Lip motion synthesis using a context dependent
                   trajectory hidden {M}arkov model},
  howpublished = {Poster at SCA 2007},
  address = {San Diego, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sca07.pdf},
  year = 2007
}
@inproceedings{richmond2007_nolisp,
  author = {Richmond, K.},
  title = {Trajectory Mixture Density Networks With Multiple
                   Mixtures for Acoustic-Articulatory Inversion},
  booktitle = {Advances in Nonlinear Speech Processing, International
                   Conference on Non-Linear Speech Processing, NOLISP 2007},
  editor = {Chetouani, M. and Hussain, A. and Gas, B. and Milgram,
                   M. and Zarader, J.-L.},
  volume = 4885,
  series = {Lecture Notes in Computer Science},
  pages = {263--272},
  publisher = {Springer-Verlag Berlin Heidelberg},
  abstract = {We have previously proposed a trajectory model which
                   is based on a mixture density network (MDN) trained
                   with target variables augmented with dynamic features,
                   together with an algorithm for estimating maximum
                   likelihood trajectories which respects the constraints
                   between those features. In this paper, we have extended
                   that model to allow diagonal covariance matrices and
                   multiple mixture components in the trajectory MDN
                   output probability density functions. We have evaluated
                   this extended model on an inversion mapping task and
                   found the trajectory model works well, outperforming
                   smoothing of equivalent trajectories using low-pass
                   filtering. Increasing the number of mixture components
                   in the TMDN improves results further.},
  categories = {ANN, TMDN, acoustic-articulatory inversion, MOCHA},
  doi = {10.1007/978-3-540-77347-4_23},
  key = {richmond2007_nolisp},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/richmond_nolisp2007.pdf},
  year = 2007
}
@inproceedings{cuayahuitletal_interspeech07,
  author = {Heriberto Cuayáhuitl and Steve Renals and Oliver
                   Lemon and Hiroshi Shimodaira},
  title = {Hierarchical Dialogue Optimization Using Semi-Markov
                   Decision Processes},
  booktitle = {Proc. of INTERSPEECH},
  abstract = {This paper addresses the problem of dialogue
                   optimization on large search spaces. For such a
                   purpose, in this paper we propose to learn dialogue
                   strategies using multiple Semi-Markov Decision
                   Processes and hierarchical reinforcement learning. This
                   approach factorizes state variables and actions in
                   order to learn a hierarchy of policies. Our experiments
                   are based on a simulated flight booking dialogue system
                   and compare flat versus hierarchical reinforcement
                   learning. Experimental results show that the proposed
                   approach produced a dramatic search space reduction
                   (99.36\%), and converged four orders of magnitude
                   faster than flat reinforcement learning with a very
                   small loss in optimality (on average 0.3 system turns).
                   Results also show that the learnt policies
                   outperformed a hand-crafted one under three different
                   conditions of ASR confidence levels. This approach is
                   appealing to dialogue optimization due to faster
                   learning, reusable subsolutions, and scalability to
                   larger problems.},
  categories = {Spoken dialogue systems, semi-Markov decision
                   processes, hierarchical reinforcement learning.},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/SMDPs-interspeech2007.pdf},
  year = 2007
}
@inproceedings{mcgeelennon-icad:07,
  author = {Marilyn McGee-Lennon and Maria Wolters and Tony
                   McBryan},
  title = {Auditory Reminders in the Home},
  booktitle = {Proc. Intl. Conf. Auditory Display (ICAD)},
  address = {Montreal, Canada},
  abstract = {},
  categories = {},
  month = jun,
  year = 2007
}
@inproceedings{cabral07,
  author = {J. Cabral and S. Renals and K. Richmond and J.
                   Yamagishi},
  title = {Towards an Improved Modeling of the Glottal Source in
                   Statistical Parametric Speech Synthesis},
  booktitle = {Proc. of the 6th ISCA Workshop on Speech Synthesis},
  address = {Bonn, Germany},
  abstract = {This paper proposes the use of the Liljencrants-Fant
                   model (LF-model) to represent the glottal source signal
                   in HMM-based speech synthesis systems. These systems
                   generally use a pulse train to model the periodicity of
                   the excitation signal of voiced speech. However, this
                   model produces a strong and uniform harmonic structure
                   throughout the spectrum of the excitation which makes
                   the synthetic speech sound buzzy. The use of a mixed
                   band excitation and phase manipulation reduces this
                   effect but it can result in degradation of the speech
                   quality if the noise component is not weighted
                   carefully. In turn, the LF-waveform has a decaying
                   spectrum at higher frequencies, which is more similar
                   to the real glottal source excitation signal. We
                   conducted a perceptual experiment to test the
                   hypothesis that the LF-model can perform as well as or
                   better than the pulse train in an HMM-based speech
                   synthesizer. In the synthesis, we used the mean values
                   of the LF-parameters, calculated by measurements of the
                   recorded speech. The result of this study is important
                   not only regarding the improvement in speech quality of
                   these types of systems, but also because the LF-model
                   can be used to model many characteristics of the
                   glottal source, such as voice quality, which are
                   important for voice transformation and generation of
                   expressive speech.},
  categories = {LF-model, Statistical parametric speech synthesis,
                   HMM-based speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/cabral07.pdf},
  year = 2007
}
@inproceedings{calhounIS:07,
  author = {Calhoun, Sasha},
  title = {Predicting Focus through Prominence Structure},
  booktitle = {Proceedings of Interspeech},
  address = {Antwerp, Belgium},
  abstract = {Focus is central to our control of information flow in
                   dialogue. Spoken language understanding systems
                   therefore need to be able to detect focus
                   automatically. It is well known that prominence is a
                   key marker of focus in English; however, the
                   relationship is not straightforward. We present focus
                   prediction models built using the NXT Switchboard
                   corpus. We claim that a focus is more likely if a word
                   is more prominent than expected given its syntactic,
                   semantic and discourse properties. Crucially, the
                   perception of prominence arises not only from acoustic
                   cues, but also from the position in prosodic structure. Our
                   focus prediction results, along with a study showing
                   the acoustic properties of focal accents vary by
                   structural position, support our claims. As a largely
                   novel task, these results are an important first step
                   in detecting focus for spoken language applications.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/calhounIS07.pdf},
  year = 2007
}
@inproceedings{richmond2007a,
  author = {Richmond, K.},
  title = {A Multitask Learning Perspective on
                   Acoustic-Articulatory Inversion},
  booktitle = {Proc. Interspeech},
  address = {Antwerp, Belgium},
  abstract = {This paper proposes the idea that by viewing an
                   inversion mapping MLP from a Multitask Learning
                   perspective, we may be able to relax two constraints
                   which are inherent in using electromagnetic
                   articulography as a source of articulatory information
                   for speech technology purposes. As a first step to
                   evaluating this idea, we perform an inversion mapping
                   experiment in an attempt to ascertain whether the
                   hidden layer of a ``multitask'' MLP can act
                   beneficially as a hidden representation that is shared
                   between inversion mapping subtasks for multiple
                   articulatory targets. Our results in the case of the
                   tongue dorsum x-coordinate indicate this is indeed the
                   case and show good promise. Results for the tongue
                   dorsum y-coordinate, however, are not so clear-cut, and
                   will require further investigation.},
  categories = {acoustic-articulatory inversion, MLP, multitask
                   learning},
  key = {richmond2007a},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/is2007_paper.pdf},
  year = 2007
}
@article{clarkrichmondking_specom2007,
  author = {Robert A. J. Clark and Korin Richmond and Simon King},
  title = {Multisyn: Open-domain unit selection for the
                   {F}estival speech synthesis system},
  journal = {Speech Communication},
  volume = 49,
  number = 4,
  pages = {317--330},
  abstract = {We present the implementation and evaluation of an
                   open-domain unit selection speech synthesis engine
                   designed to be flexible enough to encourage further
                   unit selection research and allow rapid voice
                   development by users with minimal speech synthesis
                   knowledge and experience. We address the issues of
                   automatically processing speech data into a usable
                   voice using automatic segmentation techniques and how
                   the knowledge obtained at labelling time can be
                   exploited at synthesis time. We describe target cost
                   and join cost implementation for such a system and
                   describe the outcome of building voices with a number
                   of different sized datasets. We show that, in a
                   competitive evaluation, voices built using this
                   technology compare favourably to other systems.},
  categories = {speech synthesis, festival, multisyn, unitselection},
  doi = {10.1016/j.specom.2007.01.014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/clarkrichmondking_specom2007.pdf},
  year = 2007
}
@inproceedings{bell_king_is2007,
  author = {Bell, Peter and King, Simon},
  title = {Sparse Gaussian Graphical Models for Speech
                   Recognition},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  abstract = {We address the problem of learning the structure of
                   Gaussian graphical models for use in automatic speech
                   recognition, a means of controlling the form of the
                   inverse covariance matrices of such systems. With
                   particular focus on data sparsity issues, we implement
                   a method for imposing graphical model structure on a
                   Gaussian mixture system, using a convex optimisation
                   technique to maximise a penalised likelihood
                   expression. The results of initial experiments on a
                   phone recognition task show a performance improvement
                   over an equivalent full-covariance system.},
  categories = {speech recognition, acoustic models, graphical models,
                   precision matrix models},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sparseGM_is2007.pdf},
  year = 2007
}
@article{dielmann2007-tmm,
  author = {Dielmann, Alfred and Renals, Steve},
  title = {Automatic meeting segmentation using dynamic
                   {Bayesian} networks},
  journal = {IEEE Transactions on Multimedia},
  volume = {9},
  number = {1},
  pages = {25--36},
  abstract = {Multiparty meetings are a ubiquitous feature of
                   organizations, and there are considerable economic
                   benefits that would arise from their automatic analysis
                   and structuring. In this paper, we are concerned with
                   the segmentation and structuring of meetings (recorded
                   using multiple cameras and microphones) into sequences
                   of group meeting actions such as monologue, discussion
                   and presentation. We outline four families of
                   multimodal features based on speaker turns, lexical
                   transcription, prosody, and visual motion that are
                   extracted from the raw audio and video recordings. We
                   relate these low-level features to more complex group
                   behaviors using a multistream modelling framework based
                   on multistream dynamic Bayesian networks (DBNs). This
                   results in an effective approach to the segmentation
                   problem, with an action error rate of 12.2\%,
                   compared with 43\% using an approach based on hidden
                   Markov models. Moreover, the multistream DBN developed
                   here leaves scope for many further improvements and
                   extensions.},
  doi = {10.1109/TMM.2006.886337},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2007/dielmann2007-tmm.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4032598&arnumber=4032608&count=23&index=3},
  year = 2007
}
@inproceedings{jyamagis07:hts2007,
  author = {Junichi Yamagishi and Heiga Zen and Tomoki Toda and
                   Keiichi Tokuda},
  title = {Speaker-Independent {HMM}-based Speech Synthesis
                   System -- {HTS-2007} System for the {Blizzard Challenge
                   2007}},
  booktitle = {Proc. Blizzard Challenge 2007},
  abstract = {This paper describes an HMM-based speech synthesis
                   system developed by the HTS working group for the
                   Blizzard Challenge 2007. To further explore the
                   potential of HMM-based speech synthesis, we incorporate
                   new features in our conventional system which underpin
                   a speaker-independent approach: speaker adaptation
                   techniques; adaptive training for HSMMs; and full
                   covariance modeling using the CSMAPLR transforms.},
  categories = {HMM, speech synthesis, speaker adaptation, HTS,
                   Blizzard Challenge},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007-HTS.pdf},
  year = 2007
}
@inproceedings{AMIsystemICASSP2007,
  author = {T. Hain and L. Burget and J. Dines and G. Garau and M.
                   Karafiat and M. Lincoln and J. Vepa and V. Wan},
  title = {{The {AMI} System for the Transcription of Speech in
                   Meetings}},
  booktitle = {Proc. {ICASSP}},
  abstract = {This paper describes the AMI transcription system for
                   speech in meetings developed in collaboration by five
                   research groups. The system includes generic techniques
                   such as discriminative and speaker adaptive training,
                   vocal tract length normalisation, heteroscedastic
                   linear discriminant analysis, maximum likelihood linear
                   regression, and phone posterior based features, as well
                   as techniques specifically designed for meeting data.
                   These include segmentation and cross-talk suppression,
                   beam-forming, domain adaptation, web-data collection,
                   and channel adaptive training. The system was improved
                   by more than 20\% relative in word error rate compared
                   to our previous system and was used in the NIST RT'06
                   evaluations where it was found to yield competitive
                   performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT06S},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/ICASSP07.AMIasrsystem.pdf},
  year = 2007
}
@inproceedings{richmond2007b,
  author = {Richmond, K. and Strom, V. and Clark, R. and
                   Yamagishi, J. and Fitt, S.},
  title = {Festival Multisyn Voices for the 2007 Blizzard
                   Challenge},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc. SSW6)},
  address = {Bonn, Germany},
  abstract = {This paper describes selected aspects of the Festival
                   Multisyn entry to the Blizzard Challenge 2007. We
                   provide an overview of the process of building the
                   three required voices from the speech data provided.
                   This paper focuses on new features of Multisyn which
                   are currently under development and which have been
                   employed in the system used for this Blizzard
                   Challenge. These differences are the application of a
                   more flexible phonetic lattice representation during
                   forced alignment labelling and the use of a pitch
                   accent target cost component. Finally, we also examine
                   aspects of the speech data provided for this year's
                   Blizzard Challenge and raise certain issues for
                   discussion concerning the aim of comparing voices made
                   with differing subsets of the data provided.},
  categories = {tts, blizzard, multisyn, unit selection},
  key = {richmond2007b},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blizzard2007paper.pdf},
  year = 2007
}
@inproceedings{penner-icphs:07,
  author = {Heike Penner and Nicholas Miller and Maria Wolters},
  title = {Motor Speech Disorders in Three {P}arkinsonian
                   Syndromes: A Comparative Study},
  booktitle = {Proc. Intl. Conf. Phon. Sci.},
  abstract = {},
  categories = {},
  year = 2007
}
@article{king07:JASA2007,
  author = {King, S. and Frankel, J. and Livescu, K. and
                   McDermott, E. and Richmond, K. and Wester, M.},
  title = {Speech production knowledge in automatic speech
                   recognition},
  journal = {Journal of the Acoustical Society of America},
  volume = 121,
  number = 2,
  pages = {723--742},
  abstract = {Although much is known about how speech is produced,
                   and research into speech production has resulted in
                   measured articulatory data, feature systems of
                   different kinds and numerous models, speech production
                   knowledge is almost totally ignored in current
                   mainstream approaches to automatic speech recognition.
                   Representations of speech production allow simple
                   explanations for many phenomena observed in speech
                   which cannot be easily analyzed from either the acoustic
                   signal or the phonetic transcription alone. In this
                   article, we provide a survey of a growing body of work
                   in which such representations are used to improve
                   automatic speech recognition.},
  month = feb,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/King_et_al_review.pdf},
  year = 2007
}