2012.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2012-citations -ob /home/korin/projects/publications/new_output/transitdata/2012.bib -c 'year : "2012"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{swi2012_dnn,
  author = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
  title = {Unsupervised Cross-lingual knowledge transfer in {DNN-based LVCSR}},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address = {Miami, Florida, USA},
  abstract = {We investigate the use of cross-lingual acoustic data to initialise deep neural network (DNN) acoustic models by means
of unsupervised restricted Boltzmann machine (RBM) pretraining.
DNNs for German are pretrained using one or all of German, Portuguese, Spanish and Swedish. The DNNs are used in a tandem configuration, where the network outputs are used as features for a hidden Markov model (HMM) whose
emission densities are modeled by Gaussian mixture models (GMMs), as well as in a hybrid configuration, where the network outputs are used as the HMM state likelihoods. The experiments show that unsupervised pretraining is more crucial
for the hybrid setups, particularly with limited amounts of transcribed training data. More importantly, unsupervised pretraining is shown to be language-independent.},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/ps_slt2012.pdf},
  year = 2012
}
@article{Andersson2012175,
  author = {Sebastian Andersson and Junichi Yamagishi and Robert
                   A.J. Clark},
  title = {Synthesis and evaluation of conversational
                   characteristics in {HMM}-based speech synthesis},
  journal = {Speech Communication},
  volume = {54},
  number = {2},
  pages = {175--188},
  abstract = {Spontaneous conversational speech has many
                   characteristics that are currently not modelled well by
                   HMM-based speech synthesis and in order to build
                   synthetic voices that can give an impression of someone
                   partaking in a conversation, we need to utilise data
                   that exhibits more of the speech phenomena associated
                   with conversations than the more generally used
                   carefully read aloud sentences. In this paper we show
                   that synthetic voices built with HMM-based speech
                   synthesis techniques from conversational speech data,
                   preserved segmental and prosodic characteristics of
                   frequent conversational speech phenomena. An analysis
                   of an evaluation investigating the perception of
                   quality and speaking style of HMM-based voices confirms
                   that speech with conversational characteristics are
                   instrumental for listeners to perceive successful
                   integration of conversational speech phenomena in
                   synthetic speech. The achieved synthetic speech quality
                   provides an encouraging start for the continued use of
                   conversational speech in HMM-based speech synthesis.},
  doi = {10.1016/j.specom.2011.08.001},
  issn = {0167-6393},
  keywords = {Speech synthesis, HMM, Conversation, Spontaneous
                   speech, Filled pauses, Discourse marker},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639311001178},
  year = 2012
}
@article{steiner:EL106,
  author = {Ingmar Steiner and Korin Richmond and Ian Marshall and
                   Calum D. Gray},
  title = {The magnetic resonance imaging subset of the mngu0
                   articulatory corpus},
  journal = {The Journal of the Acoustical Society of America},
  volume = {131},
  number = {2},
  pages = {EL106--EL111},
  abstract = {This paper announces the availability of the magnetic
                   resonance imaging (MRI) subset of the mngu0 corpus, a
                   collection of articulatory speech data from one speaker
                   containing different modalities. This subset comprises
                   volumetric MRI scans of the speaker's vocal tract
                   during sustained production of vowels and consonants,
                   as well as dynamic mid-sagittal scans of repetitive
                   consonant--vowel (CV) syllable production. For
                   reference, high-quality acoustic recordings of the
                   speech material are also available. The raw data are
                   made freely available for research purposes. },
  doi = {10.1121/1.3675459},
  keywords = {audio recording; magnetic resonance imaging; speech
                   processing},
  month = jan,
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/mngu0-mri-2.pdf},
  publisher = {ASA},
  year = 2012
}
@inproceedings{ultraxIS2012,
  author = {Richmond, Korin and Renals, Steve},
  title = {Ultrax: An Animated Midsagittal Vocal Tract Display
                   for Speech Therapy},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {Speech sound disorders (SSD) are the most common
                   communication impairment in childhood, and can hamper
                   social development and learning. Current speech therapy
                   interventions rely predominantly on the auditory skills
                   of the child, as little technology is available to
                   assist in diagnosis and therapy of SSDs. Realtime
                   visualisation of tongue movements has the potential to
                   bring enormous benefit to speech therapy. Ultrasound
                   scanning offers this possibility, although its display
                   may be hard to interpret. Our ultimate goal is to
                   exploit ultrasound to track tongue movement, while
                   displaying a simplified, diagrammatic vocal tract that
                   is easier for the user to interpret. In this paper, we
                   outline a general approach to this problem, combining a
                   latent space model with a dimensionality reducing model
                   of vocal tract shapes. We assess the feasibility of
                   this approach using magnetic resonance imaging (MRI)
                   scans to train a model of vocal tract shapes, which is
                   animated using electromagnetic articulography (EMA)
                   data from the same speaker.},
  categories = {Ultrasound, speech therapy, vocal tract visualisation},
  keywords = {Ultrasound, speech therapy, vocal tract visualisation},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/RichmondRenalsIS2012.pdf},
  year = 2012
}
@article{Burton2012,
  author = {Christopher Burton and Brian McKinstry and Aurora
                   Szentagotai Tatar and Antoni Serrano-Blanco and Claudia
                   Pagliari and Maria Wolters},
  title = {Activity monitoring in patients with depression: A
                   systematic review.},
  journal = {Journal of Affective Disorders},
  abstract = {Background: Altered physical activity is an important
                   feature of depression. It is manifested in psychomotor
                   retardation, agitation and withdrawal from engagement
                   in normal activities. Modern devices for activity
                   monitoring (actigraphs) make it possible to monitor
                   physical activity unobtrusively but the validity of
                   actigraphy as an indicator of mood state is uncertain.
                   We carried out a systematic review of digital
                   actigraphy in patients with depression to investigate
                   the associations between measured physical activity and
                   depression. Methods: Systematic review and
                   meta-analysis. Studies were identified from Medline,
                   EMBASE and Psycinfo databases and included if they were
                   either case control or longitudinal studies of
                   actigraphy in adults aged between 18 and 65 diagnosed
                   with a depressive disorder. Outcomes were daytime and
                   night-time activity and actigraphic measures of sleep.
                   Results: We identified 19 eligible papers from 16
                   studies (412 patients). Case control studies showed
                   less daytime activity in patients with depression
                   (standardised mean difference −0.76, 95% confidence
                   intervals −1.05 to −0.47). Longitudinal studies
                   showed moderate increase in daytime activity (0.53,
                   0.20 to 0.87) and a reduction in night-time activity
                   (−0.36, −0.65 to −0.06) over the course of
                   treatment. Limitations: All study participants were
                   unblinded. Only seven papers included patients treated
                   in the community. Conclusions: Actigraphy is a
                   potentially valuable source of additional information
                   about patients with depression. However, there are no
                   clear guidelines for use of actigraphy in studies of
                   patients with depression. Further studies should
                   investigate patients treated in the community.
                   Additional work to develop algorithms for
                   differentiating behaviour patterns is also needed.},
  categories = {"Depressive disorder","Actigraphy", "Telemonitoring"},
  doi = {10.1016/j.jad.2012.07.001},
  issn = {0165-0327},
  url = {http://www.sciencedirect.com/science/article/pii/S0165032712005034},
  year = 2012
}
@article{Wang_JCST2012,
  author = {Dong Wang and Javier Tejedor and Simon King and Joe
                   Frankel},
  title = {Term-dependent Confidence Normalization for
                   Out-of-Vocabulary Spoken Term Detection},
  journal = {Journal of Computer Science and Technology},
  volume = {27},
  number = {2},
  abstract = {Spoken Term Detection (STD) is a fundamental component
                   of spoken information retrieval systems. A key task of
                   an STD system is to determine reliable detections and
                   reject false alarms based on certain confidence
                   measures. The detection posterior probability, which is
                   often computed from lattices, is a widely used
                   confidence measure. However, a potential problem of
                   this confidence measure is that the confidence scores
                   of detections of all search terms are treated
                   uniformly, regardless of how much they may differ in
                   terms of phonetic or linguistic properties. This
                   problem is particularly evident for out-of-vocabulary
                   (OOV) terms which tend to exhibit high intra-term
                   diversity. To address the discrepancy on confidence
                   levels that the same confidence score may convey for
                   different terms, a term-dependent decision strategy is
                   desirable – for example, the term-specific threshold
                   (TST) approach. In this work, we propose a
                   term-dependent normalisation technique which
                   compensates for term diversity on confidence
                   estimation. Particularly, we propose a linear bias
                   compensation and a discriminative compensation to deal
                   with the bias problem that is inherent in lattice-based
                   confidence measuring from which the TST approach
                   suffers. We tested the proposed technique on speech
                   data from the multi-party meeting domain with two
                   state-of-the-art STD systems based on phonemes and
                   words respectively. The experimental results
                   demonstrate that the confidence normalisation approach
                   leads to a significant performance improvement in STD,
                   particularly for OOV terms with phoneme-based systems.},
  doi = {10.1007/s11390-012-1228-x},
  year = 2012
}
@inproceedings{Wolters:2012:HTS:2212776.2223703,
  author = {Wolters, Maria and Isaac, Karl and Doherty, Jason},
  title = {Hold that thought: are spearcons less disruptive than
                   spoken reminders?},
  booktitle = {CHI '12 Extended Abstracts on Human Factors in
                   Computing Systems},
  series = {CHI EA '12},
  pages = {1745--1750},
  address = {New York, NY, USA},
  publisher = {ACM},
  acmid = {2223703},
  doi = {10.1145/2212776.2223703},
  isbn = {978-1-4503-1016-1},
  keywords = {irrelevant speech effect, reminders, spearcon, speech,
                   working memory},
  location = {Austin, Texas, USA},
  numpages = {6},
  url = {http://doi.acm.org/10.1145/2212776.2223703},
  year = 2012
}
@inproceedings{lingIS2012,
  author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {Vowel Creation by Articulatory Control in {HMM}-based
                   Parametric Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = { This paper presents a method to produce a new vowel
                   by articulatory control in hidden Markov model (HMM)
                   based parametric speech synthesis. A multiple
                   regression HMM (MRHMM) is adopted to model the
                   distribution of acoustic features, with articulatory
                   features used as external auxiliary variables. The
                   dependency between acoustic and articulatory features
                   is modelled by a group of linear transforms that are
                   either estimated context-dependently or determined by
                   the distribution of articulatory features. Vowel
                   identity is removed from the set of context features
                   used to ensure compatibility between the
                   context-dependent model parameters and the articulatory
                   features of a new vowel. At synthesis time, acoustic
                   features are predicted according to the input
                   articulatory features as well as context information.
                   With an appropriate articulatory feature sequence, a
                   new vowel can be generated even when it does not exist
                   in the training set. Experimental results show this
                   method is effective in creating the English vowel /2/
                   by articulatory control without using any acoustic
                   samples of this vowel.},
  categories = {Speech synthesis, articulatory features,
                   multiple-regression hidden Markov model},
  keywords = {Speech synthesis, articulatory features,
                   multiple-regression hidden Markov model},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/LingRichmondYamagishi_IS2012.pdf},
  year = 2012
}
@inproceedings{Wolters:optimi1,
  author = { Maria Wolters and Colin Matheson},
  title = {Designing {Help4Mood}: Trade-Offs and Choices},
  booktitle = { Information and Communication Technologies applied to
                   Mental Health },
  editor = {Garcia-Gomez, Juan Miguel and Paniagua-Paniagua,
                   Patricia},
  publisher = {Editorial Universitat Politecnica de Valencia},
  categories = {depression, eHealth},
  isbn = {978-84-8363-942-9},
  location = {Valencia, Spain},
  year = 2012
}
@inproceedings{hengluIS2012,
  abstract = {Speech units are highly context-dependent, so taking
                   contextual features into account is essential for
                   speech modelling. Context is employed in HMM-based
                   Text-to-Speech speech synthesis systems via
                   context-dependent phone models. A very wide context is
                   taken into account, represented by a large set of
                   contextual factors. However, most of these factors
                   probably have no significant influence on the speech,
                   most of the time. To discover which combinations of
                   features should be taken into account, decision
                   tree-based context clustering is used. But the space of
                   context-dependent models is vast, and the number of
                   contexts seen in the training data is only a tiny
                   fraction of this space, so the task of the decision
                   tree is very hard: to generalise from observations of a
                   tiny fraction of the space to the rest of the space,
                   whilst ignoring uninformative or redundant context
                   features. The structure of the context feature space
                   has not been systematically studied for speech
                   synthesis. In this paper we discover a dependency
                   structure by learning a Bayesian Network over the joint
                   distribution of the features and the speech. We
                   demonstrate that it is possible to discard the majority
                   of context features with minimal impact on quality,
                   measured by a perceptual test.},
  address = {Portland, Oregon, USA},
  author = {Heng Lu and Simon King},
  booktitle = {Proc. Interspeech},
  categories = {HMM-based speech synthesis, Bayesian Networks, context
                   information},
  keywords = {HMM-based speech synthesis, Bayesian Networks, context
                   information},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/HengLuSimonKing.pdf},
  title = {Using {Bayesian} Networks to find relevant context
                   features for {HMM}-based speech synthesis},
  year = 2012
}
@inproceedings{CassiaICASSP12,
  author = {Valentini-Botinhao, C. and Maia, R. and Yamagishi, J.
                   and King, S. and Zen, H.},
  title = {{Cepstral analysis based on the Glimpse proportion
                   measure for improving the intelligibility of
                   {HMM}-based synthetic speech in noise}},
  booktitle = {Proc. ICASSP},
  pages = {3997--4000},
  address = {Kyoto, Japan},
  abstract = {In this paper we introduce a new cepstral coefficient
                   extraction method based on an intelligibility measure
                   for speech in noise, the Glimpse Proportion measure.
                   This new method aims to increase the intelligibility of
                   speech in noise by modifying the clean speech, and has
                   applications in scenarios such as public announcement
                   and car navigation systems. We first explain how the
                   Glimpse Proportion measure operates and further show
                   how we approximated it to integrate it into an existing
                   spectral envelope parameter extraction method commonly
                   used in the HMM-based speech synthesis framework. We
                   then demonstrate how this new method changes the
                   modelled spectrum according to the characteristics of
                   the noise and show results for a listening test with
                   vocoded and HMM-based synthetic speech. The test
                   indicates that the proposed method can significantly
                   improve intelligibility of synthetic speech in speech
                   shaped noise.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement, speech analysis},
  doi = {10.1109/ICASSP.2012.6288794},
  month = mar,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_ICASSP12.pdf},
  year = 2012
}
@inproceedings{PhillipIS2012,
  author = {Phillip L. De Leon and Bryan Stewart and Junichi
                   Yamagishi},
  title = {Synthetic Speech Discrimination using Pitch Pattern
                   Statistics Derived from Image Analysis},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {In this paper, we extend the work by Ogihara et al.
                   to discriminate between human and synthetic speech
                   using features based on pitch patterns. As previously
                   demonstrated, significant differences in pitch patterns
                   between human and synthetic speech can be leveraged to
                   classify speech as being human or synthetic in origin.
                   We propose using mean pitch stability, mean pitch
                   stability range, and jitter as features extracted after
                   image analysis of pitch patterns. We have observed that
                   for synthetic speech, these features lie in a small and
                   distinct space as compared to human speech and have
                   modeled them with a multivariate Gaussian distribution.
                   Our classifier is trained using synthetic speech
                   collected from the 2008 and 2011 Blizzard Challenge
                   along with Festival pre-built voices and human speech
                   from the NIST2002 corpus. We evaluate the classifier on
                   a much larger corpus than previously studied using
                   human speech from the Switchboard corpus, synthetic
                   speech from the Resource Management corpus, and
                   synthetic speech generated from Festival trained on the
                   Wall Street Journal corpus. Results show 98% accuracy
                   in correctly classifying human speech and 96% accuracy
                   in correctly classifying synthetic speech.},
  month = sep,
  year = 2012
}
@phdthesis{watts-2012,
  author = {Oliver Watts},
  title = {Unsupervised Learning for Text-to-Speech Synthesis},
  school = {University of Edinburgh},
  abstract = {This thesis introduces a general method for
                   incorporating the distributional analysis of textual
                   and linguistic objects into text-to-speech (TTS)
                   conversion systems. Conventional TTS conversion uses
                   intermediate layers of representation to bridge the gap
                   between text and speech. Collecting the annotated data
                   needed to produce these intermediate layers is a far
                   from trivial task, possibly prohibitively so for
                   languages in which no such resources are in existence.
                   Distributional analysis, in contrast, proceeds in an
                   unsupervised manner, and so enables the creation of
                   systems using textual data that are not annotated. The
                   method therefore aids the building of systems for
                   languages in which conventional linguistic resources
                   are scarce, but is not restricted to these languages.
                   The distributional analysis proposed here places the
                   textual objects analysed in a continuous-valued space,
                   rather than specifying a hard categorisation of those
                   objects. This space is then partitioned during the
                   training of acoustic models for synthesis, so that the
                   models generalise over objects' surface forms in a way
                   that is acoustically relevant. The method is applied to
                   three levels of textual analysis: to the
                   characterisation of sub-syllabic units, word units and
                   utterances. Entire systems for three languages
                   (English, Finnish and Romanian) are built with no
                   reliance on manually labelled data or language-specific
                   expertise. Results of a subjective evaluation are
                   presented.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/oliver_watts_thesis.pdf},
  year = 2012
}
@inproceedings{Wolters:bhci,
  author = {Wolters, Maria and McCloughan, Lucy and Gibson, Martin
                   and Weatherall, Chris and Matheson, Colin and Maloney,
                   Tim and Castro-Robles, Juan Carlos and Estevez, Soraya },
  title = {Monitoring People with Depression in the
                   Community---Regulatory Aspects},
  booktitle = {Workshop on People, Computers and Psychiatry at the
                   British Computer Society's Conference on Human Computer
                   Interaction},
  categories = {depression, regulation, monitoring},
  location = {Birmingham, UK},
  year = 2012
}
@inproceedings{mayo:12,
  author = {Mayo, C. and Aubanel, V. and Cooke, M.},
  title = {Effect of prosodic changes on speech intelligibility},
  booktitle = {Proc. Interspeech},
  address = {Portland, OR, USA},
  year = 2012
}
@inproceedings{Jaime2IS2012,
  author = {J. Lorenzo and B. Martinez and R. Barra-Chicote and V.
                   Lopez-Ludena and J. Ferreiros and J. Yamagishi and
                   J.M. Montero},
  title = {Towards an Unsupervised Speaking Style Voice Building
                   Framework: Multi-Style Speaker Diarization},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {Current text-to-speech systems are developed
                   using studio-recorded speech in a neutral style or
                   based on acted emotions. However, the proliferation of
                   media sharing sites would allow developing a new
                   generation of speech-based systems which could cope
                   with spontaneous and styled speech. This paper
                   proposes an architecture to deal with realistic
                   recordings and carries out some experiments on
                   unsupervised speaker diarization. In order to maximize
                   the speaker purity of the clusters while keeping a high
                   speaker coverage, the paper evaluates the F-measure
                   of a diarization module, achieving high scores (>85%)
                   especially when the clusters are longer than 30
                   seconds, even for the more spontaneous and expressive
                   styles (such as talk shows or sports).},
  month = sep,
  year = 2012
}
@inproceedings{lingvowel,
  author = {Ling, Zhenhua and Richmond, Korin and Yamagishi,
                   Junichi},
  title = {Vowel Creation by Articulatory Control in {HMM}-based
                   Parametric Speech Synthesis},
  booktitle = {Proc. The Listening Talker Workshop},
  pages = {72},
  address = {Edinburgh, UK},
  month = may,
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/Ling_etal_LISTA.pdf},
  year = 2012
}
@inproceedings{Wolters:cyber17,
  author = {Claudia Pagliari and Maria Wolters and Chris Burton
                   and Brian McKinstry and Aurora Szentagotai and Antoni
                   Serrano-Blanco and Daniel David and Luis Ferrini and
                   Susanna Albertini and Joan Carlos Castro and Soraya
                   Estévez},
  title = {Psychosocial Implications of Avatar Use in Supporting
                   Therapy of Depression},
  booktitle = {CYBER17 - 17th Annual CyberPsychology \& CyberTherapy
                   Conference},
  categories = {depression, cybertherapy, monitoring},
  location = {Brussels, Belgium},
  year = 2012
}
@article{wester:specom:12,
  author = {Mirjam Wester},
  title = {Talker discrimination across languages},
  journal = {Speech Communication},
  volume = {54},
  pages = {781--790},
  abstract = {This study investigated the extent to which listeners
                   are able to discriminate between bilingual talkers in
                   three language pairs – English–German,
                   English–Finnish and English–Mandarin. Native
                   English listeners were presented with two sentences
                   spoken by bilingual talkers and were asked to judge
                   whether they thought the sentences were spoken by the
                   same person. Equal amounts of cross-language and
                   matched-language trials were presented. The results
                   show that native English listeners are able to carry
                   out this task well; achieving percent correct levels at
                   well above chance for all three language pairs.
                   Previous research has shown this for English–German,
                   this research shows listeners also extend this to
                   Finnish and Mandarin, languages that are quite distinct
                   from English from a genetic and phonetic similarity
                   perspective. However, listeners are significantly less
                   accurate on cross-language talker trials
                   (English–foreign) than on matched-language trials
                   (English–English and foreign–foreign).
                   Understanding listeners’ behaviour in cross-language
                   talker discrimination using natural speech is the first
                   step in developing principled evaluation techniques for
                   synthesis systems in which the goal is for the
                   synthesised voice to sound like the original speaker,
                   for instance, in speech-to-speech translation systems,
                   voice conversion and reconstruction.},
  categories = {evaluation},
  doi = {10.1016/j.specom.2012.01.006},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/wester_specom_12.pdf},
  year = 2012
}
@inproceedings{llu2012map,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {{Maximum a posteriori adaptation of subspace Gaussian
                   mixture models for cross-lingual speech recognition}},
  booktitle = {Proc. ICASSP},
  abstract = {This paper concerns cross-lingual acoustic modeling in
                   the case when there are limited target language
                   resources. We build on an approach in which a subspace
                   Gaussian mixture model (SGMM) is adapted to the target
                   language by reusing the globally shared parameters
                   estimated from out-of-language training data. In
                   current cross-lingual systems, these parameters are
                   fixed when training the target system, which can give
                   rise to a mismatch between the source and target
                   systems. We investigate a maximum a posteriori (MAP)
                   adaptation approach to alleviate the potential
                   mismatch. In particular, we focus on the adaptation of
                   phonetic subspace parameters using a matrix variate
                   Gaussian prior distribution. Experiments on the
                   GlobalPhone corpus using the MAP adaptation approach
                   results in word error rate reductions, compared with
                   the cross-lingual baseline systems and systems updated
                   using maximum likelihood, for training conditions with
                   1 hour and 5 hours of target language data.},
  keywords = {Subspace Gaussian Mixture Model, Maximum a Posteriori
                   Adaptation, Cross-lingual Speech Recognition},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-icassp-2012.pdf},
  year = 2012
}
@inproceedings{cooke:lista:12,
  author = {Martin Cooke and Maria Luisa García Lecumberri and
                   Yan Tang and Mirjam Wester},
  title = {Do non-native listeners benefit from speech
                   modifications designed to promote intelligibility for
                   native listeners?},
  booktitle = {Proceedings of The Listening Talker Workshop},
  pages = 59,
  note = {http://listening-talker.org/workshop/programme.html},
  year = 2012
}
@inproceedings{dallIS2012,
  author = {Dall, Rasmus and Veaux, Christophe and Yamagishi,
                   Junichi and King, Simon},
  title = {Analysis of Speaker Clustering Strategies for
                   {HMM}-Based Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {This paper describes a method for speaker clustering,
                   with the application of building average voice models
                   for speaker-adaptive HMM-based speech synthesis that
                   are a good basis for adapting to specific target
                   speakers. Our main hypothesis is that using
                   perceptually similar speakers to build the average
                   voice model will be better than using unselected
                   speakers, even if the amount of data available from
                   perceptually similar speakers is smaller. We measure
                   the perceived similarities among a group of 30 female
                   speakers in a listening test and then apply multiple
                   linear regression to automatically predict these
                   listener judgements of speaker similarity and thus to
                   identify similar speakers automatically. We then
                   compare a variety of average voice models trained on
                   either speakers who were perceptually judged to be
                   similar to the target speaker, or speakers selected by
                   the multiple linear regression, or a large global set
                   of unselected speakers. We find that the average voice
                   model trained on perceptually similar speakers provides
                   better performance than the global model, even though
                   the latter is trained on more data, confirming our main
                   hypothesis. However, the average voice model using
                   speakers selected automatically by the multiple linear
                   regression does not reach the same level of
                   performance.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/DallIS2012.pdf},
  year = 2012
}
@inproceedings{JaimeIS2012,
  author = {Jaime Lorenzo-Trueba and Roberto Barra-Chicote and
                   Tuomo Raitio and Nicolas Obin and Paavo Alku and
                   Junichi Yamagishi and Juan M Montero},
  title = { Towards Glottal Source Controllability in Expressive
                   Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {In order to obtain more human like sounding
                   human-machine interfaces we must first be able to give them
                   expressive capabilities in the way of emotional and
                   stylistic features so as to closely adequate them to
                   the intended task. If we want to replicate those
                   features it is not enough to merely replicate the
                   prosodic information of fundamental frequency and
                   speaking rhythm. The proposed additional layer is the
                   modification of the glottal model, for which we make
                   use of the GlottHMM parameters. This paper analyzes the
                   viability of such an approach by verifying that the
                   expressive nuances are captured by the aforementioned
                   features, obtaining 95% recognition rates on styled
                   speaking and 82% on emotional speech. Then we evaluate
                   the effect of speaker bias and recording environment on
                   the source modeling in order to quantify possible
                   problems when analyzing multi-speaker databases.
                   Finally we propose a speaking styles separation for
                   Spanish based on prosodic features and check its
                   perceptual significance.},
  month = sep,
  year = 2012
}
@inproceedings{bell12_mlan,
  author = {Bell, P. and Gales, M. and Lanchantin, P. and Liu, X.
                   and Long, Y. and Renals, S. and Swietojanski, P. and
                   Woodland, P.},
  title = {Transcription of multi-genre media archives using
                   out-of-domain data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address = {Miami, Florida, USA},
  abstract = {We describe our work on developing a speech
                   recognition system for multi-genre media archives. The
                   high diversity of the data makes this a challenging
                   recognition task, which may benefit from systems
                   trained on a combination of in-domain and out-of-domain
                   data. Working with tandem HMMs, we present Multi-level
                   Adaptive Networks (MLAN), a novel technique for
                   incorporating information from out-of-domain posterior
                   features using deep neural networks. We show that it
                   provides a substantial reduction in WER over other
                   systems, with relative WER reductions of 15\% over a
                   PLP baseline, 9\% over in-domain tandem features and
                   8\% over the best out-of-domain tandem features.},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/mlan_slt2012.pdf},
  year = 2012
}
@article{Oura2012703,
  author = {Keiichiro Oura and Junichi Yamagishi and Mirjam Wester
                   and Simon King and Keiichi Tokuda},
  title = {Analysis of unsupervised cross-lingual speaker
                   adaptation for {HMM}-based speech synthesis using
                   {KLD}-based transform mapping},
  journal = {Speech Communication},
  volume = {54},
  number = {6},
  pages = {703--714},
  abstract = {In the EMIME project, we developed a mobile device
                   that performs personalized speech-to-speech translation
                   such that a user's spoken input in one language is used
                   to produce spoken output in another language, while
                   continuing to sound like the user's voice. We
                   integrated two techniques into a single architecture:
                   unsupervised adaptation for HMM-based TTS using
                   word-based large-vocabulary continuous speech
                   recognition, and cross-lingual speaker adaptation
                   (CLSA) for HMM-based TTS. The CLSA is based on a
                   state-level transform mapping learned using minimum
                   Kullback-Leibler divergence between pairs of HMM states
                   in the input and output languages. Thus, an
                   unsupervised cross-lingual speaker adaptation system
                   was developed. End-to-end speech-to-speech translation
                   systems for four languages (English, Finnish, Mandarin,
                   and Japanese) were constructed within this framework.
                   In this paper, the English-to-Japanese adaptation is
                   evaluated. Listening tests demonstrate that adapted
                   voices sound more similar to a target speaker than
                   average voices and that differences between supervised
                   and unsupervised cross-lingual speaker adaptation are
                   small. Calculating the KLD state-mapping on only the
                   first 10 mel-cepstral coefficients leads to huge
                   savings in computational costs, without any detrimental
                   effect on the quality of the synthetic speech.},
  doi = {10.1016/j.specom.2011.12.004},
  issn = {0167-6393},
  keywords = {HMM-based speech synthesis, Unsupervised speaker
                   adaptation, Cross-lingual speaker adaptation,
                   Speech-to-speech translation},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639311001774},
  year = 2012
}
@inproceedings{badinoclark_interspeech12,
  author = {Leonardo Badino and Robert A.J. Clark and Mirjam
                   Wester},
  title = {Towards Hierarchical Prosodic Prominence Generation in
                   {TTS} Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  categories = {speech synthesis, prosody},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/badinoclark_IS_2012.pdf},
  year = 2012
}
@inproceedings{bell12_tutoring,
  author = {Bell, Peter and Dzikovska, Myroslava and Isard, Amy},
  title = {Designing a spoken language interface for a tutorial
                   dialogue system},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {We describe our work in building a spoken language
                   interface for a tutorial dialogue system. Our goal is
                   to allow natural, unrestricted student interaction with
                   the computer tutor, which has been shown to improve the
                   student's learning gain, but presents challenges for
                   speech recognition and spoken language understanding.
                   We discuss the choice of system components and present
                   the results of development experiments in both acoustic
                   and language modelling for speech recognition in this
                   domain.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/tutoring_is2012.pdf},
  year = 2012
}
@inproceedings{zwyssig2012determining,
  author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
  title = {Determining the number of speakers in a meeting using
                   microphone array features},
  booktitle = {Proc. ICASSP},
  pages = {4765--4768},
  year = 2012
}
@article{Hashimoto2012857,
  author = {Kei Hashimoto and Junichi Yamagishi and William Byrne
                   and Simon King and Keiichi Tokuda},
  title = {Impacts of machine translation and speech synthesis on
                   speech-to-speech translation},
  journal = {Speech Communication},
  volume = {54},
  number = {7},
  pages = {857--866},
  abstract = {This paper analyzes the impacts of machine translation
                   and speech synthesis on speech-to-speech translation
                   systems. A typical speech-to-speech translation system
                   consists of three components: speech recognition,
                   machine translation and speech synthesis. Many
                   techniques have been proposed for integration of speech
                   recognition and machine translation. However,
                   corresponding techniques have not yet been considered
                   for speech synthesis. The focus of the current work is
                   machine translation and speech synthesis, and we
                   present a subjective evaluation designed to analyze
                   their impact on speech-to-speech translation. The
                   results of these analyses show that the naturalness and
                   intelligibility of the synthesized speech are strongly
                   affected by the fluency of the translated sentences. In
                   addition, several features were found to correlate well
                   with the average fluency of the translated sentences
                   and the average naturalness of the synthesized speech.},
  doi = {10.1016/j.specom.2012.02.004},
  issn = {0167-6393},
  keywords = {Speech-to-speech translation, Machine translation,
                   Speech synthesis, Subjective evaluation},
  url = {http://www.sciencedirect.com/science/article/pii/S0167639312000283},
  year = 2012
}
@inproceedings{stan12_grapheme_alignment,
  author = {Stan, Adriana and Bell, Peter and King, Simon},
  title = {A Grapheme-based Method for Automatic Alignment of
                   Speech and Text Data},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  address = {Miami, Florida, USA},
  abstract = {This paper introduces a method for automatic alignment
                   of speech data with unsynchronised, imperfect
                   transcripts, for a domain where no initial acoustic
                   models are available. Using grapheme-based acoustic
                   models, word skip networks and orthographic speech
                   transcripts, we are able to harvest 55\% of the speech
                   with a 93\% utterance-level accuracy and 99\% word
                   accuracy for the produced transcriptions. The work is
                   based on the assumption that there is a high degree of
                   correspondence between the speech and text, and that a
                   full transcription of all of the speech is not
                   required. The method is language independent and the
                   only prior knowledge and resources required are the
                   speech and text transcripts, and a few minor user
                   interventions.},
  month = dec,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/grapheme_alignment_slt2012.pdf},
  year = 2012
}
@article{6205335,
  author = {De Leon, P. L. and Pucher, M. and Yamagishi, J. and
                   Hernaez, I. and Saratxaga, I.},
  title = {Evaluation of Speaker Verification Security and
                   Detection of {HMM}-Based Synthetic Speech},
  journal = {IEEE Transactions on Audio, Speech, and Language
                   Processing},
  volume = {20},
  number = {8},
  pages = {2280--2290},
  abstract = {In this paper, we evaluate the vulnerability of
                   speaker verification (SV) systems to synthetic speech.
                   The SV systems are based on either the Gaussian mixture
                   model--universal background model (GMM-UBM) or
                   support vector machine (SVM) using GMM supervectors. We
                   use a hidden Markov model (HMM)-based text-to-speech
                   (TTS) synthesizer, which can synthesize speech for a
                   target speaker using small amounts of training data
                   through model adaptation of an average voice or
                   background model. Although the SV systems have a very
                   low equal error rate (EER), when tested with synthetic
                   speech generated from speaker models derived from the
                   Wall Street Journal (WSJ) speech corpus, over 81% of
                   the matched claims are accepted. This result suggests
                   vulnerability in SV systems and thus a need to
                   accurately detect synthetic speech. We propose a new
                   feature based on relative phase shift (RPS),
                   demonstrate reliable detection of synthetic speech, and
                   show how this classifier can be used to improve
                   security of SV systems.},
  doi = {10.1109/TASL.2012.2201472},
  issn = {1558-7916},
  month = oct,
  year = 2012
}
@inproceedings{CassiaSAPA12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Speech intelligibility enhancement for {HMM}-based
                   synthetic speech in noise}},
  booktitle = {Proc. Sapa Workshop},
  address = {Portland, USA},
  abstract = {It is possible to increase the intelligibility of
                   speech in noise by enhancing the clean speech signal.
                   In this paper we demonstrate the effects of modifying
                   the spectral envelope of synthetic speech according to
                   the environmental noise. To achieve this, we modify Mel
                   cepstral coefficients according to an intelligibility
                   measure that accounts for glimpses of speech in noise:
                   the Glimpse Proportion measure. We evaluate this method
                   against a baseline synthetic voice trained only with
                   normal speech and a topline voice trained with Lombard
                   speech, as well as natural speech. The intelligibility
                   of these voices was measured when mixed with
                   speech-shaped noise and with a competing speaker at
                   three different levels. The Lombard voices, both
                   natural and synthetic, were more intelligible than the
                   normal voices in all conditions. For speech-shaped
                   noise, the proposed modified voice was as intelligible
                   as the Lombard synthetic voice without requiring any
                   recordings of Lombard speech, which are hard to obtain.
                   However, in the case of competing talker noise, the
                   Lombard synthetic voice was more intelligible than the
                   proposed modified voice.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Sapa12.pdf},
  year = 2012
}
@inproceedings{Wolters:medetel,
  author = {Wolters, Maria and Ferrini, Louis and
                   Martinez-Miranda, Juan and Hastie, Helen and Burton,
                   Chris },
  title = {{Help4Mood} - A Flexible Solution for Supporting
                   People with Depression in the Community across Europe},
  booktitle = {Proceedings of The International eHealth, Telemedicine
                   and Health ICT Forum For Education, Networking and
                   Business (MedeTel, 2012)},
  publisher = {International Society for Telemedicine \& eHealth
                   (ISfTeH)},
  categories = {depression, mental health, ehealth},
  editor = {Jodanova, E. and Lievens, F.},
  location = {Luxemburg},
  year = 2012
}
@inproceedings{zwyssig2012effect,
  author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
  title = {{On the effect of SNR and superdirective beamforming
                   in speaker diarisation in meetings}},
  booktitle = {Proc. ICASSP},
  pages = {4177--4180},
  year = 2012
}
@inproceedings{CassiaLista12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Using an intelligibility measure to create noise
                   robust cepstral coefficients for {HMM}-based speech
                   synthesis}},
  booktitle = {Proc. LISTA Workshop},
  address = {Edinburgh, UK},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement},
  month = may,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Lista12.pdf},
  year = 2012
}
@inproceedings{CassiaWocci12,
  author = {Valentini-Botinhao, C. and Degenkolb-Weyers, S. and
                   Maier, A. and Noeth, E. and Eysholdt, U. and Bocklet,
                   T.},
  title = {{Automatic detection of sigmatism in children}},
  booktitle = {Proc. WOCCI},
  address = {Portland, USA},
  abstract = {We propose in this paper an automatic system to detect
                   sigmatism from the speech signal. Sigmatism occurs when
                   the tongue is positioned incorrectly during
                   articulation of sibilant phones like /s/ and /z/. For
                   our task we extracted various sets of features from
                   speech: Mel frequency cepstral coefficients, energies
                   in specific bandwidths of the spectral envelope, and
                   the so-called supervectors, which are the parameters of
                   an adapted speaker model. We then trained several
                   classifiers on a speech database of German adults
                   simulating three different types of sigmatism.
                   Recognition results were calculated at a phone, word
                   and speaker level for both the simulated database and
                   for a database of pathological speakers. For the
                   simulated database, we achieved recognition rates of up
                   to 86%, 87% and 94% at a phone, word and speaker level.
                   The best classifier was then integrated as part of a
                   Java applet that allows patients to record their own
                   speech, either by pronouncing isolated phones, a
                   specific word or a list of words, and provides them
                   with a feedback whether the sibilant phones are being
                   correctly pronounced.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_WOCCI12.pdf},
  year = 2012
}
@inproceedings{janskaetal_interspeech12,
  author = {Anna C. Janska and Erich Schröger and Thomas Jacobsen
                   and Robert A. J. Clark},
  title = {Asymmetries in the perception of synthesized speech},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  categories = {speech synthesis, evaluation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/janskaeral_IS_2012.pdf},
  year = 2012
}
@inproceedings{koutsogiannaki:12,
  author = {Koutsogiannaki, M. and Pettinato, M. and Mayo, C. and
                   Kandia, V. and Stylianou, Y.},
  title = {Can modified casual speech reach the intelligibility
                   of clear speech?},
  booktitle = {Proc. Interspeech},
  address = {Portland, OR, USA},
  year = 2012
}
@article{Wolters:ICST,
  title = {Managing Data in {Help4Mood}},
  journal = {ICST Transactions in Ambient Systems},
  number = {Special Issue on Technology in Mental Health},
  author = {Wolters, Maria and Martinez-Miranda, Juan and Hastie,
                   Helen F. and Estevez, Soraya and Matheson, Colin},
  categories = {mental health, depression, monitoring, ontologies,
                   SNOMED-CT},
  year = 2012
}
@inproceedings{lu2012jud,
  author = {Lu, L. and Ghoshal, A. and Renals, S.},
  title = {{Joint uncertainty decoding with unscented transform
                   for noise robust subspace Gaussian mixture model}},
  booktitle = {Proc. Sapa-Scale workshop},
  abstract = {Common noise compensation techniques use vector Taylor
                   series (VTS) to approximate the mismatch function.
                   Recent work shows that the approximation accuracy may
                   be improved by sampling. One such sampling technique is
                   the unscented transform (UT), which draws samples
                   deterministically from the clean speech and noise models
                   to derive the noise-corrupted speech parameters. This
                   paper applies UT to noise compensation of the subspace
                   Gaussian mixture model (SGMM). Since UT requires a
                   relatively small number of samples for accurate
                   estimation, it has significantly lower computational
                   cost compared to other random sampling techniques.
                   However, the number of surface Gaussians in an SGMM is
                   typically very large, making the direct application of
                   UT, for compensating individual Gaussian components,
                   computationally impractical. In this paper, we avoid
                   the computational burden by employing UT in the
                   framework of joint uncertainty decoding (JUD), which
                   groups all the Gaussian components into a small number
                   of classes, sharing the compensation parameters within
                   each class.
                   We evaluate the JUD-UT technique for an SGMM system
                   using the Aurora 4 corpus. Experimental results
                   indicate that UT can lead to increased accuracy
                   compared to the VTS approximation if the JUD phase
                   factor is untuned, and to similar accuracy if the phase
                   factor is tuned empirically.},
  keywords = {noise compensation, SGMM, JUD, UT},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-sapa2012.pdf},
  year = 2012
}
@inproceedings{sansegundo_et_al_IS2012,
  author = {Ruben San-Segundo and Juan M. Montero and Veronica
                   Lopez-Luden and Simon King},
  title = {Detecting Acronyms from Capital Letter Sequences in
                   Spanish},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = {This paper presents an automatic strategy to decide
                   how to pronounce a Capital Letter Sequence (CLS) in a
                   Text-to-Speech (TTS) system. If the CLS is well known to
                   the TTS, it can be expanded into several words. But when
                   the CLS is unknown, the system has two alternatives:
                   spelling it out (abbreviation) or pronouncing it as a new
                   word (acronym). In Spanish, there is a close
                   relationship between letters and phonemes. Because of
                   this, when a CLS is similar to other words in Spanish,
                   there is a high tendency to pronounce it as a standard
                   word. This paper proposes an automatic method for
                   detecting acronyms. Additionally, this paper analyses
                   the discrimination capability of some features, and
                   several strategies for combining them in order to
                   obtain the best classifier. For the best classifier,
                   the classification error is 8.45\%. Regarding the
                   feature analysis, the best features were the Letter
                   Sequence Perplexity and the Average N-gram order.},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Thu-P10a-07.pdf},
  year = 2012
}
@inproceedings{aubanel:12,
  author = {Aubanel, V. and Cooke, M. and Foster, E. and
                   Garcia-Lecumberri, M. L. and Mayo, C.},
  title = {Effects of the availability of visual information and
                   presence of competing conversations on speech
                   production},
  booktitle = {Proc. Interspeech},
  address = {Portland, OR, USA},
  year = 2012
}
@inproceedings{Wolters:medetel-castro,
  author = {Estevez, Soraya and Castro-Robles, Juan Carlos and
                   Wolters, Maria },
  title = {{Help4Mood}: First Release of a Computational
                   Distributed System to Support the Treatment of Patients
                   with Major Depression},
  booktitle = {Proceedings of The International eHealth, Telemedicine
                   and Health ICT Forum For Education, Networking and
                   Business (MedeTel, 2012)},
  pages = {1745--1750},
  publisher = {International Society for Telemedicine \& eHealth
                   (ISfTeH)},
  categories = {depression, mental health, ehealth},
  editor = {Jodanova, E and Lievens, F},
  location = {Luxembourg},
  year = 2012
}
@inproceedings{dzikovska-EtAl:2012:EACL2012,
  author = {Dzikovska, Myroslava O. and Bell, Peter and Isard, Amy
                   and Moore, Johanna D.},
  title = {Evaluating language understanding accuracy with
                   respect to objective outcomes in a dialogue system},
  booktitle = {Proceedings of the 13th Conference of the European
                   Chapter of the Association for Computational
                   Linguistics},
  pages = {471--481},
  address = {Avignon, France},
  publisher = {Association for Computational Linguistics},
  month = {April},
  url = {http://www.aclweb.org/anthology/E12-1048},
  year = 2012
}
@inproceedings{CassiaIS12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Mel cepstral coefficient modification based on the
                   Glimpse Proportion measure for improving the
                   intelligibility of {HMM}-generated synthetic speech in
                   noise}},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  abstract = {We propose a method that modifies the Mel cepstral
                   coefficients of HMM-generated synthetic speech in order
                   to increase the intelligibility of the generated speech
                   when heard by a listener in the presence of a known
                   noise. This method is based on an approximation we
                   previously proposed for the Glimpse Proportion measure.
                   Here we show how to update the Mel cepstral
                   coefficients using this measure as an optimization
                   criterion and how to control the amount of distortion
                   by limiting the frequency resolution of the
                   modifications. To evaluate the method we built eight
                   different voices from normal read-text speech data from
                   a male speaker. Some voices were also built from
                   Lombard speech data produced by the same speaker.
                   Listening experiments with speech-shaped noise and with
                   a single competing talker indicate that our method
                   significantly improves intelligibility when compared to
                   unmodified synthetic speech. The voices built from
                   Lombard speech outperformed the proposed method,
                   particularly for the competing talker case. However,
                   compared to a voice using only the spectral parameters
                   from Lombard speech, the proposed method obtains
                   similar or higher performance.},
  categories = {HMM-based speech synthesis, intelligibility
                   enhancement, Mel cepstral coefficients},
  month = {September},
  year = 2012
}
@inproceedings{lu2012noise,
  author = {Lu, L. and Chin, K.K. and Ghoshal, A. and Renals, S.},
  title = {{Noise compensation for subspace Gaussian mixture
                   models}},
  booktitle = {Proc. Interspeech},
  abstract = {Joint uncertainty decoding (JUD) is an effective
                   model-based noise compensation technique for
                   conventional Gaussian mixture model (GMM) based speech
                   recognition systems. In this paper, we apply JUD to
                   subspace Gaussian mixture model (SGMM) based acoustic
                   models. The total number of Gaussians in the SGMM
                   acoustic model is usually much larger than for
                   conventional GMMs, which limits the application of
                   approaches which explicitly compensate each Gaussian,
                   such as vector Taylor series (VTS). However, by
                   clustering the Gaussian components into a number of
                   regression classes, JUD-based noise compensation can be
                   successfully applied to SGMM systems. We evaluate the
                   JUD/SGMM technique using the Aurora 4 corpus, and the
                   experimental results indicate that it is more accurate
                   than conventional GMM-based systems using either VTS or
                   JUD noise compensation.},
  keywords = {acoustic modelling, noise compensation, SGMM, JUD},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-is2012.pdf},
  year = 2012
}
@inproceedings{6287948,
  author = {Saheer, L. and Yamagishi, J. and Garner, P.N. and
                   Dines, J.},
  title = {Combining vocal tract length normalization with
                   hierarchical linear transformations},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2012
                   IEEE International Conference on},
  volume = {},
  number = {},
  pages = {4493--4496},
  abstract = {Recent research has demonstrated the effectiveness of
                   vocal tract length normalization (VTLN) as a rapid
                   adaptation technique for statistical parametric speech
                   synthesis. VTLN produces speech with naturalness
                   preferable to that of MLLR-based adaptation techniques,
                   being much closer in quality to that generated by the
                   original average voice model. However with only a
                   single parameter, VTLN captures very few speaker
                   specific characteristics when compared to linear
                   transform based adaptation techniques. This paper
                   proposes that the merits of VTLN can be combined with
                   those of linear transform based adaptation in a
                   hierarchical Bayesian framework, where VTLN is used as
                   the prior information. A novel technique for
                   propagating the gender information from the VTLN prior
                   through constrained structural maximum a posteriori
                   linear regression (CSMAPLR) adaptation is presented.
                   Experiments show that the resulting transformation
                   yields improved speech quality, with better naturalness,
                   intelligibility and speaker similarity.},
  doi = {10.1109/ICASSP.2012.6287948},
  issn = {1520-6149},
  keywords = {CSMAPLR adaptation;MLLR based adaptation
                   technique;constrained structural maximum a posteriori
                   linear regression;hierarchical Bayesian
                   framework;hierarchical linear
                   transformation;intelligibility;rapid adaptation
                   technique;speaker similarity;statistical parametric
                   speech synthesis;vocal tract length normalization;Bayes
                   methods;speech intelligibility;},
  month = mar,
  year = 2012
}
@inproceedings{uriaIS2012,
  author = {Benigno Uria and Iain Murray and Steve Renals and
                   Korin Richmond},
  title = {Deep Architectures for Articulatory Inversion},
  booktitle = {Proc. Interspeech},
  address = {Portland, Oregon, USA},
  abstract = { We implement two deep architectures for the
                   acoustic-articulatory inversion mapping problem: a deep
                   neural network and a deep trajectory mixture density
                   network. We find that in both cases, deep architectures
                   produce more accurate predictions than shallow
                   architectures and that this is due to the higher
                   expressive capability of a deep model and not a
                   consequence of adding more adjustable parameters. We
                   also find that a deep trajectory mixture density
                   network is able to obtain better inversion accuracies
                   than smoothing the results of a deep neural network.
                   Our best model obtained an average root mean square
                   error of 0.885 mm on the MNGU0 test dataset.},
  categories = {Articulatory inversion, deep neural network, deep
                   belief network, deep regression network, pretraining},
  keywords = {Articulatory inversion, deep neural network, deep
                   belief network, deep regression network, pretraining},
  month = sep,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Uria_et_al_IS2012.pdf},
  year = 2012
}
@inproceedings{Wolters:mindcare,
  author = {Wolters, Maria and Martínez-Miranda, Juan and Hastie,
                   Helen and Matheson, Colin},
  title = {Managing Data in {Help4Mood}},
  booktitle = {The 2nd International Workshop on Computing Paradigms
                   for Mental Health - MindCare 2012},
  categories = {irrelevant speech effect, reminders, spearcon, speech,
                   working memory},
  location = {Vilamoura, Portugal},
  year = 2012
}
@article{2012E121001,
  author = {Junichi Yamagishi and Christophe Veaux and Simon King
                   and Steve Renals},
  title = {Speech synthesis technologies for individuals with
                   vocal disabilities: Voice banking and reconstruction},
  journal = {Acoustical Science and Technology},
  volume = {33},
  number = {1},
  pages = {1--5},
  url = {http://www.jstage.jst.go.jp/browse/ast/33/1/_contents},
  year = 2012
}
@article{Creer2012,
  author = {Sarah Creer and Stuart Cunningham and Phil Green and
                   Junichi Yamagishi},
  title = {Building personalised synthetic voices for individuals
                   with severe speech impairment},
  journal = {Computer Speech and Language},
  volume = {},
  number = {0},
  pages = { - },
  note = {},
  abstract = {For individuals with severe speech impairment, accurate
                   spoken communication can be difficult and require
                   considerable effort. Some may choose to use a voice
                   output communication aid (or VOCA) to support their
                   spoken communication needs. A VOCA typically takes
                   input from the user through a keyboard or switch-based
                   interface and produces spoken output using either
                   synthesised or recorded speech. The type and number of
                   synthetic voices that can be accessed with a VOCA are
                   often limited, and this has been implicated as a factor
                   for rejection of the devices. Therefore, there is a
                   need to be able to provide voices that are more
                   appropriate and acceptable for users. This paper
                   reports on a study that utilises recent advances in
                   speech synthesis to produce personalised synthetic
                   voices for 3 speakers with mild to severe dysarthria,
                   one of the most common speech disorders. Using a
                   statistical parametric approach to synthesis, an
                   average voice trained on data from several unimpaired
                   speakers was adapted using recordings of the impaired
                   speech of 3 dysarthric speakers. By careful selection
                   of the speech data and the model parameters, several
                   exemplar voices were produced for each speaker. A
                   qualitative evaluation was conducted with the speakers
                   and listeners who were familiar with the speaker. The
                   evaluation showed that for one of the 3 speakers a
                   voice could be created which conveyed many of his
                   personal characteristics, such as regional identity,
                   sex and age.},
  doi = {10.1016/j.csl.2012.10.001},
  issn = {0885-2308},
  keywords = {Speech synthesis, Augmentative and alternative
                   communication, Disordered speech, Voice output
                   communication aid},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230812000836?v=s5},
  year = 2012
}