@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2012-citations -ob /home/korin/projects/publications/new_output/transitdata/2012.bib -c 'year : "2012"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{swi2012_dnn,
author = {Swietojanski, P. and Ghoshal, A. and Renals, S.},
title = {Unsupervised cross-lingual knowledge transfer in {DNN-based LVCSR}},
booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
address = {Miami, Florida, USA},
abstract = {We investigate the use of cross-lingual acoustic data
to initialise deep neural network (DNN) acoustic models
by means of unsupervised restricted Boltzmann machine
(RBM) pretraining. DNNs for German are pretrained using
one or all of German, Portuguese, Spanish and Swedish.
The DNNs are used in a tandem configuration, where the
network outputs are used as features for a hidden
Markov model (HMM) whose emission densities are modeled
by Gaussian mixture models (GMMs), as well as in a
hybrid configuration, where the network outputs are
used as the HMM state likelihoods. The experiments show
that unsupervised pretraining is more crucial for the
hybrid setups, particularly with limited amounts of
transcribed training data. More importantly,
unsupervised pretraining is shown to be
language-independent.},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/ps_slt2012.pdf},
year = 2012
}
@article{Andersson2012175,
author = {Sebastian Andersson and Junichi Yamagishi and Robert
A.J. Clark},
title = {Synthesis and evaluation of conversational
characteristics in {HMM}-based speech synthesis},
journal = {Speech Communication},
volume = {54},
number = {2},
pages = {175--188},
abstract = {Spontaneous conversational speech has many
characteristics that are currently not modelled well by
HMM-based speech synthesis and in order to build
synthetic voices that can give an impression of someone
partaking in a conversation, we need to utilise data
that exhibits more of the speech phenomena associated
with conversations than the more generally used
carefully read aloud sentences. In this paper we show
that synthetic voices built with HMM-based speech
synthesis techniques from conversational speech data
preserved segmental and prosodic characteristics of
frequent conversational speech phenomena. An analysis
of an evaluation investigating the perception of
quality and speaking style of HMM-based voices confirms
that speech with conversational characteristics is
instrumental for listeners to perceive successful
integration of conversational speech phenomena in
synthetic speech. The achieved synthetic speech quality
provides an encouraging start for the continued use of
conversational speech in HMM-based speech synthesis.},
doi = {10.1016/j.specom.2011.08.001},
issn = {0167-6393},
keywords = {Speech synthesis, HMM, Conversation, Spontaneous
speech, Filled pauses, Discourse marker},
url = {http://www.sciencedirect.com/science/article/pii/S0167639311001178},
year = 2012
}
@article{steiner:EL106,
author = {Ingmar Steiner and Korin Richmond and Ian Marshall and
Calum D. Gray},
title = {The magnetic resonance imaging subset of the mngu0
articulatory corpus},
journal = {The Journal of the Acoustical Society of America},
volume = {131},
number = {2},
pages = {EL106--EL111},
abstract = {This paper announces the availability of the magnetic
resonance imaging (MRI) subset of the mngu0 corpus, a
collection of articulatory speech data from one speaker
containing different modalities. This subset comprises
volumetric MRI scans of the speaker's vocal tract
during sustained production of vowels and consonants,
as well as dynamic mid-sagittal scans of repetitive
consonant--vowel (CV) syllable production. For
reference, high-quality acoustic recordings of the
speech material are also available. The raw data are
made freely available for research purposes. },
doi = {10.1121/1.3675459},
keywords = {audio recording; magnetic resonance imaging; speech
processing},
month = jan,
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/mngu0-mri-2.pdf},
publisher = {ASA},
year = 2012
}
@inproceedings{ultraxIS2012,
author = {Richmond, Korin and Renals, Steve},
title = {Ultrax: An Animated Midsagittal Vocal Tract Display
for Speech Therapy},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = {Speech sound disorders (SSD) are the most common
communication impairment in childhood, and can hamper
social development and learning. Current speech therapy
interventions rely predominantly on the auditory skills
of the child, as little technology is available to
assist in diagnosis and therapy of SSDs. Realtime
visualisation of tongue movements has the potential to
bring enormous benefit to speech therapy. Ultrasound
scanning offers this possibility, although its display
may be hard to interpret. Our ultimate goal is to
exploit ultrasound to track tongue movement, while
displaying a simplified, diagrammatic vocal tract that
is easier for the user to interpret. In this paper, we
outline a general approach to this problem, combining a
latent space model with a dimensionality reducing model
of vocal tract shapes. We assess the feasibility of
this approach using magnetic resonance imaging (MRI)
scans to train a model of vocal tract shapes, which is
animated using electromagnetic articulography (EMA)
data from the same speaker.},
categories = {Ultrasound, speech therapy, vocal tract visualisation},
keywords = {Ultrasound, speech therapy, vocal tract visualisation},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/RichmondRenalsIS2012.pdf},
year = 2012
}
@article{Burton2012,
author = {Christopher Burton and Brian McKinstry and Aurora
Szentagotai Tatar and Antoni Serrano-Blanco and Claudia
Pagliari and Maria Wolters},
title = {Activity monitoring in patients with depression: A
systematic review.},
journal = {Journal of Affective Disorders},
abstract = {Background: Altered physical activity is an important
feature of depression. It is manifested in psychomotor
retardation, agitation and withdrawal from engagement
in normal activities. Modern devices for activity
monitoring (actigraphs) make it possible to monitor
physical activity unobtrusively but the validity of
actigraphy as an indicator of mood state is uncertain.
We carried out a systematic review of digital
actigraphy in patients with depression to investigate
the associations between measured physical activity and
depression. Methods: Systematic review and
meta-analysis. Studies were identified from Medline,
EMBASE and Psycinfo databases and included if they were
either case control or longitudinal studies of
actigraphy in adults aged between 18 and 65 diagnosed
with a depressive disorder. Outcomes were daytime and
night-time activity and actigraphic measures of sleep.
Results: We identified 19 eligible papers from 16
studies (412 patients). Case control studies showed
less daytime activity in patients with depression
(standardised mean difference −0.76, 95\% confidence
intervals −1.05 to −0.47). Longitudinal studies
showed moderate increase in daytime activity (0.53,
0.20 to 0.87) and a reduction in night-time activity
(−0.36, −0.65 to −0.06) over the course of
treatment. Limitations: All study participants were
unblinded. Only seven papers included patients treated
in the community. Conclusions: Actigraphy is a
potentially valuable source of additional information
about patients with depression. However, there are no
clear guidelines for use of actigraphy in studies of
patients with depression. Further studies should
investigate patients treated in the community.
Additional work to develop algorithms for
differentiating behaviour patterns is also needed.},
categories = {"Depressive disorder","Actigraphy", "Telemonitoring"},
doi = {10.1016/j.jad.2012.07.001},
issn = {0165-0327},
url = {http://www.sciencedirect.com/science/article/pii/S0165032712005034},
year = 2012
}
@article{Wang_JCST2012,
author = {Dong Wang and Javier Tejedor and Simon King and Joe
Frankel},
title = {Term-dependent Confidence Normalization for
Out-of-Vocabulary Spoken Term Detection},
journal = {Journal of Computer Science and Technology},
volume = {27},
number = {2},
abstract = {Spoken Term Detection (STD) is a fundamental component
of spoken information retrieval systems. A key task of
an STD system is to determine reliable detections and
reject false alarms based on certain confidence
measures. The detection posterior probability, which is
often computed from lattices, is a widely used
confidence measure. However, a potential problem of
this confidence measure is that the confidence scores
of detections of all search terms are treated
uniformly, regardless of how much they may differ in
terms of phonetic or linguistic properties. This
problem is particularly evident for out-of-vocabulary
(OOV) terms which tend to exhibit high intra-term
diversity. To address the discrepancy on confidence
levels that the same confidence score may convey for
different terms, a term-dependent decision strategy is
desirable – for example, the term-specific threshold
(TST) approach. In this work, we propose a
term-dependent normalisation technique which
compensates for term diversity on confidence
estimation. Particularly, we propose a linear bias
compensation and a discriminative compensation to deal
with the bias problem that is inherent in lattice-based
confidence measuring from which the TST approach
suffers. We tested the proposed technique on speech
data from the multi-party meeting domain with two
state-of-the-art STD systems based on phonemes and
words respectively. The experimental results
demonstrate that the confidence normalisation approach
leads to a significant performance improvement in STD,
particularly for OOV terms with phoneme-based systems.},
doi = {10.1007/s11390-012-1228-x},
year = 2012
}
@inproceedings{Wolters:2012:HTS:2212776.2223703,
author = {Wolters, Maria and Isaac, Karl and Doherty, Jason},
title = {Hold that thought: are spearcons less disruptive than
spoken reminders?},
booktitle = {CHI '12 Extended Abstracts on Human Factors in
Computing Systems},
series = {CHI EA '12},
pages = {1745--1750},
address = {New York, NY, USA},
publisher = {ACM},
acmid = {2223703},
doi = {10.1145/2212776.2223703},
isbn = {978-1-4503-1016-1},
keywords = {irrelevant speech effect, reminders, spearcon, speech,
working memory},
location = {Austin, Texas, USA},
numpages = {6},
url = {http://doi.acm.org/10.1145/2212776.2223703},
year = 2012
}
@inproceedings{lingIS2012,
author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
Junichi},
title = {Vowel Creation by Articulatory Control in {HMM}-based
Parametric Speech Synthesis},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = { This paper presents a method to produce a new vowel
by articulatory control in hidden Markov model (HMM)
based parametric speech synthesis. A multiple
regression HMM (MRHMM) is adopted to model the
distribution of acoustic features, with articulatory
features used as external auxiliary variables. The
dependency between acoustic and articulatory features
is modelled by a group of linear transforms that are
either estimated context-dependently or determined by
the distribution of articulatory features. Vowel
identity is removed from the set of context features
used to ensure compatibility between the
context-dependent model parameters and the articulatory
features of a new vowel. At synthesis time, acoustic
features are predicted according to the input
articulatory features as well as context information.
With an appropriate articulatory feature sequence, a
new vowel can be generated even when it does not exist
in the training set. Experimental results show this
method is effective in creating the English vowel /2/
by articulatory control without using any acoustic
samples of this vowel.},
categories = {Speech synthesis, articulatory features,
multiple-regression hidden Markov model},
keywords = {Speech synthesis, articulatory features,
multiple-regression hidden Markov model},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/LingRichmondYamagishi_IS2012.pdf},
year = 2012
}
@inproceedings{Wolters:optimi1,
author = { Maria Wolters and Colin Matheson},
title = {Designing {Help4Mood}: Trade-Offs and Choices},
booktitle = {Information and Communication Technologies applied to
Mental Health},
editor = {Garcia-Gomez, Juan Miguel and Paniagua-Paniagua,
Patricia},
publisher = {Editorial Universitat Politecnica de Valencia},
categories = {depression, eHealth},
isbn = {978-84-8363-942-9},
location = {Valencia, Spain},
year = 2012
}
@inproceedings{hengluIS2012,
abstract = {Speech units are highly context-dependent, so taking
contextual features into account is essential for
speech modelling. Context is employed in HMM-based
Text-to-Speech speech synthesis systems via
context-dependent phone models. A very wide context is
taken into account, represented by a large set of
contextual factors. However, most of these factors
probably have no significant influence on the speech,
most of the time. To discover which combinations of
features should be taken into account, decision
tree-based context clustering is used. But the space of
context-dependent models is vast, and the number of
contexts seen in the training data is only a tiny
fraction of this space, so the task of the decision
tree is very hard: to generalise from observations of a
tiny fraction of the space to the rest of the space,
whilst ignoring uninformative or redundant context
features. The structure of the context feature space
has not been systematically studied for speech
synthesis. In this paper we discover a dependency
structure by learning a Bayesian Network over the joint
distribution of the features and the speech. We
demonstrate that it is possible to discard the majority
of context features with minimal impact on quality,
measured by a perceptual test.},
address = {Portland, Oregon, USA},
author = {Heng Lu and Simon King},
booktitle = {Proc. Interspeech},
categories = {HMM-based speech synthesis, Bayesian Networks, context
information},
keywords = {HMM-based speech synthesis, Bayesian Networks, context
information},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/HengLuSimonKing.pdf},
title = {Using {Bayesian} Networks to find relevant context
features for {HMM}-based speech synthesis},
year = 2012
}
@inproceedings{CassiaICASSP12,
author = {Valentini-Botinhao, C. and Maia, R. and Yamagishi, J.
and King, S. and Zen, H.},
title = {{Cepstral analysis based on the Glimpse proportion
measure for improving the intelligibility of
{HMM}-based synthetic speech in noise}},
booktitle = {Proc. ICASSP},
pages = {3997--4000},
address = {Kyoto, Japan},
abstract = {In this paper we introduce a new cepstral coefficient
extraction method based on an intelligibility measure
for speech in noise, the Glimpse Proportion measure.
This new method aims to increase the intelligibility of
speech in noise by modifying the clean speech, and has
applications in scenarios such as public announcement
and car navigation systems. We first explain how the
Glimpse Proportion measure operates and further show
how we approximated it to integrate it into an existing
spectral envelope parameter extraction method commonly
used in the HMM-based speech synthesis framework. We
then demonstrate how this new method changes the
modelled spectrum according to the characteristics of
the noise and show results for a listening test with
vocoded and HMM-based synthetic speech. The test
indicates that the proposed method can significantly
improve intelligibility of synthetic speech in speech
shaped noise.},
categories = {HMM-based speech synthesis, intelligibility
enhancement, speech analysis},
doi = {10.1109/ICASSP.2012.6288794},
month = mar,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_ICASSP12.pdf},
year = 2012
}
@inproceedings{PhillipIS2012,
author = {Phillip L. De Leon and Bryan Stewart and Junichi
Yamagishi},
title = {Synthetic Speech Discrimination using Pitch Pattern
Statistics Derived from Image Analysis},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = {In this paper, we extend the work by Ogihara et al.
to discriminate between human and synthetic speech
using features based on pitch patterns. As previously
demonstrated, significant differences in pitch patterns
between human and synthetic speech can be leveraged to
classify speech as being human or synthetic in origin.
We propose using mean pitch stability, mean pitch
stability range, and jitter as features extracted after
image analysis of pitch patterns. We have observed that
for synthetic speech, these features lie in a small and
distinct space as compared to human speech and have
modeled them with a multivariate Gaussian distribution.
Our classifier is trained using synthetic speech
collected from the 2008 and 2011 Blizzard Challenge
along with Festival pre-built voices and human speech
from the NIST2002 corpus. We evaluate the classifier on
a much larger corpus than previously studied using
human speech from the Switchboard corpus, synthetic
speech from the Resource Management corpus, and
synthetic speech generated from Festival trained on the
Wall Street Journal corpus. Results show 98\% accuracy
in correctly classifying human speech and 96\% accuracy
in correctly classifying synthetic speech.},
month = sep,
year = 2012
}
@phdthesis{watts-2012,
author = {Oliver Watts},
title = {Unsupervised Learning for Text-to-Speech Synthesis},
school = {University of Edinburgh},
abstract = {This thesis introduces a general method for
incorporating the distributional analysis of textual
and linguistic objects into text-to-speech (TTS)
conversion systems. Conventional TTS conversion uses
intermediate layers of representation to bridge the gap
between text and speech. Collecting the annotated data
needed to produce these intermediate layers is a far
from trivial task, possibly prohibitively so for
languages in which no such resources are in existence.
Distributional analysis, in contrast, proceeds in an
unsupervised manner, and so enables the creation of
systems using textual data that are not annotated. The
method therefore aids the building of systems for
languages in which conventional linguistic resources
are scarce, but is not restricted to these languages.
The distributional analysis proposed here places the
textual objects analysed in a continuous-valued space,
rather than specifying a hard categorisation of those
objects. This space is then partitioned during the
training of acoustic models for synthesis, so that the
models generalise over objects' surface forms in a way
that is acoustically relevant. The method is applied to
three levels of textual analysis: to the
characterisation of sub-syllabic units, word units and
utterances. Entire systems for three languages
(English, Finnish and Romanian) are built with no
reliance on manually labelled data or language-specific
expertise. Results of a subjective evaluation are
presented.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/oliver_watts_thesis.pdf},
year = 2012
}
@inproceedings{Wolters:bhci,
author = {Wolters, Maria and McCloughan, Lucy and Gibson, Martin
and Weatherall, Chris and Matheson, Colin and Maloney,
Tim and Castro-Robles, Juan Carlos and Estevez, Soraya },
title = {Monitoring People with Depression in the
Community---Regulatory Aspects},
booktitle = {Workshop on People, Computers and Psychiatry at the
British Computer Society's Conference on Human Computer
Interaction},
categories = {depression, regulation, monitoring},
location = {Birmingham, UK},
year = 2012
}
@inproceedings{mayo:12,
author = {Mayo, C. and Aubanel, V. and Cooke, M.},
title = {Effect of prosodic changes on speech intelligibility},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
year = 2012
}
@inproceedings{Jaime2IS2012,
author = {J. Lorenzo and B. Martinez and R. Barra-Chicote and V.
Lopez-Ludena and J. Ferreiros and J. Yamagishi and
J.M. Montero},
title = {Towards an Unsupervised Speaking Style Voice Building
Framework: Multi-Style Speaker Diarization},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = {Current text-to-speech systems are developed
using studio-recorded speech in a neutral style or
based on acted emotions. However, the proliferation of
media sharing sites would allow developing a new
generation of speech-based systems which could cope
with spontaneous and styled speech. This paper
proposes an architecture to deal with realistic
recordings and carries out some experiments on
unsupervised speaker diarization. In order to maximize
the speaker purity of the clusters while keeping a high
speaker coverage, the paper evaluates the F-measure
of a diarization module, achieving high scores (>85\%),
especially when the clusters are longer than 30
seconds, even for the more spontaneous and expressive
styles (such as talk shows or sports).},
month = sep,
year = 2012
}
@inproceedings{lingvowel,
author = {Ling, Zhen-Hua and Richmond, Korin and Yamagishi,
Junichi},
title = {Vowel Creation by Articulatory Control in {HMM}-based
Parametric Speech Synthesis},
booktitle = {Proc. The Listening Talker Workshop},
pages = {72},
address = {Edinburgh, UK},
month = may,
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/Ling_etal_LISTA.pdf},
year = 2012
}
@inproceedings{Wolters:cyber17,
author = {Claudia Pagliari and Maria Wolters and Chris Burton
and Brian McKinstry and Aurora Szentagotai and Antoni
Serrano-Blanco and Daniel David and Luis Ferrini and
Susanna Albertini and Joan Carlos Castro and Soraya
Estévez},
title = {Psychosocial Implications of Avatar Use in Supporting
Therapy of Depression},
booktitle = {CYBER17 - 17th Annual CyberPsychology \& CyberTherapy
Conference},
categories = {depression, cybertherapy, monitoring},
location = {Brussels, Belgium},
year = 2012
}
@article{wester:specom:12,
author = {Mirjam Wester},
title = {Talker discrimination across languages},
journal = {Speech Communication},
volume = {54},
pages = {781--790},
abstract = {This study investigated the extent to which listeners
are able to discriminate between bilingual talkers in
three language pairs – English–German,
English–Finnish and English–Mandarin. Native
English listeners were presented with two sentences
spoken by bilingual talkers and were asked to judge
whether they thought the sentences were spoken by the
same person. Equal amounts of cross-language and
matched-language trials were presented. The results
show that native English listeners are able to carry
out this task well; achieving percent correct levels at
well above chance for all three language pairs.
Previous research has shown this for English–German,
this research shows listeners also extend this to
Finnish and Mandarin, languages that are quite distinct
from English from a genetic and phonetic similarity
perspective. However, listeners are significantly less
accurate on cross-language talker trials
(English–foreign) than on matched-language trials
(English–English and foreign–foreign).
Understanding listeners’ behaviour in cross-language
talker discrimination using natural speech is the first
step in developing principled evaluation techniques for
synthesis systems in which the goal is for the
synthesised voice to sound like the original speaker,
for instance, in speech-to-speech translation systems,
voice conversion and reconstruction.},
categories = {evaluation},
doi = {10.1016/j.specom.2012.01.006},
pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2012/wester_specom_12.pdf},
year = 2012
}
@inproceedings{llu2012map,
author = {Lu, L. and Ghoshal, A. and Renals, S.},
title = {{Maximum a posteriori adaptation of subspace Gaussian
mixture models for cross-lingual speech recognition}},
booktitle = {Proc. ICASSP},
abstract = {This paper concerns cross-lingual acoustic modeling in
the case when there are limited target language
resources. We build on an approach in which a subspace
Gaussian mixture model (SGMM) is adapted to the target
language by reusing the globally shared parameters
estimated from out-of-language training data. In
current cross-lingual systems, these parameters are
fixed when training the target system, which can give
rise to a mismatch between the source and target
systems. We investigate a maximum a posteriori (MAP)
adaptation approach to alleviate the potential
mismatch. In particular, we focus on the adaptation of
phonetic subspace parameters using a matrix variate
Gaussian prior distribution. Experiments on the
GlobalPhone corpus using the MAP adaptation approach
result in word error rate reductions, compared with
the cross-lingual baseline systems and systems updated
using maximum likelihood, for training conditions with
1 hour and 5 hours of target language data.},
keywords = {Subspace Gaussian Mixture Model, Maximum a Posteriori
Adaptation, Cross-lingual Speech Recognition},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-icassp-2012.pdf},
year = 2012
}
@inproceedings{cooke:lista:12,
author = {Martin Cooke and Maria Luisa García Lecumberri and
Yan Tang and Mirjam Wester},
title = {Do non-native listeners benefit from speech
modifications designed to promote intelligibility for
native listeners?},
booktitle = {Proceedings of The Listening Talker Workshop},
pages = 59,
note = {http://listening-talker.org/workshop/programme.html},
year = 2012
}
@inproceedings{dallIS2012,
author = {Dall, Rasmus and Veaux, Christophe and Yamagishi,
Junichi and King, Simon},
title = {Analysis of Speaker Clustering Strategies for
{HMM}-Based Speech Synthesis},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = {This paper describes a method for speaker clustering,
with the application of building average voice models
for speaker-adaptive HMM-based speech synthesis that
are a good basis for adapting to specific target
speakers. Our main hypothesis is that using
perceptually similar speakers to build the average
voice model will be better than using unselected
speakers, even if the amount of data available from
perceptually similar speakers is smaller. We measure
the perceived similarities among a group of 30 female
speakers in a listening test and then apply multiple
linear regression to automatically predict these
listener judgements of speaker similarity and thus to
identify similar speakers automatically. We then
compare a variety of average voice models trained on
either speakers who were perceptually judged to be
similar to the target speaker, or speakers selected by
the multiple linear regression, or a large global set
of unselected speakers. We find that the average voice
model trained on perceptually similar speakers provides
better performance than the global model, even though
the latter is trained on more data, confirming our main
hypothesis. However, the average voice model using
speakers selected automatically by the multiple linear
regression does not reach the same level of
performance.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/DallIS2012.pdf},
year = 2012
}
@inproceedings{JaimeIS2012,
author = {Jaime Lorenzo-Trueba and Roberto Barra-Chicote and
Tuomo Raitio and Nicolas Obin and Paavo Alku and
Junichi Yamagishi and Juan M Montero},
title = { Towards Glottal Source Controllability in Expressive
Speech Synthesis},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = {In order to obtain more human-like sounding
human-machine interfaces we must first be able to give them
expressive capabilities in the way of emotional and
stylistic features so as to closely adequate them to
the intended task. If we want to replicate those
features it is not enough to merely replicate the
prosodic information of fundamental frequency and
speaking rhythm. The proposed additional layer is the
modification of the glottal model, for which we make
use of the GlottHMM parameters. This paper analyzes the
viability of such an approach by verifying that the
expressive nuances are captured by the aforementioned
features, obtaining 95\% recognition rates on styled
speaking and 82\% on emotional speech. Then we evaluate
the effect of speaker bias and recording environment on
the source modeling in order to quantify possible
problems when analyzing multi-speaker databases.
Finally we propose a speaking styles separation for
Spanish based on prosodic features and check its
perceptual significance.},
month = sep,
year = 2012
}
@inproceedings{bell12_mlan,
author = {Bell, P. and Gales, M. and Lanchantin, P. and Liu, X.
and Long, Y. and Renals, S. and Swietojanski, P. and
Woodland, P.},
title = {Transcription of multi-genre media archives using
out-of-domain data},
booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
address = {Miami, Florida, USA},
abstract = {We describe our work on developing a speech
recognition system for multi-genre media archives. The
high diversity of the data makes this a challenging
recognition task, which may benefit from systems
trained on a combination of in-domain and out-of-domain
data. Working with tandem HMMs, we present Multi-level
Adaptive Networks (MLAN), a novel technique for
incorporating information from out-of-domain posterior
features using deep neural networks. We show that it
provides a substantial reduction in WER over other
systems, with relative WER reductions of 15\% over a
PLP baseline, 9\% over in-domain tandem features and
8\% over the best out-of-domain tandem features.},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/mlan_slt2012.pdf},
year = 2012
}
@article{Oura2012703,
author = {Keiichiro Oura and Junichi Yamagishi and Mirjam Wester
and Simon King and Keiichi Tokuda},
title = {Analysis of unsupervised cross-lingual speaker
adaptation for {HMM}-based speech synthesis using
{KLD}-based transform mapping},
journal = {Speech Communication},
volume = {54},
number = {6},
pages = {703--714},
abstract = {In the EMIME project, we developed a mobile device
that performs personalized speech-to-speech translation
such that a user's spoken input in one language is used
to produce spoken output in another language, while
continuing to sound like the user's voice. We
integrated two techniques into a single architecture:
unsupervised adaptation for HMM-based TTS using
word-based large-vocabulary continuous speech
recognition, and cross-lingual speaker adaptation
(CLSA) for HMM-based TTS. The CLSA is based on a
state-level transform mapping learned using minimum
Kullback-Leibler divergence between pairs of HMM states
in the input and output languages. Thus, an
unsupervised cross-lingual speaker adaptation system
was developed. End-to-end speech-to-speech translation
systems for four languages (English, Finnish, Mandarin,
and Japanese) were constructed within this framework.
In this paper, the English-to-Japanese adaptation is
evaluated. Listening tests demonstrate that adapted
voices sound more similar to a target speaker than
average voices and that differences between supervised
and unsupervised cross-lingual speaker adaptation are
small. Calculating the KLD state-mapping on only the
first 10 mel-cepstral coefficients leads to huge
savings in computational costs, without any detrimental
effect on the quality of the synthetic speech.},
doi = {10.1016/j.specom.2011.12.004},
issn = {0167-6393},
keywords = {HMM-based speech synthesis, Unsupervised speaker
adaptation, Cross-lingual speaker adaptation,
Speech-to-speech translation},
url = {http://www.sciencedirect.com/science/article/pii/S0167639311001774},
year = 2012
}
@inproceedings{badinoclark_interspeech12,
author = {Leonardo Badino and Robert A.J. Clark and Mirjam
Wester},
title = {Towards Hierarchical Prosodic Prominence Generation in
{TTS} Synthesis},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
categories = {speech synthesis, prosody},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/badinoclark_IS_2012.pdf},
year = 2012
}
@inproceedings{bell12_tutoring,
author = {Bell, Peter and Dzikovska, Myroslava and Isard, Amy},
title = {Designing a spoken language interface for a tutorial
dialogue system},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = {We describe our work in building a spoken language
interface for a tutorial dialogue system. Our goal is
to allow natural, unrestricted student interaction with
the computer tutor, which has been shown to improve the
student's learning gain, but presents challenges for
speech recognition and spoken language understanding.
We discuss the choice of system components and present
the results of development experiments in both acoustic
and language modelling for speech recognition in this
domain.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/tutoring_is2012.pdf},
year = 2012
}
@inproceedings{zwyssig2012determining,
author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
title = {Determining the number of speakers in a meeting using
microphone array features},
booktitle = {Proc. ICASSP},
pages = {4765--4768},
year = 2012
}
@article{Hashimoto2012857,
author = {Kei Hashimoto and Junichi Yamagishi and William Byrne
and Simon King and Keiichi Tokuda},
title = {Impacts of machine translation and speech synthesis on
speech-to-speech translation},
journal = {Speech Communication},
volume = {54},
number = {7},
pages = {857--866},
abstract = {This paper analyzes the impacts of machine translation
and speech synthesis on speech-to-speech translation
systems. A typical speech-to-speech translation system
consists of three components: speech recognition,
machine translation and speech synthesis. Many
techniques have been proposed for integration of speech
recognition and machine translation. However,
corresponding techniques have not yet been considered
for speech synthesis. The focus of the current work is
machine translation and speech synthesis, and we
present a subjective evaluation designed to analyze
their impact on speech-to-speech translation. The
results of these analyses show that the naturalness and
intelligibility of the synthesized speech are strongly
affected by the fluency of the translated sentences. In
addition, several features were found to correlate well
with the average fluency of the translated sentences
and the average naturalness of the synthesized speech.},
doi = {10.1016/j.specom.2012.02.004},
issn = {0167-6393},
keywords = {Speech-to-speech translation, Machine translation,
Speech synthesis, Subjective evaluation},
url = {http://www.sciencedirect.com/science/article/pii/S0167639312000283},
year = 2012
}
@inproceedings{stan12_grapheme_alignment,
author = {Stan, Adriana and Bell, Peter and King, Simon},
title = {A Grapheme-based Method for Automatic Alignment of
Speech and Text Data},
booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
address = {Miami, Florida, USA},
abstract = {This paper introduces a method for automatic alignment
of speech data with unsynchronised, imperfect
transcripts, for a domain where no initial acoustic
models are available. Using grapheme-based acoustic
models, word skip networks and orthographic speech
transcripts, we are able to harvest 55\% of the speech
with a 93\% utterance-level accuracy and 99\% word
accuracy for the produced transcriptions. The work is
based on the assumption that there is a high degree of
correspondence between the speech and text, and that a
full transcription of all of the speech is not
required. The method is language independent and the
only prior knowledge and resources required are the
speech and text transcripts, and a few minor user
interventions.},
month = dec,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/grapheme_alignment_slt2012.pdf},
year = 2012
}
@article{6205335,
author = {De Leon, P. L. and Pucher, M. and Yamagishi, J. and
Hernaez, I. and Saratxaga, I.},
title = {Evaluation of Speaker Verification Security and
Detection of {HMM}-Based Synthetic Speech},
journal = {IEEE Transactions on Audio, Speech, and Language Processing},
volume = {20},
number = {8},
pages = {2280--2290},
abstract = {In this paper, we evaluate the vulnerability of
speaker verification (SV) systems to synthetic speech.
The SV systems are based on either the Gaussian mixture
model--universal background model (GMM-UBM) or
support vector machine (SVM) using GMM supervectors. We
use a hidden Markov model (HMM)-based text-to-speech
(TTS) synthesizer, which can synthesize speech for a
target speaker using small amounts of training data
through model adaptation of an average voice or
background model. Although the SV systems have a very
low equal error rate (EER), when tested with synthetic
speech generated from speaker models derived from the
Wall Street Journal (WSJ) speech corpus, over 81\% of
the matched claims are accepted. This result suggests
vulnerability in SV systems and thus a need to
accurately detect synthetic speech. We propose a new
feature based on relative phase shift (RPS),
demonstrate reliable detection of synthetic speech, and
show how this classifier can be used to improve
security of SV systems.},
doi = {10.1109/TASL.2012.2201472},
issn = {1558-7916},
month = oct,
year = 2012
}
@inproceedings{CassiaSAPA12,
author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
title = {{Speech intelligibility enhancement for {HMM}-based
synthetic speech in noise}},
booktitle = {Proc. Sapa Workshop},
address = {Portland, Oregon, USA},
abstract = {It is possible to increase the intelligibility of
speech in noise by enhancing the clean speech signal.
In this paper we demonstrate the effects of modifying
the spectral envelope of synthetic speech according to
the environmental noise. To achieve this, we modify Mel
cepstral coefficients according to an intelligibility
measure that accounts for glimpses of speech in noise:
the Glimpse Proportion measure. We evaluate this method
against a baseline synthetic voice trained only with
normal speech and a topline voice trained with Lombard
speech, as well as natural speech. The intelligibility
of these voices was measured when mixed with
speech-shaped noise and with a competing speaker at
three different levels. The Lombard voices, both
natural and synthetic, were more intelligible than the
normal voices in all conditions. For speech-shaped
noise, the proposed modified voice was as intelligible
as the Lombard synthetic voice without requiring any
recordings of Lombard speech, which are hard to obtain.
However, in the case of competing talker noise, the
Lombard synthetic voice was more intelligible than the
proposed modified voice.},
categories = {HMM-based speech synthesis, intelligibility
enhancement},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Sapa12.pdf},
year = 2012
}
@inproceedings{Wolters:medetel,
author = {Wolters, Maria and Ferrini, Louis and
Martinez-Miranda, Juan and Hastie, Helen and Burton,
Chris },
title = {{Help4Mood} - A Flexible Solution for Supporting
People with Depression in the Community across Europe},
booktitle = {Proceedings of The International eHealth, Telemedicine
and Health ICT Forum For Education, Networking and
Business (MedeTel, 2012)},
publisher = {International Society for Telemedicine \& eHealth
(ISfTeH)},
categories = {depression, mental health, ehealth},
editor = {Jordanova, E. and Lievens, F.},
location = {Luxembourg},
year = 2012
}
@inproceedings{zwyssig2012effect,
author = {Zwyssig, E. and Renals, S. and Lincoln, M.},
title = {{On the effect of SNR and superdirective beamforming
in speaker diarisation in meetings}},
booktitle = {Proc. ICASSP},
pages = {4177--4180},
year = 2012
}
@inproceedings{CassiaLista12,
author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
title = {{Using an intelligibility measure to create noise
robust cepstral coefficients for {HMM}-based speech
synthesis}},
booktitle = {Proc. LISTA Workshop},
address = {Edinburgh, UK},
categories = {HMM-based speech synthesis, intelligibility
enhancement},
month = may,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Lista12.pdf},
year = 2012
}
@inproceedings{CassiaWocci12,
author = {Valentini-Botinhao, C. and Degenkolb-Weyers, S. and
Maier, A. and Noeth, E. and Eysholdt, U. and Bocklet,
T.},
title = {{Automatic detection of sigmatism in children}},
booktitle = {Proc. WOCCI},
address = {Portland, Oregon, USA},
abstract = {We propose in this paper an automatic system to detect
sigmatism from the speech signal. Sigmatism occurs when
the tongue is positioned incorrectly during
articulation of sibilant phones like /s/ and /z/. For
our task we extracted various sets of features from
speech: Mel frequency cepstral coefficients, energies
in specific bandwidths of the spectral envelope, and
the so-called supervectors, which are the parameters of
an adapted speaker model. We then trained several
classifiers on a speech database of German adults
simulating three different types of sigmatism.
Recognition results were calculated at a phone, word
and speaker level for both the simulated database and
for a database of pathological speakers. For the
simulated database, we achieved recognition rates of up
to 86\%, 87\% and 94\% at a phone, word and speaker level.
The best classifier was then integrated as part of a
Java applet that allows patients to record their own
speech, either by pronouncing isolated phones, a
specific word or a list of words, and provides them
with feedback on whether the sibilant phones are being
correctly pronounced.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_WOCCI12.pdf},
year = 2012
}
@inproceedings{janskaetal_interspeech12,
author = {Anna C. Janska and Erich Schröger and Thomas Jacobsen
and Robert A. J. Clark},
title = {Asymmetries in the perception of synthesized speech},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
categories = {speech synthesis, evaluation},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/janskaeral_IS_2012.pdf},
year = 2012
}
@inproceedings{koutsogiannaki:12,
author = {Koutsogiannaki, M. and Pettinato, M. and Mayo, C. and
Kandia, V. and Stylianou, Y.},
title = {Can modified casual speech reach the intelligibility
of clear speech?},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
year = 2012
}
@article{Wolters:ICST,
title = {Managing Data in {Help4Mood}},
journal = {ICST Transactions on Ambient Systems},
number = {Special Issue on Technology in Mental Health},
author = {Wolters, Maria and Martinez-Miranda, Juan and Hastie,
Helen F. and Estevez, Soraya and Matheson, Colin},
categories = {mental health, depression, monitoring, ontologies,
SNOMED-CT},
year = 2012
}
@inproceedings{lu2012jud,
author = {Lu, L. and Ghoshal, A. and Renals, S.},
title = {{Joint uncertainty decoding with unscented transform
for noise robust subspace Gaussian mixture model}},
booktitle = {Proc. Sapa-Scale Workshop},
abstract = {Common noise compensation techniques use vector Taylor
series (VTS) to approximate the mismatch function.
Recent work shows that the approximation accuracy may
be improved by sampling. One such sampling technique is
the unscented transform (UT), which draws samples
deterministically from clean speech and noise model to
derive the noise corrupted speech parameters. This
paper applies UT to noise compensation of the subspace
Gaussian mixture model (SGMM). Since UT requires
a relatively small number of samples for accurate
estimation, it has significantly lower computational
cost compared to other random sampling techniques.
However, the number of surface Gaussians in an SGMM is
typically very large, making the direct application of
UT, for compensating individual Gaussian components,
computationally impractical. In this paper, we avoid
the computational burden by employing UT in the
framework of joint uncertainty decoding (JUD), which
groups all the Gaussian components into a small number of
classes, sharing the compensation parameters by class.
We evaluate the JUD-UT technique for an SGMM system
using the Aurora 4 corpus. Experimental results
indicate that UT can lead to increased accuracy
compared to VTS approximation if the JUD phase factor
is untuned, and to similar accuracy if the phase factor
is tuned empirically.},
keywords = {noise compensation, SGMM, JUD, UT},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-sapa2012.pdf},
year = 2012
}
@inproceedings{sansegundo_et_al_IS2012,
author = {Ruben San-Segundo and Juan M. Montero and Veronica
Lopez-Ludena and Simon King},
title = {Detecting Acronyms from Capital Letter Sequences in
Spanish},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = {This paper presents an automatic strategy to decide
how to pronounce a Capital Letter Sequence (CLS) in a
Text to Speech system (TTS). If CLS is well known by
the TTS, it can be expanded in several words. But when
the CLS is unknown, the system has two alternatives:
spelling it (abbreviation) or pronouncing it as a new
word (acronym). In Spanish, there is a close
relationship between letters and phonemes. Because of
this, when a CLS is similar to other words in Spanish,
there is a high tendency to pronounce it as a standard
word. This paper proposes an automatic method for
detecting acronyms. Additionally, this paper analyses
the discrimination capability of some features, and
several strategies for combining them in order to
obtain the best classifier. For the best classifier,
the classification error is 8.45\%. Regarding the feature
analysis, the best features have been the Letter
Sequence Perplexity and the Average N-gram order.},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Thu-P10a-07.pdf},
year = 2012
}
@inproceedings{aubanel:12,
author = {Aubanel, V. and Cooke, M. and Foster, E. and
Garcia-Lecumberri, M. L. and Mayo, C.},
title = {Effects of the availability of visual information and
presence of competing conversations on speech
production},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
year = 2012
}
@inproceedings{Wolters:medetel-castro,
author = {Estevez, Soraya and Castro-Robles, Juan Carlos and
Wolters, Maria },
title = {{Help4Mood}: First Release of a Computational
Distributed System to Support the Treatment of Patients
with Major Depression},
booktitle = {Proceedings of The International eHealth, Telemedicine
and Health ICT Forum For Education, Networking and
Business (MedeTel, 2012)},
publisher = {International Society for Telemedicine \& eHealth
(ISfTeH)},
categories = {depression, mental health, ehealth},
editor = {Jordanova, E. and Lievens, F.},
location = {Luxembourg},
year = 2012
}
@inproceedings{dzikovska-EtAl:2012:EACL2012,
author = {Dzikovska, Myroslava O. and Bell, Peter and Isard, Amy
and Moore, Johanna D.},
title = {Evaluating language understanding accuracy with
respect to objective outcomes in a dialogue system},
booktitle = {Proceedings of the 13th Conference of the European
Chapter of the Association for Computational
Linguistics},
pages = {471--481},
address = {Avignon, France},
publisher = {Association for Computational Linguistics},
month = apr,
url = {http://www.aclweb.org/anthology/E12-1048},
year = 2012
}
@inproceedings{CassiaIS12,
author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
title = {{Mel cepstral coefficient modification based on the
Glimpse Proportion measure for improving the
intelligibility of {HMM}-generated synthetic speech in
noise}},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = {We propose a method that modifies the Mel cepstral
coefficients of HMM-generated synthetic speech in order
to increase the intelligibility of the generated speech
when heard by a listener in the presence of a known
noise. This method is based on an approximation we
previously proposed for the Glimpse Proportion measure.
Here we show how to update the Mel cepstral
coefficients using this measure as an optimization
criterion and how to control the amount of distortion
by limiting the frequency resolution of the
modifications. To evaluate the method we built eight
different voices from normal read-text speech data from
a male speaker. Some voices were also built from
Lombard speech data produced by the same speaker.
Listening experiments with speech-shaped noise and with
a single competing talker indicate that our method
significantly improves intelligibility when compared to
unmodified synthetic speech. The voices built from
Lombard speech outperformed the proposed method
particularly for the competing talker case. However,
compared to a voice using only the spectral parameters
from Lombard speech, the proposed method obtains
similar or higher performance.},
categories = {HMM-based speech synthesis, intelligibility
enhancement, Mel cepstral coefficients},
month = sep,
year = 2012
}
@inproceedings{lu2012noise,
author = {Lu, L. and Chin, KK and Ghoshal, A. and Renals, S.},
title = {{Noise compensation for subspace Gaussian mixture
models}},
booktitle = {Proc. Interspeech},
abstract = {Joint uncertainty decoding (JUD) is an effective
model-based noise compensation technique for
conventional Gaussian mixture model (GMM) based speech
recognition systems. In this paper, we apply JUD to
subspace Gaussian mixture model (SGMM) based acoustic
models. The total number of Gaussians in the SGMM
acoustic model is usually much larger than for
conventional GMMs, which limits the application of
approaches which explicitly compensate each Gaussian,
such as vector Taylor series (VTS). However, by
clustering the Gaussian components into a number of
regression classes, JUD-based noise compensation can be
successfully applied to SGMM systems. We evaluate the
JUD/SGMM technique using the Aurora 4 corpus, and the
experimental results indicate that it is more accurate
than conventional GMM-based systems using either VTS or
JUD noise compensation.},
keywords = {acoustic modelling, noise compensation, SGMM, JUD},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/llu-is2012.pdf},
year = 2012
}
@inproceedings{6287948,
author = {Saheer, L. and Yamagishi, J. and Garner, P.N. and
Dines, J.},
title = {Combining vocal tract length normalization with
hierarchical linear transformations},
booktitle = {Proc. ICASSP},
pages = {4493--4496},
abstract = {Recent research has demonstrated the effectiveness of
vocal tract length normalization (VTLN) as a rapid
adaptation technique for statistical parametric speech
synthesis. VTLN produces speech with naturalness
preferable to that of MLLR-based adaptation techniques,
being much closer in quality to that generated by the
original average voice model. However with only a
single parameter, VTLN captures very few speaker
specific characteristics when compared to linear
transform based adaptation techniques. This paper
proposes that the merits of VTLN can be combined with
those of linear transform based adaptation in a
hierarchical Bayesian framework, where VTLN is used as
the prior information. A novel technique for
propagating the gender information from the VTLN prior
through constrained structural maximum a posteriori
linear regression (CSMAPLR) adaptation is presented.
Experiments show that the resulting transformation has
improved speech quality with better naturalness,
intelligibility and improved speaker similarity.},
doi = {10.1109/ICASSP.2012.6287948},
issn = {1520-6149},
keywords = {CSMAPLR adaptation; MLLR-based adaptation
technique; constrained structural maximum a posteriori
linear regression; hierarchical Bayesian framework;
hierarchical linear transformation; intelligibility;
rapid adaptation technique; speaker similarity;
statistical parametric speech synthesis; vocal tract
length normalization; Bayes methods; speech
intelligibility},
month = mar,
year = 2012
}
@inproceedings{uriaIS2012,
author = {Benigno Uria and Iain Murray and Steve Renals and
Korin Richmond},
title = {Deep Architectures for Articulatory Inversion},
booktitle = {Proc. Interspeech},
address = {Portland, Oregon, USA},
abstract = { We implement two deep architectures for the
acoustic-articulatory inversion mapping problem: a deep
neural network and a deep trajectory mixture density
network. We find that in both cases, deep architectures
produce more accurate predictions than shallow
architectures and that this is due to the higher
expressive capability of a deep model and not a
consequence of adding more adjustable parameters. We
also find that a deep trajectory mixture density
network is able to obtain better inversion accuracies
than smoothing the results of a deep neural network.
Our best model obtained an average root mean square
error of 0.885 mm on the MNGU0 test dataset.},
categories = {Articulatory inversion, deep neural network, deep
belief network, deep regression network, pretraining},
keywords = {Articulatory inversion, deep neural network, deep
belief network, deep regression network, pretraining},
month = sep,
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Uria_et_al_IS2012.pdf},
year = 2012
}
@inproceedings{Wolters:mindcare,
author = {Wolters, Maria and Martínez-Miranda, Juan and Hastie,
Helen and Matheson, Colin},
title = {Managing Data in {Help4Mood}},
booktitle = {The 2nd International Workshop on Computing Paradigms
for Mental Health - MindCare 2012},
location = {Vilamoura, Portugal},
year = 2012
}
@article{2012E121001,
author = {Junichi Yamagishi and Christophe Veaux and Simon King
and Steve Renals},
title = {Speech synthesis technologies for individuals with
vocal disabilities: Voice banking and reconstruction},
journal = {Acoustical Science and Technology},
volume = {33},
number = {1},
pages = {1--5},
url = {http://www.jstage.jst.go.jp/browse/ast/33/1/_contents},
year = 2012
}
@article{Creer2012,
author = {Sarah Creer and Stuart Cunningham and Phil Green and
Junichi Yamagishi},
title = {Building personalised synthetic voices for individuals
with severe speech impairment},
journal = {Computer Speech and Language},
abstract = {For individuals with severe speech impairment, accurate
spoken communication can be difficult and require
considerable effort. Some may choose to use a voice
output communication aid (or VOCA) to support their
spoken communication needs. A VOCA typically takes
input from the user through a keyboard or switch-based
interface and produces spoken output using either
synthesised or recorded speech. The type and number of
synthetic voices that can be accessed with a VOCA is
often limited and this has been implicated as a factor
for rejection of the devices. Therefore, there is a
need to be able to provide voices that are more
appropriate and acceptable for users. This paper
reports on a study that utilises recent advances in
speech synthesis to produce personalised synthetic
voices for 3 speakers with mild to severe dysarthria,
one of the most common speech disorders. Using a
statistical parametric approach to synthesis, an
average voice trained on data from several unimpaired
speakers was adapted using recordings of the impaired
speech of 3 dysarthric speakers. By careful selection
of the speech data and the model parameters, several
exemplar voices were produced for each speaker. A
qualitative evaluation was conducted with the speakers
and listeners who were familiar with the speaker. The
evaluation showed that for one of the 3 speakers a
voice could be created which conveyed many of his
personal characteristics, such as regional identity,
sex and age.},
doi = {10.1016/j.csl.2012.10.001},
issn = {0885-2308},
keywords = {Speech synthesis, Augmentative and alternative
communication, Disordered speech, Voice output
communication aid},
url = {http://www.sciencedirect.com/science/article/pii/S0885230812000836?v=s5},
year = 2012
}