2000.bib
@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2000-citations -ob /home/korin/projects/publications/new_output/transitdata/2000.bib -c 'year : "2000"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@article{gotoh-roysoc00,
  author     = {Y.~Gotoh and S.~Renals},
  title      = {Information Extraction from {Broadcast News}},
  journal    = {Philosophical Transactions of the Royal Society of
                London, Series A},
  volume     = {358},
  pages      = {1295--1310},
  abstract   = {This paper discusses the development of trainable
                statistical models for extracting content from
                television and radio news broadcasts. In particular we
                concentrate on statistical finite state models for
                identifying proper names and other named entities in
                broadcast speech. Two models are presented: the first
                models name class information as a word attribute; the
                second explicitly models both word-word and class-class
                transitions. A common n-gram based formulation is used
                for both models. The task of named entity
                identification is characterized by relatively sparse
                training data and issues related to smoothing are
                discussed. Experiments are reported using the
                DARPA/NIST Hub-4E evaluation for North American
                Broadcast News.},
  categories = {stobs,ie,lm,bnews,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/rs00-preprint.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/rs00-preprint.ps.gz},
  year       = 2000
}
@inproceedings{kessens-00,
  author     = {J.M. Kessens and M. Wester and H. Strik},
  title      = {Automatic Detection and Verification of {D}utch
                Phonological Rules},
  booktitle  = {PHONUS 5: Proceedings of the ``Workshop on Phonetics
                and Phonology in ASR''},
  pages      = {117--128},
  address    = {Saarbr{\"u}cken},
  abstract   = {In this paper, we propose two methods for
                automatically obtaining hypotheses about pronunciation
                variation. To this end, we used two different
                approaches in which we employed a continuous speech
                recognizer to derive this information from the speech
                signal. For the first method, the output of a phone
                recognition was compared to a reference transcription
                in order to obtain hypotheses about pronunciation
                variation. Since phone recognition contains errors, we
                used forced recognition in order to exclude unreliable
                hypotheses. For the second method, forced recognition
                was also used, but the hypotheses about the deletion of
                phones were not constrained beforehand. This was
                achieved by allowing each phone to be deleted. After
                forced recognition, we selected the most frequently
                applied rules as the set of deletion rules. Since
                previous research showed that forced recognition is a
                reliable tool for testing hypotheses about
                pronunciation variation, we can expect that this will
                also hold for the hypotheses about pronunciation
                variation which we found using each of the two methods.
                Another reason for expecting the rule hypotheses to be
                reliable is that we found that 37-53\% of the rules are
                related to Dutch phonological processes that have been
                described in the literature.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/kessens.2000.2.pdf},
  year       = 2000
}
@inproceedings{Ban00,
  author     = {Bangham, J.A. and Cox, S.J. and Lincoln, M. and
                Marshall, I. and Tutt, M. and Wells, M.},
  title      = {Signing for the deaf using virtual humans},
  booktitle  = {IEE Colloquium on Speech and Language processing for
                Disabled and Elderly},
  abstract   = {Research at Televirtual (Norwich) and the University
                of East Anglia, funded predominantly by the Independent
                Television Commission and more recently by the UK Post
                Office also, has investigated the feasibility of using
                virtual signing as a communication medium for
                presenting information to the Deaf. We describe and
                demonstrate the underlying virtual signer technology,
                and discuss the language processing techniques and
                discourse models which have been investigated for
                information communication in a transaction application
                in Post Offices, and for presentation of more general
                textual material in texts such as subtitles
                accompanying television programmes.},
  categories = {visicast,sign language,translation,UEA},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/iee2000-04PaperAFinal.pdf},
  year       = 2000
}
@article{Stolcke_2000_a,
  author     = {Andreas Stolcke and N. Coccaro and R. Bates and P.
                Taylor and C. Van Ess-Dykema and K. Ries and Elizabeth
                Shriberg and D. Jurafsky and R. Martin and M. Meteer},
  title      = {Dialog Act Modeling for Automatic Tagging and
                Recognition of Conversational Speech},
  journal    = {Computational Linguistics},
  volume     = 26,
  number     = 3,
  pages      = {339--373},
  categories = {prosody, recognition, language modelling, dialogue,
                id4s},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Stolcke_2000_a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Stolcke_2000_a.ps},
  year       = 2000
}
@inproceedings{strom00,
  author     = {Ann K. Syrdal and Colin W. Wightman and Alistair
                Conkie and Yannis Stylianou and Mark Beutnagel and
                Juergen Schroeter and Volker Strom and Ki-Seung Lee},
  title      = {Corpus-based Techniques in the {AT\&T} {NEXTGEN} Synthesis
                System},
  booktitle  = {Proc.~Int.~Conf.~on Spoken Language Processing},
  address    = {Beijing},
  abstract   = {The AT\&T text-to-speech (TTS) synthesis system has
                been used as a framework for experimenting with a
                perceptually-guided data-driven approach to speech
                synthesis, with a primary focus on data-driven elements
                in the "back end". Statistical training techniques
                applied to a large corpus are used to make decisions
                about predicted speech events and selected speech
                inventory units. Our recent advances in automatic
                phonetic and prosodic labelling and a new faster
                harmonic plus noise model (HMM) and unit preselection
                implementations have significantly improved TTS quality
                and speeded up both development time and runtime.},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/strom00.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/strom00.ps},
  year       = 2000
}
@article{renals-specom00,
  year       = 2000,
  author     = {S.~Renals and D.~Abberley and D.~Kirby and T.~Robinson},
  title      = {Indexing and Retrieval of Broadcast News},
  journal    = {Speech Communication},
  volume     = {32},
  pages      = {5--20},
  categories = {thisl,bnews,trec,ir,recognition,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/specom00-preprint.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/specom00-preprint.ps.gz},
  abstract   = {This paper describes a spoken document retrieval (SDR)
                system for British and North American Broadcast News. The
                system is based on a connectionist large vocabulary speech
                recognizer and a probabilistic information retrieval system.
                We discuss the development of a realtime Broadcast News
                speech recognizer, and its integration into an SDR system.
                Two advances were made for this task: automatic segmentation
                and statistical query expansion using a secondary corpus.
                Precision and recall results using the Text Retrieval
                Conference (TREC) SDR evaluation infrastructure are reported
                throughout the paper, and we discuss the application of
                these developments to a large scale SDR task based on an
                archive of British English broadcast news.}
}
@inproceedings{Matsuda2000ICSLP10,
  author     = {Shigeki Matsuda and Mitsuru Nakai and Hiroshi
                Shimodaira and Shigeki Sagayama},
  title      = {Feature-dependent Allophone Clustering},
  booktitle  = {Proc. ICSLP2000},
  pages      = {413--416},
  abstract   = {We propose a novel method for clustering allophones
                called Feature-Dependent Allophone Clustering (FD-AC)
                that determines feature-dependent HMM topology
                automatically. Existing methods for allophone
                clustering are based on parameter sharing between the
                allophone models that resemble each other in behaviors
                of feature vector sequences. However, all the features
                of the vector sequences may not necessarily have a
                common allophone clustering structures. It is
                considered that the vector sequences can be better
                modeled by allocating the optimal allophone clustering
                structure to each feature. In this paper, we propose
                Feature-Dependent Successive State Splitting (FD-SSS)
                as an implementation of FD-AC. In speaker-dependent
                continuous phoneme recognition experiments, HMMs
                created by FD-SSS reduced the error rates by about 10\%
                compared with the conventional HMMs that have a common
                allophone clustering structure for all the features.},
  categories = {asr, atr, jaist},
  month      = oct,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Matsuda2000ICSLP10.pdf},
  year       = 2000
}
@article{carreira-nc00,
  author     = {M.~Carreira-Perpi{\~n}{\'a}n and S.~Renals},
  title      = {Practical identifiability of finite mixtures of
                multivariate {Bernoulli} distributions},
  journal    = {Neural Computation},
  volume     = {12},
  pages      = {141--152},
  abstract   = {The class of finite mixtures of multivariate Bernoulli
                distributions is known to be nonidentifiable, i.e.,
                different values of the mixture parameters can
                correspond to exactly the same probability
                distribution. In principle, this would mean that sample
                estimates using this model would give rise to different
                interpretations. We give empirical support to the fact
                that estimation of this class of mixtures can still
                produce meaningful results in practice, thus lessening
                the importance of the identifiability problem. We also
                show that the EM algorithm is guaranteed to converge to
                a proper maximum likelihood estimate, owing to a
                property of the log-likelihood surface. Experiments
                with synthetic data sets show that an original
                generating distribution can be estimated from a sample.
                Experiments with an electropalatography (EPG) data set
                show important structure in the data.},
  categories = {ml,lv,artic,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/nc00-preprint.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/nc00-preprint.ps.gz},
  year       = 2000
}
@article{Taylor_2000_b,
  author     = {Paul Taylor},
  title      = {Analysis and Synthesis of Intonation using the Tilt
                Model},
  journal    = {Journal of the Acoustical Society of America},
  volume     = 107,
  number     = 3,
  pages      = {1697--1714},
  categories = {prosody, intonation, id4s},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Taylor_2000_b.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Taylor_2000_b.ps},
  year       = 2000
}
@mastersthesis{Gutkin:00,
  author     = {Alexander Gutkin},
  title      = {{Log-Linear} {Interpolation} of {Language} {Models}},
  school     = {Department of Engineering, University of Cambridge},
  type       = {{MPhil.} thesis},
  address    = {UK},
  categories = {statistical speech recognition, language modelling},
  month      = dec,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/gutkin_mphil.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/gutkin_mphil.ps.gz},
  year       = 2000
}
@phdthesis{Dusterhoff_2000_a,
  author     = {Kurt Dusterhoff},
  title      = {Synthesizing Fundamental Frequency Using Models
                Automatically Trained from Data},
  school     = {University of Edinburgh},
  categories = {intonation, synthesis, prosody},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Dusterhoff_2000_a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Dusterhoff_2000_a.ps},
  year       = 2000
}
@inproceedings{Wester-00,
  author     = {M. Wester and J.M. Kessens and H. Strik},
  title      = {Pronunciation variation in {ASR}: Which variation to
                model?},
  booktitle  = {Proc. of {ICSLP} '00},
  volume     = {IV},
  pages      = {488--491},
  address    = {Beijing},
  abstract   = {This paper describes how the performance of a
                continuous speech recognizer for Dutch has been
                improved by modeling within-word and cross-word
                pronunciation variation. A relative improvement of
                8.8\% in WER was found compared to baseline system
                performance. However, as WERs do not reveal the full
                effect of modeling pronunciation variation, we
                performed a detailed analysis of the differences in
                recognition results that occur due to modeling
                pronunciation variation and found that indeed a lot of
                the differences in recognition results are not
                reflected in the error rates. Furthermore, error
                analysis revealed that testing sets of variants in
                isolation does not predict their behavior in
                combination. However, these results appeared to be
                corpus dependent.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/wester.2000.1.pdf},
  year       = 2000
}
@phdthesis{Wright_2000_a,
  year       = 2000,
  author     = {Helen Wright},
  title      = {Modelling Prosodic and Dialogue Information for Automatic
                Speech Recognition},
  school     = {University of Edinburgh},
  categories = {prosody, dialogue, recognition, id4s},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Wright_2000_a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Wright_2000_a.ps}
}
@inproceedings{wrench2000b,
  author     = {Wrench, A. and Richmond, K.},
  title      = {Continuous Speech Recognition Using Articulatory Data},
  booktitle  = {Proc. {ICSLP} 2000},
  address    = {Beijing, China},
  abstract   = {In this paper we show that there is measurable
                information in the articulatory system which can help
                to disambiguate the acoustic signal. We measure
                directly the movement of the lips, tongue, jaw, velum
                and larynx and parameterise this articulatory feature
                space using principal components analysis. The
                parameterisation is developed and evaluated using a
                speaker dependent phone recognition task on a specially
                recorded TIMIT corpus of 460 sentences. The results
                show that there is useful supplementary information
                contained in the articulatory data which yields a small
                but significant improvement in phone recognition
                accuracy of 2\%. However, preliminary attempts to
                estimate the articulatory data from the acoustic signal
                and use this to supplement the acoustic input have not
                yielded any significant improvement in phone accuracy.},
  categories = {artic, asr, ann, mlp, hmm, inversion, mocha, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Wrench_2000_a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Wrench_2000_a.ps},
  year       = 2000
}
@inproceedings{Wester-Fosler-00,
  author     = {M. Wester and E. Fosler-Lussier},
  title      = {A comparison of data-derived and knowledge-based
                modeling of pronunciation variation},
  booktitle  = {Proc. of ICSLP '00},
  volume     = {I},
  pages      = {270--273},
  address    = {Beijing},
  abstract   = {This paper focuses on modeling pronunciation variation
                in two different ways: data-derived and
                knowledge-based. The knowledge-based approach consists
                of using phonological rules to generate variants. The
                data-derived approach consists of performing phone
                recognition, followed by various pruning and smoothing
                methods to alleviate some of the errors in the phone
                recognition. Using phonological rules led to a small
                improvement in WER; whereas, using a data-derived
                approach in which the phone recognition was smoothed
                using simple decision trees (d-trees) prior to lexicon
                generation led to a significant improvement compared to
                the baseline. Furthermore, we found that 10\% of
                variants generated by the phonological rules were also
                found using phone recognition, and this increased to
                23\% when the phone recognition output was smoothed by
                using d-trees. In addition, we propose a metric to
                measure confusability in the lexicon and we found that
                employing this confusion metric to prune variants
                results in roughly the same improvement as using the
                d-tree method.},
  categories = {asr, pm, VIOS, Berkeley},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/wester.2000.2.pdf},
  year       = 2000
}
@inproceedings{koumpis-icslp00,
  author     = {K.~Koumpis and S. Renals},
  title      = {Transcription and Summarization of Voicemail Speech},
  booktitle  = {Proc. ICSLP},
  volume     = {2},
  pages      = {688--691},
  address    = {Beijing},
  abstract   = {This paper describes the development of a system to
                transcribe and summarize voicemail messages. The
                results of the research presented in this paper are
                two-fold. First, a hybrid connectionist approach to the
                Voicemail transcription task shows that competitive
                performance can be achieved using a context-independent
                system with fewer parameters than those based on
                mixtures of Gaussian likelihoods. Second, an effective
                and robust combination of statistical with prior
                knowledge sources for term weighting is used to extract
                information from the decoder's output in order to
                deliver summaries to the message recipients via a GSM
                Short Message Service (SMS) gateway.},
  categories = {voicemail,summarization,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icslp00.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icslp00.ps.gz},
  year       = 2000
}
@inproceedings{gotoh-icassp00,
  author     = {Y.~Gotoh and S.~Renals},
  title      = {Variable word rate n-grams},
  booktitle  = {Proc. IEEE ICASSP},
  pages      = {1591--1594},
  address    = {Istanbul},
  abstract   = {The rate of occurrence of words is not uniform but
                varies from document to document. Despite this
                observation, parameters for conventional n-gram
                language models are usually derived using the
                assumption of a constant word rate. In this paper we
                investigate the use of variable word rate assumption,
                modelled by a Poisson distribution or a continuous
                mixture of Poissons. We present an approach to
                estimating the relative frequencies of words or n-grams
                taking prior information of their occurrences into
                account. Discounting and smoothing schemes are also
                considered. Using the Broadcast News task, the approach
                demonstrates a reduction of perplexity up to 10\%.},
  categories = {stobs,lm,bnews,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icassp2000.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/icassp2000.ps.gz},
  year       = 2000
}
@article{Taylor_2000_a,
  author     = {P A Taylor},
  title      = {Concept-to-Speech by Phonological Structure Matching},
  journal    = {Philosophical Transactions of the Royal Society of
                London, Series A},
  volume     = {358},
  pages      = {1403--1416},
  categories = {prosody, synthesis, unit selection, waveform
                generation, festival, unisyn},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Taylor_2000_a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Taylor_2000_a.ps},
  year       = 2000
}
@inproceedings{frankel00:NN_LDM,
  year       = 2000,
  author     = {Frankel, J. and Richmond, K. and King, S. and Taylor, P.},
  title      = {An automatic speech recognition system using neural
                networks and linear dynamic models to recover and model
                articulatory traces},
  booktitle  = {Proc. {ICSLP}},
  categories = {am,artic,asr,ldm,mocha,edinburgh,inversion,ann},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Frankel_et_al_ICSLP2000.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Frankel_et_al_ICSLP2000.ps},
  abstract   = {In this paper we describe a speech recognition system
                using linear dynamic models and articulatory features.
                Experiments are reported in which measured articulation
                from the MOCHA corpus has been used, along with those
                where the articulatory parameters are estimated from the
                speech signal using a recurrent neural network.}
}
@inproceedings{gotoh-asr2000,
  year       = 2000,
  author     = {Y.~Gotoh and S.~Renals},
  title      = {Sentence Boundary Detection in Broadcast Speech Transcripts},
  booktitle  = {ISCA ITRW: ASR2000},
  pages      = {228--235},
  address    = {Paris},
  categories = {stobs,ie,lm,prosody,bnews,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/asr2000.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/asr2000.ps.gz},
  abstract   = {This paper presents an approach to identifying sentence
                boundaries in broadcast speech transcripts. We describe
                finite state models that extract sentence boundary
                information statistically from text and audio sources. An
                n-gram language model is constructed from a collection of
                British English news broadcasts and scripts. An
                alternative model is estimated from pause duration
                information in speech recogniser outputs aligned with
                their programme script counterparts. Experimental results
                show that the pause duration model alone outperforms the
                language modelling approach and that, by combining these
                two models, it can be improved further and precision and
                recall scores of over 70\% were attained for the task.}
}
@inproceedings{Matsuda2000ICASSP,
  author     = {Shigeki Matsuda and Mitsuru Nakai and Hiroshi
                Shimodaira and Shigeki Sagayama},
  title      = {Asynchronous-Transition {HMM}},
  booktitle  = {Proc. ICASSP 2000},
  volume     = {II},
  pages      = {1001--1004},
  address    = {Istanbul, Turkey},
  abstract   = {We propose a new class of hidden Markov model (HMM)
                called asynchronous-transition HMM (AT-HMM). Opposed to
                conventional HMMs where hidden state transition occurs
                simultaneously to all features, the new class of HMM
                allows state transitions asynchronous between
                individual features to better model asynchronous
                timings of acoustic feature changes. In this paper, we
                focus on a particular class of AT-HMM with sequential
                constraints introducing a concept of ``state tying
                across time''. To maximize the advantage of the new
                model, we also introduce feature-wise state tying
                technique. Speaker-dependent speech recognition
                experiments demonstrated that reduced error rates more
                than 30\% and 50\% in phoneme and isolated word
                recognition, respectively, compared with conventional
                HMMs.},
  categories = {asr, atr, jaist},
  month      = jun,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Matsuda2000ICASSP.pdf},
  year       = 2000
}
@inproceedings{wester00:_using_dutch_asr,
  author     = {M. Wester and J.M. Kessens and H. Strik},
  title      = {Using {D}utch phonological rules to model
                pronunciation variation in {ASR}},
  booktitle  = {Phonus 5: proceedings of the ``workshop on phonetics
                and phonology in {ASR}''},
  pages      = {105--116},
  address    = {Saarbr{\"u}cken},
  abstract   = {In this paper, we describe how the performance of a
                continuous speech recognizer for Dutch has been
                improved by modeling within-word and cross-word
                pronunciation variation. Within-word variants were
                automatically generated by applying five phonological
                rules to the words in the lexicon. Cross-word
                pronunciation variation was modeled by adding
                multi-words and their variants to the lexicon. The best
                results were obtained when the cross-word method was
                combined with the within-word method: a relative
                improvement of 8.8\% in the WER was found compared to
                baseline system performance. We also describe an error
                analysis that was carried out to investigate whether
                rules in isolation can predict the performance of rules
                in combination.},
  categories = {asr, pm, VIOS, Nijmegen},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/wester.2000.3.pdf},
  year       = 2000
}
@phdthesis{mayo:00,
  year       = 2000,
  author     = {Mayo, C.},
  title      = {The relationship between phonemic awareness and cue
                weighting in speech perception: longitudinal and
                cross-sectional child studies},
  school     = {Queen Margaret University College},
  categories = {speech perception, development, cue weighting, phonemic
                awareness, literacy},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/thesis.pdf}
}
@inproceedings{Goubanova-Taylor:2000,
  year      = 2000,
  author    = {Goubanova, O. and Taylor, P.},
  title     = {Using {B}ayesian {B}elief Networks for model duration
               in text-to-speech systems},
  booktitle = {CD-ROM Proc. ICSLP 2000},
  address   = {Beijing, China}
}
@inproceedings{Morais_2000_a,
  year       = 2000,
  author     = {Edmilson Morais and Paul Taylor and Fabio Violaro},
  title      = {Concatenative Text-To-Speech Synthesis Based On Prototype
                Waveform Interpolation (A Time Frequency Approach)},
  booktitle  = {Proc. ICSLP 2000},
  address    = {Beijing, China},
  categories = {waveform generation, festival},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Morais_2000_a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Morais_2000_a.ps}
}
@inproceedings{king00:recognition_syll,
  author     = {King, S. and Taylor, P. and Frankel, J. and Richmond,
                K.},
  title      = {Speech recognition via phonetically-featured syllables},
  booktitle  = {PHONUS},
  volume     = {5},
  pages      = {15--34},
  address    = {Institute of Phonetics, University of the Saarland},
  abstract   = {We describe recent work on two new automatic speech
                recognition systems. The first part of this paper
                describes the components of a system based on
                phonological features (which we call EspressoA) in
                which the values of these features are estimated from
                the speech signal before being used as the basis for
                recognition. In the second part of the paper, another
                system (which we call EspressoB) is described in which
                articulatory parameters are used instead of
                phonological features and a linear dynamical system
                model is used to perform recognition from automatically
                estimated values of these articulatory parameters.},
  categories = {am,artic,asr,ldm,phonetic_feature,mocha,timit,edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_et_al_Phonus2000.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_et_al_Phonus2000.ps},
  year       = 2000
}
@inproceedings{Shimodaira2000ICSLP10,
  author     = {Hiroshi Shimodaira and Toshihiko Akae and Mitsuru
                Nakai and Shigeki Sagayama},
  title      = {Jacobian Adaptation of {HMM} with Initial Model
                Selection for Noisy Speech Recognition},
  booktitle  = {Proc. ICSLP2000},
  pages      = {1003--1006},
  abstract   = {An extension of Jacobian Adaptation (JA) of HMMs for
                degraded speech recognition is presented in which
                appropriate set of initial models is selected from a
                number of initial-model sets designed for different
                noise environments. Based on the first order Taylor
                series approximation in the acoustic feature domain, JA
                adapts the acoustic model parameters trained in the
                initial noise environment A to the new environment B
                much faster than PMC that creates the acoustic models
                for the target environment from scratch. Despite the
                advantage of JA to PMC, JA has a theoretical limitation
                that the change of acoustic parameters from the
                environment A to B should be small in order that the
                linear approximation holds. To extend the coverage of
                JA, the ideas of multiple sets of initial models and
                their automatic selection scheme are discussed.
                Speaker-dependent isolated-word recognition experiments
                are carried out to evaluate the proposed method.},
  categories = {asr, jaist},
  month      = oct,
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Shimodaira2000ICSLP10.pdf},
  year       = 2000
}
@article{king:taylor:csl2000,
  author   = {Simon King and Paul Taylor},
  title    = {Detection of Phonological Features in Continuous
              Speech using Neural Networks},
  journal  = {Computer Speech and Language},
  volume   = 14,
  number   = 4,
  pages    = {333--353},
  abstract = {We report work on the first component of a two stage
              speech recognition architecture based on phonological
              features rather than phones. The paper reports
              experiments on three phonological feature systems: 1)
              the Sound Pattern of English (SPE) system which uses
              binary features, 2)a multi valued (MV) feature system
              which uses traditional phonetic categories such as
              manner, place etc, and 3) Government Phonology (GP)
              which uses a set of structured primes. All experiments
              used recurrent neural networks to perform feature
              detection. In these networks the input layer is a
              standard framewise cepstral representation, and the
              output layer represents the values of the features. The
              system effectively produces a representation of the
              most likely phonological features for each input frame.
              All experiments were carried out on the TIMIT speaker
              independent database. The networks performed well in
              all cases, with the average accuracy for a single
              feature ranging from 86 to 93 percent. We describe
              these experiments in detail, and discuss the
              justification and potential advantages of using
              phonological features rather than phones for the basis
              of speech recognition.},
  pdf      = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_Taylor_csl2000.pdf},
  ps       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/King_Taylor_csl2000.ps},
  year     = 2000
}
@inproceedings{abberley-trec00,
  author     = {D.~Abberley and S.~Renals and D.~Ellis and T.~Robinson},
  title      = {The {THISL} {SDR} system at {TREC}--8},
  booktitle  = {Proc. Eighth Text Retrieval Conference (TREC--8)},
  abstract   = {This paper describes the participation of the THISL
                group at the TREC-8 Spoken Document Retrieval (SDR)
                track. The THISL SDR system consists of the realtime
                version of the Abbot large vocabulary speech
                recognition system and the thislIR text retrieval
                system. The TREC-8 evaluation assessed SDR performance
                on a corpus of 500 hours of broadcast news material
                collected over a five month period. The main test
                condition involved retrieval of stories defined by
                manual segmentation of the corpus in which non-news
                material, such as commercials, were excluded. An
                optional test condition required retrieval of
                the same stories from the unsegmented audio stream. The
                THISL SDR system participated at both test conditions.
                The results show that a system such as THISL can
                produce respectable information retrieval performance
                on a realistically-sized corpus of unsegmented audio
                material.},
  categories = {thisl,bnews,trec,ir,recognition,eval,sheffield},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/trec8.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/trec8.ps.gz},
  year       = 2000
}