2004.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2004-citations -ob /home/korin/projects/publications/new_output/transitdata/2004.bib -c 'year : "2004"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{leo_04-2,
  author     = {Enrico Zovato and Stefano Sandri and Silvia Quazza and
    Leonardo Badino},
  title      = {Prosodic analysis of a multi-style corpus in the
    perspective of emotional speech synthesis},
  booktitle  = {Proc. ICSLP 2004},
  address    = {Jeju, Korea},
  year       = {2004},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ThA1404p.8_p890.pdf},
}
@inproceedings{shig042,
  author     = {Yoshinori Shiga and Simon King},
  title      = {Source-Filter Separation for Articulation-to-Speech
    Synthesis},
  booktitle  = {Proc. ICSLP},
  address    = {Jeju, Korea},
  month      = oct,
  year       = {2004},
  abstract   = {In this paper we examine a method for separating out
    the vocal-tract filter response from the voice source
    characteristic using a large articulatory database. The
    method realises such separation for voiced speech using
    an iterative approximation procedure under the
    assumption that the speech production process is a
    linear system composed of a voice source and a
    vocal-tract filter, and that each of the components is
    controlled independently by different sets of factors.
    Experimental results show that the spectral variation
    is evidently influenced by the fundamental frequency or
    the power of speech, and that the tendency of the
    variation may be related closely to speaker identity.
    The method enables independent control over the voice
    source characteristic in our articulation-to-speech
    synthesis.},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.ps},
}
@article{Wray04-LC04,
  author = {Wray, A. and Cox, S.J. and Lincoln, M. and Tryggvason,
                   J.},
  title = {A Formulaic Approach to Translation at the Post
                   Office: Reading the Signs},
  journal = {Language and Communication},
  volume = {24},
  number = {1},
  pages = {59--75},
  abstract = {TESSA is an interactive translation system designed to
                   support transactions between a post office clerk and a
                   deaf customer. The system translates the clerk's speech
                   into British Sign Language (BSL), displayed on a
                   screen, using a specially-developed avatar (virtual
                   human). TESSA is a context-constrained exemplification
                   of one of two basic approaches to machine translation,
                   neither of which can currently fulfil all of the
                   demands of successful automatic translation. Drawing on
                   recent research in theoretical psycholinguistics, we
                   show how TESSA is a convincing prototype model of one
                   aspect of real human language processing. Ways are
                   suggested of exploiting this parallel, potentially
                   offering new possibilities for the future design of
                   artificial language systems.},
  categories = {visicast,sign language,translation,UEA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/WrayCoxetal2004.pdf},
  year = 2004
}
@inproceedings{vepa-king-isca04,
  author     = {Vepa, J. and King, S.},
  title      = {Subjective evaluation of join cost and smoothing
    methods},
  booktitle  = {Proc. 5th {ISCA} speech synthesis workshop},
  address    = {Pittsburgh, USA},
  month      = jun,
  year       = {2004},
  abstract   = {In our previous papers, we have proposed join cost
    functions derived from spectral distances, which have
    good correlations with perceptual scores obtained for a
    range of concatenation discontinuities. To further
    validate their ability to predict concatenation
    discontinuities, we have chosen the best three spectral
    distances and evaluated them subjectively in a
    listening test. The units for synthesis stimuli are
    obtained from a state-of-the-art unit selection
    text-to-speech system: `rVoice' from Rhetorical Systems
    Ltd. We also compared three different smoothing methods
    in this listening test. In this paper, we report
    listeners' preferences for each join costs in
    combination with each smoothing method.},
  categories = {join cost, Kalman filter, smoothing, evaluation,
    rVoice, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/vepa_tts04.pdf},
}
@inproceedings{christensen-ecir04,
  author = {H. Christensen and B. Kolluru and Y. Gotoh and S.
                   Renals},
  title = {From text summarisation to style-specific
                   summarisation for broadcast news},
  booktitle = {Proc. ECIR--2004},
  abstract = {In this paper we report on a series of experiments
                   investigating the path from text-summarisation to
                   style-specific summarisation of spoken news stories. We
                   show that the portability of traditional text
                   summarisation features to broadcast news is dependent
                   on the diffusiveness of the information in the
                   broadcast news story. An analysis of two categories of
                   news stories (containing only read speech or some
                   spontaneous speech) demonstrates the importance of the
                   style and the quality of the transcript, when
                   extracting the summary-worthy information content.
                   Further experiments indicate the advantages of doing
                   style-specific summarisation of broadcast news.},
  categories = {s3l,summarization,bnews,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ecir04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ecir04.ps.gz},
  year = 2004
}
@inproceedings{vepa_king_icslp2004,
  author = {Jithendra Vepa and Simon King},
  title = {Subjective Evaluation Of Join Cost Functions Used In
                   Unit Selection Speech Synthesis},
  booktitle = {Proc. 8th International Conference on Spoken Language
                   Processing (ICSLP)},
  address = {Jeju, Korea},
  abstract = {In our previous papers, we have proposed join cost
                   functions derived from spectral distances, which have
                   good correlations with perceptual scores obtained for a
                   range of concatenation discontinuities. To further
                   validate their ability to predict concatenation
                   discontinuities, we have chosen the best three spectral
                   distances and evaluated them subjectively in a
                   listening test. The unit sequences for synthesis
                   stimuli are obtained from a state-of-the-art unit
                   selection text-to-speech system: rVoice from Rhetorical
                   Systems Ltd. In this paper, we report listeners'
                   preferences for each of the three join cost functions.},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/vepa_king_icslp2004.pdf},
  year = 2004
}
@inproceedings{dielmann-icassp04,
  author = {A. Dielmann and S. Renals},
  title = {Dynamic {Bayesian} Networks for Meeting Structuring},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {This paper is about the automatic structuring of
                   multiparty meetings using audio information. We have
                   used a corpus of 53 meetings, recorded using a
                   microphone array and lapel microphones for each
                   participant. The task was to segment meetings into a
                   sequence of meeting actions, or phases. We have adopted
                   a statistical approach using dynamic Bayesian networks
                   (DBNs). Two DBN architectures were investigated: a
                   two-level hidden Markov model (HMM) in which the
                   acoustic observations were concatenated; and a
                   multistream DBN in which two separate observation
                   sequences were modelled. Additionally we have also
                   explored the use of counter variables to constrain the
                   number of action transitions. Experimental results
                   indicate that the DBN architectures are an improvement
                   over a simple baseline HMM, with the multistream DBN
                   with counter constraints producing an action error rate
                   of 6\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.ps.gz},
  year = 2004
}
@incollection{vepa:king:joincostchapter2004,
  author = {Jithendra Vepa and Simon King},
  title = {Join Cost for Unit Selection Speech Synthesis},
  booktitle = {Speech Synthesis},
  publisher = {Prentice Hall},
  editor = {Alwan, Abeer and Narayanan, Shri},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Vepa_King_joincostchapter2004.ps},
  year = 2004
}
@inproceedings{mayoturk:04b,
  author     = {Mayo, C. and Turk, A.},
  title      = {The Development of Perceptual Cue Weighting Within and
    Across Monosyllabic Words},
  booktitle  = {LabPhon 9, University of Illinois at Urbana-Champaign},
  year       = {2004},
  categories = {speech perception, development, cue weighting},
}
@inproceedings{wester04:asynch,
  author     = {Wester, M. and Frankel, J. and King, S.},
  title      = {Asynchronous Articulatory Feature Recognition Using
    Dynamic {B}ayesian Networks},
  booktitle  = {Proc. IEICI Beyond HMM Workshop},
  address    = {Kyoto},
  month      = dec,
  year       = {2004},
  abstract   = {This paper builds on previous work where dynamic
    Bayesian networks (DBN) were proposed as a model for
    articulatory feature recognition. Using DBNs makes it
    possible to model the dependencies between features, an
    addition to previous approaches which was found to
    improve feature recognition performance. The DBN
    results were promising, giving close to the accuracy of
    artificial neural nets (ANNs). However, the system was
    trained on canonical labels, leading to an overly
    strong set of constraints on feature co-occurrence. In
    this study, we describe an embedded training scheme
    which learns a set of data-driven asynchronous feature
    changes where supported in the data. Using a subset of
    the OGI Numbers corpus, we describe articulatory
    feature recognition experiments using both
    canonically-trained and asynchronous DBNs. Performance
    using DBNs is found to exceed that of ANNs trained on
    an identical task, giving a higher recognition
    accuracy. Furthermore, inter-feature dependencies
    result in a more structured model, giving rise to fewer
    feature combinations in the recognition output. In
    addition to an empirical evaluation of this modelling
    approach, we give a qualitative analysis, comparing
    asynchrony found through our data-driven methods to the
    asynchrony which may be expected on the basis of
    linguistic knowledge.},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.ps},
}
@inproceedings{Toney2004,
  author     = {Toney, D. and Feinberg, D. and Richmond, K.},
  title      = {Acoustic Features for Profiling Mobile Users of
    Conversational Interfaces},
  booktitle  = {6th International Symposium on Mobile Human-Computer
    Interaction - {MobileHCI} 2004},
  editor     = {Brewster, S. and Dunlop, M.},
  pages      = {394--398},
  address    = {Glasgow, Scotland},
  publisher  = {Springer},
  month      = sep,
  year       = {2004},
  abstract   = {Conversational interfaces allow human users to use
    spoken language to interact with computer-based
    information services. In this paper, we examine the
    potential for personalizing speech-based human-computer
    interaction according to the user's gender and age. We
    describe a system that uses acoustic features of the
    user's speech to automatically estimate these physical
    characteristics. We discuss the difficulties of
    implementing this process in relation to the high level
    of environmental noise that is typical of mobile
    human-computer interaction.},
}
@inproceedings{shig043,
  author     = {Yoshinori Shiga and Simon King},
  title      = {Estimating detailed spectral envelopes using
    articulatory clustering},
  booktitle  = {Proc. ICSLP},
  address    = {Jeju, Korea},
  month      = oct,
  year       = {2004},
  abstract   = {This paper presents an articulatory-acoustic mapping
    where detailed spectral envelopes are estimated. During
    the estimation, the harmonics of a range of F0 values
    are derived from the spectra of multiple voiced speech
    signals vocalized with similar articulator settings.
    The envelope formed by these harmonics is represented
    by a cepstrum, which is computed by fitting the peaks
    of all the harmonics based on the weighted least square
    method in the frequency domain. The experimental result
    shows that the spectral envelopes are estimated with
    the highest accuracy when the cepstral order is 48--64
    for a female speaker, which suggests that representing
    the real response of the vocal tract requires
    high-quefrency elements that conventional speech
    synthesis methods are forced to discard in order to
    eliminate the pitch component of speech.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope, edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.ps},
}
@inproceedings{frankel04:artic_dbn,
  author     = {Frankel, J. and Wester, M. and King, S.},
  title      = {Articulatory feature recognition using dynamic
    {B}ayesian networks},
  booktitle  = {Proc. {ICSLP}},
  month      = sep,
  year       = {2004},
  abstract   = {This paper describes the use of dynamic Bayesian
    networks for the task of articulatory feature
    recognition. We show that by modeling the dependencies
    between a set of 6 multi-leveled articulatory features,
    recognition accuracy is increased over an equivalent
    system in which features are considered independent.
    Results are compared to those found using artificial
    neural networks on an identical task.},
  categories = {am,artic,asr,dbn,timit,edinburgh},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.ps},
}
@inproceedings{clarkrichmondking_ssw504,
  author = {Robert A.J. Clark and Korin Richmond and Simon King},
  title = {Festival 2 -- build your own general purpose unit
                   selection speech synthesiser},
  booktitle = {Proc. 5th {ISCA} workshop on speech synthesis},
  address = {Pittsburgh, USA},
  abstract = {This paper describes version 2 of the Festival speech
                   synthesis system. Festival 2 provides a development
                   environment for concatenative speech synthesis, and now
                   includes a general purpose unit selection speech
                   synthesis engine. We discuss various aspects of unit
                   selection speech synthesis, focusing on the research
                   issues that relate to voice design and the automation
                   of the voice development process.},
  categories = {synthesis, festival, unitselection},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.ps},
  year = 2004
}
@inproceedings{bakerclarkwhite_ssw504,
  author     = {Rachel Baker and Robert A.J. Clark and Michael White},
  title      = {Synthesising Contextually Appropriate Intonation in
    Limited Domains},
  booktitle  = {Proc. 5th {ISCA} workshop on speech synthesis},
  address    = {Pittsburgh, USA},
  year       = {2004},
  categories = {synthesis, prosody, intonation, festival},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/bakerclarkwhite_ssw504.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/bakerclarkwhite_ssw504.ps},
}
@inproceedings{Gutkin:King:icslp04,
  author = {Alexander Gutkin and Simon King},
  title = {Phone classification in pseudo-{E}uclidean Vector
                   Spaces},
  booktitle = {Proc. 8th International Conference on Spoken Language
                   Processing (ICSLP)},
  volume = {II},
  pages = {1453--1457},
  address = {Jeju Island, Korea},
  abstract = {Recently we have proposed a structural framework for
                   modelling speech, which is based on patterns of
                   phonological distinctive features, a linguistically
                   well-motivated alternative to standard vector-space
                   acoustic models like HMMs. This framework gives
                   considerable representational freedom by working with
                   features that have explicit linguistic interpretation,
                   but at the expense of the ability to apply the wide
                   range of analytical decision algorithms available in
                   vector spaces, restricting oneself to more
                   computationally expensive and less-developed symbolic
                   metric tools. In this paper we show that a
                   dissimilarity-based distance-preserving transition from
                   the original structural representation to a
                   corresponding pseudo-Euclidean vector space is
                   possible. Promising results of phone classification
                   experiments conducted on the TIMIT database are
                   reported.},
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh},
  issn = {1225-441X},
  month = oct,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icslp04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icslp04.ps.gz},
  year = 2004
}
@inproceedings{leo_04-3,
  author     = {Leonardo Badino},
  title      = {Chinese Text Word Segmentation Considering Semantic
    Links among Sentences},
  booktitle  = {Proc. ICSLP 2004},
  address    = {Jeju, Korea},
  year       = {2004},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ThB2202p.22_p965.pdf},
}
@inproceedings{Gutkin:King:icpr04,
  author = {Alexander Gutkin and Simon King},
  title = {{Structural} {Representation} of {Speech} for
                   {Phonetic} {Classification}},
  booktitle = {Proc. 17th International Conference on Pattern
                   Recognition (ICPR)},
  volume = 3,
  pages = {438--441},
  address = {Cambridge, UK},
  publisher = {IEEE Computer Society Press},
  abstract = {This paper explores the issues involved in using
                   symbolic metric algorithms for automatic speech
                   recognition (ASR), via a structural representation of
                   speech. This representation is based on a set of
                   phonological distinctive features which is a
                   linguistically well-motivated alternative to the
                   ``beads-on-a-string'' view of speech that is standard
                   in current ASR systems. We report the promising results
                   of phoneme classification experiments conducted on a
                   standard continuous speech task.},
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh},
  isbn = {0-7695-2128-2},
  month = aug,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icpr04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icpr04.ps.gz},
  year = 2004
}
@inproceedings{dielmann-mmsp04,
  author = {A. Dielmann and S. Renals},
  title = {Multi-stream segmentation of meetings},
  booktitle = {Proc. IEEE Workshop on Multimedia Signal Processing},
  abstract = {This paper investigates the automatic segmentation of
                   meetings into a sequence of group actions or phases.
                   Our work is based on a corpus of multiparty meetings
                   collected in a meeting room instrumented with video
                   cameras, lapel microphones and a microphone array. We
                   have extracted a set of feature streams, in this case
                   extracted from the audio data, based on speaker turns,
                   prosody and a transcript of what was spoken. We have
                   related these signals to the higher level semantic
                   categories via a multistream statistical model based on
                   dynamic Bayesian networks (DBNs). We report on a set of
                   experiments in which different DBN architectures are
                   compared, together with the different feature streams.
                   The resultant system has an action error rate of 9\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.ps.gz},
  year = 2004
}
@inproceedings{leo_04-4,
  author     = {Leonardo Badino and Claudia Barolo and Silvia Quazza},
  title      = {Language independent phoneme mapping for foreign {TTS}},
  booktitle  = {Proc. 5th ISCA Speech Synthesis Workshop},
  address    = {Pittsburgh, USA},
  year       = {2004},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/2026.pdf},
}
@inproceedings{abdelhaleem-icassp04,
  author = {Y. H. Abdel-Haleem and S. Renals and N. D. Lawrence},
  title = {Acoustic space dimensionality selection and
                   combination using the maximum entropy principle},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {In this paper we propose a discriminative approach to
                   acoustic space dimensionality selection based on
                   maximum entropy modelling. We form a set of constraints
                   by composing the acoustic space with the space of phone
                   classes, and use a continuous feature formulation of
                   maximum entropy modelling to select an optimal feature
                   set. The suggested approach has two steps: (1) the
                   selection of the best acoustic space that efficiently
                   and economically represents the acoustic data and its
                   variability; (2) the combination of selected acoustic
                   features in the maximum entropy framework to estimate
                   the posterior probabilities over the phonetic labels
                   given the acoustic input. Specific contributions of
                   this paper include a parameter estimation algorithm
                   (generalized improved iterative scaling) that enables
                   the use of negative features, the parameterization of
                   constraint functions using Gaussian mixture models, and
                   experimental results using the TIMIT database.},
  categories = {ml,maxent,am,recognition,edinburgh,sheffield},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-me.pdf},
  year = 2004
}
@inproceedings{shig040,
  author     = {Yoshinori Shiga},
  title      = {Source-filter separation based on an articulatory
    corpus},
  booktitle  = {One day meeting for young speech researchers ({UK}
    meeting)},
  address    = {University College London, London, United Kingdom},
  month      = apr,
  year       = {2004},
  abstract   = {A new approach is presented for estimating voice
    source and vocal-tract filter characteristics based on
    an articulatory database. From the viewpoint of
    acoustics, in order to estimate the transfer function
    of a system, both the input and output of the system
    need to be observed. In the case of the source-filter
    separation problem, however, only the output (i.e.
    speech) is observable, and the response of the system
    (vocal tract) and the input (voice source) must be
    estimated simultaneously. The estimation is hence
    theoretically impossible, and consequently the
    estimation problem is generally solved approximately by
    applying rather oversimplified models. The proposed
    approach separates these two characteristics under the
    assumption that each of the characteristics is
    controlled independently by a different set of factors.
    The separation is achieved by iterative approximation
    based on the above assumption using a large speech
    corpus including electro-magnetic articulograph data.
    The proposed approach enables the independent control
    of the source and filter characteristics, and thus
    contributes toward improving speech quality in speech
    synthesis.},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh},
}
@inproceedings{calhoun:04,
  author     = {Calhoun, Sasha},
  title      = {Phonetic dimensions of intonational categories: the
    case of {L}+{H}* and {H}*},
  booktitle  = {Prosody 2004},
  address    = {Nara, Japan},
  note       = {poster},
  month      = mar,
  year       = {2004},
  abstract   = {ToBI, in its conception, was an attempt to describe
    intonation in terms of phonological categories. An
    effect of the success of ToBI in doing this has been to
    make it standard to try to characterise all
    intonational phonological distinctions in terms of ToBI
    distinctions, i.e. segmental alignment of pitch targets
    and pitch height as either High or Low. Here we report
    a series of experiments which attempted to do this,
    linking two supposed phonological categories, theme and
    rheme accents, to two controversial ToBI pitch accents
    L+H* and H* respectively. Our results suggest a
    reanalysis of the dimensions of phonological
    intonational distinctions. It is suggested that there
    are three layers affecting the intonational contour:
    global extrinsic, local extrinsic and intrinsic; and
    the theme-rheme distinction may lie in the local
    extrinsic layer. It is the similarity both of the
    phonetic effects and the semantic information conveyed
    by the last two layers that has led to the confusion in
    results such as those reported here.},
  categories = {prosody, intonational phonology, information
    structure, metrical structure, production and
    perception experiment},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/calhounsp04.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/calhounsp04.ps},
}
@inproceedings{Gutkin:etal:ets-cam04,
  author     = {Alexander Gutkin and David Gay and Lev Goldfarb and
    Mirjam Wester},
  title      = {On the {A}rticulatory {R}epresentation of {S}peech
    within the {E}volving {T}ransformation {S}ystem
    {F}ormalism},
  booktitle  = {Pattern Representation and the Future of Pattern
    Recognition (Proc. Satellite Workshop of 17th
    International Conference on Pattern Recognition)},
  editor     = {Lev Goldfarb},
  pages      = {57--76},
  address    = {Cambridge, UK},
  month      = aug,
  year       = {2004},
  abstract   = { This paper deals with the formulation of an
    alternative, structural, approach to the speech
    representation and recognition problem. In this
    approach, we require both the representation and the
    learning algorithms to be linguistically meaningful and
    to naturally represent the linguistic data at hand.
    This allows the speech recognition system to discover
    the emergent combinatorial structure of the linguistic
    classes. The proposed approach is developed within the
    ETS formalism, the first formalism in applied
    mathematics specifically designed to address the issues
    of class and object/event representation. We present an
    initial application of ETS to the articulatory
    modelling of speech based on elementary physiological
    gestures that can be reliably represented as the ETS
    primitives. We discuss the advantages of this gestural
    approach over prevalent methods and its promising
    potential to mathematical modelling and representation
    in linguistics. },
  categories = {structural,recognition,ets,artic,mocha,edinburgh,unb},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ets_cam04_dasr.pdf},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ets_cam04_dasr.ps.gz},
}
@inproceedings{shig041,
  author = {Yoshinori Shiga and Simon King},
  title = {Accurate spectral envelope estimation for
                   articulation-to-speech synthesis},
  booktitle = {Proc. 5th ISCA Speech Synthesis Workshop},
  pages = {19--24},
  address = {CMU, Pittsburgh, USA},
  abstract = {This paper introduces a novel articulatory-acoustic
                   mapping in which detailed spectral envelopes are
                   estimated based on the cepstrum, inclusive of the
                   high-quefrency elements which are discarded in
                   conventional speech synthesis to eliminate the pitch
                   component of speech. For this estimation, the method
                   deals with the harmonics of multiple voiced-speech
                   spectra so that several sets of harmonics can be
                   obtained at various pitch frequencies to form a
                   spectral envelope. The experimental result shows that
                   the method estimates spectral envelopes with the
                   highest accuracy when the cepstral order is 48--64,
                   which suggests that the higher order coefficients are
                   required to represent detailed envelopes reflecting the
                   real vocal-tract responses.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope,
                   edinburgh},
  month = jun,
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.ps},
  year = 2004
}
@inproceedings{leo_04-1,
  author     = {Leonardo Badino and Claudia Barolo and Silvia Quazza},
  title      = {A General Approach to {TTS} Reading of Mixed-Language
    Texts},
  booktitle  = {Proc. ICSLP 2004},
  address    = {Jeju, Korea},
  year       = {2004},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/WeA2401o.5_p1083.pdf},
}
@article{mayoturk:04,
  author = {Mayo, C. and Turk, A.},
  title = {Adult-child differences in acoustic cue weighting are
                   influenced by segmental context: Children are not
                   always perceptually biased towards transitions},
  journal = {Journal of the Acoustical Society of America},
  volume = 115,
  pages = {3184--3194},
  categories = {speech perception, development, cue weighting},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/mayo-turk-2004a.pdf},
  year = 2004
}