2004.bib

@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2004-citations -ob /home/korin/projects/publications/new_output/transitdata/2004.bib -c 'year : "2004"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}
@inproceedings{leo_04-2,
  author    = {Zovato, Enrico and Sandri, Stefano and Quazza, Silvia and Badino, Leonardo},
  title     = {Prosodic analysis of a multi-style corpus in the perspective of emotional speech synthesis},
  booktitle = {Proc. ICSLP 2004},
  address   = {Jeju, Korea},
  year      = {2004},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ThA1404p.8_p890.pdf}
}
@inproceedings{shig042,
  author = {Shiga, Yoshinori and King, Simon},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.ps},
  title = {Source-Filter Separation for Articulation-to-Speech Synthesis},
  booktitle = {Proc. ICSLP},
  address = {Jeju, Korea},
  month = oct,
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04b.pdf},
  abstract = {In this paper we examine a method for separating out the vocal-tract filter response from the voice source characteristic using a large articulatory database. The method realises such separation for voiced speech using an iterative approximation procedure under the assumption that the speech production process is a linear system composed of a voice source and a vocal-tract filter, and that each of the components is controlled independently by different sets of factors. Experimental results show that the spectral variation is evidently influenced by the fundamental frequency or the power of speech, and that the tendency of the variation may be related closely to speaker identity. The method enables independent control over the voice source characteristic in our articulation-to-speech synthesis.},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh}
}
@article{Wray04-LC04,
  author = {Wray, A. and Cox, S.J. and Lincoln, M. and Tryggvason, J.},
  title = {A Formulaic Approach to Translation at the Post Office: Reading the Signs},
  journal = {Language and Communication},
  number = {1},
  pages = {59--75},
  volume = {24},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/WrayCoxetal2004.pdf},
  abstract = {TESSA is an interactive translation system designed to support transactions between a post office clerk and a deaf customer. The system translates the clerk's speech into British Sign Language (BSL), displayed on a screen, using a specially-developed avatar (virtual human). TESSA is a context-constrained exemplification of one of two basic approaches to machine translation, neither of which can currently fulfil all of the demands of successful automatic translation. Drawing on recent research in theoretical psycholinguistics, we show how TESSA is a convincing prototype model of one aspect of real human language processing. Ways are suggested of exploiting this parallel, potentially offering new possibilities for the future design of artificial language systems.},
  categories = {visicast,sign language,translation,UEA}
}
@inproceedings{vepa-king-isca04,
  author = {Vepa, J. and King, S.},
  title = {Subjective evaluation of join cost and smoothing methods},
  booktitle = {Proc. 5th {ISCA} speech synthesis workshop},
  address = {Pittsburgh, USA},
  month = jun,
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/vepa_tts04.pdf},
  abstract = {In our previous papers, we have proposed join cost functions derived from spectral distances, which have good correlations with perceptual scores obtained for a range of concatenation discontinuities. To further validate their ability to predict concatenation discontinuities, we have chosen the best three spectral distances and evaluated them subjectively in a listening test. The units for synthesis stimuli are obtained from a state-of-the-art unit selection text-to-speech system: `rVoice' from Rhetorical Systems Ltd. We also compared three different smoothing methods in this listening test. In this paper, we report listeners' preferences for each join costs in combination with each smoothing method.},
  categories = {join cost, Kalman filter, smoothing, evaluation, rVoice, edinburgh}
}
@inproceedings{christensen-ecir04,
  author = {Christensen, H. and Kolluru, B. and Gotoh, Y. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ecir04.ps.gz},
  title = {From text summarisation to style-specific summarisation for broadcast news},
  booktitle = {Proc. ECIR--2004},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ecir04.pdf},
  abstract = {In this paper we report on a series of experiments investigating the path from text-summarisation to style-specific summarisation of spoken news stories. We show that the portability of traditional text summarisation features to broadcast news is dependent on the diffusiveness of the information in the broadcast news story. An analysis of two categories of news stories (containing only read speech or some spontaneous speech) demonstrates the importance of the style and the quality of the transcript, when extracting the summary-worthy information content. Further experiments indicate the advantages of doing style-specific summarisation of broadcast news.},
  categories = {s3l,summarization,bnews,edinburgh}
}
@inproceedings{vepa_king_icslp2004,
  author = {Vepa, Jithendra and King, Simon},
  title = {Subjective Evaluation Of Join Cost Functions Used In Unit Selection Speech Synthesis},
  booktitle = {Proc. 8th International Conference on Spoken Language Processing (ICSLP)},
  address = {Jeju, Korea},
  month = oct,
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/vepa_king_icslp2004.pdf},
  abstract = {In our previous papers, we have proposed join cost functions derived from spectral distances, which have good correlations with perceptual scores obtained for a range of concatenation discontinuities. To further validate their ability to predict concatenation discontinuities, we have chosen the best three spectral distances and evaluated them subjectively in a listening test. The unit sequences for synthesis stimuli are obtained from a state-of-the-art unit selection text-to-speech system: rVoice from Rhetorical Systems Ltd. In this paper, we report listeners' preferences for each of the three join cost functions.}
}
@inproceedings{dielmann-icassp04,
  author = {Dielmann, A. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.ps.gz},
  title = {Dynamic {Bayesian} Networks for Meeting Structuring},
  booktitle = {Proc. IEEE ICASSP},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.pdf},
  abstract = {This paper is about the automatic structuring of multiparty meetings using audio information. We have used a corpus of 53 meetings, recorded using a microphone array and lapel microphones for each participant. The task was to segment meetings into a sequence of meeting actions, or phases. We have adopted a statistical approach using dynamic Bayesian networks (DBNs). Two DBN architectures were investigated: a two-level hidden Markov model (HMM) in which the acoustic observations were concatenated; and a multistream DBN in which two separate observation sequences were modelled. Additionally we have also explored the use of counter variables to constrain the number of action transitions. Experimental results indicate that the DBN architectures are an improvement over a simple baseline HMM, with the multistream DBN with counter constraints producing an action error rate of 6\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh}
}
@incollection{vepa:king:joincostchapter2004,
  editor = {Alwan, Abeer and Narayanan, Shri},
  author = {Vepa, Jithendra and King, Simon},
  publisher = {Prentice Hall},
  title = {Join Cost for Unit Selection Speech Synthesis},
  booktitle = {Speech Synthesis},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Vepa_King_joincostchapter2004.ps},
  year = {2004}
}
@inproceedings{mayoturk:04b,
  author     = {Mayo, C. and Turk, A.},
  title      = {The Development of Perceptual Cue Weighting Within and Across Monosyllabic Words},
  booktitle  = {LabPhon 9, University of Illinois at Urbana-Champaign},
  year       = {2004},
  categories = {speech perception, development, cue weighting}
}
@inproceedings{wester04:asynch,
  author = {Wester, M. and Frankel, J. and King, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.ps},
  title = {Asynchronous Articulatory Feature Recognition Using Dynamic {B}ayesian Networks},
  booktitle = {Proc. IEICE Beyond HMM Workshop},
  address = {Kyoto},
  month = dec,
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Wester_et_al_IEICE.pdf},
  abstract = {This paper builds on previous work where dynamic Bayesian networks (DBN) were proposed as a model for articulatory feature recognition. Using DBNs makes it possible to model the dependencies between features, an addition to previous approaches which was found to improve feature recognition performance. The DBN results were promising, giving close to the accuracy of artificial neural nets (ANNs). However, the system was trained on canonical labels, leading to an overly strong set of constraints on feature co-occurrence. In this study, we describe an embedded training scheme which learns a set of data-driven asynchronous feature changes where supported in the data. Using a subset of the OGI Numbers corpus, we describe articulatory feature recognition experiments using both canonically-trained and asynchronous DBNs. Performance using DBNs is found to exceed that of ANNs trained on an identical task, giving a higher recognition accuracy. Furthermore, inter-feature dependencies result in a more structured model, giving rise to fewer feature combinations in the recognition output. In addition to an empirical evaluation of this modelling approach, we give a qualitative analysis, comparing asynchrony found through our data-driven methods to the asynchrony which may be expected on the basis of linguistic knowledge.},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh}
}
@inproceedings{Toney2004,
  editor = {Brewster, S. and Dunlop, M.},
  author = {Toney, D. and Feinberg, D. and Richmond, K.},
  publisher = {Springer},
  title = {Acoustic Features for Profiling Mobile Users of Conversational Interfaces},
  booktitle = {6th International Symposium on Mobile Human-Computer Interaction - {MobileHCI} 2004},
  address = {Glasgow, Scotland},
  month = sep,
  pages = {394--398},
  year = {2004},
  abstract = {Conversational interfaces allow human users to use spoken language to interact with computer-based information services. In this paper, we examine the potential for personalizing speech-based human-computer interaction according to the user's gender and age. We describe a system that uses acoustic features of the user's speech to automatically estimate these physical characteristics. We discuss the difficulties of implementing this process in relation to the high level of environmental noise that is typical of mobile human-computer interaction.}
}
@inproceedings{shig043,
  author = {Shiga, Yoshinori and King, Simon},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.ps},
  title = {Estimating detailed spectral envelopes using articulatory clustering},
  booktitle = {Proc. ICSLP},
  address = {Jeju, Korea},
  month = oct,
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_icslp04a.pdf},
  abstract = {This paper presents an articulatory-acoustic mapping where detailed spectral envelopes are estimated. During the estimation, the harmonics of a range of F0 values are derived from the spectra of multiple voiced speech signals vocalized with similar articulator settings. The envelope formed by these harmonics is represented by a cepstrum, which is computed by fitting the peaks of all the harmonics based on the weighted least square method in the frequency domain. The experimental result shows that the spectral envelopes are estimated with the highest accuracy when the cepstral order is 48--64 for a female speaker, which suggests that representing the real response of the vocal tract requires high-quefrency elements that conventional speech synthesis methods are forced to discard in order to eliminate the pitch component of speech.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope, edinburgh}
}
@inproceedings{frankel04:artic_dbn,
  author = {Frankel, J. and Wester, M. and King, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.ps},
  title = {Articulatory feature recognition using dynamic {B}ayesian networks},
  booktitle = {Proc. {ICSLP}},
  month = sep,
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/Frankel_et_al_ICSLP2004.pdf},
  abstract = {This paper describes the use of dynamic Bayesian networks for the task of articulatory feature recognition. We show that by modeling the dependencies between a set of 6 multi-leveled articulatory features, recognition accuracy is increased over an equivalent system in which features are considered independent. Results are compared to those found using artificial neural networks on an identical task.},
  categories = {am,artic,asr,dbn,timit,edinburgh}
}
@inproceedings{clarkrichmondking_ssw504,
  author = {Clark, Robert A.J. and Richmond, Korin and King, Simon},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.ps},
  title = {Festival 2 -- build your own general purpose unit selection speech synthesiser},
  booktitle = {Proc. 5th {ISCA} workshop on speech synthesis},
  address = {Pittsburgh, USA},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/clarkrichmondking_ssw504.pdf},
  abstract = {This paper describes version 2 of the Festival speech synthesis system. Festival 2 provides a development environment for concatenative speech synthesis, and now includes a general purpose unit selection speech synthesis engine. We discuss various aspects of unit selection speech synthesis, focusing on the research issues that relate to voice design and the automation of the voice development process.},
  categories = {synthesis, festival, unitselection}
}
@inproceedings{bakerclarkwhite_ssw504,
  author     = {Baker, Rachel and Clark, Robert A.J. and White, Michael},
  title      = {Synthesising Contextually Appropriate Intonation in Limited Domains},
  booktitle  = {Proc. 5th {ISCA} workshop on speech synthesis},
  address    = {Pittsburgh, USA},
  year       = {2004},
  ps         = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/bakerclarkwhite_ssw504.ps},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/bakerclarkwhite_ssw504.pdf},
  categories = {synthesis, prosody, intonation, festival}
}
@inproceedings{Gutkin:King:icslp04,
  author = {Gutkin, Alexander and King, Simon},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icslp04.ps.gz},
  title = {Phone classification in pseudo-{E}uclidean Vector Spaces},
  booktitle = {Proc. 8th International Conference on Spoken Language Processing (ICSLP)},
  issn = {1225-441X},
  year = {2004},
  month = oct,
  volume = {II},
  pages = {1453--1457},
  address = {Jeju Island, Korea},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icslp04.pdf},
  abstract = {Recently we have proposed a structural framework for modelling speech, which is based on patterns of phonological distinctive features, a linguistically well-motivated alternative to standard vector-space acoustic models like HMMs. This framework gives considerable representational freedom by working with features that have explicit linguistic interpretation, but at the expense of the ability to apply the wide range of analytical decision algorithms available in vector spaces, restricting oneself to more computationally expensive and less-developed symbolic metric tools. In this paper we show that a dissimilarity-based distance-preserving transition from the original structural representation to a corresponding pseudo-Euclidean vector space is possible. Promising results of phone classification experiments conducted on the TIMIT database are reported.},
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh}
}
@inproceedings{leo_04-3,
  author    = {Badino, Leonardo},
  title     = {Chinese Text Word Segmentation Considering Semantic Links among Sentences},
  booktitle = {Proc. ICSLP 2004},
  address   = {Jeju, Korea},
  year      = {2004},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ThB2202p.22_p965.pdf}
}
@inproceedings{Gutkin:King:icpr04,
  author = {Gutkin, Alexander and King, Simon},
  publisher = {IEEE Computer Society Press},
  isbn = {0-7695-2128-2},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icpr04.ps.gz},
  booktitle = {Proc. 17th International Conference on Pattern Recognition (ICPR)},
  title = {{S}tructural {R}epresentation of {S}peech for {P}honetic {C}lassification},
  year = {2004},
  month = aug,
  volume = {3},
  pages = {438--441},
  address = {Cambridge, UK},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/gutkin_king_icpr04.pdf},
  abstract = {This paper explores the issues involved in using symbolic metric algorithms for automatic speech recognition (ASR), via a structural representation of speech. This representation is based on a set of phonological distinctive features which is a linguistically well-motivated alternative to the ``beads-on-a-string'' view of speech that is standard in current ASR systems. We report the promising results of phoneme classification experiments conducted on a standard continuous speech task.},
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh}
}
@inproceedings{dielmann-mmsp04,
  author = {Dielmann, A. and Renals, S.},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.ps.gz},
  title = {Multi-stream segmentation of meetings},
  booktitle = {Proc. IEEE Workshop on Multimedia Signal Processing},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.pdf},
  abstract = {This paper investigates the automatic segmentation of meetings into a sequence of group actions or phases. Our work is based on a corpus of multiparty meetings collected in a meeting room instrumented with video cameras, lapel microphones and a microphone array. We have extracted a set of feature streams, in this case extracted from the audio data, based on speaker turns, prosody and a transcript of what was spoken. We have related these signals to the higher level semantic categories via a multistream statistical model based on dynamic Bayesian networks (DBNs). We report on a set of experiments in which different DBN architectures are compared, together with the different feature streams. The resultant system has an action error rate of 9\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh}
}
@inproceedings{leo_04-4,
  author    = {Badino, Leonardo and Barolo, Claudia and Quazza, Silvia},
  title     = {Language independent phoneme mapping for foreign {TTS}},
  booktitle = {Proc. 5th ISCA Speech Synthesis Workshop},
  address   = {Pittsburgh, USA},
  year      = {2004},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/2026.pdf}
}
@inproceedings{abdelhaleem-icassp04,
  author = {Abdel-Haleem, Y. H. and Renals, S. and Lawrence, N. D.},
  title = {Acoustic space dimensionality selection and combination using the maximum entropy principle},
  booktitle = {Proc. IEEE ICASSP},
  abstract = {In this paper we propose a discriminative approach to acoustic space dimensionality selection based on maximum entropy modelling. We form a set of constraints by composing the acoustic space with the space of phone classes, and use a continuous feature formulation of maximum entropy modelling to select an optimal feature set. The suggested approach has two steps: (1) the selection of the best acoustic space that efficiently and economically represents the acoustic data and its variability; (2) the combination of selected acoustic features in the maximum entropy framework to estimate the posterior probabilities over the phonetic labels given the acoustic input. Specific contributions of this paper include a parameter estimation algorithm (generalized improved iterative scaling) that enables the use of negative features, the parameterization of constraint functions using Gaussian mixture models, and experimental results using the TIMIT database.},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-me.pdf},
  categories = {ml,maxent,am,recognition,edinburgh,sheffield}
}
@inproceedings{shig040,
  author = {Shiga, Yoshinori},
  title = {Source-filter separation based on an articulatory corpus},
  booktitle = {One day meeting for young speech researchers ({UK} meeting)},
  address = {University College London, London, United Kingdom},
  month = apr,
  year = {2004},
  abstract = {A new approach is presented for estimating voice source and vocal-tract filter characteristics based on an articulatory database. From the viewpoint of acoustics, in order to estimate the transfer function of a system, both the input and output of the system need to be observed. In the case of the source-filter separation problem, however, only the output (i.e. speech) is observable, and the response of the system (vocal tract) and the input (voice source) must be estimated simultaneously. The estimation is hence theoretically impossible, and consequently the estimation problem is generally solved approximately by applying rather oversimplified models. The proposed approach separates these two characteristics under the assumption that each of the characteristics is controlled independently by a different set of factors. The separation is achieved by iterative approximation based on the above assumption using a large speech corpus including electro-magnetic articulograph data. The proposed approach enables the independent control of the source and filter characteristics, and thus contributes toward improving speech quality in speech synthesis.},
  categories = {artic, lbg, clustering, mocha, source-filter, edinburgh}
}
@inproceedings{calhoun:04,
  author = {Calhoun, Sasha},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/calhounsp04.ps},
  title = {Phonetic dimensions of intonational categories: the case of {L}+{H}* and {H}*},
  booktitle = {Prosody 2004},
  address = {Nara, Japan},
  month = mar,
  note = {poster},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/calhounsp04.pdf},
  abstract = {ToBI, in its conception, was an attempt to describe intonation in terms of phonological categories. An effect of the success of ToBI in doing this has been to make it standard to try to characterise all intonational phonological distinctions in terms of ToBI distinctions, i.e. segmental alignment of pitch targets and pitch height as either High or Low. Here we report a series of experiments which attempted to do this, linking two supposed phonological categories, theme and rheme accents, to two controversial ToBI pitch accents L+H* and H* respectively. Our results suggest a reanalysis of the dimensions of phonological intonational distinctions. It is suggested that there are three layers affecting the intonational contour: global extrinsic, local extrinsic and intrinsic; and the theme-rheme distinction may lie in the local extrinsic layer. It is the similarity both of the phonetic effects and the semantic information conveyed by the last two layers that has led to the confusion in results such as those reported here.},
  categories = {prosody, intonational phonology, information structure, metrical structure, production and perception experiment}
}
@inproceedings{Gutkin:etal:ets-cam04,
  editor = {Goldfarb, Lev},
  author = {Gutkin, Alexander and Gay, David and Goldfarb, Lev and Wester, Mirjam},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ets_cam04_dasr.ps.gz},
  title = {On the {A}rticulatory {R}epresentation of {S}peech within the {E}volving {T}ransformation {S}ystem {F}ormalism},
  booktitle = {Pattern Representation and the Future of Pattern Recognition (Proc. Satellite Workshop of 17th International Conference on Pattern Recognition)},
  address = {Cambridge, UK},
  month = aug,
  pages = {57--76},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/ets_cam04_dasr.pdf},
  abstract = {This paper deals with the formulation of an alternative, structural, approach to the speech representation and recognition problem. In this approach, we require both the representation and the learning algorithms to be linguistically meaningful and to naturally represent the linguistic data at hand. This allows the speech recognition system to discover the emergent combinatorial structure of the linguistic classes. The proposed approach is developed within the ETS formalism, the first formalism in applied mathematics specifically designed to address the issues of class and object/event representation. We present an initial application of ETS to the articulatory modelling of speech based on elementary physiological gestures that can be reliably represented as the ETS primitives. We discuss the advantages of this gestural approach over prevalent methods and its promising potential to mathematical modelling and representation in linguistics.},
  categories = {structural,recognition,ets,artic,mocha,edinburgh,unb}
}
@inproceedings{shig041,
  author = {Shiga, Yoshinori and King, Simon},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.ps},
  title = {Accurate spectral envelope estimation for articulation-to-speech synthesis},
  booktitle = {Proc. 5th ISCA Speech Synthesis Workshop},
  address = {CMU, Pittsburgh, USA},
  month = jun,
  pages = {19--24},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/shiga_ssw504.pdf},
  abstract = {This paper introduces a novel articulatory-acoustic mapping in which detailed spectral envelopes are estimated based on the cepstrum, inclusive of the high-quefrency elements which are discarded in conventional speech synthesis to eliminate the pitch component of speech. For this estimation, the method deals with the harmonics of multiple voiced-speech spectra so that several sets of harmonics can be obtained at various pitch frequencies to form a spectral envelope. The experimental result shows that the method estimates spectral envelopes with the highest accuracy when the cepstral order is 48--64, which suggests that the higher order coefficients are required to represent detailed envelopes reflecting the real vocal-tract responses.},
  categories = {artic, lbg, clustering, mocha, harmonic, envelope, edinburgh}
}
@inproceedings{leo_04-1,
  author    = {Badino, Leonardo and Barolo, Claudia and Quazza, Silvia},
  title     = {A General Approach to {TTS} Reading of Mixed-Language Texts},
  booktitle = {Proc. ICSLP 2004},
  address   = {Jeju, Korea},
  year      = {2004},
  pdf       = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/WeA2401o.5_p1083.pdf}
}
@article{mayoturk:04,
  author = {Mayo, C. and Turk, A.},
  title = {Adult-child differences in acoustic cue weighting are influenced by segmental context: Children are not always perceptually biased towards transitions},
  journal = {Journal of the Acoustical Society of America},
  volume = {115},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/mayo-turk-2004a.pdf},
  pages = {3184--3194},
  categories = {speech perception, development, cue weighting}
}