# 2005.bib

@comment{{This file has been generated by bib2bib 1.92}}

@comment{{Command line: /home/korin/bibtex2html-1.92-LINUX/bib2bib -oc /home/korin/projects/publications/new_output/transitdata/2005-citations -ob /home/korin/projects/publications/new_output/transitdata/2005.bib -c 'year : "2005"' /home/korin/projects/publications/filtlists/full_publications_list.bib}}

@inproceedings{oliverclark_interspeech05,
  author     = {Oliver, Dominika and Clark, Robert A. J.},
  title      = {Modelling pitch accent types for {P}olish speech synthesis},
  booktitle  = {Proc. Interspeech 2005},
  year       = {2005},
  categories = {speech synthesis, prosody, intonation, festival, Polish},
}

@inproceedings{christensen-icassp05,
  author     = {Christensen, H. and Kolluru, B. and Gotoh, Y. and Renals, S.},
  title      = {Maximum entropy segmentation of broadcast news},
  booktitle  = {Proc. IEEE ICASSP},
  year       = {2005},
  abstract   = {This paper presents an automatic system for structuring and preparing a news broadcast for applications such as speech summarization, browsing, archiving and information retrieval. This process comprises transcribing the audio using an automatic speech recognizer and subsequently segmenting the text into utterances and topics. A maximum entropy approach is used to build statistical models for both utterance and topic segmentation. The experimental work addresses the effect on performance of the topic boundary detector of three factors: the information sources used, the quality of the ASR transcripts, and the quality of the utterance boundary detector. The results show that the topic segmentation is not affected severely by transcripts errors, whereas errors in the utterance segmentation are more devastating.},
  categories = {s3l,summarization,bnews,edinburgh,sheffield},
}

@inproceedings{garau-interspeech05,
  author     = {Garau, G. and Renals, S. and Hain, T.},
  title      = {Applying Vocal Tract Length Normalization to Meeting Recordings},
  booktitle  = {Proc. Interspeech},
  month      = {September},
  year       = {2005},
  abstract   = {Vocal Tract Length Normalisation (VTLN) is a commonly used technique to normalise for inter-speaker variability. It is based on the speaker-specific warping of the frequency axis, parameterised by a scalar warp factor. This factor is typically estimated using maximum likelihood. We discuss how VTLN may be applied to multiparty conversations, reporting a substantial decrease in word error rate in experiments using the ICSI meetings corpus. We investigate the behaviour of the VTLN warping factor and show that a stable estimate is not obtained. Instead it appears to be influenced by the context of the meeting, in particular the current conversational partner. These results are consistent with predictions made by the psycholinguistic interactive alignment account of dialogue, when applied at the acoustic and phonological levels.},
}

@inproceedings{Gutkin:Gay:qr05,
  author     = {Gutkin, Alexander and Gay, David R.},
  editor     = {Hofbaur, Michael and Rinner, Bernhard and Wotawa, Franz},
  title      = {{S}tructural {R}epresentation and {M}atching of {A}rticulatory {S}peech {S}tructures based on the {E}volving {T}ransformation {S}ystem ({ETS}) {F}ormalism},
  booktitle  = {Proc. 19th International Workshop on Qualitative Reasoning (QR-05)},
  pages      = {89--96},
  month      = {May},
  year       = {2005},
  isbn       = {3-9502019-0-4},
  abstract   = {A formal structural representation of speech consistent with the principles of combinatorial structure theory is presented in this paper. The representation is developed within the Evolving Transformation System (ETS) formalism and encapsulates speech processes at the articulatory level. We show how the class structure of several consonantal phonemes of English can be expressed with the help of articulatory gestures---the atomic combinatorial units of speech. As a preliminary step towards the design of a speech recognition architecture based on the structural approaches to physiology and articulatory phonology, we present an algorithm for the structural detection of phonemic class elements inside gestural ETS structures derived from continuous speech. Experiments designed to verify the adequacy of the hypothesised gestural class structure conducted on the MOCHA articulatory corpus are then described. Our experimental results support the hypothesis that the articulatory representation captures sufficient information for the accurate structural identification of the phonemic classes in question.},
  categories = {structural,recognition,ets,artic,mocha,edinburgh,unb},
}

@inproceedings{hain-interspeech05,
  author     = {Hain, T. and Dines, J. and Garau, G. and Karafiat, M. and Moore, D. and Wan, V. and Ordelman, R. and Renals, S.},
  title      = {Transcription of Conference Room Meetings: an Investigation},
  booktitle  = {Proc. Interspeech},
  year       = {2005},
  abstract   = {The automatic processing of speech collected in conference style meetings has attracted considerable interest with several large scale projects devoted to this area. In this paper we explore the use of various meeting corpora for the purpose of automatic speech recognition. In particular we investigate the similarity of these resources and how to efficiently use them in the construction of a meeting transcription system. The analysis shows distinctive features for each resource. However the benefit in pooling data and hence the similarity seems sufficient to speak of a generic conference meeting domain. In this context this paper also presents work on development for the AMI meeting transcription system, a joint effort by seven sites working on the AMI (augmented multi-party interaction) project.},
  categories = {ami,asr,edinburgh},
}

@inproceedings{Shimodaira:mlmi05,
  author     = {Shimodaira, Hiroshi and Uematsu, Keisuke and Kawamoto, Shin'ichi and Hofer, Gregor and Nakai, Mitsuru},
  title      = {Analysis and Synthesis of Head Motion for Lifelike Conversational Agents},
  booktitle  = {Proc. MLMI2005},
  month      = {July},
  year       = {2005},
  categories = {lifelike agents},
}

@inproceedings{calhoun:05,
  author     = {Calhoun, Sasha},
  title      = {It's the Difference That Matters: An Argument for Contextually-Grounded Acoustic Intonational Phonology},
  booktitle  = {Linguistics Society of America Annual Meeting},
  month      = {January},
  year       = {2005},
  abstract   = {Standardly, the link between intonation and discourse meaning is described in terms of perceptual intonation categories, e.g. ToBI. We argue that this approach needs to be refined to explicitly recognise: firstly, that perception is affected by multiple acoustic cues, including duration and intensity, as well as F0; and secondly that the interpretation of these cues is directly linked to the phonetic and discourse context. Investigating the marking of topic status in a small game task corpus, we found that although topic status is not consistently marked by ToBI pitch accent, it is by the F0 mean, intensity and duration of the topic word. Using regression analysis, we found that when factoring out the F0 mean and intensity of key parts of the preceding discourse, intensity and duration become stronger predictors of topic status than F0.},
  categories = {intonation theory and methodology, information structure, pitch accents, corpus study},
}

@inproceedings{Hachey05,
  author     = {Hachey, B. and Murray, G. and Reitter, D.},
  title      = {The {E}mbra System at {DUC} 2005: Query-oriented Multi-document Summarization with a Very Large Latent Semantic Space},
  booktitle  = {Proceedings of the Document Understanding Conference (DUC) 2005, Vancouver, BC, Canada},
  month      = {October},
  year       = {2005},
  abstract   = {Our summarization system submitted to DUC 2005, Embra (or Edinburgh), is novel in that it relies on building a very large semantic space for the purposes of determining relevance and redundancy in an MMR-style framework. We address specificity by detecting the presence or absence of Named Entities in our extract candidates, and we implemented a sentence-ordering algorithm to maximize sentence cohesion in our final summaries.},
  categories = {summarization, latent semantic analysis},
}

@phdthesis{gray2005,
  author     = {Gray, Calum},
  title      = {Acoustic Pulse Reflectometry for Measurement of the Vocal Tract with Application in Voice Synthesis},
  school     = {University of Edinburgh},
  year       = {2005},
  key        = {gray2005},
  abstract   = {The measurement of human airway dimensions has been a frequent objective in the fields of respiratory medicine and speech research, but has proven difficult to achieve non-invasively due to the airway's function in breathing, swallowing and speaking. Acoustic pulse reflectometry (APR) has been employed in clinical studies of the vocal tract for several years, normally in the function of airway measurement. The focus of this work is to utilise APR in capturing vocal tract profiles during the phonation of vowel sounds, for the purposes of sound synthesis. By making an equivalent tube model of the vocal tract, the propagation of an acoustic wave can be readily calculated using techniques such as waveguide modelling, which will in turn allow us to synthesise sound and form the basis of a physical model of the voice. The attractions of this technique for vocal tract measurement are many: it is non-invasive, safe, repeatable and inexpensive. In this thesis, the basic theory describing wave propagation in tubes of varying cross-section is outlined, together with a review of how the time domain technique of APR can be used to measure the input impulse response of a tubular object, such as the vocal tract, from which the bore profile can be calculated using the layer peeling algorithm. Experimental measurements of the human vocal tract during the phonation (imitation) of five non-nasalised vowels [a, e, i, o, u] are presented, using recent enhancements to the APR technique (MLS excitation signals and virtual DC tube method) for a single subject, together with optimisation of the APR technique for vocal tract measurement and its application in a group study using adults and children. To validate the results obtained using the APR technique, a comparative study with an accepted ``gold standard'' imaging technique (Magnetic Resonance Imaging - MRI) is presented, using the same subject, a voice professional, in both studies.
The results from this study show reasonable overall agreement between the APR and MRI data, with the limited resolution of the acoustic technique tending to broaden features and underestimate cross sectional areas, particularly in the region of the pharynx and glottis. Protocols and supplementary documentation required by scientific, clinical and ethical review bodies for the use of human volunteers in research trials are provided. From this study a data corpus of vocal tract measurements is gathered, using the techniques of APR and MRI, in adult males, adult females and children. In conclusion, limitations of the APR technique for vocal tract measurement are discussed and potential improvements are proposed.},
}

@inproceedings{clarkrichmondking_interspeech05,
  author     = {Clark, Robert A. J. and Richmond, Korin and King, Simon},
  title      = {Multisyn voices from {ARCTIC} data for the {B}lizzard challenge},
  booktitle  = {Proc. Interspeech 2005},
  month      = {September},
  year       = {2005},
  abstract   = {This paper describes the process of building unit selection voices for the Festival Multisyn engine using four ARCTIC datasets, as part of the Blizzard evaluation challenge. The build process is almost entirely automatic, with very little need for human intervention. We discuss the difference in the evaluation results for each voice and evaluate the suitability of the ARCTIC datasets for building this type of voice.},
  categories = {speech synthesis, festival, evaluation},
}

@article{Nakai2005IEICE01,
  author     = {Nakai, Mitsuru and Sagayama, Shigeki and Shimodaira, Hiroshi},
  title      = {On-line Handwriting Recognition Based on Sub-stroke {HMM}},
  journal    = {Trans. IEICE D-II},
  volume     = {J88-D2},
  number     = {8},
  month      = {August},
  year       = {2005},
  note       = {(in press) (in Japanese)},
  abstract   = {This paper describes context-dependent sub-stroke HMMs for on-line handwritten character recognition. As there are so many characters in Japanese, modeling each character by an HMM leads to an infeasible character-recognition system requiring huge amount of memory and enormous computation time. The sub-stroke HMM approach has overcomed these problems by minimizing modeling unit. However, one of the drawback of this approach is that the recognition accuracy deteriorates for scribbled characters. In this paper, we show that the context-dependent sub-stroke modeling which depends on how the sub-stroke connects to the adjacent substrokes is effective to achieve robust recognition of low quality characters.},
  categories = {online handwritten character recognition},
}

@inproceedings{calhoun:05-a,
  author     = {Calhoun, Sasha and Nissim, Malvina and Steedman, Mark and Brenier, Jason},
  title      = {A Framework for Annotating Information Structure in Discourse},
  booktitle  = {Frontiers in Corpus Annotation II: Pie in the Sky, ACL2005 Conference Workshop},
  month      = {June},
  year       = {2005},
  abstract   = {We present a framework for the integrated analysis of the textual and prosodic characteristics of information structure in the {\em Switchboard} corpus of conversational English. Information structure describes the availability, organisation and salience of entities in a discourse model. We present standards for the annotation of {\em information status} (old, mediated and new), and give guidelines for annotating {\em information structure}, i.e. {\em theme/rheme} and {\em background/kontrast}. We show that information structure in English can only be analysed concurrently with prosodic prominence and phrasing. Along with existing annotations which we have integrated using NXT technology, the corpus will be unique in the field of conversational speech in terms of size and richness of annotation, vital for many NLP applications.},
  categories = {prosody, information structure, annotation, discourse semantics},
}

@inproceedings{NistevalAMI05,
  author     = {Hain, T. and Burget, L. and Dines, J. and Garau, G. and Karafiat, M. and Lincoln, M. and McCowan, I. and Moore, D. and Wan, V. and Ordelman, R. and Renals, S.},
  title      = {The 2005 {AMI} System for the transcription of Speech in Meetings},
  booktitle  = {Proceedings of the Rich Transcription 2005 Spring Meeting Recognition Evaluation},
  year       = {2005},
  abstract   = {In this paper we describe the 2005 AMI system for the transcription of speech in meetings used in the 2005 NIST RT evaluations. The system was designed for participation in the speech to text part of the evaluations, in particular for transcription of speech recorded with multiple distant microphones and independent headset microphones. System performance was tested on both conference room and lecture style meetings. Although input sources are processed using different frontends, the recognition process is based on a unified system architecture. The system operates in multiple passes and makes use of state of the art technologies such as discriminative training, vocal tract length normalisation, heteroscedastic linear discriminant analysis, speaker adaptation with maximum likelihood linear regression and minimum word error rate decoding. In this paper we describe the system performance on the official development and test sets for the NIST RT05s evaluations. The system was jointly developed in less than 10 months by a multi-site team and was shown to achieve competitive performance.},
  categories = {LVCSR, NIST Meeting Transcription Evaluation RT05S},
}

@inproceedings{Gutkin:King:pris05,
  author     = {Gutkin, Alexander and King, Simon},
  editor     = {Gamboa, Hugo and Fred, Ana},
  title      = {{I}nductive {S}tring {T}emplate-{B}ased {L}earning of {S}poken {L}anguage},
  booktitle  = {Proc. 5th International Workshop on Pattern Recognition in Information Systems (PRIS-2005), In conjunction with the 7th International Conference on Enterprise Information Systems (ICEIS-2005)},
  publisher  = {INSTICC Press},
  pages      = {43--51},
  month      = {May},
  year       = {2005},
  isbn       = {972-8865-28-7},
  abstract   = {This paper deals with formulation of alternative structural approach to the speech recognition problem. In this approach, we require both the representation and the learning algorithms defined on it to be linguistically meaningful, which allows the speech recognition system to discover the nature of the linguistic classes of speech patterns corresponding to the speech waveforms. We briefly discuss the current formalisms and propose an alternative --- a phonologically inspired string-based inductive speech representation, defined within an analytical framework specifically designed to address the issues of class and object representation. We also present the results of the phoneme classification experiments conducted on the TIMIT corpus of continuous speech.},
  categories = {structural,recognition,acoustic,phonetic_feature,timit,edinburgh},
}

@article{mayoturk-jasa05,
  author     = {Mayo, C. and Turk, A.},
  title      = {The influence of spectral distinctiveness on acoustic cue weighting in children's and adults' speech perception},
  journal    = {Journal of the Acoustical Society of America},
  volume     = {118},
  pages      = {1730--1741},
  year       = {2005},
}

@inproceedings{king_bartels_bilmes_isp05,
  author     = {King, Simon and Bartels, Chris and Bilmes, Jeff},
  title      = {{SVitchboard} 1: Small Vocabulary Tasks from {Switchboard} 1},
  booktitle  = {Proc. Interspeech 2005},
  year       = {2005},
  abstract   = {We present a conversational telephone speech data set designed to support research on novel acoustic models. Small vocabulary tasks from 10 words up to 500 words are defined using subsets of the Switchboard-1 corpus; each task has a completely closed vocabulary (an OOV rate of 0\%). We justify the need for these tasks, describe the algorithm for selecting them from a large corpus, give a statistical analysis of the data and present baseline whole-word hidden Markov model recognition results. The goal of the paper is to define a common data set and to encourage other researchers to use it.},
}

@article{wrigley-sap05,
  author     = {Wrigley, S. J. and Brown, G. J. and Wan, V. and Renals, S.},
  title      = {Speech and crosstalk detection in multi-channel audio},
  journal    = {IEEE Trans. on Speech and Audio Processing},
  volume     = {13},
  pages      = {84--91},
  year       = {2005},
  abstract   = {The analysis of scenarios in which a number of microphones record the activity of speakers, such as in a roundtable meeting, presents a number of computational challenges. For example, if each participant wears a microphone, it can receive speech from both the microphone's wearer (local speech) and from other participants (crosstalk). The recorded audio can be broadly classified in four ways: local speech, crosstalk plus local speech, crosstalk alone and silence. We describe two experiments related to the automatic classification of audio into these four classes. The first experiment attempted to optimise a set of acoustic features for use with a Gaussian mixture model (GMM) classifier. A large set of potential acoustic features were considered, some of which have been employed in previous studies. The best-performing features were found to be kurtosis, fundamentalness and cross-correlation metrics. The second experiment used these features to train an ergodic hidden Markov model classifier. Tests performed on a large corpus of recorded meetings show classification accuracies of up to 96\%, and automatic speech recognition performance close to that obtained using ground truth segmentation.},
  categories = {m4,meetings,edinburgh,asr,sheffield},
}

@article{goldman2005,
  author     = {Goldman, Jerry and Renals, Steve and Bird, Steven and {de Jong}, Franciska and Federico, Marcello and Fleischhauer, Carl and Kornbluh, Mark and Lamel, Lori and Oard, Doug and Stewart, Clare and Wright, Richard},
  title      = {Accessing the spoken word},
  journal    = {International Journal of Digital Libraries},
  volume     = {5},
  number     = {4},
  pages      = {287--298},
  year       = {2005},
  abstract   = {Spoken word audio collections cover many domains, including radio and television broadcasts, oral narratives, governmental proceedings, lectures, and telephone conversations. The collection, access and preservation of such data is stimulated by political, economic, cultural and educational needs. This paper outlines the major issues in the field, reviews the current state of technology, examines the rapidly changing policy issues relating to privacy and copyright, and presents issues relating to the collection and preservation of spoken audio content.},
  categories = {swag,asr,ir,edinburgh},
}

@inproceedings{hifny-interspeech05,
  author     = {Hifny, Y. and Renals, S. and Lawrence, N.},
  title      = {A Hybrid {MaxEnt/HMM} based {ASR} System},
  booktitle  = {Proc. Interspeech},
  year       = {2005},
  abstract   = {The aim of this work is to develop a practical framework, which extends the classical Hidden Markov Models (HMM) for continuous speech recognition based on the Maximum Entropy (MaxEnt) principle. The MaxEnt models can estimate the posterior probabilities directly as with Hybrid NN/HMM connectionist speech recognition systems. In particular, a new acoustic modelling based on discriminative MaxEnt models is formulated and is being developed to replace the generative Gaussian Mixture Models (GMM) commonly used to model acoustic variability. Initial experimental results using the TIMIT phone task are reported.},
  categories = {ml,asr,edinburgh,sheffield},
}

@incollection{dielmann-mlmi04,
  author     = {Dielmann, A. and Renals, S.},
  editor     = {Bengio, S. and Bourlard, H.},
  title      = {Multistream dynamic {Bayesian} network for meeting segmentation},
  booktitle  = {Proc. Multimodal Interaction and Related Machine Learning Algorithms Workshop (MLMI--04)},
  publisher  = {Springer},
  pages      = {76--86},
  year       = {2005},
  abstract   = {This paper investigates the automatic analysis and segmentation of meetings. A meeting is analysed in terms of individual behaviours and group interactions, in order to decompose each meeting in a sequence of relevant phases, named meeting actions. Three feature families are extracted from multimodal recordings: prosody from individual lapel microphone signals, speaker activity from microphone array data and lexical features from textual transcripts. A statistical approach is then used to relate low-level features with a set of abstract categories. In order to provide a flexible and powerful framework, we have employed a dynamic Bayesian network based model, characterized by multiple stream processing and flexible state duration modelling. Experimental results demonstrate the strength of this system, providing a meeting action error rate of 9\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh},
}

@inproceedings{Gutkin:King:icassp05,
  author     = {Gutkin, Alexander and King, Simon},
  title      = {{D}etection of {S}ymbolic {G}estural {E}vents in {A}rticulatory {D}ata for {U}se in {S}tructural {R}epresentations of {C}ontinuous {S}peech},
  booktitle  = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP-05)},
  publisher  = {IEEE Signal Processing Society Press},
  volume     = {I},
  pages      = {885--888},
  month      = {March},
  year       = {2005},
  isbn       = {0-7803-8875-5},
  abstract   = {One of the crucial issues which often needs to be addressed in structural approaches to speech representation is the choice of fundamental symbolic units of representation. In this paper, a physiologically inspired methodology for defining these symbolic atomic units in terms of primitive articulatory events is proposed. It is shown how the atomic articulatory events (gestures) can be detected directly in the articulatory data. An algorithm for evaluating the reliability of the articulatory events is described and promising results of the experiments conducted on MOCHA articulatory database are presented.},
  categories = {structural,recognition,artic,mocha,edinburgh},
}

@inproceedings{mayoturk-psp05,
  author     = {Mayo, C. and Turk, A.},
  title      = {No Available Theories Currently Explain All Adult-Child Cue Weighting Differences},
  booktitle  = {Proc. ISCA Workshop on Plasticity in Speech Perception},
  year       = {2005},
}

@article{wan-sap05,
  author     = {Wan, V. and Renals, S.},
  title      = {Speaker verification using sequence discriminant support vector machines},
  journal    = {IEEE Trans. on Speech and Audio Processing},
  volume     = {13},
  pages      = {203--210},
  year       = {2005},
  abstract   = {This paper presents a text-independent speaker verification system using support vector machines (SVMs) with score-space kernels. Score-space kernels, generalize Fisher kernels, and are based on an underlying generative model, such as a Gaussian mixture model (GMM). This approach provides direct discrimination between whole sequences, in contrast to the frame-level approaches at the heart of most current systems. The resultant SVMs have a very high dimensionality, since it is related to the number of parameters in the underlying generative model. To ameliorate problems that can arise in the resultant optimization, we introduce a technique called spherical normalization that preconditions the Hessian matrix. We have performed speaker verification experiments using the PolyVar database. The SVM system presented here reduces the relative error rates by 34\% compared to a GMM likelihood ratio system.},
  categories = {verification,kernel,svm,edinburgh,sheffield},
}

@inproceedings{Murray05b,
  author     = {Murray, G. and Renals, S. and Carletta, J. and Moore, J.},
  title      = {Evaluating Automatic Summaries of Meeting Recordings},
  booktitle  = {Proceedings of the 43rd Annual Meeting of the Association for Computational Linguistics, Ann Arbor, MI, USA},
  month      = {June},
  year       = {2005},
  abstract   = {The research below explores schemes for evaluating automatic summaries of business meetings, using the ICSI Meeting Corpus. Both automatic and subjective evaluations were carried out, with a central interest being whether or not the two types of evaluations correlate with each other. The evaluation metrics were used to compare and contrast differing approaches to automatic summarization, the deterioration of summary quality on ASR output versus manual transcripts, and to determine whether manual extracts are rated significantly higher than automatic extracts.},
  categories = {ami,summarization, speech summarization, prosody, latent semantic analysis, summarization evaluation, edinburgh},
}

@inproceedings{murray-interspeech05,
  author     = {Murray, G. and Renals, S. and Carletta, J.},
  title      = {Extractive Summarization of Meeting Recordings},
  booktitle  = {Proc. Interspeech},
  month      = {September},
  year       = {2005},
  abstract   = {Several approaches to automatic speech summarization are discussed below, using the ICSI Meetings corpus. We contrast feature-based approaches using prosodic and lexical features with maximal marginal relevance and latent semantic analysis approaches to summarization. While the latter two techniques are borrowed directly from the field of text summarization, feature-based approaches using prosodic information are able to utilize characteristics unique to speech data. We also investigate how the summarization results might deteriorate when carried out on ASR output as opposed to manual transcripts. All of the summaries are of an extractive variety, and are compared using the software ROUGE.},
  categories = {ami,summarization,prosody, latent semantic analysis,edinburgh},
}

@inproceedings{cuayahuitletal_asru05,
  author     = {Cuay{\'a}huitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  title      = {Human-Computer Dialogue Simulation Using Hidden Markov Models},
  booktitle  = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
  month      = {November},
  year       = {2005},
  abstract   = {This paper presents a probabilistic method to simulate task-oriented human-computer dialogues at the intention level, that may be used to improve or to evaluate the performance of spoken dialogue systems. Our method uses a network of Hidden Markov Models (HMMs) to predict system and user intentions, where a ``language model'' predicts sequences of goals and the component HMMs predict sequences of intentions. We compare standard HMMs, Input HMMs and Input-Output HMMs in an effort to better predict sequences of intentions. In addition, we propose a dialogue similarity measure to evaluate the realism of the simulated dialogues. We performed experiments using the DARPA Communicator corpora and report results with three different metrics: dialogue length, dialogue similarity and precision-recall.},
  categories = {dialogue simulation, hidden markov models},
}

@inproceedings{mayoclarkking-isp05,
  author     = {Mayo, C. and Clark, R. A. J. and King, S.},
  title      = {Multidimensional Scaling of Listener Responses to Synthetic Speech},
  booktitle  = {Proc. Interspeech 2005},
  month      = {September},
  year       = {2005},
}

@phdthesis{shiga05,
  author     = {Shiga, Yoshinori},
  title      = {Precise Estimation of Vocal Tract and Voice Source Characteristics},
  school     = {The Centre for Speech Technology Research, Edinburgh University},
  year       = {2005},
  abstract   = {This thesis addresses the problem of quality degradation in speech produced by parameter-based speech synthesis, within the framework of an articulatory-acoustic forward mapping. I first investigate current problems in speech parameterisation, and point out the fact that conventional parameterisation inaccurately extracts the vocal tract response due to interference from the harmonic structure of voiced speech. To overcome this problem, I introduce a method for estimating filter responses more precisely from periodic signals. The method achieves such estimation in the frequency domain by approximating all the harmonics observed in several frames based on a least squares criterion. It is shown that the proposed method is capable of estimating the response more accurately than widely-used frame-by-frame parameterisation, for simulations using synthetic speech and for an articulatory-acoustic mapping using actual speech. I also deal with the source-filter separation problem and independent control of the voice source characteristic during speech synthesis. I propose a statistical approach to separating out the vocal-tract filter response from the voice source characteristic using a large articulatory database. The approach realises such separation for voiced speech using an iterative approximation procedure under the assumption that the speech production process is a linear system composed of a voice source and a vocal-tract filter, and that each of the components is controlled independently by different sets of factors. Experimental results show that controlling the source characteristic greatly improves the accuracy of the articulatory-acoustic mapping, and that the spectral variation of the source characteristic is evidently influenced by the fundamental frequency or the power of speech. 
The thesis provides more accurate acoustical approximation of the vocal tract response, which will be beneficial in a wide range of speech technologies, and lays the groundwork in speech science for a new type of corpus-based statistical solution to the source-filter separation problem.},
  categories = {mfa, multiframe, forward, mapping, source-filter, artic, mocha, edinburgh},
}

@inproceedings{frankel05:hybrid,
  author     = {Frankel, J. and King, S.},
  title      = {A Hybrid {ANN/DBN} Approach to Articulatory Feature Recognition},
  booktitle  = {Proc. Eurospeech},
  month      = {September},
  year       = {2005},
  abstract   = {Artificial neural networks (ANN) have proven to be well suited to the task of articulatory feature (AF) recognition. Previous studies have taken a cascaded approach where separate ANNs are trained for each feature group, making the assumption that features are statistically independent. We address this by using ANNs to provide virtual evidence to a dynamic Bayesian network (DBN). This gives a hybrid ANN/DBN model and allows modelling of inter-feature dependencies. We demonstrate significant increases in AF recognition accuracy from modelling dependencies between features, and present the results of embedded training experiments in which a set of asynchronous feature changes are learned. Furthermore, we report on the application of a Viterbi training scheme in which we alternate between realigning the AF training labels and retraining the ANNs.},
  categories = {am,artic,asr,dbn,oginumbers,edinburgh},
}

@article{koumpis2005-acmslp,
  author     = {Koumpis, Konstantinos and Renals, Steve},
  title      = {Automatic summarization of voicemail messages using lexical and prosodic features},
  journal    = {ACM Transactions on Speech and Language Processing},
  volume     = {2},
  number     = {1},
  pages      = {1--24},
  year       = {2005},
  categories = {voicemail,summarization,prosody,sheffield,edinburgh},
  abstract   = {This paper presents trainable methods for extracting principal content words from voicemail messages. The short text summaries generated are suitable for mobile messaging applications. The system uses a set of classifiers to identify the summary words, with each word being identified by a vector of lexical and prosodic features. We use an ROC-based algorithm, Parcel, to select input features (and classifiers). We have performed a series of objective and subjective evaluations using unseen data from two different speech recognition systems, as well as human transcriptions of voicemail speech.}
}

@article{Tokuno2005IEICE01,
  author     = {Tokuno, Junko and Inami, Nobuhito and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title      = {Context-dependent Sub-stroke Model for {HMM}-based On-line Handwriting Recognition},
  journal    = {Trans. IEICE D-II},
  volume     = {J88-D2},
  number     = {8},
  month      = {August},
  year       = {2005},
  note       = {(in press), (in Japanese)},
  categories = {online handwritten character recognition},
  abstract   = {A new method is proposed for on-line Kanji handwriting recognition. The method employs sub-stroke HMMs as minimum units to constitute Kanji characters and utilizes the direction of pen motion. The present approach has the following advantages over the conventional methods that employ character HMMs. 1) Much smaller memory requirement for dictionary and models. 2) Fast recognition by employing efficient sub-stroke network search. 3) Capability of recognizing characters not included in the training data if defined as a sequence of sub-strokes in the dictionary. In experiments, we have achieved a correct recognition rate of above 96\% by using JAIST-IIPL database that includes 1,016 educational Kanji characters.}
}

@inproceedings{goubanova_king_isp05,
  author    = {Goubanova, Olga and King, Simon},
  title     = {Predicting Consonant Duration with {B}ayesian Belief Networks},
  booktitle = {Proc. Interspeech 2005},
  year      = {2005},
  abstract  = {Consonant duration is influenced by a number of linguistic factors such as the consonant's identity, within-word position, stress level of the previous and following vowels, phrasal position of the word containing the target consonant, its syllabic position, identity of the previous and following segments. In our work, consonant duration is predicted from a Bayesian belief network (BN) consisting of discrete nodes for the linguistic factors and a single continuous node for the consonant's duration. Interactions between factors are represented as conditional dependency arcs in this graphical model. Given the parameters of the belief network, the duration of each consonant in the test set is then predicted as the value with the maximum probability. We compare the results of the belief network model with those of sums-of-products (SoP) and classification and regression tree (CART) models using the same data. In terms of RMS error, our BN model performs better than both CART and SoP models. In terms of the correlation coefficient, our BN model performs better than SoP model, and no worse than CART model. In addition, the Bayesian model reliably predicts consonant duration in cases of missing or hidden linguistic factors.}
}

@article{koumpis2005-spmag,
  author     = {Koumpis, Konstantinos and Renals, Steve},
  title      = {Content-based access to spoken audio},
  journal    = {IEEE Signal Processing Magazine},
  volume     = {22},
  number     = {5},
  pages      = {61--69},
  year       = {2005},
  categories = {asr,ir,summarization,edinburgh},
  abstract   = {How analysis, retrieval and delivery phases make spoken audio content more accessible}
}

@phdthesis{Gutkin:phd:05,
  author     = {Gutkin, Alexander},
  title      = {{T}owards {F}ormal {S}tructural {R}epresentation of {S}poken {L}anguage: {A}n {E}volving {T}ransformation {S}ystem ({ETS}) {A}pproach},
  school     = {School of Informatics, University of Edinburgh},
  month      = {December},
  year       = {2005},
  note       = {Internal version},
  categories = {structural,representation,recognition,edinburgh,unb,ets}
}

@inproceedings{AMIMLMI05,
  author    = {Hain, T. and Burget, L. and Dines, J. and Garau, G. and Karafiat, M. and Lincoln, M. and McCowan, I. and Moore, D. and Wan, V. and Ordelman, R. and Renals, S.},
  title     = {The Development of the {AMI} System for the Transcription of Speech in Meetings},
  booktitle = {2nd Joint Workshop on Multimodal Interaction and Related Machine Learning Algorithms},
  year      = {2005},
  abstract  = {The automatic processing of speech collected in conference style meetings has attracted considerable interest with several large scale projects devoted to this area. This paper describes the development of a baseline automatic speech transcription system for meetings in the context of the AMI (Augmented Multiparty Interaction) project. We present several techniques important to processing of this data and show the performance in terms of word error rates (WERs). An important aspect of transcription of this data is the necessary flexibility in terms of audio pre-processing. Real world systems have to deal with flexible input, for example by using microphone arrays or randomly placed microphones in a room. Automatic segmentation and microphone array processing techniques are described and the effect on WERs is discussed. The system and its components presented in this paper yield competitive performance and form a baseline for future research in this domain.}
}

@inproceedings{faria-eurospeech05,
  author     = {Faria, A. and Gelbart, D.},
  title      = {Efficient Pitch-based Estimation of {VTLN} Warp Factors},
  booktitle  = {Proc. Eurospeech},
  year       = {2005},
  categories = {vocal tract length normalization,speaker adaptation},
  abstract   = {To reduce inter-speaker variability, vocal tract length normalization (VTLN) is commonly used to transform acoustic features for automatic speech recognition (ASR). The warp factors used in this process are usually derived by maximum likelihood (ML) estimation, involving an exhaustive search over possible values. We describe an alternative approach: exploit the correlation between a speaker's average pitch and vocal tract length, and model the probability distribution of warp factors conditioned on pitch observations. This can be used directly for warp factor estimation, or as a smoothing prior in combination with ML estimates. Pitch-based warp factor estimation for VTLN is effective and requires relatively little memory and computation. Such an approach is well-suited for environments with constrained resources, or where pitch is already being computed for other purposes.}
}

@inproceedings{Gutkin:Gay:ijcai05,
  author     = {Gutkin, Alexander and Gay, David R.},
  title      = {Structural Representation and Matching of Articulatory Speech Structures based on the Evolving Transformation System ({ETS}) Formalism},
  booktitle  = {Proc. Nineteenth International Joint Conference on Artificial Intelligence (IJCAI-05)},
  month      = {August},
  year       = {2005},
  categories = {structural,recognition,ets,artic,mocha,edinburgh,unb}
}

@inproceedings{hofer-eurosp05,
  author     = {Hofer, G. and Richmond, K. and Clark, R.},
  title      = {Informed Blending of Databases for Emotional Speech Synthesis},
  booktitle  = {Proc. Interspeech},
  month      = {September},
  year       = {2005},
  categories = {speech synthesis,emotion,edinburgh},
  abstract   = {The goal of this project was to build a unit selection voice that could portray emotions with varying intensities. A suitable definition of an emotion was developed along with a descriptive framework that supported the work carried out. A single speaker was recorded portraying happy and angry speaking styles. Additionally a neutral database was also recorded. A target cost function was implemented that chose units according to emotion mark-up in the database. The Dictionary of Affect supported the emotional target cost function by providing an emotion rating for words in the target utterance. If a word was particularly `emotional', units from that emotion were favoured. In addition intensity could be varied which resulted in a bias to select a greater number of emotional units. A perceptual evaluation was carried out and subjects were able to recognise reliably emotions with varying amounts of emotional units present in the target utterance.}
}

@article{onnis2005,
  author     = {Onnis, L. and Monaghan, P. and Richmond, K. and Chater, N.},
  title      = {Phonology impacts segmentation in speech processing},
  journal    = {Journal of Memory and Language},
  volume     = {53},
  number     = {2},
  pages      = {225--237},
  year       = {2005},
  key        = {onnis2005},
  categories = {artificial language learning, statistical learning, segmentation, phonology, festival},
  abstract   = {Pe{\~n}a, Bonatti, Nespor and Mehler (2002) investigated an artificial language where the structure of words was determined by nonadjacent dependencies between syllables. They found that segmentation of continuous speech could proceed on the basis of these dependencies. However, Pe{\~n}a et al.'s artificial language contained a confound in terms of phonology, in that the dependent syllables began with plosives and the intervening syllables began with continuants. We consider three hypotheses concerning the role of phonology in speech segmentation in this task: (1) participants may recruit probabilistic phonotactic information from their native language to the artificial language learning task; (2) phonetic properties of the stimuli, such as the gaps that precede unvoiced plosives, can influence segmentation; and (3) grouping by phonological similarity between dependent syllables contributes to learning the dependency. In a series of experiments controlling the phonological and statistical structure of the language, we found that segmentation performance is influenced by the three factors in different degrees. Learning of non-adjacent dependencies did not occur when (3) is eliminated. We suggest that phonological processing provides a fundamental contribution to distributional analysis.}
}

@article{chang05,
author = {Chang, S. and Wester, M. and Greenberg, S.},
title = {An elitist approach to automatic articulatory-acoustic feature classification for phonetic characterization of spoken language},
journal = {Speech Communication},
abstract = {A novel framework for automatic articulatory-acoustic feature extraction has been developed for enhancing the accuracy of place- and manner-of-articulation classification in spoken language. The "elitist" approach provides a principled means of selecting frames for which multi-layer perceptron, neural-network classifiers are highly confident. Using this method it is possible to achieve a frame-level accuracy of 93\% on "elitist" frames for manner classification on a corpus of American English sentences passed through a telephone network (NTIMIT). Place-of-articulation information is extracted for each manner class independently, resulting in an appreciable gain in place-feature classification relative to performance for a manner-independent system. A comparable enhancement in classification performance for the elitist appraoch is evidenced when applied to a Dutch corpus of quasi-spontaneous telephone interactions (VIOS). The elitist framework provides a potential means of automatically annotating a corpus at the phonetic level \emph{without recourse to a word-level transcript} and could thus be of utility for developing traning materials for automatic speech recognition and speech synthesis applications, as well as aid the empirical study of spoken language. \copyright 2005 Elsevier B.V. All rights reserved.},
volume = {47},
year = {2005},