The Centre for Speech Technology Research, The University of Edinburgh

Publications by Catherine Lai

clai.bib

@inproceedings{lai2013summarize,
  author = {Lai, Catherine and Carletta, Jean and Renals, Steve},
  title = {Detecting Summarization Hot Spots in Meetings Using Group Level Involvement and Turn-Taking Features},
  booktitle = {Proc. Interspeech 2013},
  address = {Lyon, France},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/laic2013summarization.pdf},
  abstract = {In this paper we investigate how participant involvement and turn-taking features relate to extractive summarization of meeting dialogues. In particular, we examine whether automatically derived measures of group level involvement, like participation equality and turn-taking freedom, can help detect where summarization relevant meeting segments will be. Results show that classification using turn-taking features performed better than the majority class baseline for data from both AMI and ICSI meeting corpora in identifying whether meeting segments contain extractive summary dialogue acts. The feature based approach also provided better recall than using manual ICSI involvement hot spot annotations. Turn-taking features were additionally found to be predictive of the amount of extractive summary content in a segment. In general, we find that summary content decreases with higher participation equality and overlap, while it increases with the number of very short utterances. Differences in results between the AMI and ICSI data sets suggest how group participatory structure can be used to understand what makes meetings easy or difficult to summarize.},
  categories = {summarization, turn-taking, involvement, social signals}
}
@inproceedings{lai2013affect,
  author = {Lai, Catherine and Carletta, Jean and Renals, Steve},
  title = {Modelling Participant Affect in Meetings with Turn-Taking Features},
  booktitle = {Proceedings of WASSS 2013},
  address = {Grenoble, France},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/laic2013affect.pdf},
  abstract = {This paper explores the relationship between turn-taking and meeting affect. To investigate this, we model post-meeting ratings of satisfaction, cohesion and leadership from participants of AMI corpus meetings using group and individual turn-taking features. The results indicate that participants gave higher satisfaction and cohesiveness ratings to meetings with greater group turn-taking freedom and individual very short utterance rates, while lower ratings were associated with more silence and speaker overlap. Besides broad applicability to satisfaction ratings, turn-taking freedom was found to be a better predictor than equality of speaking time when considering whether participants felt that everyone they had a chance to contribute. If we include dialogue act information, we see that substantive feedback type turns like assessments are more predictive of meeting affect than information giving acts or backchannels. This work highlights the importance of feedback turns and modelling group level activity in multiparty dialogue for understanding the social aspects of speech.},
  categories = {turn-taking, meetings, affect, involvement, social signals}
}
@inproceedings{laiEtAl2012rhythm,
  author = {Lai, Catherine and Evanini, Keelan and Zechner, Klaus},
  title = {Applying Rhythm Metrics to Non-native Spontaneous Speech},
  booktitle = {Proceedings of SLaTE 2013},
  address = {Grenoble, France},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/laiEtAl2012rhythm.pdf},
  abstract = {This study investigates a variety of rhythm metrics on two corpora of non-native spontaneous speech and compares the nonnative distributions to values from a corpus of native speech. Several of the metrics are shown to differentiate well between native and non-native speakers and to also have moderate correlations with English proficiency scores that were assigned to the non-native speech. The metric that had the highest correlation with English proficiency scores (apart from speaking rate) was rPVIsyl (the raw Pairwise Variability Index for syllables), with r = 0.43.},
  categories = {L2 speech, pronunciation scoring, rhythm}
}
@incollection{moore2014cicling,
  author = {Moore, Johanna D. and Tian, Leimin and Lai, Catherine},
  editor = {Gelbukh, Alexander},
  title = {Word-Level Emotion Recognition Using High-Level Features},
  booktitle = {Computational Linguistics and Intelligent Text Processing},
  series = {Lecture Notes in Computer Science},
  volume = {8404},
  pages = {17--31},
  publisher = {Springer Berlin Heidelberg},
  year = {2014},
  doi = {10.1007/978-3-642-54903-8_2},
  isbn = {978-3-642-54902-1},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/moore2014cicling.pdf},
  abstract = {In this paper, we investigate the use of high-level features for recognizing human emotions at the word-level in natural conversations with virtual agents. Experiments were carried out on the 2012 Audio/Visual Emotion Challenge (AVEC2012) database, where emotions are defined as vectors in the Arousal-Expectancy-Power-Valence emotional space. Our model using 6 novel disfluency features yields significant improvements compared to those using large number of low-level spectral and prosodic features, and the overall performance difference between it and the best model of the AVEC2012 Word-Level Sub-Challenge is not significant. Our visual model using the Active Shape Model visual features also yields significant improvements compared to models using the low-level Local Binary Patterns visual features. We built a bimodal model by combining our disfluency and visual feature sets and applying Correlation-based Feature-subset Selection. Considering overall performance on all emotion dimensions, our bimodal model outperforms the second best model of the challenge, and comes close to the best model. It also gives the best result when predicting Expectancy values.},
  categories = {Emotion recognition, disfluencies, multimodal language processing}
}
@inproceedings{lai2014ivie,
  author = {Lai, Catherine},
  title = {Interpreting Final Rises: Task and Role Factors},
  booktitle = {Proceedings of Speech Prosody 7},
  address = {Dublin, Ireland},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/lai2014ivie.pdf},
  abstract = {This paper examines the distribution of utterance final pitch rises in dialogues with different task structures. More specifically, we examine map-task and topical conversation dialogues of Southern Standard British English speakers in the IViE corpus. Overall, we find that the map-task dialogues contain more rising features, where these mainly arise from instructions and affirmatives. While rise features were somewhat predictive of turn-changes, these effects were swamped by task and role effects. Final rises were not predictive of affirmative responses. These findings indicate that while rises can be interpreted as indicating some sort of contingency, it is with respect to the higher level discourse structure rather than the specific utterance bearing the rise. We explore the relationship between rises and the need for co-ordination in dialogue, and hypothesize that the more speakers have to co-ordinate in a dialogue, the more rising features we will see on non-question utterances. In general, these sorts of contextual conditions need to be taken into account when we collect and analyze intonational data, and when we link them to speaker states such as uncertainty or submissiveness.},
  categories = {Prosody, semantics, turn-taking, task-oriented dialogue, conversational dialogue}
}
@inproceedings{lai2014,
  author = {Lai, Catherine and Renals, Steve},
  title = {Incorporating Lexical and Prosodic Information at Different Levels for Meeting Summarization},
  booktitle = {Proc. Interspeech 2014},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/lai2014incorporating.pdf},
  abstract = {This paper investigates how prosodic features can be used to augment lexical features for meeting summarization. Automatic detection of summary-worthy content using non-lexical features, like prosody, has generally focused on features calculated over dialogue acts. However, a salient role of prosody is to distinguish important words within utterances. To examine whether including more fine grained prosodic information can help extractive summarization, we perform experiments incorporating lexical and prosodic features at different levels. For ICSI and AMI meeting corpora, we find that combining prosodic and lexical features at a lower level has better AUROC performance than adding in prosodic features derived over dialogue acts. ROUGE F-scores also show the same pattern for the ICSI data. However, the differences are less clear for the AMI data where the range of scores is much more compressed. In order to understand the relationship between the generated summaries and differences in standard measures, we look at the distribution of extracted content over meeting as well as summary redundancy. We find that summaries based on dialogue act level prosody better reflect the amount of human annotated summary content in meeting segments, while summaries derived from prosodically augmented lexical features exhibit less redundancy.}
}
@inproceedings{bell15_news_summarisation,
  author = {Bell, Peter and Lai, Catherine and Llewellyn, Clare and Birch, Alexandra and Sinclair, Mark},
  title = {A system for automatic broadcast news summarisation, geolocation and translation},
  booktitle = {Proc. Interspeech (demo session)},
  year = {2015},
  month = sep,
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bell15_news_summarisation.pdf},
  abstract = {An increasing amount of news content is produced in audio-video form every day. To effectively analyse and monitoring this multilingual data stream, we require methods to extract and present audio content in accessible ways. In this paper, we describe an end-to-end system for processing and browsing audio news data. This fully automated system brings together our recent research on audio scene analysis, speech recognition, summarisation, named entity detection, geolocation, and machine translation. The graphical interface allows users to visualise the distribution of news content by entity names and story location. Browsing of news events is facilitated through extractive summaries and the ability to view transcripts in multiple languages.}
}
@inproceedings{cervone15_reported_speech_prosody,
  author = {Cervone, Alessandra and Lai, Catherine and Pareti, Silvia and Bell, Peter},
  title = {Towards automatic detection of reported speech in dialogue using prosodic cues},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = sep,
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/cervone15_reported_speech_prosody.pdf},
  abstract = {The phenomenon of reported speech -- whereby we quote the words, thoughts and opinions of others, or recount past dialogue -- is widespread in conversational speech. Detecting such quotations automatically has numerous applications: for example, in enhancing automatic transcription or spoken language understanding applications. However, the task is challenging, not least because lexical cues of quotations are frequently ambiguous or not present in spoken language. The aim of this paper is to identify potential prosodic cues of reported speech which could be used, along with the lexical ones, to automatically detect quotations and ascribe them to their rightful source, that is reconstructing their Attribution Relations. In order to do so we analyze SARC, a small corpus of telephone conversations that we have annotated with Attribution Relations. The results of the statistical analysis performed on the data show how variations in pitch, intensity, and timing features can be exploited as cues of quotations. Furthermore, we build a SVM classifier which integrates lexical and prosodic cues to automatically detect quotations in speech that performs significantly better than chance.}
}
@inproceedings{tian_recognizing_2015,
  author = {Tian, Leimin and Lai, Catherine and Moore, Johanna D.},
  title = {Recognizing Emotions in Dialogues with Disfluencies and Non-verbal Vocalisations},
  booktitle = {Proceedings of the 4th {Interdisciplinary} {Workshop} on {Laughter} and {Other} {Non}-verbal {Vocalisations} in {Speech}},
  pages = {15},
  volume = {14},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/tian_recognizing_emotions_in_dialogues_with_disfluencies_and_non_verbal_vocalisations.pdf},
  abstract = {We investigate the usefulness of DISfluencies and Non-verbal Vocalisations (DIS-NV) for recognizing human emotions in dialogues. The proposed features measure filled pauses, fillers, stutters, laughter, and breath in utterances. The predictiveness of DIS-NV features is compared with lexical features and state-of-the-art low-level acoustic features. Our experimental results show that using DIS-NV features alone is not as predictive as using lexical or acoustic features. However, adding them to lexical or acoustic feature set yields improvement compared to using lexical or acoustic features alone. This indicates that disfluencies and non-verbal vocalisations provide useful information overlooked by the other two types of features for emotion recognition},
  categories = {emotion recognition, disfluency, LSTM, dialogue}
}
@inproceedings{tian_emotion_2015,
  author = {Tian, Leimin and Moore, Johanna D. and Lai, Catherine},
  title = {Emotion {Recognition} in {Spontaneous} and {Acted} {Dialogues}},
  booktitle = {Proceedings of {ACII} 2015},
  year = {2015},
  address = {Xi'an, China},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/tian2015emotion.pdf},
  abstract = {In this work, we compare emotion recognition on two types of speech: spontaneous and acted dialogues. Experiments were conducted on the AVEC 2012 database of spontaneous dialogues and the IEMOCAP database of acted dialogues. We studied the performance of two types of acoustic features for emotion recognition: knowledge-inspired disfluency and non-verbal vocalisation (DIS-NV) features, and statistical Low-Level Descriptor (LLD) based features. Both Support Vector Machines (SVM) and Long Short-Term Memory Recurrent Neural Networks (LSTM-RNN) were built using each feature set on each emotional database. Our work aims to identify aspects of the data that constrain the effectiveness of models and features. Our results show that the performance of different types of features and models is influenced by the type of dialogue and the amount of training data. Because DIS-NVs are less frequent in acted dialogues than in spontaneous dialogues, the DIS-NV features perform better than the LLD features when recognizing emotions in spontaneous dialogues, but not in acted dialogues. The LSTM-RNN model gives better performance than the SVM model when there is enough training data, but the complex structure of a LSTM-RNN model may limit its performance when there is less training data available, and may also risk over-fitting. Additionally, we find that long distance contexts may be more useful when performing emotion recognition at the word level than at the utterance level.},
  categories = {emotion recognition, disfluency, laughter, speech processing, HCI, dialogue}
}
@inproceedings{farrus_paragraph-based_2016,
  author = {Farr{\'u}s, Mireia and Lai, Catherine and Moore, Johanna D.},
  doi = {10.21437/SpeechProsody.2016-235},
  title = {Paragraph-based prosodic cues for speech synthesis applications},
  booktitle = {Proceedings of Speech Prosody 2016},
  address = {Boston, MA, USA},
  pages = {1143--1147},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/farrus2016para.pdf},
  abstract = {Speech synthesis has improved in both expressiveness and voice quality in recent years. However, obtaining full expressiveness when dealing with large multi-sentential synthesized discourse is still a challenge, since speech synthesizers do not take into account the prosodic differences that have been observed in discourse units such as paragraphs. The current study validates and extends previous work by analyzing the prosody of paragraph units in a large and diverse corpus of TED Talks using automatically extracted F0, intensity and timing features. In addition, a series of classification experiments was performed in order to identify which features are consistently used to distinguish paragraph breaks. The results show significant differences in prosody related to paragraph position. Moreover, the classification experiments show that boundary features such as pause duration and differences in F0 and intensity levels are the most consistent cues in marking paragraph boundaries. This suggests that these features should be taken into account when generating spoken discourse in order to improve naturalness and expressiveness.},
  categories = {discourse unit, prosodic cue, paragraph boundary, speech synthesis}
}
@inproceedings{lai_automatic_2016,
  author = {Lai, Catherine and Farr{\'u}s, Mireia and Moore, Johanna D.},
  title = {Automatic {Paragraph} {Segmentation} with {Lexical} and {Prosodic} {Features}},
  booktitle = {Proceedings of {Interspeech} 2016},
  year = {2016},
  address = {San Francisco, CA, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/laic2016para.pdf},
  abstract = {As long-form spoken documents become more ubiquitous in everyday life, so does the need for automatic discourse segmentation in spoken language processing tasks. Although previous work has focused on broad topic segmentation, detection of finer-grained discourse units, such as paragraphs, is highly desirable for presenting and analyzing spoken content. To better understand how different aspects of speech cue these subtle discourse transitions, we investigate automatic paragraph segmentation of TED talks. We build lexical and prosodic paragraph segmenters using Support Vector Machines, AdaBoost, and Long Short Term Memory (LSTM) recurrent neural networks. In general, we find that induced cue words and supra-sentential prosodic features outperform features based on topical coherence, syntactic form and complexity. However, our best performance is achieved by combining a wide range of individually weak lexical and prosodic features, with the sequence modelling LSTM generally outperforming the other classifiers by a large margin. Moreover, we find that models that allow lower level interactions between different feature types produce better results than treating lexical and prosodic contributions as separate, independent information sources.},
  categories = {prosody, discourse, segmentation, paragraph, coherence, spoken language processing}
}