The Centre for Speech Technology Research, The University of Edinburgh

Publications by Alfred Dielmann

adielman.bib

@incollection{alhames-mlmi05,
  author = {M. Al-Hames and A. Dielmann and D. Gatica-Perez and S.
                   Reiter and S. Renals and G. Rigoll and D. Zhang},
  title = {Multimodal Integration for Meeting Group Action
                   Segmentation and Recognition},
  booktitle = {Proc. Multimodal Interaction and Related Machine
                   Learning Algorithms Workshop (MLMI--05)},
  publisher = {Springer},
  editor = {S. Renals and S. Bengio},
  pages = {52--63},
  abstract = {We address the problem of segmentation and recognition
                   of sequences of multimodal human interactions in
                   meetings. These interactions can be seen as a rough
                   structure of a meeting, and can be used either as input
                   for a meeting browser or as a first step towards a
                   higher semantic analysis of the meeting. A common
                   lexicon of multimodal group meeting actions, a shared
                   meeting data set, and a common evaluation procedure
                   enable us to compare the different approaches. We
                   compare three different multimodal feature sets and our
                   modelling infrastructures: a higher semantic feature
                   approach, multi-layer HMMs, a multistream DBN, as well
                   as a multistream mixed-state DBN for disturbed data.},
  categories = {m4,ami,multimodal,dbn,meetings,edinburgh,IDIAP,munich},
  year = 2006
}
@incollection{dielmann-mlmi06,
  author = {A. Dielmann and S. Renals},
  title = {Automatic Dialogue Act Recognition using a Dynamic
                   {Bayesian} Network},
  booktitle = {Proc. Multimodal Interaction and Related Machine
                   Learning Algorithms Workshop (MLMI--06)},
  publisher = {Springer},
  editor = {S. Renals and S. Bengio and J. Fiscus},
  pages = {178--189},
  abstract = {We propose a joint segmentation and classification
                   approach for the dialogue act recognition task on
                   natural multi-party meetings ({ICSI} Meeting Corpus).
                   Five broad DA categories are automatically recognised
                   using a generative Dynamic {Bayesian} Network based
                   infrastructure. Prosodic features and a switching
                   graphical model are used to estimate DA boundaries, in
                   conjunction with a factored language model which is
                   used to relate words and DA categories. This easily
                   generalizable and extensible system promotes a rational
                   approach to the joint DA segmentation and recognition
                   task, and is capable of good recognition performance.},
  categories = {ami,dialogue act,dbn,factored language
                   model,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-mlmi06.pdf},
  year = 2007
}
@inproceedings{dielmann-icassp04,
  author = {A. Dielmann and S. Renals},
  title = {Dynamic {Bayesian} Networks for Meeting Structuring},
  booktitle = {Proc. IEEE ICASSP},
  pages = {},
  abstract = {This paper is about the automatic structuring of
                   multiparty meetings using audio information. We have
                   used a corpus of 53 meetings, recorded using a
                   microphone array and lapel microphones for each
                   participant. The task was to segment meetings into a
                   sequence of meeting actions, or phases. We have adopted
                   a statistical approach using dynamic Bayesian networks
                   (DBNs). Two DBN architectures were investigated: a
                   two-level hidden Markov model (HMM) in which the
                   acoustic observations were concatenated; and a
                   multistream DBN in which two separate observation
                   sequences were modelled. Additionally we have also
                   explored the use of counter variables to constrain the
                   number of action transitions. Experimental results
                   indicate that the DBN architectures are an improvement
                   over a simple baseline HMM, with the multistream DBN
                   with counter constraints producing an action error rate
                   of 6\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/icassp04-m4.ps.gz},
  year = 2004
}
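
The counter idea in the abstract above lends itself to a compact illustration. The following is a minimal sketch, not the paper's DBN implementation: a plain Viterbi decoder over S meeting actions in which a hypothetical counter variable caps the number of action-to-action transitions, mirroring the "counter variables to constrain the number of action transitions" described above. All names and the max_switches parameter are illustrative assumptions.

import numpy as np

def viterbi_with_counter(log_obs, log_trans, log_init, max_switches):
    """Viterbi over meeting actions with a capped number of action switches.

    log_obs:   (T, S) per-frame log-likelihoods for S meeting actions.
    log_trans: (S, S) log transition matrix.
    log_init:  (S,)   log initial-state probabilities.
    max_switches: maximum number of action changes allowed in the path.
    """
    T, S = log_obs.shape
    C = max_switches + 1  # counter values 0..max_switches
    # delta[s, c]: best log score of a path ending in action s with c switches
    delta = np.full((S, C), -np.inf)
    delta[:, 0] = log_init + log_obs[0]
    back = np.zeros((T, S, C, 2), dtype=int)  # backpointers (prev s, prev c)
    for t in range(1, T):
        new = np.full((S, C), -np.inf)
        for s in range(S):
            for c in range(C):
                # staying in the same action leaves the counter unchanged
                best, arg = delta[s, c] + log_trans[s, s], (s, c)
                if c > 0:  # switching actions increments the counter
                    for p in range(S):
                        if p != s:
                            cand = delta[p, c - 1] + log_trans[p, s]
                            if cand > best:
                                best, arg = cand, (p, c - 1)
                new[s, c] = best + log_obs[t, s]
                back[t, s, c] = arg
        delta = new
    # backtrace from the best terminal (action, counter) cell
    s, c = np.unravel_index(np.argmax(delta), delta.shape)
    path = [int(s)]
    for t in range(T - 1, 0, -1):
        s, c = back[t, s, c]
        path.append(int(s))
    return path[::-1]

With max_switches set high the constraint vanishes and this reduces to ordinary Viterbi decoding.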
@inproceedings{dielmann-icassp07,
  author = {A. Dielmann and S. Renals},
  title = {{DBN} based joint Dialogue Act recognition of
                   multiparty meetings},
  booktitle = {Proc. IEEE ICASSP},
  volume = {4},
  pages = {133--136},
  abstract = {Joint Dialogue Act segmentation and classification of
                   the new {AMI} meeting corpus has been performed through
                   an integrated framework based on a switching dynamic
                   {Bayesian} network and a set of continuous features and
                   language models. The recognition process is based on a
                   dictionary of 15 {DA} classes tailored for group
                   decision-making. Experimental results show that a novel
                   interpolated Factored Language Model results in a low
                   error rate on the automatic segmentation task, and thus
                   good recognition results can be achieved on {AMI}
                   multiparty conversational speech.},
  categories = {ami,dialogue act,dbn,factored language
                   model,meetings,edinburgh},
  month = {April},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/dielmann-icassp07.pdf},
  year = 2007
}
@incollection{dielmann-mlmi04,
  author = {A. Dielmann and S. Renals},
  title = {Multistream dynamic {Bayesian} network for meeting
                   segmentation},
  booktitle = {Proc. Multimodal Interaction and Related Machine
                   Learning Algorithms Workshop (MLMI--04)},
  publisher = {Springer},
  editor = {S. Bengio and H. Bourlard},
  pages = {76--86},
  abstract = {This paper investigates the automatic analysis and
                   segmentation of meetings. A meeting is analysed in
                   terms of individual behaviours and group interactions,
                    in order to decompose each meeting into a sequence of
                   relevant phases, named meeting actions. Three feature
                   families are extracted from multimodal recordings:
                   prosody from individual lapel microphone signals,
                   speaker activity from microphone array data and lexical
                   features from textual transcripts. A statistical
                   approach is then used to relate low-level features with
                   a set of abstract categories. In order to provide a
                   flexible and powerful framework, we have employed a
                   dynamic Bayesian network based model, characterized by
                   multiple stream processing and flexible state duration
                   modelling. Experimental results demonstrate the
                   strength of this system, providing a meeting action
                   error rate of 9\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/dielmann-mlmi04.ps.gz},
  year = 2005
}
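
The multistream modelling recurring in the entries above reduces, in its simplest form, to an assumption that the feature streams (prosody, speaker activity, lexical features) are conditionally independent given the current meeting action, so their per-frame log-likelihoods combine additively. A minimal sketch under that assumption follows; it is not the papers' DBN, and the Gaussian stream models and stream weights are illustrative stand-ins.

import numpy as np
from scipy.stats import multivariate_normal

def combined_log_obs(frames, stream_models, weights):
    """Combine per-stream log-likelihoods into a (T, S) matrix.

    frames:        dict stream name -> (T, D) feature matrix.
    stream_models: dict stream name -> list of S (mean, cov) Gaussians,
                   one per meeting action.
    weights:       dict stream name -> stream exponent (hypothetical).
    """
    streams = list(frames)
    T = frames[streams[0]].shape[0]
    S = len(stream_models[streams[0]])
    log_obs = np.zeros((T, S))
    for name in streams:
        X = frames[name]
        for s, (mean, cov) in enumerate(stream_models[name]):
            # weighted sum of log-likelihoods == product of stream
            # likelihoods raised to their stream weights
            log_obs[:, s] += weights[name] * multivariate_normal.logpdf(X, mean, cov)
    return log_obs

The resulting matrix can drive any sequence decoder, for instance the counter-constrained Viterbi sketched after the ICASSP 2004 entry above.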
@inproceedings{dielmann-mmsp04,
  author = {A. Dielmann and S. Renals},
  title = {Multi-stream segmentation of meetings},
  booktitle = {Proc. IEEE Workshop on Multimedia Signal Processing},
  pages = {},
  abstract = {This paper investigates the automatic segmentation of
                   meetings into a sequence of group actions or phases.
                   Our work is based on a corpus of multiparty meetings
                   collected in a meeting room instrumented with video
                   cameras, lapel microphones and a microphone array. We
                   have extracted a set of feature streams, in this case
                   extracted from the audio data, based on speaker turns,
                   prosody and a transcript of what was spoken. We have
                   related these signals to the higher level semantic
                   categories via a multistream statistical model based on
                   dynamic Bayesian networks (DBNs). We report on a set of
                   experiments in which different DBN architectures are
                   compared, together with the different feature streams.
                   The resultant system has an action error rate of 9\%.},
  categories = {m4,multimodal,dbn,meetings,edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.pdf},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/dielmann-mmsp04.ps.gz},
  year = 2004
}
@article{dielmann2007-tmm,
  author = {Dielmann, Alfred and Renals, Steve},
  title = {Automatic meeting segmentation using dynamic
                   {Bayesian} networks},
  journal = {IEEE Transactions on Multimedia},
  volume = {9},
  number = {1},
  pages = {25--36},
  abstract = {Multiparty meetings are a ubiquitous feature of
                   organizations, and there are considerable economic
                   benefits that would arise from their automatic analysis
                   and structuring. In this paper, we are concerned with
                   the segmentation and structuring of meetings (recorded
                   using multiple cameras and microphones) into sequences
                   of group meeting actions such as monologue, discussion
                   and presentation. We outline four families of
                   multimodal features based on speaker turns, lexical
                   transcription, prosody, and visual motion that are
                   extracted from the raw audio and video recordings. We
                   relate these low-level features to more complex group
                   behaviors using a multistream modelling framework based
                   on multistream dynamic Bayesian networks (DBNs). This
                    results in an effective approach to the segmentation
                    problem, with an action error rate of 12.2\%,
                   compared with 43\% using an approach based on hidden
                   Markov models. Moreover, the multistream DBN developed
                   here leaves scope for many further improvements and
                   extensions.},
  doi = {10.1109/TMM.2006.886337},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2007/dielmann2007-tmm.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4032598&arnumber=4032608&count=23&index=3},
  year = 2007
}
@article{dielmann2008,
  author = {Dielmann, Alfred and Renals, Steve},
  title = {Recognition of Dialogue Acts in Multiparty Meetings
                   using a Switching {DBN}},
  journal = {IEEE Transactions on Audio, Speech, and Language
                   Processing},
  volume = {16},
  number = {7},
  pages = {1303--1314},
  abstract = {This paper is concerned with the automatic recognition
                   of dialogue acts (DAs) in multiparty conversational
                   speech. We present a joint generative model for DA
                   recognition in which segmentation and classification of
                   DAs are carried out in parallel. Our approach to DA
                   recognition is based on a switching dynamic Bayesian
                   network (DBN) architecture. This generative approach
                   models a set of features, related to lexical content
                   and prosody, and incorporates a weighted interpolated
                   factored language model. The switching DBN coordinates
                   the recognition process by integrating the component
                   models. The factored language model, which is estimated
                   from multiple conversational data corpora, is used in
                   conjunction with additional task-specific language
                   models. In conjunction with this joint generative
                   model, we have also investigated the use of a
                   discriminative approach, based on conditional random
                   fields, to perform a reclassification of the segmented
                   DAs. We have carried out experiments on the AMI corpus
                   of multimodal meeting recordings, using both manually
                   transcribed speech, and the output of an automatic
                   speech recognizer, and using different configurations
                   of the generative model. Our results indicate that the
                   system performs well both on reference and fully
                   automatic transcriptions. A further significant
                   improvement in recognition accuracy is obtained by the
                   application of the discriminative reranking approach
                   based on conditional random fields.},
  doi = {10.1109/TASL.2008.922463},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/dielmann2008.pdf},
  url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=4599391&arnumber=4497831&count=18&index=9},
  year = 2008
}
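
The "weighted interpolated factored language model" named in this abstract conditions word prediction on the dialogue act as well as on the word history. As a stand-in illustration (not the paper's FLM, which uses factored back-off estimated from multiple corpora), the sketch below linearly interpolates three DA-conditioned word models; the component models and the interpolation weights are assumptions.

from collections import defaultdict

class InterpolatedDALanguageModel:
    """Toy interpolation of P(word | previous word, dialogue act)."""

    def __init__(self, lambdas=(0.5, 0.3, 0.2)):  # hypothetical weights
        self.lambdas = lambdas
        self.bigram_da = defaultdict(lambda: defaultdict(int))   # (prev, da) -> counts
        self.unigram_da = defaultdict(lambda: defaultdict(int))  # da -> counts
        self.unigram = defaultdict(int)                          # global counts
        self.total = 0

    def train(self, sentences):
        """sentences: iterable of (list of words, DA label) pairs."""
        for words, da in sentences:
            prev = '<s>'
            for w in words:
                self.bigram_da[(prev, da)][w] += 1
                self.unigram_da[da][w] += 1
                self.unigram[w] += 1
                self.total += 1
                prev = w

    def prob(self, word, prev, da):
        """Linearly interpolate three progressively less specific models."""
        l1, l2, l3 = self.lambdas
        big = self.bigram_da.get((prev, da), {})
        uni = self.unigram_da.get(da, {})
        p1 = big.get(word, 0) / sum(big.values()) if big else 0.0
        p2 = uni.get(word, 0) / sum(uni.values()) if uni else 0.0
        p3 = self.unigram.get(word, 0) / self.total if self.total else 0.0
        return l1 * p1 + l2 * p2 + l3 * p3

For example, after lm.train([(["yeah", "okay"], "backchannel")]), the call lm.prob("okay", "yeah", "backchannel") blends the (previous word, DA) bigram, the DA unigram, and the global unigram estimates.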