Publications by Atef Ben Youssef
abenyou.bib
@inproceedings{benyoussef:IS2013,
author = {Ben Youssef, Atef and Shimodaira, Hiroshi and Braude, David A.},
title = {Articulatory features for speech-driven head motion synthesis},
booktitle = {Proc. Interspeech},
year = {2013},
abstract = {This study investigates the use of articulatory features for speech-driven head motion synthesis as opposed to prosody features such as F0 and energy which have been mainly used in the literature. In the proposed approach, multi-stream HMMs are trained jointly on the synchronous streams of speech and head motion data. Articulatory features can be regarded as an intermediate parametrisation of speech that is expected to have a close link with head movement. Measured head and articulatory movements acquired by EMA were synchronously recorded with speech. Measured articulatory data were compared to those predicted from speech using an HMM-based inversion mapping system trained in a semi-supervised fashion. Canonical correlation analysis (CCA) on a data set of free speech of 12 people shows that the articulatory features are more correlated with head rotation than prosodic and/or cepstral speech features. It is also shown that the head motion synthesised using articulatory features gives higher correlations with the original head motion than when only prosodic features are used.},
month = {August},
address = {Lyon, France},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/BenYoussef-et-al_IS13.pdf},
pages = {2758--2762}
}
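The benyoussef:IS2013 entry above reports canonical correlation analysis (CCA) between articulatory features and head rotation. Below is a minimal sketch of that kind of analysis, assuming two synthetic matrices stand in for the real EMA-derived articulatory features and the head rotation angles; scikit-learn's CCA is used purely for illustration and is not claimed to be the paper's implementation.

import numpy as np
from sklearn.cross_decomposition import CCA

rng = np.random.default_rng(0)
art = rng.standard_normal((1000, 12))    # placeholder articulatory features (e.g. EMA coil positions)
head = rng.standard_normal((1000, 3))    # placeholder head rotation angles (pitch, yaw, roll)

cca = CCA(n_components=3)
art_c, head_c = cca.fit_transform(art, head)

# Canonical correlations: Pearson correlation of each pair of projected variates.
canon_corr = [np.corrcoef(art_c[:, i], head_c[:, i])[0, 1] for i in range(3)]
print(canon_corr)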
@inproceedings{braude2013template,
author = {Braude, David A. and Shimodaira, Hiroshi and Ben Youssef, Atef},
title = {Template-Warping Based Speech Driven Head Motion Synthesis},
booktitle = {Proc. Interspeech},
year = {2013},
abstract = {We propose a method for synthesising head motion from speech using a combination of an Input-Output Markov model (IOMM) and Gaussian mixture models trained in a supervised manner. A key difference of this approach compared to others is to model the head motion in each angle as a series of templates of motion rather than trying to recover a frame-wise function. The templates were chosen to reflect natural patterns in the head motion, and states for the IOMM were chosen based on statistics of the templates. This reduces the search space for the trajectories and prevents impossible motions such as discontinuities. For synthesis our system warps the templates to account for the acoustic features and the other angles' warping parameters. We show our system is capable of recovering the statistics of the motion that were chosen for the states. Our system was then compared to a baseline that used a frame-wise mapping based on previously published work. A subjective preference test that includes multiple speakers showed participants have a preference for the segment-based approach. Both of these systems were trained on storytelling free speech.},
month = {August},
address = {Lyon, France},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Braude-etal_IS13.pdf},
pages = {2763--2767}
}
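The template-warping idea above lends itself to a toy illustration: a stored motion template for one head-rotation angle is stretched in time and scaled in amplitude. In the paper the warping parameters are predicted from acoustic features and the other angles' parameters; here they are hard-coded placeholders, so this is only a sketch of the warping step, not of the IOMM/GMM system.

import numpy as np

def warp_template(template, target_len, amp_scale):
    """Linearly resample a 1-D motion template to target_len frames and rescale its amplitude."""
    src = np.linspace(0.0, 1.0, num=len(template))
    dst = np.linspace(0.0, 1.0, num=target_len)
    return amp_scale * np.interp(dst, src, template)

template = np.sin(np.linspace(0.0, np.pi, 20))   # placeholder nod-like template
warped = warp_template(template, target_len=35, amp_scale=0.6)
print(warped.shape)   # (35,)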
@inproceedings{benyoussef:iva2013,
author = {Ben Youssef, Atef and Shimodaira, Hiroshi and Braude, David A.},
title = {Head Motion Analysis and Synthesis over Different Tasks},
booktitle = {Proc. Intelligent Virtual Agents},
abstract = {It is known that subjects vary in their head movements. This paper presents an analysis of this variation over different tasks and speakers and of its impact on head motion synthesis. Measured head and articulatory movements acquired by an ElectroMagnetic Articulograph (EMA), synchronously recorded with audio, were used. A data set of speech from 12 people recorded on different tasks confirms that head motion varies over tasks and speakers. Experimental results confirmed that the proposed models were capable of learning and synthesising task-dependent head motions from speech. Subjective evaluation of synthesised head motion using task models shows that models trained on the matched task are better than those trained on a mismatched one, and that free speech data provide models whose predicted motion is preferred by participants over models trained on read speech data.},
month = {September},
year = {2013},
organization = {Springer},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/BenYoussef-et-al_IVA13.pdf},
pages = {285--294}
}
@inproceedings{braude:iva2013,
author = {Braude, David A. and Shimodaira, Hiroshi and Ben Youssef, Atef},
title = {The {University of Edinburgh} Head-Motion and Audio Storytelling ({U}o{E}-{H}A{S}) Dataset},
booktitle = {Proc. Intelligent Virtual Agents},
year = {2013},
abstract = {In this paper we announce the release of a large dataset of storytelling monologue with motion capture for the head and body. Initial tests on the dataset indicate that head motion is more dependent on the speaker than the style of speech.},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Braude-etal_IVA2013.pdf},
organization = {Springer},
pages = {466--467}
}
@inproceedings{hueber:IS2012,
author = {Hueber, Thomas and Ben Youssef, Atef and Bailly, Gérard and Badin, Pierre and Elisei, Frédéric},
title = {Cross-speaker Acoustic-to-Articulatory Inversion using Phone-based Trajectory {HMM} for Pronunciation Training},
booktitle = {Proc. Interspeech},
year = {2012},
address = {Portland, Oregon, USA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Hueber_etal_IS2012.pdf},
abstract = {The article presents a statistical mapping approach for cross-speaker acoustic-to-articulatory inversion. The goal is to estimate the most likely articulatory trajectories for a reference speaker from the speech audio signal of another speaker. This approach is developed in the framework of our system of visual articulatory feedback developed for computer-assisted pronunciation training applications (CAPT). The proposed technique is based on the joint modeling of articulatory and acoustic features, for each phonetic class, using full-covariance trajectory HMM. The acoustic-to-articulatory inversion is achieved in two steps: 1) finding the most likely HMM state sequence from the acoustic observations; 2) inferring the articulatory trajectories from both the decoded state sequence and the acoustic observations. The problem of speaker adaptation is addressed using a voice conversion approach, based on trajectory GMM.}
}
@incollection{bailly2012sensorimotor,
author = {Bailly, Gérard and Badin, Pierre and Revéret, Lionel and Ben Youssef, Atef},
publisher = {Cambridge University Press},
doi = {10.1017/CBO9780511843891.016},
title = {Sensorimotor characteristics of speech production},
booktitle = {Audiovisual Speech Processing},
bookblurb = {When we speak, we configure the vocal tract which shapes the visible motions of the face and the patterning of the audible speech acoustics. Similarly, we use these visible and audible behaviors to perceive speech. This book showcases a broad range of research investigating how these two types of signals are used in spoken communication, how they interact, and how they can be used to enhance the realistic synthesis and recognition of audible and visible speech. The volume begins by addressing two important questions about human audio-visual performance: how auditory and visual signals combine to access the mental lexicon and where in the brain this and related processes take place. It then turns to the production and perception of multimodal speech and how structures are coordinated within and across the two modalities. Finally, the book presents overviews and recent developments in machine-based speech recognition and synthesis of AV speech.},
year = {2012},
pages = {368--396}
}
@phdthesis{benyoussef:phd2011,
author = {Ben Youssef, Atef},
school = {Grenoble University},
title = {{Control of talking heads by acoustic-to-articulatory inversion for language learning and rehabilitation}},
collaboration = {ANR ARTIS project},
month = {October},
affiliation = {GIPSA-lab},
year = {2011},
keywords = {visual articulatory feedback; acoustic-to-articulatory speech inversion mapping; ElectroMagnetic Articulography (EMA); hidden Markov models (HMMs); Gaussian mixture models (GMMs); speaker adaptation; face-to-tongue mapping},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Ben-Youssef_Atef_PhD-2011.pdf},
abstract = {{This thesis presents a visual articulatory feedback system in which the visible and non-visible articulators of a talking head are controlled by inversion from a speaker's voice. Our approach to this inversion problem is based on statistical models built on acoustic and articulatory data recorded from a French speaker by means of an electromagnetic articulograph. A first system combines acoustic speech recognition and articulatory speech synthesis techniques based on hidden Markov models (HMMs). A second system uses Gaussian mixture models (GMMs) to estimate the articulatory trajectories directly from the speech sound. In order to generalise the single-speaker system to a multi-speaker system, we have implemented a speaker adaptation method based on maximum likelihood linear regression (MLLR), which we have assessed by means of a reference articulatory recognition system. Finally, we present a complete visual articulatory feedback demonstrator.}}
}
@inproceedings{benyoussef:IS2011,
author = {Ben Youssef, Atef and Hueber, Thomas and Badin, Pierre and Bailly, Gérard},
title = {Toward a multi-speaker visual articulatory feedback system},
booktitle = {Proc. Interspeech},
year = {2011},
abstract = {In this paper, we present recent developments on the HMM-based acoustic-to-articulatory inversion approach that we develop for a "visual articulatory feedback" system. In this approach, multi-stream phoneme HMMs are trained jointly on synchronous streams of acoustic and articulatory data, acquired by electromagnetic articulography (EMA). Acoustic-to-articulatory inversion is achieved in two steps. Phonetic and state decoding is first performed. Then articulatory trajectories are inferred from the decoded phone and state sequence using the maximum-likelihood parameter generation algorithm (MLPG). We introduce here a new procedure for the re-estimation of the HMM parameters, based on the Minimum Generation Error criterion (MGE). We also investigate the use of model adaptation techniques based on maximum likelihood linear regression (MLLR), as a first step toward a multi-speaker visual articulatory feedback system.},
month = {August},
address = {Florence, Italy},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/BenYoussef-etal_IS11.pdf},
pages = {589--592}
}
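The trajectory-formation step mentioned above, MLPG (maximum-likelihood parameter generation), can be sketched compactly for a single one-dimensional stream with static and delta statistics. The per-frame means and variances below are random placeholders standing in for the statistics of the decoded HMM state sequence, so this is a minimal sketch of the algorithm, not the authors' implementation.

import numpy as np

def mlpg_1d(mu_static, var_static, mu_delta, var_delta):
    """Solve for the trajectory c maximising the likelihood of static and delta features."""
    T = len(mu_static)
    # Delta window: delta_t = 0.5 * (c_{t+1} - c_{t-1}), with clamped boundaries.
    W_delta = np.zeros((T, T))
    for t in range(T):
        W_delta[t, max(t - 1, 0)] -= 0.5
        W_delta[t, min(t + 1, T - 1)] += 0.5
    W = np.vstack([np.eye(T), W_delta])                  # stacks static and delta windows
    mu = np.concatenate([mu_static, mu_delta])
    prec = np.concatenate([1.0 / var_static, 1.0 / var_delta])
    A = W.T.dot(prec[:, None] * W)                       # W' Sigma^-1 W
    b = W.T.dot(prec * mu)                               # W' Sigma^-1 mu
    return np.linalg.solve(A, b)                         # smoothed trajectory, length T

T = 50
rng = np.random.default_rng(1)
traj = mlpg_1d(rng.standard_normal(T), np.ones(T), np.zeros(T), 0.1 * np.ones(T))
print(traj.shape)   # (50,)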
@inproceedings{benyoussef:ISSP2011toward,
author = {Ben Youssef, Atef and Hueber, Thomas and Badin, Pierre and Bailly, Gérard and Elisei, Frédéric},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/ben_youssef_etal_issp_final.pdf},
booktitle = {9th International Seminar on Speech Production, ISSP9},
year = {2011},
title = {Toward a speaker-independent visual articulatory feedback system},
address = {Montreal, Canada}
}
@inproceedings{hueber:p3s2011,
author = {Hueber, Thomas and Badin, Pierre and Bailly, Gérard and Ben Youssef, Atef and Elisei, Frédéric and Denby, Bruce and Chollet, Gérard},
title = {Statistical mapping between articulatory and acoustic data. Application to Silent Speech Interface and Visual Articulatory Feedback},
booktitle = {{Proceedings of the 1st International Workshop on Performative Speech and Singing Synthesis (p3s)}},
year = {2011},
address = {Vancouver, Canada},
keywords = {statistical mapping; silent speech; ultrasound; visual articulatory feedback; talking head; HMM; GMM},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/hueber_etal_p3s_V5.pdf},
abstract = {This paper reviews some theoretical and practical aspects of different statistical mapping techniques used to model the relationships between the articulatory gestures and the resulting speech sound. These techniques are based on the joint modeling of articulatory and acoustic data using Gaussian Mixture Model (GMM) and Hidden Markov Model (HMM). These methods are implemented in two systems: (1) the silent speech interface developed at SIGMA and LTCI laboratories which converts tongue and lip motions, captured during silent articulation by ultrasound and video imaging, into audible speech, and (2) the visual articulatory feedback system, developed at GIPSA-lab, which automatically animates, from the speech sound, a 3D orofacial clone displaying all articulators (including the tongue). These mapping techniques are also discussed in terms of real-time implementation.}
}
@inproceedings{benyoussef:AVSP2010,
author = {Ben Youssef, Atef and Badin, Pierre and Bailly, Gérard},
title = {Acoustic-to-articulatory inversion in speech based on statistical models},
booktitle = {Proc. AVSP 2010},
year = {2010},
abstract = {Two speech inversion methods are implemented and compared. In the first, multistream Hidden Markov Models (HMMs) of phonemes are jointly trained from synchronous streams of articulatory data acquired by EMA and speech spectral parameters; an acoustic recognition system uses the acoustic part of the HMMs to deliver a phoneme chain and the state durations; this information is then used by a trajectory formation procedure based on the articulatory part of the HMMs to resynthesise the articulatory movements. In the second, Gaussian Mixture Models (GMMs) are trained on these streams to directly associate articulatory frames with acoustic frames in context, using Maximum Likelihood Estimation. Over a corpus of 17 minutes uttered by a French speaker, the RMS error was 1.62 mm with the HMMs and 2.25 mm with the GMMs.},
address = {Hakone, Kanagawa, Japan},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/BenYoussef_Badin_Bailly_AVSP2010.pdf},
pages = {160--165}
}
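The second method in the entry above, direct GMM-based mapping from acoustic to articulatory frames, can be sketched as a frame-wise conditional-expectation (MMSE) estimate from a joint GMM over stacked acoustic and articulatory vectors. The papers use maximum-likelihood estimation with dynamic features, which this sketch omits, and the data below are random placeholders, so treat it as an illustration of the idea rather than the published system.

import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture

dx, dy, M = 24, 12, 8                       # acoustic dim, articulatory dim, number of mixtures
rng = np.random.default_rng(2)
X = rng.standard_normal((5000, dx))         # placeholder acoustic frames (e.g. MFCC + energy)
Y = rng.standard_normal((5000, dy))         # placeholder articulatory (EMA) frames
gmm = GaussianMixture(n_components=M, covariance_type="full").fit(np.hstack([X, Y]))

def invert_frame(x):
    """Return the expected articulatory frame given one acoustic frame x."""
    post = np.zeros(M)
    cond = np.zeros((M, dy))
    for m in range(M):
        mu, S = gmm.means_[m], gmm.covariances_[m]
        mux, muy = mu[:dx], mu[dx:]
        Sxx, Syx = S[:dx, :dx], S[dx:, :dx]
        post[m] = gmm.weights_[m] * multivariate_normal(mux, Sxx).pdf(x)
        cond[m] = muy + Syx.dot(np.linalg.solve(Sxx, x - mux))
    post /= post.sum()
    return post.dot(cond)

print(invert_frame(X[0]).shape)             # (12,)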
@inproceedings{benyoussef:IS2010,
author = {Ben Youssef, Atef and Badin, Pierre and Bailly, Gérard},
title = {Can tongue be recovered from face? The answer of data-driven statistical models},
booktitle = {Proc. Interspeech},
year = {2010},
abstract = {This study revisits the face-to-tongue articulatory inversion problem in speech. We compare the Multi Linear Regression method (MLR) with two more sophisticated methods based on Hidden Markov Models (HMMs) and Gaussian Mixture Models (GMMs), using the same French corpus of articulatory data acquired by ElectroMagnetic Articulography. GMMs give overall better results than HMMs, but MLR does poorly. GMMs and HMMs maintain the original phonetic class distribution, though with some centralisation effects, which are much stronger with MLR. A detailed analysis shows that, while the jaw / lips / tongue tip synergy helps recover front high vowels and coronal consonants, the velars are not recovered at all. It is therefore not possible to reliably recover the tongue from the face.},
month = {September},
address = {Makuhari, Japan},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/BenYoussef_Badin_Bailly_interspeech2010.pdf},
pages = {2002--2005}
}
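The MLR baseline discussed above is easy to sketch: a multilinear regression from face/lip features to tongue features, evaluated with an RMS error in the same spirit as the papers' millimetre scores. The feature matrices below are random placeholders for the EMA face and tongue channels, so this is only an outline of the baseline, not a reproduction of the published experiment.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(3)
face_train = rng.standard_normal((4000, 6))      # placeholder jaw/lip features
tongue_train = rng.standard_normal((4000, 6))    # placeholder tongue coil coordinates
face_test = rng.standard_normal((1000, 6))
tongue_test = rng.standard_normal((1000, 6))

mlr = LinearRegression().fit(face_train, tongue_train)   # multi-output linear regression
pred = mlr.predict(face_test)
rmse = np.sqrt(np.mean((pred - tongue_test) ** 2))
print(f"RMSE: {rmse:.2f} (in mm if the inputs are in mm)")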
@inproceedings{badin:L2SW2010,
author = {Badin, Pierre and Ben Youssef, Atef and Bailly, Gérard and Elisei, Frédéric and Hueber, Thomas},
title = {Visual articulatory feedback for phonetic correction in second language learning},
booktitle = {Workshop on Second Language Studies: Acquisition, Learning, Education and Technology},
year = {2010},
address = {Tokyo, Japan},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/L2WS2010_PB-ABY-GB-etal.pdf},
abstract = {Orofacial clones can display speech articulation in an augmented mode, i.e. display all major speech articulators, including those usually hidden such as the tongue or the velum. Moreover, a number of studies tend to show that the visual articulatory feedback provided by ElectroPalatoGraphy or ultrasound echography is useful for speech therapy. This paper describes the latest developments in acoustic-to-articulatory inversion, based on statistical models, to drive orofacial clones from speech sound. It suggests that this technology could provide a more elaborate feedback than previously available, and that it would be useful in the domain of Computer-Aided Pronunciation Training.}
}
@inproceedings{benyoussef:IS2009,
author = {Ben Youssef, Atef and Badin, Pierre and Bailly, Gérard and Heracleous, Panikos},
title = {Acoustic-to-articulatory inversion using speech recognition and trajectory formation based on phoneme hidden Markov models},
booktitle = {Proc. Interspeech},
year = {2009},
abstract = {In order to recover the movements of usually hidden articulators such as the tongue or the velum, we have developed a data-based speech inversion method. HMMs are trained, in a multistream framework, from two synchronous streams: articulatory movements measured by EMA, and MFCC + energy from the speech signal. A speech recognition procedure based on the acoustic part of the HMMs delivers the chain of phonemes together with their durations, information that is subsequently used by a trajectory formation procedure based on the articulatory part of the HMMs to synthesise the articulatory movements. The RMS reconstruction error ranged between 1.1 and 2 mm.},
month = {September},
address = {Brighton, UK},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/BenYoussef_Badin_Bailly_Heracleous_HMMinversion_Interspeech_2009.pdf},
pages = {2255--2258}
}
@inproceedings{benyoussef:hal-00508281,
author = {Ben Youssef, Atef and Badin, Pierre and Bailly, Gérard and Tran, Viet-Anh},
language = {French},
title = {Méthodes basées sur les HMMs et les GMMs pour l'inversion acoustico-articulatoire en parole},
abstract = {Two speech inversion methods are implemented and compared. In the first, multistream Hidden Markov Models (HMMs) of phonemes are jointly trained from synchronous streams of articulatory data acquired by EMA and speech spectral parameters; an acoustic recognition system uses the acoustic part of the HMMs to deliver a phoneme chain and the state durations; this information is then used by a trajectory formation procedure based on the articulatory part of the HMMs to resynthesise the articulatory data. In the second, Gaussian Mixture Models (GMMs) are trained on these streams to directly associate articulatory frames with acoustic frames in context. Over a corpus of 17 minutes uttered by a French speaker, the RMS error was 1.66 mm with the HMMs and 2.25 mm with the GMMs.},
address = {Mons, Belgium},
month = {May},
year = {2010},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/BenYoussef_Badin_Bailly_JEP2010.pdf},
booktitle = {{Proc. JEP}},
pages = {249--252}
}
@inproceedings{benyoussef:RJCP2009,
author = {Ben Youssef, Atef and Tran, Viet-Anh and Badin, Pierre and Bailly, Gérard},
title = {HMMs and GMMs based methods in acoustic-to-articulatory speech inversion},
booktitle = {Proc. RJCP},
year = {2009},
address = {Avignon, France},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/BenYoussef-et-al_RJCP-2009.pdf},
pages = {186--192}
}
@inproceedings{besacier2008LIG,
author = {Besacier, Laurent and Ben Youssef, Atef and Blanchon, Hervé},
title = {The LIG Arabic/English Speech Translation System at IWSLT08},
booktitle = {International Workshop on Spoken Language Translation (IWSLT) 2008},
year = {2008},
abstract = {This paper is a description of the system presented by the LIG laboratory to the IWSLT08 speech translation evaluation. The LIG participated, for the second time this year, in the Arabic to English speech translation task. For translation, we used a conventional statistical phrase-based system developed with the Moses open-source decoder. We describe chronologically the improvements made since last year, starting from the IWSLT 2007 system and following with the improvements made for our 2008 submission. Then, we discuss in Section 5 some post-evaluation experiments made very recently, as well as some ongoing work on Arabic/English speech-to-text translation. This year, the systems were ranked according to the (BLEU+METEOR)/2 score of the primary ASR output run submissions. The LIG was ranked 5th out of 10 based on this rule.},
address = {Hawaii, USA},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/Besacier-etal_IWSLT-2008.pdf},
pages = {58--62}
}
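The evaluation rule quoted above, ranking systems by the mean of BLEU and METEOR on the primary ASR-output run, is simple arithmetic; the tiny sketch below ranks a few hypothetical systems with made-up scores purely to illustrate the rule.

# Hypothetical (BLEU, METEOR) scores for three made-up systems.
systems = {"sys_a": (0.42, 0.55), "sys_b": (0.45, 0.51), "sys_c": (0.40, 0.58)}
ranked = sorted(systems.items(), key=lambda kv: (kv[1][0] + kv[1][1]) / 2, reverse=True)
for rank, (name, (bleu, meteor)) in enumerate(ranked, start=1):
    print(rank, name, round((bleu + meteor) / 2, 3))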
@inproceedings{benyoussef_shimodaira_icassp2014,
author = {Ben Youssef, Atef and Shimodaira, Hiroshi and Braude, David},
title = {Speech driven Talking Head from Estimated Articulatory Features},
booktitle = {Proc. ICASSP},
address = {Florence, Italy},
month = {May},
pages = {4606--4610},
year = {2014},
pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/benyoussef_etal_icassp2014.pdf},
abstract = {In this paper, we present a talking head in which the lips and head motion are controlled using articulatory movements estimated from speech. A phone-size HMM-based inversion mapping is employed and trained in a semi-supervised fashion. The advantage of using articulatory features is that they can drive the lip motions and have a close link with head movements. Speech inversion normally requires training data recorded with an electromagnetic articulograph (EMA), which restricts the naturalness of head movements. The present study considers a more realistic recording condition where the training data for the target speaker are recorded with a conventional motion capture system rather than EMA. Different temporal clustering techniques are investigated for HMM-based mapping, as well as a GMM-based frame-wise mapping as a baseline system. Objective and subjective experiments show that the synthesised motions are more natural using an HMM system than a GMM one, and that estimated EMA features outperform prosodic features.},
categories = {acoustic-articulatory, inversion mapping, MLPG, talking heads}
}