The Centre for Speech Technology Research, The University of Edinburgh

Publications by Hiroshi Shimodaira

hshimoda.bib

@article{Kawamoto2002IPSJ07,
  author = {Kawamoto, Shin-ichi and Shimodaira, Hiroshi and others},
  volume = {43},
  title = {{Design of Software Toolkit for Anthropomorphic Spoken Dialog Agent Software with Customization-oriented Features}},
  journal = {Information Processing Society of Japan (IPSJ) Journal},
  number = {7},
  month = {July},
  note = {(in Japanese)},
  year = {2002},
  pages = {2249--2263}
}
@inproceedings{cuayahuitletal_interspeech06,
  author = {Cuay{\'a}huitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  title = {Learning Multi-Goal Dialogue Strategies Using Reinforcement Learning With Reduced State-Action Spaces},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/rss-icslp2006.pdf},
  abstract = {Learning dialogue strategies using the reinforcement learning framework is problematic due to its expensive computational cost. In this paper we propose an algorithm that reduces a state-action space to one which includes only valid state-actions. We performed experiments on full and reduced spaces using three systems (with 5, 9 and 20 slots) in the travel domain using a simulated environment. The task was to learn multi-goal dialogue strategies optimizing single and multiple confirmations. Average results using strategies learnt on reduced spaces reveal the following benefits against full spaces: 1) less computer memory (94\% reduction), 2) faster learning (93\% faster convergence) and better performance (8.4\% less time steps and 7.7\% higher reward).},
  categories = {reinforcement learning, spoken dialogue systems}
}
@article{Ho2003Applied,
  author = {Ho, Tu Bao and Nguyen, Trong Dung and Shimodaira, Hiroshi and Kimura, Masayuki},
  title = {{A Knowledge Discovery System with Support for Model Selection and Visualization}},
  journal = {Applied Intelligence},
  volume = {19},
  year = {2003},
  pages = {125--141},
  categories = {KDD}
}
@article{Keeni1996IEICE,
  author = {Keeni, Kanad and Shimodaira, Hiroshi and Nishino, Tetsuro and Tan, Yasuo},
  title = {{Recognition of Devanagari Characters Using Neural Networks}},
  journal = {IEICE Transactions on Information and Systems},
  number = {5},
  month = {May},
  volume = {E79-D},
  year = {1996},
  pages = {523--528},
  categories = {character-recognition, ann, jaist}
}
@misc{Carnival_SIGGRAPH_2010,
  author = {Berger, Michael and Hofer, Gregor and Shimodaira, Hiroshi},
  title = {Carnival: a modular framework for automated facial animation},
  address = {Los Angeles, Calif., USA},
  note = {Bronze award winner, ACM Student Research Competition},
  year = {2010},
  howpublished = {Poster at SIGGRAPH 2010},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/carnival.pdf},
  abstract = {We present a software framework for speech- or text-driven animation--including a platform-independent API and an application implementing it--which unifies state-of-the-art speech technology and graphics technology within a single system.}
}
@inproceedings{Sagayama2001ISCA08a,
  author     = {Sagayama, Shigeki and Kato, Yutaka and Nakai, Mitsuru and Shimodaira, Hiroshi},
  title      = {{Jacobian Approach to Joint Adaptation to Noise, Channel and Vocal Tract Length}},
  booktitle  = {Proc. ISCA Workshop on Adaptation Methods (Sophia Antipolis, France)},
  pages      = {117--120},
  month      = {August},
  year       = {2001},
  categories = {asr, jaist}
}
@inproceedings{Shimodaira1997Eurospeech,
  author = {Shimodaira, Hiroshi and Nakai, Mitsuru and Kumata, Akihiro},
  title = {{Restoration of Pitch Pattern of Speech Based on a Pitch Generation Model}},
  booktitle = {Proc. EuroSpeech'97},
  month = {September},
  pages = {512--524},
  year = {1997},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/euro97.pdf},
  abstract = {In this paper a model-based approach for restoring a continuous fundamental frequency (F0) contour from the noisy output of an F0 extractor is investigated. In contrast to the conventional pitch trackers based on numerical curve-fitting, the proposed method employs a quantitative pitch generation model, which is often used for synthesizing F0 contour from prosodic event commands for estimating continuous F0 pattern. An inverse filtering technique is introduced for obtaining the initial candidates of the prosodic commands. In order to find the optimal command sequence from the commands efficiently, a beam-search algorithm and an N-best technique are employed. Preliminary experiments for a male speaker of the ATR B-set database showed promising results both in quality of the restored pattern and estimation of the prosodic events.},
  categories = {f0, jaist}
}
@inproceedings{Shimodaira1998ICSLP,
  author     = {Shimodaira, Hiroshi and Rokui, Jun and Nakai, Mitsuru},
  title      = {{Improving The Generalization Performance Of The MCE/GPD Learning}},
  booktitle  = {ICSLP'98, Australia},
  month      = {December},
  year       = {1998},
  categories = {lifelike-agent, jaist},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/Shimodaira1998ICSLP.pdf},
  abstract   = {A novel method to prevent the over-fitting effect and improve the generalization performance of the Minimum Classification Error (MCE) / Generalized Probabilistic Descent (GPD) learning is proposed. The MCE/GPD method, which is one of the newest discriminative-learning approaches proposed by Katagiri and Juang in 1992, results in better recognition performance in various areas of pattern recognition than the maximum-likelihood (ML) based approach where a posteriori probabilities are estimated. Despite its superiority in recognition performance, it still suffers from the problem of over-fitting to the training samples as it is with other learning algorithms. In the present study, a regularization technique is employed to the MCE method to overcome this problem. Feed-forward neural networks are employed as a recognition platform to evaluate the recognition performance of the proposed method. Recognition experiments are conducted on several sorts of datasets. The proposed method shows better generalization performance than the original one.}
}
@inproceedings{Keeni1998ICPR,
  author     = {Keeni, Kanad and Nakayama, Kenji and Shimodaira, Hiroshi},
  title      = {{Automatic Generation of Initial Weights and Estimation of Hidden Units for Pattern Classification Using Neural Networks}},
  booktitle  = {14th International Conference on Pattern Recognition (ICPR'98)},
  pages      = {1568--1571},
  month      = {August},
  year       = {1998},
  categories = {ann, jaist}
}
@article{cuayahuitl2009,
  author = {Cuay{\'a}huitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  doi = {10.1016/j.csl.2009.07.001},
  title = {Evaluation of a hierarchical reinforcement learning spoken dialogue system},
  journal = {Computer Speech and Language},
  number = {2},
  pages = {395--429},
  volume = {24},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/cuayahuitl-csl09.pdf},
  abstract = {We describe an evaluation of spoken dialogue strategies designed using hierarchical reinforcement learning agents. The dialogue strategies were learnt in a simulated environment and tested in a laboratory setting with 32 users. These dialogues were used to evaluate three types of machine dialogue behaviour: hand-coded, fully-learnt and semi-learnt. These experiments also served to evaluate the realism of simulated dialogues using two proposed metrics contrasted with `Precision-Recall'. The learnt dialogue behaviours used the Semi-Markov Decision Process (SMDP) model, and we report the first evaluation of this model in a realistic conversational environment. Experimental results in the travel planning domain provide evidence to support the following claims: (a) hierarchical semi-learnt dialogue agents are a better alternative (with higher overall performance) than deterministic or fully-learnt behaviour; (b) spoken dialogue strategies learnt with highly coherent user behaviour and conservative recognition error rates (keyword error rate of 20\%) can outperform a reasonable hand-coded strategy; and (c) hierarchical reinforcement learning dialogue agents are feasible and promising for the (semi) automatic design of optimized dialogue behaviours in larger-scale systems.}
}
@inproceedings{Keeni1997ICDAR,
  author = {Keeni, Kanad and Shimodaira, Hiroshi and Nakayama, Kenji},
  title = {{On Distributed Representation of Output Layer for Recognizing Japanese Kana Characters Using Neural Networks}},
  booktitle = {Proceedings of the 4th International Conference on Document Analysis and Recognition, ICDAR'97},
  month = {July},
  note = {Ulm, Germany},
  year = {1997},
  pages = {600--603},
  categories = {hwr, ann, jaist}
}
@inproceedings{Matsuda2000ICSLP10,
  author = {Matsuda, Shigeki and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title = {{Feature-dependent Allophone Clustering}},
  abstract = {We propose a novel method for clustering allophones called Feature-Dependent Allophone Clustering (FD-AC) that determines feature-dependent HMM topology automatically. Existing methods for allophone clustering are based on parameter sharing between the allophone models that resemble each other in behaviors of feature vector sequences. However, all the features of the vector sequences may not necessarily have a common allophone clustering structures. It is considered that the vector sequences can be better modeled by allocating the optimal allophone clustering structure to each feature. In this paper, we propose Feature-Dependent Successive State Splitting (FD-SSS) as an implementation of FD-AC. In speaker-dependent continuous phoneme recognition experiments, HMMs created by FD-SSS reduced the error rates by about 10\% compared with the conventional HMMs that have a common allophone clustering structure for all the features.},
  month = {October},
  year = {2000},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Matsuda2000ICSLP10.pdf},
  booktitle = {Proc. ICSLP2000},
  pages = {413--416},
  categories = {asr, atr, jaist}
}
@inproceedings{Tokuno2002IWFHR,
  author = {Tokuno, Junko and Inami, Nobuhito and Matsuda, Shigeki and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title = {{Context-Dependent Substroke Model for {HMM}-based On-line Handwriting Recognition}},
  abstract = {This paper describes an effective modeling technique in the on-line recognition for cursive Kanji handwritings and Hiragana handwritings. Our conventional recognition system based on substroke HMMs (hidden Markov models) employs straight-type substrokes as primary models and has achieved high recognition rate in the recognition of careful Kanji handwritings. On the other hand, the recognition rate for cursive handwritings is comparatively low, since they consist of mainly curve-strokes. Therefore, we propose a technique of using multiple models for each substroke by considering the substroke context, which is a preceding substroke and a following substroke. In order to construct these context-dependent models efficiently, we use the SSS (Successive State Splitting) algorithm developed in speech recognition. Through the experiments, the recognition rate improved from 88\% to 92\% for cursive Kanji handwritings and from 90\% to 98\% for Hiragana handwritings.},
  month = {August},
  year = {2002},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Tokuno2002IWFHR.pdf},
  booktitle = {Proc. IWFHR-8},
  pages = {78--83}
}
@inproceedings{Nakai1995ICASSP,
  author = {Nakai, Mitsuru and Singer, Harald and Sagisaka, Yoshinori and Shimodaira, Hiroshi},
  title = {{Automatic Prosodic Segmentation by F0 Clustering Using Superpositional Modeling}},
  month = {May},
  year = {1995},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1995/Nakai1995ICASSP.pdf},
  booktitle = {Proc. ICASSP-95, PR08.6},
  pages = {624--627},
  categories = {F0, atr, jaist}
}
@inproceedings{Koba1995HCIb,
  author = {Koba, Hisao and Shimodaira, Hiroshi and Kimura, Masayuki},
  title = {{Intelligent Automatic Document Transcription System for Braille: To Improve Accessibility to Printed Matter for the Visually Impaired}},
  booktitle = {HCI International'95},
  year = {1995},
  month = {July}
}
@inproceedings{Shimodaira:mlmi05,
  author     = {Shimodaira, Hiroshi and Uematsu, Keisuke and Kawamoto, Shin'ichi and Hofer, Gregor and Nakai, Mitsuru},
  title      = {{Analysis and Synthesis of Head Motion for Lifelike Conversational Agents}},
  booktitle  = {Proc. MLMI2005},
  year       = {2005},
  month      = {July},
  categories = {lifelike agents},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mlmi2005.pdf}
}
@inproceedings{Keeni1999IJCNN,
  author = {Keeni, Kanad and Nakayama, Kenji and Shimodaira, Hiroshi},
  title = {{Estimation of Initial Weights and Hidden Units for Fast Learning of Multi-layer Neural Networks for Pattern Classification}},
  booktitle = {IEEE International Joint Conference on Neural Networks (IJCNN'99)},
  month = {July},
  year = {1999},
  categories = {ann, jaist}
}
@inproceedings{Shimodaira1994ICASSP,
  author = {Shimodaira, Hiroshi and Nakai, Mitsuru},
  title = {Prosodic Phrase Segmentation by Pitch Pattern Clustering},
  booktitle = {Proc. ICASSP-94, 76.5, vol.II},
  month = {March},
  pages = {185--188},
  year = {1994},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/Shimodaira1994ICASSP.pdf},
  abstract = {This paper proposes a novel method for detecting the optimal sequence of prosodic phrases from continuous speech based on data-driven approach. The pitch pattern of input speech is divided into prosodic segments which minimized the overall distortion with pitch pattern templates of accent phrases by using the One Pass search algorithm. The pitch pattern templates are designed by clustering a large number of training samples of accent phrases. On the ATR continuous speech database uttered by 10 speakers, the rate of correct segmentation was 91.7 \% maximum for the same sex data of training and testing, 88.6 \% for the opposite sex.},
  categories = {F0, jaist}
}
@article{Iida1998IEICE06,
  author = {Iida, Eiji and Kunifuji, Susumu and Shimodaira, Hiroshi and Kimura, Masayuki},
  volume = {J81-D-I},
  title = {{A Scale-Down Solution of $N^2-1$ Puzzle}},
  journal = {Trans. IEICE(D-I)},
  number = {6},
  month = {June},
  note = {(in Japanese)},
  year = {1998},
  pages = {604--614},
  categories = {puzzle, jaist}
}
@article{Rokui2002IPSJ07,
  author = {Rokui, Jun and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  volume = {43},
  title = {{Speaker Normalization Using Linear Transformation of Vocal Tract Length Based on Maximum Likelihood Estimation}},
  journal = {Information Processing Society of Japan (IPSJ) Journal},
  number = {7},
  month = {July},
  note = {(in Japanese)},
  pages = {2030--2037},
  year = {2002},
  categories = {asr, jaist}
}
@inproceedings{taylor:shimodaira:isard:king:kowtko:icslp1996,
  author = {Taylor, Paul A. and Shimodaira, Hiroshi and Isard, Stephen and King, Simon and Kowtko, Jacqueline},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/Taylor_1996_a.ps},
  title = {Using Prosodic Information to Constrain Language Models for Spoken dialogue},
  booktitle = {Proc. {ICSLP} '96},
  address = {Philadelphia},
  year = {1996},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1996/Taylor_1996_a.pdf},
  abstract = {We present work intended to improve speech recognition performance for computer dialogue by taking into account the way that dialogue context and intonational tune interact to limit the possibilities for what an utterance might be. We report here on the extra constraint achieved in a bigram language model expressed in terms of entropy by using separate submodels for different sorts of dialogue acts and trying to predict which submodel to apply by analysis of the intonation of the sentence being recognised.},
  categories = {asr, intonation, dialogue, lm,id4s}
}
@article{Otsuki2002IPSJ,
  author = {Otsuki, Tomoshi and Saitou, Naoki and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  volume = {43},
  title = {{Musical Rhythm Recognition Using Hidden Markov Model}},
  journal = {Information Processing Society of Japan (IPSJ) Journal},
  number = {2},
  month = {February},
  note = {(in Japanese)},
  year = {2002}
}
@inproceedings{Bao1997-1,
  author = {Ho, Tu Bao and Nguyen, Trong Dung and Shimodaira, Hiroshi and Kimura, Masayuki},
  title = {{An Interactive-Graphic Environment for Discovering and Using Conceptual Knowledge}},
  booktitle = {7th European-Japanese Conference on Information Modelling and Knowledge Bases},
  month = {May},
  year = {1997},
  pages = {327--343},
  categories = {kdd, jaist}
}
@article{Matsuda2003IEICE06,
  author     = {Matsuda, Shigeki and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title      = {{Speech Recognition Using Asynchronous Transition {HMM}}},
  journal    = {IEICE Trans. D-II},
  volume     = {J86-D-II},
  number     = {6},
  pages      = {741--754},
  month      = {June},
  year       = {2003},
  note       = {(in Japanese)},
  categories = {asr, jaist},
  abstract   = {We propose asynchronous-transition HMM (AT-HMM) that is based on asynchronous transition structures among individual features of acoustic feature vector sequences. Conventional HMM represents vector sequences by using a chain of states, each state has vector distributions of multi-dimensions. Therefore, the conventional HMM assumes that individual features change synchronously. However, this assumption seems over-simplified for modeling the temporal behavior of acoustic features, since cepstrum and its time-derivative can not synchronize with each other. In speaker-dependent continuous phoneme recognition task, the AT-HMMs reduced errors by 10\% to 40\%. In speaker-independent task, the performance of the AT-HMMs was comparable to conventional HMMs.}
}
@article{Nakai2005IEICE01,
  author = {Nakai, Mitsuru and Sagayama, Shigeki and Shimodaira, Hiroshi},
  volume = {J88-D-II},
  title = {{On-line Handwriting Recognition Based on Sub-stroke {HMM}}},
  journal = {Trans. IEICE D-II},
  number = {8},
  month = {August},
  note = {(in press) (in Japanese)},
  year = {2005},
  abstract = {This paper describes context-dependent sub-stroke HMMs for on-line handwritten character recognition. As there are so many characters in Japanese, modeling each character by an HMM leads to an infeasible character-recognition system requiring huge amount of memory and enormous computation time. The sub-stroke HMM approach has overcome these problems by minimizing modeling unit. However, one of the drawbacks of this approach is that the recognition accuracy deteriorates for scribbled characters. In this paper, we show that the context-dependent sub-stroke modeling which depends on how the sub-stroke connects to the adjacent substrokes is effective to achieve robust recognition of low quality characters.},
  categories = {online handwritten character recognition}
}
@inproceedings{Fujinaga2001ICASSP,
  author     = {Fujinaga, Katsuhisa and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title      = {{Multiple-Regression Hidden Markov Model}},
  booktitle  = {Proc. ICASSP 2001},
  year       = {2001},
  month      = {May},
  categories = {asr, jaist},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Fujinaga2001ICASSP.pdf}
}
@inproceedings{Shimodaira2003ICDAR,
  author = {Shimodaira, Hiroshi and Sudo, Takashi and Nakai, Mitsuru and Sagayama, Shigeki},
  title = {{On-line Overlaid-Handwriting Recognition Based on Substroke {HMM}s}},
  abstract = {This paper proposes a novel handwriting recognition interface for wearable computing where users write characters continuously without pauses on a small single writing box. Since characters are written on the same writing area, they are overlaid with each other. Therefore the task is regarded as a special case of the continuous character recognition problem. In contrast to the conventional continuous character recognition problem, location information of strokes does not help very much in the proposed framework. To tackle the problem, substroke based hidden Markov models (HMMs) and a stochastic bigram language model are employed. Preliminary experiments were carried out on a dataset of 578 handwriting sequences with a character bigram consisting of 1,016 Japanese educational Kanji and 71 Hiragana characters. The proposed method demonstrated promising performance with 69.2\% of handwriting sequences being correctly recognized when different stroke order was permitted, and the rate was improved up to 88.0\% when characters were written with fixed stroke order.},
  month = {August},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Shimodaira2003ICDAR.pdf},
  booktitle = {ICDAR'03},
  pages = {1043--1047},
  categories = {HWR, jaist}
}
@article{Nakai1994IEICE06,
  author     = {Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title      = {{Prosodic Phrase Segmentation Based on Pitch-Pattern Clustering}},
  journal    = {Electronics and Communications in Japan, Part 3},
  volume     = {77},
  number     = {6},
  pages      = {80--91},
  month      = {June},
  year       = {1994},
  note       = {(in Japanese)},
  categories = {F0, jaist}
}
@inproceedings{cuayahuitletal_slt06,
  author = {Cuay{\'a}huitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  title = {Reinforcement Learning of Dialogue Strategies With Hierarchical Abstract Machines},
  booktitle = {Proc. IEEE/ACL Workshop on Spoken Language Technology (SLT)},
  month = {December},
  year = {2006},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/ham-slt2006.pdf},
  abstract = {In this paper we propose partially specified dialogue strategies for dialogue strategy optimization, where part of the strategy is specified deterministically and the rest optimized with Reinforcement Learning (RL). To do this we apply RL with Hierarchical Abstract Machines (HAMs). We also propose to build simulated users using HAMs, incorporating a combination of hierarchical deterministic and probabilistic behaviour. We performed experiments using a single-goal flight booking dialogue system, and compare two dialogue strategies (deterministic and optimized) using three types of simulated user (novice, experienced and expert). Our results show that HAMs are promising for both dialogue optimization and simulation, and provide evidence that indeed partially specified dialogue strategies can outperform deterministic ones (on average 4.7 fewer system turns) with faster learning than the traditional RL framework.},
  categories = {reinforcement learning, spoken dialogue systems}
}
@inproceedings{Nakai1997Eurospeech,
  author     = {Nakai, Mitsuru and Shimodaira, Hiroshi},
  title      = {{On Representation of Fundamental Frequency of Speech for Prosody Analysis Using Reliability Function}},
  booktitle  = {Proc. EuroSpeech'97},
  pages      = {243--246},
  month      = {September},
  year       = {1997},
  categories = {f0, jaist},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/Nakai1997Eurospeech.pdf}
}
@inproceedings{dziemianko_interspeech2009,
  author = {Dziemianko, Michal and Hofer, Gregor and Shimodaira, Hiroshi},
  title = {{HMM}-Based Automatic Eye-Blink Synthesis from Speech},
  booktitle = {Proc. Interspeech},
  address = {Brighton, UK},
  month = {September},
  pages = {1799--1802},
  year = {2009},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/dziemianko_interspeech2009.pdf},
  abstract = {In this paper we present a novel technique to automatically synthesise eye blinking from a speech signal. Animating the eyes of a talking head is important as they are a major focus of attention during interaction. The developed system predicts eye blinks from the speech signal and generates animation trajectories automatically employing a ``Trajectory Hidden Markov Model''. The evaluation of the recognition performance showed that the timing of blinking can be predicted from speech with an F-score value upwards of 52\%, which is well above chance. Additionally, a preliminary perceptual evaluation was conducted, that confirmed that adding eye blinking significantly improves the perception of the character. Finally it showed that the speech synchronised synthesised blinks outperform random blinking in naturalness ratings.},
  categories = {animation, motion synthesis, time series analysis, trajectory model}
}
@inproceedings{Keeni2003ICEIS,
  author = {Keeni, Kanad and Goto, Kunio and Shimodaira, Hiroshi},
  title = {{On Fast Learning of Multi-layer Feed-forward Neural Networks Using Back Propagation}},
  booktitle = {International Conference on Enterprise Information Systems (ICEIS2003)},
  abstract = {This study discusses the subject of training data selection for neural networks using back propagation. We have made only one assumption that there are no overlapping of training data belonging to different classes, in other words the training data is linearly/semi-linearly separable. Training data is analyzed and the data that affect the learning process are selected based on the idea of Critical points. The proposed method is applied to a classification problem where the task is to recognize the characters A,C and B,D. The experimental results show that in case of batch mode the proposed method takes almost 1/7 of real and 1/10 of user training time required for conventional method. On the other hand in case of online mode the proposed method takes 1/3 of training epochs, 1/9 of real and 1/20 of user and 1/3 system time required for the conventional method. The classification rate of training and testing data are the same as it is with the conventional method.},
  month = {April},
  year = {2003},
  pages = {266--271}
}
@inproceedings{Shimodaira2001NIPS,
  author = {Shimodaira, Hiroshi and Noma, Ken-ichi and Nakai, Mitsuru and Sagayama, Shigeki},
  title = {{Dynamic Time-Alignment Kernel in Support Vector Machine}},
  booktitle = {Advances in Neural Information Processing Systems 14, NIPS2001},
  month = {December},
  volume = {2},
  pages = {921--928},
  year = {2001},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Shimodaira2001NIPS.pdf},
  abstract = {A new class of Support Vector Machine (SVM) that is applicable to sequential-pattern recognition such as speech recognition is developed by incorporating an idea of non-linear time alignment into the kernel function. Since the time-alignment operation of sequential pattern is embedded in the new kernel function, standard SVM training and classification algorithms can be employed without further modifications. The proposed SVM (DTAK-SVM) is evaluated in speaker-dependent speech recognition experiments of hand-segmented phoneme recognition. Preliminary experimental results show comparable recognition performance with hidden Markov models (HMMs).},
  categories = {ml, svm, jaist}
}
@incollection{Nakai1997Book,
  author = {Nakai, Mitsuru and Singer, Harald and Sagisaka, Yoshinori and Shimodaira, Hiroshi},
  title = {{Accent Phrase Segmentation by F0 Clustering Using Superpositional Modeling}},
  booktitle = {Computing Prosody},
  editor = {Sagisaka, Yoshinori and Campbell, Nick and Higuchi, Norio},
  chapter = {22},
  publisher = {Springer-Verlag},
  month = {January},
  year = {1997},
  pages = {343--360},
  categories = {f0, atr, jaist}
}
@misc{Hofer_Shimodaira:sigg:2007,
  author = {Hofer, Gregor and Shimodaira, Hiroshi and Yamagishi, Junichi},
  title = {Speech-driven Head Motion Synthesis based on a Trajectory Model},
  howpublished = {Poster at SIGGRAPH 2007},
  address = {San Diego, USA},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/siggraph07.pdf}
}
@inproceedings{Matsushita2002HIS03,
  author = {Matsushita, Yoshinori and Kawamoto, Shin-ichi and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title = {{A Head-Behavior Synchronization Model with Utterance for Anthropomorphic Spoken-Dialog Agent}},
  month = {March},
  note = {(in Japanese)},
  year = {2002},
  booktitle = {Technical Report of IEICE, HIS2001},
  abstract = {A novel method of synchronously synthesizing the head motion of an anthropomorphic spoken dialog agent with its utterance is proposed. Although much efforts have been taken to synchronize the lip motion with utterance, very few research exist for such head-motion control. A neural network is employed to learn the relationship between the acoustic features of the utterance and the head motion that are measured by a motion-capturing system. The proposed method enables to simulate the facial animation automatically that moves synchronously with any given utterances. Subjective evaluation of the performance of the method is reported as well.},
  categories = {lifelike-agent, jaist}
}
@inproceedings{Keeni1998ICCLSDP,
  author     = {Keeni, Kanad and Shimodaira, Hiroshi and Nakayama, Kenji and Kotani, Kazunori},
  title      = {{On Parameter Initialization of Multi-layer Feed-forward Neural Networks for Pattern Recognition}},
  booktitle  = {International Conference on Computational Linguistics, Speech and Document Processing (ICCLSDP-'98), Calcutta, India},
  pages      = {D8--12},
  month      = {February},
  year       = {1998},
  categories = {ann, jaist}
}
@inproceedings{Shimodaira1993Eurospeech,
  author = {Shimodaira, Hiroshi and Nakai, Mitsuru},
  ps = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1993/euro93.ps.gz},
  title = {Accent Phrase Segmentation Using Transition Probabilities Between Pitch Pattern Templates},
  booktitle = {Proc. EuroSpeech'93},
  month = {September},
  pages = {1767--1770},
  year = {1993},
  abstract = {This paper proposes a novel method for segmenting continuous speech into accent phrases by using a prosodic feature 'pitch pattern'. The pitch pattern extracted from input speech signals is divided into the accent segments automatically by using the One-Stage DP algorithm, in which reference templates representing various types of accent patterns and connectivity between them are used to find out the optimum sequence of accent segments. In case of making the reference templates from a large number of training data, the LBG clustering algorithm is used to represent typical accent patterns by a small number of templates. Evaluation tests were carried out using the ATR continuous speech database of a male speaker. Experimental results showed more than 91 \% of phrase boundaries were correctly detected.},
  categories = {F0, jaist}
}
@inproceedings{Shimodaira1998SPR,
  author = {Shimodaira, Hiroshi and Rokui, Jun and Nakai, Mitsuru},
  title = {{Modified Minimum Classification Error Learning and Its Application to Neural Networks}},
  booktitle = {2nd International Workshop on Statistical Techniques in Pattern Recognition (SPR'98), Sydney, Australia},
  abstract = {A novel method to improve the generalization performance of the Minimum Classification Error (MCE) / Generalized Probabilistic Descent (GPD) learning is proposed. The MCE/GPD learning proposed by Juang and Katagiri in 1992 results in better recognition performance than the maximum-likelihood (ML) based learning in various areas of pattern recognition. Despite its superiority in recognition performance, as well as other learning algorithms, it still suffers from the problem of ``over-fitting'' to the training samples. In the present study, a regularization technique has been employed to the MCE learning to overcome this problem. Feed-forward neural networks are employed as a recognition platform to evaluate the recognition performance of the proposed method. Recognition experiments are conducted on several sorts of data sets.},
  year = {1998},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/spr98.pdf},
  categories = {mce, ann, jaist}
}
@inproceedings{Keeni1997ICPPOL,
  author = {Keeni, Kanad and Shimodaira, Hiroshi},
  title = {{On Representation of Output Layer for Recognizing Japanese Kana Characters Using Neural Networks}},
  booktitle = {Proc. the 17th International Conference on Computer Processing of Oriental Languages},
  month = {April},
  note = {Baptist University, Kowloon Tong, Hong Kong},
  year = {1997},
  pages = {305--308},
  categories = {ann, jaist}
}
@inproceedings{Shimodaira2002ICASSP,
  author = {Shimodaira, Hiroshi and Sakai, Nobuyoshi and Nakai, Mitsuru and Sagayama, Shigeki},
  title = {{Jacobian Joint Adaptation to Noise, Channel and Vocal Tract Length}},
  abstract = {A new Jacobian approach that linearly decomposes the composite of additive noise, multiplicative noise (channel transfer function) and speaker's vocal tract length, and adapts the acoustic model parameters simultaneously to these factors is proposed in this paper. Due to the fact that these factors non-linearly degrade the observed features for speech recognition, existing approaches fail to adapt the acoustic models adequately. Approximating the nonlinear operation by a linear model enables to employ the least square error estimation of the factors and adapt the acoustic model parameters with small amount of speech samples. Speech recognition experiments on ATR isolated word database demonstrate significant reduction of error rates, which supports the effectiveness of the proposed scheme.},
  month = {May},
  year = {2002},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Shimodaira2002ICASSP.pdf},
  booktitle = {Proc. ICASSP2002},
  pages = {197--200},
  categories = {asr, jaist}
}
@article{Nakai1994IEICE02,
  author = {Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title = {Prosodic Phrase Segmentation Based on Pitch-Pattern Clustering},
  journal = {Trans. IEICE (A)},
  volume = {J77-A},
  number = {2},
  pages = {206--214},
  year = {1994},
  month = {February},
  note = {(in Japanese)},
  categories = {F0, jaist}
}
@inproceedings{Keeni2001SPPRA,
  author = {Keeni, Kanad and Goto, Kunio and Shimodaira, Hiroshi},
  title = {{On Extraction of E-Mail Address from Fax Message for Automatic Delivery to Individual Recipient}},
  booktitle = {IASTED International Conference on Signal Processing Pattern Recognition and Application},
  year = {2001},
  categories = {nn, jaist},
  month = {July}
}
@incollection{Kawamoto2003Book,
  author = {Kawamoto, Shin-ichi and Shimodaira, Hiroshi and Sagayama, Shigeki and others},
  title = {{Galatea: Open-Source Software for Developing Anthropomorphic Spoken Dialog Agents}},
  booktitle = {Life-Like Characters. Tools, Affective Functions, and Applications},
  editor = {Prendinger, Helmut and others},
  publisher = {Springer},
  month = {November},
  pages = {187--212},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Kawamoto2003Book.pdf},
  abstract = {Galatea is a software toolkit to develop a human-like spoken dialog agent. In order to easily integrate the modules of different characteristics including speech recognizer, speech synthesizer, facial-image synthesizer and dialog controller, each module is modeled as a virtual machine having a simple common interface and connected to each other through a broker (communication manager). Galatea employs model-based speech and facial-image synthesizers whose model parameters are adapted easily to those for an existing person if his/her training data is given. The software toolkit that runs on both UNIX/Linux and Windows operating systems will be publicly available in the middle of 2003.},
  categories = {lifelike-agent, jaist}
}
@inproceedings{Nakai1998ICSLP,
  author = {Nakai, Mitsuru and Shimodaira, Hiroshi},
  title = {{The Use of F0 Reliability Function for Prosodic Command Analysis on F0 Contour Generation Model}},
  booktitle = {Proc. ICSLP'98},
  year = {1998},
  month = {December},
  categories = {asr, atr, jaist},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1998/Nakai1998ICSLP.pdf}
}
@inproceedings{Hofer_Shimodaira:proc:2007,
  author = {Hofer, Gregor and Shimodaira, Hiroshi},
  title = {Automatic Head Motion Prediction from Speech Data},
  booktitle = {Proc. Interspeech 2007},
  address = {Antwerp, Belgium},
  year = {2007},
  month = {August},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/interspeech07.pdf},
  abstract = {In this paper we present a novel approach to generate a sequence of head motion units given some speech. The modelling approach is based on the notion that head motion can be divided into a number of short homogeneous units that can each be modelled individually. The system is based on Hidden Markov Models (HMM), which are trained on motion units and act as a sequence generator. They can be evaluated by an accuracy measure. A database of motion capture data was collected and manually annotated for head motion and is used to train the models. It was found that the model is good at distinguishing high activity regions from regions with less activity with accuracies around 75 percent. Furthermore the model is able to distinguish different head motion patterns based on speech features somewhat reliably, with accuracies reaching almost 70 percent.}
}
@misc{Hofer_Shimodaira:sca:2007,
  author = {Hofer, Gregor and Shimodaira, Hiroshi and Yamagishi, Junichi},
  howpublished = {Poster at SCA 2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/sca07.pdf},
  address = {San Diego, USA},
  year = {2007},
  title = {Lip motion synthesis using a context dependent trajectory hidden {Markov} model}
}
@inproceedings{Nakai1994ICSLP,
  author = {Nakai, Mitsuru and Shimodaira, Hiroshi},
  title = {{Accent Phrase Segmentation by Finding N-best Sequences of Pitch Pattern Templates}},
  month = {September},
  year = {1994},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1994/Nakai1994ICSLP.pdf},
  booktitle = {Proc. ICSLP94, 8.10},
  pages = {347--350},
  categories = {F0, jaist}
}
@inproceedings{Nakai2001ICDAR,
  author = {Nakai, Mitsuru and Akira, Naoto and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title = {{Substroke Approach to {HMM}-based On-line Kanji Handwriting Recognition}},
  booktitle = {Proc. ICDAR'01},
  pages = {491--495},
  year = {2001},
  month = {September},
  categories = {hwr, jaist},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/Nakai2001ICDAR.pdf},
  abstract = {A new method is proposed for on-line handwriting recognition of Kanji characters. The method employs substroke HMMs as minimum units to constitute Japanese Kanji characters and utilizes the direction of pen motion. The main motivation is to fully utilize the continuous speech recognition algorithm by relating sentence speech to Kanji character, phonemes to substrokes, and grammar to Kanji structure. The proposed system consists input feature analysis, substroke HMMs, a character structure dictionary and a decoder. The present approach has the following advantages over the conventional methods that employ whole character HMMs. 1) Much smaller memory requirement for dictionary and models. 2) Fast recognition by employing efficient substroke network search. 3) Capability of recognizing characters not included in the training data if defined as a sequence of substrokes in the dictionary. 4) Capability of recognizing characters written by various different stroke orders with multiple definitions per one character in the dictionary. 5) Easiness in HMM adaptation to the user with a few sample character data.}
}
@article{Kanno1997IEICE01,
  author = {Kanno, Sukeyasu and Shimodaira, Hiroshi},
  volume = {J80-D-II},
  title = {{Voiced Sound Detection under Nonstationary and Heavy Noisy Environment Using the Prediction Error of Low-Frequency Spectrum}},
  journal = {Trans. IEICE (D-II)},
  number = {1},
  month = {January},
  note = {(in Japanese)},
  year = {1997},
  pages = {26--35},
  categories = {asr, jaist}
}
@inproceedings{Keeni1998ICONIP,
  author = {Keeni, Kanad and Nakayama, Kenji and Shimodaira, Hiroshi},
  title = {{Automatic Generation of Initial Weights and Target Outputs of Multi-layer Neural Networks and its Application to Pattern Classification}},
  month = {October},
  year = {1998},
  booktitle = {International Conference on Neural Information Processing (ICONIP'98)},
  pages = {1622--1625},
  categories = {ann, jaist}
}
@inproceedings{Rokui1999ICANN09,
  author = {Rokui, Jun and Shimodaira, Hiroshi},
  title = {{Multistage Building Learning based on Misclassification Measure}},
  booktitle = {9-th International Conference on Artificial Neural Networks, Edinburgh, UK},
  month = {September},
  year = {1999},
  categories = {nn, mce, jaist}
}
@inproceedings{cuayahuitletal_interspeech07,
  author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  title = {Hierarchical Dialogue Optimization Using {Semi-Markov} Decision Processes},
  booktitle = {Proc. Interspeech},
  month = {August},
  year = {2007},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/SMDPs-interspeech2007.pdf},
  abstract = {This paper addresses the problem of dialogue optimization on large search spaces. For such a purpose, in this paper we propose to learn dialogue strategies using multiple Semi-Markov Decision Processes and hierarchical reinforcement learning. This approach factorizes state variables and actions in order to learn a hierarchy of policies. Our experiments are based on a simulated flight booking dialogue system and compare flat versus hierarchical reinforcement learning. Experimental results show that the proposed approach produced a dramatic search space reduction (99.36\%), and converged four orders of magnitude faster than flat reinforcement learning with a very small loss in optimality (on average 0.3 system turns). Results also report that the learnt policies outperformed a hand-crafted one under three different conditions of ASR confidence levels. This approach is appealing to dialogue optimization due to faster learning, reusable subsolutions, and scalability to larger problems.},
  categories = {Spoken dialogue systems, semi-Markov decision processes, hierarchical reinforcement learning.}
}
@inproceedings{Nakai2002ICPR,
  author = {Nakai, Mitsuru and Sudo, Takashi and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title = {{Pen Pressure Features for Writer-Independent On-Line Handwriting Recognition Based on Substroke {HMM}}},
  month = {August},
  year = {2002},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Nakai2002ICPR.pdf},
  booktitle = {Proc. ICPR2002, III},
  pages = {220--223},
  categories = {hwr, jaist}
}
@inproceedings{Keeni1999ICCIMA,
  author = {Keeni, Kanad and Nakayama, Kenji and Shimodaira, Hiroshi},
  title = {{A Training Scheme for Pattern Classification Using Multi-layer Feed-forward Neural Networks}},
  month = {September},
  year = {1999},
  booktitle = {IEEE International Conference on Computational Intelligence and Multimedia Applications},
  pages = {307--311},
  categories = {ann, jaist}
}
@inproceedings{lips08-gregpr,
  author = {Hofer, Gregor and Yamagishi, Junichi and Shimodaira, Hiroshi},
  title = {Speech-driven Lip Motion Generation with a Trajectory {HMM}},
  booktitle = {Proc. Interspeech 2008},
  address = {Brisbane, Australia},
  month = {September},
  pages = {2314--2317},
  key = {lips08-gregpr},
  year = {2008},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/GregorLipsChallenge08.pdf},
  abstract = {Automatic speech animation remains a challenging problem that can be described as finding the optimal sequence of animation parameter configurations given some speech. In this paper we present a novel technique to automatically synthesise lip motion trajectories from a speech signal. The developed system predicts lip motion units from the speech signal and generates animation trajectories automatically employing a ``Trajectory Hidden Markov Model''. Using the MLE criterion, its parameter generation algorithm produces the optimal smooth motion trajectories that are used to drive control points on the lips directly. Additionally, experiments were carried out to find a suitable model unit that produces the most accurate results. Finally a perceptual evaluation was conducted, that showed that the developed motion units perform better than phonemes.},
  categories = {visual speech synthesis, trajectory HMM, HTS}
}
@inproceedings{Matsuda2000ICASSP,
  author = {Matsuda, Shigeki and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title = {{Asynchronous-Transition {HMM}}},
  booktitle = {Proc. ICASSP 2000 (Istanbul, Turkey), Vol. II},
  pages = {1001--1004},
  year = {2000},
  month = {June},
  categories = {asr, atr, jaist},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Matsuda2000ICASSP.pdf},
  abstract = {We propose a new class of hidden Markov model (HMM) called asynchronous-transition HMM (AT-HMM). Opposed to conventional HMMs where hidden state transition occurs simultaneously to all features, the new class of HMM allows state transitions asynchronous between individual features to better model asynchronous timings of acoustic feature changes. In this paper, we focus on a particular class of AT-HMM with sequential constraints introducing a concept of ``state tying across time''. To maximize the advantage of the new model, we also introduce feature-wise state tying technique. Speaker-dependent speech recognition experiments demonstrated that reduced error rates more than 30\% and 50\% in phoneme and isolated word recognition, respectively, compared with conventional HMMs.}
}
@inproceedings{Nakai2003ICDAR,
  author = {Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title = {{Generation of Hierarchical Dictionary for Stroke-order Free Kanji Handwriting Recognition Based on Substroke {HMM}}},
  abstract = {This paper describes a method of generating a Kanji hierarchical structured dictionary for stroke-number and stroke-order free handwriting recognition based on sub-stroke HMM. In stroke-based methods, a large number of stroke-order variations can be easily expressed by just adding different stroke sequences to the dictionary and it is not necessary to train new reference patterns. The hierarchical structured dictionary has an advantage that thousands of stroke-order variations of Kanji characters can be produced using a small number of stroke-order rules defining Kanji parts. Moreover, the recognition speed is fast since common sequences are shared in a substroke network, even if the total number of stroke-order combinations becomes enormous practically. In experiments, 300 different stroke-order rules of Kanji parts were statistically chosen by using 60 writers' handwritings of 1,016 educational Kanji characters. By adding these new stroke-order rules to the dictionary, about 9,000 variations of different stroke-orders were generated for 2,965 JIS 1st level Kanji characters. As a result, we successfully improved the recognition accuracy from 82.6\% to 90.2\% for stroke-order free handwritings.},
  month = {August},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/Nakai2003ICDAR.pdf},
  booktitle = {Proc. ICDAR2003},
  pages = {514--518},
  categories = {HWR, jaist}
}
@inproceedings{Shimodaira1992ICSLP,
  author = {Shimodaira, Hiroshi and Nakai, Mitsuru},
  title = {Robust Pitch Detection by Narrow Band Spectrum Analysis},
  booktitle = {Proc. ICSLP-92},
  pages = {1597--1600},
  year = {1992},
  month = {October},
  categories = {F0, jaist},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1992/icslp92.pdf},
  abstract = {This paper proposes a new technique for detecting pitch patterns which is useful for automatic speech recognition, by using a narrow band spectrum analysis. The motivation of this approach is that humans perceive some kind of pitch in whispers where no fundamental frequencies can be observed, while most of the pitch determination algorithm (PDA) fails to detect such perceptual pitch. The narrow band spectrum analysis enable us to find pitch structure distributed locally in frequency domain. Incorporating this technique into PDA's is realized to applying the technique to the lag window based PDA. Experimental results show that pitch detection performance could be improved by 4\% for voiced sounds and 8\% for voiceless sounds.}
}
@article{10.1109/MCG.2011.71,
  author = {Berger, Michael A. and Hofer, Gregor and Shimodaira, Hiroshi},
  publisher = {IEEE Computer Society},
  doi = {10.1109/MCG.2011.71},
  title = {Carnival -- Combining Speech Technology and Computer Animation},
  journal = {IEEE Computer Graphics and Applications},
  issn = {0272-1716},
  volume = {31},
  year = {2011},
  pages = {80--89},
  address = {Los Alamitos, CA, USA}
}
@inproceedings{cuayahuitletal_asru05,
  author = {Cuayáhuitl, Heriberto and Renals, Steve and Lemon, Oliver and Shimodaira, Hiroshi},
  title = {Human-Computer Dialogue Simulation Using Hidden {Markov} Models},
  booktitle = {Proc. IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU)},
  month = {November},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/hcp-asru2005.pdf},
  abstract = {This paper presents a probabilistic method to simulate task-oriented human-computer dialogues at the intention level, that may be used to improve or to evaluate the performance of spoken dialogue systems. Our method uses a network of Hidden Markov Models (HMMs) to predict system and user intentions, where a ``language model'' predicts sequences of goals and the component HMMs predict sequences of intentions. We compare standard HMMs, Input HMMs and Input-Output HMMs in an effort to better predict sequences of intentions. In addition, we propose a dialogue similarity measure to evaluate the realism of the simulated dialogues. We performed experiments using the DARPA Communicator corpora and report results with three different metrics: dialogue length, dialogue similarity and precision-recall.},
  categories = {dialogue simulation, hidden markov models}
}
@inproceedings{Sagayama2001ISCA08b,
  author = {Sagayama, Shigeki and Shinoda, Koichi and Nakai, Mitsuru and Shimodaira, Hiroshi},
  title = {{Analytic Methods for Acoustic Model Adaptation: A Review}},
  month = {August},
  note = {Invited Paper},
  year = {2001},
  booktitle = {Proc. ISCA Workshop on Adaptation Methods (Sophia Antipolis France)},
  pages = {67--76},
  categories = {asr, jaist}
}
@inproceedings{Shimodaira:kes06,
  author = {Shimodaira, Chie and Shimodaira, Hiroshi and Kunifuji, Susumu},
  title = {{A Divergent-Style Learning Support Tool for English Learners Using a Thesaurus Diagram}},
  booktitle = {{Proc. KES2006}},
  address = {Bournemouth, United Kingdom},
  year = {2006},
  month = {October},
  categories = {knowledge engineering},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2006/kes2006.pdf},
  abstract = {This paper proposes an English learning support tool which provides users with divergent information to find the right words and expressions. In contrast to a number of software tools for English translation and composition, the proposed tool is designed to give users not only the right answer to the user's query but also a lot of words and examples which are relevant to the query. Based on the lexical information provided by the lexical database, WordNet, the proposed tool provides users with a thesaurus diagram, in which synonym sets and relation links are presented in multiple windows to help users to choose adequate words and understand similarities and differences between words. Subjective experiments are carried out to evaluate the system.}
}
@inproceedings{Keeni2003ICONIP,
  author = {Keeni, Kanad and Goto, Kunio and Shimodaira, Hiroshi},
  title = {{Automatic Filtering of Network Intrusion Detection System Alarms Using Multi-layer Feed-forward Neural Networks}},
  month = {June},
  year = {2003},
  booktitle = {International Conference on Neural Information Processing (ICONIP2003)},
  categories = {ann}
}
@article{Tokuno2005IEICE01,
  author = {Tokuno, Junko and Inami, Nobuhito and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  volume = {J88-D2},
  title = {{Context-dependent Sub-stroke Model for {HMM}-based On-line Handwriting Recognition}},
  journal = {Trans. IEICE (D-II)},
  number = {8},
  month = {August},
  note = {(in press), (in Japanese)},
  year = {2005},
  abstract = {A new method is proposed for on-line Kanji handwriting recognition. The method employs sub-stroke HMMs as minimum units to constitute Kanji characters and utilizes the direction of pen motion. The present approach has the following advantages over the conventional methods that employ character HMMs. 1) Much smaller memory requirement for dictionary and models. 2) Fast recognition by employing efficient sub-stroke network search. 3) Capability of recognizing characters not included in the training data if defined as a sequence of sub-strokes in the dictionary. In experiments, we have achieved a correct recognition rate of above 96\% by using JAIST-IIPL database that includes 1,016 educational Kanji characters.},
  categories = {online handwritten character recognition}
}
@article{Nakai1997IEICE,
  author = {Nakai, Mitsuru and Singer, Harald and Sagisaka, Yoshimori and Shimodaira, Hiroshi},
  title = {{Accent Phrase Segmentation Based on F0 Templates Using a Superpositional Prosodic Model}},
  journal = {Trans. IEICE (D-II)},
  volume = {J80-D-II},
  number = {10},
  pages = {2605--2614},
  year = {1997},
  month = {October},
  note = {(in Japanese)},
  categories = {jaist}
}
@inproceedings{Keeni2002AIA,
  author = {Keeni, Kanad and Shimodaira, Hiroshi},
  title = {{On Selection of Training Data for Fast Learning of Neural Networks Using Back Propagation}},
  month = {September},
  year = {2002},
  booktitle = {IASTED International Conference on Artificial Intelligence and Application (AIA2002)},
  pages = {474--478}
}
@inproceedings{Rokui1998ICONIP,
  author = {Rokui, Jun and Shimodaira, Hiroshi},
  title = {{Modified Minimum Classification Error Learning and Its Application to Neural Networks}},
  booktitle = {ICONIP'98, Kitakyushu, Japan},
  year = {1998},
  month = {October},
  categories = {ann, mce, jaist}
}
@inproceedings{Iida1998IIZUKA,
  author = {Iida, Eiji and Shimodaira, Hiroshi and Kunifuji, Susumu and Kimura, Masayuki},
  title = {{A System to Perform Human Problem Solving}},
  booktitle = {The 5th International Conference on Soft Computing and Information / Intelligent Systems (IIZUKA'98)},
  month = {October},
  year = {1998},
  categories = {jaist}
}
@inproceedings{Shimodaira2000ICSLP10,
  author = {Shimodaira, Hiroshi and Akae, Toshihiko and Nakai, Mitsuru and Sagayama, Shigeki},
  title = {{Jacobian Adaptation of {HMM} with Initial Model Selection for Noisy Speech Recognition}},
  booktitle = {Proc. ICSLP2000},
  pages = {1003--1006},
  year = {2000},
  month = {October},
  categories = {asr, jaist},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/Shimodaira2000ICSLP10.pdf},
  abstract = {An extension of Jacobian Adaptation (JA) of HMMs for degraded speech recognition is presented in which appropriate set of initial models is selected from a number of initial-model sets designed for different noise environments. Based on the first order Taylor series approximation in the acoustic feature domain, JA adapts the acoustic model parameters trained in the initial noise environment A to the new environment B much faster than PMC that creates the acoustic models for the target environment from scratch. Despite the advantage of JA to PMC, JA has a theoretical limitation that the change of acoustic parameters from the environment A to B should be small in order that the linear approximation holds. To extend the coverage of JA, the ideas of multiple sets of initial models and their automatic selection scheme are discussed. Speaker-dependent isolated-word recognition experiments are carried out to evaluate the proposed method.}
}
@comment{NOTE(review) Koba1995HCIa: the author field below starts with a bare "and", so the leading author(s) appear to have been lost upstream. They cannot be recovered from this file; restore them from the original publication.}
@inproceedings{Koba1995HCIa,
  author = {and Hiroshi Shimodaira},
  title = {{HI Design Based on the Costs of Human Information-processing Model}},
  booktitle = {HCI International '95},
  month = {July},
  year = {1995},
  categories = {HI, jaist}
}
@inproceedings{Tokuno2003HCII,
  author = {Tokuno, Junko and Akira, Naoto and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title = {{Blind-handwriting Interface for Wearable Computing}},
  month = {June},
  pages = {303--307},
  year = {2003},
  booktitle = {Proc. Human - Computer Interaction (HCI) International 2003, Volume 2},
  abstract = {This paper proposes a novel input interface that we call ``blind handwriting'' for wearable computing. The blind handwriting, which is a word similar to ``blind typing'' of keyboard, is a particular writing style where the user does not see the pen or the finger movement. Without visual feedback, written characters are distorted, as in the case when the user is blindfolded, and therefore existing on-line handwriting recognition systems fail to recognize them correctly. The sub-stroke based hidden Markov model approach is employed to tackle this problem. When the pen or touch pad is used as an input device, the proposed interface demonstrates a recognition rate of 83\% on a test set of 61 people where each person wrote 1016 Japanese Kanji characters.},
  categories = {HWR, jaist}
}
@inproceedings{Kawamoto2002PRICAI,
  author = {Kawamoto, Shin-ichi and Shimodaira, Hiroshi and Nitta, Tsuneo and Nishimoto, Takuya and Nakamura, Satoshi and Itou, Katsunobu and Morishima, Shigeo and Yotsukura, Tatsuo and Kai, Atsuhiko and Lee, Akinobu and Yamashita, Yoichi and Kobayashi, Takao and Tokuda, Keiichi and Hirose, Keikichi and Minematsu, Nobuaki and Yamada, Atsushi and Den, Yasuharu and Utsuro, Takehito and Sagayama, Shigeki},
  title = {{Open-source software for developing anthropomorphic spoken dialog agent}},
  booktitle = {Proc. PRICAI-02, International Workshop on Lifelike Animated Agents},
  pages = {64--69},
  year = {2002},
  month = {August},
  categories = {lifelike-agent, jaist},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Kawamoto2002PRICAI.pdf}
}
@inproceedings{Shimodaira:iwfhr06,
  author = {Tokuno, Junko and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki and Nakagawa, Masaki},
  title = {{On-line Handwritten Character Recognition Selectively employing Hierarchical Spatial Relationships among Subpatterns}},
  booktitle = {{Proc. IWFHR-10}},
  address = {La Baule, France},
  year = {2006},
  month = {October},
  categories = {online handwriting recognition},
  abstract = {This paper proposes an on-line handwritten character pattern recognition method that examines spatial relationships among subpatterns which are components of a character pattern. Conventional methods evaluating spatial relationships among subpatterns have not considered characteristics of deformed handwritings and evaluate all the spatial relationships equally. However, the deformations of spatial features are different within a character pattern. In our approach, we assume that the distortions of spatial features are dependent on the hierarchy of character patterns so that we selectively evaluate hierarchical spatial relationships of subpatterns by employing Bayesian network as a post-processor of our sub-stroke based HMM recognition system. Experiments of on-line handwritten Kanji character recognition with a lexicon of 1,016 elementary characters revealed that the approach we propose improves the recognition accuracy for different types of deformations.}
}
@inproceedings{Takeda2002MMSP,
  author = {Takeda, Haruto and Saito, Naoki and Otsuki, Tomoshi and Nakai, Mitsuru and Shimodaira, Hiroshi and Sagayama, Shigeki},
  title = {{Hidden Markov Model for Automatic Transcription of MIDI Signals}},
  month = {December},
  year = {2002},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2002/Takeda2002MMSP12.pdf},
  booktitle = {2002 International Workshop on Multimedia Signal Processing}
}
@comment{NOTE(review) Braude2013a describes the same paper (same title, PDF and abstract) as the entry braude2013template in this file. Both keys are kept so existing citations still resolve; consider consolidating.}
@inproceedings{Braude2013a,
  author = {Braude, David Adam and Shimodaira, Hiroshi and Ben Youssef, Atef},
  title = {Template-Warping Based Speech Driven Head Motion Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  pages = {2763--2767},
  year = {2013},
  keywords = {Head motion synthesis, GMMs, IOMM},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Braude-etal_IS13.pdf},
  abstract = {We propose a method for synthesising head motion from speech using a combination of an Input-Output Markov model (IOMM) and Gaussian mixture models trained in a supervised manner. A key difference of this approach compared to others is to model the head motion in each angle as a series of templates of motion rather than trying to recover a frame-wise function. The templates were chosen to reflect natural patterns in the head motion, and states for the IOMM were chosen based on statistics of the templates. This reduces the search space for the trajectories and stops impossible motions such as discontinuities from being possible. For synthesis our system warps the templates to account for the acoustic features and the other angles' warping parameters. We show our system is capable of recovering the statistics of the motion that were chosen for the states. Our system was then compared to a baseline that used a frame-wise mapping that is based on previously published work. A subjective preference test that includes multiple speakers showed participants have a preference for the segment based approach. Both of these systems were trained on storytelling free speech.}
}
@inproceedings{benyoussef:IS2013,
  author = {Ben Youssef, Atef and Shimodaira, Hiroshi and Braude, David A.},
  title = {Articulatory features for speech-driven head motion synthesis},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  pages = {2758--2762},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/BenYoussef-et-al_IS13.pdf},
  abstract = {This study investigates the use of articulatory features for speech-driven head motion synthesis as opposed to prosody features such as F0 and energy which have been mainly used in the literature. In the proposed approach, multi-stream HMMs are trained jointly on the synchronous streams of speech and head motion data. Articulatory features can be regarded as an intermediate parametrisation of speech that are expected to have a close link with head movement. Measured head and articulatory movements acquired by EMA were synchronously recorded with speech. Measured articulatory data was compared to those predicted from speech using an HMM-based inversion mapping system trained in a semi-supervised fashion. Canonical correlation analysis (CCA) on a data set of free speech of 12 people shows that the articulatory features are more correlated with head rotation than prosodic and/or cepstral speech features. It is also shown that the synthesised head motion using articulatory features give higher correlations with the original head motion than when only prosodic features are used.}
}
@inproceedings{braude2013template,
  author = {Braude, David A. and Shimodaira, Hiroshi and Ben Youssef, Atef},
  title = {Template-Warping Based Speech Driven Head Motion Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  pages = {2763--2767},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Braude-etal_IS13.pdf},
  abstract = {We propose a method for synthesising head motion from speech using a combination of an Input-Output Markov model (IOMM) and Gaussian mixture models trained in a supervised manner. A key difference of this approach compared to others is to model the head motion in each angle as a series of templates of motion rather than trying to recover a frame-wise function. The templates were chosen to reflect natural patterns in the head motion, and states for the IOMM were chosen based on statistics of the templates. This reduces the search space for the trajectories and stops impossible motions such as discontinuities from being possible. For synthesis our system warps the templates to account for the acoustic features and the other angles' warping parameters. We show our system is capable of recovering the statistics of the motion that were chosen for the states. Our system was then compared to a baseline that used a frame-wise mapping that is based on previously published work. A subjective preference test that includes multiple speakers showed participants have a preference for the segment based approach. Both of these systems were trained on storytelling free speech.}
}
@inproceedings{benyoussef:iva2013,
  author = {Ben Youssef, Atef and Shimodaira, Hiroshi and Braude, David A.},
  title = {Head Motion Analysis and Synthesis over Different Tasks},
  booktitle = {Proc. Intelligent Virtual Agents},
  month = {September},
  pages = {285--294},
  year = {2013},
  publisher = {Springer},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/BenYoussef-et-al_IVA13.pdf},
  abstract = {It is known that subjects vary in their head movements. This paper presents an analysis of this variety over different tasks and speakers and their impact on head motion synthesis. Measured head and articulatory movements acquired by an ElectroMagnetic Articulograph (EMA) synchronously recorded with audio was used. Data set of speech of 12 people recorded on different tasks confirms that the head motion variate over tasks and speakers. Experimental results confirmed that the proposed models were capable of learning and synthesising task-dependent head motions from speech. Subjective evaluation of synthesised head motion using task models shows that trained models on the matched task is better than mismatched one and free speech data provide models that predict preferred motion by the participants compared to read speech data.}
}
@inproceedings{braude:iva2013,
  author = {Braude, David A. and Shimodaira, Hiroshi and Ben Youssef, Atef},
  title = {The {University of Edinburgh} Head-Motion and Audio Storytelling ({UoE-HAS}) Dataset},
  booktitle = {Proc. Intelligent Virtual Agents},
  year = {2013},
  pages = {466--467},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Braude-etal_IVA2013.pdf},
  publisher = {Springer},
  abstract = {In this paper we announce the release of a large dataset of storytelling monologue with motion capture for the head and body. Initial tests on the dataset indicate that head motion is more dependant on the speaker than the style of speech.}
}
@inproceedings{benyoussef_shimodaira_icassp2014,
  author = {Ben Youssef, Atef and Shimodaira, Hiroshi and Braude, David A.},
  title = {Speech driven Talking Head from Estimated Articulatory Features},
  booktitle = {Proc. ICASSP},
  address = {Florence, Italy},
  month = {May},
  pages = {4606--4610},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/benyoussef_etal_icassp2014.pdf},
  abstract = {In this paper, we present a talking head in which the lips and head motion are controlled using articulatory movements estimated from speech. A phonesize HMM-based inversion mapping is employed and trained in a semi-supervised fashion. The advantage of the use of articulatory features is that they can drive the lips motions and they have a close link with head movements. Speech inversion normally requires the training data recorded with electromagnetic articulograph (EMA), which restricts the naturalness of head movements. The present study considers a more realistic recording condition where the training data for the target speaker are recorded with a usual motion capture system rather than EMA. Different temporal clustering techniques are investigated for HMM-based mapping as well as a GMM-based frame-wise mapping as a baseline system. Objective and subjective experiments show that the synthesised motions are more natural using an HMM system than a GMM one, and estimated EMA features outperform prosodic features.},
  categories = {acoustic-articulatory, inversion mapping, MLPG, talking heads}
}