The Centre for Speech Technology Research, The University of Edinburgh

Publications by Felipe Espic

s1373426.bib

@inproceedings{ronanki_blizzard2017,
  author = {Ronanki, Srikanth and Ribeiro, {Manuel Sam} and Espic, Felipe and Watts, Oliver},
  title = {{The CSTR entry to the Blizzard Challenge 2017}},
  booktitle = {Proc. Blizzard Challenge Workshop (Interspeech Satellite)},
  year = {2017},
  month = {August},
  key = {ronanki_blizzard2017},
  address = {Stockholm, Sweden},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/CSTR_Blizzard2017.pdf},
  abstract = {The annual Blizzard Challenge conducts side-by-side testing of a number of speech synthesis systems trained on a common set of speech data. Similar to the 2016 Blizzard Challenge, the task for this year is to train on expressively-read children's story-books, and to synthesise speech in the same domain. The Challenge therefore presents an opportunity to investigate the effectiveness of several techniques we have developed when applied to expressive and prosodically-varied audiobook data. This paper describes the text-to-speech system entered by The Centre for Speech Technology Research into the 2017 Blizzard Challenge. The current system is a hybrid synthesis system which drives a unit selection synthesiser using the output from a neural network based acoustic and duration model. We assess the performance of our system by reporting the results from formal listening tests provided by the challenge.},
  categories = {Merlin, hybrid speech synthesis, unit selection, deep neural networks}
}
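
A rough illustration of the hybrid idea in the abstract above: a neural network predicts acoustic (and duration) targets, and a unit selection synthesiser scores candidate units against them. The sketch below is a hedged toy example; the function name target_cost and the plain Euclidean frame distance are illustrative assumptions, not CSTR's actual cost function.

import numpy as np

def target_cost(predicted, candidate):
    # Mean per-frame Euclidean distance between DNN-predicted acoustics
    # and a candidate unit's acoustics (both arrays: frames x features),
    # truncated to the shorter of the two (a stand-in for real alignment).
    n = min(len(predicted), len(candidate))
    return float(np.mean(np.linalg.norm(predicted[:n] - candidate[:n], axis=1)))

# Toy usage: pick the candidate unit closest to the predicted targets.
pred = np.random.randn(40, 60)
units = [np.random.randn(40, 60) for _ in range(3)]
best = min(units, key=lambda u: target_cost(pred, u))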
@inproceedings{Espic2017,
  author = {Espic, Felipe and Valentini-Botinhao, Cassia and King, Simon},
  title = {Direct Modelling of Magnitude and Phase Spectra for Statistical Parametric Speech Synthesis},
  booktitle = {Proc. Interspeech},
  address = {Stockholm, Sweden},
  month = aug,
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/1647.PDF},
  abstract = {We propose a simple new representation for the FFT spectrum tailored to statistical parametric speech synthesis. It consists of four feature streams that describe magnitude, phase and fundamental frequency using real numbers. The proposed feature extraction method does not attempt to decompose the speech structure (e.g., into source+filter or harmonics+noise). By avoiding the simplifications inherent in decomposition, we can dramatically reduce the “phasiness” and “buzziness” typical of most vocoders. The method uses simple and computationally cheap operations and can operate at a lower frame rate than the 200 frames-per-second typical in many systems. It avoids heuristics and methods requiring approximate or iterative solutions, including phase unwrapping. Two DNN-based acoustic models were built - from male and female speech data - using the Merlin toolkit. Subjective comparisons were made with a state-of-the-art baseline, using the STRAIGHT vocoder. In all variants tested, and for both male and female voices, the proposed method substantially outperformed the baseline. We provide source code to enable our complete system to be replicated.},
  categories = {speech synthesis, vocoding, speech features, phase modelling, spectral representation}
}
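
The abstract above describes magnitude, phase and fundamental frequency as real-valued feature streams extracted without phase unwrapping. Below is a minimal sketch of one way to achieve that, assuming a simple (cos, sin) encoding of the wrapped phase; the function name frame_features and the Hann windowing are illustrative choices, not the paper's exact extraction method.

import numpy as np

def frame_features(frame):
    # FFT of one windowed frame.
    spec = np.fft.rfft(frame * np.hanning(len(frame)))
    mag = np.abs(spec)          # magnitude stream
    phase = np.angle(spec)      # wrapped phase in (-pi, pi]
    # Encoding phase as (cos, sin) pairs keeps every stream real-valued
    # and continuous, so no unwrapping or iterative solution is needed.
    return mag, np.cos(phase), np.sin(phase)

# Toy usage: features for one 32 ms frame of a 220 Hz tone at 16 kHz.
t = np.arange(512) / 16000.0
mag, cos_phi, sin_phi = frame_features(np.sin(2 * np.pi * 220 * t))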
@inproceedings{Espic2016,
  author = {Espic, Felipe and Valentini-Botinhao, Cassia and Wu, Zhizheng and King, Simon},
  title = {Waveform generation based on signal reshaping for statistical parametric speech synthesis},
  booktitle = {Proc. Interspeech},
  address = {San Francisco, CA, USA},
  abstract = {We propose a new paradigm of waveform generation for Statistical Parametric Speech Synthesis that is based on neither source-filter separation nor sinusoidal modelling. We suggest that one of the main problems of current vocoding techniques is that they perform an extreme decomposition of the speech signal into source and filter, which is an underlying cause of “buzziness”, “musical artifacts”, or “muffled sound” in the synthetic speech. The proposed method avoids making unnecessary assumptions and decompositions as far as possible, and uses only the spectral envelope and F0 as parameters. Prerecorded speech is used as a base signal, which is “reshaped” to match the acoustic specification predicted by the statistical model, without any source-filter decomposition. A detailed description of the method is presented, including implementation details and adjustments. Subjective listening test evaluations of complete DNN-based text-to-speech systems were conducted for two voices: one female and one male. The results show that the proposed method tends to outperform the state-of-the-art standard vocoder STRAIGHT, whilst using fewer acoustic parameters.},
  month = {September},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/0487.PDF},
  pages = {2263--2267},
  categories = {speech synthesis, waveform generation, vocoding, statistical parametric speech synthesis}
}
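
The "reshaping" idea above can be pictured as correcting only the spectral envelope of a prerecorded base signal while keeping its fine structure and phase. The sketch below is a hedged single-frame toy, assuming both envelopes are given as smooth magnitude spectra on the rfft bin grid; reshape_frame is a hypothetical helper, and the real system is considerably more elaborate.

import numpy as np

def reshape_frame(base_frame, base_env, target_env, eps=1e-8):
    spec = np.fft.rfft(base_frame)
    # Multiply by the envelope ratio: the base signal's fine structure
    # (including its phase) is preserved; only the envelope is corrected.
    spec *= target_env / np.maximum(base_env, eps)
    return np.fft.irfft(spec, n=len(base_frame))

# Toy usage with flat dummy envelopes: the frame is returned unchanged.
frame = np.random.randn(512)
env = np.ones(257)              # a 512-point rfft gives 257 bins
out = reshape_frame(frame, env, env)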
@inproceedings{Villavicencio+2016,
  author = {Villavicencio, Fernando and Yamagishi, Junichi and Bonada, Jordi and Espic, Felipe},
  doi = {10.21437/Interspeech.2016-305},
  title = {Applying Spectral Normalisation and Efficient Envelope Estimation and Statistical Transformation for the Voice Conversion Challenge 2016},
  url = {http://hdl.handle.net/10230/32891},
  booktitle = {Proc. Interspeech},
  address = {San Francisco, USA},
  abstract = {In this work we present our entry for the Voice Conversion Challenge 2016, which extends previous work on GMM-based voice conversion with new features. We incorporate frequency warping and pitch transposition strategies to perform a normalisation of the spectral conditions, with benefits confirmed by objective and perceptual means. Moreover, the results of the challenge showed our entry among the highest performing systems in terms of perceived naturalness while maintaining the target similarity performance of GMM-based conversion.},
  month = sep,
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/0305.PDF},
  pages = {1657--1661},
  categories = {voice conversion, speech synthesis, statistical spectral transformation, spectral envelope modeling}
}
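
One common way to realise the frequency-warping normalisation mentioned in the abstract above is to resample a spectral envelope along a warped frequency axis. The sketch below uses a one-parameter bilinear warp as an assumption; the paper's actual warping strategy may differ, and bilinear_warp/warp_envelope are hypothetical names.

import numpy as np

def bilinear_warp(freqs_norm, alpha):
    # Bilinear (all-pass) warp of normalised frequencies in [0, 1];
    # alpha in (-1, 1) controls the direction and strength of the warp.
    w = np.pi * freqs_norm
    warped = w + 2.0 * np.arctan2(alpha * np.sin(w), 1.0 - alpha * np.cos(w))
    return warped / np.pi

def warp_envelope(env, alpha):
    # Evaluate the envelope on the warped axis by linear interpolation.
    f = np.linspace(0.0, 1.0, len(env))
    return np.interp(bilinear_warp(f, alpha), f, env)

# Toy usage: mildly warp a dummy 257-bin magnitude envelope.
env = np.linspace(1.0, 0.1, 257)
warped = warp_envelope(env, alpha=0.1)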
@article{POBLETE20151,
  author = {Poblete, Victor and Espic, Felipe and King, Simon and Stern, Richard M. and Huenupan, Fernando and Fredes, Josue and Yoma, Nestor Becerra},
  doi = {10.1016/j.csl.2014.10.006},
  title = {A perceptually-motivated low-complexity instantaneous linear channel normalization technique applied to speaker verification},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230814001053},
  journal = {Computer Speech \& Language},
  issn = {0885-2308},
  number = {1},
  abstract = {This paper proposes a new set of speech features called Locally-Normalized Cepstral Coefficients (LNCC) that are based on Seneff's Generalized Synchrony Detector (GSD). First, an analysis of the GSD frequency response is provided to show that it generates spurious peaks at harmonics of the detected frequency. Then, the GSD frequency response is modeled as a quotient of two filters centered at the detected frequency. The numerator is a triangular band pass filter centered around a particular frequency similar to the ordinary Mel filters. The denominator term is a filter that responds maximally to frequency components on either side of the numerator filter. As a result, a local normalization is performed without the spurious peaks of the original GSD. Speaker verification results demonstrate that the proposed LNCC features are of low computational complexity and far more effectively compensate for spectral tilt than ordinary MFCC coefficients. LNCC features do not require the computation and storage of a moving average of the feature values, and they provide relative reductions in Equal Error Rate (EER) as high as 47.7%, 34.0% or 25.8% when compared with MFCC, MFCC+CMN, or MFCC+RASTA in one case of variable spectral tilt, respectively.},
  volume = {31},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/paper_temp.pdf},
  pages = {1--27},
  categories = {Channel robust feature extraction, Auditory models, Spectral local normalization, Synchrony detection}
}
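
The quotient-of-filters structure described in the abstract above lends itself to a compact sketch: each triangular band-pass output is divided by the energy captured by a flanking filter on either side, and cepstral coefficients are taken with a DCT. Everything below (the name lncc_like, the filter shapes and counts) is an illustrative assumption, not the paper's exact LNCC design.

import numpy as np
from scipy.fft import dct

def lncc_like(power_spec, n_filters=20, n_ceps=13, eps=1e-10):
    n_bins = len(power_spec)
    centres = np.linspace(0, n_bins - 1, n_filters + 2)
    bins = np.arange(n_bins)
    feats = np.empty(n_filters)
    for i in range(n_filters):
        lo, c, hi = centres[i], centres[i + 1], centres[i + 2]
        # Numerator: ordinary triangular band-pass filter centred at c.
        tri = np.maximum(0.0, np.minimum((bins - lo) / (c - lo + eps),
                                         (hi - bins) / (hi - c + eps)))
        # Denominator: the complement within [lo, hi], which responds
        # maximally on either side of the numerator filter and performs
        # the instantaneous local normalisation.
        flank = ((bins >= lo) & (bins <= hi)).astype(float) - tri
        num = np.sum(tri * power_spec)
        den = np.sum(flank * power_spec)
        feats[i] = np.log((num + eps) / (den + eps))
    return dct(feats, norm='ortho')[:n_ceps]

# Toy usage on a random power spectrum (e.g. |FFT|^2 of one frame).
ceps = lncc_like(np.abs(np.random.randn(257)) ** 2)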