<?xml version="1.0" encoding="utf-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.0 20120330//EN" "JATS-journalpublishing1.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">INFORMATICA</journal-id>
<journal-title-group><journal-title>Informatica</journal-title></journal-title-group>
<issn pub-type="epub">1822-8844</issn>
<issn pub-type="ppub">0868-4952</issn>
<issn-l>0868-4952</issn-l>
<publisher>
<publisher-name>Vilnius University</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">INFO1233</article-id>
<article-id pub-id-type="doi">10.15388/Informatica.2019.219</article-id>
<article-categories><subj-group subj-group-type="heading">
<subject>Research Article</subject></subj-group></article-categories>
<title-group>
<article-title>Comparison of Phonemic and Graphemic Word to Sub-Word Unit Mappings for Lithuanian Phone-Level Speech Transcription</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Raškinis</surname><given-names>Gailius</given-names></name><email xlink:href="gailius.raskinis@vdu.lt">gailius.raskinis@vdu.lt</email><xref ref-type="aff" rid="j_info1233_aff_001">1</xref><xref ref-type="aff" rid="j_info1233_aff_004">4</xref><xref ref-type="corresp" rid="cor1">∗</xref><bio>
<p><bold>G. Raškinis</bold> (born in 1972) received a PhD in the field of informatics in 2000. Presently, he works at the Center of Computational Linguistics and teaches at the Faculty of Informatics of Vytautas Magnus University. His research interests include application of machine learning techniques to music recognition, speech recognition and natural language processing.</p></bio>
</contrib>
<contrib contrib-type="author">
<name><surname>Paškauskaitė</surname><given-names>Gintarė</given-names></name><email xlink:href="gintare.paskauskaite@ktu.lt">gintare.paskauskaite@ktu.lt</email><xref ref-type="aff" rid="j_info1233_aff_002">2</xref><bio>
<p><bold>G. Paškauskaitė</bold> (born in 1990) received BS and MS degrees from the Department of Automatics, Kaunas University of Technology. She is a PhD student in the Kaunas University of Technology from 2016. Her main research interests include automatic Lithuanian speech recognition.</p></bio>
</contrib>
<contrib contrib-type="author">
<name><surname>Saudargienė</surname><given-names>Aušra</given-names></name><email xlink:href="ausra.saudargiene@lsmuni.lt">ausra.saudargiene@lsmuni.lt</email><xref ref-type="aff" rid="j_info1233_aff_001">1</xref><xref ref-type="aff" rid="j_info1233_aff_003">3</xref><bio>
<p><bold>A. Saudargienė</bold> (born in 1970) received a PhD degree in the field of informatics from the Institute of Mathematics and Informatics, Vilnius. Currently she works at the Department of Applied Informatics, Vytautas Magnus University, and Neuroscience Institute, Lithuanian University of Health Sciences. Her research field is learning and memory in artificial and biological neural systems.</p></bio>
</contrib>
<contrib contrib-type="author">
<name><surname>Kazlauskienė</surname><given-names>Asta</given-names></name><email xlink:href="asta.kazlauskiene@vdu.lt">asta.kazlauskiene@vdu.lt</email><xref ref-type="aff" rid="j_info1233_aff_001">1</xref><bio>
<p><bold>A. Kazlauskienė</bold> (born in 1964) received a doctor’s degree in the field of humanities (philology) in 1998. She teaches at the Department of Lithuanian Studies of Vytautas Magnus University. Her research interests are phonology, phonotactics, accentuation, rhythm, applied linguistics.</p></bio>
</contrib>
<contrib contrib-type="author">
<name><surname>Vaičiūnas</surname><given-names>Airenas</given-names></name><email xlink:href="airenass@gmail.com">airenass@gmail.com</email><xref ref-type="aff" rid="j_info1233_aff_001">1</xref><bio>
<p><bold>A. Vaičiūnas</bold> (born in 1976) received a PhD in the field of informatics in 2006. Since then he has worked as software engineer and researcher in various computational linguistics projects. His research interests are human language technologies.</p></bio>
</contrib>
<aff id="j_info1233_aff_001"><label>1</label><institution>Vytautas Magnus University</institution>, K. Donelaičio 58, LT-44248, Kaunas, <country>Lithuania</country></aff>
<aff id="j_info1233_aff_002"><label>2</label><institution>Kaunas University of Technology</institution>, K. Donelaičio 73, LT-44249, Kaunas, <country>Lithuania</country></aff>
<aff id="j_info1233_aff_003"><label>3</label><institution>Lithuanian University of Health Sciences</institution>, Eivenių 4, LT-50161, Kaunas, <country>Lithuania</country></aff>
<aff id="j_info1233_aff_004"><label>4</label><institution>Recognisoft, Ltd.</institution>, K. Donelaičio 79-1, LT-44249, Kaunas, <country>Lithuania</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>∗</label>Corresponding author.</corresp>
</author-notes>
<pub-date pub-type="ppub"><year>2019</year></pub-date>
<pub-date pub-type="epub"><day>1</day><month>1</month><year>2019</year></pub-date><volume>30</volume><issue>3</issue><fpage>573</fpage><lpage>593</lpage>
<history>
<date date-type="received"><month>6</month><year>2018</year></date>
<date date-type="accepted"><month>5</month><year>2019</year></date>
</history>
<permissions><copyright-statement>© 2019 Vilnius University</copyright-statement><copyright-year>2019</copyright-year>
<license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>Open access article under the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/">CC BY</ext-link> license.</license-p></license></permissions>
<abstract>
<p>Conventional large vocabulary automatic speech recognition (ASR) systems require a mapping from words into sub-word units to generalize over the words that were absent in the training data and to enable the robust estimation of acoustic model parameters. This paper surveys the research done during the last 15 years on the topic of word to sub-word mappings for Lithuanian ASR systems. It also compares various phoneme and grapheme based mappings across a broad range of acoustic modelling techniques including monophone and triphone based Hidden Markov models (HMM), speaker adaptively trained HMMs, subspace gaussian mixture models (SGMM), feed-forward time delay neural network (TDNN), and state-of-the-art low frame rate bidirectional long short term memory (LFR BLSTM) recurrent deep neural network. Experimental comparisons are based on a 50-hour speech corpus. This paper shows that the best phone-based mapping significantly outperforms a grapheme-based mapping. It also shows that the lowest phone error rate of an ASR system is achieved by the phoneme-based lexicon that explicitly models syllable stress and represents diphthongs as single phonetic units.</p>
</abstract>
<kwd-group>
<label>Key words</label>
<kwd>speech recognition</kwd>
<kwd>grapheme</kwd>
<kwd>phoneme</kwd>
<kwd>G2P conversion</kwd>
<kwd>HMM</kwd>
<kwd>SGMM</kwd>
<kwd>TDNN</kwd>
<kwd>BLSTM</kwd>
<kwd>Lithuanian</kwd>
</kwd-group>
<funding-group>
<award-group>
<funding-source xlink:href="https://doi.org/10.13039/501100004504">Research Council of Lithuania</funding-source>
<award-id>LIT-5-4</award-id>
</award-group>
<funding-statement>Part of this research has been supported by a grant from the Research Council of Lithuania under the National Lithuanian studies development programme for 2009–2015 through the project “A unified approach to Lithuanian prosody: the intonation, rhythm, and stress” (reg. no. LIT-5-4). </funding-statement>
</funding-group>
</article-meta>
</front>
<body>
<sec id="j_info1233_s_001">
<label>1</label>
<title>Introduction</title>
<p>Conventional large vocabulary automatic speech recognition (ASR) systems require a mapping from words into sub-word units to generalize over the words that were absent in the training data and to enable the robust estimation of acoustic model parameters. Mapping words into phones by constructing pronunciation dictionaries that take into account sound assimilation rules and coarticulation effects was the dominant approach for many years. This approach has the advantage of trying to match the process of speech production. Mapping words into graphemes (letters) is an alternative approach (Kanthak and Ney, <xref ref-type="bibr" rid="j_info1233_ref_009">2002</xref>; Killer <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_011">2003</xref>) advocated by some recent studies (Collobert <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_002">2016</xref>). It has the advantage of skipping the process of dictionary build-up that is costly and requires an involvement of linguistic experts. Grapheme based ASR systems showed relatively good performance for Lithuanian ASR as well (Gales <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_005">2015</xref>; Lileikytė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_017">2016</xref>; Alumäe and Tilk, <xref ref-type="bibr" rid="j_info1233_ref_001">2016</xref>; Salimbajevs and Kapočiūtė-Dzikienė, <xref ref-type="bibr" rid="j_info1233_ref_028">2018</xref>). Finding the best lexicon of sub-word units for any particular language is a complex problem that can be answered only through an experimental investigation. ASR systems based on different word to sub-word unit mappings have to be built and their performance has to be compared. Much of the complexity originates from the fact that the optimum sub-word unit lexicon may depend on the size of the training corpus and on the setup of an ASR system, i.e. 
on selected acoustic modelling technique, amount of linguistic knowledge incorporated into the system, and performance comparison criteria. Experimental investigation is also costly in terms of computation time.</p>
<p>Multiple different word to sub-word unit mappings for the purposes of Lithuanian ASR were investigated and compared during the last 15 years. Studies addressing this topic often arrived at opposite conclusions or these conclusions were not supported by the tests of statistical significance. Thus, the practical question about which mapping should be chosen or tried first if someone has sizeable amounts of acoustic data (50 hours and more) and intends to build an ASR system remains open.</p>
<p>This study aims to obtain an additional insight into this question. The first distinction of this study is that we follow a “divide and conquer” approach to the ASR tuning. We eliminate lexical and syntactic-semantic layers of the ASR system and evaluate word to sub-word unit mappings on the basis of the performance of an acoustic model alone. Given that the language model (LM) and pronunciation dictionary are absent, we use Phone Error Rate (PER) rather than Word Error Rate (WER) as the ASR performance criterion. We believe that such an approach makes our findings independent of the lexical content of the training/evaluation data. Second, we carefully prepare the data. Our investigations are based on a solid 50-hour speech corpus. Allophone-level annotations of the corpus have grapheme-to-phoneme (G2P) conversion ambiguities resolved by means of advanced G2P conversion tools. The third distinction of this study is that we compare word to sub-word mappings on the basis of a broad range of acoustic modelling techniques including state-of-the-art deep learning techniques. Finally, we dedicated lots of computational resources for the cross-validation experiments to verify the statistical significance of our findings.</p>
<p>The paper is organized as follows: Section <xref rid="j_info1233_s_002">2</xref> presents the background, describes the relationship between graphemes, phonemes and allophones of Lithuanian, and presents the prior work, Section <xref rid="j_info1233_s_005">3</xref> presents our methods, describes phonemic and graphemic mappings investigated in this paper, and presents the experimental setup, Section <xref rid="j_info1233_s_012">4</xref> presents the results, and finally the discussion and conclusions are presented in Section <xref rid="j_info1233_s_013">5</xref>.</p>
</sec>
<sec id="j_info1233_s_002">
<label>2</label>
<title>The Background</title>
<sec id="j_info1233_s_003">
<label>2.1</label>
<title>Lithuanian Graphemes, Phonemes and Allophones</title>
<p>Traditional Lithuanian spelling is based on the set of 32 graphemes: <italic>a</italic>, <italic>ą</italic>, <italic>b</italic>, <italic>c</italic>, <italic>č</italic>, <italic>d</italic>, <italic>e</italic>, <italic>ę</italic>, <italic>ė</italic>, <italic>f</italic>, <italic>g</italic>, <italic>h</italic>, <italic>i</italic>, <italic>į</italic>, <italic>y</italic>, <italic>j</italic>, <italic>k</italic>, <italic>l</italic>, <italic>m</italic>, <italic>n</italic>, <italic>o</italic>, <italic>p</italic>, <italic>r</italic>, <italic>s</italic>, <italic>š</italic>, <italic>t</italic>, <italic>u</italic>, <italic>ū</italic>, <italic>ų</italic>, <italic>v</italic>, <italic>z</italic>, <italic>ž</italic> that includes 9 diacritic symbols.<xref ref-type="fn" rid="j_info1233_fn_001">2</xref><fn id="j_info1233_fn_001"><label><sup>2</sup></label>
<p>Linguistic entity, like a grapheme or word written according to Lithuanian orthography is given in italics. International Phonetic Alphabet (IPA) based phonetic transcription is enclosed within square brackets. SAMPA-LT based allophonic transcription is given in plain text.</p></fn> Lithuanian orthography is essentially phonological, i.e. standardized spelling reflects the essential phonological changes but also tolerates phonological inaccuracies. The definition of Lithuanian phoneme is subject to debate among linguists. Girdenis (<xref ref-type="bibr" rid="j_info1233_ref_006">2014</xref>) describes Lithuanian as having 58 phonemes (13 vowels and 45 consonants) whereas Pakerys (<xref ref-type="bibr" rid="j_info1233_ref_020">2003</xref>) talks about 49 phonemes (12 vowels and 37 consonants). This study is not concerned by different phoneme definitions, because it focuses on allophones and their sets. The following considerations summarize the essence of the relationship among graphemes, phonemes and allophones and illustrate the main difficulties of Lithuanian G2P conversion:</p>
<list>
<list-item id="j_info1233_li_001">
<label>•</label>
<p>Lithuanian consonants are either palatalized, or non-palatalized. Palatalization property of a consonant is not exposed by its grapheme symbol,<xref ref-type="fn" rid="j_info1233_fn_002">3</xref><fn id="j_info1233_fn_002"><label><sup>3</sup></label>
<p>In certain cases, palatalization is indicated by the grapheme <italic>i</italic> written after the palatalized consonant, e.g. <italic>geriu</italic> (drink), <italic>gražios</italic> (nice), i.e. palatalization is represented by a digraph.</p></fn> but can be inferred from its right context. One right standing grapheme is often enough, as consonants are always palatalized before graphemes <italic>e</italic>, <italic>ę</italic>, <italic>ė</italic>, <italic>i</italic>, <italic>į</italic>, <italic>y</italic>, <italic>j</italic>. However, in rare cases four right standing graphemes are required to infer this property correctly, e.g. <italic>perskrido</italic> [<sup>1</sup>ˈpæ:r<sup>j</sup>s<sup>j</sup>kr<sup>j</sup>ɪdo:] (flew over).</p>
</list-item>
<list-item id="j_info1233_li_002">
<label>•</label>
<p>Lithuanian vowels are either short (lax), or long (tense). Duration property of a vowel is not exposed by graphemes <italic>a</italic>, <italic>e</italic>, <italic>o</italic> (see Table <xref rid="j_info1233_tab_001">1</xref>).</p>
</list-item>
<list-item id="j_info1233_li_003">
<label>•</label>
<p>Grapheme pairs <italic>ie</italic>, <italic>uo</italic>, <italic>ai</italic>, <italic>au</italic>, <italic>ei</italic>, <italic>ui</italic> make up a diphthong (e.g. <italic>paukštis</italic> [<sup>2</sup>ˈpɐu˙kʃ<sup>j</sup>t<sup>j</sup>ɪs] (bird)) or hiatus (e.g. <italic>paupys</italic> [pɐ.ʊ<sup>2</sup>ˈp<sup>j</sup>i:s] (riverside)) if they are within the same syllable or span syllable boundaries respectively.</p>
</list-item>
<list-item id="j_info1233_li_004">
<label>•</label>
<p>Grapheme pairs <italic>al, am, an, ar, el, em, en, er, il, im, in, ir, ul, um, un, ur</italic> make up a mixed diphthong if they are within the same syllable.</p>
</list-item>
<list-item id="j_info1233_li_005">
<label>•</label>
<p>Syllable boundaries are not exposed by standard spelling.</p>
</list-item>
<list-item id="j_info1233_li_006">
<label>•</label>
<p>Lithuanian syllables are either stressed, or unstressed. Stress falls on a nucleus of the syllable, where nucleus may be a vowel, a diphthong or a mixed diphthong. Lithuanian phonetics distinguishes between two syllable accents: acute and circumflex. If a diphthong or a mixed diphthong is stressed, the acute and the circumflex make their respective first (vowel) and the second (vowel or consonant) components more prominent. Syllable accent is not exposed by standard spelling.</p>
</list-item>
<list-item id="j_info1233_li_007">
<label>•</label>
<p>Traditional Lithuanian spelling uses irregular affricate encoding. Affricates are encoded either by graphemes such as <italic>c</italic> ([t͡s, t͡s<sup>j</sup>]), <italic>č</italic> ([t͡ʃ, t͡ʃ<sup>j</sup>]) or by digraphs: <italic>dz</italic> ([d͡z, d͡z<sup>j</sup>]), <italic>dž</italic> ([d͡ʒ, d͡ʒ<sup>j</sup>]).</p>
</list-item>
<list-item id="j_info1233_li_008">
<label>•</label>
<p>Digraph <italic>ch</italic> encodes sounds [x] and [x<sup>j</sup>].</p>
</list-item>
</list>
<table-wrap id="j_info1233_tab_001">
<label>Table 1</label>
<caption>
<p>The relationship of Lithuanian graphemes and vowels. Graphemes <italic>a</italic>, <italic>e</italic>, <italic>o</italic> represent both short and long vowels.</p>
</caption>
<table>
<thead>
<tr>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Grapheme</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin"><italic>a</italic></td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin"><italic>ą</italic></td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin"><italic>e</italic></td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin"><italic>ę</italic></td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin"><italic>ė</italic></td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin"><italic>i</italic></td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin"><italic>į</italic>, <italic>y</italic></td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin"><italic>o</italic></td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin"><italic>u</italic></td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin"><italic>ų</italic>, <italic>ū</italic></td>
</tr>
</thead>
<tbody>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Phoneme</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">[ɐ], [ɑ:]</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">[ɑ:]</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">[ɛ], [æ:]</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">[æ:]</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">[e:]</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">[ɪ]</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">[i:]</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">[ɔ], [o:]</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">[ʊ]</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">[u:]</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The considerations above imply that G2P conversion of Lithuanian is quite complex. G2P converter that relies on a word spelling and grapheme rewrite rules (Greibus <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_007">2017</xref>; Lileikytė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_018">2018</xref>), henceforth referred to as a shallow G2P converter, is incapable of resolving ambiguities related to vowel duration, syllable stress, and syllable boundaries and consequently is incapable of producing detailed and consistent allophone sequences. Only G2P converter making use of supplementary pronunciation dictionaries (Skripkauskas and Telksnys, <xref ref-type="bibr" rid="j_info1233_ref_030">2006</xref>) or of accentuation algorithms (Norkevičius <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_019">2005</xref>; Kazlauskienė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_010">2010</xref>), henceforth referred to as a knowledge-rich G2P converter,<xref ref-type="fn" rid="j_info1233_fn_003">4</xref><fn id="j_info1233_fn_003"><label><sup>4</sup></label>
<p>Grapheme-to-allophone converter would be a more appropriate name.</p></fn> might be capable of disambiguating and modelling these phonological properties correctly.</p>
</sec>
<sec id="j_info1233_s_004">
<label>2.2</label>
<title>Related Work</title>
<p>The problem of finding the best word to sub-word unit mapping for the applications of Lithuanian ASR was first addressed by Raškinis and Raškinienė (<xref ref-type="bibr" rid="j_info1233_ref_024">2003</xref>), followed by Šilingas (<xref ref-type="bibr" rid="j_info1233_ref_031">2005</xref>), Laurinčiukaitė and Lipeika (<xref ref-type="bibr" rid="j_info1233_ref_014">2007</xref>), Gales <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_005">2015</xref>), Greibus <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_007">2017</xref>), Lileikytė <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_018">2018</xref>), and Ratkevicius <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_026">2018</xref>).</p>
<p>All abovementioned studies have used very different ASR setups (see Table <xref rid="j_info1233_tab_002">2</xref>). First, different proprietary speech corpora were used for ASR system training and evaluation (Laurinčiukaitė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_013">2006</xref>; Harper, <xref ref-type="bibr" rid="j_info1233_ref_008">2016</xref>; Laurinčiukaitė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_016">2018</xref>). Second, ASR setups were based on different acoustic modelling techniques, such as monophone HMM system (Šilingas, <xref ref-type="bibr" rid="j_info1233_ref_031">2005</xref>; Ratkevicius <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_026">2018</xref>), triphone HMM system (Raškinis and Raškinienė, <xref ref-type="bibr" rid="j_info1233_ref_024">2003</xref>; Šilingas, <xref ref-type="bibr" rid="j_info1233_ref_031">2005</xref>; Laurinčiukaitė, <xref ref-type="bibr" rid="j_info1233_ref_015">2008</xref>; Greibus <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_007">2017</xref>), or hybrid HMM – neural network models (Gales <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_005">2015</xref>; Lileikytė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_018">2018</xref>). Third, different evaluation methodologies were used. Raškinis and Raškinienė (<xref ref-type="bibr" rid="j_info1233_ref_024">2003</xref>), Laurinčiukaitė and Lipeika (<xref ref-type="bibr" rid="j_info1233_ref_014">2007</xref>), Ratkevicius <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_026">2018</xref>) and this study prefer accuracy estimation through cross-validation, whereas other studies estimate recognition accuracy on a held-out data, an approach that is less computation intensive. Fourth, different evaluation criteria were used. 
Studies differ by comparing PER (Šilingas, <xref ref-type="bibr" rid="j_info1233_ref_031">2005</xref>), WER (Raškinis and Raškinienė, <xref ref-type="bibr" rid="j_info1233_ref_024">2003</xref>; Šilingas, <xref ref-type="bibr" rid="j_info1233_ref_031">2005</xref>; Laurinčiukaitė and Lipeika, <xref ref-type="bibr" rid="j_info1233_ref_014">2007</xref>; Gales <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_005">2015</xref>; Lileikytė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_018">2018</xref>; Ratkevicius <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_026">2018</xref>), and sentence error rate (Greibus <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_007">2017</xref>). Fifth, ASR setups incorporated different language models such as word loops (Raškinis and Raškinienė, <xref ref-type="bibr" rid="j_info1233_ref_024">2003</xref>; Šilingas, <xref ref-type="bibr" rid="j_info1233_ref_031">2005</xref>; Laurinčiukaitė and Lipeika, <xref ref-type="bibr" rid="j_info1233_ref_014">2007</xref>; Ratkevicius <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_026">2018</xref>), word <italic>n</italic>-grams (Gales <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_005">2015</xref>; Lileikytė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_018">2018</xref>), command lists (Greibus <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_007">2017</xref>), and phone <italic>n</italic>-grams (this study).</p>
<table-wrap id="j_info1233_tab_002">
<label>Table 2</label>
<caption>
<p>Comparison of experimental setups used to compare phonemic, graphemic and syllabic lexicons in various studies (WER – Word Error Rate; PER – Phone Error Rate; ATWV/MTWV – Actual/Maximum Term-Weighted Value<xref ref-type="fn" rid="j_info1233_fn_004">5</xref>; SER – Sentence Error Rate).</p>
</caption>
<table>
<thead>
<tr>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Study</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Corpus</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Evaluation type</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Comparison criteria</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Language model</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Acoustic modelling technique</td>
</tr>
</thead>
<tbody>
<tr>
<td style="vertical-align: top; text-align: left">Raškinis and Raškinienė, 2003</td>
<td style="vertical-align: top; text-align: left">1 h of isolated words, 4 speakers</td>
<td style="vertical-align: top; text-align: left">4-fold cross-validation, 15 min per round</td>
<td style="vertical-align: top; text-align: left">WER</td>
<td style="vertical-align: top; text-align: left">Word-loop</td>
<td style="vertical-align: top; text-align: left">Triphone HMM</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Šilingas, 2005</td>
<td style="vertical-align: top; text-align: left">9 h of broadcast speech</td>
<td style="vertical-align: top; text-align: left">Held out data, 14 min</td>
<td style="vertical-align: top; text-align: left">WER, PER</td>
<td style="vertical-align: top; text-align: left">Word-loop</td>
<td style="vertical-align: top; text-align: left">Monophone HMM, Triphone HMM</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Laurinčiukaitė and Lipeika, 2007</td>
<td style="vertical-align: top; text-align: left">23 speakers<xref ref-type="fn" rid="j_info1233_fn_005">6</xref></td>
<td style="vertical-align: top; text-align: left">10-fold cross-validation, 1 h per round</td>
<td style="vertical-align: top; text-align: left">WER</td>
<td style="vertical-align: top; text-align: left">Word-loop</td>
<td style="vertical-align: top; text-align: left">Triphone HMM</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Gales et al., 2015</td>
<td style="vertical-align: top; text-align: left">3–40 h of convers. telephone</td>
<td style="vertical-align: top; text-align: left">Held out data, 10 hours</td>
<td style="vertical-align: top; text-align: left">WER</td>
<td style="vertical-align: top; text-align: left">Word n-gram</td>
<td style="vertical-align: top; text-align: left">Triphone HMM, Hybrid HMM-DNN system</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Lileikytė et al., 2018</td>
<td style="vertical-align: top; text-align: left">speech</td>
<td style="vertical-align: top; text-align: left"/>
<td style="vertical-align: top; text-align: left">WER, ATWV/MTWV</td>
<td style="vertical-align: top; text-align: left">Word 3-gram</td>
<td style="vertical-align: top; text-align: left">Triphone HMM, Hybrid HMM-DNN system</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Greibus et al., 2017</td>
<td style="vertical-align: top; text-align: left">46.5 h of read speech, 348 speakers</td>
<td style="vertical-align: top; text-align: left">Held out data, 6.78 hours</td>
<td style="vertical-align: top; text-align: left">SER</td>
<td style="vertical-align: top; text-align: left">Command list</td>
<td style="vertical-align: top; text-align: left">Triphone HMM</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Ratkevičius et al., 2018</td>
<td style="vertical-align: top; text-align: left">2.5 h of isolated words</td>
<td style="vertical-align: top; text-align: left">5, 10-fold cross-validation</td>
<td style="vertical-align: top; text-align: left">WER</td>
<td style="vertical-align: top; text-align: left">Word-loop</td>
<td style="vertical-align: top; text-align: left">Monophone HMM</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">This study</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">50 h of read speech, 50 speakers</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">10-fold cross-validation, 1 hour per round</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">PER</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Fully interconnected triphones; phone 3-gram, 4-gram</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Triphone HMM, LDA+MLLT Triphone HMM, SAT-HMM, SGMM, Hybrid HMM-TDNN, BLSTM (recurrent DNN)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><fn id="j_info1233_fn_004"><label><sup>5</sup></label>
<p>Actual/maximum term-weighted value is used to evaluate keyword spotting performance.</p></fn> <fn id="j_info1233_fn_005"><label><sup>6</sup></label>
<p>Data of 10 speakers makes up 89% of the corpus. Every speaker is present in both training and test data.</p></fn></p>
<p>Though word to grapheme mappings investigated by different studies are quite similar, word to phoneme mappings are different and mostly incompatible across studies. Each study makes its own choices about whether to and how to represent stress, duration, palatalization, affricates, diphthongs and mixed diphthongs in a phonemic lexicon (see Table <xref rid="j_info1233_tab_003">3</xref>). Laurinčiukaitė and Lipeika (<xref ref-type="bibr" rid="j_info1233_ref_014">2007</xref>) go beyond word to phoneme mappings and investigate word to sub-word unit mappings, where sub-words may be phonemes, syllables and pseudo-syllables.</p>
<table-wrap id="j_info1233_tab_003">
<label>Table 3</label>
<caption>
<p>Comparison of phonemic lexicons that were investigated by various studies. Symbols in the table denote fine-grained (✚), partial (✓), and absent (<bold>O</bold>) modelling of some phonetic property.</p>
</caption>
<table>
<thead>
<tr>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Study</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Phonemic lexicon as referenced by authors</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Syllable stress<xref ref-type="fn" rid="j_info1233_fn_006">7</xref> (vowels &amp; diphth.)</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Vowel<xref ref-type="fn" rid="j_info1233_fn_007">8</xref> duration</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Fronting of back<xref ref-type="fn" rid="j_info1233_fn_008">9</xref> vowels</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Syllable stress<xref ref-type="fn" rid="j_info1233_fn_009">10</xref> (consonants)</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Consonant<xref ref-type="fn" rid="j_info1233_fn_010">11</xref> palatalization</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Affricate<xref ref-type="fn" rid="j_info1233_fn_011">12</xref> modelling</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Diphthong<xref ref-type="fn" rid="j_info1233_fn_012">13</xref> modelling</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Mixed diphthong<xref ref-type="fn" rid="j_info1233_fn_013">14</xref> modelling</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Number of phonetic units</td>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="5" style="vertical-align: top; text-align: left">Raškinis et al. 2003</td>
<td style="vertical-align: top; text-align: left">A</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">115</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">AB</td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">101</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">ABC</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">73</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">ABD</td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">76</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">ABCD</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">50</td>
</tr>
<tr>
<td rowspan="7" style="vertical-align: top; text-align: left">Šilingas, 2005</td>
<td style="vertical-align: top; text-align: left">BFR1</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">229</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">BFR2</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">140</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">BFR3</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">86</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">BFR4</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">139</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">BFR5</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">87</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">BFR6</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">71</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">BFR7</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">41</td>
</tr>
<tr>
<td rowspan="3" style="vertical-align: top; text-align: left">Greibus et al., 2017</td>
<td style="vertical-align: top; text-align: left">FZ1.3</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">36</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">FZ15.5</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">61</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">FPK1</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">93</td>
</tr>
<tr>
<td rowspan="4" style="vertical-align: top; text-align: left">Lileikytė et al., 2016</td>
<td style="vertical-align: top; text-align: left">FLP-32</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">29</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">FLP-36</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">33</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">FLP-38</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">35</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">FLP-48</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">45</td>
</tr>
<tr>
<td rowspan="6" style="vertical-align: top; text-align: left; border-bottom: solid thin">This study</td>
<td style="vertical-align: top; text-align: left">detailed</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left">130</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">no stress</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left">79</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">no palatalization</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left">98</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">no mixed diphthongs</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left">122</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">no diphthongs</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✚</td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left">✓</td>
<td style="vertical-align: top; text-align: left">112</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">no affricates</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">✚</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">✚</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">✚</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">✚</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">✚</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>O</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">✚</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">✓</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">122</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><fn id="j_info1233_fn_006"><label><sup>7</sup></label>
<p>Lexicon includes allophones to represent differently stressed variants of all vowels and diphthongs (✚), or only diphthongs <italic>ai, au, ei, ui</italic> (✓). Lexicon ignores the opposition of stressed vs. non-stressed sounds (<bold>O</bold>).</p></fn> <fn id="j_info1233_fn_007"><label><sup>8</sup></label>
<p>Lexicon includes allophones to represent the opposition of short vs. long vowels and phone symbols in the actual transcription reflect this opposition consistently (✚), or to the extent that is possible with a shallow G2P converter (✓).</p></fn> <fn id="j_info1233_fn_008"><label><sup>9</sup></label>
<p>Lexicon represents (✚) or ignores (<bold>O</bold>) the opposition of fronted vs. regular back vowels (e.g. [ɔ̘], [ʊ̘] vs. [ɔ], [ʊ]).</p></fn> <fn id="j_info1233_fn_009"><label><sup>10</sup></label>
<p>Lexicon represents (✚) or ignores (<bold>O</bold>) the opposition of stressed vs. non-stressed consonants.</p></fn> <fn id="j_info1233_fn_010"><label><sup>11</sup></label>
<p>Lexicon represents (✚) or ignores (<bold>O</bold>) the opposition of palatalized and non-palatalized consonants.</p></fn> <fn id="j_info1233_fn_011"><label><sup>12</sup></label>
<p>Lexicon represents affricates by a single (✚) or two (<bold>O</bold>) consonants.</p></fn> <fn id="j_info1233_fn_012"><label><sup>13</sup></label>
<p>Lexicon includes allophones to represent all diphthongs (✚), or only diphthongs <italic>ie, uo</italic> (✓) by a single phone. Lexicon encodes all diphthongs by a sequence of two phones (<bold>O</bold>).</p></fn> <fn id="j_info1233_fn_013"><label><sup>14</sup></label>
<p>Lexicon represents mixed diphthongs by different dedicated allophones (✚). Lexicon models mixed diphthongs by the sequence of two constituent phones (<bold>O</bold>) but it also models sonorants which make part of a mixed diphthong as distinct allophones (✓).</p></fn></p>
<p>Given such a variety of the experimental setups it is not surprising that different studies came to different and even opposite conclusions. For instance, Raškinis and Raškinienė (<xref ref-type="bibr" rid="j_info1233_ref_024">2003</xref>) achieved the best WER by the word to phoneme mapping that ignored stress and preserved palatalization (see Table <xref rid="j_info1233_tab_003">3</xref>, ABC phonemic lexicon), whereas Šilingas (<xref ref-type="bibr" rid="j_info1233_ref_031">2005</xref>) achieved the best WER by preserving stress and ignoring palatalization (see Table <xref rid="j_info1233_tab_003">3</xref>, BFR6 phonemic lexicon). Greibus <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_007">2017</xref>) achieved the best SER by ignoring both stress and palatalization. Gales <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_005">2015</xref>) found that a grapheme-based system outperforms a phoneme-based system, whereas Šilingas (<xref ref-type="bibr" rid="j_info1233_ref_031">2005</xref>), Greibus <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_007">2017</xref>) and Lileikytė <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_018">2018</xref>) came to an opposite result. Laurinčiukaitė and Lipeika (<xref ref-type="bibr" rid="j_info1233_ref_014">2007</xref>) found that mapping into a mixture of phonemes and syllable-like units improves WER.</p>
<p>Incompatible conclusions are partially due to the limitations of the experimental setups. Some findings are based on a small training corpus (Raškinis and Raškinienė, <xref ref-type="bibr" rid="j_info1233_ref_024">2003</xref>; Ratkevicius <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_026">2018</xref>) or on a limited carefully selected held-out data (Šilingas, <xref ref-type="bibr" rid="j_info1233_ref_031">2005</xref>). Other studies (Greibus <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_007">2017</xref>; Lileikytė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_018">2018</xref>) are testing limited word-to-phoneme mappings due to the usage of a shallow G2P converter which is unable to produce allophone-rich phonemic transcriptions. Conclusions of many studies are dependent on a single (though generally state-of-the-art at the time of investigation) acoustic modelling technique. Finally, recognition accuracies obtained by the majority of studies are not “pure” indicators of performance of different word to sub-word mappings as they are strongly influenced by different amounts of linguistic constraints embedded into ASR setups. For instance, Greibus <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_007">2017</xref>) restrict their language model (LM) to a command list, where commands share 271 unique word types, and Ratkevicius <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_026">2018</xref>) restrict their LM to a 10-digit word loop.</p>
</sec>
</sec>
<sec id="j_info1233_s_005">
<label>3</label>
<title>The Method</title>
<sec id="j_info1233_s_006">
<label>3.1</label>
<title>Investigated Phonemic and Graphemic Lexicons</title>
<p>In this study, we have adopted an experimental approach common to other similar studies (Raškinis and Raškinienė, <xref ref-type="bibr" rid="j_info1233_ref_024">2003</xref>; Šilingas, <xref ref-type="bibr" rid="j_info1233_ref_031">2005</xref>). It consists of defining some phonemic lexicon which serves as a reference point. Thereafter, reductions of this lexicon are derived by elimination of various phonological properties (e.g. stress, palatalization) or by splitting compound phonetic units (e.g. diphthongs, affricates) into sub-parts and measuring the performance of the ASR system for every reduced lexicon. Our reference phonemic lexicon consists of 130 allophones (henceforth referred to as the “detailed” lexicon). It is presented in Table <xref rid="j_info1233_tab_004">4</xref> using SAMPA-LT (Raškinis <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_025">2003</xref>) encoding.</p>
<p>We have compared the “detailed” lexicon against 5 reduced phonemic lexicons and one graphemic lexicon in order to answer the questions about what is the best approach to: 
<list>
<list-item id="j_info1233_li_009">
<label>•</label>
<p>Stress modelling (present vs. absent),</p>
</list-item>
<list-item id="j_info1233_li_010">
<label>•</label>
<p>Palatalization modelling (present vs. absent),</p>
</list-item>
<list-item id="j_info1233_li_011">
<label>•</label>
<p>Diphthong modelling (one vs. two phones),</p>
</list-item>
<list-item id="j_info1233_li_012">
<label>•</label>
<p>Mixed diphthong modelling (distinguishing vs. not distinguishing constituent consonants),</p>
</list-item>
<list-item id="j_info1233_li_013">
<label>•</label>
<p>Affricate modelling (one vs. two phones),</p>
</list-item>
</list> 
and whether a phone-based ASR system outperforms a grapheme-based one.<fn id="j_info1233_fn_014"><label><sup>15</sup></label>
<p>N encodes velarized <italic>n</italic> ([ŋ]).</p></fn><xref ref-type="fn" rid="j_info1233_fn_015">16</xref><fn id="j_info1233_fn_015"><label><sup>16</sup></label>
<p>It may seem that a larger set of phonemes will always model pronunciation better with a sufficient corpus size. This may be true in case of monophone-based single-GMM (i.e. the simplest) acoustic models where model complexity directly depends on the number of phoneme symbols. Triphone acoustic models based on reduced lexicons may have more triphones than acoustic models based on detailed lexicons. Complexity of a triphone acoustic model, which can be expressed as a number of different acoustic states or a number of probability density functions (pdfs), isn’t directly related to the size of the symbol set. Pdf clustering procedure (to alleviate the data scarcity problem) usually makes all triphone models of approximately the same size/complexity for a given fixed corpus size.</p></fn> Answers to those questions are important from the practical perspective. As mentioned previously, extracting stress and syllabification data from word spelling is costly in terms of human expertise.</p>
<table-wrap id="j_info1233_tab_004">
<label>Table 4</label>
<caption>
<p>Detailed list of Lithuanian allophones in SAMPA-LT encoding. Acute and circumflex are encoded by double quote (") and caret (^) respectively. Colon (:) distinguishes long vowels from short ones. Palatalization is encoded by a single quote (’). Sonorants that make part of a mixed diphthong are labelled by period (.).</p>
</caption>
<table>
<tbody>
<tr>
<td colspan="6" style="vertical-align: top; text-align: center; border-bottom: solid thin">(a) Vowels and diphthongs</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left"/>
<td colspan="2" style="vertical-align: top; text-align: left; border-bottom: solid thin">Short</td>
<td colspan="3" style="vertical-align: top; text-align: left; border-bottom: solid thin">Long</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"/>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Unstressed</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Stressed</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Unstressed</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Stressed (acute)</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Stressed (circumflex)</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Vowels</td>
<td style="vertical-align: top; text-align: left">a, e, i, o, u</td>
<td style="vertical-align: top; text-align: left">"a, "e, "i,</td>
<td style="vertical-align: top; text-align: left">a:, e:, E:,</td>
<td style="vertical-align: top; text-align: left">"a:, "e:, "E:,</td>
<td style="vertical-align: top; text-align: left">^a:, ^e:, ^E:,</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left"/>
<td style="vertical-align: top; text-align: left"/>
<td style="vertical-align: top; text-align: left">"o, "u</td>
<td style="vertical-align: top; text-align: left">i:, o:, u:</td>
<td style="vertical-align: top; text-align: left">"i:, "o:, "u:</td>
<td style="vertical-align: top; text-align: left">^i:, ^o:, ^u:</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Fronted vowels</td>
<td style="vertical-align: top; text-align: left">io, iu</td>
<td style="vertical-align: top; text-align: left">"io, "iu</td>
<td style="vertical-align: top; text-align: left">io:, iu:</td>
<td style="vertical-align: top; text-align: left">"io:, "iu:</td>
<td style="vertical-align: top; text-align: left">^io:, ^iu:</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Diphthongs</td>
<td style="vertical-align: top; text-align: left"/>
<td style="vertical-align: top; text-align: left"/>
<td style="vertical-align: top; text-align: left">ie, uo, iuo</td>
<td style="vertical-align: top; text-align: left">"ie, "uo, "iuo</td>
<td style="vertical-align: top; text-align: left">^ie, ^uo, ^iuo</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left"/>
<td style="vertical-align: top; text-align: left"/>
<td style="vertical-align: top; text-align: left"/>
<td style="vertical-align: top; text-align: left">ai, au, ei,</td>
<td style="vertical-align: top; text-align: left">"ai, "au, "ei,</td>
<td style="vertical-align: top; text-align: left">^ai, ^au, ^ei,</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"/>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"/>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"/>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">eu, ui, iui</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">"eu, "ui, "iui</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">^eu, ^ui, ^iui</td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<td colspan="5" style="vertical-align: top; text-align: center; border-bottom: solid thin">(b) Plosives, fricatives and affricates</td>
</tr>
</thead>
<tbody>
<tr>
<td style="vertical-align: top; text-align: left"/>
<td colspan="2" style="vertical-align: top; text-align: left; border-bottom: solid thin">Non-palatalized</td>
<td colspan="2" style="vertical-align: top; text-align: left; border-bottom: solid thin">Palatalized</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"/>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Voiced</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Unvoiced</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Voiced</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Unvoiced</td>
</tr>
</tbody><tbody>
<tr>
<td style="vertical-align: top; text-align: left">Plosives</td>
<td style="vertical-align: top; text-align: left">b, d, g</td>
<td style="vertical-align: top; text-align: left">p, t, k</td>
<td style="vertical-align: top; text-align: left">b’, d’, g’</td>
<td style="vertical-align: top; text-align: left">p’, t’, k’</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Fricatives</td>
<td style="vertical-align: top; text-align: left">z, Z, G, v, j</td>
<td style="vertical-align: top; text-align: left">s, S, x, f</td>
<td style="vertical-align: top; text-align: left">z’, Z’, G’, v’</td>
<td style="vertical-align: top; text-align: left">s’, S’, x’, f’</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Affricates</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">dz, dZ</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">ts, tS</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">dz’, dZ’</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">ts’, tS’</td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<td colspan="7" style="vertical-align: top; text-align: center; border-bottom: solid thin">(c) Sonorants</td>
</tr>
</thead>
<tbody>
<tr>
<td style="vertical-align: top; text-align: left"/>
<td colspan="3" style="vertical-align: top; text-align: left; border-bottom: solid thin">Non-palatalized</td>
<td colspan="3" style="vertical-align: top; text-align: left; border-bottom: solid thin">Palatalized</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left"/>
<td rowspan="2" style="vertical-align: top; text-align: left; border-bottom: solid thin">Standalone</td>
<td colspan="2" style="vertical-align: top; text-align: left; border-bottom: solid thin">Part of a mixed diphthong</td>
<td rowspan="2" style="vertical-align: top; text-align: left; border-bottom: solid thin">Standalone</td>
<td colspan="2" style="vertical-align: top; text-align: left; border-bottom: solid thin">Part of a mixed diphthong</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"/>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Unstressed</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Stressed</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Unstressed</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Stressed</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">Sonorants</td>
<td style="vertical-align: top; text-align: left">l, m, n, r</td>
<td style="vertical-align: top; text-align: left">l., m., n.,</td>
<td style="vertical-align: top; text-align: left">^l., ^m., ^n.,</td>
<td style="vertical-align: top; text-align: left">l’, m’, n’, r’</td>
<td style="vertical-align: top; text-align: left">l.’, m.’, n.’,</td>
<td style="vertical-align: top; text-align: left">^l.’, ^m.’, ^n.’,</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"/>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"/>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">N.<xref ref-type="fn" rid="j_info1233_fn_014">15</xref>, r.</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">^N., ^r.</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"/>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">N.’, r.’</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">^N.’, ^r.’</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The process by which different phonemic and graphemic transcriptions were obtained is described in Fig. <xref rid="j_info1233_fig_001">1</xref>. First, word-level orthographic transcription was processed by the knowledge-rich G2P converter (Kazlauskienė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_010">2010</xref>) resulting in an allophone sequence that observes intra-word sound assimilation rules. Thereafter optional sound assimilation rules were applied at word boundaries in an automatic way on the basis of a maximum-likelihood criterion. This resulted in “detailed” phone-level transcription encoded by SAMPA-LT symbols that served as our reference word-to-phoneme mapping. Reduced phonemic transcriptions were derived from the “detailed” transcription by subjecting it to one or more editing operations (see Fig. <xref rid="j_info1233_fig_001">1</xref>).</p>
<p>Graphemic transcription was obtained from the word-level orthographic transcription by means of a few editing operations that encoded graphemes by SAMPA-LT symbols. This encoding was necessary in order to harmonize phonemic and graphemic transcriptions for their comparison at a later stage (see 3.6). Graphemic lexicon may look like a phonemic one, but this is a false impression. Graphemic transcription was not subjected to sound assimilation rules, and the changes in graphemic transcriptions are simple and mostly reversible transliterations.</p>
<fig id="j_info1233_fig_001">
<label>Fig. 1</label>
<caption>
<p>The process of deriving phonemic and graphemic transcriptions from the input orthographic transcription. Arrow symbol denotes a broad “replace” operator (inclusive of “split” and “merge”).</p>
</caption>
<graphic xlink:href="info1233_g001.jpg"/>
</fig>
</sec>
<sec id="j_info1233_s_007">
<label>3.2</label>
<title>Speech Data and the ASR Cross-Fold Validation Setup</title>
<p>Our experiments were based on a 50-hour speech corpus that was compiled at Vytautas Magnus University. The corpus consisted of 50 speakers (25 males and 25 females) each reading book excerpts for approximately 1 hour.<xref ref-type="fn" rid="j_info1233_fn_016">17</xref><fn id="j_info1233_fn_016"><label><sup>17</sup></label>
<p>Silent segments make up 15–20% of the corpus depending on the speaker.</p></fn> Word-level transcriptions of this corpus were manually adjusted to match misspellings and misaccentuations present in audio recordings.</p>
<p>We built multiple ASR systems based on different phonemic/graphemic lexicons and tried to estimate their accuracies via a cross-validation technique. The cross-validation round consisted of training an ASR system (building acoustic and phone-level language models) on the speech and transcripts of 49 speakers and testing system accuracy on the speech of the held-out (or test) speaker. Full leave-one-out (or 50-fold) cross-validation was costly in terms of computational time. Instead, we approximated it with a “pessimistic” 10-fold cross-validation scheme. We call it “pessimistic” (with respect to the leave-one-out cross-validation) because of the inclusion of the most problematic speakers into the test set. The selection procedure clustered all speakers into 5 clusters of comparable size and selected 2 worst rated speakers per cluster for inclusion into the test set (<inline-formula id="j_info1233_ineq_001"><alternatives>
<mml:math><mml:mn>2</mml:mn><mml:mo>×</mml:mo><mml:mn>5</mml:mn><mml:mo>=</mml:mo><mml:mn>10</mml:mn></mml:math>
<tex-math><![CDATA[$2\times 5=10$]]></tex-math></alternatives></inline-formula> in total).<xref ref-type="fn" rid="j_info1233_fn_017">18</xref><fn id="j_info1233_fn_017"><label><sup>18</sup></label>
<p>Speaker rating was determined on the basis of WER obtained in our earlier recognition and adaptation experiments (Rudžionis <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_027">2013</xref>). Speaker clustering was based on the feature set that included insertion, deletion and substitution errors.</p></fn> Identifiers of selected speakers and their ratings are given in Table <xref rid="j_info1233_tab_006">6</xref>.</p>
</sec>
<sec id="j_info1233_s_008">
<label>3.3</label>
<title>Acoustic Models</title>
<p>It is reasonable to expect that a certain phonemic lexicon performs better when coupled with some particular acoustic modelling technique. In order to investigate this relationship and to assess the possible limitations of our conclusions we have built and compared the ASR systems based on the acoustic models of 7 different types<xref ref-type="fn" rid="j_info1233_fn_018">19</xref><fn id="j_info1233_fn_018"><label><sup>19</sup></label>
<p>We have used an open-source Kaldi ASR toolkit (Povey <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_021">2011a</xref>) for training and evaluating all ASR systems. Some other techniques, mostly discriminative training approaches, have been tried but not described in this paper, because their accuracy estimates correlated with the results of non-discriminative training.</p></fn> including: 
<list>
<list-item id="j_info1233_li_014">
<label>1.</label>
<p>Monophone HMM system (henceforth referred to as “mono” system) was the simplest ASR system, where each phone was modelled by a single HMM. HMMs had from 2 to 5 states (number of states was related to the average phone duration) and shared left-to-right topology. The number of Gaussian probability density functions (pdfs) per state was estimated as an exponential function of the state occupation counts targeting 1000 pdfs in total. Speech data was parametrized by extracting 13 mel-frequency cepstral coefficients (MFCC) and their first-order and second-order derivatives from 25 ms speech frames at 10 ms intervals. Per speaker cepstral mean normalization was applied.</p>
</list-item>
<list-item id="j_info1233_li_015">
<label>2.</label>
<p>Triphone HMM system (henceforth referred to as “tri-mfcc” system) was trained on the same features as “mono” system, but each phone was modelled by multiple context-dependent HMMs (triphones). The system targeted 11000 Gaussian pdfs in total. Triphone state tying was performed using decision-tree clustering technique and resulted in approximately 2000 clusters (tree leaves).</p>
</list-item>
<list-item id="j_info1233_li_016">
<label>3.</label>
<p>Triphone HMM system (henceforth referred to as “tri-lda” system) was trained in the same way as “tri-mfcc” system, but was based on a different speech parametrization. It consisted of splicing 13-dimensional MFCC vectors across 7 frames (3 frames on each side of the current frame) resulting in 91-dimensional feature vectors, applying Linear Discriminant Analysis (LDA) to reduce vector dimensionality to 40, and finally estimating the Maximum Likelihood Linear Transform (MLLT) (Gales, <xref ref-type="bibr" rid="j_info1233_ref_004">1999</xref>) over multiple iterations. MLLT represents a square feature transformation matrix with the objective function being the average per-frame log-likelihood of the transformed features given the model.</p>
</list-item>
<list-item id="j_info1233_li_017">
<label>4.</label>
<p>Speaker-adaptively trained (SAT) triphone HMM system (henceforth referred to as “tri-sat” system) differed from the previous one as speaker-specific feature-space maximum likelihood linear regression (fMLLR) adaptation was added on the top of LDA+MLLT speech parametrization. fMLLR is an affine feature transformation whose estimation techniques are detailed in Gales (<xref ref-type="bibr" rid="j_info1233_ref_003">1998</xref>).</p>
</list-item>
<list-item id="j_info1233_li_018">
<label>5.</label>
<p>System based on the Subspace Gaussian Mixture Models (SGMM) is a general HMM model where states share the same GMM structure (henceforth referred to as “sgmm” system). The acoustic model is defined by vectors associated with each state and by a global mapping from this vector space to the space of parameters of the GMM. Thus GMM means and mixture weights are constrained to vary in a subspace of the full parameter space (Povey <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_022">2011b</xref>). This system was trained on the top of fMLLR adapted speech features.</p>
</list-item>
<list-item id="j_info1233_li_019">
<label>6.</label>
<p>System based on a feed-forward deep neural network known as Time-Delay Neural Network (TDNN) henceforth referred to as “tdnn” system. This system was trained using procedure described in Zhang <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_032">2014</xref>). First, “tri-sat” system was asked to produce frame-level state labelling for the training speech. Thereafter, state labels were used as targets to train the TDNN acoustic models. Speech data was parametrized by extracting 40 mel-frequency filterbank coefficients, splicing 40-dimensional vectors across 9 frames resulting in 360-dimensional feature vectors. Thus, TDNN had 360 inputs and approximately 1750 outputs<xref ref-type="fn" rid="j_info1233_fn_019">20</xref><fn id="j_info1233_fn_019"><label><sup>20</sup></label>
<p>The exact number is dependent on the held-out speaker identity in a particular cross-validation round.</p></fn> corresponding to the context-dependent phone state labels. In between the input and the output layers TDNN had two hidden layers based on tangent non-linearity. TDNN was trained for 20 epochs by reducing learning rate during the first 15 epochs.</p>
</list-item>
<list-item id="j_info1233_li_020">
<label>7.</label>
<p>System based on a recurrent deep neural network known as Low Frame Rate Bidirectional Long Short Term Memory (LFR BLSTM). This system was trained using procedure described in Povey <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_023">2016</xref>). Two additional speed perturbed copies of training data were used for 3-fold data augmentation (Ko <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_012">2015</xref>). 100-dimensional iVectors were extracted in online manner and were used as additional inputs to the BLSTM network to perform instantaneous adaptation of the neural network (Saon <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_029">2013</xref>). LFR BLSTM architecture had 3 forward and 3 backward layers. The model was trained for 4 epochs by linearly reducing learning rate throughout the training process. This ASR system is referred to as “blstm” in the subsequent sections.</p>
</list-item>
</list>
</p>
</sec>
<sec id="j_info1233_s_009">
<label>3.4</label>
<title>Phone-Level Language Models</title>
<p>We aim to build an experimental setup such that the ASR system is stripped from its lexical and grammatical knowledge (list of words of a language and probabilities associated to word sequences) that influences recognition accuracy, so that the accuracy of the ASR system reflects the performance of the word to sub-word unit mappings under investigation. It should be noted that phonotactic knowledge cannot be eliminated from our comparisons because it makes an integral part of an acoustic (starting from triphones) model. If we take a triphone acoustic model, extract the list of all triphones, and make a fully-connected triphone network that respects adjacency constraints (i.e. triphone a-x+y is connected to every triphone x-y+b in the list, where a, b, x, y denote any sub-word unit of the lexicon) we obtain a phone 3-gram with the uniform probability distribution over the outgoing links. It represents the set of categorial phonotactic constraints embedded into an acoustic model. Let’s call it the categorial phone 3-gram. Table <xref rid="j_info1233_tab_005">5</xref> compares perplexities of the categorial and probabilistic phone-level <italic>n</italic>-grams.</p>
<table-wrap id="j_info1233_tab_005">
<label>Table 5</label>
<caption>
<p>Average perplexities of the phone-level n-grams, measured on the held-out parts of the speech corpus.</p>
</caption>
<table>
<thead>
<tr>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Lexicon</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">Categorial 3-gram</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">1-gram</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">2-gram</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">3-gram</td>
<td style="vertical-align: top; text-align: left; border-top: solid thin; border-bottom: solid thin">4-gram</td>
</tr>
</thead>
<tbody>
<tr>
<td style="vertical-align: top; text-align: left">Detailed</td>
<td style="vertical-align: top; text-align: left">31.45</td>
<td style="vertical-align: top; text-align: left">60.66</td>
<td style="vertical-align: top; text-align: left">18.05</td>
<td style="vertical-align: top; text-align: left">12.23</td>
<td style="vertical-align: top; text-align: left">9.46</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">No stress</td>
<td style="vertical-align: top; text-align: left">27.00</td>
<td style="vertical-align: top; text-align: left">45.37</td>
<td style="vertical-align: top; text-align: left">14.57</td>
<td style="vertical-align: top; text-align: left">10.97</td>
<td style="vertical-align: top; text-align: left">8.81</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">No palatalization</td>
<td style="vertical-align: top; text-align: left">33.17</td>
<td style="vertical-align: top; text-align: left">44.24</td>
<td style="vertical-align: top; text-align: left">17.96</td>
<td style="vertical-align: top; text-align: left">12.32</td>
<td style="vertical-align: top; text-align: left">9.44</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">No mix. diphthongs</td>
<td style="vertical-align: top; text-align: left">32.05</td>
<td style="vertical-align: top; text-align: left">56.35</td>
<td style="vertical-align: top; text-align: left">18.59</td>
<td style="vertical-align: top; text-align: left">12.36</td>
<td style="vertical-align: top; text-align: left">9.53</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">No diphthongs</td>
<td style="vertical-align: top; text-align: left">30.54</td>
<td style="vertical-align: top; text-align: left">52.91</td>
<td style="vertical-align: top; text-align: left">16.76</td>
<td style="vertical-align: top; text-align: left">11.50</td>
<td style="vertical-align: top; text-align: left">8.94</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">No affricates</td>
<td style="vertical-align: top; text-align: left">30.89</td>
<td style="vertical-align: top; text-align: left">59.12</td>
<td style="vertical-align: top; text-align: left">17.83</td>
<td style="vertical-align: top; text-align: left">12.11</td>
<td style="vertical-align: top; text-align: left">9.35</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Graphemes</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">23.55</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">22.14</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">12.79</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">9.67</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">7.86</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>We have taken the categorial phone 3-gram as our baseline decoding setup. In addition, we performed decoding experiments with phone 3-grams and 4-grams to observe how additional probabilistic phonotactic knowledge affected the ASR performance.<xref ref-type="fn" rid="j_info1233_fn_020">21</xref><fn id="j_info1233_fn_020"><label><sup>21</sup></label>
<p>We did not perform decoding with phone 1-grams and 2-grams because their decoding accuracies are hard to interpret. On the one hand, phone 1-grams and 2-grams are under-constrained with respect to the categorial phonotactic constraints integral to the triphone acoustic model, and, consequently, the decoder is forced to synthesize triphones that violate phonotactic constraints of the language. On the other hand, 1-grams and 2-grams are more constrained by probabilistic knowledge than the categorial 3-gram by taking advantage of statistics of the training corpus.</p></fn></p>
<p>Decoding with categorial 3-grams, probabilistic 3-grams and 4-grams exploited phonotactic but not lexical or syntactic-semantic knowledge, so we believe that our comparisons were independent from the lexical content of the training/evaluation data.</p>
<p>To summarize, our experimental investigation consisted of building 7 (phonemic/graphemic lexicons) × 7 (acoustic model types) × 10 (speaker-specific cross-validation rounds) = 490 different acoustic models and performing 490 (acoustic models) × 3 (phone-level language models) = 1470 decoding experiments in total.</p>
</sec>
<sec id="j_info1233_s_010">
<label>3.5</label>
<title>Scoring: Accuracy Estimation</title>
<p>We have used Phone Error Rate (PER) criterion to compare the performances of different ASR setups. It was calculated according to: 
<disp-formula id="j_info1233_eq_001">
<label>(1)</label><alternatives>
<mml:math display="block"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd><mml:mi mathvariant="italic">PER</mml:mi><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac><mml:mrow><mml:mi mathvariant="italic">S</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="italic">I</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="italic">D</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="italic">N</mml:mi></mml:mrow></mml:mfrac></mml:mstyle><mml:mn>100</mml:mn><mml:mi mathvariant="normal">%</mml:mi><mml:mo mathvariant="normal">,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math>
<tex-math><![CDATA[\[ \mathit{PER}=\frac{S+I+D}{N}100\% ,\]]]></tex-math></alternatives>
</disp-formula> 
where <italic>S</italic>, <italic>I</italic> and <italic>D</italic> denote substitution, insertion and deletion errors respectively, and <italic>N</italic> is the total number of phones/graphemes in the test data. <italic>S</italic>, <italic>I</italic> and <italic>D</italic> estimates were extracted from automatic alignments of recognized and reference transcriptions.</p>
</sec>
<sec id="j_info1233_s_011">
<label>3.6</label>
<title>Scoring: Transcription Normalization</title>
<p>Automatic alignment of recognized and reference transcriptions was preceded by the transcription normalization step. This step consisted of projecting every individual phoneme/grapheme onto a symbol or a sequence of symbols over the normalized alphabet. Without projecting lexicons of different sizes into the common lexicon the comparison would be biased against allophone-rich ASR setups as they naturally tend to result in more substitution errors than ASR setups based on reduced lexicons. Moreover, without normalization, substitution errors involving, e.g. stressed vs. unstressed or palatalized vs. non-palatalized allophones of the same phoneme, will look as important as phoneme substitutions.</p>
<p>Normalized alphabet contained 27 symbols (a b d e E: f g G x i i: j k l m n o p r s S t u u: v z Z). It represented the intersection of all investigated lexicons, i.e. it contained symbols that were common to all lexicons. Other allophone units were projected into this alphabet by eliminating their phonetic properties or by splitting compound units (affricates, diphthongs, fronted back vowels) into the sequences of 2 or 3 symbols.<xref ref-type="fn" rid="j_info1233_fn_021">22</xref><fn id="j_info1233_fn_021"><label><sup>22</sup></label>
<p>For instance, graphemic lexicon lacks the symbol o: (see Table <xref rid="j_info1233_tab_001">1</xref> and bottom part of Fig. <xref rid="j_info1233_fig_001">1</xref>), so o: is not included into the normalized alphabet, and all phonemic lexicons are projecting o: → o. The “no affricates” lexicon lacks affricates ts, tS, dz, dZ, so other lexicons are projecting affricates into a sequence of two symbols.</p></fn> As the only exception to this rule, we have eliminated symbols a: and e: from the normalized alphabet even if these symbols were present in all investigated lexicons. The effect of this exception was that a / a: and e / e: substitutions were no longer interpreted as errors. This was done to eliminate the bias against the graphemic lexicon so that it was not penalized for the failures to resolve duration ambiguities it was hardly able to resolve.<xref ref-type="fn" rid="j_info1233_fn_022">23</xref><fn id="j_info1233_fn_022"><label><sup>23</sup></label>
<p>Acoustic models of graphemes <italic>a</italic> and <italic>ą</italic> are both trained on acoustic samples of [a:] (see Table <xref rid="j_info1233_tab_001">1</xref>).</p></fn></p>
<p>The process of projecting phonemic and graphemic transcripts into scoring transcripts over the normalized alphabet was realized by 4 steps:</p>
<list>
<list-item id="j_info1233_li_021">
<label>1.</label>
<p>Remove double quote(”), caret(^), single quote(’), period(.) from SAMPA-LT phone descriptions;</p>
</list-item>
<list-item id="j_info1233_li_022">
<label>2.</label>
<p>Split multi-symbol SAMPA-LT phone descriptions (ai, au, ei, eu, ie, ui, uo, iui, iuo, dz, dZ, ts, tS) into forming symbols;</p>
</list-item>
<list-item id="j_info1233_li_023">
<label>3.</label>
<p>Split: iu → i u, iu: → i u:, io → i o, io: → i o:;</p>
</list-item>
<list-item id="j_info1233_li_024">
<label>4.</label>
<p>Replace: e: → e, a: → a, o: → o, N → n.</p>
</list-item>
</list>
<p>Though grapheme-based and phoneme-based reference transcriptions are mapped to transcriptions over the same normalized alphabet, they are not identical. For instance, Lithuanian word <italic>džiaugsis</italic> (will rejoice) is mapped to d Z e u k s i s (phonemic) and d Z i a u g s i s (graphemic) over the same normalized alphabet. The difference stems from the fact that phonemic transcriptions by definition are transcriptions subjected to sound assimilation rules.<xref ref-type="fn" rid="j_info1233_fn_023">24</xref><fn id="j_info1233_fn_023"><label><sup>24</sup></label>
<p>The original word spelling cannot be reconstructed from either a phonemic or a graphemic transcription expressed over the normalized alphabet due to the one-to-many mapping, e.g. a t S i u: could be reconstructed as <italic>ačiū</italic> (thanks), <italic>ačių</italic>, <italic>atšiū</italic>, <italic>atšių</italic>, <italic>ąčiū</italic>, <italic>ąčių</italic>, <italic>ątšiū</italic>, <italic>ątšių</italic> (nonsense words).</p>
</sec>
</sec>
<sec id="j_info1233_s_012">
<label>4</label>
<title>Experimental Results</title>
<p>Let PER<sub>LX,AM,LM,SPK</sub> denote a Phone Error Rate that is obtained by the ASR setup based on the lexicon LX, the acoustic modelling technique AM, the phone-level LM and corresponds to the cross validation round, in which the data of SPK speaker is decoded.<xref ref-type="fn" rid="j_info1233_fn_024">25</xref><fn id="j_info1233_fn_024"><label><sup>25</sup></label>
<p>LX ∈ {detailed, no-stress, no-palatalization, no-mixed diphthongs, no-diphthongs, no-affricates, graphemic}, AM ∈ {mono, tri-mfcc, tri-lda, tri-sat, sgmm, tdnn, blstm}, LM ∈ {categorial phone 3-gram, probabilistic phone 3-gram, probabilistic phone 4-gram}, SPK ∈ {ARM, BLA, CIZ, DEK, EID, JUK, LEO, MAL, RUP, SKA}.</p></fn> Values of PER<sub>detailed,*,categorial 3-gram,*</sub> and PER<sub>graphemic,*,categorial 3-gram,*</sub> are shown in Tables <xref rid="j_info1233_tab_006">6</xref>a and <xref rid="j_info1233_tab_006">6</xref>b respectively for illustration purposes. Each table corresponds to the PER values obtained by 70 different ASR setups (10 speaker specific cross validation rounds × 7 acoustic modelling techniques).</p>
<p><fn id="j_info1233_fn_025"><label><sup>26</sup></label>
<p>The best speaker has the rating of 1 and the worst speaker has the rating of 50.</p></fn></p>
<table-wrap id="j_info1233_tab_006">
<label>Table 6</label>
<caption>
<p>Phone error rates obtained by different ASR setups when decoding with categorial phone 3-gram. Columns represent different acoustic modelling techniques. Rows represent 10 speaker-specific cross validation rounds.</p>
</caption>
<table>
<tbody>
<tr>
<td colspan="8" style="vertical-align: top; text-align: center; border-bottom: solid thin">(a) ASR setups based on “detailed” phonemic lexicon (PER<sub>detailed,*,categorial 3-gram,*</sub>)</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Speaker (rating<xref ref-type="fn" rid="j_info1233_fn_025">26</xref>)</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">mono</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">tri_mfcc</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">tri_lda</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">tri_sat</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">sgmm</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">tdnn</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">blstm</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">ARM (50)</td>
<td style="vertical-align: top; text-align: left">53.89</td>
<td style="vertical-align: top; text-align: left">48.90</td>
<td style="vertical-align: top; text-align: left">45.61</td>
<td style="vertical-align: top; text-align: left">33.36</td>
<td style="vertical-align: top; text-align: left">31.46</td>
<td style="vertical-align: top; text-align: left">28.82</td>
<td style="vertical-align: top; text-align: left">32.14</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">BLA (39)</td>
<td style="vertical-align: top; text-align: left">42.27</td>
<td style="vertical-align: top; text-align: left">28.98</td>
<td style="vertical-align: top; text-align: left">25.46</td>
<td style="vertical-align: top; text-align: left">21.37</td>
<td style="vertical-align: top; text-align: left">17.29</td>
<td style="vertical-align: top; text-align: left">14.62</td>
<td style="vertical-align: top; text-align: left">11.46</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">CIZ (37)</td>
<td style="vertical-align: top; text-align: left">44.60</td>
<td style="vertical-align: top; text-align: left">30.87</td>
<td style="vertical-align: top; text-align: left">29.30</td>
<td style="vertical-align: top; text-align: left">21.94</td>
<td style="vertical-align: top; text-align: left">17.11</td>
<td style="vertical-align: top; text-align: left">15.67</td>
<td style="vertical-align: top; text-align: left">12.46</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">EID (43)</td>
<td style="vertical-align: top; text-align: left">46.27</td>
<td style="vertical-align: top; text-align: left">34.93</td>
<td style="vertical-align: top; text-align: left">31.53</td>
<td style="vertical-align: top; text-align: left">26.39</td>
<td style="vertical-align: top; text-align: left">21.78</td>
<td style="vertical-align: top; text-align: left">19.27</td>
<td style="vertical-align: top; text-align: left">15.39</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">DEK (1)</td>
<td style="vertical-align: top; text-align: left">30.59</td>
<td style="vertical-align: top; text-align: left">17.02</td>
<td style="vertical-align: top; text-align: left">14.61</td>
<td style="vertical-align: top; text-align: left">12.38</td>
<td style="vertical-align: top; text-align: left">9.84</td>
<td style="vertical-align: top; text-align: left">9.06</td>
<td style="vertical-align: top; text-align: left">6.89</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">JUK (46)</td>
<td style="vertical-align: top; text-align: left">41.78</td>
<td style="vertical-align: top; text-align: left">32.76</td>
<td style="vertical-align: top; text-align: left">29.35</td>
<td style="vertical-align: top; text-align: left">25.14</td>
<td style="vertical-align: top; text-align: left">21.68</td>
<td style="vertical-align: top; text-align: left">19.11</td>
<td style="vertical-align: top; text-align: left">17.54</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">LEO (34)</td>
<td style="vertical-align: top; text-align: left">40.27</td>
<td style="vertical-align: top; text-align: left">27.38</td>
<td style="vertical-align: top; text-align: left">22.97</td>
<td style="vertical-align: top; text-align: left">18.31</td>
<td style="vertical-align: top; text-align: left">14.72</td>
<td style="vertical-align: top; text-align: left">12.85</td>
<td style="vertical-align: top; text-align: left">6.50</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">MAL (49)</td>
<td style="vertical-align: top; text-align: left">47.90</td>
<td style="vertical-align: top; text-align: left">37.95</td>
<td style="vertical-align: top; text-align: left">32.46</td>
<td style="vertical-align: top; text-align: left">27.27</td>
<td style="vertical-align: top; text-align: left">23.55</td>
<td style="vertical-align: top; text-align: left">22.24</td>
<td style="vertical-align: top; text-align: left">17.38</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">RUP (36)</td>
<td style="vertical-align: top; text-align: left">35.39</td>
<td style="vertical-align: top; text-align: left">24.30</td>
<td style="vertical-align: top; text-align: left">21.59</td>
<td style="vertical-align: top; text-align: left">17.46</td>
<td style="vertical-align: top; text-align: left">13.45</td>
<td style="vertical-align: top; text-align: left">11.65</td>
<td style="vertical-align: top; text-align: left">8.78</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">SKA (47)</td>
<td style="vertical-align: top; text-align: left">46.70</td>
<td style="vertical-align: top; text-align: left">37.55</td>
<td style="vertical-align: top; text-align: left">35.74</td>
<td style="vertical-align: top; text-align: left">31.27</td>
<td style="vertical-align: top; text-align: left">26.28</td>
<td style="vertical-align: top; text-align: left">24.59</td>
<td style="vertical-align: top; text-align: left">19.59</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>Average</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>42.97</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>32.06</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>28.86</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>23.49</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>19.72</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>17.79</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>14.82</bold></td>
</tr>
<tr>
<td colspan="8" style="vertical-align: top; text-align: center; border-bottom: solid thin">(b) ASR setups based on graphemic lexicon (PER<sub>graphemic,*,categorial 3-gram,*</sub>)</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">Speaker (rating)</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">mono</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">tri_mfcc</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">tri_lda</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">tri_sat</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">sgmm</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">tdnn</td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin">blstm</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">ARM (50)</td>
<td style="vertical-align: top; text-align: left">54.30</td>
<td style="vertical-align: top; text-align: left">52.81</td>
<td style="vertical-align: top; text-align: left">48.02</td>
<td style="vertical-align: top; text-align: left">37.57</td>
<td style="vertical-align: top; text-align: left">35.95</td>
<td style="vertical-align: top; text-align: left">31.32</td>
<td style="vertical-align: top; text-align: left">32.19</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">BLA (39)</td>
<td style="vertical-align: top; text-align: left">42.37</td>
<td style="vertical-align: top; text-align: left">33.29</td>
<td style="vertical-align: top; text-align: left">29.80</td>
<td style="vertical-align: top; text-align: left">25.57</td>
<td style="vertical-align: top; text-align: left">21.36</td>
<td style="vertical-align: top; text-align: left">17.14</td>
<td style="vertical-align: top; text-align: left">12.94</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">CIZ (37)</td>
<td style="vertical-align: top; text-align: left">46.98</td>
<td style="vertical-align: top; text-align: left">34.82</td>
<td style="vertical-align: top; text-align: left">32.93</td>
<td style="vertical-align: top; text-align: left">26.05</td>
<td style="vertical-align: top; text-align: left">21.47</td>
<td style="vertical-align: top; text-align: left">18.60</td>
<td style="vertical-align: top; text-align: left">12.03</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">EID (43)</td>
<td style="vertical-align: top; text-align: left">48.40</td>
<td style="vertical-align: top; text-align: left">38.47</td>
<td style="vertical-align: top; text-align: left">34.96</td>
<td style="vertical-align: top; text-align: left">30.16</td>
<td style="vertical-align: top; text-align: left">25.79</td>
<td style="vertical-align: top; text-align: left">22.41</td>
<td style="vertical-align: top; text-align: left">16.56</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">DEK (1)</td>
<td style="vertical-align: top; text-align: left">33.48</td>
<td style="vertical-align: top; text-align: left">20.22</td>
<td style="vertical-align: top; text-align: left">17.44</td>
<td style="vertical-align: top; text-align: left">15.46</td>
<td style="vertical-align: top; text-align: left">12.94</td>
<td style="vertical-align: top; text-align: left">10.95</td>
<td style="vertical-align: top; text-align: left">7.50</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">JUK (46)</td>
<td style="vertical-align: top; text-align: left">45.08</td>
<td style="vertical-align: top; text-align: left">35.70</td>
<td style="vertical-align: top; text-align: left">32.72</td>
<td style="vertical-align: top; text-align: left">28.66</td>
<td style="vertical-align: top; text-align: left">25.72</td>
<td style="vertical-align: top; text-align: left">21.38</td>
<td style="vertical-align: top; text-align: left">18.46</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">LEO (34)</td>
<td style="vertical-align: top; text-align: left">41.20</td>
<td style="vertical-align: top; text-align: left">31.24</td>
<td style="vertical-align: top; text-align: left">27.78</td>
<td style="vertical-align: top; text-align: left">22.02</td>
<td style="vertical-align: top; text-align: left">18.80</td>
<td style="vertical-align: top; text-align: left">16.06</td>
<td style="vertical-align: top; text-align: left">7.60</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">MAL (49)</td>
<td style="vertical-align: top; text-align: left">48.35</td>
<td style="vertical-align: top; text-align: left">40.08</td>
<td style="vertical-align: top; text-align: left">35.44</td>
<td style="vertical-align: top; text-align: left">30.59</td>
<td style="vertical-align: top; text-align: left">26.94</td>
<td style="vertical-align: top; text-align: left">24.23</td>
<td style="vertical-align: top; text-align: left">18.48</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">RUP (36)</td>
<td style="vertical-align: top; text-align: left">37.17</td>
<td style="vertical-align: top; text-align: left">28.27</td>
<td style="vertical-align: top; text-align: left">25.28</td>
<td style="vertical-align: top; text-align: left">21.26</td>
<td style="vertical-align: top; text-align: left">17.66</td>
<td style="vertical-align: top; text-align: left">14.49</td>
<td style="vertical-align: top; text-align: left">9.86</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left">SKA (47)</td>
<td style="vertical-align: top; text-align: left">49.89</td>
<td style="vertical-align: top; text-align: left">40.55</td>
<td style="vertical-align: top; text-align: left">38.55</td>
<td style="vertical-align: top; text-align: left">34.64</td>
<td style="vertical-align: top; text-align: left">29.94</td>
<td style="vertical-align: top; text-align: left">26.91</td>
<td style="vertical-align: top; text-align: left">20.28</td>
</tr>
<tr>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>Average</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>44.72</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>35.54</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>32.29</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>27.20</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>23.66</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>20.35</bold></td>
<td style="vertical-align: top; text-align: left; border-bottom: solid thin"><bold>15.59</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To compare different word to sub-word unit mappings, we are mainly interested not in the PER values themselves but in the differences between PER<sub>LX,AM,LM,SPK</sub> values for different choices of LX, everything else being fixed (e.g. differences between corresponding cells of Tables <xref rid="j_info1233_tab_006">6</xref>a and <xref rid="j_info1233_tab_006">6</xref>b).</p>
<p>Let’s define a discrete random variable: 
<disp-formula id="j_info1233_eq_002">
<label>(2)</label><alternatives>
<mml:math display="block"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd><mml:mi mathvariant="italic">X</mml:mi><mml:msub><mml:mrow/><mml:mrow><mml:mtext>LX1, LX2, AM, LM</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo maxsize="2.03em" minsize="2.03em" fence="true">{</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac><mml:mrow><mml:mi mathvariant="italic">PER</mml:mi><mml:msub><mml:mrow/><mml:mrow><mml:mtext>LX2, AM, LM, i</mml:mtext></mml:mrow></mml:msub><mml:mo>−</mml:mo><mml:mi mathvariant="italic">PER</mml:mi><mml:msub><mml:mrow/><mml:mrow><mml:mtext>LX1, AM, LM, i</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi mathvariant="italic">PER</mml:mi><mml:msub><mml:mrow/><mml:mrow><mml:mtext>LX1, AM, LM,i</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle><mml:mo maxsize="2.45em" minsize="2.45em" fence="true">}</mml:mo><mml:mo mathvariant="normal">,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math>
<tex-math><![CDATA[\[ X{_{\text{LX1, LX2, AM, LM}}}=\bigg\{\frac{\mathit{PER}{_{\text{LX2, AM, LM, i}}}-\mathit{PER}{_{\text{LX1, AM, LM, i}}}}{\mathit{PER}{_{\text{LX1, AM, LM,i}}}}\Bigg\},\]]]></tex-math></alternatives>
</disp-formula> 
where index i ranges over speaker identities. This random variable represents a relative increase (if it is positive) or relative decrease (if it is negative) of PER as a consequence of replacing the lexicon LX1 with a lexicon LX2 in the ASR setup that has acoustic modelling technique AM and the phone-level language model LM fixed. Confidence intervals for this random variable can be computed by: 
<disp-formula id="j_info1233_eq_003">
<label>(3)</label><alternatives>
<mml:math display="block"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd><mml:mover accent="false"><mml:mrow><mml:mi mathvariant="italic">X</mml:mi></mml:mrow><mml:mo accent="true">‾</mml:mo></mml:mover><mml:msub><mml:mrow/><mml:mrow><mml:mtext>LX1, LX2, AM, LM</mml:mtext></mml:mrow></mml:msub><mml:mo>±</mml:mo><mml:mi mathvariant="italic">t</mml:mi><mml:mo>×</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac><mml:mrow><mml:mi mathvariant="italic">S</mml:mi><mml:msub><mml:mrow/><mml:mrow><mml:mtext>LX1, LX2, AM, LM</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:mi mathvariant="italic">n</mml:mi></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac></mml:mstyle><mml:mo mathvariant="normal">,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math>
<tex-math><![CDATA[\[ \overline{X}{_{\text{LX1, LX2, AM, LM}}}\pm t\times \frac{S{_{\text{LX1, LX2, AM, LM}}}}{\sqrt{n}},\]]]></tex-math></alternatives>
</disp-formula> 
where <inline-formula id="j_info1233_ineq_002"><alternatives>
<mml:math><mml:mover accent="false"><mml:mrow><mml:mi mathvariant="italic">X</mml:mi></mml:mrow><mml:mo accent="true">‾</mml:mo></mml:mover></mml:math>
<tex-math><![CDATA[$\overline{X}$]]></tex-math></alternatives></inline-formula><sub>LX1,LX2,AM,LM</sub> and S<sub>LX1,LX2,AM,LM</sub> are the mean and standard deviation of the random variable X<sub>LX1,LX2,AM,LM</sub>, <inline-formula id="j_info1233_ineq_003"><alternatives>
<mml:math><mml:mi mathvariant="italic">n</mml:mi><mml:mo>=</mml:mo><mml:mn>10</mml:mn></mml:math>
<tex-math><![CDATA[$n=10$]]></tex-math></alternatives></inline-formula> is the sample size and <inline-formula id="j_info1233_ineq_004"><alternatives>
<mml:math><mml:mi mathvariant="italic">t</mml:mi><mml:mo>=</mml:mo><mml:mn>2.262</mml:mn></mml:math>
<tex-math><![CDATA[$t=2.262$]]></tex-math></alternatives></inline-formula> is the <italic>t</italic>-value for the 95% confidence level with <inline-formula id="j_info1233_ineq_005"><alternatives>
<mml:math><mml:mi mathvariant="italic">n</mml:mi><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:math>
<tex-math><![CDATA[$n-1$]]></tex-math></alternatives></inline-formula> degrees of freedom. Fig. <xref rid="j_info1233_fig_002">2</xref> compares detailed phonemic lexicon with reduced phonemic lexicons and a graphemic lexicon. It shows means and confidence intervals for random variables X<sub>detailed,LX,AM,LM</sub> given different LX, AM, and LM values.</p>
<fig id="j_info1233_fig_002">
<label>Fig. 2</label>
<caption>
<p>Mean relative increase in phone error rate and 95% confidence intervals after substituting detailed phonemic transcription with a) graphemic transcription; b) “no-stress”, c) “no diphthongs”, d) “no palatalization”, e) “no affricates”, f) “no mixed diphthongs” phonemic transcriptions.</p>
</caption>
<graphic xlink:href="info1233_g002.jpg"/>
</fig>
<p>Plots of Fig. <xref rid="j_info1233_fig_002">2</xref> reveal the following tendencies:</p>
<list>
<list-item id="j_info1233_li_025">
<label>•</label>
<p>Detailed phonemic lexicon significantly outperforms graphemic lexicon across all investigated acoustic modelling techniques and all phone-level language models (Fig. <xref rid="j_info1233_fig_002">2</xref>(a)).</p>
</list-item>
<list-item id="j_info1233_li_026">
<label>•</label>
<p>Detailed phonemic lexicon that models diphthongs as a single unit significantly outperforms reduced phonemic lexicon that models diphthongs as a sequence of two units across all investigated acoustic modelling techniques and all phone-level language models (Fig. <xref rid="j_info1233_fig_002">2</xref>(c)).</p>
</list-item>
<list-item id="j_info1233_li_027">
<label>•</label>
<p>Detailed phonemic lexicon that preserves the distinction of stressed vs. non-stressed vowels and the distinction of palatalized vs. non-palatalized consonants performs significantly better than the lexicons that ignore stress (Fig. <xref rid="j_info1233_fig_002">2</xref>(b)) or palatalization (Fig. <xref rid="j_info1233_fig_002">2</xref>(d)). PER obtained with the LFR-BLSTM acoustic model is the only, albeit statistically not significant, exception to this tendency. We hypothesize that a bidirectional recurrent neural network is capable of capturing enough future context to model palatalization with a comparable accuracy to the lexicon that has distinct labels for palatalized and non-palatalized consonants.</p>
</list-item>
<list-item id="j_info1233_li_028">
<label>•</label>
<p>“No stress”, “no diphthongs” and graphemic lexicons (Fig. <xref rid="j_info1233_fig_002">2</xref>(b), <xref rid="j_info1233_fig_002">2</xref>(c), <xref rid="j_info1233_fig_002">2</xref>(a)) become even less attractive if decoder is provided with more phonotactic knowledge.</p>
</list-item>
<list-item id="j_info1233_li_029">
<label>•</label>
<p>“No affricates” phonemic lexicon (modelling affricates by two sub-word units) slightly outperforms the detailed lexicon if the ASR setup consists of GMM-based or TDNN acoustic models and decoding is done with a categorial phone 3-gram (Fig. <xref rid="j_info1233_fig_002">2</xref>(e)). Giving more phonotactic knowledge to the decoder (probabilistic phone 3-gram or 4-gram) seems to reverse this tendency. The BLSTM acoustic model is also in favour of the detailed lexicon. However, all observed differences between the detailed and “no affricates” lexicons are not statistically significant.</p>
</list-item>
<list-item id="j_info1233_li_030">
<label>•</label>
<p>It seems that distinguishing sonorants that are part of a mixed diphthong from the regular ones may be slightly preferred (Fig. <xref rid="j_info1233_fig_002">2</xref>(f)), though such a preference is not proven to be statistically significant.</p>
</list-item>
<list-item id="j_info1233_li_031">
<label>•</label>
<p>The LFR BLSTM acoustic model shows higher variability (Fig. <xref rid="j_info1233_fig_002">2</xref>(d)–<xref rid="j_info1233_fig_002">2</xref>(f)) of the relative increase in PER in comparison to other acoustic models. Higher variability is due to the randomness of the BLSTM training procedure<xref ref-type="fn" rid="j_info1233_fn_026">27</xref><fn id="j_info1233_fn_026"><label><sup>27</sup></label>
<p>We have observed that PER on the test subset may differ by as much as 0.5–1.0% for two random initializations (training subset, validation subset, initial weights).</p></fn> and usually lower denominator values (variable X<sub>LX1, LX2, AM, LM</sub> in expression (2)).</p>
</list-item>
</list>
<p>Absolute PER values for different acoustic modelling techniques are shown in Fig. <xref rid="j_info1233_fig_003">3</xref>.</p>
<fig id="j_info1233_fig_003">
<label>Fig. 3</label>
<caption>
<p>Means and 95% confidence intervals of the phone error rate (PER).</p>
</caption>
<graphic xlink:href="info1233_g003.jpg"/>
</fig>
<p>Though many different acoustic modelling techniques have been tried in this study, we do not make claims about their relative performance,<xref ref-type="fn" rid="j_info1233_fn_027">28</xref><fn id="j_info1233_fn_027"><label><sup>28</sup></label>
<p>Decoding results obtained on the basis of LFR BLSTM acoustic models cannot be directly compared to other decoding results because this ASR setup was trained on 3 copies of speed-perturbed data.</p></fn> because we would need to prove that the optimum configuration was chosen for every acoustic modelling technique. Such an investigation was out of the scope of this paper. However, it seems that ASR setups based on the recurrent deep neural network acoustic models compare well to the other acoustic modelling techniques. This result is in line with the general tendency in the ASR domain and represents the direction to go forward.</p>
</sec>
<sec id="j_info1233_s_013">
<label>5</label>
<title>Discussion and Conclusions</title>
<p>This paper reviewed 15 years of research on the problem of the optimum word to sub-word unit mapping for the purposes of the Lithuanian ASR. It presented a common framework to compare different phonemic word to sub-word mappings. It also investigated and compared multiple phonemic and graphemic word to sub-word mappings across a broad range of acoustic modelling techniques.</p>
<p>Our investigation has shown that phonemic mappings outperform graphemic mappings by a large margin. We assume that other studies, which have found graphemic mappings better (Gales <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_005">2015</xref>) or comparable (Lileikytė <italic>et al.</italic>, <xref ref-type="bibr" rid="j_info1233_ref_018">2018</xref>) in performance to phonemic ones, arrived at this result by contrasting graphemic mappings to phonemic mappings lacking important features. For instance, phonemic lexicons investigated by Lileikytė <italic>et al.</italic> (<xref ref-type="bibr" rid="j_info1233_ref_018">2018</xref>) lack stressed allophones, whereas the importance of distinguishing stressed and non-stressed allophones is demonstrated in this study.</p>
<p>Though our investigation has not revealed which phonemic mapping is the best one, it gave insights about which mappings should not be used. Phonemic mappings that model diphthongs by two symbols and/or ignore stress were statistically significantly outperformed by the most detailed lexicon. What is the best approach to model palatalization, mixed diphthongs and affricates is still subject to future investigation.</p>
<p>Our findings were obtained in the framework of separately tuning an acoustic model of the ASR system. Categorial phone 3-gram and PER criterion have helped us to eliminate lexical and syntactic-semantic layers of the ASR system and to evaluate word to sub-word unit mappings on the basis of the performance of an acoustic model alone. It is worth addressing the question of the best word to sub-word unit mapping in the framework of jointly tuning the complete ASR system (acoustic and word-level language models together) and checking if the gains in PER observed with an isolated acoustic model translate into the WER gains of the jointly optimized system.</p>
<p>Detailed lexicon was among the best performing lexicons investigated in this study. Thus, we believe that data scarcity played no major role in our investigations and our findings might be valid for corpora that are larger than 50 hours. It might be worth investigating even more detailed word to sub-word unit mappings including syllables, syllable-like units, consonant clusters, etc. following the suggestion of Laurinčiukaitė (<xref ref-type="bibr" rid="j_info1233_ref_015">2008</xref>).</p>
</sec>
</body>
<back>
<ref-list id="j_info1233_reflist_001">
<title>References</title>
<ref id="j_info1233_ref_001">
<mixed-citation publication-type="chapter"><string-name><surname>Alumäe</surname>, <given-names>T.</given-names></string-name>, <string-name><surname>Tilk</surname>, <given-names>O.</given-names></string-name> (<year>2016</year>). <chapter-title>Automatic speech recognition system for Lithuanian broadcast audio</chapter-title>. In: <source>Human Language Technologies – The Baltic Perspective: Proceedings of the Seventh International Conference, Baltic HLT 2016</source>, Vol. <volume>289</volume>, pp. <fpage>39</fpage>–<lpage>45</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_002">
<mixed-citation publication-type="other"><string-name><surname>Collobert</surname>, <given-names>R.</given-names></string-name>, <string-name><surname>Puhrsch</surname>, <given-names>C.</given-names></string-name>, <string-name><surname>Synnaeve</surname>, <given-names>G.</given-names></string-name> (2016). Wav2Letter: an end-to-end ConvNet-based speech recognition system. <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/arXiv:1609.03193">arXiv:1609.03193</ext-link> [cs.LG].</mixed-citation>
</ref>
<ref id="j_info1233_ref_003">
<mixed-citation publication-type="journal"><string-name><surname>Gales</surname>, <given-names>M.J.F.</given-names></string-name> (<year>1998</year>). <article-title>Maximum likelihood linear transformations for HMM-based speech recognition</article-title>. <source>Computer Speech and Language</source>, <volume>12</volume>(<issue>2</issue>), <fpage>75</fpage>–<lpage>98</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_004">
<mixed-citation publication-type="journal"><string-name><surname>Gales</surname>, <given-names>M.J.F.</given-names></string-name> (<year>1999</year>). <article-title>Semi-tied covariance matrices for hidden Markov models</article-title>. <source>IEEE Transactions on Speech and Audio Processing</source>, <volume>7</volume>, <fpage>272</fpage>–<lpage>281</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_005">
<mixed-citation publication-type="chapter"><string-name><surname>Gales</surname>, <given-names>M.J.F.</given-names></string-name>, <string-name><surname>Knill</surname>, <given-names>K.M.</given-names></string-name>, <string-name><surname>Ragni</surname>, <given-names>A.</given-names></string-name> (<year>2015</year>). <chapter-title>Unicode-based graphemic systems for limited resource languages</chapter-title>. In: <source>2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source>, pp. <fpage>5186</fpage>–<lpage>5190</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_006">
<mixed-citation publication-type="book"><string-name><surname>Girdenis</surname>, <given-names>A.</given-names></string-name> (<year>2014</year>). <source>Theoretical Foundations of Lithuanian Phonology</source>. <comment>English translation by Steven Young, XVII, 413 p</comment>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_007">
<mixed-citation publication-type="chapter"><string-name><surname>Greibus</surname>, <given-names>M.</given-names></string-name>, <string-name><surname>Ringelienė</surname>, <given-names>Ž.</given-names></string-name>, <string-name><surname>Telksnys</surname>, <given-names>A.L.</given-names></string-name> (<year>2017</year>). <chapter-title>The phoneme set influence for Lithuanian speech commands recognition accuracy</chapter-title>. In: <source>Proceedings of the Conference Electrical, Electronic and Information Sciences (eStream)</source>, pp. <fpage>1</fpage>–<lpage>4</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_008">
<mixed-citation publication-type="other"><string-name><surname>Harper</surname>, <given-names>M.</given-names></string-name> (2016). <italic>Babel: US IARPA Project (2012–2016)</italic>. <ext-link ext-link-type="uri" xlink:href="https://www.iarpa.gov/index.php/research-programs/babel">https://www.iarpa.gov/index.php/research-programs/babel</ext-link>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_009">
<mixed-citation publication-type="chapter"><string-name><surname>Kanthak</surname>, <given-names>S.</given-names></string-name>, <string-name><surname>Ney</surname>, <given-names>H.</given-names></string-name> (<year>2002</year>). <chapter-title>Context-dependent acoustic modeling using graphemes for large vocabulary speech recognition</chapter-title>. In: <source>Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source>, Vol. <volume>2</volume>, pp. <fpage>845</fpage>–<lpage>848</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_010">
<mixed-citation publication-type="other"><string-name><surname>Kazlauskienė</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Raškinis</surname>, <given-names>G.</given-names></string-name>, <string-name><surname>Vaičiūnas</surname>, <given-names>A.</given-names></string-name> (2010). <italic>Automatic Syllabification, Stress Assignment and Phonetic Transcription of Lithuanian Words</italic> (in Lithuanian).</mixed-citation>
</ref>
<ref id="j_info1233_ref_011">
<mixed-citation publication-type="chapter"><string-name><surname>Killer</surname>, <given-names>M.</given-names></string-name>, <string-name><surname>Stüker</surname>, <given-names>S.</given-names></string-name>, <string-name><surname>Schultz</surname>, <given-names>T.</given-names></string-name> (<year>2003</year>). <chapter-title>Grapheme based speech recognition</chapter-title>. In: <source>Proceedings of Interspeech-2003</source>, pp. <fpage>3141</fpage>–<lpage>3144</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_012">
<mixed-citation publication-type="chapter"><string-name><surname>Ko</surname>, <given-names>T.</given-names></string-name>, <string-name><surname>Peddinti</surname>, <given-names>V.</given-names></string-name>, <string-name><surname>Povey</surname>, <given-names>D.</given-names></string-name>, <string-name><surname>Khudanpur</surname>, <given-names>S.</given-names></string-name> (<year>2015</year>). <chapter-title>Audio augmentation for speech recognition</chapter-title>. In: <source>Proceedings of Interspeech-2015</source>, pp. <fpage>3586</fpage>–<lpage>3589</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_013">
<mixed-citation publication-type="journal"><string-name><surname>Laurinčiukaitė</surname>, <given-names>S.</given-names></string-name>, <string-name><surname>Šilingas</surname>, <given-names>D.</given-names></string-name>, <string-name><surname>Skripkauskas</surname>, <given-names>M.</given-names></string-name>, <string-name><surname>Telksnys</surname>, <given-names>L.</given-names></string-name> (<year>2006</year>). <article-title>Lithuanian continuous speech corpus LRN 0.1: design and potential applications</article-title>. <source>Information Technology and Control</source>, <volume>35</volume>(<issue>4</issue>), <fpage>431</fpage>–<lpage>440</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_014">
<mixed-citation publication-type="journal"><string-name><surname>Laurinčiukaitė</surname>, <given-names>S.</given-names></string-name>, <string-name><surname>Lipeika</surname>, <given-names>A.</given-names></string-name> (<year>2007</year>). <article-title>Framework for choosing a set of syllables and phonemes for Lithuanian speech recognition</article-title>. <source>Informatica</source>, <volume>18</volume>(<issue>3</issue>), <fpage>395</fpage>–<lpage>406</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_015">
<mixed-citation publication-type="other"><string-name><surname>Laurinčiukaitė</surname>, <given-names>S.</given-names></string-name> (2008). <italic>Acoustic Modeling of Lithuanian Speech Recognition</italic>. PhD Thesis (in Lithuanian).</mixed-citation>
</ref>
<ref id="j_info1233_ref_016">
<mixed-citation publication-type="journal"><string-name><surname>Laurinčiukaitė</surname>, <given-names>S.</given-names></string-name>, <string-name><surname>Telksnys</surname>, <given-names>L.</given-names></string-name>, <string-name><surname>Kasparaitis</surname>, <given-names>P.</given-names></string-name>, <string-name><surname>Kliukienė</surname>, <given-names>R.</given-names></string-name>, <string-name><surname>Paukštytė</surname>, <given-names>V.</given-names></string-name> (<year>2018</year>). <article-title>Lithuanian speech corpus Liepa for development of human-computer interfaces working in voice recognition and synthesis mode</article-title>. <source>Informatica</source>, <volume>29</volume>(<issue>3</issue>), <fpage>487</fpage>–<lpage>498</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_017">
<mixed-citation publication-type="journal"><string-name><surname>Lileikytė</surname>, <given-names>R.</given-names></string-name>, <string-name><surname>Gorin</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Lamel</surname>, <given-names>L.</given-names></string-name>, <string-name><surname>Gauvain</surname>, <given-names>J.</given-names></string-name>, <string-name><surname>Fraga-Silva</surname>, <given-names>T.</given-names></string-name> (<year>2016</year>). <article-title>Lithuanian broadcast speech transcription using semi-supervised acoustic model training</article-title>. <source>Proceedings of Computer Science</source>, <volume>81</volume>, <fpage>107</fpage>–<lpage>113</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_018">
<mixed-citation publication-type="journal"><string-name><surname>Lileikytė</surname>, <given-names>R.</given-names></string-name>, <string-name><surname>Lamel</surname>, <given-names>L.</given-names></string-name>, <string-name><surname>Gauvain</surname>, <given-names>J.</given-names></string-name>, <string-name><surname>Gorin</surname>, <given-names>A.</given-names></string-name> (<year>2018</year>). <article-title>Conversational telephone speech recognition for Lithuanian</article-title>. <source>Computer Speech and Language</source>, <volume>49</volume>, <fpage>71</fpage>–<lpage>92</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_019">
<mixed-citation publication-type="chapter"><string-name><surname>Norkevičius</surname>, <given-names>G.</given-names></string-name>, <string-name><surname>Raškinis</surname>, <given-names>G.</given-names></string-name>, <string-name><surname>Kazlauskienė</surname>, <given-names>A.</given-names></string-name> (<year>2005</year>). <chapter-title>Knowledge-based grapheme-to-phoneme conversion of Lithuanian words</chapter-title>. In: <source>SPECOM 2005, 10th International Conference Speech and Computer</source>, pp. <fpage>235</fpage>–<lpage>238</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_020">
<mixed-citation publication-type="other"><string-name><surname>Pakerys</surname>, <given-names>A.</given-names></string-name> (2003). <italic>Lietuvių bendrinės kalbos fonetika</italic> [Phonetics of Standard Lithuanian]. Vilnius, Enciklopedija, 35, pp. 83–84.</mixed-citation>
</ref>
<ref id="j_info1233_ref_021">
<mixed-citation publication-type="chapter"><string-name><surname>Povey</surname>, <given-names>D.</given-names></string-name>, <string-name><surname>Ghoshal</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Boulianne</surname>, <given-names>G.</given-names></string-name>, <string-name><surname>Burget</surname>, <given-names>L.</given-names></string-name>, <string-name><surname>Glembek</surname>, <given-names>O.</given-names></string-name>, <string-name><surname>Goel</surname>, <given-names>N.</given-names></string-name>, <string-name><surname>Hannemann</surname>, <given-names>M.</given-names></string-name>, <string-name><surname>Motlicek</surname>, <given-names>P.</given-names></string-name>, <string-name><surname>Qian</surname>, <given-names>Y.</given-names></string-name>, <string-name><surname>Schwarz</surname>, <given-names>P.</given-names></string-name>, <string-name><surname>Silovsky</surname>, <given-names>J.</given-names></string-name>, <string-name><surname>Stemmer</surname>, <given-names>G.</given-names></string-name>, <string-name><surname>Vesely</surname>, <given-names>K.</given-names></string-name> (<year>2011</year>a). <chapter-title>The Kaldi speech recognition toolkit</chapter-title>. In: <source>IEEE 2011 Workshop on Automatic Speech Recognition and Understanding (ASRU)</source>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_022">
<mixed-citation publication-type="journal"><string-name><surname>Povey</surname>, <given-names>D.</given-names></string-name>, <string-name><surname>Burget</surname>, <given-names>L.</given-names></string-name>, <string-name><surname>Agarwal</surname>, <given-names>M.</given-names></string-name>, <string-name><surname>Akyazi</surname>, <given-names>P.</given-names></string-name>, <string-name><surname>Feng</surname>, <given-names>K.</given-names></string-name>, <string-name><surname>Ghoshal</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Glembek</surname>, <given-names>O.</given-names></string-name>, <string-name><surname>Goel</surname>, <given-names>N.</given-names></string-name>, <string-name><surname>Karafiát</surname>, <given-names>M.</given-names></string-name>, <string-name><surname>Rastrow</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Rose</surname>, <given-names>R.C.</given-names></string-name>, <string-name><surname>Schwarz</surname>, <given-names>P.</given-names></string-name>, <string-name><surname>Thomas</surname>, <given-names>S.</given-names></string-name> (<year>2011</year>b). <article-title>The subspace Gaussian mixture model – a structured model for speech recognition</article-title>. <source>Computer Speech and Language</source>, <volume>25</volume>(<issue>2</issue>), <fpage>404</fpage>–<lpage>439</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_023">
<mixed-citation publication-type="chapter"><string-name><surname>Povey</surname>, <given-names>D.</given-names></string-name>, <string-name><surname>Peddinti</surname>, <given-names>V.</given-names></string-name>, <string-name><surname>Galvez</surname>, <given-names>D.</given-names></string-name>, <string-name><surname>Ghahremani</surname>, <given-names>P.</given-names></string-name>, <string-name><surname>Manohar</surname>, <given-names>V.</given-names></string-name>, <string-name><surname>Na</surname>, <given-names>X.</given-names></string-name>, <string-name><surname>Wang</surname>, <given-names>Y.</given-names></string-name>, <string-name><surname>Khudanpur</surname>, <given-names>S.</given-names></string-name> (<year>2016</year>). <chapter-title>Purely sequence-trained neural networks for ASR based on lattice-free MMI</chapter-title>. In: <source>Proceedings of Interspeech-2016</source>, pp. <fpage>2751</fpage>–<lpage>2755</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_024">
<mixed-citation publication-type="chapter"><string-name><surname>Raškinis</surname>, <given-names>G.</given-names></string-name>, <string-name><surname>Raškinienė</surname>, <given-names>D.</given-names></string-name> (<year>2003</year>). <chapter-title>Parameter investigation and optimization for the Lithuanian HMM-based speech recognition system</chapter-title>. In: <source>Proceedings of the Conference “Information Technologies 2003”</source>, pp. <fpage>41</fpage>–<lpage>48</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_025">
<mixed-citation publication-type="journal"><string-name><surname>Raškinis</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Raškinis</surname>, <given-names>G.</given-names></string-name>, <string-name><surname>Kazlauskienė</surname>, <given-names>A.</given-names></string-name> (<year>2003</year>). <article-title>Speech assessment methods phonetic alphabet (SAMPA) for encoding transcriptions of Lithuanian speech corpora</article-title>. <source>Information Technology and Control</source>, <volume>29</volume>(<issue>4</issue>), <fpage>52</fpage>–<lpage>55</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_026">
<mixed-citation publication-type="journal"><string-name><surname>Ratkevicius</surname>, <given-names>K.</given-names></string-name>, <string-name><surname>Paskauskaite</surname>, <given-names>G.</given-names></string-name>, <string-name><surname>Bartisiute</surname>, <given-names>G.</given-names></string-name> (<year>2018</year>). <article-title>Advanced recognition of Lithuanian digit names using hybrid approach</article-title>. <source>Elektronika ir Elektrotechnika</source>, <volume>24</volume>(<issue>2</issue>), <fpage>70</fpage>–<lpage>73</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_027">
<mixed-citation publication-type="chapter"><string-name><surname>Rudžionis</surname>, <given-names>V.</given-names></string-name>, <string-name><surname>Ratkevičius</surname>, <given-names>K.</given-names></string-name>, <string-name><surname>Rudžionis</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Raškinis</surname>, <given-names>G.</given-names></string-name>, <string-name><surname>Maskeliūnas</surname>, <given-names>R.</given-names></string-name> (<year>2013</year>). <chapter-title>Recognition of voice commands using hybrid approach</chapter-title>. In: <source>Information and Software Technologies. ICIST 2013. Communications in Computer and Information Science</source>, Vol. 403, pp. <fpage>249</fpage>–<lpage>260</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_028">
<mixed-citation publication-type="chapter"><string-name><surname>Salimbajevs</surname>, <given-names>A.</given-names></string-name>, <string-name><surname>Kapočiūtė-Dzikienė</surname>, <given-names>J.</given-names></string-name> (<year>2018</year>). <chapter-title>General-purpose Lithuanian automatic speech recognition system</chapter-title>. In: <source>Human Language Technologies – The Baltic Perspective</source>, pp. <fpage>150</fpage>–<lpage>157</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_029">
<mixed-citation publication-type="chapter"><string-name><surname>Saon</surname>, <given-names>G.</given-names></string-name>, <string-name><surname>Soltau</surname>, <given-names>H.</given-names></string-name>, <string-name><surname>Nahamoo</surname>, <given-names>D.</given-names></string-name>, <string-name><surname>Picheny</surname>, <given-names>M.</given-names></string-name> (<year>2013</year>). <chapter-title>Speaker adaptation of neural network acoustic models using i-vectors</chapter-title>. In: <source>Automatic Speech Recognition and Understanding (ASRU), 2013 IEEE Workshop</source>, pp. <fpage>55</fpage>–<lpage>59</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_030">
<mixed-citation publication-type="journal"><string-name><surname>Skripkauskas</surname>, <given-names>M.</given-names></string-name>, <string-name><surname>Telksnys</surname>, <given-names>L.</given-names></string-name> (<year>2006</year>). <article-title>Automatic transcription of Lithuanian text using dictionary</article-title>. <source>Informatica</source>, <volume>17</volume>(<issue>4</issue>), <fpage>587</fpage>–<lpage>600</lpage>.</mixed-citation>
</ref>
<ref id="j_info1233_ref_031">
<mixed-citation publication-type="other"><string-name><surname>Šilingas</surname>, <given-names>D.</given-names></string-name> (2005). <italic>Choosing Acoustic Modeling Units for Lithuanian Continuous Speech Recognition Based on Hidden Markov Models</italic>. PhD Thesis (in Lithuanian).</mixed-citation>
</ref>
<ref id="j_info1233_ref_032">
<mixed-citation publication-type="chapter"><string-name><surname>Zhang</surname>, <given-names>X.</given-names></string-name>, <string-name><surname>Trmal</surname>, <given-names>J.</given-names></string-name>, <string-name><surname>Povey</surname>, <given-names>D.</given-names></string-name>, <string-name><surname>Khudanpur</surname>, <given-names>S.</given-names></string-name> (<year>2014</year>). <chapter-title>Improving deep neural network acoustic models using generalized maxout networks</chapter-title>. In: <source>2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source>, pp. <fpage>215</fpage>–<lpage>219</lpage>.</mixed-citation>
</ref>
</ref-list>
</back>
</article>