@inproceedings{MTMT:33437265,
  title = {BEA-Base: A Benchmark for ASR of Spontaneous Hungarian},
  url = {https://m2.mtmt.hu/api/publication/33437265},
  author = {Mihajlik, Péter and Balog, András and Gráczi, Tekla Etelka and Kohári, Anna and Tarján, Balázs and Mády, Katalin},
  booktitle = {Proceedings of the 13th Language Resources and Evaluation Conference},
  unique-id = {33437265},
  year = {2022},
  pages = {1970--1977},
  orcid-numbers = {Mihajlik, Péter/0000-0001-7532-9773; Gráczi, Tekla Etelka/0000-0003-3351-9661; Kohári, Anna/0000-0003-2500-0149; Tarján, Balázs/0000-0002-9676-3082}
}

@inbook{MTMT:33283256,
  title = {A BEA továbbfejlesztése és alkalmazása kontrasztív gépi beszédfelismerési kísérletekre},
  url = {https://m2.mtmt.hu/api/publication/33283256},
  author = {Mihajlik, Péter and Gráczi, Tekla Etelka and Kohári, Anna and Tarján, Balázs and Balog, András and Mády, Katalin},
  booktitle = {Általános nyelvészeti tanulmányok 34.},
  unique-id = {33283256},
  year = {2022},
  pages = {361--380},
  note = {In Hungarian. [Further development of BEA and its application to contrastive machine speech recognition experiments]},
  orcid-numbers = {Mihajlik, Péter/0000-0001-7532-9773; Gráczi, Tekla Etelka/0000-0003-3351-9661; Kohári, Anna/0000-0003-2500-0149; Tarján, Balázs/0000-0002-9676-3082}
}

@article{MTMT:33267111,
  title = {Morphology aware data augmentation with neural language models for online hybrid ASR},
  url = {https://m2.mtmt.hu/api/publication/33267111},
  author = {Tarján, Balázs and Fegyó, Tibor and Mihajlik, Péter},
  doi = {10.1556/2062.2022.00582},
  journal-iso = {ACTA LING ACAD},
  journal = {ACTA LINGUISTICA ACADEMICA},
  volume = {69},
  unique-id = {33267111},
  issn = {2559-8201},
  abstract = {Recognition of Hungarian conversational telephone speech is challenging due to the informal style and morphological richness of the language. Neural Network Language Models (NNLMs) can provide remedy for the high perplexity of the task; however, their high complexity makes them very difficult to apply in the first (single) pass of an online system. Recent studies showed that a considerable part of the knowledge of NNLMs can be transferred to traditional n-grams by using neural text generation based data augmentation. Data augmentation with NNLMs works well for isolating languages; however, we show that it causes a vocabulary explosion in a morphologically rich language. Therefore, we propose a new, morphology aware neural text augmentation method, where we retokenize the generated text into statistically derived subwords. We compare the performance of word-based and subword-based data augmentation techniques with recurrent and Transformer language models and show that subword-based methods can significantly improve the Word Error Rate (WER) while greatly reducing vocabulary size and memory requirements. Combining subword-based modeling and neural language model-based data augmentation, we were able to achieve 11% relative WER reduction and preserve real-time operation of our conversational telephone speech recognition system. Finally, we also demonstrate that subword-based neural text augmentation outperforms the word-based approach not only in terms of overall WER but also in recognition of Out-of-Vocabulary (OOV) words.},
  year = {2022},
  eissn = {2560-1016},
  pages = {581--598},
  orcid-numbers = {Tarján, Balázs/0000-0002-9676-3082; Fegyó, Tibor/0000-0003-0938-1965; Mihajlik, Péter/0000-0001-7532-9773}
}

@mastersthesis{MTMT:32498304,
  title = {Language Modeling for Hungarian Speech Recognition},
  url = {https://m2.mtmt.hu/api/publication/32498304},
  author = {Tarján, Balázs},
  school = {Budapest University of Technology and Economics},
  unique-id = {32498304},
  year = {2021},
  orcid-numbers = {Tarján, Balázs/0000-0002-9676-3082}
}

@inproceedings{MTMT:31881360,
  title = {End-to-end és hibrid mélyneuronháló alapú gépi leiratozás magyar nyelvű telefonos ügyfélszolgálati beszélgetésekre},
  url = {https://m2.mtmt.hu/api/publication/31881360},
  author = {Mihajlik, Péter and Balog, András and Tarján, Balázs and Fegyó, Tibor},
  booktitle = {XVII. Magyar Számítógépes Nyelvészeti Konferencia: MSZNY 2021},
  unique-id = {31881360},
  year = {2021},
  pages = {139--145},
  note = {In Hungarian. [End-to-end and hybrid deep neural network based automatic transcription of Hungarian telephone customer service conversations]},
  orcid-numbers = {Mihajlik, Péter/0000-0001-7532-9773; Tarján, Balázs/0000-0002-9676-3082; Fegyó, Tibor/0000-0003-0938-1965}
}

@misc{MTMT:31855595,
  title = {Deep Transformer based Data Augmentation with Subword Units for Morphologically Rich Online ASR},
  url = {https://m2.mtmt.hu/api/publication/31855595},
  author = {Tarján, Balázs and Szaszák, György and Fegyó, Tibor and Mihajlik, Péter},
  unique-id = {31855595},
  year = {2020},
  orcid-numbers = {Tarján, Balázs/0000-0002-9676-3082; Fegyó, Tibor/0000-0003-0938-1965; Mihajlik, Péter/0000-0001-7532-9773}
}

@inproceedings{MTMT:31621427,
  title = {Improving Real-time Recognition of Morphologically Rich Speech with Transformer Language Model},
  url = {https://m2.mtmt.hu/api/publication/31621427},
  author = {Tarján, Balázs and Szaszák, György and Fegyó, Tibor and Mihajlik, Péter},
  booktitle = {11th IEEE International Conference on Cognitive Infocommunications (CogInfoCom 2020)},
  doi = {10.1109/CogInfoCom50765.2020.9237817},
  unique-id = {31621427},
  year = {2020},
  pages = {491--496},
  orcid-numbers = {Tarján, Balázs/0000-0002-9676-3082; Fegyó, Tibor/0000-0003-0938-1965; Mihajlik, Péter/0000-0001-7532-9773}
}

@inproceedings{MTMT:31608551,
  title = {On the Effectiveness of Neural Text Generation Based Data Augmentation for Recognition of Morphologically Rich Speech},
  url = {https://m2.mtmt.hu/api/publication/31608551},
  author = {Tarján, Balázs and Szaszák, György and Fegyó, Tibor and Mihajlik, Péter},
  booktitle = {Text, Speech, and Dialogue: TSD 2020},
  doi = {10.1007/978-3-030-58323-1_47},
  unique-id = {31608551},
  year = {2020},
  pages = {437--445},
  orcid-numbers = {Tarján, Balázs/0000-0002-9676-3082; Fegyó, Tibor/0000-0003-0938-1965; Mihajlik, Péter/0000-0001-7532-9773}
}
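The morphology-aware data augmentation described in the Acta Linguistica Academica abstract above (and studied further in the TSD 2020 and CogInfoCom 2020 papers) amounts to three steps: generate extra training text with a neural language model, retokenize it into statistically derived subword units so the vocabulary stays bounded, and estimate a conventional n-gram language model on the subword corpus for single-pass, real-time decoding. The sketch below illustrates only the retokenization step; it is not the authors' code, and the use of SentencePiece, the file names, and the vocabulary size are assumptions made for the example.

```python
# Illustrative retokenization of neurally generated augmentation text into
# statistically derived subword units. Assumptions (not from the papers):
# SentencePiece BPE, an 8k vocabulary, and the file names used below.
import sentencepiece as spm

# 1) Learn subword units on the original in-domain training text.
spm.SentencePieceTrainer.Train(
    "--input=train_text.txt --model_prefix=subword "
    "--vocab_size=8000 --model_type=bpe"
)

sp = spm.SentencePieceProcessor()
sp.Load("subword.model")

# 2) Retokenize the text generated by the neural LM, so that rare inflected
#    word forms are split into known subwords instead of inflating the vocabulary.
with open("generated_text.txt", encoding="utf-8") as fin, \
     open("generated_subwords.txt", "w", encoding="utf-8") as fout:
    for line in fin:
        fout.write(" ".join(sp.EncodeAsPieces(line.strip())) + "\n")

# 3) A count-based n-gram LM (e.g. with SRILM or KenLM) is then trained on the
#    retokenized generated text plus the retokenized baseline corpus and used
#    in the first decoding pass.
```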
@article{MTMT:31483149,
  title = {A low latency sequential model and its user-focused evaluation for automatic punctuation of ASR closed captions},
  url = {https://m2.mtmt.hu/api/publication/31483149},
  author = {Tündik, Máté Ákos and Tarján, Balázs and Szaszák, György},
  doi = {10.1016/j.csl.2020.101076},
  journal-iso = {COMPUT SPEECH LANG},
  journal = {COMPUTER SPEECH AND LANGUAGE},
  volume = {63},
  unique-id = {31483149},
  issn = {0885-2308},
  abstract = {In Automatic Speech Recognition (ASR), inserting the punctuation marks into the word chain hypothesis has long been given low priority, as efforts were concentrated on minimizing word error rates. Punctuation, however, also has a high impact on the transcription quality perceived by the users. Prosody, textual context and their combination have since been used successfully for automatic punctuation of ASR outputs. The recently proposed RNN based solutions show encouraging performance. We believe that current bottlenecks of punctuation technology are on one hand the complex punctuation models, which, having high latency, are not suitable for use-cases with real-time requirements; and on the other hand, punctuation efforts have not been validated against human perception and user impression. The ambition of this paper is to propose a lightweight, yet powerful RNN punctuation model for on-line (real-time including low latency) environment, and also to assess user opinion, in general and also for target users living with hearing loss or impairment. The proposed online RNN punctuation model is evaluated against a Maximum Entropy (MaxEnt) baseline, for Hungarian and for English, whereas subjective assessment tests are carried out on real broadcast data subtitled with ASR (closed captioning). As it can be expected, the RNN outperforms the MaxEnt baseline system, but of course not the off-line systems: limiting the future context to minimize latency results only in a slighter performance drop, but ASR errors obviously influence punctuation performance considerably. A genre analysis is also carried out w.r.t. the punctuation performance showing that both recognition and punctuation of more spontaneous speech styles is challenging. Overall, the subjective tests confirmed that users perceive a significant quality improvement when punctuation is added, even in presence of word errors and even if punctuation is automatic and hence itself may contain further errors. For users living with hearing loss or deafness, an even higher, clear preference for the punctuated captions could be confirmed.},
  keywords = {maximum entropy; LSTM; low latency; mean opinion score; Recurrent neural network; punctuation; Real-time modelling; User-focused evaluation; Closed captioning},
  year = {2020},
  eissn = {1095-8363},
  orcid-numbers = {Tündik, Máté Ákos/0000-0003-2085-0136; Tarján, Balázs/0000-0002-9676-3082}
}

@inproceedings{MTMT:31640248,
  title = {N-gram Approximation of LSTM Recurrent Language Models for Single-pass Recognition of Hungarian Call Center Conversations},
  url = {https://m2.mtmt.hu/api/publication/31640248},
  author = {Tarján, Balázs and Szaszák, György and Fegyó, Tibor and Mihajlik, Péter},
  booktitle = {10th IEEE International Conference on Cognitive Infocommunications (CogInfoCom 2019)},
  doi = {10.1109/CogInfoCom47531.2019.9089959},
  unique-id = {31640248},
  year = {2019},
  pages = {131--136},
  orcid-numbers = {Tarján, Balázs/0000-0002-9676-3082; Fegyó, Tibor/0000-0003-0938-1965; Mihajlik, Péter/0000-0001-7532-9773}
}
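The low-latency punctuation model in the Tündik, Tarján and Szaszák article above is, at its core, a recurrent tagger whose predictions are delayed by only a few words of future context. The following is a minimal sketch of that idea, assuming PyTorch; the layer sizes, the punctuation label set and the two-word look-ahead are illustrative assumptions, not the configuration reported in the paper.

```python
# Minimal sketch of a low-latency sequential punctuation tagger: a
# unidirectional LSTM whose prediction for word t is emitted only after
# `lookahead` further words have been read. Sizes and labels are assumptions.
import torch
import torch.nn as nn

PUNCT_LABELS = ["<none>", ",", ".", "?"]  # assumed label inventory


class LowLatencyPunctuator(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden=256, lookahead=2):
        super().__init__()
        self.lookahead = lookahead              # fixed future context, in words
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden, batch_first=True)
        self.out = nn.Linear(hidden, len(PUNCT_LABELS))

    def forward(self, word_ids):
        # word_ids: (batch, seq_len) integer word indices
        hidden_states, _ = self.rnn(self.emb(word_ids))
        logits = self.out(hidden_states)
        # After slicing, logits[:, t] is the prediction for the punctuation slot
        # following word t, computed from words 0 .. t + lookahead only.
        return logits[:, self.lookahead:, :]


# Toy usage on a 10-word caption segment: predictions exist for the first
# 10 - lookahead = 8 positions until further words arrive.
model = LowLatencyPunctuator(vocab_size=5000)
dummy = torch.randint(0, 5000, (1, 10))
print(model(dummy).shape)  # torch.Size([1, 8, 4])
```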