% Publication records keyed as MTMT:<unique-id>; each record's url field points
% at the corresponding m2.mtmt.hu API resource.
% The file is UTF-8 encoded (accented names are stored verbatim, not as LaTeX escapes).
% Page ranges use the double-hyphen form; words that must keep their case in
% sentence-casing styles are brace-protected in the title fields.

@inproceedings{MTMT:32644299,
  author    = {Ligeti-Nagy, Noémi and Ferenczi, Gergő and Héja, Enikő and Jelencsik-Mátyus, Kinga and Laki, László János and Vadász, Noémi and Yang, Zijian Győző and Váradi, Tamás},
  title     = {{HuLU}: magyar nyelvű benchmark adatbázis kiépítése a neurális nyelvmodellek kiértékelése céljából},
  booktitle = {XVIII. Magyar Számítógépes Nyelvészeti Konferencia : MSZNY 2022},
  year      = {2022},
  pages     = {431--446},
  url       = {https://m2.mtmt.hu/api/publication/32644299},
  unique-id = {32644299},
}

% The percent signs in this abstract are escaped so that the field survives
% being passed through LaTeX.
@inproceedings{MTMT:31436461,
  author    = {Yang, Zijian Győző and Novák, Attila and Laki, László János},
  title     = {Automatic Tag Recommendation for News Articles},
  booktitle = {Proceedings of the 11th International Conference on Applied Informatics (ICAI 2020)},
  year      = {2020},
  pages     = {442--451},
  url       = {https://m2.mtmt.hu/api/publication/31436461},
  abstract  = {In this paper, we present an automatic neural tag recommendation system for Hungarian news articles and the results of our experiments concerning the effect of preprocessing applied to the texts and various parameter settings.
               A novelty of the approach is a combination of subword tokenization with character-n-gram-based representations, which resulted in high gains in recall.
               The best system yields 76\% precision at 58\% recall.
               Subjective performance is higher, because suggested labels missing from the reference often fit the document well or are similar to missing reference labels.
               We also created an online GUI for the tag recommendation system that makes it possible for the user to interactively set threshold parameters facilitating customization of precision and recall.
               Copyright © 2020 for this paper by its authors.
               Use permitted under Creative Commons License Attribution 4.0 International (CC BY 4.0).},
  unique-id = {31436461},
}

@article{MTMT:2844131,
  author      = {Novák, Borbála and Novák, Attila and Prószéky, Gábor},
  title       = {Context-aware correction of spelling errors in {Hungarian} medical documents},
  journal     = {Computer Speech and Language},
  journal-iso = {COMPUT SPEECH LANG},
  volume      = {35},
  year        = {2016},
  pages       = {219--233},
  doi         = {10.1016/j.csl.2014.09.001},
  issn        = {0885-2308},
  eissn       = {1095-8363},
  url         = {https://m2.mtmt.hu/api/publication/2844131},
  keywords    = {agglutinating languages; spelling correction; medical text processing},
  abstract    = {Owing to the growing need of acquiring medical data from clinical records, processing such documents is an important topic in natural language processing (NLP).
                 However, for general NLP methods to work, a proper, normalized input is required.
                 Otherwise the system is overwhelmed by the unusually high amount of noise generally characteristic of this kind of text.
                 The different types of this noise originate from non-standard language use: short fragments instead of proper sentences, usage of Latin words, many acronyms and very frequent misspellings.
                 In this paper, a method is described for the automated correction of spelling errors in Hungarian clinical records.
                 First, a word-based algorithm was implemented to generate a ranked list of correction candidates for word forms regarded as incorrect.
                 Second, the problem of spelling correction was modelled as a translation task, where the source language is the erroneous text and the target language is the corrected one.
                 A Statistical Machine Translation (SMT) decoder performed the task of error correction.
                 Since no orthographically correct proofread text from this domain is available, we could not use such a corpus for training the system.
                 Instead, the word-based system was used to create translation models.
                 In addition, a 3-gram token-based language model was used to model lexical context.
                 Due to the high number of abbreviations and acronyms in the texts, the behaviour of these abbreviated forms was further examined both in the case of the context-unaware word-based and the SMT-decoder-based implementations.
                 The results show that the SMT-based method outperforms the first candidate accuracy of the word-based ranking system.
                 However, the normalization of abbreviations should be handled as a separate task.},
  unique-id   = {2844131},
}

@inproceedings{MTMT:2811567,
  author        = {Miháltz, Márton and Indig, Balázs and Prószéky, Gábor},
  title         = {Igei vonzatkeretek és tematikus szerepek felismerése nyelvi erőforrások összekapcsolásával egy kereslet-kínálat elvű szövegelemzőben},
  booktitle     = {XI. Magyar Számítógépes Nyelvészeti Konferencia : MSZNY 2015},
  year          = {2015},
  pages         = {298--302},
  url           = {https://m2.mtmt.hu/api/publication/2811567},
  unique-id     = {2811567},
  orcid-numbers = {Indig, Balázs/0000-0001-8090-3661},
}

% NOTE(review): this record had no entry type in the source. The inproceedings
% type was chosen because the record carries booktitle, pages and a chapter DOI;
% confirm against the MTMT record (incollection is the alternative).
@inproceedings{MTMT:2847504,
  author    = {Orosz, György and Jelencsik-Mátyus, Kinga},
  title     = {An {MLU} estimation method for {Hungarian} transcripts},
  booktitle = {Text, Speech, and Dialogue},
  year      = {2014},
  pages     = {173--180},
  doi       = {10.1007/978-3-319-10816-2_22},
  url       = {https://m2.mtmt.hu/api/publication/2847504},
  abstract  = {Mean length of utterance (MLU) is an important indicator for measuring complexity in child language.
               A generally employed method for calculating MLU is to use the CLAN toolkit, which includes modules that enable the measurement of utterance length in morphemes.
               However, these methods are based on rules which are only available for just a few languages not involving Hungarian.
               Therefore, in order to automatically analyze and measure Hungarian transcripts adequate methods need to be developed.
               In this paper we describe a new toolkit which is able to estimate MLU counts (in morphemes) while providing morphosyntactic tagging as well.
               Its components are based on existing resources; however, many of them were adapted to the language of the transcripts.
               The tool-chain performs the annotation task with a high precision and its MLU estimates are correlated with that of human experts.},
  unique-id = {2847504},
}

@inproceedings{MTMT:2847502,
  author    = {Orosz, György and Prószéky, Gábor},
  title     = {Hol a határ? {Mondatok}, szavak, klinikák},
  booktitle = {X. Magyar Számítógépes Nyelvészeti Konferencia : MSZNY 2014},
  year      = {2014},
  pages     = {177--187},
  url       = {https://m2.mtmt.hu/api/publication/2847502},
  unique-id = {2847502},
}

% NOTE(review): this record had no entry type in the source. The inproceedings
% type was chosen because the record carries booktitle, pages and a chapter DOI;
% confirm against the MTMT record (incollection is the alternative).
@inproceedings{MTMT:2843822,
  author    = {Novák, Borbála and Novák, Attila},
  title     = {Identifying and clustering relevant terms in clinical records using unsupervised methods},
  booktitle = {Statistical Language and Speech Processing},
  year      = {2014},
  pages     = {233--243},
  doi       = {10.1007/978-3-319-11397-5_18},
  url       = {https://m2.mtmt.hu/api/publication/2843822},
  unique-id = {2843822},
}

@inproceedings{MTMT:2843807,
  author    = {Novák, Attila},
  title     = {A New Form of {Humor} – Mapping Constraint-Based Computational Morphologies to a Finite-State Representation},
  booktitle = {{LREC} 2014 - Ninth International Conference on Language Resources and Evaluation},
  year      = {2014},
  pages     = {1068--1073},
  url       = {https://m2.mtmt.hu/api/publication/2843807},
  unique-id = {2843807},
}

@inproceedings{MTMT:2824578,
  author    = {Laki, László János and Orosz, György},
  title     = {An Efficient Language Independent Toolkit for Complete Morphological Disambiguation},
  booktitle = {{LREC} 2014 - Ninth International Conference on Language Resources and Evaluation},
  year      = {2014},
  pages     = {1625--1630},
  url       = {https://m2.mtmt.hu/api/publication/2824578},
  unique-id = {2824578},
}

% NOTE(review): stored as incollection because the record has a chapter title
% plus a booktitle, which the classic inbook type does not print. No publisher
% is recorded, so styles will warn about the missing field; add it once known.
@incollection{MTMT:2797769,
  author    = {Prószéky, Gábor},
  title     = {A számítógépes nyelvészet hatása a nyelvleírásra},
  booktitle = {Elmélet és módszer},
  year      = {2014},
  pages     = {315--322},
  url       = {https://m2.mtmt.hu/api/publication/2797769},
  unique-id = {2797769},
}