% Bibliography records keyed by MTMT publication id; each url field points at
% the matching m2.mtmt.hu API record.
% The fields journal-iso, unique-id, eissn and orcid-numbers are not standard
% BibTeX fields; standard styles ignore them, so they are kept as metadata.
% Page ranges use the double hyphen, and acronyms / proper nouns in titles are
% brace-protected so that sentence-casing styles do not lowercase them.

@article{MTMT:34679063,
  title         = {Utility function-based generalization of sum of ranking differences–country-wise analysis of greenhouse gas emissions},
  url           = {https://m2.mtmt.hu/api/publication/34679063},
  author        = {Ipkovich, Ádám and Héberger, Károly and Sebestyén, Viktor and Abonyi, János},
  doi           = {10.1016/j.ecolind.2024.111734},
  journal-iso   = {ECOL INDIC},
  journal       = {ECOLOGICAL INDICATORS},
  volume        = {160},
  unique-id     = {34679063},
  issn          = {1470-160X},
  year          = {2024},
  eissn         = {1872-7034},
  orcid-numbers = {Ipkovich, Ádám/0000-0003-0617-1831; Sebestyén, Viktor/0000-0003-0670-9195; Abonyi, János/0000-0001-8593-1493}
}

@article{MTMT:34517630,
  title         = {Frequent Errors in Modeling by Machine Learning: A Prototype Case of Predicting the Timely Evolution of {COVID-19} Pandemic},
  url           = {https://m2.mtmt.hu/api/publication/34517630},
  author        = {Héberger, Károly},
  doi           = {10.3390/a17010043},
  journal-iso   = {ALGORITHMS},
  journal       = {ALGORITHMS},
  volume        = {17},
  unique-id     = {34517630},
  abstract      = {Background: The development and application of machine learning (ML) methods have become so fast that almost nobody can follow their developments in every detail. It is no wonder that numerous errors and inconsistencies in their usage have also spread with a similar speed independently from the tasks: regression and classification. This work summarizes frequent errors committed by certain authors with the aim of helping scientists to avoid them. Methods: The principle of parsimony governs the train of thought. Fair method comparison can be completed with multicriteria decision-making techniques, preferably by the sum of ranking differences (SRD). Its coupling with analysis of variance (ANOVA) decomposes the effects of several factors. Earlier findings are summarized in a review-like manner: the abuse of the correlation coefficient and proper practices for model discrimination are also outlined. Results: Using an illustrative example, the correct practice and the methodology are summarized as guidelines for model discrimination, and for minimizing the prediction errors. The following factors are all prerequisites for successful modeling: proper data preprocessing, statistical tests, suitable performance parameters, appropriate degrees of freedom, fair comparison of models, and outlier detection, just to name a few. A checklist is provided in a tutorial manner on how to present ML modeling properly. The advocated practices are reviewed shortly in the discussion. Conclusions: Many of the errors can easily be filtered out with careful reviewing. Every authors’ responsibility is to adhere to the rules of modeling and validation. A representative sampling of recent literature outlines correct practices and emphasizes that no error-free publication exists.},
  year          = {2024},
  eissn         = {1999-4893}
}

@article{MTMT:33897785,
  title         = {Selection of optimal validation methods for quantitative structure–activity relationships and applicability domain},
  url           = {https://m2.mtmt.hu/api/publication/33897785},
  author        = {Héberger, Károly},
  doi           = {10.1080/1062936X.2023.2214871},
  journal-iso   = {SAR QSAR ENVIRON RES},
  journal       = {SAR AND QSAR IN ENVIRONMENTAL RESEARCH},
  volume        = {34},
  unique-id     = {33897785},
  issn          = {1062-936X},
  abstract      = {This brief literature survey groups the (numerical) validation methods and emphasizes the contradictions and confusion considering bias, variance and predictive performance. A multicriteria decision-making analysis has been made using the sum of absolute ranking differences (SRD), illustrated with five case studies (seven examples). SRD was applied to compare external and cross-validation techniques, indicators of predictive performance, and to select optimal methods to determine the applicability domain (AD). The ordering of model validation methods was in accordance with the sayings of original authors, but they are contradictory within each other, suggesting that any variant of cross-validation can be superior or inferior to other variants depending on the algorithm, data structure and circumstances applied. A simple fivefold cross-validation proved to be superior to the Bayesian Information Criterion in the vast majority of situations. It is simply not sufficient to test a numerical validation method in one situation only, even if it is a well defined one. SRD as a preferable multicriteria decision-making algorithm is suitable for tailoring the techniques for validation, and for the optimal determination of the applicability domain according to the dataset in question.},
  year          = {2023},
  eissn         = {1029-046X},
  pages         = {415--434}
}

@article{MTMT:33753835,
  title         = {Matrix factorization-based multi-objective ranking–What makes a good university?},
  url           = {https://m2.mtmt.hu/api/publication/33753835},
  author        = {Abonyi, János and Ipkovich, Ádám and Dörgő, Gyula and Héberger, Károly},
  doi           = {10.1371/journal.pone.0284078},
  journal-iso   = {PLOS ONE},
  journal       = {PLOS ONE},
  volume        = {18},
  unique-id     = {33753835},
  issn          = {1932-6203},
  abstract      = {Non-negative matrix factorization (NMF) efficiently reduces high dimensionality for many -objective ranking problems. In multi-objective optimization, as long as only three or four conflicting viewpoints are present, an optimal solution can be determined by finding the Pareto front. When the number of the objectives increases, the multi-objective problem evolves into a many -objective optimization task, where the Pareto front becomes oversaturated. The key idea is that NMF aggregates the objectives so that the Pareto front can be applied, while the Sum of Ranking Differences (SRD) method selects the objectives that have a detrimental effect on the aggregation, and validates the findings. The applicability of the method is illustrated by the ranking of 1176 universities based on 46 variables of the CWTS Leiden Ranking 2020 database. The performance of NMF is compared to principal component analysis (PCA) and sparse non-negative matrix factorization-based solutions. The results illustrate that PCA incorporates negatively correlated objectives into the same principal component. On the contrary, NMF only allows non-negative correlations, which enable the proper use of the Pareto front. With the combination of NMF and SRD, a non-biased ranking of the universities based on 46 criteria is established, where Harvard, Rockefeller and Stanford Universities are determined as the first three. To evaluate the ranking capabilities of the methods, measures based on Relative Entropy (RE) and Hypervolume (HV) are proposed. The results confirm that the sparse NMF method provides the most informative ranking. The results highlight that academic excellence can be improved by decreasing the proportion of unknown open-access publications and short distance collaborations. The proportion of gender indicators barely correlate with scientific impact. More authors, long-distance collaborations, publications that have more scientific impact and citations on average highly influence the university ranking in a positive direction.},
  year          = {2023},
  eissn         = {1932-6203},
  orcid-numbers = {Abonyi, János/0000-0001-8593-1493; Ipkovich, Ádám/0000-0003-0617-1831}
}

@article{MTMT:32992489,
  title         = {Molecular Dynamics Simulations and Diversity Selection by Extended Continuous Similarity Indices},
  url           = {https://m2.mtmt.hu/api/publication/32992489},
  author        = {Rácz, Anita and Mihalovits, Levente Márk and Bajusz, Dávid and Héberger, Károly and Miranda-Quintana, Ramón Alain},
  doi           = {10.1021/acs.jcim.2c00433},
  journal-iso   = {J CHEM INF MODEL},
  journal       = {JOURNAL OF CHEMICAL INFORMATION AND MODELING},
  volume        = {62},
  unique-id     = {32992489},
  issn          = {1549-9596},
  year          = {2022},
  eissn         = {1549-960X},
  pages         = {3415--3425},
  orcid-numbers = {Mihalovits, Levente Márk/0000-0003-1022-3294; Bajusz, Dávid/0000-0003-4277-9481; Miranda-Quintana, Ramón Alain/0000-0003-2121-4449}
}

@article{MTMT:32898327,
  title         = {Multiobject Optimization of {National Football League} Drafts: Comparison of Teams and Experts},
  url           = {https://m2.mtmt.hu/api/publication/32898327},
  author        = {Gere, Attila and Szakál, Dorina and Héberger, Károly},
  doi           = {10.3390/app12136303},
  journal-iso   = {APPL SCI-BASEL},
  journal       = {APPLIED SCIENCES-BASEL},
  volume        = {12},
  unique-id     = {32898327},
  year          = {2022},
  eissn         = {2076-3417},
  orcid-numbers = {Gere, Attila/0000-0003-3075-1561}
}

@article{MTMT:32867980,
  title         = {Comparison of Descriptor- and Fingerprint Sets in Machine Learning Models for {ADME-Tox} Targets},
  url           = {https://m2.mtmt.hu/api/publication/32867980},
  author        = {Orosz, Álmos and Héberger, Károly and Rácz, Anita},
  doi           = {10.3389/fchem.2022.852893},
  journal-iso   = {FRONT CHEM},
  journal       = {FRONTIERS IN CHEMISTRY},
  volume        = {10},
  unique-id     = {32867980},
  issn          = {2296-2646},
  year          = {2022},
  eissn         = {2296-2646}
}

@article{MTMT:32741735,
  title         = {Extended continuous similarity indices: theory and application for {QSAR} descriptor selection},
  url           = {https://m2.mtmt.hu/api/publication/32741735},
  author        = {Rácz, Anita and Dunn, Timothy B. and Bajusz, Dávid and Kim, Taewon D. and Miranda-Quintana, Ramón Alain and Héberger, Károly},
  doi           = {10.1007/s10822-022-00444-7},
  journal-iso   = {J COMPUT AID MOL DES},
  journal       = {JOURNAL OF COMPUTER-AIDED MOLECULAR DESIGN},
  volume        = {36},
  unique-id     = {32741735},
  issn          = {0920-654X},
  year          = {2022},
  eissn         = {1573-4951},
  pages         = {157--173},
  orcid-numbers = {Bajusz, Dávid/0000-0003-4277-9481; Miranda-Quintana, Ramón Alain/0000-0003-2121-4449}
}

@article{MTMT:32710274,
  title         = {Factor analysis, sparse {PCA}, and Sum of Ranking Differences-based improvements of the {Promethee-GAIA} multicriteria decision support technique},
  url           = {https://m2.mtmt.hu/api/publication/32710274},
  author        = {Abonyi, János and Czvetkó, Tímea and Kosztyán, Zsolt Tibor and Héberger, Károly},
  doi           = {10.1371/journal.pone.0264277},
  journal-iso   = {PLOS ONE},
  journal       = {PLOS ONE},
  volume        = {17},
  unique-id     = {32710274},
  issn          = {1932-6203},
  year          = {2022},
  eissn         = {1932-6203},
  orcid-numbers = {Abonyi, János/0000-0001-8593-1493; Kosztyán, Zsolt Tibor/0000-0001-7345-8336}
}

@misc{MTMT:32732679,
  title         = {Testing Rankings with Cross-Validation},
  url           = {https://m2.mtmt.hu/api/publication/32732679},
  author        = {Sziklai, Balázs and Baranyi, Máté and Héberger, Károly},
  unique-id     = {32732679},
  year          = {2021},
  orcid-numbers = {Baranyi, Máté/0000-0003-4415-4805}
}