@inproceedings{rep-etal-2024-electras, title = "Are {ELECTRA}`s Sentence Embeddings Beyond Repair? The Case of Semantic Textual Similarity", author = "Rep, Ivan and Duki{\'c}, David and {\v{S}}najder, Jan", editor = "Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung", booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024", month = nov, year = "2024", address = "Miami, Florida, USA", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.findings-emnlp.535/", doi = "10.18653/v1/2024.findings-emnlp.535", pages = "9159--9169", abstract = "While BERT produces high-quality sentence embeddings, its pre-training computational cost is a significant drawback. In contrast, ELECTRA provides a cost-effective pre-training objective and downstream task performance improvements, but worse sentence embeddings. The community tacitly stopped utilizing ELECTRA`s sentence embeddings for semantic textual similarity (STS). We notice a significant drop in performance for the ELECTRA discriminator`s last layer in comparison to prior layers. We explore this drop and propose a way to repair the embeddings using a novel truncated model fine-tuning (TMFT) method. TMFT improves the Spearman correlation coefficient by over 8 points while increasing parameter efficiency on the STS Benchmark. We extend our analysis to various model sizes, languages, and two other tasks. Further, we discover the surprising efficacy of ELECTRA`s generator model, which performs on par with BERT, using significantly fewer parameters and a substantially smaller embedding size. Finally, we observe boosts by combining TMFT with word similarity or domain adaptive pre-training." }