@misc{frieder_mathematical_2023,
    title = {Mathematical {Capabilities} of {ChatGPT}},
    url = {http://arxiv.org/abs/2301.13867},
    abstract = {We investigate the mathematical capabilities of ChatGPT by testing it on publicly available datasets, as well as hand-crafted ones, and measuring its performance against other models trained on a mathematical corpus, such as Minerva. We also test whether ChatGPT can be a useful assistant to professional mathematicians by emulating various use cases that come up in the daily professional activities of mathematicians (question answering, theorem searching). In contrast to formal mathematics, where large databases of formal proofs are available (e.g., the Lean Mathematical Library), current datasets of natural-language mathematics, used to benchmark language models, only cover elementary mathematics. We address this issue by introducing a new dataset: GHOSTS. It is the first natural-language dataset made and curated by working researchers in mathematics that (1) aims to cover graduate-level mathematics and (2) provides a holistic overview of the mathematical capabilities of language models. We benchmark ChatGPT on GHOSTS and evaluate performance against fine-grained criteria. We make this new dataset publicly available to assist a community-driven comparison of ChatGPT with (future) large language models in terms of advanced mathematical comprehension. We conclude that contrary to many positive reports in the media (a potential case of selection bias), ChatGPT's mathematical abilities are significantly below those of an average mathematics graduate student. Our results show that ChatGPT often understands the question but fails to provide correct solutions. Hence, if your goal is to use it to pass a university exam, you would be better off copying from your average peer!},
    urldate = {2023-04-21},
    publisher = {arXiv},
    author = {Frieder, Simon and Pinchetti, Luca and Griffiths, Ryan-Rhys and Salvatori, Tommaso and Lukasiewicz, Thomas and Petersen, Philipp Christian and Chevalier, Alexis and Berner, Julius},
    month = jan,
    year = {2023},
    note = {arXiv:2301.13867 [cs]},
    keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Computation and Language},
    annote = {Comment: The GHOSTS dataset will be available at https://github.com/friederrr/science-GHOSTS},
}
@misc{gudibande_false_2023,
    title = {The {False} {Promise} of {Imitating} {Proprietary} {LLMs}},
    url = {http://arxiv.org/abs/2305.15717},
    abstract = {An emerging method to cheaply improve a weaker language model is to finetune it on outputs from a stronger model, such as a proprietary system like ChatGPT (e.g., Alpaca, Self-Instruct, and others). This approach looks to cheaply imitate the proprietary model's capabilities using a weaker open-source model. In this work, we critically analyze this approach. We first finetune a series of LMs that imitate ChatGPT using varying base model sizes (1.5B--13B), data sources, and imitation data amounts (0.3M--150M tokens). We then evaluate the models using crowd raters and canonical NLP benchmarks. Initially, we were surprised by the output quality of our imitation models -- they appear far better at following instructions, and crowd workers rate their outputs as competitive with ChatGPT. However, when conducting more targeted automatic evaluations, we find that imitation models close little to none of the gap from the base LM to ChatGPT on tasks that are not heavily supported in the imitation data. We show that these performance discrepancies may slip past human raters because imitation models are adept at mimicking ChatGPT's style but not its factuality. Overall, we conclude that model imitation is a false promise: there exists a substantial capabilities gap between open and closed LMs that, with current methods, can only be bridged using an unwieldy amount of imitation data or by using more capable base LMs. In turn, we argue that the highest leverage action for improving open-source models is to tackle the difficult challenge of developing better base LMs, rather than taking the shortcut of imitating proprietary systems.},
    urldate = {2023-06-14},
    publisher = {arXiv},
    author = {Gudibande, Arnav and Wallace, Eric and Snell, Charlie and Geng, Xinyang and Liu, Hao and Abbeel, Pieter and Levine, Sergey and Song, Dawn},
    month = may,
    year = {2023},
    note = {arXiv:2305.15717 [cs]},
    keywords = {Computer Science - Computation and Language},
}