The official code for the paper: Beyond Surface Structure: A Causal Assessment of LLMs' Comprehension Ability (ICLR'25).
OS: Ubuntu 22.04.2 LTS
python=3.10.14
torch=2.4.1
transformers=4.43.4
llama-recipes=0.0.3
datasets=2.21.0
accelerate=0.34.2
evaluate=0.4.3
cd finetune
# civil comments
python finetune_civil.py --model_name llama-3-8b --level 0.3
# analytic entailment
python finetune_entail.py
To test closed-source models, run the commands below:
cd dce_calculation
# civil comments
bash agent_main.sh --dataset_name 2_digit_multiplication --prompt superhigh --interv_type mask --num_mask 2 --mask_fix_position 0
# 2_digit_multiplication
bash agent_main.sh --dataset_name 2_digit_multiplication --prompt superhigh --interv_type mask --num_mask 2 --mask_fix_position 0
# analytic_entailment
bash agent_main.sh --dataset_name analytic_entailment --interv_type rephrase --prompt superhigh --mask_fix_position 0 --num_mask 2
# GSM8k
bash agent_main.sh --dataset_name GSM8k --prompt superhigh --interv_type mask --mask_fix_position 0 --num_mask 2
# word_unscrambling
bash agent_main.sh --dataset_name word_unscrambling --mask_fix_position 2 --num_mask 1 --interv_type mask --prompt superhigh
# CommonsenseQA
bash agent_main.sh --dataset_name commonsenseqa --prompt csuperhigh --interv_type rephrase --num_mask 2 --mask_fix_position 0
To test open-source models, run the commands below:
cd dce_calculation
# civil comments
bash white_main.sh --dataset_name 2_digit_multiplication --prompt superhigh --interv_type mask --num_mask 2 --mask_fix_position 0
# 2_digit_multiplication
bash white_main.sh --dataset_name 2_digit_multiplication --prompt superhigh --interv_type mask --num_mask 2 --mask_fix_position 0
# analytic_entailment
bash white_main.sh --dataset_name analytic_entailment --interv_type rephrase --prompt superhigh --num_mask 2 --mask_fix_position 0
# GSM8k
bash white_main.sh --dataset_name GSM8k --prompt superhigh --interv_type mask --num_mask 2 --mask_fix_position 0
# word_unscrambling
bash white_main.sh --dataset_name word_unscrambling --mask_fix_position 2 --num_mask 1 --interv_type mask --prompt superhigh
# CommonsenseQA
bash white_main.sh --dataset_name commonsenseqa --prompt csuperhigh --interv_type rephrase --num_mask 2 --mask_fix_position 0
cd intervention_rephrase
python generate_intervention_commonsenseqa.py
Please cite our paper if this repository inspires your work.
@misc{han2024surfacestructurecausalassessment,
title={Beyond Surface Structure: A Causal Assessment of LLMs' Comprehension Ability},
author={Yujin Han and Lei Xu and Sirui Chen and Difan Zou and Chaochao Lu},
year={2024},
eprint={2411.19456},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2411.19456},
}