Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

2025-04-28-localization #163

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
454 changes: 454 additions & 0 deletions _posts/2025-04-28-localization.md

Large diffs are not rendered by default.

242 changes: 242 additions & 0 deletions assets/bibliography/2025-04-28-localization.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
@article{li2024inference,
  author  = {Li, Kenneth and Patel, Oam and Vi{\'e}gas, Fernanda and Pfister, Hanspeter and Wattenberg, Martin},
  title   = {Inference-time intervention: Eliciting truthful answers from a language model},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  year    = {2024}
}

@article{lin2021truthfulqa,
  author  = {Lin, Stephanie and Hilton, Jacob and Evans, Owain},
  title   = {{TruthfulQA}: Measuring how models mimic human falsehoods},
  journal = {arXiv preprint arXiv:2109.07958},
  year    = {2021}
}

@article{hase2024does,
  author  = {Hase, Peter and Bansal, Mohit and Kim, Been and Ghandeharioun, Asma},
  title   = {Does localization inform editing? {Surprising} differences in causality-based localization vs. knowledge editing in language models},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  year    = {2024}
}

@misc{taori2023alpaca,
  author       = {Taori, Rohan and Gulrajani, Ishaan and Zhang, Tianyi and Dubois, Yann and Li, Xuechen and Guestrin, Carlos and Liang, Percy and Hashimoto, Tatsunori B},
  title        = {Alpaca: A strong, replicable instruction-following model},
  howpublished = {Stanford Center for Research on Foundation Models},
  url          = {https://crfm.stanford.edu/2023/03/13/alpaca.html},
  year         = {2023}
}



@inproceedings{azar2024general,
  author       = {Azar, Mohammad Gheshlaghi and Guo, Zhaohan Daniel and Piot, Bilal and Munos, R{\'e}mi and Rowland, Mark and Valko, Michal and Calandriello, Daniele},
  title        = {A general theoretical paradigm to understand learning from human preferences},
  booktitle    = {International Conference on Artificial Intelligence and Statistics},
  pages        = {4447--4455},
  year         = {2024},
  organization = {PMLR}
}

@article{hu2021lora,
  author  = {Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  title   = {{LoRA}: Low-rank adaptation of large language models},
  journal = {arXiv preprint arXiv:2106.09685},
  year    = {2021}
}

## representation engineering
@article{zou2023representation,
  author  = {Zou, Andy and Phan, Long and Chen, Sarah and Campbell, James and Guo, Phillip and Ren, Richard and Pan, Alexander and Yin, Xuwang and Mazeika, Mantas and Dombrowski, Ann-Kathrin and others},
  title   = {Representation engineering: A top-down approach to {AI} transparency},
  journal = {arXiv preprint arXiv:2310.01405},
  year    = {2023}
}


@article{arditi2024refusal,
  author  = {Arditi, Andy and Obeso, Oscar and Syed, Aaquib and Paleka, Daniel and Rimsky, Nina and Gurnee, Wes and Nanda, Neel},
  title   = {Refusal in Language Models Is Mediated by a Single Direction},
  journal = {arXiv preprint arXiv:2406.11717},
  year    = {2024}
}

@article{wang2023backdoor,
  author  = {Wang, Haoran and Shu, Kai},
  title   = {Backdoor activation attack: Attack large language models using activation steering for safety-alignment},
  journal = {arXiv preprint arXiv:2311.09433},
  year    = {2023}
}

@inproceedings{chen2024truth,
  author    = {Chen, Zhongzhi and Sun, Xingwu and Jiao, Xianfeng and Lian, Fengzong and Kang, Zhanhui and Wang, Di and Xu, Chengzhong},
  title     = {Truth forest: Toward multi-scale truthfulness in large language models through intervention without tuning},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {38},
  number    = {19},
  pages     = {20967--20974},
  year      = {2024}
}

@article{wei2024assessing,
  author  = {Wei, Boyi and Huang, Kaixuan and Huang, Yangsibo and Xie, Tinghao and Qi, Xiangyu and Xia, Mengzhou and Mittal, Prateek and Wang, Mengdi and Henderson, Peter},
  title   = {Assessing the brittleness of safety alignment via pruning and low-rank modifications},
  journal = {arXiv preprint arXiv:2402.05162},
  year    = {2024}
}

## mechanistic interpretability
@article{meng2022locating,
  author  = {Meng, Kevin and Bau, David and Andonian, Alex and Belinkov, Yonatan},
  title   = {Locating and editing factual associations in {GPT}},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {35},
  pages   = {17359--17372},
  year    = {2022}
}

@article{vig2020causal,
  author  = {Vig, Jesse and Gehrmann, Sebastian and Belinkov, Yonatan and Qian, Sharon and Nevo, Daniel and Sakenis, Simas and Huang, Jason and Singer, Yaron and Shieber, Stuart},
  title   = {Causal mediation analysis for interpreting neural {NLP}: The case of gender bias},
  journal = {arXiv preprint arXiv:2004.12265},
  year    = {2020}
}

@article{geiger2021causal,
  author  = {Geiger, Atticus and Lu, Hanson and Icard, Thomas and Potts, Christopher},
  title   = {Causal abstractions of neural networks},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {34},
  pages   = {9574--9586},
  year    = {2021}
}


@article{soulos2019discovering,
  author  = {Soulos, Paul and McCoy, Tom and Linzen, Tal and Smolensky, Paul},
  title   = {Discovering the compositional structure of vector representations with role learning networks},
  journal = {arXiv preprint arXiv:1910.09113},
  year    = {2019}
}

@article{finlayson2021causal,
  author  = {Finlayson, Matthew and Mueller, Aaron and Gehrmann, Sebastian and Shieber, Stuart and Linzen, Tal and Belinkov, Yonatan},
  title   = {Causal analysis of syntactic agreement mechanisms in neural language models},
  journal = {arXiv preprint arXiv:2106.06087},
  year    = {2021}
}



@article{wang2022interpretability,
  author  = {Wang, Kevin and Variengien, Alexandre and Conmy, Arthur and Shlegeris, Buck and Steinhardt, Jacob},
  title   = {Interpretability in the wild: a circuit for indirect object identification in {GPT-2} small},
  journal = {arXiv preprint arXiv:2211.00593},
  year    = {2022}
}

@inproceedings{chan2022causal,
  author    = {Chan, Lawrence and Garriga-Alonso, Adri{\`a} and Goldowsky-Dill, Nicholas and Greenblatt, Ryan and Nitishinskaya, Jenny and Radhakrishnan, Ansh and Shlegeris, Buck and Thomas, Nate},
  title     = {Causal scrubbing: A method for rigorously testing interpretability hypotheses},
  booktitle = {AI Alignment Forum},
  pages     = {1828--1843},
  year      = {2022}
}


@article{hanna2024does,
  author  = {Hanna, Michael and Liu, Ollie and Variengien, Alexandre},
  title   = {How does {GPT-2} compute greater-than?: Interpreting mathematical abilities in a pre-trained language model},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  year    = {2024}
}

@article{conmy2023towards,
  author  = {Conmy, Arthur and Mavor-Parker, Augustine and Lynch, Aengus and Heimersheim, Stefan and Garriga-Alonso, Adri{\`a}},
  title   = {Towards automated circuit discovery for mechanistic interpretability},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {16318--16352},
  year    = {2023}
}


@article{todd2023function,
  author  = {Todd, Eric and Li, Millicent L and Sharma, Arnab Sen and Mueller, Aaron and Wallace, Byron C and Bau, David},
  title   = {Function vectors in large language models},
  journal = {arXiv preprint arXiv:2310.15213},
  year    = {2023}
}


@article{hendel2023context,
  author  = {Hendel, Roee and Geva, Mor and Globerson, Amir},
  title   = {In-context learning creates task vectors},
  journal = {arXiv preprint arXiv:2310.15916},
  year    = {2023}
}


## Truthfulness
@article{joshi2023personas,
  author  = {Joshi, Nitish and Rando, Javier and Saparov, Abulhair and Kim, Najoung and He, He},
  title   = {Personas as a way to model truthfulness in language models},
  journal = {arXiv preprint arXiv:2310.18168},
  year    = {2023}
}

@article{wang2020language,
  author  = {Wang, Chenguang and Liu, Xiao and Song, Dawn},
  title   = {Language models are open knowledge graphs},
  journal = {arXiv preprint arXiv:2010.11967},
  year    = {2020}
}

@article{kadavath2022language,
  author  = {Kadavath, Saurav and Conerly, Tom and Askell, Amanda and Henighan, Tom and Drain, Dawn and Perez, Ethan and Schiefer, Nicholas and Hatfield-Dodds, Zac and DasSarma, Nova and Tran-Johnson, Eli and others},
  title   = {Language models (mostly) know what they know},
  journal = {arXiv preprint arXiv:2207.05221},
  year    = {2022}
}

@article{saunders2022self,
  author  = {Saunders, William and Yeh, Catherine and Wu, Jeff and Bills, Steven and Ouyang, Long and Ward, Jonathan and Leike, Jan},
  title   = {Self-critiquing models for assisting human evaluators},
  journal = {arXiv preprint arXiv:2206.05802},
  year    = {2022}
}

@article{burns2022discovering,
  author  = {Burns, Collin and Ye, Haotian and Klein, Dan and Steinhardt, Jacob},
  title   = {Discovering latent knowledge in language models without supervision},
  journal = {arXiv preprint arXiv:2212.03827},
  year    = {2022}
}

@misc{openai2020api,
  author = {{OpenAI}},
  title  = {OpenAI API},
  url    = {https://openai.com/blog/openai-api/},
  note   = {Accessed: 2021-08-19},
  year   = {2020}
}


@inproceedings{makelov2023subspace,
  author    = {Makelov, Aleksandar and Lange, Georg and Geiger, Atticus and Nanda, Neel},
  title     = {Is this the subspace you are looking for? An interpretability illusion for subspace activation patching},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year      = {2024}
}

@article{niu2024does,
  author  = {Niu, Jingcheng and Liu, Andrew and Zhu, Zining and Penn, Gerald},
  title   = {What does the Knowledge Neuron Thesis Have to do with Knowledge?},
  journal = {arXiv preprint arXiv:2405.02421},
  year    = {2024}
}
Binary file added assets/img/2025-04-28-localization/hist_ipo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/img/2025-04-28-localization/hist_ipo_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/img/2025-04-28-localization/hist_iti.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/img/2025-04-28-localization/iti_kl_mc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading