iclr-blogposts · zihao12 · Nov 23, 2024 · Nov 23, 2024
diff --git a/_posts/2025-04-28-localization.md b/_posts/2025-04-28-localization.md
diff --git a/assets/bibliography/2025-04-28-localization.bib b/assets/bibliography/2025-04-28-localization.bib
@@ -0,0 +1,242 @@
+@article{li2024inference,
+  author  = {Li, Kenneth and Patel, Oam and Vi{\'e}gas, Fernanda and Pfister, Hanspeter and Wattenberg, Martin},
+  title   = {Inference-time intervention: Eliciting truthful answers from a language model},
+  journal = {Advances in Neural Information Processing Systems},
+  volume  = {36},
+  year    = {2023}
+}
+
+@article{lin2021truthfulqa,
+  author  = {Lin, Stephanie and Hilton, Jacob and Evans, Owain},
+  title   = {TruthfulQA: Measuring how models mimic human falsehoods},
+  journal = {arXiv preprint arXiv:2109.07958},
+  year    = {2021}
+}
+
+@article{hase2024does,
+  author  = {Hase, Peter and Bansal, Mohit and Kim, Been and Ghandeharioun, Asma},
+  title   = {Does localization inform editing? Surprising differences in causality-based localization vs. knowledge editing in language models},
+  journal = {Advances in Neural Information Processing Systems},
+  volume  = {36},
+  year    = {2023}
+}
+
+@misc{taori2023alpaca,
+  author       = {Taori, Rohan and Gulrajani, Ishaan and Zhang, Tianyi and Dubois, Yann and Li, Xuechen and Guestrin, Carlos and Liang, Percy and Hashimoto, Tatsunori B},
+  title        = {Alpaca: A strong, replicable instruction-following model},
+  howpublished = {Stanford Center for Research on Foundation Models},
+  url          = {https://crfm.stanford.edu/2023/03/13/alpaca.html},
+  note         = {Blog post},
+  month        = mar,
+  year         = {2023}
+}
+
+
+
+@inproceedings{azar2024general,
+  author    = {Azar, Mohammad Gheshlaghi and Guo, Zhaohan Daniel and Piot, Bilal and Munos, R{\'e}mi and Rowland, Mark and Valko, Michal and Calandriello, Daniele},
+  title     = {A general theoretical paradigm to understand learning from human preferences},
+  booktitle = {International Conference on Artificial Intelligence and Statistics},
+  pages     = {4447--4455},
+  year      = {2024},
+  publisher = {PMLR}
+}
+
+@article{hu2021lora,
+  author  = {Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
+  title   = {LoRA: Low-rank adaptation of large language models},
+  journal = {arXiv preprint arXiv:2106.09685},
+  year    = {2021}
+}
+
+## representation engineering
+@article{zou2023representation,
+  author  = {Zou, Andy and Phan, Long and Chen, Sarah and Campbell, James and Guo, Phillip and Ren, Richard and Pan, Alexander and Yin, Xuwang and Mazeika, Mantas and Dombrowski, Ann-Kathrin and others},
+  title   = {Representation engineering: A top-down approach to AI transparency},
+  journal = {arXiv preprint arXiv:2310.01405},
+  year    = {2023}
+}
+
+
+@article{arditi2024refusal,
+  author  = {Arditi, Andy and Obeso, Oscar and Syed, Aaquib and Paleka, Daniel and Rimsky, Nina and Gurnee, Wes and Nanda, Neel},
+  title   = {Refusal in Language Models Is Mediated by a Single Direction},
+  journal = {arXiv preprint arXiv:2406.11717},
+  year    = {2024}
+}
+
+@article{wang2023backdoor,
+  author  = {Wang, Haoran and Shu, Kai},
+  title   = {Backdoor activation attack: Attack large language models using activation steering for safety-alignment},
+  journal = {arXiv preprint arXiv:2311.09433},
+  year    = {2023}
+}
+
+@inproceedings{chen2024truth,
+  author    = {Chen, Zhongzhi and Sun, Xingwu and Jiao, Xianfeng and Lian, Fengzong and Kang, Zhanhui and Wang, Di and Xu, Chengzhong},
+  title     = {Truth forest: Toward multi-scale truthfulness in large language models through intervention without tuning},
+  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
+  volume    = {38},
+  number    = {19},
+  pages     = {20967--20974},
+  year      = {2024}
+}
+
+@article{wei2024assessing,
+  author  = {Wei, Boyi and Huang, Kaixuan and Huang, Yangsibo and Xie, Tinghao and Qi, Xiangyu and Xia, Mengzhou and Mittal, Prateek and Wang, Mengdi and Henderson, Peter},
+  title   = {Assessing the brittleness of safety alignment via pruning and low-rank modifications},
+  journal = {arXiv preprint arXiv:2402.05162},
+  year    = {2024}
+}
+
+## mechanistic interpretability
+@article{meng2022locating,
+  author  = {Meng, Kevin and Bau, David and Andonian, Alex and Belinkov, Yonatan},
+  title   = {Locating and editing factual associations in GPT},
+  journal = {Advances in Neural Information Processing Systems},
+  volume  = {35},
+  pages   = {17359--17372},
+  year    = {2022}
+}
+
+@article{vig2020causal,
+  author  = {Vig, Jesse and Gehrmann, Sebastian and Belinkov, Yonatan and Qian, Sharon and Nevo, Daniel and Sakenis, Simas and Huang, Jason and Singer, Yaron and Shieber, Stuart},
+  title   = {Causal mediation analysis for interpreting neural NLP: The case of gender bias},
+  journal = {arXiv preprint arXiv:2004.12265},
+  year    = {2020}
+}
+
+@article{geiger2021causal,
+  author  = {Geiger, Atticus and Lu, Hanson and Icard, Thomas and Potts, Christopher},
+  title   = {Causal abstractions of neural networks},
+  journal = {Advances in Neural Information Processing Systems},
+  volume  = {34},
+  pages   = {9574--9586},
+  year    = {2021}
+}
+
+
+@article{soulos2019discovering,
+  author  = {Soulos, Paul and McCoy, Tom and Linzen, Tal and Smolensky, Paul},
+  title   = {Discovering the compositional structure of vector representations with role learning networks},
+  journal = {arXiv preprint arXiv:1910.09113},
+  year    = {2019}
+}
+
+@article{finlayson2021causal,
+  author  = {Finlayson, Matthew and Mueller, Aaron and Gehrmann, Sebastian and Shieber, Stuart and Linzen, Tal and Belinkov, Yonatan},
+  title   = {Causal analysis of syntactic agreement mechanisms in neural language models},
+  journal = {arXiv preprint arXiv:2106.06087},
+  year    = {2021}
+}
+
+
+
+@article{wang2022interpretability,
+  author  = {Wang, Kevin and Variengien, Alexandre and Conmy, Arthur and Shlegeris, Buck and Steinhardt, Jacob},
+  title   = {Interpretability in the wild: a circuit for indirect object identification in GPT-2 small},
+  journal = {arXiv preprint arXiv:2211.00593},
+  year    = {2022}
+}
+
+@misc{chan2022causal,
+  author       = {Chan, Lawrence and Garriga-Alonso, Adri{\`a} and Goldowsky-Dill, Nicholas and Greenblatt, Ryan and Nitishinskaya, Jenny and Radhakrishnan, Ansh and Shlegeris, Buck and Thomas, Nate},
+  title        = {Causal scrubbing: A method for rigorously testing interpretability hypotheses},
+  howpublished = {AI Alignment Forum},
+  note         = {Blog post},
+  year         = {2022}
+}
+
+
+@article{hanna2024does,
+  author  = {Hanna, Michael and Liu, Ollie and Variengien, Alexandre},
+  title   = {How does GPT-2 compute greater-than?: Interpreting mathematical abilities in a pre-trained language model},
+  journal = {Advances in Neural Information Processing Systems},
+  volume  = {36},
+  year    = {2023}
+}
+
+@article{conmy2023towards,
+  author  = {Conmy, Arthur and Mavor-Parker, Augustine and Lynch, Aengus and Heimersheim, Stefan and Garriga-Alonso, Adri{\`a}},
+  title   = {Towards automated circuit discovery for mechanistic interpretability},
+  journal = {Advances in Neural Information Processing Systems},
+  volume  = {36},
+  pages   = {16318--16352},
+  year    = {2023}
+}
+
+
+@article{todd2023function,
+  author  = {Todd, Eric and Li, Millicent L and Sharma, Arnab Sen and Mueller, Aaron and Wallace, Byron C and Bau, David},
+  title   = {Function vectors in large language models},
+  journal = {arXiv preprint arXiv:2310.15213},
+  year    = {2023}
+}
+
+
+@article{hendel2023context,
+  author  = {Hendel, Roee and Geva, Mor and Globerson, Amir},
+  title   = {In-context learning creates task vectors},
+  journal = {arXiv preprint arXiv:2310.15916},
+  year    = {2023}
+}
+
+
+## Truthfulness
+@article{joshi2023personas,
+  author  = {Joshi, Nitish and Rando, Javier and Saparov, Abulhair and Kim, Najoung and He, He},
+  title   = {Personas as a way to model truthfulness in language models},
+  journal = {arXiv preprint arXiv:2310.18168},
+  year    = {2023}
+}
+
+@article{wang2020language,
+  author  = {Wang, Chenguang and Liu, Xiao and Song, Dawn},
+  title   = {Language models are open knowledge graphs},
+  journal = {arXiv preprint arXiv:2010.11967},
+  year    = {2020}
+}
+
+@article{kadavath2022language,
+  author  = {Kadavath, Saurav and Conerly, Tom and Askell, Amanda and Henighan, Tom and Drain, Dawn and Perez, Ethan and Schiefer, Nicholas and Hatfield-Dodds, Zac and DasSarma, Nova and Tran-Johnson, Eli and others},
+  title   = {Language models (mostly) know what they know},
+  journal = {arXiv preprint arXiv:2207.05221},
+  year    = {2022}
+}
+
+@article{saunders2022self,
+  author  = {Saunders, William and Yeh, Catherine and Wu, Jeff and Bills, Steven and Ouyang, Long and Ward, Jonathan and Leike, Jan},
+  title   = {Self-critiquing models for assisting human evaluators},
+  journal = {arXiv preprint arXiv:2206.05802},
+  year    = {2022}
+}
+
+@article{burns2022discovering,
+  author  = {Burns, Collin and Ye, Haotian and Klein, Dan and Steinhardt, Jacob},
+  title   = {Discovering latent knowledge in language models without supervision},
+  journal = {arXiv preprint arXiv:2212.03827},
+  year    = {2022}
+}
+
+@misc{openai2020api,
+  author = {{OpenAI}},
+  title  = {OpenAI API},
+  url    = {https://openai.com/blog/openai-api/},
+  note   = {Accessed: 2021-08-19},
+  year   = {2020}
+}
+
+
+@inproceedings{makelov2023subspace,
+  author    = {Makelov, Aleksandar and Lange, Georg and Geiger, Atticus and Nanda, Neel},
+  title     = {Is this the subspace you are looking for? An interpretability illusion for subspace activation patching},
+  booktitle = {The Twelfth International Conference on Learning Representations},
+  year      = {2024}
+}
+
+@article{niu2024does,
+  author  = {Niu, Jingcheng and Liu, Andrew and Zhu, Zining and Penn, Gerald},
+  title   = {What does the Knowledge Neuron Thesis Have to do with Knowledge?},
+  journal = {arXiv preprint arXiv:2405.02421},
+  year    = {2024}
+}
diff --git a/assets/img/2025-04-28-localization/hist_ipo.png b/assets/img/2025-04-28-localization/hist_ipo.png
diff --git a/assets/img/2025-04-28-localization/hist_ipo_1.png b/assets/img/2025-04-28-localization/hist_ipo_1.png
diff --git a/assets/img/2025-04-28-localization/hist_iti.png b/assets/img/2025-04-28-localization/hist_iti.png
diff --git a/assets/img/2025-04-28-localization/iti_kl_mc.png b/assets/img/2025-04-28-localization/iti_kl_mc.png
diff --git a/assets/img/2025-04-28-localization/iti_truth_info.png b/assets/img/2025-04-28-localization/iti_truth_info.png
diff --git a/assets/img/2025-04-28-localization/random_vs_top.png b/assets/img/2025-04-28-localization/random_vs_top.png
diff --git a/assets/img/2025-04-28-localization/random_vs_top_MC_KL.png b/assets/img/2025-04-28-localization/random_vs_top_MC_KL.png
diff --git a/assets/img/2025-04-28-localization/random_vs_top_truth_KL.png b/assets/img/2025-04-28-localization/random_vs_top_truth_KL.png
diff --git a/assets/img/2025-04-28-localization/single_vs_top.png b/assets/img/2025-04-28-localization/single_vs_top.png
diff --git a/assets/img/2025-04-28-localization/stronger_evidence_for_loc.png b/assets/img/2025-04-28-localization/stronger_evidence_for_loc.png